Web Scraping

Here is the simple code used to scrape Liberty Times Net.

In [ ]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup

# Define a helper function to remove all occurrences of a certain value from a list
def remove_values_from_list(the_list, val):
    """Return a new list with every item equal to ``val`` left out.

    The input list is not modified, and the surviving items keep their
    original relative order.
    """
    kept = []
    for element in the_list:
        if element != val:
            kept.append(element)
    return kept

# Get the links to every article up to page n
def getLinks(n, links_per_page=10, timeout=30):
    """Collect article URLs from the first ``n`` index pages.

    Args:
        n: Number of index pages to visit, starting at page 1. A value of
            0 or less returns an empty list without any network access.
        links_per_page: How many anchors to keep from each index page,
            counted from the top of the ``content_english`` container.
        timeout: Seconds to wait for each HTTP response before
            ``requests`` gives up and raises.

    Returns:
        A flat list of full article URLs, in page order.

    Raises:
        requests.RequestException: If an index page cannot be fetched
            (including when the timeout expires).
    """
    base_url = 'http://iservice.ltn.com.tw/Service/english/'

    # Main list with all links to return
    links_all = []

    for page_no in range(1, n + 1):
        # Build every page URL from scratch. Patching the previous URL with
        # str.replace() would rewrite *every* occurrence of the old page
        # number, not just the query-string value.
        page_url = base_url + 'index.php?page=' + str(page_no)
        page = requests.get(page_url, timeout=timeout)
        soup = BeautifulSoup(page.content, 'html.parser')

        index = soup.find('div', id='content_english')
        if index is None:
            # The expected container is missing from this page, so there
            # is nothing to collect here; move on instead of crashing.
            continue

        # href=True skips anchors that carry no href attribute, which would
        # otherwise raise KeyError on lookup.
        anchors = index.find_all('a', href=True)

        # Keep only the leading anchors and turn each href into a full URL.
        links_all.extend(
            base_url + anchor['href'] for anchor in anchors[:links_per_page]
        )

    return links_all

# Get all information we want from an article
def extractContent(url, timeout=30):
    """Download one article page and extract its title, translator and text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the HTTP response before ``requests``
            gives up and raises.

    Returns:
        A ``(title, translator, articleEN, articleCH)`` tuple. ``title`` and
        ``translator`` are strings; ``articleEN`` and ``articleCH`` are lists
        of paragraph strings. Whenever the expected page element is missing,
        the affected fields fall back to the placeholders ``'null'`` /
        ``['null']`` instead of raising.

    Raises:
        requests.RequestException: If the page cannot be fetched (including
            when the timeout expires).
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Get title
    title_tag = soup.find('h2', class_='title')
    if title_tag is None:
        title = 'null'
    else:
        # The first 9 characters of the heading text are dropped
        # (presumably a fixed prefix -- confirm against the live page).
        title = title_tag.get_text()[9:]

    # Collect the text of every paragraph inside the article container
    content_raw = soup.find(id='newsContent')
    paragraphs = content_raw.find_all('p') if content_raw is not None else []
    article = [item.get_text().replace('\r\n', '') for item in paragraphs]
    article = remove_values_from_list(article, '\n')

    if not article:
        # No container or no usable paragraphs: report placeholders so the
        # caller still gets one row per URL.
        return title, 'null', ['null'], ['null']

    # The first paragraph holds the translator's name; strip the bullet
    # marker and any spaces from it.
    translator = article[0].replace('◎', '').replace(' ', '')

    # Get rid of the translator line (first paragraph) and the dictionary
    # section (last four paragraphs).
    body = article[1:-4]

    # Remaining paragraphs alternate: even positions go to the EN list,
    # odd positions to the CH list.
    articleEN = body[0::2]
    articleCH = body[1::2]
    return title, translator, articleEN, articleCH


# Main code: gather the article links, scrape each article, and export
# everything to an Excel workbook.
links = getLinks(300)
title = []
translator = []
articleEN = []
articleCH = []
url = []
for idx, link in enumerate(links):
    # Progress report on every 10th link; integer division keeps the
    # counter an int without a float round-trip.
    pages_done, remainder = divmod(idx + 1, 10)
    if remainder == 0:
        print('Now on page ' + str(pages_done))
    title_temp, translator_temp, articleEN_temp, articleCH_temp = extractContent(link)
    title.append(title_temp)
    translator.append(translator_temp)
    articleEN.append(articleEN_temp)
    articleCH.append(articleCH_temp)
    url.append(link)

# Store results in pandas.DataFrame (one row per article)
results = pd.DataFrame({
    'Title': title,
    'Translator': translator,
    'articleEN': articleEN,
    'articleCH': articleCH,
    'url': url,
})

# Write the DataFrame to an Excel file using XlsxWriter as the engine.
# ExcelWriter.save() was deprecated in pandas 1.5 and removed in 2.0; the
# context manager saves and closes the workbook on exit, even if to_excel
# raises.
with pd.ExcelWriter('liberty_results.xlsx', engine='xlsxwriter') as writer:
    results.to_excel(writer, sheet_name='1-300')