Scraping Liberty Times Net
Posted on Jun 14, 2018 in Project Code • 11 min read
Web Scraping - LTN
This is the Python web-scraping code I wrote to collect corpora for a translation project.
Disclaimer: the results are for non-profit student research only.
In [ ]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
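The script needs four third-party packages: requests, beautifulsoup4, pandas, and xlsxwriter. If any are missing, a notebook cell like this should install them (assuming a standard pip environment):
In [ ]:
!pip install requests beautifulsoup4 pandas xlsxwriter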
In [ ]:
# Define a helper function to remove all occurrences of a given value from a list
def remove_values_from_list(the_list, val):
    return [value for value in the_list if value != val]
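As a quick illustration (with made-up data), this helper is what later strips the bare '\n' strings that get_text() leaves between paragraphs:
In [ ]:
# Toy example with made-up data
paragraphs = ['Hello there.', '\n', '你好。', '\n']
remove_values_from_list(paragraphs, '\n')
# -> ['Hello there.', '你好。']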
First, loop through n pages and collect the links to every article.
In [ ]:
# Get the links to every article up to page n
def getLinks(n):
    # Base URL of the bilingual-news index; the page number is appended below
    base = 'http://iservice.ltn.com.tw/Service/english/index.php?page='
    # Main list with all links to return
    links_all = []
    # Loop through pages 1 to n
    for i in range(n):
        page = requests.get(base + str(i + 1))
        soup = BeautifulSoup(page.content, 'html.parser')
        index = soup.find('div', id='content_english')
        # Select the article links and store them in a list
        links_res = index.find_all('a')
        links_temp = []
        for item in links_res:
            links_temp.append(item['href'])
        # Keep only the first ten links, one per article on the page
        del links_temp[10:]
        # Concatenate the proper full URL
        for idx, val in enumerate(links_temp):
            links_temp[idx] = 'http://iservice.ltn.com.tw/Service/english/' + val
        # Append to the main list
        links_all.extend(links_temp)
    return links_all
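Before crawling 300 pages, a small sanity check is worthwhile; something like the following works (the exact hrefs depend on the live site, so the output is only indicative):
In [ ]:
# Sanity check on the first two index pages
links = getLinks(2)
print(len(links))   # expect 20: ten article links per page
print(links[0])     # a full URL under .../Service/english/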
For each link, extract the article and split it into its English and Chinese parts.
In [ ]:
# Get all the information we want from an article
def extractContent(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    # Get the title, dropping the fixed prefix that precedes every headline
    try:
        title = soup.find('h2', class_='title').get_text()
        title = title[9:]
    except Exception:
        title = 'null'
    # Append all paragraphs of the article body to a list
    try:
        content_raw = soup.find(id='newsContent')
        content = content_raw.find_all('p')
        article = []
        for item in content:
            article.append(item.get_text().replace('\r\n', ''))
        article = remove_values_from_list(article, '\n')
        # Get the name of the translator from the first paragraph,
        # stripping the '◎' marker and any spaces
        translator = article[0].replace('◎', '').replace(' ', '')
        # Drop the translator line and the dictionary section at the end
        del article[0]
        del article[-4:]
        # The paragraphs alternate EN/CH, so split them by index parity
        articleEN = []
        articleCH = []
        for idx, val in enumerate(article):
            if idx % 2 == 0:
                articleEN.append(val)
            else:
                articleCH.append(val)
    except Exception:
        translator = 'null'
        articleEN = ['null']
        articleCH = ['null']
    return title, translator, articleEN, articleCH
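A single-article test run helps confirm the parsing assumptions (the link is whatever getLinks happens to return first, so the printed values depend on the live site):
In [ ]:
# Test the extractor on one article before the full batch
test_url = getLinks(1)[0]
t, tr, en, ch = extractContent(test_url)
print(t)       # title
print(tr)      # translator
print(en[0])   # first English paragraph
print(ch[0])   # first Chinese paragraph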
Execute, store the results in a pandas DataFrame, and save them as an Excel file.
In [ ]:
# Main code
links = getLinks(300)
title = []
translator = []
articleEN = []
articleCH = []
url = []
for idx, link in enumerate(links):
    # Print progress every ten links, i.e. once per page
    if (idx + 1) % 10 == 0:
        print('Now on page ' + str((idx + 1) // 10))
    title_temp, translator_temp, articleEN_temp, articleCH_temp = extractContent(link)
    title.append(title_temp)
    translator.append(translator_temp)
    articleEN.append(articleEN_temp)
    articleCH.append(articleCH_temp)
    url.append(link)
# Store the results in a pandas DataFrame
results = pd.DataFrame({
    'Title': title,
    'Translator': translator,
    'articleEN': articleEN,
    'articleCH': articleCH,
    'url': url
})
# Create a pandas Excel writer using XlsxWriter as the engine
writer = pd.ExcelWriter('liberty_results.xlsx', engine='xlsxwriter')
# Convert the DataFrame to an XlsxWriter Excel object
results.to_excel(writer, sheet_name='1-300')
# Close the Pandas Excel writer and output the Excel file
writer.save()
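To verify the output afterwards, the file can be read back with pandas; note that Excel has no list type, so the articleEN and articleCH columns come back as plain strings:
In [ ]:
# Read the saved workbook back for a quick check
check = pd.read_excel('liberty_results.xlsx', sheet_name='1-300')
print(check.shape)
print(check['Title'].head())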