# In [ ]:
import pandas as pd
import numpy as np
import os
import tempfile
from gensim import corpora, models, similarities
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import nltk.data
nltk.download('punkt')
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Read the spreadsheet and pull the 'articleEN' column out as an (n, 1) numpy array.
df = pd.read_excel('/Users/Shayne/webScraping/final_EN.xlsx')
documents = df.loc[:, 'articleEN'].values.reshape(len(df), 1)
# Alternative: dump the raw array straight to disk instead.
# IMPORTANT: Separate each item in array by lines (\n)
# np.savetxt('documents.txt',documents,fmt='%s',delimiter='\n')
# Flatten back to a plain Python list with one entry per spreadsheet row.
documents1D = documents.flatten().tolist()

# Split every entry with NLTK's pre-trained English Punkt tokenizer.
# NOTE(review): a non-string cell (e.g. an empty spreadsheet cell) would make
# tokenize() raise -- confirm the column has no gaps.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
data = [tokenizer.tokenize(item) for item in documents1D]
# Merge the per-entry lists into one flat list of tokenised pieces.
data_flat = [piece for pieces in data for piece in pieces]

# Write one piece per line. The context manager closes (and therefore flushes)
# the file, which matters because it is read back again further down.
with open('documents.txt', 'w') as out_file:
    for item in data_flat:
        out_file.write("%s\n" % item)
class MyCorpus(object):
    """Stream 'documents.txt' as bag-of-words vectors, one line at a time.

    The module-level ``dictionary`` is looked up while iterating, so it must
    be defined before the corpus is consumed (not before it is constructed).
    """

    def __iter__(self):
        """Yield ``dictionary.doc2bow(...)`` for every line of the file."""
        # The context manager releases the file handle once the pass is over;
        # a bare open() in the for-statement would leave it to the garbage
        # collector on every iteration of the corpus.
        with open('documents.txt') as handle:
            for line in handle:
                # assume there's one document per line, tokens separated by whitespace
                yield dictionary.doc2bow(line.lower().split())
# Streaming corpus; nothing is read until it is iterated (during serialize below).
corpus = MyCorpus()
# Construct dictionary without loading the entire file into memory
from six import iteritems
# Get a set of stopwords from NLTK
# NOTE(review): this rebinds the imported `stopwords` name to a set, so running
# this statement a second time in the same session fails; the name is kept
# because other code may rely on `stopwords` being the set.
stopwords = set(stopwords.words('english'))
# collect statistics about all tokens; the file is closed as soon as the
# dictionary has consumed every line
with open('documents.txt') as handle:
    dictionary = corpora.Dictionary(line.lower().split() for line in handle)
# ids of stop words that actually occur in the dictionary
stop_ids = [dictionary.token2id[stopword] for stopword in stopwords if stopword in dictionary.token2id]
# ids of tokens with a document frequency of exactly one
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]
# remove stop words and words that appear only once
dictionary.filter_tokens(stop_ids + once_ids)
# remove gaps in id sequence after words that were removed
dictionary.compactify()
# Save dictionary
dictionary.save('corpus.dict')
# Save corpus in the Matrix Market format (this is what iterates `corpus`)
corpora.MmCorpus.serialize('corpus.mm', corpus)
# Reload the persisted dictionary and corpus, provided the dictionary file
# exists on disk; otherwise only report that it is missing and carry on.
if not os.path.exists("corpus.dict"):
    print("Please run first tutorial to generate data set")
else:
    dictionary = corpora.Dictionary.load('corpus.dict')
    corpus = corpora.MmCorpus('corpus.mm')
    print("Used files generated from first tutorial")
# Step 1 -- initialize a TF-IDF model from the bag-of-words corpus
tfidf = models.TfidfModel(corpus)
# Step 2 -- apply the TF-IDF transformation to the corpus
corpus_tfidf = tfidf[corpus]
# initialize an LSI transformation with 50 topics on top of the TF-IDF vectors
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=50)
# create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
corpus_lsi = lsi[corpus_tfidf]
# Save model
lsi.save('model.lsi')
# transform corpus to LSI space and index it.
# The indexed vectors live in topic space, so the index needs one column per
# topic rather than one per dictionary term; sizing it by len(dictionary)
# only adds all-zero columns and wastes memory.
# NOTE(review): the model was trained on TF-IDF vectors but the index is built
# from raw bag-of-words vectors (lsi[corpus], not corpus_lsi). It is left as is
# because the query code projects its queries the same way; switching to
# TF-IDF needs both places changed together.
index = similarities.MatrixSimilarity(lsi[corpus], num_features=lsi.num_topics)
# Save index
index.save('corpus.index')
# Pull the 'translatedEN' column out as an (n, 1) array, then flatten it to a
# plain Python list with one entry per spreadsheet row.
documents_tr = df.loc[:, 'translatedEN'].values.reshape(len(df), 1)
documents1D_tr = documents_tr.flatten().tolist()
# Tokenise with NLTK's pre-trained English Punkt tokenizer and flatten the
# per-entry results into a single list.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
data_translated = [
    piece
    for item in documents1D_tr
    for piece in tokenizer.tokenize(item)
]
# Save tokenised data to txt, one piece per line; the context manager makes
# sure everything is flushed and the handle is closed.
with open('documents_translated.txt', 'w') as out_file:
    for item in data_translated:
        out_file.write("%s\n" % item)
# Save tokenised data to csv
temp = pd.DataFrame(data={"translated": data_translated})
temp.to_csv("./documents_translated.csv", sep=',')
# Reload the persisted artefacts: corpus, dictionary, LSI model and index.
corpus = corpora.MmCorpus('corpus.mm')
dictionary = corpora.Dictionary.load('corpus.dict')
lsi = models.LsiModel.load('model.lsi')
index = similarities.MatrixSimilarity.load('corpus.index')

# For every translated line, record its best similarity against the index.
sims_all = []
# Total number of queries, used only for the progress message (this name was
# previously referenced without ever being defined, raising NameError).
query_len = len(data_translated)
for idx, line in enumerate(data_translated):
    # Progress report on every 500th line, starting with the very first one.
    if (idx % 500) == 0:
        print('Completed 500 lines! Hooray!!!! ' + str(query_len - idx) + ' more lines to go!')
    try:
        vec_bow = dictionary.doc2bow(line.lower().split())
        # NOTE(review): the query is projected from raw bag-of-words, matching
        # how the saved index was built, although the LSI model itself was
        # trained on TF-IDF vectors -- confirm this is intended.
        vec_lsi = lsi[vec_bow]
        sims_all.append(max(index[vec_lsi]))
    except Exception as e:
        # Best effort: report the failure and keep the rows aligned with
        # data_translated by storing a placeholder.
        print(e)
        sims_all.append('null')
# Save the similarity scores to csv
temp = pd.DataFrame(data={"sims": sims_all})
temp.to_csv("./sims.csv", sep=',')