How To Use the Gensim API

Posted on Jun 14, 2018 in Project Code • 12 min read

Gensim

Here is the code I wrote to compare Google machine-translated text with the bio-translation.

In [ ]:
import pandas as pd
import numpy as np
import os
import tempfile

from gensim import corpora, models, similarities

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import nltk.data
nltk.download('punkt')

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [ ]:
df = pd.read_excel('PATH TO FILE')

# Convert Series slice into numpy array
documents = df.loc[:, 'articleEN'].values.reshape(len(df),1)

# The flattened, sentence-tokenised text is saved to a txt file in a later cell,
# one item per line (\n), so the corpus can be streamed from disk
In [ ]:
documents1D = documents.flatten()
documents1D = documents1D.tolist()
In [ ]:
# Split each article into sentences with the NLTK Punkt tokenizer,
# then flatten into a single list of sentences
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
data = []
for item in documents1D:
    data.append(tokenizer.tokenize(item))
data_flat = [j for sub in data for j in sub]
In [ ]:
len(data_flat)
In [ ]:
# Write one sentence per line; the with-block closes the file so it is
# flushed to disk before the corpus streams it back in
with open('documents.txt', 'w') as file:
    for item in data_flat:
        file.write("%s\n" % item)
In [ ]:
class MyCorpus(object):
    def __iter__(self):
        # Stream the corpus: one document (sentence) per line, tokens separated by whitespace.
        # NOTE: `dictionary` is only built in a later cell; that is fine because this
        # generator is not consumed until after the dictionary exists.
        for line in open('documents.txt'):
            yield dictionary.doc2bow(line.lower().split())
In [ ]:
corpus = MyCorpus()
In [ ]:
# Construct dictionary without loading the entire file into memory
from six import iteritems

# Get a set of stopwords from NLTK (bound to a new name so the
# imported `stopwords` corpus is not shadowed)
stop_words = set(stopwords.words('english'))

# collect statistics about all tokens
dictionary = corpora.Dictionary(line.lower().split() for line in open('documents.txt'))

# find the ids of stop words and of words that appear only once
stop_ids = [dictionary.token2id[stopword] for stopword in stop_words if stopword in dictionary.token2id]
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]

# remove stop words and words that appear only once
dictionary.filter_tokens(stop_ids + once_ids)

# remove gaps in id sequence after words that were removed
dictionary.compactify()

# Save dictionary 
dictionary.save('corpus.dict')
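If you want to sanity-check things at this point, printing the dictionary and peeking at the first streamed bag-of-words vector should be enough (the exact numbers will depend on your data):

In [ ]:
print(dictionary)            # e.g. Dictionary(N unique tokens: [...])
print(next(iter(corpus)))    # first sentence as a list of (token_id, count) pairs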
In [ ]:
# Save corpus in the Matrix Market format
corpora.MmCorpus.serialize('corpus.mm', corpus)
In [ ]:
# Load corpus and dictionary 
if (os.path.exists("corpus.dict")):
    dictionary = corpora.Dictionary.load('corpus.dict')
    corpus = corpora.MmCorpus('corpus.mm')
    print("Loaded saved dictionary and corpus")
else:
    print("Run the cells above first to generate corpus.dict and corpus.mm")
In [ ]:
# Step 1 -- initialize a model
tfidf = models.TfidfModel(corpus)

# Step 2 -- apply transformation to the corpus
corpus_tfidf = tfidf[corpus]

# initialize an LSI transformation
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=50) 

# create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
corpus_lsi = lsi[corpus_tfidf] 
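To get a feel for what the model learned, you can print a few of the 50 topics with print_topics (part of Gensim's model API); the words shown will depend entirely on your corpus:

In [ ]:
# show the 5 strongest topics, 10 words each
lsi.print_topics(num_topics=5, num_words=10)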
In [ ]:
# Save model
lsi.save('model.lsi')
In [ ]:
# transform corpus to LSI space and index it
index = similarities.MatrixSimilarity(lsi[corpus], num_features=len(dictionary))
In [ ]:
# Save index
index.save('corpus.index')
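Before scoring the whole translated set below, it can help to run a single query by hand. This is just a sketch: doc is a made-up sentence, and it assumes the row order of the index matches data_flat (one sentence per line in documents.txt):

In [ ]:
doc = "this is a made-up query sentence"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]           # project the query into LSI space
sims = index[vec_lsi]            # cosine similarity against every indexed sentence
best = sims.argsort()[::-1][:3]  # indices of the three most similar sentences
for i in best:
    print(sims[i], data_flat[i])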
In [ ]:
# Repeat the same preprocessing for the translated column
documents_tr = df.loc[:, 'translatedEN'].values.reshape(len(df),1)
documents1D_tr = documents_tr.flatten()
documents1D_tr = documents1D_tr.tolist()
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
data_translated = []
for item in documents1D_tr:
    data_translated.append(tokenizer.tokenize(item))
data_translated = [j for sub in data_translated for j in sub]
In [ ]:
# Save tokenised data to txt
with open('documents_translated.txt', 'w') as file:
    for item in data_translated:
        file.write("%s\n" % item)
In [ ]:
# Save tokenised data to csv
temp = pd.DataFrame(data={"translated": data_translated})
temp.to_csv("./documents_translated.csv", sep=',')
In [ ]:
len(data_flat)
In [ ]:
len(data_translated)
In [ ]:
queryLen = len(data_translated)
In [ ]:
# Load corpus
corpus = corpora.MmCorpus('corpus.mm')
# Load dictionary
dictionary = corpora.Dictionary.load('corpus.dict')
# Load model
lsi = models.LsiModel.load('model.lsi')
# Load index
index = similarities.MatrixSimilarity.load('corpus.index')
In [ ]:
sims_all = []
for idx, line in enumerate(data_translated):
    if (idx % 500) == 0:
        print('Processed ' + str(idx) + ' lines, ' + str(queryLen - idx) + ' more to go!')
    try:
        # convert the translated sentence to bag-of-words, project it into LSI space,
        # and keep the highest similarity against the indexed original sentences
        vec_bow = dictionary.doc2bow(line.lower().split())
        vec_lsi = lsi[vec_bow]
        sims_all.append(max(index[vec_lsi]))
    except Exception as e:
        print(e)
        sims_all.append('null')
In [ ]:
# Save similarity scores to csv
temp = pd.DataFrame(data={"sims": sims_all})
temp.to_csv("./sims.csv", sep=',')