Latent semantic indexing (LSI) is an indexing and retrieval method that uses a mathematical technique called singular value decomposition
(SVD) to identify patterns in the relationships between the terms and
concepts contained in an unstructured collection of text. LSI is based
on the principle that words that are used in the same contexts tend to
have similar meanings. A key feature of LSI is its ability to extract
the conceptual content of a body of text by establishing associations
between those terms that occur in similar contexts.
from gensim import corpora, models, similarities
from collections import Counter

# Toy corpus of nine short document titles used throughout this script.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Lowercase each document, split it on whitespace and drop the stop words.
stoplist = set('for a of the and to in'.split())
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

# Remove words that occur exactly once in the whole collection.  Counting
# with a single Counter pass is linear in the number of tokens, whereas
# calling list.count() once per distinct word is quadratic.
all_tokens = [word for text in texts for word in text]
token_counts = Counter(all_tokens)
tokens_once = set(word for word, count in token_counts.items() if count == 1)
texts = [[word for word in text if word not in tokens_once] for text in texts]
# Build the gensim dictionary from the filtered token lists, persist it and
# show the token -> integer id mapping.
dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/deerwester.dict')
print(dictionary.token2id)  # single-argument print() works on Python 2 and 3

# The function doc2bow() simply counts the number of occurrences of each
# distinct word, converts the word to its integer word id and returns the
# result as a sparse vector.
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)

# Convert every document of the collection to its bag-of-words vector.
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)  # store to disk, for later use
print(corpus)
# Corpus Streaming - One Document at a Time
class MyCorpus(object):
    """Iterable that yields one bag-of-words vector per line of ``mycorpus.txt``.

    Each line is lowercased, split on whitespace and converted with the
    module-level ``dictionary``; nothing is read until iteration starts.
    """

    def __iter__(self):
        # The with-block closes the file when iteration finishes (or the
        # generator is closed early) instead of leaving the handle open.
        with open('mycorpus.txt') as corpus_file:
            for line in corpus_file:
                # assume there's one document per line, tokens separated by whitespace
                yield dictionary.doc2bow(line.lower().split())
# The corpus is now much more memory friendly, because at most one vector
# resides in RAM at a time.
corpus_memory_friendly = MyCorpus()  # doesn't load the corpus into memory!
for vector in corpus_memory_friendly:  # load one vector into memory at a time
    print(vector)
# topics and transformations
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

tfidf = models.TfidfModel(corpus)  # step 1: initialize a model
corpus_tfidf = tfidf[corpus]  # step 2: wrap the bag-of-words corpus with the model
for doc in corpus_tfidf:
    print(doc)
# Train a 2-topic LSI model on the TF-IDF corpus.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)  # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
lsi.print_topics(2)  # the number of topics is self-chosen
for doc in corpus_lsi:  # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
    print(doc)

# Round-trip the trained model through disk.
lsi.save('/tmp/model.lsi')  # same for tfidf, lda, ...
lsi = models.LsiModel.load('/tmp/model.lsi')
# Available transformations.
# The model classes are reached through the imported ``models`` package and
# are fed the ``corpus`` (bag-of-words) and ``corpus_tfidf`` objects built
# earlier in this script; the bare module names and the ``bow_corpus`` /
# ``tfidf_corpus`` variables used previously were never defined and raised
# NameError.  ``model`` is deliberately rebound by each example.
model = models.TfidfModel(corpus, normalize=True)  # tf-idf
model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)  # LSI
model = models.RpModel(corpus_tfidf, num_topics=500)  # Random Projection
model = models.LdaModel(corpus, id2word=dictionary, num_topics=100)  # LDA
model = models.HdpModel(corpus, id2word=dictionary)  # Hierarchical Dirichlet Process, HDP
# similarity queries
from gensim import corpora, models, similarities

# Reload the dictionary and the bag-of-words corpus stored earlier.
dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
corpus = corpora.MmCorpus('/tmp/deerwester.mm')

# ``lsi`` was trained on TF-IDF weighted vectors, so both the query and the
# indexed corpus go through ``tfidf`` first; feeding raw bag-of-words counts
# to the model would mix two different weightings.
vec_lsi = lsi[tfidf[new_vec]]  # convert the query to LSI space
index = similarities.MatrixSimilarity(lsi[tfidf[corpus]])  # transform corpus to LSI space and index it

# Round-trip the index through disk.
index.save('/tmp/deerwester.index')
index = similarities.MatrixSimilarity.load('/tmp/deerwester.index')

sims = index[vec_lsi]  # perform a similarity query against the corpus
print(list(enumerate(sims)))  # print (document_number, document_similarity) 2-tuples
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sims)  # print sorted (document number, similarity score) 2-tuples
No comments:
Post a Comment