| """Match a large set of multi-word expressions in O(1) time.
 | |
| 
 | |
| The idea is to associate each word in the vocabulary with a tag, noting whether
 | |
| they begin, end, or are inside at least one pattern. An additional tag is used
 | |
| for single-word patterns. Complete patterns are also stored in a hash set.
 | |
| 
 | |
| When we process a document, we look up the words in the vocabulary, to associate
 | |
| the words with the tags.  We then search for tag-sequences that correspond to
 | |
| valid candidates. Finally, we look up the candidates in the hash set.
 | |
| 
 | |
| For instance, to search for the phrases "Barack Hussein Obama" and "Hilary Clinton", we
 | |
| would associate "Barack" and "Hilary" with the B tag, Hussein with the I tag,
 | |
| and Obama and Clinton with the L tag.
 | |
| 
 | |
| The document "Barack Clinton and Hilary Clinton" would have the tag sequence
 | |
| [{B}, {L}, {}, {B}, {L}], so we'd get two matches. However, only the second candidate
 | |
| is in the phrase dictionary, so only one is returned as a match.
 | |
| 
 | |
| The algorithm is O(n) at run-time for document of length n because we're only ever
 | |
| matching over the tag patterns. So no matter how many phrases we're looking for,
 | |
| our pattern set stays very small (exact size depends on the maximum length we're
 | |
| looking for, as the query language currently has no quantifiers)
 | |
| """
from __future__ import print_function, unicode_literals, division

from ast import literal_eval
from bz2 import BZ2File
import time
import codecs

import plac

from preshed.counter import PreshCounter
from spacy.strings import hash_string
from spacy.en import English
from spacy.matcher import PhraseMatcher


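# Editorial sketch, not part of the original script: the tag-sequence idea
# from the module docstring in plain Python over toy data. spaCy's
# PhraseMatcher implements the same scheme over hashed vocabulary entries in
# Cython; the names _DEMO_TAGS, _DEMO_PHRASES and _demo_candidates here are
# hypothetical.
_DEMO_TAGS = {'Barack': 'B', 'Hillary': 'B', 'Hussein': 'I',
              'Obama': 'L', 'Clinton': 'L'}
_DEMO_PHRASES = {('Barack', 'Hussein', 'Obama'), ('Hillary', 'Clinton')}


def _demo_candidates(words, max_length=6):
    """Yield candidates: a B word, zero or more I words, then an L word.

    >>> words = 'Barack Clinton and Hillary Clinton'.split()
    >>> [c for c in _demo_candidates(words) if c in _DEMO_PHRASES]
    [('Hillary', 'Clinton')]
    """
    for i, word in enumerate(words):
        if 'B' not in _DEMO_TAGS.get(word, ''):
            continue
        for j in range(i + 1, min(i + max_length, len(words))):
            tags = _DEMO_TAGS.get(words[j], '')
            if 'L' in tags:
                yield tuple(words[i:j + 1])
            if 'I' not in tags:
                break

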
def read_gazetteer(tokenizer, loc, n=-1):
    # Read up to n phrases from the gazetteer; n=-1 means read everything.
    # (The original checked i >= n after parsing the line, so the default of
    # -1 stopped the generator before it yielded anything.)
    for i, line in enumerate(open(loc)):
        if n != -1 and i >= n:
            break
        phrase = literal_eval('u' + line.strip())
        # Strip parenthesised disambiguators from the end of the phrase.
        if ' (' in phrase and phrase.endswith(')'):
            phrase = phrase.split(' (', 1)[0]
        phrase = tokenizer(phrase)
        # Skip phrases that consist entirely of frequent lower-case words.
        if all((t.is_lower and t.prob >= -10) for t in phrase):
            continue
        if len(phrase) >= 2:
            yield phrase


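# For reference: read_gazetteer() parses each line of the patterns file with
# literal_eval, so every line is expected to be a Python string literal. The
# sample lines below are hypothetical:
#
#     'Barack Obama'
#     'New York (state)'
#
# The " (state)" disambiguator is stripped before tokenization.

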
def read_text(bz2_loc):
    # Stream the corpus line by line from a bz2-compressed file.
    with BZ2File(bz2_loc) as file_:
        for line in file_:
            yield line.decode('utf8')


def get_matches(tokenizer, phrases, texts, max_length=6):
    matcher = PhraseMatcher(tokenizer.vocab, phrases, max_length=max_length)
    print("Match")
    for text in texts:
        doc = tokenizer(text)
        # In this version of spaCy, calling the matcher records accepted
        # matches on doc.ents, so we yield the entities it has set.
        matcher(doc)
        for mwe in doc.ents:
            yield mwe


def main(patterns_loc, text_loc, counts_loc, n=10000000):
    nlp = English(parser=False, tagger=False, entity=False)
    print("Make matcher")
    phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n)
    counts = PreshCounter()
    t1 = time.time()
    for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)):
        counts.inc(hash_string(mwe.text), 1)
    t2 = time.time()
    print("Matched in %d s" % (t2 - t1))

    # Re-read the gazetteer and write out how often each phrase was matched.
    with codecs.open(counts_loc, 'w', 'utf8') as file_:
        for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n):
            text = phrase.string
            key = hash_string(text)
            count = counts[key]
            if count != 0:
                file_.write('%d\t%s\n' % (count, text))


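# Example invocation via plac (the script and file names are placeholders):
# the patterns file holds one quoted phrase per line, the text file is
# bz2-compressed, and the counts file receives tab-separated
# "<count>\t<phrase>" lines.
#
#     python multi_word_matches.py patterns.txt text.bz2 counts.tsv
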
if __name__ == '__main__':
    # Flip this to True to profile the run with cProfile.
    if False:
        import cProfile
        import pstats
        cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
        s = pstats.Stats("Profile.prof")
        s.strip_dirs().sort_stats("time").print_stats()
    else:
        plac.call(main)