mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 21:21:10 +03:00 
			
		
		
		
	Format example
This commit is contained in:
		
							parent
							
								
									45798cc53e
								
							
						
					
					
						commit
						40b57ea4ac
					
				|  | @ -55,15 +55,15 @@ import spacy | |||
|     patterns_loc=("Path to gazetteer", "positional", None, str), | ||||
|     text_loc=("Path to Reddit corpus file", "positional", None, str), | ||||
|     n=("Number of texts to read", "option", "n", int), | ||||
|     lang=("Language class to initialise", "option", "l", str)) | ||||
| def main(patterns_loc, text_loc, n=10000, lang='en'): | ||||
|     nlp = spacy.blank('en') | ||||
|     lang=("Language class to initialise", "option", "l", str), | ||||
| ) | ||||
| def main(patterns_loc, text_loc, n=10000, lang="en"): | ||||
|     nlp = spacy.blank("en") | ||||
|     nlp.vocab.lex_attr_getters = {} | ||||
|     phrases = read_gazetteer(nlp.tokenizer, patterns_loc) | ||||
|     count = 0 | ||||
|     t1 = time.time() | ||||
|     for ent_id, text in get_matches(nlp.tokenizer, phrases, | ||||
|                                     read_text(text_loc, n=n)): | ||||
|     for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)): | ||||
|         count += 1 | ||||
|     t2 = time.time() | ||||
|     print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count)) | ||||
|  | @ -72,7 +72,7 @@ def main(patterns_loc, text_loc, n=10000, lang='en'): | |||
| def read_gazetteer(tokenizer, loc, n=-1): | ||||
|     for i, line in enumerate(open(loc)): | ||||
|         data = ujson.loads(line.strip()) | ||||
|         phrase = tokenizer(data['text']) | ||||
|         phrase = tokenizer(data["text"]) | ||||
|         for w in phrase: | ||||
|             _ = tokenizer.vocab[w.text] | ||||
|         if len(phrase) >= 2: | ||||
|  | @ -83,14 +83,14 @@ def read_text(bz2_loc, n=10000): | |||
|     with BZ2File(bz2_loc) as file_: | ||||
|         for i, line in enumerate(file_): | ||||
|             data = ujson.loads(line) | ||||
|             yield data['body'] | ||||
|             yield data["body"] | ||||
|             if i >= n: | ||||
|                 break | ||||
| 
 | ||||
| 
 | ||||
| def get_matches(tokenizer, phrases, texts, max_length=6): | ||||
|     matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length) | ||||
|     matcher.add('Phrase', None, *phrases) | ||||
|     matcher.add("Phrase", None, *phrases) | ||||
|     for text in texts: | ||||
|         doc = tokenizer(text) | ||||
|         for w in doc: | ||||
|  | @ -100,10 +100,11 @@ def get_matches(tokenizer, phrases, texts, max_length=6): | |||
|             yield (ent_id, doc[start:end].text) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
| if __name__ == "__main__": | ||||
|     if False: | ||||
|         import cProfile | ||||
|         import pstats | ||||
| 
 | ||||
|         cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof") | ||||
|         s = pstats.Stats("Profile.prof") | ||||
|         s.strip_dirs().sort_stats("time").print_stats() | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user