Mirror of https://github.com/explosion/spaCy.git
Add multi word matching code

commit e4ba8a4b5a
parent 262c215b55

examples/multi_word_matches.py | 94 ++++++++++ (new file)

@@ -0,0 +1,94 @@
"""Match a large set of multi-word expressions in O(1) time with respect to
the number of expressions.

The idea is to associate each word in the vocabulary with a tag, noting whether
it begins, ends, or is inside at least one pattern. An additional tag is used
for single-word patterns. Complete patterns are also stored in a hash set.

When we process a document, we look up the words in the vocabulary to associate
the words with the tags. We then search for tag sequences that correspond to
valid candidates. Finally, we look up the candidates in the hash set.

For instance, to search for the phrases "Barack Hussein Obama" and "Hillary
Clinton", we would associate "Barack" and "Hillary" with the B tag, "Hussein"
with the I tag, and "Obama" and "Clinton" with the L tag.

The document "Barack Clinton and Hillary Clinton" would have the tag sequence
[{B}, {L}, {}, {B}, {L}], so we'd get two candidates. However, only the second
candidate is in the phrase dictionary, so only one is returned as a match.

The algorithm is O(n) at run-time for a document of length n, because we're
only ever matching over the tag patterns. So no matter how many phrases we're
looking for, our pattern set stays very small (the exact size depends on the
maximum pattern length we're looking for, as the query language currently has
no quantifiers).
"""
from __future__ import print_function, unicode_literals, division

import plac

from preshed.maps import PreshMap
from spacy.strings import hash_string
from spacy.en import English
from spacy.matcher import Matcher

from spacy.attrs import FLAG63 as U_ENT
from spacy.attrs import FLAG62 as L_ENT
from spacy.attrs import FLAG61 as I_ENT
from spacy.attrs import FLAG60 as B_ENT
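# FLAG0..FLAG63 are spare boolean lexeme attributes that spaCy leaves free for
# application-specific use. Here the top four bits are borrowed as the
# Begin/Inside/Last/Unit tags of the B/I/L/U scheme described in the docstring.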


def get_bilou(length):
    # Map a phrase length to its tag signature: single words are Unit;
    # longer phrases are Begin, zero or more Inside, then Last.
    if length == 1:
        return [U_ENT]
    else:
        return [B_ENT] + [I_ENT] * (length - 2) + [L_ENT]
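# For example:
#     get_bilou(1) -> [U_ENT]
#     get_bilou(3) -> [B_ENT, I_ENT, L_ENT]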


def make_matcher(vocab, max_length):
    # Build one abstract tag pattern per candidate length: [U], [B, L],
    # [B, I, L], ... up to max_length. The matcher then proposes every span
    # whose words carry the right flags, regardless of which phrase set them.
    abstract_patterns = []
    for length in range(1, max_length + 1):
        abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
    return Matcher(vocab, {'Candidate': ('CAND', {}, abstract_patterns)})
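# With max_length=3, the abstract patterns are, schematically:
#     [[{U_ENT: True}],
#      [{B_ENT: True}, {L_ENT: True}],
#      [{B_ENT: True}, {I_ENT: True}, {L_ENT: True}]]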


def get_matches(matcher, pattern_ids, doc):
    matches = []
    for label, start, end in matcher(doc):
        candidate = doc[start : end]
        # The matcher over-generates: the tag sequence only proves that each
        # word occurs in *some* pattern. Filter candidates against the hash
        # set of complete phrases.
        if pattern_ids[hash_string(candidate.text)]:
            # Convert token indices to the character offsets doc.merge expects.
            start = candidate[0].idx
            end = candidate[-1].idx + len(candidate[-1])
            matches.append((start, end, candidate.root.tag_, candidate.text))
    return matches
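# hash_string hashes the candidate's text the same way the gazetteer phrases
# are hashed in main(), so membership testing is a single PreshMap lookup.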


def merge_matches(doc, matches):
    # Merge each matched span into a single token, labelled as an 'MWE' entity.
    for start, end, tag, text in matches:
        doc.merge(start, end, tag, text, 'MWE')


def main():
    nlp = English(parser=False, tagger=False, entity=False)

    gazetteer = [u'M.I.A.', u'Shiny Happy People', u'James E. Jones']
    example_text = u'The artist M.I.A. did a cover of Shiny Happy People. People is not an entity.'
    pattern_ids = PreshMap()
    max_length = 0
    for pattern_str in gazetteer:
        pattern = nlp.tokenizer(pattern_str)
        bilou_tags = get_bilou(len(pattern))
        # Flag each word of the phrase in the vocabulary, and record the
        # complete phrase in the hash set.
        for word, tag in zip(pattern, bilou_tags):
            lexeme = nlp.vocab[word.orth]
            lexeme.set_flag(tag, True)
        pattern_ids[hash_string(pattern.text)] = True
        max_length = max(max_length, len(pattern))
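    # Note that the flags live on vocabulary entries, not on any one document,
    # so once set they apply to every document processed with the same vocab.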

    matcher = make_matcher(nlp.vocab, max_length)

    doc = nlp(example_text)
    matches = get_matches(matcher, pattern_ids, doc)
    merge_matches(doc, matches)
    for token in doc:
        print(token.text, token.ent_type_)


if __name__ == '__main__':
    plac.call(main)
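# Expected behaviour, assuming a contemporary spaCy 0.x install with its data:
# 'M.I.A.' and 'Shiny Happy People' should each be merged into a single token
# whose ent_type_ is 'MWE', while the standalone 'People' is left untouched.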