mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	Update Matcher docstrings and API docs
This commit is contained in:
		
							parent
							
								
									c8580da686
								
							
						
					
					
						commit
						fe5d8819ea
					
				|  | @ -159,14 +159,14 @@ def _convert_strings(token_specs, string_store): | |||
| 
 | ||||
| 
 | ||||
| def merge_phrase(matcher, doc, i, matches): | ||||
|     '''Callback to merge a phrase on match''' | ||||
|     """Callback to merge a phrase on match.""" | ||||
|     ent_id, label, start, end = matches[i] | ||||
|     span = doc[start : end] | ||||
|     span.merge(ent_type=label, ent_id=ent_id) | ||||
| 
 | ||||
| 
 | ||||
| cdef class Matcher: | ||||
|     '''Match sequences of tokens, based on pattern rules.''' | ||||
|     """Match sequences of tokens, based on pattern rules.""" | ||||
|     cdef Pool mem | ||||
|     cdef vector[TokenPatternC*] patterns | ||||
|     cdef readonly Vocab vocab | ||||
|  | @ -175,37 +175,13 @@ cdef class Matcher: | |||
|     cdef public object _callbacks | ||||
|     cdef public object _acceptors | ||||
| 
 | ||||
|     @classmethod | ||||
|     def load(cls, path, vocab): | ||||
|         """ | ||||
|         Load the matcher and patterns from a file path. | ||||
| 
 | ||||
|         Arguments: | ||||
|             path (Path): | ||||
|                 Path to a JSON-formatted patterns file. | ||||
|             vocab (Vocab): | ||||
|                 The vocabulary that the documents to match over will refer to. | ||||
|         Returns: | ||||
|             Matcher: The newly constructed object. | ||||
|         """ | ||||
|         if (path / 'gazetteer.json').exists(): | ||||
|             with (path / 'gazetteer.json').open('r', encoding='utf8') as file_: | ||||
|                 patterns = ujson.load(file_) | ||||
|         else: | ||||
|             patterns = {} | ||||
|         return cls(vocab, patterns) | ||||
| 
 | ||||
|     def __init__(self, vocab, patterns={}): | ||||
|         """ | ||||
|         Create the Matcher. | ||||
|         """Create the Matcher. | ||||
| 
 | ||||
|         Arguments: | ||||
|             vocab (Vocab): | ||||
|                 The vocabulary object, which must be shared with the documents | ||||
|                 the matcher will operate on. | ||||
|             patterns (dict): Patterns to add to the matcher. | ||||
|         Returns: | ||||
|             The newly constructed object. | ||||
|         vocab (Vocab): The vocabulary object, which must be shared with the | ||||
|             documents the matcher will operate on. | ||||
|         patterns (dict): Patterns to add to the matcher. | ||||
|         RETURNS (Matcher): The newly constructed object. | ||||
|         """ | ||||
|         self._patterns = {} | ||||
|         self._entities = {} | ||||
|  | @ -226,22 +202,15 @@ cdef class Matcher: | |||
| 
 | ||||
|     def add_entity(self, entity_key, attrs=None, if_exists='raise', | ||||
|                    acceptor=None, on_match=None): | ||||
|         """ | ||||
|         Add an entity to the matcher. | ||||
|         # TODO: replace with new Matcher.add() | ||||
|         """Add an entity to the matcher. | ||||
| 
 | ||||
|         Arguments: | ||||
|             entity_key (unicode or int): | ||||
|                 An ID for the entity. | ||||
|             attrs: | ||||
|                 Attributes to associate with the Matcher. | ||||
|             if_exists ('raise', 'ignore' or 'update'): | ||||
|                 Controls what happens if the entity ID already exists. Defaults to 'raise'. | ||||
|             acceptor: | ||||
|                 Callback function to filter matches of the entity. | ||||
|             on_match: | ||||
|                 Callback function to act on matches of the entity. | ||||
|         Returns: | ||||
|             None | ||||
|         entity_key (unicode or int): An ID for the entity. | ||||
|         attrs (dict): Attributes to associate with the `Matcher`. | ||||
|         if_exists (unicode): `'raise'`, `'ignore'` or `'update'`. Controls what | ||||
|             happens if the entity ID already exists. Defaults to `'raise'`. | ||||
|         acceptor (function): Callback function to filter matches of the entity. | ||||
|         on_match (function): Callback function to act on matches of the entity. | ||||
|         """ | ||||
|         if if_exists not in ('raise', 'ignore', 'update'): | ||||
|             raise ValueError( | ||||
|  | @ -264,18 +233,12 @@ cdef class Matcher: | |||
|         self._callbacks[entity_key] = on_match | ||||
| 
 | ||||
|     def add_pattern(self, entity_key, token_specs, label=""): | ||||
|         """ | ||||
|         Add a pattern to the matcher. | ||||
|         # TODO: replace with new Matcher.add() | ||||
|         """Add a pattern to the matcher. | ||||
| 
 | ||||
|         Arguments: | ||||
|             entity_key (unicode or int): | ||||
|                 An ID for the entity. | ||||
|             token_specs: | ||||
|                 Description of the pattern to be matched. | ||||
|             label: | ||||
|                 Label to assign to the matched pattern. Defaults to "". | ||||
|         Returns: | ||||
|             None | ||||
|         entity_key (unicode or int): An ID for the entity. | ||||
|         token_specs (list): Description of the pattern to be matched. | ||||
|         label (unicode): Label to assign to the matched pattern. Defaults to `""`. | ||||
|         """ | ||||
|         token_specs = list(token_specs) | ||||
|         if len(token_specs) == 0: | ||||
|  | @ -296,6 +259,7 @@ cdef class Matcher: | |||
|         self._patterns[entity_key].append((label, token_specs)) | ||||
| 
 | ||||
|     def add(self, entity_key, label, attrs, specs, acceptor=None, on_match=None): | ||||
|         # TODO: replace with new Matcher.add() | ||||
|         self.add_entity(entity_key, attrs=attrs, if_exists='update', | ||||
|                         acceptor=acceptor, on_match=on_match) | ||||
|         for spec in specs: | ||||
|  | @ -308,25 +272,21 @@ cdef class Matcher: | |||
|             return entity_key | ||||
| 
 | ||||
|     def has_entity(self, entity_key): | ||||
|         """ | ||||
|         Check whether the matcher has an entity. | ||||
|         # TODO: deprecate | ||||
|         """Check whether the matcher has an entity. | ||||
| 
 | ||||
|         Arguments: | ||||
|             entity_key (string or int): The entity key to check. | ||||
|         Returns: | ||||
|             bool: Whether the matcher has the entity. | ||||
|         entity_key (unicode or int): The entity key to check. | ||||
|         RETURNS (bool): Whether the matcher has the entity. | ||||
|         """ | ||||
|         entity_key = self.normalize_entity_key(entity_key) | ||||
|         return entity_key in self._entities | ||||
| 
 | ||||
|     def get_entity(self, entity_key): | ||||
|         """ | ||||
|         Retrieve the attributes stored for an entity. | ||||
|         # TODO: deprecate | ||||
|         """Retrieve the attributes stored for an entity. | ||||
| 
 | ||||
|         Arguments: | ||||
|             entity_key (unicode or int): The entity to retrieve. | ||||
|         Returns: | ||||
|             The entity attributes if present, otherwise None. | ||||
|         entity_key (unicode or int): The entity to retrieve. | ||||
|         RETURNS (dict): The entity attributes if present, otherwise None. | ||||
|         """ | ||||
|         entity_key = self.normalize_entity_key(entity_key) | ||||
|         if entity_key in self._entities: | ||||
|  | @ -335,17 +295,12 @@ cdef class Matcher: | |||
|             return None | ||||
| 
 | ||||
|     def __call__(self, Doc doc, acceptor=None): | ||||
|         """ | ||||
|         Find all token sequences matching the supplied patterns on the Doc. | ||||
|         """Find all token sequences matching the supplied patterns on the `Doc`. | ||||
| 
 | ||||
|         Arguments: | ||||
|             doc (Doc): | ||||
|                 The document to match over. | ||||
|         Returns: | ||||
|             list | ||||
|             A list of (entity_key, label_id, start, end) tuples, | ||||
|             describing the matches. A match tuple describes a span doc[start:end]. | ||||
|             The label_id and entity_key are both integers. | ||||
|         doc (Doc): The document to match over. | ||||
|         RETURNS (list): A list of `(entity_key, label_id, start, end)` tuples, | ||||
|             describing the matches. A match tuple describes a span | ||||
|             `doc[start:end]`. The `label_id` and `entity_key` are both integers. | ||||
|         """ | ||||
|         if acceptor is not None: | ||||
|             raise ValueError( | ||||
|  | @ -449,18 +404,13 @@ cdef class Matcher: | |||
|         return matches | ||||
| 
 | ||||
|     def pipe(self, docs, batch_size=1000, n_threads=2): | ||||
|         """ | ||||
|         Match a stream of documents, yielding them in turn. | ||||
|         """Match a stream of documents, yielding them in turn. | ||||
| 
 | ||||
|         Arguments: | ||||
|             docs: A stream of documents. | ||||
|             batch_size (int): | ||||
|                 The number of documents to accumulate into a working set. | ||||
|             n_threads (int): | ||||
|                 The number of threads with which to work on the buffer in parallel, | ||||
|                 if the Matcher implementation supports multi-threading. | ||||
|         Yields: | ||||
|             Doc Documents, in order. | ||||
|         docs (iterable): A stream of documents. | ||||
|         batch_size (int): The number of documents to accumulate into a working set. | ||||
|         n_threads (int): The number of threads with which to work on the buffer | ||||
|             in parallel, if the `Matcher` implementation supports multi-threading. | ||||
|         YIELDS (Doc): Documents, in order. | ||||
|         """ | ||||
|         for doc in docs: | ||||
|             self(doc) | ||||
|  |  | |||
|  | @ -4,31 +4,26 @@ include ../../_includes/_mixins | |||
| 
 | ||||
| p Match sequences of tokens, based on pattern rules. | ||||
| 
 | ||||
| +h(2, "load") Matcher.load | ||||
|     +tag classmethod | ||||
| 
 | ||||
| p Load the matcher and patterns from a file path. | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code path] | ||||
|         +cell #[code Path] | ||||
|         +cell Path to a JSON-formatted patterns file. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code vocab] | ||||
|         +cell #[code Vocab] | ||||
|         +cell The vocabulary that the documents to match over will refer to. | ||||
| 
 | ||||
|     +footrow | ||||
|         +cell returns | ||||
|         +cell #[code Matcher] | ||||
|         +cell The newly constructed object. | ||||
| +infobox("⚠️ Deprecation note") | ||||
|     |  As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity] | ||||
|     |  are deprecated and have been replaced with a simpler | ||||
|     |  #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of | ||||
|     |  patterns and a callback for a given match ID. | ||||
|     |  #[code Matcher.load] (not useful, as it didn't allow specifying callbacks), | ||||
|     |  #[code Matcher.has_entity] and #[code Matcher.get_entity] (now redundant) | ||||
|     |  have been removed. | ||||
| 
 | ||||
| +h(2, "init") Matcher.__init__ | ||||
|     +tag method | ||||
| 
 | ||||
| p Create the Matcher. | ||||
| p Create the rule-based #[code Matcher]. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     from spacy.matcher import Matcher | ||||
|     from spacy.attrs import LOWER | ||||
| 
 | ||||
|     patterns = {"HelloWorld": [{LOWER: "hello"}, {LOWER: "world"}]} | ||||
|     matcher = Matcher(nlp.vocab) | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|  | @ -41,7 +36,7 @@ p Create the Matcher. | |||
|     +row | ||||
|         +cell #[code patterns] | ||||
|         +cell dict | ||||
|         +cell Patterns to add to the matcher. | ||||
|         +cell Patterns to add to the matcher, keyed by ID. | ||||
| 
 | ||||
|     +footrow | ||||
|         +cell returns | ||||
|  | @ -51,7 +46,28 @@ p Create the Matcher. | |||
| +h(2, "call") Matcher.__call__ | ||||
|     +tag method | ||||
| 
 | ||||
| p Find all token sequences matching the supplied patterns on the Doc. | ||||
| p Find all token sequences matching the supplied patterns on the #[code Doc]. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     from spacy.matcher import Matcher | ||||
|     from spacy.attrs import LOWER | ||||
| 
 | ||||
|     matcher = Matcher(nlp.vocab) | ||||
|     pattern = [{LOWER: "hello"}, {LOWER: "world"}] | ||||
|     matcher.add("HelloWorld", pattern, on_match=None) | ||||
|     doc = nlp(u'hello world!') | ||||
|     matches = matcher(doc) | ||||
| 
 | ||||
| +infobox("Important note") | ||||
|     |  By default, the matcher #[strong does not perform any action] on matches, | ||||
|     |  like tagging matched phrases with entity types. Instead, actions need to | ||||
|     |  be specified when #[strong adding patterns or entities], by | ||||
|     |  passing in a callback function as the #[code on_match] argument on | ||||
|     |  #[+api("matcher#add") #[code add]]. This allows you to define custom | ||||
|     |  actions per pattern within the same matcher. For example, you might only | ||||
|     |  want to merge some entity types, and set custom flags for other matched | ||||
|     |  patterns. For more details and examples, see the usage workflow on | ||||
|     |  #[+a("/docs/usage/rule-based-matching") rule-based matching]. | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|  | @ -76,7 +92,7 @@ p Match a stream of documents, yielding them in turn. | |||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code docs] | ||||
|         +cell - | ||||
|         +cell iterable | ||||
|         +cell A stream of documents. | ||||
| 
 | ||||
|     +row | ||||
|  | @ -97,83 +113,44 @@ p Match a stream of documents, yielding them in turn. | |||
|         +cell #[code Doc] | ||||
|         +cell Documents, in order. | ||||
| 
 | ||||
| +h(2, "add_entity") Matcher.add_entity | ||||
| +h(2, "add") Matcher.add | ||||
|     +tag method | ||||
| 
 | ||||
| p Add an entity to the matcher. | ||||
| p | ||||
|     |  Add one or more patterns to the matcher, along with a callback function | ||||
|     |  to handle the matches. The callback function will receive the arguments | ||||
|     |  #[code matcher], #[code doc], #[code id] and #[code matches]. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     from spacy.matcher import Matcher | ||||
|     from spacy.attrs import LOWER, ORTH | ||||
| 
 | ||||
|     def on_match(matcher, doc, id, matches): | ||||
|         print('Matched!', matches) | ||||
| 
 | ||||
|     matcher = Matcher(nlp.vocab) | ||||
|     matcher.add('HelloWorld', [{LOWER: "hello"}, {LOWER: "world"}], on_match=on_match) | ||||
|     matcher.add('GoogleMaps', [{ORTH: "Google"}, {ORTH: "Maps"}], on_match=on_match) | ||||
| 
 | ||||
|     doc = nlp(u'HELLO WORLD on Google Maps.') | ||||
|     matches = matcher(doc) | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code entity_key] | ||||
|         +cell unicode / int | ||||
|         +cell An ID for the entity. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code attrs] | ||||
|         +cell - | ||||
|         +cell Attributes to associate with the Matcher. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code if_exists] | ||||
|         +cell #[code match_id] | ||||
|         +cell unicode | ||||
|         +cell | ||||
|             |  #[code 'raise'], #[code 'ignore'] or #[code 'update']. Controls | ||||
|             |  what happens if the entity ID already exists. Defaults to | ||||
|             |  #[code 'raise']. | ||||
|         +cell An ID for the thing you're matching. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code acceptor] | ||||
|         +cell - | ||||
|         +cell Callback function to filter matches of the entity. | ||||
|         +cell #[code *patterns] | ||||
|         +cell list | ||||
|         +cell | ||||
|             |  Match pattern. A pattern consists of a list of dicts, where each | ||||
|             |  dict describes a token. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code on_match] | ||||
|         +cell - | ||||
|         +cell Callback function to act on matches of the entity. | ||||
| 
 | ||||
|     +footrow | ||||
|         +cell returns | ||||
|         +cell #[code None] | ||||
|         +cell - | ||||
| 
 | ||||
| +h(2, "add_pattern") Matcher.add_pattern | ||||
|     +tag method | ||||
| 
 | ||||
| p Add a pattern to the matcher. | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code entity_key] | ||||
|         +cell unicode / int | ||||
|         +cell An ID for the entity. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code token_specs] | ||||
|         +cell - | ||||
|         +cell Description of the pattern to be matched. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code label] | ||||
|         +cell unicode / int | ||||
|         +cell Label to assign to the matched pattern. Defaults to #[code ""]. | ||||
| 
 | ||||
|     +footrow | ||||
|         +cell returns | ||||
|         +cell #[code None] | ||||
|         +cell - | ||||
| 
 | ||||
| +h(2, "has_entity") Matcher.has_entity | ||||
|     +tag method | ||||
| 
 | ||||
| p Check whether the matcher has an entity. | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code entity_key] | ||||
|         +cell unicode / int | ||||
|         +cell The entity key to check. | ||||
| 
 | ||||
|     +footrow | ||||
|         +cell returns | ||||
|         +cell bool | ||||
|         +cell Whether the matcher has the entity. | ||||
|         +cell function | ||||
|         +cell | ||||
|             |  Callback function to act on matches. Takes the arguments | ||||
|             |  #[code matcher], #[code doc], #[code id] and #[code matches]. | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user