Update Matcher docstrings and API docs

2025-10-31 07:57:35 +03:00 · 2017-05-19 21:47:06 +02:00 · 2017-05-19 21:47:06 +02:00 · fe5d8819ea
commit fe5d8819ea
parent c8580da686
2 changed files with 110 additions and 183 deletions
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -159,14 +159,14 @@ def _convert_strings(token_specs, string_store):


 def merge_phrase(matcher, doc, i, matches):
-    '''Callback to merge a phrase on match'''
+    """Callback to merge a phrase on match."""
    ent_id, label, start, end = matches[i]
    span = doc[start : end]
    span.merge(ent_type=label, ent_id=ent_id)


 cdef class Matcher:
-    '''Match sequences of tokens, based on pattern rules.'''
+    """Match sequences of tokens, based on pattern rules."""
    cdef Pool mem
    cdef vector[TokenPatternC*] patterns
    cdef readonly Vocab vocab
@@ -175,37 +175,13 @@ cdef class Matcher:
    cdef public object _callbacks
    cdef public object _acceptors

-    @classmethod
-    def load(cls, path, vocab):
-        """
-        Load the matcher and patterns from a file path.
-
-        Arguments:
-            path (Path):
-                Path to a JSON-formatted patterns file.
-            vocab (Vocab):
-                The vocabulary that the documents to match over will refer to.
-        Returns:
-            Matcher: The newly constructed object.
-        """
-        if (path / 'gazetteer.json').exists():
-            with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
-                patterns = ujson.load(file_)
-        else:
-            patterns = {}
-        return cls(vocab, patterns)
-
    def __init__(self, vocab, patterns={}):
-        """
-        Create the Matcher.
+        """Create the Matcher.

-        Arguments:
-            vocab (Vocab):
-                The vocabulary object, which must be shared with the documents
-                the matcher will operate on.
-            patterns (dict): Patterns to add to the matcher.
-        Returns:
-            The newly constructed object.
+        vocab (Vocab): The vocabulary object, which must be shared with the
+            documents the matcher will operate on.
+        patterns (dict): Patterns to add to the matcher.
+        RETURNS (Matcher): The newly constructed object.
        """
        self._patterns = {}
        self._entities = {}
@@ -226,22 +202,15 @@ cdef class Matcher:

    def add_entity(self, entity_key, attrs=None, if_exists='raise',
                   acceptor=None, on_match=None):
-        """
-        Add an entity to the matcher.
+        # TODO: replace with new Matcher.add()
+        """Add an entity to the matcher.

-        Arguments:
-            entity_key (unicode or int):
-                An ID for the entity.
-            attrs:
-                Attributes to associate with the Matcher.
-            if_exists ('raise', 'ignore' or 'update'):
-                Controls what happens if the entity ID already exists. Defaults to 'raise'.
-            acceptor:
-                Callback function to filter matches of the entity.
-            on_match:
-                Callback function to act on matches of the entity.
-        Returns:
-            None
+        entity_key (unicode or int): An ID for the entity.
+        attrs (dict): Attributes to associate with the `Matcher`.
+        if_exists (unicode): `'raise'`, `'ignore'` or `'update'`. Controls what
+            happens if the entity ID already exists. Defaults to `'raise'`.
+        acceptor (function): Callback function to filter matches of the entity.
+        on_match (function): Callback function to act on matches of the entity.
        """
        if if_exists not in ('raise', 'ignore', 'update'):
            raise ValueError(
@@ -264,18 +233,12 @@ cdef class Matcher:
        self._callbacks[entity_key] = on_match

    def add_pattern(self, entity_key, token_specs, label=""):
-        """
-        Add a pattern to the matcher.
+        # TODO: replace with new Matcher.add()
+        """Add a pattern to the matcher.

-        Arguments:
-            entity_key (unicode or int):
-                An ID for the entity.
-            token_specs:
-                Description of the pattern to be matched.
-            label:
-                Label to assign to the matched pattern. Defaults to "".
-        Returns:
-            None
+        entity_key (unicode or int): An ID for the entity.
+        token_specs (list): Description of the pattern to be matched.
+        label (unicode): Label to assign to the matched pattern. Defaults to `""`.
        """
        token_specs = list(token_specs)
        if len(token_specs) == 0:
@@ -296,6 +259,7 @@ cdef class Matcher:
        self._patterns[entity_key].append((label, token_specs))

    def add(self, entity_key, label, attrs, specs, acceptor=None, on_match=None):
+        # TODO: replace with new Matcher.add()
        self.add_entity(entity_key, attrs=attrs, if_exists='update',
                        acceptor=acceptor, on_match=on_match)
        for spec in specs:
@@ -308,25 +272,21 @@ cdef class Matcher:
            return entity_key

    def has_entity(self, entity_key):
-        """
-        Check whether the matcher has an entity.
+        # TODO: deprecate
+        """Check whether the matcher has an entity.

-        Arguments:
-            entity_key (string or int): The entity key to check.
-        Returns:
-            bool: Whether the matcher has the entity.
+        entity_key (unicode or int): The entity key to check.
+        RETURNS (bool): Whether the matcher has the entity.
        """
        entity_key = self.normalize_entity_key(entity_key)
        return entity_key in self._entities

    def get_entity(self, entity_key):
-        """
-        Retrieve the attributes stored for an entity.
+        # TODO: deprecate
+        """Retrieve the attributes stored for an entity.

-        Arguments:
-            entity_key (unicode or int): The entity to retrieve.
-        Returns:
-            The entity attributes if present, otherwise None.
+        entity_key (unicode or int): The entity to retrieve.
+        RETURNS (dict): The entity attributes if present, otherwise None.
        """
        entity_key = self.normalize_entity_key(entity_key)
        if entity_key in self._entities:
@@ -335,17 +295,12 @@ cdef class Matcher:
            return None

    def __call__(self, Doc doc, acceptor=None):
-        """
-        Find all token sequences matching the supplied patterns on the Doc.
+        """Find all token sequences matching the supplied patterns on the `Doc`.

-        Arguments:
-            doc (Doc):
-                The document to match over.
-        Returns:
-            list
-            A list of (entity_key, label_id, start, end) tuples,
-            describing the matches. A match tuple describes a span doc[start:end].
-            The label_id and entity_key are both integers.
+        doc (Doc): The document to match over.
+        RETURNS (list): A list of `(entity_key, label_id, start, end)` tuples,
+            describing the matches. A match tuple describes a span
+            `doc[start:end]`. The `label_id` and `entity_key` are both integers.
        """
        if acceptor is not None:
            raise ValueError(
@@ -449,18 +404,13 @@ cdef class Matcher:
        return matches

    def pipe(self, docs, batch_size=1000, n_threads=2):
-        """
-        Match a stream of documents, yielding them in turn.
+        """Match a stream of documents, yielding them in turn.

-        Arguments:
-            docs: A stream of documents.
-            batch_size (int):
-                The number of documents to accumulate into a working set.
-            n_threads (int):
-                The number of threads with which to work on the buffer in parallel,
-                if the Matcher implementation supports multi-threading.
-        Yields:
-            Doc Documents, in order.
+        docs (iterable): A stream of documents.
+        batch_size (int): The number of documents to accumulate into a working set.
+        n_threads (int): The number of threads with which to work on the buffer
+            in parallel, if the `Matcher` implementation supports multi-threading.
+        YIELDS (Doc): Documents, in order.
        """
        for doc in docs:
            self(doc)
--- a/website/docs/api/matcher.jade
+++ b/website/docs/api/matcher.jade
@@ -4,31 +4,26 @@ include ../../_includes/_mixins

 p Match sequences of tokens, based on pattern rules.

-+h(2, "load") Matcher.load
-    +tag classmethod
-
-p Load the matcher and patterns from a file path.
-
-+table(["Name", "Type", "Description"])
-    +row
-        +cell #[code path]
-        +cell #[code Path]
-        +cell Path to a JSON-formatted patterns file.
-
-    +row
-        +cell #[code vocab]
-        +cell #[code Vocab]
-        +cell The vocabulary that the documents to match over will refer to.
-
-    +footrow
-        +cell returns
-        +cell #[code Matcher]
-        +cell The newly constructed object.
++infobox("⚠️ Deprecation note")
+    |  As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
+    |  are deprecated and have been replaced with a simpler
+    |  #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
+    |  patterns and a callback for a given match ID.
+    |  #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
+    |  #[code Matcher.has_entity] and #[code Matcher.get_entity] (now redundant)
+    |  have been removed.

 +h(2, "init") Matcher.__init__
    +tag method

-p Create the Matcher.
+p Create the rule-based #[code Matcher].
+
++aside-code("Example").
+    from spacy.matcher import Matcher
+    from spacy.attrs import LOWER
+
+    patterns = {"HelloWorld": [{LOWER: "hello"}, {LOWER: "world"}]}
+    matcher = Matcher(nlp.vocab)

 +table(["Name", "Type", "Description"])
    +row
@@ -41,7 +36,7 @@ p Create the Matcher.
    +row
        +cell #[code patterns]
        +cell dict
-        +cell Patterns to add to the matcher.
+        +cell Patterns to add to the matcher, keyed by ID.

    +footrow
        +cell returns
@@ -51,7 +46,28 @@ p Create the Matcher.
 +h(2, "call") Matcher.__call__
    +tag method

-p Find all token sequences matching the supplied patterns on the Doc.
+p Find all token sequences matching the supplied patterns on the #[code Doc].
+
++aside-code("Example").
+    from spacy.matcher import Matcher
+    from spacy.attrs import LOWER
+
+    matcher = Matcher(nlp.vocab)
+    pattern = [{LOWER: "hello"}, {LOWER: "world"}]
+    matcher.add("HelloWorld", pattern, on_match=None)
+    doc = nlp(u'hello world!')
+    matches = matcher(doc)
+
++infobox("Important note")
+    |  By default, the matcher #[strong does not perform any action] on matches,
+    |  like tagging matched phrases with entity types. Instead, actions need to
+    |  be specified when #[strong adding patterns or entities], by
+    |  passing in a callback function as the #[code on_match] argument on
+    |  #[+api("matcher#add") #[code add]]. This allows you to define custom
+    |  actions per pattern within the same matcher. For example, you might only
+    |  want to merge some entity types, and set custom flags for other matched
+    |  patterns. For more details and examples, see the usage workflow on
+    |  #[+a("/docs/usage/rule-based-matching") rule-based matching].

 +table(["Name", "Type", "Description"])
    +row
@@ -76,7 +92,7 @@ p Match a stream of documents, yielding them in turn.
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code docs]
-        +cell -
+        +cell iterable
        +cell A stream of documents.

    +row
@@ -97,83 +113,44 @@ p Match a stream of documents, yielding them in turn.
        +cell #[code Doc]
        +cell Documents, in order.

-+h(2, "add_entity") Matcher.add_entity
++h(2, "add_pattern") Matcher.add
    +tag method

-p Add an entity to the matcher.
+p
+    |  Add one or more patterns to the matcher, along with a callback function
+    |  to handle the matches. The callback function will receive the arguments
+    |  #[code matcher], #[code doc], #[code id] and #[code matches].
+
+aside-code("Example").
+    from spacy.matcher import Matcher
+    from spacy.attrs import LOWER, ORTH
+
+    def on_match(matcher, doc, id, matches):
+        print('Matched!', matches)
+
+    matcher = Matcher(nlp.vocab)
+    matcher.add('HelloWorld', [{LOWER: "hello"}, {LOWER: "world"}], on_match=on_match)
+    matcher.add('GoogleMaps', [{ORTH: "Google"}, {ORTH: "Maps"}], on_match=on_match)
+
+    doc = nlp(u'HELLO WORLD on Google Maps.')
+    matches = matcher(doc)

 +table(["Name", "Type", "Description"])
    +row
-        +cell #[code entity_key]
-        +cell unicode / int
-        +cell An ID for the entity.
-
-    +row
-        +cell #[code attrs]
-        +cell -
-        +cell Attributes to associate with the Matcher.
-
-    +row
-        +cell #[code if_exists]
+        +cell #[code match_id]
        +cell unicode
-        +cell
-            |  #[code 'raise'], #[code 'ignore'] or #[code 'update']. Controls
-            |  what happens if the entity ID already exists. Defaults to
-            |  #[code 'raise'].
+        +cell An ID for the thing you're matching.

    +row
-        +cell #[code acceptor]
-        +cell -
-        +cell Callback function to filter matches of the entity.
+        +cell #[code *patterns]
+        +cell list
+        +cell
+            |  Match pattern. A pattern consists of a list of dicts, where each
+            |  dict describes a token.

    +row
        +cell #[code on_match]
-        +cell -
-        +cell Callback function to act on matches of the entity.
-
-    +footrow
-        +cell returns
-        +cell #[code None]
-        +cell -
-
-+h(2, "add_pattern") Matcher.add_pattern
-    +tag method
-
-p Add a pattern to the matcher.
-
-+table(["Name", "Type", "Description"])
-    +row
-        +cell #[code entity_key]
-        +cell unicode / int
-        +cell An ID for the entity.
-
-    +row
-        +cell #[code token_specs]
-        +cell -
-        +cell Description of the pattern to be matched.
-
-    +row
-        +cell #[code label]
-        +cell unicode / int
-        +cell Label to assign to the matched pattern. Defaults to #[code ""].
-
-    +footrow
-        +cell returns
-        +cell #[code None]
-        +cell -
-
-+h(2, "has_entity") Matcher.has_entity
-    +tag method
-
-p Check whether the matcher has an entity.
-
-+table(["Name", "Type", "Description"])
-    +row
-        +cell #[code entity_key]
-        +cell unicode / int
-        +cell The entity key to check.
-
-    +footrow
-        +cell returns
-        +cell bool
-        +cell Whether the matcher has the entity.
+        +cell function
+        +cell
+            |  Callback function to act on matches. Takes the arguments
+            |  #[code matcher], #[code doc], #[code id] and #[code matches].