Update docs and fix consistency
parent 7c6854d8d4 · commit d5c78c7a34
spacy/cli/pretrain.py

@@ -35,7 +35,7 @@ def pretrain_cli(
    config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
-    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
+    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
    # fmt: on
):

spacy/matcher/dependencymatcher.pyx

@@ -68,11 +68,11 @@ cdef class DependencyMatcher:
        key (str): The match ID.
        RETURNS (bool): Whether the matcher contains rules for this match ID.
        """
-        return self._normalize_key(key) in self._patterns
+        return self.has_key(key)

-    def validateInput(self, pattern, key):
+    def validate_input(self, pattern, key):
        idx = 0
-        visitedNodes = {}
+        visited_nodes = {}
        for relation in pattern:
            if "PATTERN" not in relation or "SPEC" not in relation:
                raise ValueError(Errors.E098.format(key=key))
@@ -83,7 +83,7 @@ cdef class DependencyMatcher:
                    and "NBOR_NAME" not in relation["SPEC"]
                ):
                    raise ValueError(Errors.E099.format(key=key))
-                visitedNodes[relation["SPEC"]["NODE_NAME"]] = True
+                visited_nodes[relation["SPEC"]["NODE_NAME"]] = True
            else:
                if not(
                    "NODE_NAME" in relation["SPEC"]
@@ -92,22 +92,28 @@ cdef class DependencyMatcher:
                ):
                    raise ValueError(Errors.E100.format(key=key))
                if (
-                    relation["SPEC"]["NODE_NAME"] in visitedNodes
-                    or relation["SPEC"]["NBOR_NAME"] not in visitedNodes
+                    relation["SPEC"]["NODE_NAME"] in visited_nodes
+                    or relation["SPEC"]["NBOR_NAME"] not in visited_nodes
                ):
                    raise ValueError(Errors.E101.format(key=key))
-                visitedNodes[relation["SPEC"]["NODE_NAME"]] = True
-                visitedNodes[relation["SPEC"]["NBOR_NAME"]] = True
+                visited_nodes[relation["SPEC"]["NODE_NAME"]] = True
+                visited_nodes[relation["SPEC"]["NBOR_NAME"]] = True
            idx = idx + 1

    def add(self, key, patterns, *_patterns, on_match=None):
+        """Add a new matcher rule to the matcher.
+
+        key (str): The match ID.
+        patterns (list): The patterns to add for the given key.
+        on_match (callable): Optional callback executed on match.
+        """
        if patterns is None or hasattr(patterns, "__call__"):  # old API
            on_match = patterns
            patterns = _patterns
        for pattern in patterns:
            if len(pattern) == 0:
                raise ValueError(Errors.E012.format(key=key))
-            self.validateInput(pattern,key)
+            self.validate_input(pattern,key)
        key = self._normalize_key(key)
        _patterns = []
        for pattern in patterns:
@@ -187,8 +193,7 @@ cdef class DependencyMatcher:
        key (string or int): The key to check.
        RETURNS (bool): Whether the matcher has the rule.
        """
-        key = self._normalize_key(key)
-        return key in self._patterns
+        return self._normalize_key(key) in self._patterns

    def get(self, key, default=None):
        """Retrieve the pattern stored for a key.
@@ -202,6 +207,13 @@ cdef class DependencyMatcher:
        return (self._callbacks[key], self._patterns[key])

    def __call__(self, Doc doc):
+        """Find all token sequences matching the supplied pattern.
+
+        doclike (Doc or Span): The document to match over.
+        RETURNS (list): A list of `(key, start, end)` tuples,
+            describing the matches. A match tuple describes a span
+            `doc[start:end]`. The `label_id` and `key` are both integers.
+        """
        matched_key_trees = []
        matches = self.token_matcher(doc)
        for key in list(self._patterns.keys()):
@@ -241,25 +253,25 @@ cdef class DependencyMatcher:
                on_match(self, doc, i, matched_key_trees)
        return matched_key_trees

-    def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visitedNodes,matched_trees):
+    def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visited_nodes,matched_trees):
        cdef bool isValid;
        if(patternLength == len(id_to_position.keys())):
            isValid = True
            for node in range(patternLength):
                if(node in tree):
                    for idx, (relop,nbor) in enumerate(tree[node]):
-                        computed_nbors = numpy.asarray(_node_operator_map[visitedNodes[node]][relop])
+                        computed_nbors = numpy.asarray(_node_operator_map[visited_nodes[node]][relop])
                        isNbor = False
                        for computed_nbor in computed_nbors:
-                            if(computed_nbor.i == visitedNodes[nbor]):
+                            if(computed_nbor.i == visited_nodes[nbor]):
                                isNbor = True
                        isValid = isValid & isNbor
            if(isValid):
-                matched_trees.append(visitedNodes)
+                matched_trees.append(visited_nodes)
            return
        allPatternNodes = numpy.asarray(id_to_position[patternLength])
        for patternNode in allPatternNodes:
-            self.recurse(tree,id_to_position,_node_operator_map,patternLength+1,visitedNodes+[patternNode],matched_trees)
+            self.recurse(tree,id_to_position,_node_operator_map,patternLength+1,visited_nodes+[patternNode],matched_trees)

    # Given a node and an edge operator, to return the list of nodes
    # from the doc that belong to node+operator. This is used to store

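To see what the renamed `validate_input` enforces: the first node may only declare a `NODE_NAME`, and every later node must link via `NBOR_NAME` to a name that has already been visited. A sketch of a pattern that fails the second check (it assumes a loaded `nlp` pipeline; the names are illustrative):

```python
from spacy.matcher import DependencyMatcher

matcher = DependencyMatcher(nlp.vocab)
bad_pattern = [
    {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
    # "NBOR_NAME" points at a node name that was never declared, so
    # validate_input raises ValueError with Errors.E101.
    {"SPEC": {"NODE_NAME": "subject", "NBOR_RELOP": ">", "NBOR_NAME": "missing"},
     "PATTERN": {"DEP": "nsubj"}},
]
matcher.add("FOUNDED", [bad_pattern])  # raises ValueError
```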
spacy/matcher/matcher.pyx

@@ -70,7 +70,7 @@ cdef class Matcher:
        key (str): The match ID.
        RETURNS (bool): Whether the matcher contains rules for this match ID.
        """
-        return self._normalize_key(key) in self._patterns
+        return self.has_key(key)

    def add(self, key, patterns, *, on_match=None, greedy: str=None):
        """Add a match-rule to the matcher. A match-rule consists of: an ID
@@ -162,8 +162,7 @@ cdef class Matcher:
        key (string or int): The key to check.
        RETURNS (bool): Whether the matcher has the rule.
        """
-        key = self._normalize_key(key)
-        return key in self._patterns
+        return self._normalize_key(key) in self._patterns

    def get(self, key, default=None):
        """Retrieve the pattern stored for a key.
@@ -179,7 +178,7 @@ cdef class Matcher:
    def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False):
        """Match a stream of documents, yielding them in turn.

-        docs (iterable): A stream of documents.
+        docs (Iterable[Union[Doc, Span]]): A stream of documents or spans.
        batch_size (int): Number of documents to accumulate into a working set.
        return_matches (bool): Yield the match lists along with the docs, making
            results (doc, matches) tuples.

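The v3 `Matcher.add` signature shown in the first hunk is keyword-only after `patterns`. A sketch of the new call style (the pattern and pipeline are illustrative; per the docs below, `greedy` accepts `"FIRST"` or `"LONGEST"`):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
# on_match and greedy must now be passed as keyword arguments.
matcher.add("HELLO_WORLD", [pattern], greedy="LONGEST")
```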
spacy/pipeline/morphologizer.pyx

@@ -75,8 +75,8 @@ class Morphologizer(Tagger):
        model (thinc.api.Model): The Thinc Model powering the pipeline component.
        name (str): The component instance name, used to add entries to the
            losses during training.
-        labels_morph (dict): TODO:
-        labels_pos (dict): TODO:
+        labels_morph (dict): Mapping of morph + POS tags to morph labels.
+        labels_pos (dict): Mapping of morph + POS tags to POS tags.

        DOCS: https://spacy.io/api/morphologizer#init
        """

website/docs/api/cli.md

@@ -601,9 +601,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides

## Pretrain {#pretrain new="2.1" tag="experimental"}

-<!-- TODO: document new pretrain command and link to new pretraining docs -->
-
-Pre-train the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
+Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
components on [raw text](/api/data-formats#pretrain), using an approximate
language-modeling objective. Specifically, we load pretrained vectors, and train
a component like a CNN, BiLSTM, etc to predict vectors which match the
@@ -611,7 +609,8 @@ pretrained ones. The weights are saved to a directory after each epoch. You can
then include a **path to one of these pretrained weights files** in your
[training config](/usage/training#config) as the `init_tok2vec` setting when you
train your model. This technique may be especially helpful if you have little
-labelled data.
+labelled data. See the usage docs on [pretraining](/usage/training#pretraining)
+for more info.

<Infobox title="Changed in v3.0" variant="warning">

@@ -634,8 +633,8 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path]
| `output_dir`            | positional | Directory to write models to on each epoch.                                                                                                                   |
| `config_path`           | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters.                                                         |
| `--code`, `-c`          | option     | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures.          |
-| `--resume-path`, `-r`   | option     | TODO:                                                                                                                                                         |
-| `--epoch-resume`, `-er` | option     | TODO:                                                                                                                                                         |
+| `--resume-path`, `-r`   | option     | Path to pretrained weights from which to resume pretraining.                                                                                                  |
+| `--epoch-resume`, `-er` | option     | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files.                                       |
| `--help`, `-h`          | flag       | Show help message and available arguments.                                                                                                                    |
| overrides               |            | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. |
| **CREATES**             | weights    | The pretrained weights that can be used to initialize `spacy train`.                                                                                          |

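For reference, the resume options documented in the table combine like this (a sketch; the corpus file, output directory, weights filename, and epoch number are all illustrative):

```bash
python -m spacy pretrain texts.jsonl ./pretrain_output config.cfg \
    --resume-path ./pretrain_output/model4.bin --epoch-resume 5
```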
website/docs/api/data-formats.md

@@ -20,9 +20,9 @@ Config files define the training process and model pipeline and can be passed to
[`spacy train`](/api/cli#train). They use
[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
hood. For details on how to use training configs, see the
-[usage documentation](/usage/training#config).
-
-<!-- TODO: add details on getting started and init config -->
+[usage documentation](/usage/training#config). To get started with a blank
+config or fill a partial config with all defaults, you can use the
+[`init config`](/api/cli#init-config) command.

> #### What does the @ mean?
>
@@ -52,8 +52,6 @@ your config and check that it's valid, you can run the

</Infobox>

-<!-- TODO: once we know how we want to implement "starter config" workflow or outputting a full default config for the user, update this section with the command -->
-
### nlp {#config-nlp tag="section"}

> #### Example
@@ -154,8 +152,6 @@ This section is optional and defines settings and controls for
[language model pretraining](/usage/training#pretraining). It's used when you
run [`spacy pretrain`](/api/cli#pretrain).

-<!-- TODO: complete -->
-
| Name         | Type | Description               | Default |
| ------------ | ---- | ------------------------- | ------- |
| `max_epochs` | int  | Maximum number of epochs. | `1000`  |

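The `[pretraining]` block documented above lives in the same Thinc-style config file as the other sections; a minimal sketch showing only the documented setting with its default:

```ini
[pretraining]
max_epochs = 1000
```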
website/docs/api/dependencymatcher.md

@@ -5,4 +5,194 @@ tag: class
source: spacy/matcher/dependencymatcher.pyx
---

-TODO: write
+The `DependencyMatcher` follows the same API as the [`Matcher`](/api/matcher)
+and [`PhraseMatcher`](/api/phrasematcher) and lets you match on dependency trees
+using the
+[Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html).
+It requires a pretrained [`DependencyParser`](/api/parser) or other component
+that sets the `Token.dep` attribute.
+
+## Pattern format {#patterns}
+
+> ```json
+> ### Example
+> [
+>   {
+>     "SPEC": {"NODE_NAME": "founded"},
+>     "PATTERN": {"ORTH": "founded"}
+>   },
+>   {
+>     "SPEC": {
+>       "NODE_NAME": "founder",
+>       "NBOR_RELOP": ">",
+>       "NBOR_NAME": "founded"
+>     },
+>     "PATTERN": {"DEP": "nsubj"}
+>   },
+>   {
+>     "SPEC": {
+>       "NODE_NAME": "object",
+>       "NBOR_RELOP": ">",
+>       "NBOR_NAME": "founded"
+>     },
+>     "PATTERN": {"DEP": "dobj"}
+>   }
+> ]
+> ```
+
+A pattern added to the `DependencyMatcher` consists of a list of dictionaries,
+with each dictionary describing a node to match. Each pattern should have the
+following top-level keys:
+
+| Name      | Type | Description |
+| --------- | ---- | ----------- |
+| `PATTERN` | dict | The token attributes to match in the same format as patterns provided to the regular token-based [`Matcher`](/api/matcher). |
+| `SPEC`    | dict | The relationships of the nodes in the subtree that should be matched. |
+
+The `SPEC` includes the following fields:
+
+| Name         | Type | Description |
+| ------------ | ---- | ----------- |
+| `NODE_NAME`  | str  | A unique name for this node to refer to it in other specs. |
+| `NBOR_RELOP` | str  | A [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html) operator that describes how the two nodes are related. |
+| `NBOR_NAME`  | str  | The unique name of the node that this node is connected to. |
+
+## DependencyMatcher.\_\_init\_\_ {#init tag="method"}
+
+Create a rule-based `DependencyMatcher`.
+
+> #### Example
+>
+> ```python
+> from spacy.matcher import DependencyMatcher
+> matcher = DependencyMatcher(nlp.vocab)
+> ```
+
+| Name    | Type    | Description |
+| ------- | ------- | ----------- |
+| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. |
+
+## DependencyMatcher.\_\_call\_\_ {#call tag="method"}
+
+Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
+
+> #### Example
+>
+> ```python
+> from spacy.matcher import DependencyMatcher
+>
+> matcher = DependencyMatcher(nlp.vocab)
+> pattern = [
+>     {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
+>     {"SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
+> ]
+> matcher.add("Founder", [pattern])
+> doc = nlp("Bill Gates founded Microsoft.")
+> matches = matcher(doc)
+> ```
+
+| Name        | Type         | Description |
+| ----------- | ------------ | ----------- |
+| `doclike`   | `Doc`/`Span` | The `Doc` or `Span` to match over. |
+| **RETURNS** | list         | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. |
+
+## DependencyMatcher.\_\_len\_\_ {#len tag="method"}
+
+Get the number of rules (edges) added to the dependency matcher. Note that this
+only returns the number of rules (identical with the number of IDs), not the
+number of individual patterns.
+
+> #### Example
+>
+> ```python
+> matcher = DependencyMatcher(nlp.vocab)
+> assert len(matcher) == 0
+> pattern = [
+>     {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
+>     {"SPEC": {"NODE_NAME": "START_ENTITY", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
+> ]
+> matcher.add("Rule", [pattern])
+> assert len(matcher) == 1
+> ```
+
+| Name        | Type | Description          |
+| ----------- | ---- | -------------------- |
+| **RETURNS** | int  | The number of rules. |
+
+## DependencyMatcher.\_\_contains\_\_ {#contains tag="method"}
+
+Check whether the matcher contains rules for a match ID.
+
+> #### Example
+>
+> ```python
+> matcher = DependencyMatcher(nlp.vocab)
+> assert "Rule" not in matcher
+> matcher.add("Rule", [pattern])
+> assert "Rule" in matcher
+> ```
+
+| Name        | Type | Description                                           |
+| ----------- | ---- | ----------------------------------------------------- |
+| `key`       | str  | The match ID.                                         |
+| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
+
+## DependencyMatcher.add {#add tag="method"}
+
+Add a rule to the matcher, consisting of an ID key, one or more patterns, and an
+optional callback function to act on the matches. The callback function will
+receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already
+exists for the given ID, the patterns will be extended. An `on_match` callback
+will be overwritten.
+
+> #### Example
+>
+> ```python
+> def on_match(matcher, doc, id, matches):
+>     print("Matched!", matches)
+>
+> matcher = DependencyMatcher(nlp.vocab)
+> matcher.add("TEST_PATTERNS", patterns)
+> ```
+
+| Name           | Type               | Description |
+| -------------- | ------------------ | ----------- |
+| `match_id`     | str                | An ID for the thing you're matching. |
+| `patterns`     | list               | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
+| _keyword-only_ |                    | |
+| `on_match`     | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
+
+## DependencyMatcher.remove {#remove tag="method"}
+
+Remove a rule from the matcher. A `KeyError` is raised if the match ID does not
+exist.
+
+> #### Example
+>
+> ```python
+> matcher.add("Rule", [pattern])
+> assert "Rule" in matcher
+> matcher.remove("Rule")
+> assert "Rule" not in matcher
+> ```
+
+| Name  | Type | Description               |
+| ----- | ---- | ------------------------- |
+| `key` | str  | The ID of the match rule. |
+
+## DependencyMatcher.get {#get tag="method"}
+
+Retrieve the pattern stored for a key. Returns the rule as an
+`(on_match, patterns)` tuple containing the callback and available patterns.
+
+> #### Example
+>
+> ```python
+> matcher.add("Rule", [pattern], on_match=on_match)
+> on_match, patterns = matcher.get("Rule")
+> ```
+
+| Name        | Type  | Description                                   |
+| ----------- | ----- | --------------------------------------------- |
+| `key`       | str   | The ID of the match rule.                     |
+| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. |

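Taken together, the pattern format and `__call__` documentation above imply usage along these lines (a sketch based on the documented `(match_id, start, end)` return format; `nlp` is assumed to include a dependency parser so `Token.dep` is set):

```python
from spacy.matcher import DependencyMatcher

matcher = DependencyMatcher(nlp.vocab)
pattern = [
    {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
    {"SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"},
     "PATTERN": {"DEP": "nsubj"}},
]
matcher.add("FOUNDED", [pattern])
doc = nlp("Bill Gates founded Microsoft.")
for match_id, start, end in matcher(doc):
    print(doc.vocab.strings[match_id], doc[start:end].text)
```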
website/docs/api/matcher.md

@@ -5,6 +5,82 @@ tag: class
source: spacy/matcher/matcher.pyx
---

+The `Matcher` lets you find words and phrases using rules describing their token
+attributes. Rules can refer to token annotations (like the text or
+part-of-speech tags), as well as lexical attributes like `Token.is_punct`.
+Applying the matcher to a [`Doc`](/api/doc) gives you access to the matched
+tokens in context. For in-depth examples and workflows for combining rules and
+statistical models, see the [usage guide](/usage/rule-based-matching) on
+rule-based matching.
+
+## Pattern format {#patterns}
+
+> ```json
+> ### Example
+> [
+>   {"LOWER": "i"},
+>   {"LEMMA": {"IN": ["like", "love"]}},
+>   {"POS": "NOUN", "OP": "+"}
+> ]
+> ```
+
+A pattern added to the `Matcher` consists of a list of dictionaries. Each
+dictionary describes **one token** and its attributes. The available token
+pattern keys correspond to a number of
+[`Token` attributes](/api/token#attributes). The supported attributes for
+rule-based matching are:
+
+| Attribute                             | Type | Description |
+| ------------------------------------- | ---- | ----------- |
+| `ORTH`                                | str  | The exact verbatim text of a token. |
+| `TEXT` <Tag variant="new">2.1</Tag>   | str  | The exact verbatim text of a token. |
+| `LOWER`                               | str  | The lowercase form of the token text. |
+| `LENGTH`                              | int  | The length of the token text. |
+| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`    | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
+| `IS_LOWER`, `IS_UPPER`, `IS_TITLE`    | bool | Token text is in lowercase, uppercase, titlecase. |
+| `IS_PUNCT`, `IS_SPACE`, `IS_STOP`     | bool | Token is punctuation, whitespace, stop word. |
+| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`  | bool | Token text resembles a number, URL, email. |
+| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str  | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. |
+| `ENT_TYPE`                            | str  | The token's entity label. |
+| `_` <Tag variant="new">2.1</Tag>      | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). |
+| `OP`                                  | str  | Operator or quantifier to determine how often to match a token pattern. |
+
+Operators and quantifiers define **how often** a token pattern should be
+matched:
+
+> ```json
+> ### Example
+> [
+>   {"POS": "ADJ", "OP": "*"},
+>   {"POS": "NOUN", "OP": "+"}
+> ]
+> ```
+
+| OP  | Description                                                      |
+| --- | ---------------------------------------------------------------- |
+| `!` | Negate the pattern, by requiring it to match exactly 0 times.    |
+| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
+| `+` | Require the pattern to match 1 or more times.                    |
+| `*` | Allow the pattern to match zero or more times.                   |
+
+Token patterns can also map to a **dictionary of properties** instead of a
+single value to indicate whether the expected value is a member of a list or how
+it compares to another value.
+
+> ```json
+> ### Example
+> [
+>   {"LEMMA": {"IN": ["like", "love", "enjoy"]}},
+>   {"POS": "PROPN", "LENGTH": {">=": 10}}
+> ]
+> ```
+
+| Attribute                  | Type       | Description |
+| -------------------------- | ---------- | ----------- |
+| `IN`                       | any        | Attribute value is member of a list. |
+| `NOT_IN`                   | any        | Attribute value is _not_ member of a list. |
+| `==`, `>=`, `<=`, `>`, `<` | int, float | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. |
+
## Matcher.\_\_init\_\_ {#init tag="method"}

Create the rule-based `Matcher`. If `validate=True` is set, all patterns added

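Putting the attribute keys, the `OP` quantifiers, and the property dictionaries above together, usage might look like the following sketch (the pattern and text are illustrative, and only lexical attributes are used so no trained components are required):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
pattern = [
    {"LOWER": {"IN": ["hello", "hi"]}},  # property dict: membership check
    {"IS_PUNCT": True, "OP": "?"},       # quantifier: optional punctuation
    {"LOWER": "world"},                  # plain attribute value
]
matcher.add("GREETING", [pattern])
doc = nlp("Hello, world! Hi world!")
for match_id, start, end in matcher(doc):
    print(doc.vocab.strings[match_id], doc[start:end].text)
```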
@@ -60,7 +136,7 @@ Match a stream of documents, yielding them in turn.

| Name                                          | Type     | Description |
| --------------------------------------------- | -------- | ----------- |
-| `docs`                                        | iterable | A stream of documents. |
+| `docs`                                        | iterable | A stream of documents or spans. |
| `batch_size`                                  | int      | The number of documents to accumulate into a working set. |
| `return_matches` <Tag variant="new">2.1</Tag> | bool     | Yield the match lists along with the docs, making results `(doc, matches)` tuples. |
| `as_tuples`                                   | bool     | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. |

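Given `return_matches=True` as documented above, `pipe` yields `(doc, matches)` tuples; a small sketch (reusing the `nlp` and `matcher` from the previous example):

```python
texts = ["Hello, world!", "Hi world!"]
docs = (nlp(text) for text in texts)
for doc, matches in matcher.pipe(docs, batch_size=50, return_matches=True):
    print(doc.text, len(matches))
```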
@@ -105,11 +181,11 @@ Check whether the matcher contains rules for a match ID.

## Matcher.add {#add tag="method" new="2"}

-Add a rule to the matcher, consisting of an ID key, one or more patterns, and a
-callback function to act on the matches. The callback function will receive the
-arguments `matcher`, `doc`, `i` and `matches`. If a pattern already exists for
-the given ID, the patterns will be extended. An `on_match` callback will be
-overwritten.
+Add a rule to the matcher, consisting of an ID key, one or more patterns, and an
+optional callback function to act on the matches. The callback function will
+receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already
+exists for the given ID, the patterns will be extended. An `on_match` callback
+will be overwritten.

> #### Example
>
@ -141,12 +217,13 @@ patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
|
|||
|
||||
</Infobox>
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------------------ | --------------------------------------------------------------------------------------------- |
|
||||
| `match_id` | str | An ID for the thing you're matching. |
|
||||
| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
|
||||
| _keyword-only_ | | |
|
||||
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
||||
| Name | Type | Description |
|
||||
| ----------------------------------- | ------------------ | --------------------------------------------------------------------------------------------- |
|
||||
| `match_id` | str | An ID for the thing you're matching. |
|
||||
| `patterns` | `List[List[dict]]` | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
|
||||
| _keyword-only_ | | |
|
||||
| `on_match` | callable / `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
||||
| `greedy` <Tag variant="new">3</Tag> | str | Optional filter for greedy matches. Can either be `"FIRST"` or `"LONGEST"`. |
|
||||
|
||||
## Matcher.remove {#remove tag="method" new="2"}
|
||||
|
||||
|
|
|
website/docs/api/morphologizer.md

@@ -63,16 +63,14 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).

-<!-- TODO: finish API docs -->
-
| Name           | Type    | Description |
| -------------- | ------- | ----------- |
| `vocab`        | `Vocab` | The shared vocabulary. |
| `model`        | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `name`         | str     | String name of the component instance. Used to add entries to the `losses` during training. |
| _keyword-only_ |         | |
-| `labels_morph` | dict    | |
-| `labels_pos`   | dict    | |
+| `labels_morph` | dict    | Mapping of morph + POS tags to morph labels. |
+| `labels_pos`   | dict    | Mapping of morph + POS tags to POS tags. |

## Morphologizer.\_\_call\_\_ {#call tag="method"}

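As the paragraph above notes, the constructor is rarely called directly; the component is usually added by its string name. A sketch of that route (assuming a v3-style pipeline):

```python
import spacy

nlp = spacy.blank("en")
# Adds the component with its default model and config and returns it.
morphologizer = nlp.add_pipe("morphologizer")
```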
website/docs/api/phrasematcher.md

@@ -9,7 +9,8 @@ new: 2
The `PhraseMatcher` lets you efficiently match large terminology lists. While
the [`Matcher`](/api/matcher) lets you match sequences based on lists of token
descriptions, the `PhraseMatcher` accepts match patterns in the form of `Doc`
-objects.
+objects. See the [usage guide](/usage/rule-based-matching#phrasematcher) for
+examples.

## PhraseMatcher.\_\_init\_\_ {#init tag="method"}

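To make the `Doc`-patterns idea concrete, a sketch of typical terminology matching (the terms and pipeline are illustrative):

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
terms = ["Barack Obama", "Angela Merkel"]
# Patterns are Doc objects; nlp.make_doc only runs the tokenizer, which is cheap.
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("POLITICIANS", patterns)
doc = nlp("Angela Merkel met Barack Obama.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)
```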