Update docs and fix consistency

Ines Montani, 2020-08-09 22:31:52 +02:00
parent 7c6854d8d4
commit d5c78c7a34
10 changed files with 326 additions and 54 deletions

spacy/cli/pretrain.py

@@ -35,7 +35,7 @@ def pretrain_cli(
     config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
     code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
-    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
+    epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     # fmt: on
 ):

spacy/matcher/dependencymatcher.pyx

@@ -68,11 +68,11 @@ cdef class DependencyMatcher:
         key (str): The match ID.
         RETURNS (bool): Whether the matcher contains rules for this match ID.
         """
-        return self._normalize_key(key) in self._patterns
+        return self.has_key(key)

-    def validateInput(self, pattern, key):
+    def validate_input(self, pattern, key):
         idx = 0
-        visitedNodes = {}
+        visited_nodes = {}
         for relation in pattern:
             if "PATTERN" not in relation or "SPEC" not in relation:
                 raise ValueError(Errors.E098.format(key=key))

@@ -83,7 +83,7 @@ cdef class DependencyMatcher:
                     and "NBOR_NAME" not in relation["SPEC"]
                 ):
                     raise ValueError(Errors.E099.format(key=key))
-                visitedNodes[relation["SPEC"]["NODE_NAME"]] = True
+                visited_nodes[relation["SPEC"]["NODE_NAME"]] = True
             else:
                 if not(
                     "NODE_NAME" in relation["SPEC"]

@@ -92,22 +92,28 @@ cdef class DependencyMatcher:
                 ):
                     raise ValueError(Errors.E100.format(key=key))
                 if (
-                    relation["SPEC"]["NODE_NAME"] in visitedNodes
-                    or relation["SPEC"]["NBOR_NAME"] not in visitedNodes
+                    relation["SPEC"]["NODE_NAME"] in visited_nodes
+                    or relation["SPEC"]["NBOR_NAME"] not in visited_nodes
                 ):
                     raise ValueError(Errors.E101.format(key=key))
-                visitedNodes[relation["SPEC"]["NODE_NAME"]] = True
-                visitedNodes[relation["SPEC"]["NBOR_NAME"]] = True
+                visited_nodes[relation["SPEC"]["NODE_NAME"]] = True
+                visited_nodes[relation["SPEC"]["NBOR_NAME"]] = True
             idx = idx + 1

     def add(self, key, patterns, *_patterns, on_match=None):
+        """Add a new matcher rule to the matcher.
+
+        key (str): The match ID.
+        patterns (list): The patterns to add for the given key.
+        on_match (callable): Optional callback executed on match.
+        """
         if patterns is None or hasattr(patterns, "__call__"):  # old API
             on_match = patterns
             patterns = _patterns
         for pattern in patterns:
             if len(pattern) == 0:
                 raise ValueError(Errors.E012.format(key=key))
-            self.validateInput(pattern, key)
+            self.validate_input(pattern, key)
         key = self._normalize_key(key)
         _patterns = []
         for pattern in patterns:

@@ -187,8 +193,7 @@ cdef class DependencyMatcher:
         key (string or int): The key to check.
         RETURNS (bool): Whether the matcher has the rule.
         """
-        key = self._normalize_key(key)
-        return key in self._patterns
+        return self._normalize_key(key) in self._patterns

     def get(self, key, default=None):
         """Retrieve the pattern stored for a key.

@@ -202,6 +207,13 @@ cdef class DependencyMatcher:
         return (self._callbacks[key], self._patterns[key])

     def __call__(self, Doc doc):
+        """Find all token sequences matching the supplied pattern.
+
+        doclike (Doc or Span): The document to match over.
+        RETURNS (list): A list of `(key, start, end)` tuples,
+            describing the matches. A match tuple describes a span
+            `doc[start:end]`. The `label_id` and `key` are both integers.
+        """
         matched_key_trees = []
         matches = self.token_matcher(doc)
         for key in list(self._patterns.keys()):

@@ -241,25 +253,25 @@ cdef class DependencyMatcher:
                 on_match(self, doc, i, matched_key_trees)
         return matched_key_trees

-    def recurse(self, tree, id_to_position, _node_operator_map, int patternLength, visitedNodes, matched_trees):
+    def recurse(self, tree, id_to_position, _node_operator_map, int patternLength, visited_nodes, matched_trees):
         cdef bool isValid;
         if(patternLength == len(id_to_position.keys())):
             isValid = True
             for node in range(patternLength):
                 if(node in tree):
                     for idx, (relop, nbor) in enumerate(tree[node]):
-                        computed_nbors = numpy.asarray(_node_operator_map[visitedNodes[node]][relop])
+                        computed_nbors = numpy.asarray(_node_operator_map[visited_nodes[node]][relop])
                         isNbor = False
                         for computed_nbor in computed_nbors:
-                            if(computed_nbor.i == visitedNodes[nbor]):
+                            if(computed_nbor.i == visited_nodes[nbor]):
                                 isNbor = True
                         isValid = isValid & isNbor
             if(isValid):
-                matched_trees.append(visitedNodes)
+                matched_trees.append(visited_nodes)
             return
         allPatternNodes = numpy.asarray(id_to_position[patternLength])
         for patternNode in allPatternNodes:
-            self.recurse(tree, id_to_position, _node_operator_map, patternLength + 1, visitedNodes + [patternNode], matched_trees)
+            self.recurse(tree, id_to_position, _node_operator_map, patternLength + 1, visited_nodes + [patternNode], matched_trees)

     # Given a node and an edge operator, to return the list of nodes
     # from the doc that belong to node+operator. This is used to store
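
To illustrate the now-consistent key handling: `__contains__` delegates to `has_key`, and keys are normalized in one place, so string and hash-based lookups behave the same. A minimal sketch (the `"FOUNDED"` key and pattern are hypothetical, and an existing `nlp` pipeline is assumed):

```python
from spacy.matcher import DependencyMatcher

matcher = DependencyMatcher(nlp.vocab)
pattern = [{"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}}]
matcher.add("FOUNDED", [pattern])
# __contains__ delegates to has_key, and both normalize the key,
# so the string key and its hash work interchangeably
assert "FOUNDED" in matcher
assert matcher.has_key("FOUNDED")
assert nlp.vocab.strings["FOUNDED"] in matcher
```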

spacy/matcher/matcher.pyx

@@ -70,7 +70,7 @@ cdef class Matcher:
         key (str): The match ID.
         RETURNS (bool): Whether the matcher contains rules for this match ID.
         """
-        return self._normalize_key(key) in self._patterns
+        return self.has_key(key)

     def add(self, key, patterns, *, on_match=None, greedy: str=None):
         """Add a match-rule to the matcher. A match-rule consists of: an ID

@@ -162,8 +162,7 @@ cdef class Matcher:
         key (string or int): The key to check.
         RETURNS (bool): Whether the matcher has the rule.
         """
-        key = self._normalize_key(key)
-        return key in self._patterns
+        return self._normalize_key(key) in self._patterns

     def get(self, key, default=None):
         """Retrieve the pattern stored for a key.

@@ -179,7 +178,7 @@ cdef class Matcher:
     def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False):
         """Match a stream of documents, yielding them in turn.

-        docs (iterable): A stream of documents.
+        docs (Iterable[Union[Doc, Span]]): A stream of documents or spans.
         batch_size (int): Number of documents to accumulate into a working set.
         return_matches (bool): Yield the match lists along with the docs, making
             results (doc, matches) tuples.

spacy/pipeline/morphologizer.pyx

@@ -75,8 +75,8 @@ class Morphologizer(Tagger):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
-        labels_morph (dict): TODO:
-        labels_pos (dict): TODO:
+        labels_morph (dict): Mapping of morph + POS tags to morph labels.
+        labels_pos (dict): Mapping of morph + POS tags to POS tags.

         DOCS: https://spacy.io/api/morphologizer#init
         """

website/docs/api/cli.md

@@ -601,9 +601,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides

 ## Pretrain {#pretrain new="2.1" tag="experimental"}

-<!-- TODO: document new pretrain command and link to new pretraining docs -->
-
-Pre-train the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
+Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
 components on [raw text](/api/data-formats#pretrain), using an approximate
 language-modeling objective. Specifically, we load pretrained vectors, and train
 a component like a CNN, BiLSTM, etc to predict vectors which match the

@@ -611,7 +609,8 @@ pretrained ones. The weights are saved to a directory after each epoch. You can
 then include a **path to one of these pretrained weights files** in your
 [training config](/usage/training#config) as the `init_tok2vec` setting when you
 train your model. This technique may be especially helpful if you have little
-labelled data.
+labelled data. See the usage docs on [pretraining](/usage/training#pretraining)
+for more info.
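
To illustrate the `init_tok2vec` setting mentioned above, a minimal config excerpt; the block name and weights filename here are assumptions for illustration only:

```ini
[training]
init_tok2vec = "pretrain_output/model99.bin"
```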
 <Infobox title="Changed in v3.0" variant="warning">
@@ -634,8 +633,8 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path]

 | `output_dir`            | positional | Directory to write models to on each epoch. |
 | `config_path`           | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
 | `--code`, `-c`          | option     | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
-| `--resume-path`, `-r`   | option     | TODO: |
-| `--epoch-resume`, `-er` | option     | TODO: |
+| `--resume-path`, `-r`   | option     | Path to pretrained weights from which to resume pretraining. |
+| `--epoch-resume`, `-er` | option     | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. |
 | `--help`, `-h`          | flag       | Show help message and available arguments. |
 | overrides               |            | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. |
 | **CREATES**             | weights    | The pretrained weights that can be used to initialize `spacy train`. |
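
For context, a hypothetical invocation that resumes pretraining from a previously saved weights file (all paths and the `model99.bin` filename are invented):

```cli
$ python -m spacy pretrain texts.jsonl ./pretrain_output config.cfg --resume-path ./pretrain_output/model99.bin --epoch-resume 100
```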

website/docs/api/data-formats.md

@@ -20,9 +20,9 @@ Config files define the training process and model pipeline and can be passed to
 [`spacy train`](/api/cli#train). They use
 [Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
 hood. For details on how to use training configs, see the
-[usage documentation](/usage/training#config).
-
-<!-- TODO: add details on getting started and init config -->
+[usage documentation](/usage/training#config). To get started with a blank
+config or fill a partial config with all defaults, you can use the
+[`init config`](/api/cli#init-config) command.

 > #### What does the @ mean?
 >
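
As a rough sketch of that workflow (the exact flags are an assumption; check `python -m spacy init config --help` for the real options):

```cli
$ python -m spacy init config config.cfg --lang en --pipeline tagger,parser
```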
@@ -52,8 +52,6 @@ your config and check that it's valid, you can run the

 </Infobox>

-<!-- TODO: once we know how we want to implement "starter config" workflow or outputting a full default config for the user, update this section with the command -->
-
 ### nlp {#config-nlp tag="section"}

 > #### Example

@@ -154,8 +152,6 @@ This section is optional and defines settings and controls for
 [language model pretraining](/usage/training#pretraining). It's used when you
 run [`spacy pretrain`](/api/cli#pretrain).

-<!-- TODO: complete -->
-
 | Name         | Type | Description               | Default |
 | ------------ | ---- | ------------------------- | ------- |
 | `max_epochs` | int  | Maximum number of epochs. | `1000`  |

website/docs/api/dependencymatcher.md

@@ -5,4 +5,194 @@ tag: class
 source: spacy/matcher/dependencymatcher.pyx
 ---

-TODO: write
+The `DependencyMatcher` follows the same API as the [`Matcher`](/api/matcher)
+and [`PhraseMatcher`](/api/phrasematcher) and lets you match on dependency trees
+using the
+[Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html).
+It requires a pretrained [`DependencyParser`](/api/dependencyparser) or other
+component that sets the `Token.dep` attribute.
+
+## Pattern format {#patterns}
+
+> ```json
+> ### Example
+> [
+>   {
+>     "SPEC": {"NODE_NAME": "founded"},
+>     "PATTERN": {"ORTH": "founded"}
+>   },
+>   {
+>     "SPEC": {
+>       "NODE_NAME": "founder",
+>       "NBOR_RELOP": ">",
+>       "NBOR_NAME": "founded"
+>     },
+>     "PATTERN": {"DEP": "nsubj"}
+>   },
+>   {
+>     "SPEC": {
+>       "NODE_NAME": "object",
+>       "NBOR_RELOP": ">",
+>       "NBOR_NAME": "founded"
+>     },
+>     "PATTERN": {"DEP": "dobj"}
+>   }
+> ]
+> ```
+
+A pattern added to the `DependencyMatcher` consists of a list of dictionaries,
+with each dictionary describing a node to match. Each pattern should have the
+following top-level keys:
+
+| Name      | Type | Description |
+| --------- | ---- | ----------- |
+| `PATTERN` | dict | The token attributes to match in the same format as patterns provided to the regular token-based [`Matcher`](/api/matcher). |
+| `SPEC`    | dict | The relationships of the nodes in the subtree that should be matched. |
+
+The `SPEC` includes the following fields:
+
+| Name         | Type | Description |
+| ------------ | ---- | ----------- |
+| `NODE_NAME`  | str  | A unique name for this node to refer to it in other specs. |
+| `NBOR_RELOP` | str  | A [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html) operator that describes how the two nodes are related. |
+| `NBOR_NAME`  | str  | The unique name of the node that this node is connected to. |
+
+## DependencyMatcher.\_\_init\_\_ {#init tag="method"}
+
+Create a rule-based `DependencyMatcher`.
+
+> #### Example
+>
+> ```python
+> from spacy.matcher import DependencyMatcher
+> matcher = DependencyMatcher(nlp.vocab)
+> ```
+
+| Name    | Type    | Description |
+| ------- | ------- | ----------- |
+| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. |
+
+## DependencyMatcher.\_\_call\_\_ {#call tag="method"}
+
+Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
+
+> #### Example
+>
+> ```python
+> from spacy.matcher import DependencyMatcher
+>
+> matcher = DependencyMatcher(nlp.vocab)
+> pattern = [
+>     {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
+>     {"SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
+> ]
+> matcher.add("Founder", [pattern])
+> doc = nlp("Bill Gates founded Microsoft.")
+> matches = matcher(doc)
+> ```
+
+| Name        | Type         | Description |
+| ----------- | ------------ | ----------- |
+| `doclike`   | `Doc`/`Span` | The `Doc` or `Span` to match over. |
+| **RETURNS** | list         | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. |
+
+## DependencyMatcher.\_\_len\_\_ {#len tag="method"}
+
+Get the number of rules added to the dependency matcher. Note that this only
+returns the number of rules (identical with the number of IDs), not the number
+of individual patterns.
+
+> #### Example
+>
+> ```python
+> matcher = DependencyMatcher(nlp.vocab)
+> assert len(matcher) == 0
+> pattern = [
+>     {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
+>     {"SPEC": {"NODE_NAME": "START_ENTITY", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
+> ]
+> matcher.add("Rule", [pattern])
+> assert len(matcher) == 1
+> ```
+
+| Name        | Type | Description          |
+| ----------- | ---- | -------------------- |
+| **RETURNS** | int  | The number of rules. |
+
+## DependencyMatcher.\_\_contains\_\_ {#contains tag="method"}
+
+Check whether the matcher contains rules for a match ID.
+
+> #### Example
+>
+> ```python
+> matcher = DependencyMatcher(nlp.vocab)
+> assert "Rule" not in matcher
+> matcher.add("Rule", [pattern])
+> assert "Rule" in matcher
+> ```
+
+| Name        | Type | Description                                           |
+| ----------- | ---- | ----------------------------------------------------- |
+| `key`       | str  | The match ID.                                         |
+| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
+
+## DependencyMatcher.add {#add tag="method"}
+
+Add a rule to the matcher, consisting of an ID key, one or more patterns, and an
+optional callback function to act on the matches. The callback function will
+receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already
+exists for the given ID, the patterns will be extended. An `on_match` callback
+will be overwritten.
+
+> #### Example
+>
+> ```python
+> def on_match(matcher, doc, id, matches):
+>     print('Matched!', matches)
+>
+> matcher = DependencyMatcher(nlp.vocab)
+> matcher.add("TEST_PATTERNS", patterns)
+> ```
+
+| Name           | Type               | Description |
+| -------------- | ------------------ | ----------- |
+| `match_id`     | str                | An ID for the thing you're matching. |
+| `patterns`     | list               | Match pattern. A pattern consists of a list of dicts, where each dict describes a node. |
+| _keyword-only_ |                    | |
+| `on_match`     | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
+
+## DependencyMatcher.remove {#remove tag="method"}
+
+Remove a rule from the matcher. A `KeyError` is raised if the match ID does not
+exist.
+
+> #### Example
+>
+> ```python
+> matcher.add("Rule", [pattern])
+> assert "Rule" in matcher
+> matcher.remove("Rule")
+> assert "Rule" not in matcher
+> ```
+
+| Name  | Type | Description               |
+| ----- | ---- | ------------------------- |
+| `key` | str  | The ID of the match rule. |
+
+## DependencyMatcher.get {#get tag="method"}
+
+Retrieve the pattern stored for a key. Returns the rule as an
+`(on_match, patterns)` tuple containing the callback and available patterns.
+
+> #### Example
+>
+> ```python
+> matcher.add("Rule", [pattern], on_match=on_match)
+> on_match, patterns = matcher.get("Rule")
+> ```
+
+| Name        | Type  | Description                                   |
+| ----------- | ----- | --------------------------------------------- |
+| `key`       | str   | The ID of the match rule.                     |
+| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. |

website/docs/api/matcher.md

@@ -5,6 +5,82 @@ tag: class
 source: spacy/matcher/matcher.pyx
 ---

+The `Matcher` lets you find words and phrases using rules describing their token
+attributes. Rules can refer to token annotations (like the text or
+part-of-speech tags), as well as lexical attributes like `Token.is_punct`.
+Applying the matcher to a [`Doc`](/api/doc) gives you access to the matched
+tokens in context. For in-depth examples and workflows for combining rules and
+statistical models, see the [usage guide](/usage/rule-based-matching) on
+rule-based matching.
+
+## Pattern format {#patterns}
+
+> ```json
+> ### Example
+> [
+>   {"LOWER": "i"},
+>   {"LEMMA": {"IN": ["like", "love"]}},
+>   {"POS": "NOUN", "OP": "+"}
+> ]
+> ```
+
+A pattern added to the `Matcher` consists of a list of dictionaries. Each
+dictionary describes **one token** and its attributes. The available token
+pattern keys correspond to a number of
+[`Token` attributes](/api/token#attributes). The supported attributes for
+rule-based matching are:
+
+| Attribute                             | Type | Description |
+| ------------------------------------- | ---- | ----------- |
+| `ORTH`                                | str  | The exact verbatim text of a token. |
+| `TEXT` <Tag variant="new">2.1</Tag>   | str  | The exact verbatim text of a token. |
+| `LOWER`                               | str  | The lowercase form of the token text. |
+| `LENGTH`                              | int  | The length of the token text. |
+| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`    | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
+| `IS_LOWER`, `IS_UPPER`, `IS_TITLE`    | bool | Token text is in lowercase, uppercase, titlecase. |
+| `IS_PUNCT`, `IS_SPACE`, `IS_STOP`     | bool | Token is punctuation, whitespace, stop word. |
+| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`  | bool | Token text resembles a number, URL, email. |
+| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str  | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. |
+| `ENT_TYPE`                            | str  | The token's entity label. |
+| `_` <Tag variant="new">2.1</Tag>      | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). |
+| `OP`                                  | str  | Operator or quantifier to determine how often to match a token pattern. |
+
+Operators and quantifiers define **how often** a token pattern should be
+matched:
+
+> ```json
+> ### Example
+> [
+>   {"POS": "ADJ", "OP": "*"},
+>   {"POS": "NOUN", "OP": "+"}
+> ]
+> ```
+
+| OP  | Description                                                      |
+| --- | ---------------------------------------------------------------- |
+| `!` | Negate the pattern, by requiring it to match exactly 0 times.    |
+| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
+| `+` | Require the pattern to match 1 or more times.                    |
+| `*` | Allow the pattern to match zero or more times.                   |
+
+Token patterns can also map to a **dictionary of properties** instead of a
+single value to indicate whether the expected value is a member of a list or how
+it compares to another value.
+
+> ```json
+> ### Example
+> [
+>   {"LEMMA": {"IN": ["like", "love", "enjoy"]}},
+>   {"POS": "PROPN", "LENGTH": {">=": 10}}
+> ]
+> ```
+
+| Attribute                  | Type       | Description                                                                       |
+| -------------------------- | ---------- | --------------------------------------------------------------------------------- |
+| `IN`                       | any        | Attribute value is member of a list.                                              |
+| `NOT_IN`                   | any        | Attribute value is _not_ member of a list.                                        |
+| `==`, `>=`, `<=`, `>`, `<` | int, float | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. |
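
A short end-to-end sketch of the dictionary-of-properties patterns above; the `"PREFERENCE"` key and example text are invented, and an installed `en_core_web_sm` model is assumed:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# "like"/"love"/"enjoy" (matched by lemma) followed by one or more nouns
pattern = [
    {"LEMMA": {"IN": ["like", "love", "enjoy"]}},
    {"POS": "NOUN", "OP": "+"},
]
matcher.add("PREFERENCE", [pattern])
doc = nlp("I really love fluffy dogs")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)
```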
 ## Matcher.\_\_init\_\_ {#init tag="method"}

 Create the rule-based `Matcher`. If `validate=True` is set, all patterns added
@@ -60,7 +136,7 @@ Match a stream of documents, yielding them in turn.

 | Name | Type | Description |
 | --------------------------------------------- | -------- | ----------- |
-| `docs`                                        | iterable | A stream of documents. |
+| `docs`                                        | iterable | A stream of documents or spans. |
 | `batch_size`                                  | int      | The number of documents to accumulate into a working set. |
 | `return_matches` <Tag variant="new">2.1</Tag> | bool     | Yield the match lists along with the docs, making results `(doc, matches)` tuples. |
 | `as_tuples`                                   | bool     | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. |
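
For illustration, a hypothetical use of `pipe` with `return_matches` (the texts are invented, and `nlp` and `matcher` are assumed from the previous sketch):

```python
texts = ["I like cats", "I love dogs"]
docs = (nlp(text) for text in texts)
for doc, matches in matcher.pipe(docs, return_matches=True):
    # matches is the list of (match_id, start, end) tuples for this doc
    print(doc.text, matches)
```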
@@ -105,11 +181,11 @@ Check whether the matcher contains rules for a match ID.

 ## Matcher.add {#add tag="method" new="2"}

-Add a rule to the matcher, consisting of an ID key, one or more patterns, and a
-callback function to act on the matches. The callback function will receive the
-arguments `matcher`, `doc`, `i` and `matches`. If a pattern already exists for
-the given ID, the patterns will be extended. An `on_match` callback will be
-overwritten.
+Add a rule to the matcher, consisting of an ID key, one or more patterns, and an
+optional callback function to act on the matches. The callback function will
+receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already
+exists for the given ID, the patterns will be extended. An `on_match` callback
+will be overwritten.

 > #### Example
 >
@@ -141,12 +217,13 @@ patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]

 </Infobox>

 | Name | Type | Description |
-| --------------                      | ------------------ | ----------- |
+| ----------------------------------- | ------------------ | ----------- |
 | `match_id`                          | str                | An ID for the thing you're matching. |
-| `patterns`                          | list               | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
+| `patterns`                          | `List[List[dict]]` | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
 | _keyword-only_                      |                    | |
-| `on_match`                          | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
+| `on_match`                          | callable / `None`  | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
+| `greedy` <Tag variant="new">3</Tag> | str                | Optional filter for greedy matches. Can either be `"FIRST"` or `"LONGEST"`. |
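
A sketch of the new `greedy` filter (key, pattern and text invented): a `+` quantifier over consecutive nouns normally yields many overlapping spans, and `greedy="LONGEST"` filters them down to the longest ones:

```python
matcher = Matcher(nlp.vocab)
matcher.add("NOUNS", [[{"POS": "NOUN", "OP": "+"}]], greedy="LONGEST")
doc = nlp("I met some computer science students")
# Without the filter this would also include the shorter sub-spans
print([doc[start:end].text for _, start, end in matcher(doc)])
```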
 ## Matcher.remove {#remove tag="method" new="2"}

website/docs/api/morphologizer.md

@@ -63,16 +63,14 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#add_pipe).

-<!-- TODO: finish API docs -->
-
 | Name           | Type    | Description |
 | -------------- | ------- | ----------- |
 | `vocab`        | `Vocab` | The shared vocabulary. |
 | `model`        | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
 | `name`         | str     | String name of the component instance. Used to add entries to the `losses` during training. |
 | _keyword-only_ |         | |
-| `labels_morph` | dict    | |
-| `labels_pos`   | dict    | |
+| `labels_morph` | dict    | Mapping of morph + POS tags to morph labels. |
+| `labels_pos`   | dict    | Mapping of morph + POS tags to POS tags. |
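
As a minimal sketch of the `add_pipe` shortcut mentioned above (assumes a v3-style pipeline that provides the default morphologizer model):

```python
morphologizer = nlp.add_pipe("morphologizer")
```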
 ## Morphologizer.\_\_call\_\_ {#call tag="method"}

website/docs/api/phrasematcher.md

@@ -9,7 +9,8 @@ new: 2
 The `PhraseMatcher` lets you efficiently match large terminology lists. While
 the [`Matcher`](/api/matcher) lets you match sequences based on lists of token
 descriptions, the `PhraseMatcher` accepts match patterns in the form of `Doc`
-objects.
+objects. See the [usage guide](/usage/rule-based-matching#phrasematcher) for
+examples.
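
For quick context, a minimal sketch of `Doc`-based patterns; the terms and the `"NAMES"` key are invented, and an existing `nlp` pipeline is assumed:

```python
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)
# Patterns are Doc objects; make_doc avoids running the full pipeline
patterns = [nlp.make_doc(term) for term in ["Barack Obama", "Angela Merkel"]]
matcher.add("NAMES", patterns)
doc = nlp("Angela Merkel met Barack Obama")
matches = matcher(doc)
```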
 ## PhraseMatcher.\_\_init\_\_ {#init tag="method"}