Merge branch 'develop' into nightly.spacy.io

Ines Montani 2020-08-11 01:21:47 +02:00
commit a77713947d
21 changed files with 389 additions and 252 deletions

View File

@@ -18,11 +18,12 @@ cdef class Lexeme:
cdef readonly attr_t orth
@staticmethod
cdef inline Lexeme from_ptr(LexemeC* lex, Vocab vocab, int vector_length):
cdef inline Lexeme from_ptr(LexemeC* lex, Vocab vocab):
cdef Lexeme self = Lexeme.__new__(Lexeme, vocab, lex.orth)
self.c = lex
self.vocab = vocab
self.orth = lex.orth
return self
@staticmethod
cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
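
For context on this hunk: `from_ptr` now sets `vocab` and `orth` itself instead of receiving a `vector_length`. As a minimal sketch (not part of the diff), this is how a `Lexeme` is typically reached from Python, assuming only a blank pipeline:

```python
import spacy

nlp = spacy.blank("en")
lex = nlp.vocab["hello"]   # Vocab lookup returns a Lexeme backed by a LexemeC struct
print(lex.text, lex.orth)  # the verbatim text and its hash ID
```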

View File

@@ -1,4 +1,4 @@
# cython: infer_types=True, profile=True, binding=True
# cython: infer_types=True, profile=True
import srsly
from ..tokens.doc cimport Doc

View File

@@ -43,7 +43,7 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
scores=["tag_acc"],
default_score_weights={"tag_acc": 1.0},
)
def make_tagger(nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]]):
def make_tagger(nlp: Language, name: str, model: Model):
"""Construct a part-of-speech tagger component.
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
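
The factory now takes a plain `Model` and documents the expected signature in the docstring instead. A hedged sketch (not part of the diff) of building the component through the registered factory, assuming the v3 nightly API:

```python
import spacy

nlp = spacy.blank("en")
# The string name resolves to the registered "tagger" factory, which
# constructs the component with its default model config.
tagger = nlp.add_pipe("tagger")
print(nlp.pipe_names)  # ["tagger"]
```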

View File

@@ -172,7 +172,7 @@ class TextCategorizer(Pipe):
return scores
def set_annotations(self, docs: Iterable[Doc], scores) -> None:
"""Modify a batch of documents, using pre-computed scores.
"""Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
scores: The scores to set, produced by TextCategorizer.predict.
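
A hedged sketch (not part of the diff) of the predict/set_annotations split this docstring describes, assuming the v3 nightly API:

```python
import spacy

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
nlp.begin_training()
doc = nlp.make_doc("This is great")
scores = textcat.predict([doc])         # compute scores; the doc is untouched
textcat.set_annotations([doc], scores)  # write them back, setting doc.cats
print(doc.cats)
```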

View File

@@ -2,6 +2,7 @@ import pytest
import numpy
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from spacy.lexeme import Lexeme
from spacy.lang.en import English
from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH
@@ -389,3 +390,11 @@ def test_doc_lang(en_vocab):
assert doc.lang == en_vocab.strings["en"]
assert doc[0].lang_ == "en"
assert doc[0].lang == en_vocab.strings["en"]
def test_token_lexeme(en_vocab):
"""Test that tokens expose their lexeme."""
token = Doc(en_vocab, words=["Hello", "world"])[0]
assert isinstance(token.lex, Lexeme)
assert token.lex.text == token.text
assert en_vocab[token.orth] == token.lex

View File

@@ -226,6 +226,11 @@ cdef class Token:
cdef hash_t key = self.vocab.morphology.add(features)
self.c.morph = key
@property
def lex(self):
"""RETURNS (Lexeme): The underlying lexeme."""
return self.vocab[self.c.lex.orth]
@property
def lex_id(self):
"""RETURNS (int): Sequential ID of the token's lexical type."""

View File

@@ -162,7 +162,8 @@ Initialize the pipe for training, using data examples if available. Returns an
## DependencyParser.predict {#predict tag="method"}
Apply the pipeline's model to a batch of docs, without modifying them.
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them.
> #### Example
>
@@ -178,7 +179,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
## DependencyParser.set_annotations {#set_annotations tag="method"}
Modify a batch of documents, using pre-computed scores.
Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
> #### Example
>
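
A hedged sketch (not part of the diff) of the `predict` behavior described above, assuming an installed trained pipeline such as `en_core_web_sm`:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
parser = nlp.get_pipe("parser")
docs = [nlp.make_doc("This is a sentence.")]
scores = parser.predict(docs)  # returns scores; the docs themselves stay unmodified
```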

View File

@@ -162,9 +162,9 @@ Initialize the pipe for training, using data examples if available. Returns an
## EntityLinker.predict {#predict tag="method"}
Apply the pipeline's model to a batch of docs, without modifying them. Returns
the KB IDs for each entity in each doc, including `NIL` if there is no
prediction.
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them. Returns the KB IDs for each entity in each doc, including `NIL`
if there is no prediction.
> #### Example
>
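
A sketch (not part of the diff) of the described return value, assuming a hypothetical pipeline that ships a trained `entity_linker` plus an `ner` component:

```python
import spacy

nlp = spacy.load("my_el_pipeline")  # hypothetical pipeline name
entity_linker = nlp.get_pipe("entity_linker")
doc = nlp("Ada Lovelace was born in London")
kb_ids = entity_linker.predict([doc])  # one KB ID per entity, "NIL" if none predicted
```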

View File

@@ -151,7 +151,8 @@ Initialize the pipe for training, using data examples if available. Returns an
## EntityRecognizer.predict {#predict tag="method"}
Apply the pipeline's model to a batch of docs, without modifying them.
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them.
> #### Example
>
@@ -167,7 +168,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
## EntityRecognizer.set_annotations {#set_annotations tag="method"}
Modify a batch of documents, using pre-computed scores.
Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
> #### Example
>
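
A hedged sketch (not part of the diff) combining `predict` and `set_annotations` as described above, assuming an installed trained pipeline:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
ner = nlp.get_pipe("ner")
docs = [nlp.make_doc("Apple is looking at buying a U.K. startup.")]
scores = ner.predict(docs)         # pre-computed scores; docs untouched so far
ner.set_annotations(docs, scores)  # writes doc.ents
print([(ent.text, ent.label_) for ent in docs[0].ents])
```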

View File

@@ -142,7 +142,8 @@ Initialize the pipe for training, using data examples if available. Returns an
## Morphologizer.predict {#predict tag="method"}
Apply the pipeline's model to a batch of docs, without modifying them.
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them.
> #### Example
>
@@ -158,7 +159,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
## Morphologizer.set_annotations {#set_annotations tag="method"}
Modify a batch of documents, using pre-computed scores.
Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
> #### Example
>
@@ -175,8 +176,9 @@ Modify a batch of documents, using pre-computed scores.
## Morphologizer.update {#update tag="method"}
Learn from a batch of documents and gold-standard information, updating the
pipe's model. Delegates to [`predict`](/api/morphologizer#predict) and
Learn from a batch of [`Example`](/api/example) objects containing the
predictions and gold-standard annotations, and update the component's model.
Delegates to [`predict`](/api/morphologizer#predict) and
[`get_loss`](/api/morphologizer#get_loss).
> #### Example
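
A hedged sketch (not part of the diff) of `update` with `Example` objects, assuming the v3 `Example.from_dict` API and import path:

```python
import spacy
from spacy.training import Example  # import path assumed per the v3 API

nlp = spacy.blank("en")
morphologizer = nlp.add_pipe("morphologizer")
doc = nlp.make_doc("I like cats")
# Pair the predicted doc with gold-standard annotations
example = Example.from_dict(doc, {"pos": ["PRON", "VERB", "NOUN"]})
optimizer = nlp.begin_training(lambda: [example])
losses = morphologizer.update([example], sgd=optimizer)
```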

View File

@@ -8,7 +8,18 @@ This class is a base class and **not instantiated directly**. Trainable pipeline
components like the [`EntityRecognizer`](/api/entityrecognizer) or
[`TextCategorizer`](/api/textcategorizer) inherit from it and it defines the
interface that components should follow to function as trainable components in a
spaCy pipeline.
spaCy pipeline. See the docs on
[writing trainable components](/usage/processing-pipelines#trainable) for how to
use the `Pipe` base class to implement custom components.
> #### Why is Pipe implemented in Cython?
>
> The `Pipe` class is implemented in a `.pyx` module, the extension used by
> [Cython](/api/cython). This is needed so that **other** Cython classes, like
> the [`EntityRecognizer`](/api/entityrecognizer), can inherit from it. But it
> doesn't mean you have to implement trainable components in Cython: pure
> Python components like the [`TextCategorizer`](/api/textcategorizer) can also
> inherit from `Pipe`.
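
A hedged sketch (not part of the diff) of a custom trainable component inheriting from `Pipe`; the method bodies are illustrative placeholders, not spaCy's implementation:

```python
from spacy.pipeline import Pipe

class MyComponent(Pipe):
    # Subclasses overwrite predict and set_annotations (see the danger
    # infoboxes below).
    def predict(self, docs):
        # Apply self.model to the docs and return the raw scores, without
        # modifying the docs themselves.
        return self.model.predict(docs)

    def set_annotations(self, docs, scores):
        # Write the pre-computed scores back onto the docs, e.g. as custom
        # attributes or token-level annotations.
        ...
```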
```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/pipe.pyx
```
@@ -115,7 +126,8 @@ Initialize the pipe for training, using data examples if available. Returns an
## Pipe.predict {#predict tag="method"}
Apply the pipeline's model to a batch of docs, without modifying them.
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them.
<Infobox variant="danger">
@@ -137,7 +149,7 @@ This method needs to be overwritten with your own custom `predict` method.
## Pipe.set_annotations {#set_annotations tag="method"}
Modify a batch of documents, using pre-computed scores.
Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
<Infobox variant="danger">
@@ -161,8 +173,8 @@ method.
## Pipe.update {#update tag="method"}
Learn from a batch of documents and gold-standard information, updating the
pipe's model. Delegates to [`predict`](/api/pipe#predict).
Learn from a batch of [`Example`](/api/example) objects containing the
predictions and gold-standard annotations, and update the component's model.
<Infobox variant="danger">

View File

@@ -136,7 +136,8 @@ Initialize the pipe for training, using data examples if available. Returns an
## SentenceRecognizer.predict {#predict tag="method"}
Apply the pipeline's model to a batch of docs, without modifying them.
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them.
> #### Example
>
@@ -152,7 +153,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
## SentenceRecognizer.set_annotations {#set_annotations tag="method"}
Modify a batch of documents, using pre-computed scores.
Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
> #### Example
>
@@ -169,8 +170,9 @@ Modify a batch of documents, using pre-computed scores.
## SentenceRecognizer.update {#update tag="method"}
Learn from a batch of documents and gold-standard information, updating the
pipe's model. Delegates to [`predict`](/api/sentencerecognizer#predict) and
Learn from a batch of [`Example`](/api/example) objects containing the
predictions and gold-standard annotations, and update the component's model.
Delegates to [`predict`](/api/sentencerecognizer#predict) and
[`get_loss`](/api/sentencerecognizer#get_loss).
> #### Example
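
A hedged sketch (not part of the diff) of the updated `update` signature for the sentence recognizer, assuming the v3 `Example` API:

```python
import spacy
from spacy.training import Example  # import path assumed per the v3 API

nlp = spacy.blank("en")
senter = nlp.add_pipe("senter")
doc = nlp.make_doc("Hello there. How are you?")
# 1 marks a sentence start, 0 a non-start; one value per token
example = Example.from_dict(doc, {"sent_starts": [1, 0, 0, 1, 0, 0, 0]})
optimizer = nlp.begin_training(lambda: [example])
losses = senter.update([example], sgd=optimizer)
```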

View File

@@ -134,7 +134,8 @@ Initialize the pipe for training, using data examples if available. Returns an
## Tagger.predict {#predict tag="method"}
Apply the pipeline's model to a batch of docs, without modifying them.
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them.
> #### Example
>
@@ -150,7 +151,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
## Tagger.set_annotations {#set_annotations tag="method"}
Modify a batch of documents, using pre-computed scores.
Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
> #### Example
>
@@ -167,8 +168,9 @@ Modify a batch of documents, using pre-computed scores.
## Tagger.update {#update tag="method"}
Learn from a batch of documents and gold-standard information, updating the
pipe's model. Delegates to [`predict`](/api/tagger#predict) and
Learn from a batch of [`Example`](/api/example) objects containing the
predictions and gold-standard annotations, and update the component's model.
Delegates to [`predict`](/api/tagger#predict) and
[`get_loss`](/api/tagger#get_loss).
> #### Example
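
The tagger's `update` follows the same `Example`-based pattern; a hedged sketch (not part of the diff), assuming the v3 API:

```python
import spacy
from spacy.training import Example  # import path assumed per the v3 API

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")
doc = nlp.make_doc("I like cats")
example = Example.from_dict(doc, {"tags": ["PRP", "VBP", "NNS"]})
optimizer = nlp.begin_training(lambda: [example])
losses = tagger.update([example], sgd=optimizer)
print(losses)  # loss keyed by component name
```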

View File

@@ -142,7 +142,8 @@ Initialize the pipe for training, using data examples if available. Returns an
## TextCategorizer.predict {#predict tag="method"}
Apply the pipeline's model to a batch of docs, without modifying them.
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them.
> #### Example
>
@@ -158,7 +159,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
## TextCategorizer.set_annotations {#set_annotations tag="method"}
Modify a batch of documents, using pre-computed scores.
Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
> #### Example
>
@@ -175,8 +176,9 @@ Modify a batch of documents, using pre-computed scores.
## TextCategorizer.update {#update tag="method"}
Learn from a batch of documents and gold-standard information, updating the
pipe's model. Delegates to [`predict`](/api/textcategorizer#predict) and
Learn from a batch of [`Example`](/api/example) objects containing the
predictions and gold-standard annotations, and update the component's model.
Delegates to [`predict`](/api/textcategorizer#predict) and
[`get_loss`](/api/textcategorizer#get_loss).
> #### Example
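
For the text categorizer, gold-standard category labels go in under `"cats"`; a hedged sketch (not part of the diff), assuming the v3 API:

```python
import spacy
from spacy.training import Example  # import path assumed per the v3 API

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
doc = nlp.make_doc("This is great")
example = Example.from_dict(doc, {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}})
optimizer = nlp.begin_training(lambda: [example])
losses = textcat.update([example], sgd=optimizer)
```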

View File

@@ -145,7 +145,8 @@ Initialize the pipe for training, using data examples if available. Returns an
## Tok2Vec.predict {#predict tag="method"}
Apply the pipeline's model to a batch of docs, without modifying them.
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them.
> #### Example
>
@@ -161,7 +162,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
## Tok2Vec.set_annotations {#set_annotations tag="method"}
Modify a batch of documents, using pre-computed scores.
Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
> #### Example
>
@@ -178,8 +179,9 @@ Modify a batch of documents, using pre-computed scores.
## Tok2Vec.update {#update tag="method"}
Learn from a batch of documents and gold-standard information, updating the
pipe's model. Delegates to [`predict`](/api/tok2vec#predict).
Learn from a batch of [`Example`](/api/example) objects containing the
predictions and gold-standard annotations, and update the component's model.
Delegates to [`predict`](/api/tok2vec#predict).
> #### Example
>
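
`Tok2Vec.update` delegates to `predict` only (no `get_loss` link above); a hedged sketch (not part of the diff), assuming the v3 API:

```python
import spacy
from spacy.training import Example  # import path assumed per the v3 API

nlp = spacy.blank("en")
tok2vec = nlp.add_pipe("tok2vec")
doc = nlp.make_doc("I like cats")
example = Example.from_dict(doc, {})  # tok2vec itself carries no gold annotations
optimizer = nlp.begin_training(lambda: [example])
losses = tok2vec.update([example], sgd=optimizer)
```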

View File

@@ -393,9 +393,10 @@ The L2 norm of the token's vector representation.
## Attributes {#attributes}
| Name | Type | Description |
| -------------------------------------------- | --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| -------------------------------------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doc` | `Doc` | The parent document. |
| `sent` <Tag variant="new">2.0.12</Tag> | `Span` | The sentence span that this token is a part of. |
| `lex` <Tag variant="new">3</Tag> | [`Lexeme`](/api/lexeme) | The underlying lexeme. |
| `sent` <Tag variant="new">2.0.12</Tag> | [`Span`](/api/span) | The sentence span that this token is a part of. |
| `text` | str | Verbatim text content. |
| `text_with_ws` | str | Text content, with trailing space character if present. |
| `whitespace_` | str | Trailing space character if present. |

View File

@@ -179,7 +179,8 @@ Initialize the pipe for training, using data examples if available. Returns an
## Transformer.predict {#predict tag="method"}
Apply the pipeline's model to a batch of docs, without modifying them.
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
modifying them.
> #### Example
>
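
A sketch (not part of the diff) of the transformer's `predict`, assuming spacy-transformers is installed and a transformer pipeline (here a hypothetical `en_core_web_trf`) is available:

```python
import spacy

nlp = spacy.load("en_core_web_trf")  # hypothetical transformer pipeline
trf = nlp.get_pipe("transformer")
docs = [nlp.make_doc("This is a sentence.")]
activations = trf.predict(docs)  # raw model output; the docs stay unmodified
```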

File diff suppressed because one or more lines are too long

Image changed (before: 15 KiB, after: 88 KiB)

View File

@@ -14,8 +14,6 @@ of the pipeline. The `Language` object coordinates these components. It takes
raw text and sends it through the pipeline, returning an **annotated document**.
It also orchestrates training and serialization.
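
A minimal sketch (not part of the diff) of the flow this paragraph describes:

```python
import spacy

nlp = spacy.blank("en")          # a Language object with only a tokenizer
doc = nlp("Raw text goes in.")   # the pipeline returns an annotated Doc
print([token.text for token in doc])
```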
<!-- TODO: update graphic -->
![Library architecture](../../images/architecture.svg)
### Container objects {#architecture-containers}
@@ -85,4 +83,4 @@ operates on a `Doc` and gives you access to the matched tokens **in context**.
| [`MorphAnalysis`](/api/morphanalysis) | A morphological analysis. |
| [`KnowledgeBase`](/api/kb) | Storage for entities and aliases of a knowledge base for entity linking. |
| [`Scorer`](/api/scorer) | Compute evaluation scores. |
| [`Corpus`](/api/corpis) | Class for managing annotated corpora for training and evaluation data. |
| [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. |

View File

@@ -5,6 +5,7 @@ menu:
- ['Processing Text', 'processing']
- ['How Pipelines Work', 'pipelines']
- ['Custom Components', 'custom-components']
# - ['Trainable Components', 'trainable-components']
- ['Extension Attributes', 'custom-components-attributes']
- ['Plugins & Wrappers', 'plugins']
---
@@ -885,10 +886,14 @@ available, falls back to looking up the regular factory name.
</Infobox>
<!-- TODO:
## Trainable components {#trainable-components new="3"}
### Trainable components {#trainable new="3"}
spaCy's [`Pipe`](/api/pipe) class helps you implement your own trainable
components that have their own model instance, make predictions over `Doc`
objects and can be updated using [`spacy train`](/api/cli#train). This lets you
plug fully custom machine learning components into your pipeline.
-->
--->
## Extension attributes {#custom-components-attributes new="2"}

View File

@@ -6,11 +6,11 @@ menu:
- ['Features', 'features']
- ['Linguistic Annotations', 'annotations']
- ['Pipelines', 'pipelines']
- ['Architecture', 'architecture']
- ['Vocab', 'vocab']
- ['Serialization', 'serialization']
- ['Training', 'training']
- ['Language Data', 'language-data']
- ['Architecture', 'architecture']
- ['Community & FAQ', 'community-faq']
---
@@ -71,12 +71,11 @@ systems, or to pre-process text for **deep learning**.
- [Named entities](#annotations-ner)
- [Word vectors and similarity](#vectors-similarity)
- [Pipelines](#pipelines)
- [Library architecture](#architecture)
- [Vocab, hashes and lexemes](#vocab)
- [Serialization](#serialization)
- [Training](#training)
- [Language data](#language-data)
- [Lightning tour](#lightning-tour)
- [Architecture](#architecture)
- [Community & FAQ](#community)
</Infobox>
@@ -266,6 +265,12 @@ guide on [language processing pipelines](/usage/processing-pipelines).
</Infobox>
## Architecture {#architecture}
import Architecture101 from 'usage/101/\_architecture.md'
<Architecture101 />
## Vocab, hashes and lexemes {#vocab}
Whenever possible, spaCy tries to store data in a vocabulary, the
@@ -411,12 +416,6 @@ import LanguageData101 from 'usage/101/\_language-data.md'
<LanguageData101 />
## Architecture {#architecture}
import Architecture101 from 'usage/101/\_architecture.md'
<Architecture101 />
## Community & FAQ {#community-faq}
We're very happy to see the spaCy community grow and include a mix of people