Merge remote-tracking branch 'upstream/develop' into feature/trf-docs

2025-08-22 13:04:56 +03:00 · 2020-11-19 14:15:35 +01:00 · 2020-11-19 14:15:35 +01:00 · 636be3c791
commit 636be3c791
parent 789fb3d124 165993d8e5
12 changed files with 154 additions and 86 deletions
--- a/setup.cfg
+++ b/setup.cfg
@ -86,6 +86,8 @@ cuda101 =
    cupy-cuda101>=5.0.0b4,<9.0.0
 cuda102 =
    cupy-cuda102>=5.0.0b4,<9.0.0
+cuda110 =
+    cupy-cuda110>=5.0.0b4,<9.0.0
 # Language tokenizers with external dependencies
 ja =
    sudachipy>=0.4.9
@ -94,8 +96,6 @@ ko =
    natto-py==0.9.0
 th =
    pythainlp>=2.0
-zh =
-    spacy-pkuseg==0.0.26

 [bdist_wheel]
 universal = false
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@ -143,6 +143,9 @@ nO = null
@architectures = "spacy-transformers.TransformerListener.v1"
 grad_factor = 1.0

+[components.textcat.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+
 [components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
 exclusive_classes = false
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@ -17,7 +17,7 @@ from ... import util


 # fmt: off
-_PKUSEG_INSTALL_MSG = "install spacy-pkuseg with `pip install spacy-pkuseg==0.0.26`"
+_PKUSEG_INSTALL_MSG = "install spacy-pkuseg with `pip install \"spacy-pkuseg>=0.0.27,<0.1.0\"` or `conda install -c conda-forge \"spacy-pkuseg>=0.0.27,<0.1.0\"`"
 # fmt: on

 DEFAULT_CONFIG = """
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@ -61,14 +61,14 @@ def build_bow_text_classifier(


@registry.architectures.register("spacy.TextCatEnsemble.v2")
-def build_text_classifier(
+def build_text_classifier_v2(
    tok2vec: Model[List[Doc], List[Floats2d]],
    linear_model: Model[List[Doc], Floats2d],
    nO: Optional[int] = None,
 ) -> Model[List[Doc], Floats2d]:
    exclusive_classes = not linear_model.attrs["multi_label"]
    with Model.define_operators({">>": chain, "|": concatenate}):
-        width = tok2vec.get_dim("nO")
+        width = tok2vec.maybe_get_dim("nO")
        cnn_model = (
                tok2vec
                >> list2ragged()
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@ -92,9 +92,6 @@ class Morphologizer(Tagger):
        # 2) labels_pos stores a mapping from morph+POS->POS
        cfg = {"labels_morph": labels_morph or {}, "labels_pos": labels_pos or {}}
        self.cfg = dict(sorted(cfg.items()))
-        # add mappings for empty morph
-        self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH
-        self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""]

    @property
    def labels(self):
@ -201,8 +198,8 @@ class Morphologizer(Tagger):
                doc_tag_ids = doc_tag_ids.get()
            for j, tag_id in enumerate(doc_tag_ids):
                morph = self.labels[tag_id]
-                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph])
-                doc.c[j].pos = self.cfg["labels_pos"][morph]
+                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0))
+                doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0)

    def get_loss(self, examples, scores):
        """Find the loss and gradient of loss for the batch of documents and
@ -228,12 +225,12 @@ class Morphologizer(Tagger):
                # doesn't, so if either is None, treat both as None here so that
                # truths doesn't end up with an unknown morph+POS combination
                if pos is None or morph is None:
-                    pos = None
-                    morph = None
-                label_dict = Morphology.feats_to_dict(morph)
-                if pos:
-                    label_dict[self.POS_FEAT] = pos
-                label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
+                    label = None
+                else:
+                    label_dict = Morphology.feats_to_dict(morph)
+                    if pos:
+                        label_dict[self.POS_FEAT] = pos
+                    label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
                eg_truths.append(label)
            truths.append(eg_truths)
        d_scores, loss = loss_func(scores, truths)
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@ -512,7 +512,7 @@ class Scorer:
        negative_labels (Iterable[str]): The string values that refer to no annotation (e.g. "NIL")
        RETURNS (Dict[str, Any]): A dictionary containing the scores.

-        DOCS (TODO): https://nightly.spacy.io/api/scorer#score_links
+        DOCS: https://nightly.spacy.io/api/scorer#score_links
        """
        f_per_type = {}
        for example in examples:
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@ -116,3 +116,23 @@ def test_overfitting_IO():
    no_batch_deps = [doc.to_array([MORPH]) for doc in [nlp(text) for text in texts]]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)
+
+    # Test without POS
+    nlp.remove_pipe("morphologizer")
+    nlp.add_pipe("morphologizer")
+    for example in train_examples:
+        for token in example.reference:
+            token.pos_ = ""
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    for i in range(50):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["morphologizer"] < 0.00001
+
+    # Test the trained model
+    test_text = "I like blue ham"
+    doc = nlp(test_text)
+    gold_morphs = ["Feat=N", "Feat=V", "", ""]
+    gold_pos_tags = ["", "", "", ""]
+    assert [str(t.morph) for t in doc] == gold_morphs
+    assert [t.pos_ for t in doc] == gold_pos_tags
--- a/website/docs/api/transformer.md
+++ b/website/docs/api/transformer.md
@ -61,11 +61,11 @@ on the transformer architectures and their arguments and hyperparameters.
 > nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
 > ```

-| Setting                 | Description                                                                                                                                                                                                                                                                                                           |
-| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `max_batch_items`       | Maximum size of a padded batch. Defaults to `4096`. ~~int~~                                                                                                                                                                                                                                                           |
-| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
-| `model`                 | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~                                                                                                                        |
+| Setting                 | Description                                                                                                                                                                                                                                                                                                   |
+| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `max_batch_items`       | Maximum size of a padded batch. Defaults to `4096`. ~~int~~                                                                                                                                                                                                                                                   |
+| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
+| `model`                 | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~                                                                                                                |

 ```python
 https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
--- a/website/docs/usage/index.md
+++ b/website/docs/usage/index.md
@ -174,6 +174,7 @@ $ source .env/bin/activate                      # activate virtual env
 $ export PYTHONPATH=`pwd`                       # set Python path to spaCy dir
 $ pip install -r requirements.txt               # install all requirements
 $ python setup.py build_ext --inplace           # compile spaCy
+$ python setup.py install                       # install spaCy
 ```

 Compared to regular install via pip, the
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@ -843,6 +843,27 @@ def __call__(self, Doc doc):
    return doc
 ```

+There is one more optional method to implement: [`score`](/api/pipe#score) 
+calculates the performance of your component on a set of examples, and 
+returns the results as a dictionary:
+
+```python
+### The score method
+def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
+    prf = PRFScore()
+    for example in examples:
+        ...
+
+    return {
+        "rel_micro_p": prf.precision,
+        "rel_micro_r": prf.recall,
+        "rel_micro_f": prf.fscore,
+    }
+```
+
+This is particularly useful to see the scores on the development corpus 
+when training the component with [`spacy train`](/api/cli#training).
+
 Once our `TrainablePipe` subclass is fully implemented, we can
 [register](/usage/processing-pipelines#custom-components-factories) the
 component with the [`@Language.factory`](/api/language#factory) decorator. This
@ -865,6 +886,11 @@ assigns it a name and lets you create the component with
 > [components.relation_extractor.model.get_candidates]
 > @misc = "rel_cand_generator.v1"
 > max_length = 20
+> 
+> [training.score_weights]
+> rel_micro_p = 0.0
+> rel_micro_r = 0.0
+> rel_micro_f = 1.0
 > ```

 ```python
@ -876,6 +902,28 @@ def make_relation_extractor(nlp, name, model):
    return RelationExtractor(nlp.vocab, model, name)
 ```

+You can extend the decorator to include information such as the type of 
+annotations that are required for this component to run, the type of annotations 
+it produces, and the scores that can be calculated:
+
+```python
+### Factory annotations {highlight="5-11"}
+from spacy.language import Language
+
+@Language.factory(
+    "relation_extractor",
+    requires=["doc.ents", "token.ent_iob", "token.ent_type"],
+    assigns=["doc._.rel"],
+    default_score_weights={
+        "rel_micro_p": None,
+        "rel_micro_r": None,
+        "rel_micro_f": None,
+    },
+)
+def make_relation_extractor(nlp, name, model):
+    return RelationExtractor(nlp.vocab, model, name)
+```
+
 <!-- TODO: <Project id="tutorials/ner-relations">

 </Project> -->
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@ -969,7 +969,7 @@ import spacy
 from spacy.tokens import Doc, DocBin

 nlp = spacy.blank("en")
-docbin = DocBin(nlp.vocab)
+docbin = DocBin()
 words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."]
 spaces = [True, True, True, True, True, True, True, False]
 ents = ["B-ORG", "O", "O", "O", "O", "B-GPE", "O", "O"]
--- a/website/src/widgets/quickstart-install.js
+++ b/website/src/widgets/quickstart-install.js
@ -7,7 +7,7 @@ import { repo } from '../components/util'
 const DEFAULT_MODELS = ['en']
 const DEFAULT_OPT = 'efficiency'
 const DEFAULT_HARDWARE = 'cpu'
-const DEFAULT_CUDA = 'cuda100'
+const DEFAULT_CUDA = 'cuda102'
 const CUDA = {
    '8.0': 'cuda80',
    '9.0': 'cuda90',
@ -16,56 +16,9 @@ const CUDA = {
    '10.0': 'cuda100',
    '10.1': 'cuda101',
    '10.2': 'cuda102',
+    '11.0': 'cuda110',
 }
-const LANG_EXTRAS = ['zh', 'ja'] // only for languages with models
-const DATA = [
-    {
-        id: 'os',
-        title: 'Operating system',
-        options: [
-            { id: 'mac', title: 'macOS / OSX', checked: true },
-            { id: 'windows', title: 'Windows' },
-            { id: 'linux', title: 'Linux' },
-        ],
-    },
-    {
-        id: 'package',
-        title: 'Package manager',
-        options: [
-            { id: 'pip', title: 'pip', checked: true },
-            { id: 'conda', title: 'conda' },
-            { id: 'source', title: 'from source' },
-        ],
-    },
-    {
-        id: 'hardware',
-        title: 'Hardware',
-        options: [
-            { id: 'cpu', title: 'CPU', checked: DEFAULT_HARDWARE === 'cpu' },
-            { id: 'gpu', title: 'GPU', checked: DEFAULT_HARDWARE == 'gpu' },
-        ],
-        dropdown: Object.keys(CUDA).map(id => ({ id: CUDA[id], title: `CUDA ${id}` })),
-        defaultValue: DEFAULT_CUDA,
-    },
-    {
-        id: 'config',
-        title: 'Configuration',
-        multiple: true,
-        options: [
-            {
-                id: 'venv',
-                title: 'virtual env',
-                help: 'Use a virtual environment and install spaCy into a user directory',
-            },
-            {
-                id: 'train',
-                title: 'train models',
-                help:
-                    'Check this if you plan to train your own models with spaCy to install extra dependencies and data resources',
-            },
-        ],
-    },
-]
+const LANG_EXTRAS = ['ja'] // only for languages with models

 const QuickstartInstall = ({ id, title }) => {
    const [train, setTrain] = useState(false)
@ -99,7 +52,56 @@ const QuickstartInstall = ({ id, title }) => {
                const pkg = nightly ? 'spacy-nightly' : 'spacy'
                const models = languages.filter(({ models }) => models !== null)
                const data = [
-                    ...DATA,
+                    {
+                        id: 'os',
+                        title: 'Operating system',
+                        options: [
+                            { id: 'mac', title: 'macOS / OSX', checked: true },
+                            { id: 'windows', title: 'Windows' },
+                            { id: 'linux', title: 'Linux' },
+                        ],
+                    },
+                    {
+                        id: 'package',
+                        title: 'Package manager',
+                        options: [
+                            { id: 'pip', title: 'pip', checked: true },
+                            !nightly ? { id: 'conda', title: 'conda' } : null,
+                            { id: 'source', title: 'from source' },
+                        ].filter(o => o),
+                    },
+                    {
+                        id: 'hardware',
+                        title: 'Hardware',
+                        options: [
+                            { id: 'cpu', title: 'CPU', checked: DEFAULT_HARDWARE === 'cpu' },
+                            { id: 'gpu', title: 'GPU', checked: DEFAULT_HARDWARE == 'gpu' },
+                        ],
+                        dropdown: Object.keys(CUDA).map(id => ({
+                            id: CUDA[id],
+                            title: `CUDA ${id}`,
+                        })),
+                        defaultValue: DEFAULT_CUDA,
+                    },
+                    {
+                        id: 'config',
+                        title: 'Configuration',
+                        multiple: true,
+                        options: [
+                            {
+                                id: 'venv',
+                                title: 'virtual env',
+                                help:
+                                    'Use a virtual environment and install spaCy into a user directory',
+                            },
+                            {
+                                id: 'train',
+                                title: 'train models',
+                                help:
+                                    'Check this if you plan to train your own models with spaCy to install extra dependencies and data resources',
+                            },
+                        ],
+                    },
                    {
                        id: 'models',
                        title: 'Trained pipelines',
@ -141,11 +143,6 @@ const QuickstartInstall = ({ id, title }) => {
                        setters={setters}
                        showDropdown={showDropdown}
                    >
-                        {nightly && (
-                            <QS package="conda" comment prompt={false}>
-                                # 🚨 Nightly releases are currently only available via pip
-                            </QS>
-                        )}
                        <QS config="venv">python -m venv .env</QS>
                        <QS config="venv" os="mac">
                            source .env/bin/activate
@ -180,15 +177,17 @@ const QuickstartInstall = ({ id, title }) => {
                        </QS>
                        <QS package="source">pip install -r requirements.txt</QS>
                        <QS package="source">python setup.py build_ext --inplace</QS>
-                        {(train || hardware == 'gpu') && (
-                            <QS package="source">pip install -e '.[{pipExtras}]'</QS>
-                        )}
-
-                        <QS config="train" package="conda">
-                            conda install -c conda-forge spacy-transformers
+                        <QS package="source">
+                            pip install {train || hardware == 'gpu' ? `'.[${pipExtras}]'` : '.'}
+                        </QS>
+                        <QS config="train" package="conda" comment prompt={false}>
+                            # packages only available via pip
                        </QS>
                        <QS config="train" package="conda">
-                            conda install -c conda-forge spacy-lookups-data
+                            pip install spacy-transformers
+                        </QS>
+                        <QS config="train" package="conda">
+                            pip install spacy-lookups-data
                        </QS>

                        {models.map(({ code, models: modelOptions }) => {