Merge remote-tracking branch 'upstream/develop' into feature/trf-docs

2025-11-08 20:07:51 +03:00 · 2020-11-19 14:15:35 +01:00 · 2020-11-19 14:15:35 +01:00 · 636be3c791
commit 636be3c791
parent 789fb3d124 165993d8e5
12 changed files with 154 additions and 86 deletions
--- a/setup.cfg
+++ b/setup.cfg
@ -86,6 +86,8 @@ cuda101 =
    cupy-cuda101>=5.0.0b4,<9.0.0
 cuda102 =
    cupy-cuda102>=5.0.0b4,<9.0.0
 cuda110 =
    cupy-cuda110>=5.0.0b4,<9.0.0
 # Language tokenizers with external dependencies
 ja =
    sudachipy>=0.4.9
@ -94,8 +96,6 @@ ko =
    natto-py==0.9.0
 th =
    pythainlp>=2.0
 zh =
    spacy-pkuseg==0.0.26
 [bdist_wheel]
 universal = false
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@ -143,6 +143,9 @@ nO = null
@architectures = "spacy-transformers.TransformerListener.v1"
 grad_factor = 1.0
 [components.textcat.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
 [components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
 exclusive_classes = false
--- a/spacy/lang/zh/init.py
+++ b/spacy/lang/zh/init.py
@ -17,7 +17,7 @@ from ... import util
 # fmt: off
-_PKUSEG_INSTALL_MSG = "install spacy-pkuseg with `pip install spacy-pkuseg==0.0.26`"
+_PKUSEG_INSTALL_MSG = "install spacy-pkuseg with `pip install \"spacy-pkuseg>=0.0.27,<0.1.0\"` or `conda install -c conda-forge \"spacy-pkuseg>=0.0.27,<0.1.0\"`"
 # fmt: on
 DEFAULT_CONFIG = """
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@ -61,14 +61,14 @@ def build_bow_text_classifier(
@registry.architectures.register("spacy.TextCatEnsemble.v2")
-def build_text_classifier(
+def build_text_classifier_v2(
    tok2vec: Model[List[Doc], List[Floats2d]],
    linear_model: Model[List[Doc], Floats2d],
    nO: Optional[int] = None,
 ) -> Model[List[Doc], Floats2d]:
    exclusive_classes = not linear_model.attrs["multi_label"]
    with Model.define_operators({">>": chain, "|": concatenate}):
-        width = tok2vec.get_dim("nO")
+        width = tok2vec.maybe_get_dim("nO")
        cnn_model = (
                tok2vec
                >> list2ragged()
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@ -92,9 +92,6 @@ class Morphologizer(Tagger):
        # 2) labels_pos stores a mapping from morph+POS->POS
        cfg = {"labels_morph": labels_morph or {}, "labels_pos": labels_pos or {}}
        self.cfg = dict(sorted(cfg.items()))
        # add mappings for empty morph
        self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH
        self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""]
    @property
    def labels(self):
@ -201,8 +198,8 @@ class Morphologizer(Tagger):
                doc_tag_ids = doc_tag_ids.get()
            for j, tag_id in enumerate(doc_tag_ids):
                morph = self.labels[tag_id]
-                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph])
+                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0))
-                doc.c[j].pos = self.cfg["labels_pos"][morph]
+                doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0)
    def get_loss(self, examples, scores):
        """Find the loss and gradient of loss for the batch of documents and
@ -228,8 +225,8 @@ class Morphologizer(Tagger):
                # doesn't, so if either is None, treat both as None here so that
                # truths doesn't end up with an unknown morph+POS combination
                if pos is None or morph is None:
-                    pos = None
+                    label = None
-                    morph = None
+                else:
                    label_dict = Morphology.feats_to_dict(morph)
                    if pos:
                        label_dict[self.POS_FEAT] = pos
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@ -512,7 +512,7 @@ class Scorer:
        negative_labels (Iterable[str]): The string values that refer to no annotation (e.g. "NIL")
        RETURNS (Dict[str, Any]): A dictionary containing the scores.
-        DOCS (TODO): https://nightly.spacy.io/api/scorer#score_links
+        DOCS: https://nightly.spacy.io/api/scorer#score_links
        """
        f_per_type = {}
        for example in examples:
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@ -116,3 +116,23 @@ def test_overfitting_IO():
    no_batch_deps = [doc.to_array([MORPH]) for doc in [nlp(text) for text in texts]]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)
    # Test without POS
    nlp.remove_pipe("morphologizer")
    nlp.add_pipe("morphologizer")
    for example in train_examples:
        for token in example.reference:
            token.pos_ = ""
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["morphologizer"] < 0.00001
    # Test the trained model
    test_text = "I like blue ham"
    doc = nlp(test_text)
    gold_morphs = ["Feat=N", "Feat=V", "", ""]
    gold_pos_tags = ["", "", "", ""]
    assert [str(t.morph) for t in doc] == gold_morphs
    assert [t.pos_ for t in doc] == gold_pos_tags
--- a/website/docs/api/transformer.md
+++ b/website/docs/api/transformer.md
@ -62,9 +62,9 @@ on the transformer architectures and their arguments and hyperparameters.
 > ```
 | Setting                 | Description                                                                                                                                                                                                                                                                                                   |
-| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `max_batch_items`       | Maximum size of a padded batch. Defaults to `4096`. ~~int~~                                                                                                                                                                                                                                                   |
-| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
+| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
 | `model`                 | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~                                                                                                                |
 ```python
--- a/website/docs/usage/index.md
+++ b/website/docs/usage/index.md
@ -174,6 +174,7 @@ $ source .env/bin/activate                      # activate virtual env
 $ export PYTHONPATH=`pwd`                       # set Python path to spaCy dir
 $ pip install -r requirements.txt               # install all requirements
 $ python setup.py build_ext --inplace           # compile spaCy
 $ python setup.py install                       # install spaCy
 ```
 Compared to regular install via pip, the
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@ -843,6 +843,27 @@ def __call__(self, Doc doc):
    return doc
 ```
 There is one more optional method to implement: [`score`](/api/pipe#score) 
 calculates the performance of your component on a set of examples, and 
 returns the results as a dictionary:
 ```python
 ### The score method
 def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
    prf = PRFScore()
    for example in examples:
        ...
    return {
        "rel_micro_p": prf.precision,
        "rel_micro_r": prf.recall,
        "rel_micro_f": prf.fscore,
    }
 ```
 This is particularly useful to see the scores on the development corpus 
 when training the component with [`spacy train`](/api/cli#training).
 Once our `TrainablePipe` subclass is fully implemented, we can
 [register](/usage/processing-pipelines#custom-components-factories) the
 component with the [`@Language.factory`](/api/language#factory) decorator. This
@ -865,6 +886,11 @@ assigns it a name and lets you create the component with
 > [components.relation_extractor.model.get_candidates]
 > @misc = "rel_cand_generator.v1"
 > max_length = 20
 > 
 > [training.score_weights]
 > rel_micro_p = 0.0
 > rel_micro_r = 0.0
 > rel_micro_f = 1.0
 > ```
 ```python
@ -876,6 +902,28 @@ def make_relation_extractor(nlp, name, model):
    return RelationExtractor(nlp.vocab, model, name)
 ```
 You can extend the decorator to include information such as the type of 
 annotations that are required for this component to run, the type of annotations 
 it produces, and the scores that can be calculated:
 ```python
 ### Factory annotations {highlight="5-11"}
 from spacy.language import Language
@Language.factory(
    "relation_extractor",
    requires=["doc.ents", "token.ent_iob", "token.ent_type"],
    assigns=["doc._.rel"],
    default_score_weights={
        "rel_micro_p": None,
        "rel_micro_r": None,
        "rel_micro_f": None,
    },
 )
 def make_relation_extractor(nlp, name, model):
    return RelationExtractor(nlp.vocab, model, name)
 ```
 <!-- TODO: <Project id="tutorials/ner-relations">
 </Project> -->
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@ -969,7 +969,7 @@ import spacy
 from spacy.tokens import Doc, DocBin
 nlp = spacy.blank("en")
-docbin = DocBin(nlp.vocab)
+docbin = DocBin()
 words = ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "."]
 spaces = [True, True, True, True, True, True, True, False]
 ents = ["B-ORG", "O", "O", "O", "O", "B-GPE", "O", "O"]
--- a/website/src/widgets/quickstart-install.js
+++ b/website/src/widgets/quickstart-install.js
@ -7,7 +7,7 @@ import { repo } from '../components/util'
 const DEFAULT_MODELS = ['en']
 const DEFAULT_OPT = 'efficiency'
 const DEFAULT_HARDWARE = 'cpu'
-const DEFAULT_CUDA = 'cuda100'
+const DEFAULT_CUDA = 'cuda102'
 const CUDA = {
    '8.0': 'cuda80',
    '9.0': 'cuda90',
@ -16,56 +16,9 @@ const CUDA = {
    '10.0': 'cuda100',
    '10.1': 'cuda101',
    '10.2': 'cuda102',
    '11.0': 'cuda110',
 }
-const LANG_EXTRAS = ['zh', 'ja'] // only for languages with models
+const LANG_EXTRAS = ['ja'] // only for languages with models
 const DATA = [
    {
        id: 'os',
        title: 'Operating system',
        options: [
            { id: 'mac', title: 'macOS / OSX', checked: true },
            { id: 'windows', title: 'Windows' },
            { id: 'linux', title: 'Linux' },
        ],
    },
    {
        id: 'package',
        title: 'Package manager',
        options: [
            { id: 'pip', title: 'pip', checked: true },
            { id: 'conda', title: 'conda' },
            { id: 'source', title: 'from source' },
        ],
    },
    {
        id: 'hardware',
        title: 'Hardware',
        options: [
            { id: 'cpu', title: 'CPU', checked: DEFAULT_HARDWARE === 'cpu' },
            { id: 'gpu', title: 'GPU', checked: DEFAULT_HARDWARE == 'gpu' },
        ],
        dropdown: Object.keys(CUDA).map(id => ({ id: CUDA[id], title: `CUDA ${id}` })),
        defaultValue: DEFAULT_CUDA,
    },
    {
        id: 'config',
        title: 'Configuration',
        multiple: true,
        options: [
            {
                id: 'venv',
                title: 'virtual env',
                help: 'Use a virtual environment and install spaCy into a user directory',
            },
            {
                id: 'train',
                title: 'train models',
                help:
                    'Check this if you plan to train your own models with spaCy to install extra dependencies and data resources',
            },
        ],
    },
 ]
 const QuickstartInstall = ({ id, title }) => {
    const [train, setTrain] = useState(false)
@ -99,7 +52,56 @@ const QuickstartInstall = ({ id, title }) => {
                const pkg = nightly ? 'spacy-nightly' : 'spacy'
                const models = languages.filter(({ models }) => models !== null)
                const data = [
-                    ...DATA,
+                    {
                        id: 'os',
                        title: 'Operating system',
                        options: [
                            { id: 'mac', title: 'macOS / OSX', checked: true },
                            { id: 'windows', title: 'Windows' },
                            { id: 'linux', title: 'Linux' },
                        ],
                    },
                    {
                        id: 'package',
                        title: 'Package manager',
                        options: [
                            { id: 'pip', title: 'pip', checked: true },
                            !nightly ? { id: 'conda', title: 'conda' } : null,
                            { id: 'source', title: 'from source' },
                        ].filter(o => o),
                    },
                    {
                        id: 'hardware',
                        title: 'Hardware',
                        options: [
                            { id: 'cpu', title: 'CPU', checked: DEFAULT_HARDWARE === 'cpu' },
                            { id: 'gpu', title: 'GPU', checked: DEFAULT_HARDWARE == 'gpu' },
                        ],
                        dropdown: Object.keys(CUDA).map(id => ({
                            id: CUDA[id],
                            title: `CUDA ${id}`,
                        })),
                        defaultValue: DEFAULT_CUDA,
                    },
                    {
                        id: 'config',
                        title: 'Configuration',
                        multiple: true,
                        options: [
                            {
                                id: 'venv',
                                title: 'virtual env',
                                help:
                                    'Use a virtual environment and install spaCy into a user directory',
                            },
                            {
                                id: 'train',
                                title: 'train models',
                                help:
                                    'Check this if you plan to train your own models with spaCy to install extra dependencies and data resources',
                            },
                        ],
                    },
                    {
                        id: 'models',
                        title: 'Trained pipelines',
@ -141,11 +143,6 @@ const QuickstartInstall = ({ id, title }) => {
                        setters={setters}
                        showDropdown={showDropdown}
                    >
                        {nightly && (
                            <QS package="conda" comment prompt={false}>
                                # 🚨 Nightly releases are currently only available via pip
                            </QS>
                        )}
                        <QS config="venv">python -m venv .env</QS>
                        <QS config="venv" os="mac">
                            source .env/bin/activate
@ -180,15 +177,17 @@ const QuickstartInstall = ({ id, title }) => {
                        </QS>
                        <QS package="source">pip install -r requirements.txt</QS>
                        <QS package="source">python setup.py build_ext --inplace</QS>
-                        {(train || hardware == 'gpu') && (
+                        <QS package="source">
-                            <QS package="source">pip install -e '.[{pipExtras}]'</QS>
+                            pip install {train || hardware == 'gpu' ? `'.[${pipExtras}]'` : '.'}
-                        )}
+                        </QS>
-
+                        <QS config="train" package="conda" comment prompt={false}>
-                        <QS config="train" package="conda">
+                            # packages only available via pip
                            conda install -c conda-forge spacy-transformers
                        </QS>
                        <QS config="train" package="conda">
-                            conda install -c conda-forge spacy-lookups-data
+                            pip install spacy-transformers
                        </QS>
                        <QS config="train" package="conda">
                            pip install spacy-lookups-data
                        </QS>
                        {models.map(({ code, models: modelOptions }) => {