From c9baf9d196cba07fe1b1c636bcab3c80c6b81b44 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 15 Nov 2021 12:40:55 +0100 Subject: [PATCH 01/20] Fix spancat for empty docs and zero suggestions (#9654) * Fix spancat for empty docs and zero suggestions * Use ops.xp.zeros in test --- spacy/ml/extract_spans.py | 10 +++++++-- spacy/pipeline/spancat.py | 2 +- spacy/tests/pipeline/test_spancat.py | 31 +++++++++++++++++++++++++++- 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/spacy/ml/extract_spans.py b/spacy/ml/extract_spans.py index 9bc972032..edc86ff9c 100644 --- a/spacy/ml/extract_spans.py +++ b/spacy/ml/extract_spans.py @@ -28,7 +28,13 @@ def forward( X, spans = source_spans assert spans.dataXd.ndim == 2 indices = _get_span_indices(ops, spans, X.lengths) - Y = Ragged(X.dataXd[indices], spans.dataXd[:, 1] - spans.dataXd[:, 0]) # type: ignore[arg-type, index] + if len(indices) > 0: + Y = Ragged(X.dataXd[indices], spans.dataXd[:, 1] - spans.dataXd[:, 0]) # type: ignore[arg-type, index] + else: + Y = Ragged( + ops.xp.zeros(X.dataXd.shape, dtype=X.dataXd.dtype), + ops.xp.zeros((len(X.lengths),), dtype="i"), + ) x_shape = X.dataXd.shape x_lengths = X.lengths @@ -53,7 +59,7 @@ def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d: for j in range(spans_i.shape[0]): indices.append(ops.xp.arange(spans_i[j, 0], spans_i[j, 1])) # type: ignore[call-overload, index] offset += length - return ops.flatten(indices) + return ops.flatten(indices, dtype="i", ndim_if_empty=1) def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]: diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 5b84ce8fb..829def1eb 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -78,7 +78,7 @@ def build_ngram_suggester(sizes: List[int]) -> Suggester: if len(spans) > 0: output = Ragged(ops.xp.vstack(spans), lengths_array) else: - output = Ragged(ops.xp.zeros((0, 0)), lengths_array) + output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array) assert output.dataXd.ndim == 2 return output diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py index 5c3a9d27d..2f7e952d3 100644 --- a/spacy/tests/pipeline/test_spancat.py +++ b/spacy/tests/pipeline/test_spancat.py @@ -1,7 +1,7 @@ import pytest import numpy from numpy.testing import assert_array_equal, assert_almost_equal -from thinc.api import get_current_ops +from thinc.api import get_current_ops, Ragged from spacy import util from spacy.lang.en import English @@ -29,6 +29,7 @@ TRAIN_DATA_OVERLAPPING = [ "I like London and Berlin", {"spans": {SPAN_KEY: [(7, 13, "LOC"), (18, 24, "LOC"), (7, 24, "DOUBLE_LOC")]}}, ), + ("", {"spans": {SPAN_KEY: []}}), ] @@ -365,3 +366,31 @@ def test_overfitting_IO_overlapping(): "London and Berlin", } assert set([span.label_ for span in spans2]) == {"LOC", "DOUBLE_LOC"} + + +def test_zero_suggestions(): + # Test with a suggester that returns 0 suggestions + + @registry.misc("test_zero_suggester") + def make_zero_suggester(): + def zero_suggester(docs, *, ops=None): + if ops is None: + ops = get_current_ops() + return Ragged( + ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i") + ) + + return zero_suggester + + fix_random_seed(0) + nlp = English() + spancat = nlp.add_pipe( + "spancat", + config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY}, + ) + train_examples = make_examples(nlp) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + assert spancat.model.get_dim("nO") == 
2 + assert set(spancat.labels) == {"LOC", "PERSON"} + + nlp.update(train_examples, sgd=optimizer) From 86fa37e8baf631348ec712a174c19c3ca7fb88cd Mon Sep 17 00:00:00 2001 From: Vishnu Nandakumar <38393302+Vishnunkumar@users.noreply.github.com> Date: Tue, 16 Nov 2021 10:36:19 +0530 Subject: [PATCH 02/20] Update universe.json with new library eng_spacysentiment (#9679) * Update universe.json * Update universe.json * Cleanup fields Co-authored-by: Paul O'Leary McCann --- website/meta/universe.json | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 9b7484a13..7f3813a95 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -3592,6 +3592,32 @@ "github": "xxyzz" }, "category": ["standalone"] + }, + { + "id": "eng_spacysentiment", + "title": "eng_spacysentiment", + "slogan": "Simple sentiment analysis using spaCy pipelines", + "description": "Sentiment analysis for simple english sentences using pre-trained spaCy pipelines", + "github": "vishnunkumar/spacysentiment", + "pip": "eng-spacysentiment", + "code_example": [ + "import eng_spacysentiment", + "nlp = eng_spacysentiment.load()", + "text = \"Welcome to Arsenals official YouTube channel Watch as we take you closer and show you the personality of the club\"", + "doc = nlp(text)", + "print(doc.cats)", + "# {'positive': 0.29878824949264526, 'negative': 0.7012117505073547}" + ], + "thumb": "", + "image": "", + "code_language": "python", + "author": "Vishnu Nandakumar", + "author_links": { + "github": "Vishnunkumar", + "twitter": "vishnun_uchiha" + }, + "category": ["pipeline"], + "tags": ["pipeline", "nlp", "sentiment"] } ], From f3981bd0c87b5f686593e51a53825b2c718eac6e Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 18 Nov 2021 14:38:30 +0000 Subject: [PATCH 03/20] Clarify how to fill in init_tok2vec after pretraining (#9639) * Clarify how to fill in init_tok2vec after pretraining * Ignore init_tok2vec arg in pretraining * Update docs, config setting * Remove obsolete note about not filling init_tok2vec early This seems to have also caught some lines that needed cleanup. --- spacy/training/pretrain.py | 2 ++ website/docs/api/data-formats.md | 2 +- website/docs/usage/embeddings-transformers.md | 35 +++++++++---------- 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py index 465406a49..52af84aaf 100644 --- a/spacy/training/pretrain.py +++ b/spacy/training/pretrain.py @@ -31,6 +31,8 @@ def pretrain( allocator = config["training"]["gpu_allocator"] if use_gpu >= 0 and allocator: set_gpu_allocator(allocator) + # ignore in pretraining because we're creating it now + config["initialize"]["init_tok2vec"] = None nlp = load_model_from_config(config) _config = nlp.config.interpolate() P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain) diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 001455f33..c6cd92799 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -248,7 +248,7 @@ Also see the usage guides on the | `after_init` | Optional callback to modify the `nlp` object after initialization. ~~Optional[Callable[[Language], Language]]~~ | | `before_init` | Optional callback to modify the `nlp` object before initialization. 
~~Optional[Callable[[Language], Language]]~~ | | `components` | Additional arguments passed to the `initialize` method of a pipeline component, keyed by component name. If type annotations are available on the method, the config will be validated against them. The `initialize` methods will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Dict[str, Any]]~~ | -| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | +| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. Ignored when actually running pretraining, as you're creating the file to be used later. ~~Optional[str]~~ | | `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ | | `tokenizer` | Additional arguments passed to the `initialize` method of the specified tokenizer. Can be used for languages like Chinese that depend on dictionaries or trained models for tokenization. If type annotations are available on the method, the config will be validated against them. The `initialize` method will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Any]~~ | | `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vectors`](/api/cli#init-vectors). Defaults to `null`. ~~Optional[str]~~ | diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index febed6f2f..708cdd8bf 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -391,8 +391,8 @@ A wide variety of PyTorch models are supported, but some might not work. If a model doesn't seem to work feel free to open an [issue](https://github.com/explosion/spacy/issues). Additionally note that Transformers loaded in spaCy can only be used for tensors, and pretrained -task-specific heads or text generation features cannot be used as part of -the `transformer` pipeline component. +task-specific heads or text generation features cannot be used as part of the +`transformer` pipeline component. @@ -715,8 +715,8 @@ network for a temporary task that forces the model to learn something about sentence structure and word cooccurrence statistics. Pretraining produces a **binary weights file** that can be loaded back in at the -start of training, using the configuration option `initialize.init_tok2vec`. -The weights file specifies an initial set of weights. Training then proceeds as +start of training, using the configuration option `initialize.init_tok2vec`. The +weights file specifies an initial set of weights. Training then proceeds as normal. You can only pretrain one subnetwork from your pipeline at a time, and the @@ -751,15 +751,14 @@ layer = "tok2vec" #### Connecting pretraining to training {#pretraining-training} -To benefit from pretraining, your training step needs to know to initialize -its `tok2vec` component with the weights learned from the pretraining step. -You do this by setting `initialize.init_tok2vec` to the filename of the -`.bin` file that you want to use from pretraining. +To benefit from pretraining, your training step needs to know to initialize its +`tok2vec` component with the weights learned from the pretraining step. 
You do +this by setting `initialize.init_tok2vec` to the filename of the `.bin` file +that you want to use from pretraining. -A pretraining step that runs for 5 epochs with an output path of `pretrain/`, -as an example, produces `pretrain/model0.bin` through `pretrain/model4.bin`. -To make use of the final output, you could fill in this value in your config -file: +A pretraining step that runs for 5 epochs with an output path of `pretrain/`, as +an example, produces `pretrain/model0.bin` through `pretrain/model4.bin`. To +make use of the final output, you could fill in this value in your config file: ```ini ### config.cfg @@ -773,16 +772,14 @@ init_tok2vec = ${paths.init_tok2vec} -The outputs of `spacy pretrain` are not the same data format as the -pre-packaged static word vectors that would go into -[`initialize.vectors`](/api/data-formats#config-initialize). -The pretraining output consists of the weights that the `tok2vec` -component should start with in an existing pipeline, so it goes in -`initialize.init_tok2vec`. +The outputs of `spacy pretrain` are not the same data format as the pre-packaged +static word vectors that would go into +[`initialize.vectors`](/api/data-formats#config-initialize). The pretraining +output consists of the weights that the `tok2vec` component should start with in +an existing pipeline, so it goes in `initialize.init_tok2vec`. - #### Pretraining objectives {#pretraining-objectives} > ```ini From ea450d652c32f65b947a1e1a498b45f29ed4dc29 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 19 Nov 2021 08:51:19 +0100 Subject: [PATCH 04/20] Exclude strings from v3.2+ source vector checks (#9697) Exclude strings from `Vector.to_bytes()` comparions for v3.2+ `Vectors` that now include the string store so that the source vector comparison is only comparing the vectors and not the strings. 
--- spacy/language.py | 7 +++++-- spacy/training/initialize.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index aa57989ac..204b24ecb 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -701,7 +701,8 @@ class Language: if ( self.vocab.vectors.shape != source.vocab.vectors.shape or self.vocab.vectors.key2row != source.vocab.vectors.key2row - or self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes() + or self.vocab.vectors.to_bytes(exclude=["strings"]) + != source.vocab.vectors.to_bytes(exclude=["strings"]) ): warnings.warn(Warnings.W113.format(name=source_name)) if source_name not in source.component_names: @@ -1822,7 +1823,9 @@ class Language: ) if model not in source_nlp_vectors_hashes: source_nlp_vectors_hashes[model] = hash( - source_nlps[model].vocab.vectors.to_bytes() + source_nlps[model].vocab.vectors.to_bytes( + exclude=["strings"] + ) ) if "_sourced_vectors_hashes" not in nlp.meta: nlp.meta["_sourced_vectors_hashes"] = {} diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 13ccfeb93..084204389 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -132,7 +132,7 @@ def init_vocab( logger.info(f"Added vectors: {vectors}") # warn if source model vectors are not identical sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {}) - vectors_hash = hash(nlp.vocab.vectors.to_bytes()) + vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"])) for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items(): if vectors_hash != sourced_vectors_hash: warnings.warn(Warnings.W113.format(name=sourced_component)) From 0e93b315f3a5f96f2190d7eae7f6085bafe9c747 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 19 Nov 2021 08:51:46 +0100 Subject: [PATCH 05/20] Convert labels to strings for README in package CLI (#9694) --- spacy/cli/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index e76343dc3..76e14daf5 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -397,7 +397,7 @@ def _format_label_scheme(data: Dict[str, Any]) -> str: continue col1 = md.bold(md.code(pipe)) col2 = ", ".join( - [md.code(label.replace("|", "\\|")) for label in labels] + [md.code(str(label).replace("|", "\\|")) for label in labels] ) # noqa: W605 label_data.append((col1, col2)) n_labels += len(labels) From 13645dcbf5b2fe567be41d039c4cc4ebdae79ed6 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 22 Nov 2021 06:43:11 +0100 Subject: [PATCH 06/20] add note that annotating components is new since 3.1 (#9678) --- website/docs/api/data-formats.md | 38 ++++++++++++++++---------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index c6cd92799..c51a6dbca 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -181,25 +181,25 @@ single corpus once and then divide it up into `train` and `dev` partitions. This section defines settings and controls for the training and evaluation process that are used when you run [`spacy train`](/api/cli#train). 
-| Name | Description | -| ----------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | -| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | -| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | -| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | -| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | -| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | -| `annotating_components` | Pipeline component names that should set annotations on the predicted docs during training. See [here](/usage/training#annotating-components) for details. Defaults to `[]`. ~~List[str]~~ | -| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | -| `logger` | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ | -| `max_epochs` | Maximum number of epochs to train for. `0` means an unlimited number of epochs. `-1` means that the train corpus should be streamed rather than loaded into memory with no shuffling within the training loop. Defaults to `0`. ~~int~~ | -| `max_steps` | Maximum number of update steps to train for. `0` means an unlimited number of steps. Defaults to `20000`. ~~int~~ | -| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `patience` | How many steps to continue without improvement in evaluation score. `0` disables early stopping. Defaults to `1600`. ~~int~~ | -| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | -| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | -| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. 
~~str~~ | +| Name | Description | +| ---------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | +| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | +| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | +| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | +| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | +| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `annotating_components` 3.1 | Pipeline component names that should set annotations on the predicted docs during training. See [here](/usage/training#annotating-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | +| `logger` | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ | +| `max_epochs` | Maximum number of epochs to train for. `0` means an unlimited number of epochs. `-1` means that the train corpus should be streamed rather than loaded into memory with no shuffling within the training loop. Defaults to `0`. ~~int~~ | +| `max_steps` | Maximum number of update steps to train for. `0` means an unlimited number of steps. Defaults to `20000`. ~~int~~ | +| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `patience` | How many steps to continue without improvement in evaluation score. `0` disables early stopping. Defaults to `1600`. ~~int~~ | +| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | +| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | +| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. 
~~str~~ | ### pretraining {#config-pretraining tag="section,optional"} From 52b8c2d2e0241e1c515131c5e5f576d5dad65059 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 22 Nov 2021 10:06:07 +0000 Subject: [PATCH 07/20] Add note on batch contract for listeners (#9691) * Add note on batch contract Using listeners requires batches to be consistent. This is obvious if you understand how the listener works, but it wasn't clearly stated in the Docs, and was subtle enough that the EntityLinker missed it. There is probably a clearer way to explain what the actual requirement is, but I figure this is a good start. * Rewrite to clarify role of caching --- website/docs/api/architectures.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 01ca4540b..44ba94d9e 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -124,6 +124,14 @@ Instead of defining its own `Tok2Vec` instance, a model architecture like [Tagger](/api/architectures#tagger) can define a listener as its `tok2vec` argument that connects to the shared `tok2vec` component in the pipeline. +Listeners work by caching the `Tok2Vec` output for a given batch of `Doc`s. This +means that in order for a component to work with the listener, the batch of +`Doc`s passed to the listener must be the same as the batch of `Doc`s passed to +the `Tok2Vec`. As a result, any manipulation of the `Doc`s which would affect +`Tok2Vec` output, such as to create special contexts or remove `Doc`s for which +no prediction can be made, must happen inside the model, **after** the call to +the `Tok2Vec` component. + | Name | Description | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `width` | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. 
~~int~~ | From 36c70479468b10e1c8578a5a75dec9e908340a6f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 23 Nov 2021 14:55:55 +0100 Subject: [PATCH 08/20] Use reference parse to initialize parser moves (#9722) --- spacy/pipeline/_parser_internals/arc_eager.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index f34975858..ddcc911c8 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -585,7 +585,10 @@ cdef class ArcEager(TransitionSystem): actions[RIGHT][label] = 1 actions[REDUCE][label] = 1 for example in kwargs.get('examples', []): - heads, labels = example.get_aligned_parse(projectivize=True) + # use heads and labels from the reference parse (without regard to + # misalignments between the predicted and reference) + example_gold_preproc = Example(example.reference, example.reference) + heads, labels = example_gold_preproc.get_aligned_parse(projectivize=True) for child, (head, label) in enumerate(zip(heads, labels)): if head is None or label is None: continue From a77f50baa43029d3676fdaa6079e0635444de21b Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 23 Nov 2021 15:17:19 +0100 Subject: [PATCH 09/20] Allow Scorer.score_spans to handle pred docs with missing annotation (#9701) If the predicted docs are missing annotation according to `has_annotation`, treat the docs as having no predictions rather than raising errors when the annotation is missing. The motivation for this is a combined tokenization+sents scorer for a component where the sents annotation is optional. To provide a single scorer in the component factory, it needs to be possible for the scorer to continue despite missing sents annotation in the case where the component is not annotating sents. 
--- spacy/scorer.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/spacy/scorer.py b/spacy/scorer.py index cfdf34e62..4d596b5e1 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -359,14 +359,15 @@ class Scorer: pred_doc = example.predicted gold_doc = example.reference # Option to handle docs without annotation for this attribute - if has_annotation is not None: - if not has_annotation(gold_doc): - continue - # Find all labels in gold and doc - labels = set( - [k.label_ for k in getter(gold_doc, attr)] - + [k.label_ for k in getter(pred_doc, attr)] - ) + if has_annotation is not None and not has_annotation(gold_doc): + continue + # Find all labels in gold + labels = set([k.label_ for k in getter(gold_doc, attr)]) + # If labeled, find all labels in pred + if has_annotation is None or ( + has_annotation is not None and has_annotation(pred_doc) + ): + labels |= set([k.label_ for k in getter(pred_doc, attr)]) # Set up all labels for per type scoring and prepare gold per type gold_per_type: Dict[str, Set] = {label: set() for label in labels} for label in labels: @@ -384,16 +385,19 @@ class Scorer: gold_spans.add(gold_span) gold_per_type[span.label_].add(gold_span) pred_per_type: Dict[str, Set] = {label: set() for label in labels} - for span in example.get_aligned_spans_x2y( - getter(pred_doc, attr), allow_overlap + if has_annotation is None or ( + has_annotation is not None and has_annotation(pred_doc) ): - pred_span: Tuple - if labeled: - pred_span = (span.label_, span.start, span.end - 1) - else: - pred_span = (span.start, span.end - 1) - pred_spans.add(pred_span) - pred_per_type[span.label_].add(pred_span) + for span in example.get_aligned_spans_x2y( + getter(pred_doc, attr), allow_overlap + ): + pred_span: Tuple + if labeled: + pred_span = (span.label_, span.start, span.end - 1) + else: + pred_span = (span.start, span.end - 1) + pred_spans.add(pred_span) + pred_per_type[span.label_].add(pred_span) # Scores per label if labeled: for k, v in score_per_type.items(): From 9ac6d4991eb34d47f2e42bf7418918d49cf76219 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 23 Nov 2021 15:33:33 +0100 Subject: [PATCH 10/20] Add doc_cleaner component (#9659) * Add doc_cleaner component * Fix types * Fix loop * Rephrase method description --- spacy/errors.py | 1 + spacy/pipeline/functions.py | 64 ++++++++++++++++++++++++++ spacy/tests/pipeline/test_functions.py | 25 ++++++++++ website/docs/api/pipeline-functions.md | 22 +++++++++ 4 files changed, 112 insertions(+) diff --git a/spacy/errors.py b/spacy/errors.py index 5fe550145..84c407422 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -191,6 +191,7 @@ class Warnings(metaclass=ErrorsWithCodes): "lead to errors.") W115 = ("Skipping {method}: the floret vector table cannot be modified. 
" "Vectors are calculated from character ngrams.") + W116 = ("Unable to clean attribute '{attr}'.") class Errors(metaclass=ErrorsWithCodes): diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py index f0a75dc2c..c005395bf 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -1,6 +1,8 @@ from typing import Dict, Any import srsly +import warnings +from ..errors import Warnings from ..language import Language from ..matcher import Matcher from ..tokens import Doc @@ -136,3 +138,65 @@ class TokenSplitter: "cfg": lambda p: self._set_config(srsly.read_json(p)), } util.from_disk(path, serializers, []) + + +@Language.factory( + "doc_cleaner", + default_config={"attrs": {"tensor": None, "_.trf_data": None}, "silent": True}, +) +def make_doc_cleaner(nlp: Language, name: str, *, attrs: Dict[str, Any], silent: bool): + return DocCleaner(attrs, silent=silent) + + +class DocCleaner: + def __init__(self, attrs: Dict[str, Any], *, silent: bool = True): + self.cfg: Dict[str, Any] = {"attrs": dict(attrs), "silent": silent} + + def __call__(self, doc: Doc) -> Doc: + attrs: dict = self.cfg["attrs"] + silent: bool = self.cfg["silent"] + for attr, value in attrs.items(): + obj = doc + parts = attr.split(".") + skip = False + for part in parts[:-1]: + if hasattr(obj, part): + obj = getattr(obj, part) + else: + skip = True + if not silent: + warnings.warn(Warnings.W116.format(attr=attr)) + if not skip: + if hasattr(obj, parts[-1]): + setattr(obj, parts[-1], value) + else: + if not silent: + warnings.warn(Warnings.W116.format(attr=attr)) + return doc + + def to_bytes(self, **kwargs): + serializers = { + "cfg": lambda: srsly.json_dumps(self.cfg), + } + return util.to_bytes(serializers, []) + + def from_bytes(self, data, **kwargs): + deserializers = { + "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), + } + util.from_bytes(data, deserializers, []) + return self + + def to_disk(self, path, **kwargs): + path = util.ensure_path(path) + serializers = { + "cfg": lambda p: srsly.write_json(p, self.cfg), + } + return util.to_disk(path, serializers, []) + + def from_disk(self, path, **kwargs): + path = util.ensure_path(path) + serializers = { + "cfg": lambda p: self.cfg.update(srsly.read_json(p)), + } + util.from_disk(path, serializers, []) diff --git a/spacy/tests/pipeline/test_functions.py b/spacy/tests/pipeline/test_functions.py index 454d7b08b..e4adfe2fe 100644 --- a/spacy/tests/pipeline/test_functions.py +++ b/spacy/tests/pipeline/test_functions.py @@ -3,6 +3,8 @@ from spacy.pipeline.functions import merge_subtokens from spacy.language import Language from spacy.tokens import Span, Doc +from ..doc.test_underscore import clean_underscore # noqa: F401 + @pytest.fixture def doc(en_vocab): @@ -74,3 +76,26 @@ def test_token_splitter(): "i", ] assert all(len(t.text) <= token_splitter.split_length for t in doc) + + +@pytest.mark.usefixtures("clean_underscore") +def test_factories_doc_cleaner(): + nlp = Language() + nlp.add_pipe("doc_cleaner") + doc = nlp.make_doc("text") + doc.tensor = [1, 2, 3] + doc = nlp(doc) + assert doc.tensor is None + + nlp = Language() + nlp.add_pipe("doc_cleaner", config={"silent": False}) + with pytest.warns(UserWarning): + doc = nlp("text") + + Doc.set_extension("test_attr", default=-1) + nlp = Language() + nlp.add_pipe("doc_cleaner", config={"attrs": {"_.test_attr": 0}}) + doc = nlp.make_doc("text") + doc._.test_attr = 100 + doc = nlp(doc) + assert doc._.test_attr == 0 diff --git a/website/docs/api/pipeline-functions.md 
b/website/docs/api/pipeline-functions.md index a776eca9b..ff19d3e71 100644 --- a/website/docs/api/pipeline-functions.md +++ b/website/docs/api/pipeline-functions.md @@ -130,3 +130,25 @@ exceed the transformer model max length. | `min_length` | The minimum length for a token to be split. Defaults to `25`. ~~int~~ | | `split_length` | The length of the split tokens. Defaults to `5`. ~~int~~ | | **RETURNS** | The modified `Doc` with the split tokens. ~~Doc~~ | + +## doc_cleaner {#doc_cleaner tag="function" new="3.2.1"} + +Clean up `Doc` attributes. Intended for use at the end of pipelines with +`tok2vec` or `transformer` pipeline components that store tensors and other +values that can require a lot of memory and frequently aren't needed after the +whole pipeline has run. + +> #### Example +> +> ```python +> config = {"attrs": {"tensor": None}} +> nlp.add_pipe("doc_cleaner", config=config) +> doc = nlp("text") +> assert doc.tensor is None +> ``` + +| Setting | Description | +| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `attrs` | A dict of the `Doc` attributes and the values to set them to. Defaults to `{"tensor": None, "_.trf_data": None}` to clean up after `tok2vec` and `transformer` components. ~~dict~~ | +| `silent` | If `False`, show warnings if attributes aren't found or can't be set. Defaults to `True`. ~~bool~~ | +| **RETURNS** | The modified `Doc` with the modified attributes. ~~Doc~~ | From a7d7e80adb9f325efa209ef0deb7365bdc76ee04 Mon Sep 17 00:00:00 2001 From: Duygu Altinok Date: Tue, 23 Nov 2021 16:26:05 +0100 Subject: [PATCH 11/20] EntityRuler improve disk load error message (#9658) * added error string * added serialization test * added more to if statements * wrote file to tempdir * added tempdir * changed parameter a bit * Update spacy/tests/pipeline/test_entity_ruler.py Co-authored-by: Sofie Van Landeghem --- spacy/errors.py | 1 + spacy/pipeline/entityruler.py | 12 ++++++++++-- spacy/tests/pipeline/test_entity_ruler.py | 22 ++++++++++++++++++++++ 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 84c407422..c5e364013 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -888,6 +888,7 @@ class Errors(metaclass=ErrorsWithCodes): E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. " "Non-UD tags should use the `tag` property.") E1022 = ("Words must be of type str or int, but input is of type '{wtype}'") + E1023 = ("Couldn't read EntityRuler from the {path}. 
This file doesn't exist.") # Deprecated model shortcuts, only used in errors and warnings diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 2c3db2575..78d7a0be2 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -431,10 +431,16 @@ class EntityRuler(Pipe): path = ensure_path(path) self.clear() depr_patterns_path = path.with_suffix(".jsonl") - if depr_patterns_path.is_file(): + if path.suffix == ".jsonl": # user provides a jsonl + if path.is_file: + patterns = srsly.read_jsonl(path) + self.add_patterns(patterns) + else: + raise ValueError(Errors.E1023.format(path=path)) + elif depr_patterns_path.is_file(): patterns = srsly.read_jsonl(depr_patterns_path) self.add_patterns(patterns) - else: + elif path.is_dir(): # path is a valid directory cfg = {} deserializers_patterns = { "patterns": lambda p: self.add_patterns( @@ -451,6 +457,8 @@ class EntityRuler(Pipe): self.nlp.vocab, attr=self.phrase_matcher_attr ) from_disk(path, deserializers_patterns, {}) + else: # path is not a valid directory or file + raise ValueError(Errors.E146.format(path=path)) return self def to_disk( diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index dc0ca0301..e66b49518 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -5,6 +5,8 @@ from spacy.tokens import Span from spacy.language import Language from spacy.pipeline import EntityRuler from spacy.errors import MatchPatternError +from spacy.tests.util import make_tempdir + from thinc.api import NumpyOps, get_current_ops @@ -238,3 +240,23 @@ def test_entity_ruler_multiprocessing(nlp, n_process): for doc in nlp.pipe(texts, n_process=2): for ent in doc.ents: assert ent.ent_id_ == "1234" + + +def test_entity_ruler_serialize_jsonl(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler") + ruler.add_patterns(patterns) + with make_tempdir() as d: + ruler.to_disk(d / "test_ruler.jsonl") + ruler.from_disk(d / "test_ruler.jsonl") # read from an existing jsonl file + with pytest.raises(ValueError): + ruler.from_disk(d / "non_existing.jsonl") # read from a bad jsonl file + + +def test_entity_ruler_serialize_dir(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler") + ruler.add_patterns(patterns) + with make_tempdir() as d: + ruler.to_disk(d / "test_ruler") + ruler.from_disk(d / "test_ruler") # read from an existing directory + with pytest.raises(ValueError): + ruler.from_disk(d / "non_existing_dir") # read from a bad directory From 0bbf86bba8f596f0cbf0132527ab2f767343c488 Mon Sep 17 00:00:00 2001 From: Valentin-Gabriel Soumah <60576980+Pantalaymon@users.noreply.github.com> Date: Tue, 23 Nov 2021 17:29:23 +0100 Subject: [PATCH 12/20] Create Pantalaymon.md Submitting agreement to spacy in order to contribute to Coreferee project . --- .github/contributors/Pantalaymon.md | 106 ++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/Pantalaymon.md diff --git a/.github/contributors/Pantalaymon.md b/.github/contributors/Pantalaymon.md new file mode 100644 index 000000000..f017f2947 --- /dev/null +++ b/.github/contributors/Pantalaymon.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). 
+The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name |Valentin-Gabriel Soumah| +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2021-11-23 | +| GitHub username | Pantalaymon | +| Website (optional) | | From a4c43e5c577d7a143ef7e2fd74ccea33aace96b7 Mon Sep 17 00:00:00 2001 From: Natalia Rodnova <4512370+nrodnova@users.noreply.github.com> Date: Wed, 24 Nov 2021 02:37:10 -0700 Subject: [PATCH 13/20] Allow Matcher to match on ENT_ID and ENT_KB_ID (#9688) * Added ENT_ID and ENT_KB_ID into the list of the attributes that Matcher matches on * Added ENT_ID and ENT_KB_ID to TEST_PATTERNS in test_pattern_validation.py. 
Disabled tests that I added before * Update website/docs/api/matcher.md * Format * Remove skipped tests Co-authored-by: Adriane Boyd --- spacy/schemas.py | 2 ++ spacy/tests/matcher/test_pattern_validation.py | 4 ++++ website/docs/api/matcher.md | 2 ++ 3 files changed, 8 insertions(+) diff --git a/spacy/schemas.py b/spacy/schemas.py index b3ea11d8b..cf58688ef 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -222,6 +222,8 @@ class TokenPattern(BaseModel): lemma: Optional[StringValue] = None shape: Optional[StringValue] = None ent_type: Optional[StringValue] = None + ent_id: Optional[StringValue] = None + ent_kb_id: Optional[StringValue] = None norm: Optional[StringValue] = None length: Optional[NumberValue] = None spacy: Optional[StrictBool] = None diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py index 4d21aea81..74feb7c5d 100644 --- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -22,6 +22,8 @@ TEST_PATTERNS = [ ([{"TEXT": {"VALUE": "foo"}}], 2, 0), # prev: (1, 0) ([{"IS_DIGIT": -1}], 1, 0), ([{"ORTH": -1}], 1, 0), + ([{"ENT_ID": -1}], 1, 0), + ([{"ENT_KB_ID": -1}], 1, 0), # Good patterns ([{"TEXT": "foo"}, {"LOWER": "bar"}], 0, 0), ([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0, 0), @@ -33,6 +35,8 @@ TEST_PATTERNS = [ ([{"orth": "foo"}], 0, 0), # prev: xfail ([{"IS_SENT_START": True}], 0, 0), ([{"SENT_START": True}], 0, 0), + ([{"ENT_ID": "STRING"}], 0, 0), + ([{"ENT_KB_ID": "STRING"}], 0, 0), ] diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index c34560dec..803105ba2 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -44,6 +44,8 @@ rule-based matching are: | `SPACY` | Token has a trailing space. ~~bool~~ | |  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ | | `ENT_TYPE` | The token's entity label. ~~str~~ | +| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ | +| `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ | | `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ | | `OP` | Operator or quantifier to determine how often to match a token pattern. ~~str~~ | From 5c445332632079489acf214a675f0a193b383915 Mon Sep 17 00:00:00 2001 From: Tuomo Hiippala Date: Sun, 28 Nov 2021 12:33:16 +0200 Subject: [PATCH 14/20] add entry for Applied Language Technology under "Courses" (#9755) Added the following entry into `universe.json`: ``` { "type": "education", "id": "applt-course", "title": "Applied Language Technology", "slogan": "NLP for newcomers using spaCy and Stanza", "description": "These learning materials provide an introduction to applied language technology for audiences who are unfamiliar with language technology and programming. 
The learning materials assume no previous knowledge of the Python programming language.", "url": "https://applied-language-technology.readthedocs.io/", "image": "https://www.mv.helsinki.fi/home/thiippal/images/applt-preview.jpg", "thumb": "https://applied-language-technology.readthedocs.io/en/latest/_static/logo.png", "author": "Tuomo Hiippala", "author_links": { "twitter": "tuomo_h", "github": "thiippal", "website": "https://www.mv.helsinki.fi/home/thiippal/" }, "category": ["courses"] }, ``` --- website/meta/universe.json | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 7f3813a95..d11b0e8c5 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1752,6 +1752,23 @@ }, "category": ["courses"] }, + { + "type": "education", + "id": "applt-course", + "title": "Applied Language Technology", + "slogan": "NLP for newcomers using spaCy and Stanza", + "description": "These learning materials provide an introduction to applied language technology for audiences who are unfamiliar with language technology and programming. The learning materials assume no previous knowledge of the Python programming language.", + "url": "https://applied-language-technology.readthedocs.io/", + "image": "https://www.mv.helsinki.fi/home/thiippal/images/applt-preview.jpg", + "thumb": "https://applied-language-technology.readthedocs.io/en/latest/_static/logo.png", + "author": "Tuomo Hiippala", + "author_links": { + "twitter": "tuomo_h", + "github": "thiippal", + "website": "https://www.mv.helsinki.fi/home/thiippal/" + }, + "category": ["courses"] + }, { "type": "education", "id": "video-spacys-ner-model", From 7b134b8fbd64bd8cfad2a0ecd7be9b6a7d7a907d Mon Sep 17 00:00:00 2001 From: Richard Hudson Date: Sun, 28 Nov 2021 21:59:23 +0100 Subject: [PATCH 15/20] New tests for a number of alpha languages (#9703) * Added Slovak * Added Slovenian tests * Added Estonian tests * Added Croatian tests * Added Latvian tests * Added Icelandic tests * Added Afrikaans tests * Added language-independent tests * Added Kannada tests * Tidied up * Added Albanian tests * Formatted with black * Added failing tests for anomalies * Update spacy/tests/lang/af/test_text.py Co-authored-by: Sofie Van Landeghem * Added context to failing Estonian tokenizer test Co-authored-by: Sofie Van Landeghem * Added context to failing Croatian tokenizer test Co-authored-by: Sofie Van Landeghem * Added context to failing Icelandic tokenizer test Co-authored-by: Sofie Van Landeghem * Added context to failing Latvian tokenizer test Co-authored-by: Sofie Van Landeghem * Added context to failing Slovak tokenizer test Co-authored-by: Sofie Van Landeghem * Added context to failing Slovenian tokenizer test Co-authored-by: Sofie Van Landeghem Co-authored-by: Sofie Van Landeghem --- spacy/tests/conftest.py | 40 ++++++++++++++++++++++ spacy/tests/lang/af/__init__.py | 0 spacy/tests/lang/af/test_text.py | 22 ++++++++++++ spacy/tests/lang/af/test_tokenizer.py | 29 ++++++++++++++++ spacy/tests/lang/et/__init__.py | 0 spacy/tests/lang/et/test_text.py | 26 +++++++++++++++ spacy/tests/lang/et/test_tokenizer.py | 29 ++++++++++++++++ spacy/tests/lang/hr/__init__.py | 0 spacy/tests/lang/hr/test_text.py | 26 +++++++++++++++ spacy/tests/lang/hr/test_tokenizer.py | 31 +++++++++++++++++ spacy/tests/lang/is/__init__.py | 0 spacy/tests/lang/is/test_text.py | 26 +++++++++++++++ spacy/tests/lang/is/test_tokenizer.py | 30 +++++++++++++++++ spacy/tests/lang/lv/__init__.py | 0 
spacy/tests/lang/lv/test_text.py | 27 +++++++++++++++ spacy/tests/lang/lv/test_tokenizer.py | 30 +++++++++++++++++ spacy/tests/lang/sk/__init__.py | 0 spacy/tests/lang/sk/test_text.py | 48 +++++++++++++++++++++++++++ spacy/tests/lang/sk/test_tokenizer.py | 15 +++++++++ spacy/tests/lang/sl/__init__.py | 0 spacy/tests/lang/sl/test_text.py | 27 +++++++++++++++ spacy/tests/lang/sl/test_tokenizer.py | 32 ++++++++++++++++++ spacy/tests/lang/sq/__init__.py | 0 spacy/tests/lang/sq/test_text.py | 25 ++++++++++++++ spacy/tests/lang/sq/test_tokenizer.py | 31 +++++++++++++++++ spacy/tests/lang/xx/__init__.py | 0 spacy/tests/lang/xx/test_text.py | 24 ++++++++++++++ spacy/tests/lang/xx/test_tokenizer.py | 25 ++++++++++++++ 28 files changed, 543 insertions(+) create mode 100644 spacy/tests/lang/af/__init__.py create mode 100644 spacy/tests/lang/af/test_text.py create mode 100644 spacy/tests/lang/af/test_tokenizer.py create mode 100644 spacy/tests/lang/et/__init__.py create mode 100644 spacy/tests/lang/et/test_text.py create mode 100644 spacy/tests/lang/et/test_tokenizer.py create mode 100644 spacy/tests/lang/hr/__init__.py create mode 100644 spacy/tests/lang/hr/test_text.py create mode 100644 spacy/tests/lang/hr/test_tokenizer.py create mode 100644 spacy/tests/lang/is/__init__.py create mode 100644 spacy/tests/lang/is/test_text.py create mode 100644 spacy/tests/lang/is/test_tokenizer.py create mode 100644 spacy/tests/lang/lv/__init__.py create mode 100644 spacy/tests/lang/lv/test_text.py create mode 100644 spacy/tests/lang/lv/test_tokenizer.py create mode 100644 spacy/tests/lang/sk/__init__.py create mode 100644 spacy/tests/lang/sk/test_text.py create mode 100644 spacy/tests/lang/sk/test_tokenizer.py create mode 100644 spacy/tests/lang/sl/__init__.py create mode 100644 spacy/tests/lang/sl/test_text.py create mode 100644 spacy/tests/lang/sl/test_tokenizer.py create mode 100644 spacy/tests/lang/sq/__init__.py create mode 100644 spacy/tests/lang/sq/test_text.py create mode 100644 spacy/tests/lang/sq/test_tokenizer.py create mode 100644 spacy/tests/lang/xx/__init__.py create mode 100644 spacy/tests/lang/xx/test_text.py create mode 100644 spacy/tests/lang/xx/test_tokenizer.py diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 88c7adfe3..ffca79bb9 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -49,6 +49,11 @@ def tokenizer(): return get_lang_class("xx")().tokenizer +@pytest.fixture(scope="session") +def af_tokenizer(): + return get_lang_class("af")().tokenizer + + @pytest.fixture(scope="session") def am_tokenizer(): return get_lang_class("am")().tokenizer @@ -125,6 +130,11 @@ def es_vocab(): return get_lang_class("es")().vocab +@pytest.fixture(scope="session") +def et_tokenizer(): + return get_lang_class("et")().tokenizer + + @pytest.fixture(scope="session") def eu_tokenizer(): return get_lang_class("eu")().tokenizer @@ -185,6 +195,11 @@ def id_tokenizer(): return get_lang_class("id")().tokenizer +@pytest.fixture(scope="session") +def is_tokenizer(): + return get_lang_class("is")().tokenizer + + @pytest.fixture(scope="session") def it_tokenizer(): return get_lang_class("it")().tokenizer @@ -212,6 +227,11 @@ def lt_tokenizer(): return get_lang_class("lt")().tokenizer +@pytest.fixture(scope="session") +def lv_tokenizer(): + return get_lang_class("lv")().tokenizer + + @pytest.fixture(scope="session") def mk_tokenizer(): return get_lang_class("mk")().tokenizer @@ -279,11 +299,26 @@ def sa_tokenizer(): return get_lang_class("sa")().tokenizer +@pytest.fixture(scope="session") 
+def sk_tokenizer(): + return get_lang_class("sk")().tokenizer + + +@pytest.fixture(scope="session") +def sl_tokenizer(): + return get_lang_class("sl")().tokenizer + + @pytest.fixture(scope="session") def sr_tokenizer(): return get_lang_class("sr")().tokenizer +@pytest.fixture(scope="session") +def sq_tokenizer(): + return get_lang_class("sq")().tokenizer + + @pytest.fixture(scope="session") def sv_tokenizer(): return get_lang_class("sv")().tokenizer @@ -344,6 +379,11 @@ def vi_tokenizer(): return get_lang_class("vi")().tokenizer +@pytest.fixture(scope="session") +def xx_tokenizer(): + return get_lang_class("xx")().tokenizer + + @pytest.fixture(scope="session") def yo_tokenizer(): return get_lang_class("yo")().tokenizer diff --git a/spacy/tests/lang/af/__init__.py b/spacy/tests/lang/af/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/af/test_text.py b/spacy/tests/lang/af/test_text.py new file mode 100644 index 000000000..99c2a9f4c --- /dev/null +++ b/spacy/tests/lang/af/test_text.py @@ -0,0 +1,22 @@ +import pytest + + +def test_long_text(af_tokenizer): + # Excerpt: Universal Declaration of Human Rights; “'n” changed to “die” in first sentence + text = """ +Hierdie Universele Verklaring van Menseregte as die algemene standaard vir die verwesenliking deur alle mense en nasies, +om te verseker dat elke individu en elke deel van die gemeenskap hierdie Verklaring in ag sal neem en deur opvoeding, +respek vir hierdie regte en vryhede te bevorder, op nasionale en internasionale vlak, daarna sal strewe om die universele +en effektiewe erkenning en agting van hierdie regte te verseker, nie net vir die mense van die Lidstate nie, maar ook vir +die mense in die gebiede onder hul jurisdiksie. + +""" + tokens = af_tokenizer(text) + assert len(tokens) == 100 + + +@pytest.mark.xfail +def test_indefinite_article(af_tokenizer): + text = "as 'n algemene standaard" + tokens = af_tokenizer(text) + assert len(tokens) == 4 diff --git a/spacy/tests/lang/af/test_tokenizer.py b/spacy/tests/lang/af/test_tokenizer.py new file mode 100644 index 000000000..db52db5e3 --- /dev/null +++ b/spacy/tests/lang/af/test_tokenizer.py @@ -0,0 +1,29 @@ +import pytest + +AF_BASIC_TOKENIZATION_TESTS = [ + ( + "Elkeen het die reg tot lewe, vryheid en sekuriteit van persoon.", + [ + "Elkeen", + "het", + "die", + "reg", + "tot", + "lewe", + ",", + "vryheid", + "en", + "sekuriteit", + "van", + "persoon", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", AF_BASIC_TOKENIZATION_TESTS) +def test_af_tokenizer_basic(af_tokenizer, text, expected_tokens): + tokens = af_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/et/__init__.py b/spacy/tests/lang/et/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/et/test_text.py b/spacy/tests/lang/et/test_text.py new file mode 100644 index 000000000..9515a7cc1 --- /dev/null +++ b/spacy/tests/lang/et/test_text.py @@ -0,0 +1,26 @@ +import pytest + + +def test_long_text(et_tokenizer): + # Excerpt: European Convention on Human Rights + text = """ +arvestades, et nimetatud deklaratsiooni eesmärk on tagada selles +kuulutatud õiguste üldine ja tõhus tunnustamine ning järgimine; +arvestades, et Euroopa Nõukogu eesmärk on saavutada tema +liikmete suurem ühtsus ning et üheks selle eesmärgi saavutamise +vahendiks on inimõiguste ja põhivabaduste järgimine ning +elluviimine; +taaskinnitades oma sügavat usku 
neisse põhivabadustesse, mis +on õigluse ja rahu aluseks maailmas ning mida kõige paremini +tagab ühelt poolt tõhus poliitiline demokraatia ning teiselt poolt +inimõiguste, millest nad sõltuvad, üldine mõistmine ja järgimine; +""" + tokens = et_tokenizer(text) + assert len(tokens) == 94 + + +@pytest.mark.xfail +def test_ordinal_number(et_tokenizer): + text = "10. detsembril 1948" + tokens = et_tokenizer(text) + assert len(tokens) == 3 diff --git a/spacy/tests/lang/et/test_tokenizer.py b/spacy/tests/lang/et/test_tokenizer.py new file mode 100644 index 000000000..f0f8079ca --- /dev/null +++ b/spacy/tests/lang/et/test_tokenizer.py @@ -0,0 +1,29 @@ +import pytest + +ET_BASIC_TOKENIZATION_TESTS = [ + ( + "Kedagi ei või piinata ega ebainimlikult või alandavalt kohelda " + "ega karistada.", + [ + "Kedagi", + "ei", + "või", + "piinata", + "ega", + "ebainimlikult", + "või", + "alandavalt", + "kohelda", + "ega", + "karistada", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", ET_BASIC_TOKENIZATION_TESTS) +def test_et_tokenizer_basic(et_tokenizer, text, expected_tokens): + tokens = et_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/hr/__init__.py b/spacy/tests/lang/hr/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/hr/test_text.py b/spacy/tests/lang/hr/test_text.py new file mode 100644 index 000000000..82e65afe7 --- /dev/null +++ b/spacy/tests/lang/hr/test_text.py @@ -0,0 +1,26 @@ +import pytest + + +def test_long_text(hr_tokenizer): + # Excerpt: European Convention on Human Rights + text = """ +uzimajući u obzir da ta deklaracija nastoji osigurati opće i djelotvorno +priznanje i poštovanje u njoj proglašenih prava; +uzimajući u obzir da je cilj Vijeća Europe postizanje većeg jedinstva +njegovih članica, i da je jedan od načina postizanja toga cilja +očuvanje i daljnje ostvarivanje ljudskih prava i temeljnih sloboda; +potvrđujući svoju duboku privrženost tim temeljnim slobodama +koje su osnova pravde i mira u svijetu i koje su najbolje zaštićene +istinskom političkom demokracijom s jedne strane te zajedničkim +razumijevanjem i poštovanjem ljudskih prava o kojima te slobode +ovise s druge strane; +""" + tokens = hr_tokenizer(text) + assert len(tokens) == 105 + + +@pytest.mark.xfail +def test_ordinal_number(hr_tokenizer): + text = "10. 
prosinca 1948" + tokens = hr_tokenizer(text) + assert len(tokens) == 3 diff --git a/spacy/tests/lang/hr/test_tokenizer.py b/spacy/tests/lang/hr/test_tokenizer.py new file mode 100644 index 000000000..dace33b2d --- /dev/null +++ b/spacy/tests/lang/hr/test_tokenizer.py @@ -0,0 +1,31 @@ +import pytest + +HR_BASIC_TOKENIZATION_TESTS = [ + ( + "Nitko se ne smije podvrgnuti mučenju ni nečovječnom ili " + "ponižavajućem postupanju ili kazni.", + [ + "Nitko", + "se", + "ne", + "smije", + "podvrgnuti", + "mučenju", + "ni", + "nečovječnom", + "ili", + "ponižavajućem", + "postupanju", + "ili", + "kazni", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", HR_BASIC_TOKENIZATION_TESTS) +def test_hr_tokenizer_basic(hr_tokenizer, text, expected_tokens): + tokens = hr_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/is/__init__.py b/spacy/tests/lang/is/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/is/test_text.py b/spacy/tests/lang/is/test_text.py new file mode 100644 index 000000000..6e3654a6e --- /dev/null +++ b/spacy/tests/lang/is/test_text.py @@ -0,0 +1,26 @@ +import pytest + + +def test_long_text(is_tokenizer): + # Excerpt: European Convention on Human Rights + text = """ +hafa í huga, að yfirlýsing þessi hefur það markmið að tryggja +almenna og raunhæfa viðurkenningu og vernd þeirra réttinda, +sem þar er lýst; +hafa í huga, að markmið Evrópuráðs er að koma á nánari einingu +aðildarríkjanna og að ein af leiðunum að því marki er sú, að +mannréttindi og mannfrelsi séu í heiðri höfð og efld; +lýsa á ný eindreginni trú sinni á það mannfrelsi, sem er undirstaða +réttlætis og friðar í heiminum og best er tryggt, annars vegar með +virku, lýðræðislegu stjórnarfari og, hins vegar, almennum skilningi +og varðveislu þeirra mannréttinda, sem eru grundvöllur frelsisins; +""" + tokens = is_tokenizer(text) + assert len(tokens) == 120 + + +@pytest.mark.xfail +def test_ordinal_number(is_tokenizer): + text = "10. desember 1948" + tokens = is_tokenizer(text) + assert len(tokens) == 3 diff --git a/spacy/tests/lang/is/test_tokenizer.py b/spacy/tests/lang/is/test_tokenizer.py new file mode 100644 index 000000000..0c05a6050 --- /dev/null +++ b/spacy/tests/lang/is/test_tokenizer.py @@ -0,0 +1,30 @@ +import pytest + +IS_BASIC_TOKENIZATION_TESTS = [ + ( + "Enginn maður skal sæta pyndingum eða ómannlegri eða " + "vanvirðandi meðferð eða refsingu. 
", + [ + "Enginn", + "maður", + "skal", + "sæta", + "pyndingum", + "eða", + "ómannlegri", + "eða", + "vanvirðandi", + "meðferð", + "eða", + "refsingu", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", IS_BASIC_TOKENIZATION_TESTS) +def test_is_tokenizer_basic(is_tokenizer, text, expected_tokens): + tokens = is_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/lv/__init__.py b/spacy/tests/lang/lv/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/lv/test_text.py b/spacy/tests/lang/lv/test_text.py new file mode 100644 index 000000000..5ca5fd0a7 --- /dev/null +++ b/spacy/tests/lang/lv/test_text.py @@ -0,0 +1,27 @@ +import pytest + + +def test_long_text(lv_tokenizer): + # Excerpt: European Convention on Human Rights + text = """ +Ievērodamas, ka šī deklarācija paredz nodrošināt vispārēju un +efektīvu tajā pasludināto tiesību atzīšanu un ievērošanu; +Ievērodamas, ka Eiropas Padomes mērķis ir panākt lielāku vienotību +tās dalībvalstu starpā un ka viens no līdzekļiem, kā šo mērķi +sasniegt, ir cilvēka tiesību un pamatbrīvību ievērošana un turpmāka +īstenošana; +No jauna apliecinādamas patiesu pārliecību, ka šīs pamatbrīvības +ir taisnīguma un miera pamats visā pasaulē un ka tās vislabāk var +nodrošināt patiess demokrātisks politisks režīms no vienas puses un +vispārējo cilvēktiesību, uz kurām tās pamatojas, kopīga izpratne un +ievērošana no otras puses; +""" + tokens = lv_tokenizer(text) + assert len(tokens) == 109 + + +@pytest.mark.xfail +def test_ordinal_number(lv_tokenizer): + text = "10. decembrī" + tokens = lv_tokenizer(text) + assert len(tokens) == 2 diff --git a/spacy/tests/lang/lv/test_tokenizer.py b/spacy/tests/lang/lv/test_tokenizer.py new file mode 100644 index 000000000..3ce7ad5fa --- /dev/null +++ b/spacy/tests/lang/lv/test_tokenizer.py @@ -0,0 +1,30 @@ +import pytest + +LV_BASIC_TOKENIZATION_TESTS = [ + ( + "Nevienu nedrīkst spīdzināt vai cietsirdīgi vai pazemojoši ar viņu " + "apieties vai sodīt.", + [ + "Nevienu", + "nedrīkst", + "spīdzināt", + "vai", + "cietsirdīgi", + "vai", + "pazemojoši", + "ar", + "viņu", + "apieties", + "vai", + "sodīt", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", LV_BASIC_TOKENIZATION_TESTS) +def test_lv_tokenizer_basic(lv_tokenizer, text, expected_tokens): + tokens = lv_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/sk/__init__.py b/spacy/tests/lang/sk/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/sk/test_text.py b/spacy/tests/lang/sk/test_text.py new file mode 100644 index 000000000..62ea2a783 --- /dev/null +++ b/spacy/tests/lang/sk/test_text.py @@ -0,0 +1,48 @@ +import pytest + + +def test_long_text(sk_tokenizer): + # Excerpt: European Convention on Human Rights + text = """ +majúc na zreteli, že cieľom tejto deklarácie je zabezpečiť všeobecné +a účinné uznávanie a dodržiavanie práv v nej vyhlásených; +majúc na zreteli, že cieľom Rady Európy je dosiahnutie väčšej +jednoty medzi jej členmi, a že jedným zo spôsobov, ktorým sa +má tento cieľ napĺňať, je ochrana a ďalší rozvoj ľudských práv +a základných slobôd; +znovu potvrdzujúc svoju hlbokú vieru v tie základné slobody, ktoré +sú základom spravodlivosti a mieru vo svete, a ktoré sú najlepšie +zachovávané na jednej strane účinnou politickou demokraciou +a na 
strane druhej spoločným poňatím a dodržiavaním ľudských +práv, od ktorých závisia; + """ + tokens = sk_tokenizer(text) + assert len(tokens) == 118 + + +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("10,000", True), + ("10,00", True), + ("štyri", True), + ("devätnásť", True), + ("milión", True), + ("pes", False), + (",", False), + ("1/2", True), + ], +) +def test_lex_attrs_like_number(sk_tokenizer, text, match): + tokens = sk_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match + + +@pytest.mark.xfail +def test_ordinal_number(sk_tokenizer): + text = "10. decembra 1948" + tokens = sk_tokenizer(text) + assert len(tokens) == 3 diff --git a/spacy/tests/lang/sk/test_tokenizer.py b/spacy/tests/lang/sk/test_tokenizer.py new file mode 100644 index 000000000..247847284 --- /dev/null +++ b/spacy/tests/lang/sk/test_tokenizer.py @@ -0,0 +1,15 @@ +import pytest + +SK_BASIC_TOKENIZATION_TESTS = [ + ( + "Kedy sa narodil Andrej Kiska?", + ["Kedy", "sa", "narodil", "Andrej", "Kiska", "?"], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", SK_BASIC_TOKENIZATION_TESTS) +def test_sk_tokenizer_basic(sk_tokenizer, text, expected_tokens): + tokens = sk_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/sl/__init__.py b/spacy/tests/lang/sl/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/sl/test_text.py b/spacy/tests/lang/sl/test_text.py new file mode 100644 index 000000000..ddc5b6b5d --- /dev/null +++ b/spacy/tests/lang/sl/test_text.py @@ -0,0 +1,27 @@ +import pytest + + +def test_long_text(sl_tokenizer): + # Excerpt: European Convention on Human Rights + text = """ +upoštevajoč, da si ta deklaracija prizadeva zagotoviti splošno in +učinkovito priznavanje in spoštovanje v njej razglašenih pravic, +upoštevajoč, da je cilj Sveta Evrope doseči večjo enotnost med +njegovimi članicami, in da je eden izmed načinov za zagotavljanje +tega cilja varstvo in nadaljnji razvoj človekovih pravic in temeljnih +svoboščin, +ponovno potrjujoč svojo globoko vero v temeljne svoboščine, na +katerih temeljita pravičnost in mir v svetu, in ki jih je mogoče najbolje +zavarovati na eni strani z dejansko politično demokracijo in na drugi +strani s skupnim razumevanjem in spoštovanjem človekovih pravic, +od katerih so te svoboščine odvisne, +""" + tokens = sl_tokenizer(text) + assert len(tokens) == 116 + + +@pytest.mark.xfail +def test_ordinal_number(sl_tokenizer): + text = "10. 
decembra 1948" + tokens = sl_tokenizer(text) + assert len(tokens) == 3 diff --git a/spacy/tests/lang/sl/test_tokenizer.py b/spacy/tests/lang/sl/test_tokenizer.py new file mode 100644 index 000000000..f2b15b0ff --- /dev/null +++ b/spacy/tests/lang/sl/test_tokenizer.py @@ -0,0 +1,32 @@ +import pytest + +SL_BASIC_TOKENIZATION_TESTS = [ + ( + "Vsakdo ima pravico do spoštovanja njegovega zasebnega in " + "družinskega življenja, doma in dopisovanja.", + [ + "Vsakdo", + "ima", + "pravico", + "do", + "spoštovanja", + "njegovega", + "zasebnega", + "in", + "družinskega", + "življenja", + ",", + "doma", + "in", + "dopisovanja", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", SL_BASIC_TOKENIZATION_TESTS) +def test_sl_tokenizer_basic(sl_tokenizer, text, expected_tokens): + tokens = sl_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/sq/__init__.py b/spacy/tests/lang/sq/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/sq/test_text.py b/spacy/tests/lang/sq/test_text.py new file mode 100644 index 000000000..44eedaa54 --- /dev/null +++ b/spacy/tests/lang/sq/test_text.py @@ -0,0 +1,25 @@ +import pytest + + +def test_long_text(sq_tokenizer): + # Excerpt: European Convention on Human Rights + text = """ +Qeveritë nënshkruese, anëtare të Këshillit të Evropës, +Duke pasur parasysh Deklaratën Universale të të Drejtave të +Njeriut, të shpallur nga Asambleja e Përgjithshme e Kombeve të +Bashkuara më 10 dhjetor 1948; +Duke pasur parasysh, se kjo Deklaratë ka për qëllim të sigurojë +njohjen dhe zbatimin universal dhe efektiv të të drejtave të +shpallura në të; +Duke pasur parasysh se qëllimi i Këshillit të Evropës është që të +realizojë një bashkim më të ngushtë midis anëtarëve të tij dhe +se një nga mjetet për të arritur këtë qëllim është mbrojtja dhe +zhvillimi i të drejtave të njeriut dhe i lirive themelore; +Duke ripohuar besimin e tyre të thellë në këto liri themelore që +përbëjnë themelet e drejtësisë dhe të paqes në botë, ruajtja e të +cilave mbështetet kryesisht mbi një regjim politik demokratik nga +njëra anë, dhe nga ana tjetër mbi një kuptim dhe respektim të +përbashkët të të drejtave të njeriut nga të cilat varen; +""" + tokens = sq_tokenizer(text) + assert len(tokens) == 182 diff --git a/spacy/tests/lang/sq/test_tokenizer.py b/spacy/tests/lang/sq/test_tokenizer.py new file mode 100644 index 000000000..8fd25f588 --- /dev/null +++ b/spacy/tests/lang/sq/test_tokenizer.py @@ -0,0 +1,31 @@ +import pytest + +SQ_BASIC_TOKENIZATION_TESTS = [ + ( + "Askush nuk mund t’i nënshtrohet torturës ose dënimeve ose " + "trajtimeve çnjerëzore ose poshtëruese.", + [ + "Askush", + "nuk", + "mund", + "t’i", + "nënshtrohet", + "torturës", + "ose", + "dënimeve", + "ose", + "trajtimeve", + "çnjerëzore", + "ose", + "poshtëruese", + ".", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", SQ_BASIC_TOKENIZATION_TESTS) +def test_sq_tokenizer_basic(sq_tokenizer, text, expected_tokens): + tokens = sq_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/xx/__init__.py b/spacy/tests/lang/xx/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/xx/test_text.py b/spacy/tests/lang/xx/test_text.py new file mode 100644 index 000000000..477f0ebe2 --- /dev/null +++ b/spacy/tests/lang/xx/test_text.py @@ -0,0 +1,24 @@ 
+import pytest + + +def test_long_text(xx_tokenizer): + # Excerpt: Text in Skolt Sami taken from https://www.samediggi.fi + text = """ +Säʹmmla lie Euroopp unioon oʹdinakai alggmeer. Säʹmmlai alggmeerstatus lij raʹvvjum Lääʹddjânnam vuâđđlääʹjjest. +Alggmeer kriteeʹr vuâđđâʹvve meeraikõskksaž tuâjjorganisaatio, ILO, suåppmõʹšše nââmar 169. +Suåppmõõžž mieʹldd jiõččvälddsaž jânnmin jälsteei meeraid ââʹnet alggmeeran, +ko sij puõlvvâʹvve naroodâst, kååʹtt jânnam välddmõõžž leʹbe aazztummuž leʹbe ânnʼjõž riikkraaʹji šõddâm ääiʹj jälste +jânnmest leʹbe tõn mäddtiõđlaž vuuʹdest, koozz jânnam kooll. Alggmeer ij leäkku mieʹrreei sââʹjest jiiʹjjes jälstemvuuʹdest. +Alggmeer âlgg jiõčč ââʹnned jiiʹjjes alggmeeran leʹbe leeʹd tõn miõlâst, što sij lie alggmeer. +Alggmeer lij õlggâm seeilted vuõiggâdvuõđlaž sââʹjest huõlǩâni obbnes leʹbe vueʹzzi jiiʹjjes sosiaalʼlaž, täälʼlaž, +kulttuurlaž da poliittlaž instituutioid. + +Säʹmmlai statuuzz ǩeeʹrjteš Lääʹddjânnam vuâđđläkka eeʹjj 1995. Säʹmmlain alggmeeran lij vuõiggâdvuõtt tuõʹllʼjed da +ooudâsviikkâd ǩiõlâz da kulttuurâz di tõõzz kuulli ääʹrbvuâlaž jieʹllemvueʹjjeez. Sääʹmǩiõl ââʹnnmest veʹrǧǧniiʹǩǩi +åʹrnn lij šiõttuum jiiʹjjes lääʹǩǩ. Säʹmmlain lij leämmaž eeʹjjest 1996 vueʹljeeʹl dommvuuʹdsteez ǩiõlâz da kulttuurâz kuõskki +vuâđđlääʹjj meâldlaž jiõččvaaldâšm. Säʹmmlai jiõččvaldšma kuulli tuâjaid håidd säʹmmlai vaalin vaʹlljääm parlameʹntt, +Sääʹmteʹǧǧ. +""" + + tokens = xx_tokenizer(text) + assert len(tokens) == 179 diff --git a/spacy/tests/lang/xx/test_tokenizer.py b/spacy/tests/lang/xx/test_tokenizer.py new file mode 100644 index 000000000..15c760a6b --- /dev/null +++ b/spacy/tests/lang/xx/test_tokenizer.py @@ -0,0 +1,25 @@ +import pytest + +XX_BASIC_TOKENIZATION_TESTS = [ + ( + "Lääʹddjânnmest lie nuʹtt 10 000 säʹmmliʹžžed. Seeʹst pâʹjjel", + [ + "Lääʹddjânnmest", + "lie", + "nuʹtt", + "10", + "000", + "säʹmmliʹžžed", + ".", + "Seeʹst", + "pâʹjjel", + ], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", XX_BASIC_TOKENIZATION_TESTS) +def test_xx_tokenizer_basic(xx_tokenizer, text, expected_tokens): + tokens = xx_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list From ac05de2c6c708e33ebad6c901e674e1e8bdc0688 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 29 Nov 2021 07:31:02 +0000 Subject: [PATCH 16/20] Fix Language-specific factory handling in package command (#9674) * Use internal names for factories If a component factory is registered like `@French.factory(...)` instead of `@Language.factory(...)`, the name in the factories registry will be prefixed with the language code. However in the nlp.config object the factory will be listed without the language code. The `add_pipe` code has fallback logic to handle this, but packaging code and the registry itself don't. This change makes it so that the factory name in nlp.config is the language-specific form. It's not clear if this will break anything else, but it does seem to fix the inconsistency and resolve the specific user issue that brought this to our attention. * Change approach to use fallback in package lookup This adds fallback logic to the package lookup, so it doesn't have to touch the way the config is built. It seems to fix the tests too. * Remove unecessary line * Add test Thsi also adds an assert that seems to have been forgotten. 
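As a minimal sketch of the naming behaviour and fallback lookup described above (the `my_component` factory and the `find_factory_info` helper are hypothetical, used only for illustration):

```python
from catalogue import RegistryError

from spacy import util
from spacy.lang.fr import French


@French.factory("my_component")
def create_my_component(nlp, name):
    # Registered in the factories registry as "fr.my_component",
    # but listed in nlp.config simply as "my_component".
    return lambda doc: doc


def find_factory_info(lang: str, factory_name: str):
    # Prefer the language-specific name, then fall back to the plain name.
    try:
        return util.registry.find("factories", lang + "." + factory_name)
    except RegistryError:
        return util.registry.find("factories", factory_name)


info = find_factory_info("fr", "my_component")
```

With a fallback of this shape, the same lookup resolves a factory whether it was registered via `@Language.factory(...)` or via a language subclass such as `@French.factory(...)`.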
--- spacy/cli/package.py | 12 +++++++++++- spacy/tests/test_cli.py | 11 ++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 76e14daf5..f9d2a9af2 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -4,6 +4,7 @@ from pathlib import Path from wasabi import Printer, MarkdownRenderer, get_raw_input from thinc.api import Config from collections import defaultdict +from catalogue import RegistryError import srsly import sys @@ -212,9 +213,18 @@ def get_third_party_dependencies( if "factory" in component: funcs["factories"].add(component["factory"]) modules = set() + lang = config["nlp"]["lang"] for reg_name, func_names in funcs.items(): for func_name in func_names: - func_info = util.registry.find(reg_name, func_name) + # Try the lang-specific version and fall back + try: + func_info = util.registry.find(reg_name, lang + "." + func_name) + except RegistryError: + try: + func_info = util.registry.find(reg_name, func_name) + except RegistryError as regerr: + # lang-specific version being absent is not actually an issue + raise regerr from None module_name = func_info.get("module") # type: ignore[attr-defined] if module_name: # the code is part of a module, not a --code file modules.add(func_info["module"].split(".")[0]) # type: ignore[index] diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 3243d426b..c6b00b140 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -565,7 +565,16 @@ def test_get_third_party_dependencies(): } }, ) - get_third_party_dependencies(nlp.config) == [] + assert get_third_party_dependencies(nlp.config) == [] + + # Test with lang-specific factory + @Dutch.factory("third_party_test") + def test_factory(nlp, name): + return lambda x: x + + nlp.add_pipe("third_party_test") + # Before #9674 this would throw an exception + get_third_party_dependencies(nlp.config) @pytest.mark.parametrize( From 6763cbfdc03ed801576c99a5d35623cf55925e22 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 29 Nov 2021 14:14:21 +0100 Subject: [PATCH 17/20] Update Catalan acknowledgements for v3.2 (#9763) --- website/docs/usage/v3-2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/v3-2.md b/website/docs/usage/v3-2.md index 766d1c0a9..d1d45c7ba 100644 --- a/website/docs/usage/v3-2.md +++ b/website/docs/usage/v3-2.md @@ -159,7 +159,7 @@ their contributions! - All Universal Dependencies training data has been updated to v2.8. - The Catalan data, tokenizer and lemmatizer have been updated, thanks to Carlos - Rodriguez and the Barcelona Supercomputing Center! + Rodriguez, Carme Armentano and the Barcelona Supercomputing Center! - The transformer pipelines are trained using spacy-transformers v1.1, with improved IO and more options for [model config and output](/api/architectures#TransformerModel). From 1be8a4dab305466cc731f1bd9124ae13df274d54 Mon Sep 17 00:00:00 2001 From: Narayan Acharya Date: Mon, 29 Nov 2021 11:13:26 -0500 Subject: [PATCH 18/20] Displacy serve entity linking support without `manual=True` support. (#9748) * Add support for kb_id to be displayed via displacy.serve. The current support is only limited to the manual option in displacy.render * Commit to check pre-commit hooks are run. * Update spacy/displacy/__init__.py Co-authored-by: Sofie Van Landeghem * Changes as per suggestions on the PR. 
* Update website/docs/api/top-level.md Co-authored-by: Sofie Van Landeghem * Update website/docs/api/top-level.md Co-authored-by: Sofie Van Landeghem * tag option as new from 3.2.1 onwards Co-authored-by: Sofie Van Landeghem Co-authored-by: svlandeg --- spacy/displacy/__init__.py | 12 ++++++++++-- spacy/tests/test_displacy.py | 36 +++++++++++++++++++++++++++++++++-- website/docs/api/top-level.md | 26 ++++++++++++++++--------- 3 files changed, 61 insertions(+), 13 deletions(-) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index d9418f675..25d530c83 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -181,11 +181,19 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: def parse_ents(doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]: """Generate named entities in [{start: i, end: i, label: 'label'}] format. - doc (Doc): Document do parse. + doc (Doc): Document to parse. + options (Dict[str, Any]): NER-specific visualisation options. RETURNS (dict): Generated entities keyed by text (original text) and ents. """ + kb_url_template = options.get("kb_url_template", None) ents = [ - {"start": ent.start_char, "end": ent.end_char, "label": ent.label_} + { + "start": ent.start_char, + "end": ent.end_char, + "label": ent.label_, + "kb_id": ent.kb_id_ if ent.kb_id_ else "", + "kb_url": kb_url_template.format(ent.kb_id_) if kb_url_template else "#", + } for ent in doc.ents ] if not ents: diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index 040dd657f..790925888 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -1,8 +1,9 @@ import pytest + from spacy import displacy from spacy.displacy.render import DependencyRenderer, EntityRenderer -from spacy.tokens import Span, Doc from spacy.lang.fa import Persian +from spacy.tokens import Span, Doc def test_displacy_parse_ents(en_vocab): @@ -12,7 +13,38 @@ def test_displacy_parse_ents(en_vocab): ents = displacy.parse_ents(doc) assert isinstance(ents, dict) assert ents["text"] == "But Google is starting from behind " - assert ents["ents"] == [{"start": 4, "end": 10, "label": "ORG"}] + assert ents["ents"] == [ + {"start": 4, "end": 10, "label": "ORG", "kb_id": "", "kb_url": "#"} + ] + + doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"], kb_id="Q95")] + ents = displacy.parse_ents(doc) + assert isinstance(ents, dict) + assert ents["text"] == "But Google is starting from behind " + assert ents["ents"] == [ + {"start": 4, "end": 10, "label": "ORG", "kb_id": "Q95", "kb_url": "#"} + ] + + +def test_displacy_parse_ents_with_kb_id_options(en_vocab): + """Test that named entities with kb_id on a Doc are converted into displaCy's format.""" + doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) + doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"], kb_id="Q95")] + + ents = displacy.parse_ents( + doc, {"kb_url_template": "https://www.wikidata.org/wiki/{}"} + ) + assert isinstance(ents, dict) + assert ents["text"] == "But Google is starting from behind " + assert ents["ents"] == [ + { + "start": 4, + "end": 10, + "label": "ORG", + "kb_id": "Q95", + "kb_url": "https://www.wikidata.org/wiki/Q95", + } + ] def test_displacy_parse_deps(en_vocab): diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 4361db4c0..be19f9c3a 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -313,11 +313,12 @@ If a setting is not present in the options, the default 
value will be used. > displacy.serve(doc, style="ent", options=options) > ``` -| Name | Description | -| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `ents` | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~ | -| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ | -| `template` 2.2 | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](%%GITHUB_SPACY/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ | +| Name | Description | +| ------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `ents` | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~ | +| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ | +| `template` 2.2 | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](%%GITHUB_SPACY/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ | +| `kb_url_template` 3.2.1 | Optional template to construct the KB url for the entity to link to. Expects a python f-string format with single field to fill in. ~~Optional[str]~~ | By default, displaCy comes with colors for all entity types used by [spaCy's trained pipelines](/models). If you're using custom entity types, you @@ -326,6 +327,14 @@ or pipeline package can also expose a [`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy) to add custom labels and their colors automatically. +By default, displaCy links to `#` for entities without a `kb_id` set on their +span. If you wish to link an entity to their URL then consider using the +`kb_url_template` option from above. For example if the `kb_id` on a span is +`Q95` and this is a Wikidata identifier then this option can be set to +`https://www.wikidata.org/wiki/{}`. Clicking on your entity in the rendered HTML +should redirect you to their Wikidata page, in this case +`https://www.wikidata.org/wiki/Q95`. + ## registry {#registry source="spacy/util.py" new="3"} spaCy's function registry extends @@ -412,10 +421,10 @@ finished. To log each training step, a and the accuracy scores on the development set. The built-in, default logger is the ConsoleLogger, which prints results to the -console in tabular format. The +console in tabular format. The [spacy-loggers](https://github.com/explosion/spacy-loggers) package, included as -a dependency of spaCy, enables other loggers: currently it provides one that sends -results to a [Weights & Biases](https://www.wandb.com/) dashboard. +a dependency of spaCy, enables other loggers: currently it provides one that +sends results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of using one of the built-in loggers, you can [implement your own](/usage/training#custom-logging). @@ -466,7 +475,6 @@ start decreasing across epochs. 
- ## Readers {#readers} ### File readers {#file-readers source="github.com/explosion/srsly" new="3"} From c19f0c1604f7141a050292bf79d6eae3997b18c5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 30 Nov 2021 10:08:51 +0100 Subject: [PATCH 19/20] Switch to latest CI images (#9773) --- azure-pipelines.yml | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4291b6e0a..71a793911 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -23,7 +23,7 @@ jobs: # defined in .flake8 and overwrites the selected codes. - job: "Validate" pool: - vmImage: "ubuntu-18.04" + vmImage: "ubuntu-latest" steps: - task: UsePythonVersion@0 inputs: @@ -39,49 +39,49 @@ jobs: matrix: # We're only running one platform per Python version to speed up builds Python36Linux: - imageName: "ubuntu-18.04" + imageName: "ubuntu-latest" python.version: "3.6" # Python36Windows: - # imageName: "windows-2019" + # imageName: "windows-latest" # python.version: "3.6" # Python36Mac: - # imageName: "macos-10.14" + # imageName: "macos-latest" # python.version: "3.6" # Python37Linux: - # imageName: "ubuntu-18.04" + # imageName: "ubuntu-latest" # python.version: "3.7" Python37Windows: - imageName: "windows-2019" + imageName: "windows-latest" python.version: "3.7" # Python37Mac: - # imageName: "macos-10.14" + # imageName: "macos-latest" # python.version: "3.7" # Python38Linux: - # imageName: "ubuntu-18.04" + # imageName: "ubuntu-latest" # python.version: "3.8" # Python38Windows: - # imageName: "windows-2019" + # imageName: "windows-latest" # python.version: "3.8" Python38Mac: - imageName: "macos-10.14" + imageName: "macos-latest" python.version: "3.8" Python39Linux: - imageName: "ubuntu-18.04" + imageName: "ubuntu-latest" python.version: "3.9" # Python39Windows: - # imageName: "windows-2019" + # imageName: "windows-latest" # python.version: "3.9" # Python39Mac: - # imageName: "macos-10.14" + # imageName: "macos-latest" # python.version: "3.9" Python310Linux: - imageName: "ubuntu-20.04" + imageName: "ubuntu-latest" python.version: "3.10" Python310Windows: - imageName: "windows-2019" + imageName: "windows-latest" python.version: "3.10" Python310Mac: - imageName: "macos-10.15" + imageName: "macos-latest" python.version: "3.10" maxParallel: 4 pool: From 72f7f4e68a5076a87dd9402812bfb72e479237ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 30 Nov 2021 11:58:59 +0100 Subject: [PATCH 20/20] morphologizer: avoid recreating label tuple for each token (#9764) * morphologizer: avoid recreating label tuple for each token The `labels` property converts the dictionary key set to a tuple. This property was used for every annotated token, recreating the tuple over and over again. Construct the tuple once in the set_annotations function and reuse it. On a Finnish pipeline that I was experimenting with, this results in a speedup of ~15% (~13000 -> ~15000 WPS). 
* tagger: avoid recreating label tuple for each token --- spacy/pipeline/morphologizer.pyx | 3 ++- spacy/pipeline/tagger.pyx | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index db425b69a..73d3799b1 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -231,12 +231,13 @@ class Morphologizer(Tagger): cdef Vocab vocab = self.vocab cdef bint overwrite = self.cfg["overwrite"] cdef bint extend = self.cfg["extend"] + labels = self.labels for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): - morph = self.labels[tag_id] + morph = labels[tag_id] # set morph if doc.c[j].morph == 0 or overwrite or extend: if overwrite and extend: diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index a9cbac37a..c0768dfec 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -166,13 +166,14 @@ class Tagger(TrainablePipe): cdef Doc doc cdef Vocab vocab = self.vocab cdef bint overwrite = self.cfg["overwrite"] + labels = self.labels for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): if doc.c[j].tag == 0 or overwrite: - doc.c[j].tag = self.vocab.strings[self.labels[tag_id]] + doc.c[j].tag = self.vocab.strings[labels[tag_id]] def update(self, examples, *, drop=0., sgd=None, losses=None): """Learn from a batch of documents and gold-standard information,