From 95e3cf576bef4bf44a9736d3564fe87a1c742cc7 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 26 Apr 2021 16:53:21 +0200 Subject: [PATCH 001/176] Optionally append lang for packaged model name (#7417) * Add empty lines at the end of Python files * Only prepend the lang code if it's not there already * Update spacy/cli/package.py * fix whitespace stripping --- spacy/cli/package.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index eaffde1d7..5b8daf048 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -112,7 +112,9 @@ def package( msg.fail("Invalid pipeline meta.json") print("\n".join(errors)) sys.exit(1) - model_name = meta["lang"] + "_" + meta["name"] + model_name = meta["name"] + if not model_name.startswith(meta['lang'] + "_"): + model_name = f"{meta['lang']}_{model_name}" model_name_v = model_name + "-" + meta["version"] main_path = output_dir / model_name_v package_path = main_path / model_name @@ -294,7 +296,7 @@ def setup_package(): if __name__ == '__main__': setup_package() -""".strip() +""".lstrip() TEMPLATE_MANIFEST = """ @@ -314,4 +316,4 @@ __version__ = get_model_meta(Path(__file__).parent)['version'] def load(**overrides): return load_model_from_init_py(__file__, **overrides) -""".strip() +""".lstrip() From e0b29f8ef7e4693355e481795af04413ccdf0d55 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 26 Apr 2021 16:53:38 +0200 Subject: [PATCH 002/176] Fix scoring normalization (#7629) * fix scoring normalization * score weights by total sum instead of per component * cleanup * more cleanup --- spacy/tests/pipeline/test_pipe_factories.py | 31 +++++++++++++-------- spacy/util.py | 30 ++++---------------- 2 files changed, 25 insertions(+), 36 deletions(-) diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index a7071abfd..c5cc62661 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -334,24 +334,31 @@ def test_language_factories_invalid(): @pytest.mark.parametrize( - "weights,expected", + "weights,override,expected", [ - ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {"a": 0.33, "b": 0.33, "c": 0.33}), - ([{"a": 1.0}, {"b": 50}, {"c": 123}], {"a": 0.33, "b": 0.33, "c": 0.33}), + ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {}, {"a": 0.33, "b": 0.33, "c": 0.33}), + ([{"a": 1.0}, {"b": 50}, {"c": 100}], {}, {"a": 0.01, "b": 0.33, "c": 0.66}), ( [{"a": 0.7, "b": 0.3}, {"c": 1.0}, {"d": 0.5, "e": 0.5}], + {}, {"a": 0.23, "b": 0.1, "c": 0.33, "d": 0.17, "e": 0.17}, ), ( - [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}], - {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25}, + [{"a": 100, "b": 300}, {"c": 50, "d": 50}], + {}, + {"a": 0.2, "b": 0.6, "c": 0.1, "d": 0.1}, ), - ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75}), - ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"a": 0.0, "b": 0.0, "c": 0.0}), + ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {}, {"a": 0.33, "b": 0.67}), + ([{"a": 0.5, "b": 0.0}], {}, {"a": 1.0, "b": 0.0}), + ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.0}, {"a": 0.0, "b": 1.0}), + ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {}, {"a": 0.0, "b": 0.0, "c": 0.0}), + ([{"a": 0.0, "b": 0.0}, {"c": 1.0}], {}, {"a": 0.0, "b": 0.0, "c": 1.0}), + ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"c": 0.2}, {"a": 0.0, "b": 0.0, "c": 1.0}), + ([{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], {"a": 0.0, "b": 0.0}, {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5}), ], ) -def 
test_language_factories_combine_score_weights(weights, expected): - result = combine_score_weights(weights) +def test_language_factories_combine_score_weights(weights, override, expected): + result = combine_score_weights(weights, override) assert sum(result.values()) in (0.99, 1.0, 0.0) assert result == expected @@ -377,17 +384,17 @@ def test_language_factories_scores(): # Test with custom defaults config = nlp.config.copy() config["training"]["score_weights"]["a1"] = 0.0 - config["training"]["score_weights"]["b3"] = 1.0 + config["training"]["score_weights"]["b3"] = 1.3 nlp = English.from_config(config) score_weights = nlp.config["training"]["score_weights"] - expected = {"a1": 0.0, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.34} + expected = {"a1": 0.0, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.65} assert score_weights == expected # Test with null values config = nlp.config.copy() config["training"]["score_weights"]["a1"] = None nlp = English.from_config(config) score_weights = nlp.config["training"]["score_weights"] - expected = {"a1": None, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.35} + expected = {"a1": None, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.66} assert score_weights == expected diff --git a/spacy/util.py b/spacy/util.py index 512c6b742..0166bd190 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1369,32 +1369,14 @@ def combine_score_weights( should be preserved. RETURNS (Dict[str, float]): The combined and normalized weights. """ + # We divide each weight by the total weight sum. # We first need to extract all None/null values for score weights that # shouldn't be shown in the table *or* be weighted - result = {} - all_weights = [] - for w_dict in weights: - filtered_weights = {} - for key, value in w_dict.items(): - value = overrides.get(key, value) - if value is None: - result[key] = None - else: - filtered_weights[key] = value - all_weights.append(filtered_weights) - for w_dict in all_weights: - # We need to account for weights that don't sum to 1.0 and normalize - # the score weights accordingly, then divide score by the number of - # components. 
- total = sum(w_dict.values()) - for key, value in w_dict.items(): - if total == 0: - weight = 0.0 - else: - weight = round(value / total / len(all_weights), 2) - prev_weight = result.get(key, 0.0) - prev_weight = 0.0 if prev_weight is None else prev_weight - result[key] = prev_weight + weight + result = {key: overrides.get(key, value) for w_dict in weights for (key, value) in w_dict.items()} + weight_sum = sum([v if v else 0.0 for v in result.values()]) + for key, value in result.items(): + if value and weight_sum > 0: + result[key] = round(value / weight_sum, 2) return result From c105ed10fd5d9eb924f767911dfc6400e0386505 Mon Sep 17 00:00:00 2001 From: Jacopo Farina Date: Mon, 26 Apr 2021 16:53:43 +0200 Subject: [PATCH 003/176] Remove torino from stop words (#7634) Torino is the proper name of a city and the token has no other meaning --- spacy/lang/it/stop_words.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/it/stop_words.py b/spacy/lang/it/stop_words.py index e97613912..4178ed452 100644 --- a/spacy/lang/it/stop_words.py +++ b/spacy/lang/it/stop_words.py @@ -72,7 +72,7 @@ steste stesti stette stettero stetti stia stiamo stiano stiate sto su sua subito successivamente successivo sue sugl sugli sui sul sull sulla sulle sullo suo suoi -tale tali talvolta tanto te tempo ti titolo torino tra tranne tre trenta +tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto uguali ulteriore ultimo un una uno uomo From 95c08336567788827deabfa3fcc500c03e382a20 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 26 Apr 2021 16:53:53 +0200 Subject: [PATCH 004/176] Add training option to set annotations on update (#7767) * Add training option to set annotations on update Add a `[training]` option called `set_annotations_on_update` to specify a list of components for which the predicted annotations should be set on `example.predicted` immediately after that component has been updated. The predicted annotations can be accessed by later components in the pipeline during the processing of the batch in the same `update` call. 
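
A minimal sketch of the intended usage, adapted from the tests added in
this patch (the texts and the pipeline here are placeholders, not part of
the change itself):

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
# ... add a later trainable component whose update() relies on doc.sents ...

texts = ["This is a sentence. This is another one."]
examples = [Example(nlp.make_doc(t), nlp(t)) for t in texts]

# The sentencizer writes its predictions to example.predicted before any
# later component is updated; without `annotates`, later components would
# see unsegmented docs during this update step.
nlp.update(examples, annotates=["sentencizer"])
```

When training from a config, the same behavior is selected with
`annotating_components = ["sentencizer"]` in the `[training]` block.
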
* Rename to annotates / annotating_components * Add test for `annotating_components` when training from config * Add documentation --- spacy/default_config.cfg | 2 + spacy/language.py | 29 ++++- spacy/schemas.py | 1 + .../pipeline/test_annotates_on_update.py | 113 ++++++++++++++++++ spacy/tests/pipeline/test_pipe_methods.py | 40 +++++++ spacy/training/loop.py | 16 ++- website/docs/api/data-formats.md | 37 +++--- website/docs/api/transformer.md | 21 ++-- website/docs/usage/training.md | 68 ++++++++++- 9 files changed, 289 insertions(+), 38 deletions(-) create mode 100644 spacy/tests/pipeline/test_annotates_on_update.py diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 7f092d5f5..ceb7357fc 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -80,6 +80,8 @@ eval_frequency = 200 score_weights = {} # Names of pipeline components that shouldn't be updated during training frozen_components = [] +# Names of pipeline components that should set annotations during training +annotating_components = [] # Location in the config where the dev corpus is defined dev_corpus = "corpora.dev" # Location in the config where the train corpus is defined diff --git a/spacy/language.py b/spacy/language.py index 6f6470533..1a447c11b 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1074,6 +1074,7 @@ class Language: losses: Optional[Dict[str, float]] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, exclude: Iterable[str] = SimpleFrozenList(), + annotates: Iterable[str] = SimpleFrozenList(), ): """Update the models in the pipeline. @@ -1081,10 +1082,13 @@ class Language: _: Should not be set - serves to catch backwards-incompatible scripts. drop (float): The dropout rate. sgd (Optimizer): An optimizer. - losses (Dict[str, float]): Dictionary to update with the loss, keyed by component. + losses (Dict[str, float]): Dictionary to update with the loss, keyed by + component. component_cfg (Dict[str, Dict]): Config parameters for specific pipeline components, keyed by component name. exclude (Iterable[str]): Names of components that shouldn't be updated. + annotates (Iterable[str]): Names of components that should set + annotations on the predicted examples after updating. 
RETURNS (Dict[str, float]): The updated losses dictionary DOCS: https://spacy.io/api/language#update @@ -1103,15 +1107,16 @@ class Language: sgd = self._optimizer if component_cfg is None: component_cfg = {} + pipe_kwargs = {} for i, (name, proc) in enumerate(self.pipeline): component_cfg.setdefault(name, {}) + pipe_kwargs[name] = deepcopy(component_cfg[name]) component_cfg[name].setdefault("drop", drop) + pipe_kwargs[name].setdefault("batch_size", self.batch_size) for name, proc in self.pipeline: - if name in exclude or not hasattr(proc, "update"): - continue - proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) - if sgd not in (None, False): - for name, proc in self.pipeline: + if name not in exclude and hasattr(proc, "update"): + proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) + if sgd not in (None, False): if ( name not in exclude and hasattr(proc, "is_trainable") @@ -1119,6 +1124,18 @@ class Language: and proc.model not in (True, False, None) ): proc.finish_update(sgd) + if name in annotates: + for doc, eg in zip( + _pipe( + (eg.predicted for eg in examples), + proc=proc, + name=name, + default_error_handler=self.default_error_handler, + kwargs=pipe_kwargs[name], + ), + examples, + ): + eg.predicted = doc return losses def rehearse( diff --git a/spacy/schemas.py b/spacy/schemas.py index 2f25c785f..92315399d 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -313,6 +313,7 @@ class ConfigSchemaTraining(BaseModel): optimizer: Optimizer = Field(..., title="The optimizer to use") logger: Logger = Field(..., title="The logger to track training progress") frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training") + annotating_components: List[str] = Field(..., title="Pipeline components that should set annotations during training") before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk") # fmt: on diff --git a/spacy/tests/pipeline/test_annotates_on_update.py b/spacy/tests/pipeline/test_annotates_on_update.py new file mode 100644 index 000000000..b17855d85 --- /dev/null +++ b/spacy/tests/pipeline/test_annotates_on_update.py @@ -0,0 +1,113 @@ +from typing import Callable, Iterable, Iterator +import pytest +import io + +from thinc.api import Config +from spacy.language import Language +from spacy.training import Example +from spacy.training.loop import train +from spacy.lang.en import English +from spacy.util import registry, load_model_from_config + + +@pytest.fixture +def config_str(): + return """ + [nlp] + lang = "en" + pipeline = ["sentencizer","assert_sents"] + disabled = [] + before_creation = null + after_creation = null + after_pipeline_creation = null + batch_size = 1000 + tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + + [components] + + [components.assert_sents] + factory = "assert_sents" + + [components.sentencizer] + factory = "sentencizer" + punct_chars = null + + [training] + dev_corpus = "corpora.dev" + train_corpus = "corpora.train" + annotating_components = ["sentencizer"] + max_steps = 2 + + [corpora] + + [corpora.dev] + @readers = "unannotated_corpus" + + [corpora.train] + @readers = "unannotated_corpus" + """ + + +def test_annotates_on_update(): + # The custom component checks for sentence annotation + @Language.factory("assert_sents", default_config={}) + def assert_sents(nlp, name): + return AssertSents(name) + + class AssertSents: + def __init__(self, name, **cfg): 
+ self.name = name + pass + + def __call__(self, doc): + if not doc.has_annotation("SENT_START"): + raise ValueError("No sents") + return doc + + def update(self, examples, *, drop=0.0, sgd=None, losses=None): + for example in examples: + if not example.predicted.has_annotation("SENT_START"): + raise ValueError("No sents") + return {} + + nlp = English() + nlp.add_pipe("sentencizer") + nlp.add_pipe("assert_sents") + + # When the pipeline runs, annotations are set + doc = nlp("This is a sentence.") + + examples = [] + for text in ["a a", "b b", "c c"]: + examples.append(Example(nlp.make_doc(text), nlp(text))) + + for example in examples: + assert not example.predicted.has_annotation("SENT_START") + + # If updating without setting annotations, assert_sents will raise an error + with pytest.raises(ValueError): + nlp.update(examples) + + # Updating while setting annotations for the sentencizer succeeds + nlp.update(examples, annotates=["sentencizer"]) + + +def test_annotating_components_from_config(config_str): + @registry.readers("unannotated_corpus") + def create_unannotated_corpus() -> Callable[[Language], Iterable[Example]]: + return UnannotatedCorpus() + + class UnannotatedCorpus: + def __call__(self, nlp: Language) -> Iterator[Example]: + for text in ["a a", "b b", "c c"]: + doc = nlp.make_doc(text) + yield Example(doc, doc) + + orig_config = Config().from_str(config_str) + nlp = load_model_from_config(orig_config, auto_fill=True, validate=True) + assert nlp.config["training"]["annotating_components"] == ["sentencizer"] + train(nlp) + + nlp.config["training"]["annotating_components"] = [] + with pytest.raises(ValueError): + train(nlp) diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 9af8395a6..0b84db4c0 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -1,7 +1,9 @@ import pytest from spacy.language import Language from spacy.pipeline import TrainablePipe +from spacy.training import Example from spacy.util import SimpleFrozenList, get_arg_names +from spacy.lang.en import English @pytest.fixture @@ -417,3 +419,41 @@ def test_pipe_methods_initialize(): assert "test" in nlp.config["initialize"]["components"] nlp.remove_pipe("test") assert "test" not in nlp.config["initialize"]["components"] + + +def test_update_with_annotates(): + name = "test_with_annotates" + results = {} + + def make_component(name): + results[name] = "" + + def component(doc): + nonlocal results + results[name] += doc.text + return doc + + return component + + c1 = Language.component(f"{name}1", func=make_component(f"{name}1")) + c2 = Language.component(f"{name}2", func=make_component(f"{name}2")) + + components = set([f"{name}1", f"{name}2"]) + + nlp = English() + texts = ["a", "bb", "ccc"] + examples = [] + for text in texts: + examples.append(Example(nlp.make_doc(text), nlp.make_doc(text))) + + for components_to_annotate in [[], [f"{name}1"], [f"{name}1", f"{name}2"], [f"{name}2", f"{name}1"]]: + for key in results: + results[key] = "" + nlp = English(vocab=nlp.vocab) + nlp.add_pipe(f"{name}1") + nlp.add_pipe(f"{name}2") + nlp.update(examples, annotates=components_to_annotate) + for component in components_to_annotate: + assert results[component] == "".join(eg.predicted.text for eg in examples) + for component in components - set(components_to_annotate): + assert results[component] == "" diff --git a/spacy/training/loop.py b/spacy/training/loop.py index ecfa12fdb..85aa458f0 100644 --- 
a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -74,6 +74,8 @@ def train( # Components that shouldn't be updated during training frozen_components = T["frozen_components"] + # Components that should set annotations on update + annotating_components = T["annotating_components"] # Create iterator, which yields out info after each optimization step. training_step_iterator = train_while_improving( nlp, @@ -86,11 +88,17 @@ def train( max_steps=T["max_steps"], eval_frequency=T["eval_frequency"], exclude=frozen_components, + annotating_components=annotating_components, ) clean_output_dir(output_path) stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n") if frozen_components: stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n") + if annotating_components: + stdout.write( + msg.info(f"Set annotations on update for: {annotating_components}") + + "\n" + ) stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n") with nlp.select_pipes(disable=frozen_components): log_step, finalize_logger = train_logger(nlp, stdout, stderr) @@ -142,6 +150,7 @@ def train_while_improving( patience: int, max_steps: int, exclude: List[str], + annotating_components: List[str], ): """Train until an evaluation stops improving. Works as a generator, with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`, @@ -193,7 +202,12 @@ def train_while_improving( dropout = next(dropouts) for subbatch in subdivide_batch(batch, accumulate_gradient): nlp.update( - subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude + subbatch, + drop=dropout, + losses=losses, + sgd=False, + exclude=exclude, + annotates=annotating_components, ) # TODO: refactor this so we don't have to run it separately in here for name, proc in nlp.pipeline: diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 0c2a4c9f3..576ab8394 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -182,24 +182,25 @@ single corpus once and then divide it up into `train` and `dev` partitions. This section defines settings and controls for the training and evaluation process that are used when you run [`spacy train`](/api/cli#train). -| Name | Description | -| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | -| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | -| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | -| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | -| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. 
~~int~~ | -| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | -| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | -| `logger` | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ | -| `max_epochs` | Maximum number of epochs to train for. `0` means an unlimited number of epochs. `-1` means that the train corpus should be streamed rather than loaded into memory with no shuffling within the training loop. Defaults to `0`. ~~int~~ | -| `max_steps` | Maximum number of update steps to train for. `0` means an unlimited number of steps. Defaults to `20000`. ~~int~~ | -| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `patience` | How many steps to continue without improvement in evaluation score. `0` disables early stopping. Defaults to `1600`. ~~int~~ | -| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | -| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | -| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | +| Name | Description | +| ----------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | +| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | +| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | +| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | +| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | +| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `annotating_components` | Pipeline component names that should set annotations on the predicted docs during training. See [here](/usage/training#annotating-components) for details. Defaults to `[]`. 
~~List[str]~~ | +| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | +| `logger` | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ | +| `max_epochs` | Maximum number of epochs to train for. `0` means an unlimited number of epochs. `-1` means that the train corpus should be streamed rather than loaded into memory with no shuffling within the training loop. Defaults to `0`. ~~int~~ | +| `max_steps` | Maximum number of update steps to train for. `0` means an unlimited number of steps. Defaults to `20000`. ~~int~~ | +| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `patience` | How many steps to continue without improvement in evaluation score. `0` disables early stopping. Defaults to `1600`. ~~int~~ | +| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | +| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | +| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | ### pretraining {#config-pretraining tag="section,optional"} diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index 5aaa1d23e..4698529a1 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -245,14 +245,14 @@ and call the optimizer, while the others simply increment the gradients. > losses = trf.update(examples, sgd=optimizer) > ``` -| Name | Description | -| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. 
Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## Transformer.create_optimizer {#create_optimizer tag="method"} @@ -493,6 +493,11 @@ This requires sentence boundaries to be set (e.g. by the depending on the sentence lengths. However, it does provide the transformer with more meaningful windows to attend over. +To set sentence boundaries with the `sentencizer` during training, add a +`sentencizer` to the beginning of the pipeline and include it in +[`[training.annotating_components]`](/usage/training#annotating-components) to +have it set the sentence boundaries before the `transformer` component runs. + ### strided_spans.v1 {#strided_spans tag="registered function"} > #### Example config diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 9f929fe19..1b345050c 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -414,11 +414,11 @@ as-is. They are also excluded when calling > #### Note on frozen components > > Even though frozen components are not **updated** during training, they will -> still **run** during training and evaluation. This is very important, because -> they may still impact your model's performance – for instance, a sentence -> boundary detector can impact what the parser or entity recognizer considers a -> valid parse. So the evaluation results should always reflect what your -> pipeline will produce at runtime. +> still **run** during evaluation. This is very important, because they may +> still impact your model's performance – for instance, a sentence boundary +> detector can impact what the parser or entity recognizer considers a valid +> parse. So the evaluation results should always reflect what your pipeline will +> produce at runtime. ```ini [nlp] @@ -455,6 +455,64 @@ replace_listeners = ["model.tok2vec"] +### Using predictions from preceding components {#annotating-components new="3.1"} + +By default, components are updated in isolation during training, which means +that they don't see the predictions of any earlier components in the pipeline. A +component receives [`Example.predicted`](/api/example) as input and compares its +predictions to [`Example.reference`](/api/example) without saving its +annotations in the `predicted` doc. + +Instead, if certain components should **set their annotations** during training, +use the setting `annotating_components` in the `[training]` block to specify a +list of components. For example, the feature `DEP` from the parser could be used +as a tagger feature by including `DEP` in the tok2vec `attrs` and including +`parser` in `annotating_components`: + +```ini +### config.cfg (excerpt) {highlight="7,12"} +[nlp] +pipeline = ["parser", "tagger"] + +[components.tagger.model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tagger.model.tok2vec.encode.width} +attrs = ["NORM","DEP"] +rows = [5000,2500] +include_static_vectors = false + +[training] +annotating_components = ["parser"] +``` + +Any component in the pipeline can be included as an annotating component, +including frozen components. Frozen components can set annotations during +training just as they would set annotations during evaluation or when the final +pipeline is run. 
The config excerpt below shows how a frozen `ner` component and +a `sentencizer` can provide the required `doc.sents` and `doc.ents` for the +entity linker during training: + +```ini +### config.cfg (excerpt) +[nlp] +pipeline = ["sentencizer", "ner", "entity_linker"] + +[components.ner] +source = "en_core_web_sm" + +[training] +frozen_components = ["ner"] +annotating_components = ["sentencizer", "ner"] +``` + + + +Be aware that non-frozen annotating components with statistical models will +**run twice** on each batch, once to update the model and once to apply the +now-updated model to the predicted docs. + + + ### Using registered functions {#config-functions} The training configuration defined in the config file doesn't have to only From ceee1ecf1735830abe5bfe0e22ac5ecd83e4eebc Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 26 Apr 2021 16:54:02 +0200 Subject: [PATCH 005/176] Replace cpdef variables with cdef (#7834) --- spacy/kb.pxd | 2 +- spacy/tokenizer.pxd | 2 +- spacy/vocab.pxd | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 4a71b26a2..a823dbe1e 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -28,7 +28,7 @@ cdef class Candidate: cdef class KnowledgeBase: cdef Pool mem - cpdef readonly Vocab vocab + cdef readonly Vocab vocab cdef int64_t entity_vector_length # This maps 64bit keys (hash of unique entity string) diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 2a44d7729..719e8e6f5 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -14,7 +14,7 @@ cdef class Tokenizer: cdef Pool mem cdef PreshMap _cache cdef PreshMap _specials - cpdef readonly Vocab vocab + cdef readonly Vocab vocab cdef object _token_match cdef object _url_match diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index b5bcf7658..9067476f7 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -25,12 +25,12 @@ cdef struct _Cached: cdef class Vocab: cdef Pool mem - cpdef readonly StringStore strings - cpdef public Morphology morphology - cpdef public object vectors - cpdef public object _lookups - cpdef public object writing_system - cpdef public object get_noun_chunks + cdef readonly StringStore strings + cdef public Morphology morphology + cdef public object vectors + cdef public object _lookups + cdef public object writing_system + cdef public object get_noun_chunks cdef readonly int length cdef public object data_dir cdef public object lex_attr_getters From ae855a46259c6a76a0ab1dc317bb46c111fd1809 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 26 Apr 2021 16:54:23 +0200 Subject: [PATCH 006/176] Clean up Morphology imports and definitions (#7441) * Clean up Morphology imports and definitions * Whitespace formatting --- spacy/morphology.pxd | 19 ++++--------------- spacy/morphology.pyx | 13 ++----------- 2 files changed, 6 insertions(+), 26 deletions(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 4fe8f7428..8d449d065 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -1,14 +1,11 @@ from cymem.cymem cimport Pool -from preshed.maps cimport PreshMap, PreshMapArray -from libc.stdint cimport uint64_t -from murmurhash cimport mrmr +from preshed.maps cimport PreshMap cimport numpy as np +from libc.stdint cimport uint64_t -from .structs cimport TokenC, MorphAnalysisC +from .structs cimport MorphAnalysisC from .strings cimport StringStore -from .typedefs cimport hash_t, attr_t, flags_t -from .parts_of_speech cimport univ_pos_t -from . 
cimport symbols
+from .typedefs cimport attr_t, hash_t


 cdef class Morphology:
@@ -16,14 +13,6 @@ cdef class Morphology:
     cdef readonly StringStore strings
     cdef PreshMap tags # Keyed by hash, value is pointer to tag

-    cdef public object lemmatizer
-    cdef readonly object tag_map
-    cdef readonly object tag_names
-    cdef readonly object reverse_index
-    cdef readonly object _exc
-    cdef readonly PreshMapArray _cache
-    cdef readonly int n_tags
-
     cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
     cdef int insert(self, MorphAnalysisC tag) except -1

diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index e8469223a..c3ffc46a1 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -1,20 +1,11 @@
 # cython: infer_types
-from libc.string cimport memset
-
-import srsly
-from collections import Counter
 import numpy
 import warnings

-from .attrs cimport POS, IS_SPACE
-from .parts_of_speech cimport SPACE
-from .lexeme cimport Lexeme
+from .attrs cimport POS

-from .strings import get_string_id
-from .attrs import LEMMA, intify_attrs
 from .parts_of_speech import IDS as POS_IDS
-from .errors import Errors, Warnings
-from .util import ensure_path
+from .errors import Warnings
 from . import symbols

From 946a4284bead9bc15aaafc50d69c82a8f253da33 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 26 Apr 2021 17:06:32 +0200
Subject: [PATCH 008/176] Set spacy-legacy to >=3.0.5 (#7897)

Set `spacy-legacy` to `>=3.0.5` due to `spacy.StaticVectors.v1` init bug.
--- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 1947dd2de..a8a15a01b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # Our libraries -spacy-legacy>=3.0.4,<3.1.0 +spacy-legacy>=3.0.5,<3.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 thinc>=8.0.3,<8.1.0 diff --git a/setup.cfg b/setup.cfg index 9e1293335..2fedd8f5c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -37,7 +37,7 @@ setup_requires = thinc>=8.0.3,<8.1.0 install_requires = # Our libraries - spacy-legacy>=3.0.4,<3.1.0 + spacy-legacy>=3.0.5,<3.1.0 murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 From 1690595e4d243378dd13542090c658429fd87d15 Mon Sep 17 00:00:00 2001 From: Janis Klaise Date: Tue, 27 Apr 2021 08:13:39 +0100 Subject: [PATCH 009/176] Update load_lookups return type and docstring (#7907) * Update load_lookups return type and docstring * Add contributor agreement --- .github/contributors/jklaise.md | 106 ++++++++++++++++++++++++++++++++ spacy/lookups.py | 8 +-- 2 files changed, 110 insertions(+), 4 deletions(-) create mode 100644 .github/contributors/jklaise.md diff --git a/.github/contributors/jklaise.md b/.github/contributors/jklaise.md new file mode 100644 index 000000000..66d77ee48 --- /dev/null +++ b/.github/contributors/jklaise.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name |Janis Klaise |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date |26/04/2021 |
+| GitHub username |jklaise |
+| Website (optional) |janisklaise.com |
diff --git a/spacy/lookups.py b/spacy/lookups.py
index 76535d1de..f635f0dcf 100644
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@@ -1,4 +1,4 @@
-from typing import Dict, Any, List, Union, Optional
+from typing import Any, List, Union, Optional
 from pathlib import Path
 import srsly
 from preshed.bloom import BloomFilter
@@ -14,16 +14,16 @@ UNSET = object()

 def load_lookups(
     lang: str, tables: List[str], strict: bool = True
-) -> Optional[Dict[str, Any]]:
+) -> 'Lookups':
     """Load the data from the spacy-lookups-data package for a given language,
-    if available. Returns an empty dict if there's no data or if the package
+    if available. Returns an empty `Lookups` container if there's no data or if the package
     is not installed.

     lang (str): The language code (corresponds to entry point exposed by
         the spacy-lookups-data package).
     tables (List[str]): Name of tables to load, e.g. ["lemma_lookup", "lemma_exc"]
     strict (bool): Whether to raise an error if a table doesn't exist.
-    RETURNS (Dict[str, Any]): The lookups, keyed by table name.
+    RETURNS (Lookups): The lookups container containing the loaded tables.
     """
     # TODO: import spacy_lookups_data instead of going via entry points here?
     lookups = Lookups()

From de6b5ed14dcb036c02e92664365ea2b1fb6cf21c Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Tue, 27 Apr 2021 16:16:35 +0900
Subject: [PATCH 010/176] Fix percent unk display in debug data (#7886)

* Fix percent unk display

This was showing (ratio %), so 10% would show as 0.10%. Fix by
multiplying ratio by 100.

Might want to add a warning if this is over a threshold.

* Only show whole-integer percents
---
 spacy/cli/debug_data.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 3351e53fe..1ebf65957 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -173,8 +173,8 @@ def debug_data(
     )
     n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
     msg.warn(
-        "{} words in training data without vectors ({:0.2f}%)".format(
-            n_missing_vectors, n_missing_vectors / gold_train_data["n_words"]
+        "{} words in training data without vectors ({:.0f}%)".format(
+            n_missing_vectors, 100 * (n_missing_vectors / gold_train_data["n_words"])
         ),
     )
     msg.text(

From 8007d5c8148460d08a6aa500dff0eabb0f504f23 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Wed, 28 Apr 2021 16:17:15 +0900
Subject: [PATCH 011/176] Check if the resume path points to a directory (#7919)

This came up in #7878, but if --resume-path is a directory then loading
the weights will fail. On Linux this will give a straightforward error
message, but on Windows it gives "Permission Denied", which is
confusing.
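
For illustration, the added check behaves roughly like the sketch below
(the helper name and paths are hypothetical, not part of the patch):

```python
from pathlib import Path

def check_resume_path(resume_path: Path) -> None:
    # Fail fast with a clear message instead of letting Windows raise a
    # confusing OS-level "Permission Denied" when the directory is opened later.
    if resume_path.is_dir():
        raise SystemExit(
            f"--resume-path should be a weights file, but {resume_path} is a directory."
        )

check_resume_path(Path("output/model4.bin"))  # OK: points at a weights file
# check_resume_path(Path("output"))           # would exit: points at a directory
```
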
---
 spacy/cli/pretrain.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 1f8fc99cc..fe3ce0dad 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -95,6 +95,13 @@ def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
             "then the new directory will be created for you.",
         )
     if resume_path is not None:
+        if resume_path.is_dir():
+            # This is necessary because Windows gives a Permission Denied when we
+            # try to open the directory later, which is confusing. See #7878
+            msg.fail(
+                f"--resume-path should be a weights file, but {resume_path} is a directory.",
+                exits=True,
+            )
         model_name = re.search(r"model\d+\.bin", str(resume_path))
         if not model_name and not epoch_resume:
             msg.fail(

From f4080983eab96a1c43a98d2553bc2a2cdea3986d Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Wed, 28 Apr 2021 10:18:24 +0200
Subject: [PATCH 012/176] Extend to cupy 9.0.0 (#7914)

---
 .github/azure-steps.yml |  2 +-
 setup.cfg               | 22 +++++++++++-----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index 750e096d0..d536f2eb8 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -41,7 +41,7 @@ steps:
     displayName: "Install test requirements"

   - script: |
-      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110
+      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
       ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
     displayName: "Install GPU requirements"
     condition: eq(${{ parameters.gpu }}, true)

diff --git a/setup.cfg b/setup.cfg
index 2fedd8f5c..63d603a9c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -71,27 +71,27 @@ transformers =
 ray =
     spacy_ray>=0.1.0,<1.0.0
 cuda =
-    cupy>=5.0.0b4,<9.0.0
+    cupy>=5.0.0b4,<10.0.0
 cuda80 =
-    cupy-cuda80>=5.0.0b4,<9.0.0
+    cupy-cuda80>=5.0.0b4,<10.0.0
 cuda90 =
-    cupy-cuda90>=5.0.0b4,<9.0.0
+    cupy-cuda90>=5.0.0b4,<10.0.0
 cuda91 =
-    cupy-cuda91>=5.0.0b4,<9.0.0
+    cupy-cuda91>=5.0.0b4,<10.0.0
 cuda92 =
-    cupy-cuda92>=5.0.0b4,<9.0.0
+    cupy-cuda92>=5.0.0b4,<10.0.0
 cuda100 =
-    cupy-cuda100>=5.0.0b4,<9.0.0
+    cupy-cuda100>=5.0.0b4,<10.0.0
 cuda101 =
-    cupy-cuda101>=5.0.0b4,<9.0.0
+    cupy-cuda101>=5.0.0b4,<10.0.0
 cuda102 =
-    cupy-cuda102>=5.0.0b4,<9.0.0
+    cupy-cuda102>=5.0.0b4,<10.0.0
 cuda110 =
-    cupy-cuda110>=5.0.0b4,<9.0.0
+    cupy-cuda110>=5.0.0b4,<10.0.0
 cuda111 =
-    cupy-cuda111>=5.0.0b4,<9.0.0
+    cupy-cuda111>=5.0.0b4,<10.0.0
 cuda112 =
-    cupy-cuda112>=5.0.0b4,<9.0.0
+    cupy-cuda112>=5.0.0b4,<10.0.0
 # Language tokenizers with external dependencies
 ja =
     sudachipy>=0.4.9

From 49aed683cce4d58baca10e7cb4fe89fbfc209a36 Mon Sep 17 00:00:00 2001
From: Sevdimali
Date: Wed, 28 Apr 2021 16:42:02 +0400
Subject: [PATCH 013/176] Azerbaijani language added (#7911)

---
 .github/contributors/sevdimali.md | 106 ++++++++++++++++++++++
 spacy/lang/az/__init__.py         |  21 +++++
 spacy/lang/az/examples.py         |  18 ++++
 spacy/lang/az/lex_attrs.py        |  89 ++++++++++++++++++
 spacy/lang/az/stop_words.py       | 145 ++++++++++++++++++++++++++++++
 5 files changed, 379 insertions(+)
 create mode 100644 .github/contributors/sevdimali.md
 create mode 100644 spacy/lang/az/__init__.py
 create mode 100644 spacy/lang/az/examples.py
 create mode 100644 spacy/lang/az/lex_attrs.py
 create mode 100644 spacy/lang/az/stop_words.py

diff --git a/.github/contributors/sevdimali.md b/.github/contributors/sevdimali.md
new file mode 100644
index 000000000..6b96abdf8
--- /dev/null +++ b/.github/contributors/sevdimali.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. 
The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Sevdimali | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 10/4/2021 | +| GitHub username | sevdimali | +| Website (optional) | https://sevdimali.me | diff --git a/spacy/lang/az/__init__.py b/spacy/lang/az/__init__.py new file mode 100644 index 000000000..6a4288d1e --- /dev/null +++ b/spacy/lang/az/__init__.py @@ -0,0 +1,21 @@ +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH +from .stop_words import STOP_WORDS +from .syntax_iterators import SYNTAX_ITERATORS +from .lex_attrs import LEX_ATTRS +from ...language import Language + + +class AzerbaijaniDefaults(Language.Defaults): + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS + token_match = TOKEN_MATCH + syntax_iterators = SYNTAX_ITERATORS + + +class Azerbaijani(Language): + lang = "az" + Defaults = AzerbaijaniDefaults + + +__all__ = ["Azerbaijani"] diff --git a/spacy/lang/az/examples.py b/spacy/lang/az/examples.py new file mode 100644 index 000000000..f3331a8cb --- /dev/null +++ b/spacy/lang/az/examples.py @@ -0,0 +1,18 @@ +""" +Example sentences to test spaCy and its language models. 
+>>> from spacy.lang.az.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Bu bir cümlədir.", + "Necəsən?", + "Qarabağ ordeni vətən müharibəsində qələbə münasibəti ilə təsis edilmişdir.", + "Məktəbimizə Bakıdan bir tarix müəllimi gəlmişdi.", + "Atılan növbəti mərmilər lap yaxınlıqda partladı.", + "Sinqapur koronavirus baxımından ən təhlükəsiz ölkələr sırasındadır.", + "Marsda ilk sınaq uçuşu həyata keçirilib.", + "SSRİ dağılandan bəri 5 sahil dövləti Xəzərin statusunu müəyyən edə bilməyiblər.", + "Videoda beyninə xüsusi çip yerləşdirilmiş meymun əks olunub.", +] diff --git a/spacy/lang/az/lex_attrs.py b/spacy/lang/az/lex_attrs.py new file mode 100644 index 000000000..73a5e2762 --- /dev/null +++ b/spacy/lang/az/lex_attrs.py @@ -0,0 +1,89 @@ +from ...attrs import LIKE_NUM + + +# Eleven, twelve etc. are written separate: on bir, on iki + +_num_words = [ + "bir", + "iki", + "üç", + "dörd", + "beş", + "altı", + "yeddi", + "səkkiz", + "doqquz", + "on", + "iyirmi", + "otuz", + "qırx", + "əlli", + "altmış", + "yetmiş", + "səksən", + "doxsan", + "yüz", + "min", + "milyon", + "milyard", + "trilyon", + "kvadrilyon", + "kentilyon", +] + + +_ordinal_words = [ + "birinci", + "ikinci", + "üçüncü", + "dördüncü", + "beşinci", + "altıncı", + "yedinci", + "səkkizinci", + "doqquzuncu", + "onuncu", + "iyirminci", + "otuzuncu", + "qırxıncı", + "əllinci", + "altmışıncı", + "yetmişinci", + "səksəninci", + "doxsanıncı", + "yüzüncü", + "mininci", + "milyonuncu", + "milyardıncı", + "trilyonuncu", + "kvadrilyonuncu", + "kentilyonuncu", +] + +_ordinal_endings = ("inci", "ıncı", "nci", "ncı", "uncu", "üncü") + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + text_lower = text.lower() + # Check cardinal number + if text_lower in _num_words: + return True + # Check ordinal number + if text_lower in _ordinal_words: + return True + if text_lower.endswith(_ordinal_endings): + if text_lower[:-3].isdigit() or text_lower[:-4].isdigit(): + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/az/stop_words.py b/spacy/lang/az/stop_words.py new file mode 100644 index 000000000..2114939ba --- /dev/null +++ b/spacy/lang/az/stop_words.py @@ -0,0 +1,145 @@ +# Source: https://github.com/eliasdabbas/advertools/blob/master/advertools/stopwords.py +STOP_WORDS = set( + """ +amma +arasında +artıq +ay +az +bax +belə +beş +bilər +bir +biraz +biri +birşey +biz +bizim +bizlər +bu +buna +bundan +bunların +bunu +bunun +buradan +bütün +bəli +bəlkə +bəy +bəzi +bəzən +daha +dedi +deyil +dir +düz +də +dək +dən +dəqiqə +edir +edən +elə +et +etdi +etmə +etmək +faiz +gilə +görə +ha +haqqında +harada +heç +hə +həm +həmin +həmişə +hər +idi +il +ildə +ilk +ilə +in +indi +istifadə +isə +ki +kim +kimi +kimə +lakin +lap +mirşey +məhz +mən +mənə +niyə +nə +nəhayət +o +obirisi +of +olan +olar +olaraq +oldu +olduğu +olmadı +olmaz +olmuşdur +olsun +olur +on +ona +ondan +onlar +onlardan +onların +onsuzda +onu +onun +oradan +qarşı +qədər +saat +sadəcə +saniyə +siz +sizin +sizlər +sonra +səhv +sən +sənin +sənə +təəssüf +var +və +xan +xanım +xeyr +ya +yalnız +yaxşı +yeddi +yenə +yox +yoxdur +yoxsa +yəni +zaman +çox +çünki +öz +özü +üçün +əgər +əlbəttə +ən +əslində +""".split() +) From 7cf5bd072fc1ca65be2a9eb3115aa838ba83b04d Mon Sep 17 00:00:00 2001 From: Adriane Boyd 
From 7cf5bd072fc1ca65be2a9eb3115aa838ba83b04d Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Thu, 29 Apr 2021 16:58:54 +0200
Subject: [PATCH 014/176] Refactor util.to_ternary_int (#7944)

* Refactor to avoid literal comparison with `is`

* Extend tests
---
 spacy/tests/test_misc.py | 16 ++++++++++++++++
 spacy/util.py            | 12 ++++++++----
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 0d09999a9..b38a50f71 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -8,6 +8,7 @@ from spacy import prefer_gpu, require_gpu, require_cpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
 from spacy.util import dot_to_object, SimpleFrozenList, import_file
+from spacy.util import to_ternary_int
 from thinc.api import Config, Optimizer, ConfigValidationError, get_current_ops
 from thinc.api import set_current_ops
 from spacy.training.batchers import minibatch_by_words
@@ -386,3 +387,18 @@ def make_dummy_component(
     nlp = English.from_config(config)
     nlp.add_pipe("dummy_component")
     nlp.initialize()
+
+
+def test_to_ternary_int():
+    assert to_ternary_int(True) == 1
+    assert to_ternary_int(None) == 0
+    assert to_ternary_int(False) == -1
+    assert to_ternary_int(1) == 1
+    assert to_ternary_int(1.0) == 1
+    assert to_ternary_int(0) == 0
+    assert to_ternary_int(0.0) == 0
+    assert to_ternary_int(-1) == -1
+    assert to_ternary_int(5) == -1
+    assert to_ternary_int(-10) == -1
+    assert to_ternary_int("string") == -1
+    assert to_ternary_int([0, "string"]) == -1
diff --git a/spacy/util.py b/spacy/util.py
index 512c6b742..84142d5d8 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1533,11 +1533,15 @@ def to_ternary_int(val) -> int:
     attributes such as SENT_START: True/1/1.0 is 1 (True), None/0/0.0 is 0
     (None), any other values are -1 (False).
     """
-    if isinstance(val, float):
-        val = int(val)
-    if val is True or val is 1:
+    if val is True:
         return 1
-    elif val is None or val is 0:
+    elif val is None:
+        return 0
+    elif val is False:
+        return -1
+    elif val == 1:
+        return 1
+    elif val == 0:
         return 0
     else:
         return -1
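The refactored helper is self-contained, so its behaviour is easy to check in isolation. In the sketch below the function body is copied from the hunk above and the assertions restate the new tests. The ordering matters: `val is False` must run before `val == 0`, because `False == 0` is true in Python, so a plain equality chain would wrongly map `False` to 0 instead of -1. The old `val is 1` and `val is 0` comparisons relied on CPython's small-integer caching and trigger a `SyntaxWarning` on Python 3.8+.

```python
# Function body copied verbatim from the patch above; assertions mirror the
# new tests in spacy/tests/test_misc.py.
def to_ternary_int(val) -> int:
    if val is True:
        return 1
    elif val is None:
        return 0
    elif val is False:
        return -1
    elif val == 1:
        return 1
    elif val == 0:
        return 0
    else:
        return -1

assert to_ternary_int(1.0) == 1
assert to_ternary_int(0.0) == 0
assert to_ternary_int(False) == -1   # identity check, not equality: False == 0
assert to_ternary_int("string") == -1
assert to_ternary_int([0, "string"]) == -1
```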
""" - if isinstance(val, float): - val = int(val) - if val is True or val is 1: + if val is True: return 1 - elif val is None or val is 0: + elif val is None: + return 0 + elif val is False: + return -1 + elif val == 1: + return 1 + elif val == 0: return 0 else: return -1 From cf032ec31e38f57940edfb93f041bcd373871554 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 29 Apr 2021 19:11:28 +0200 Subject: [PATCH 015/176] Update to catalogue>=2.0.4 (#7951) --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index a8a15a01b..09d1cabda 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ ml_datasets>=0.2.0,<0.3.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.8.1,<1.1.0 srsly>=2.4.1,<3.0.0 -catalogue>=2.0.3,<2.1.0 +catalogue>=2.0.4,<2.1.0 typer>=0.3.0,<0.4.0 pathy>=0.3.5 # Third party dependencies diff --git a/setup.cfg b/setup.cfg index 63d603a9c..5cda00fb2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -45,7 +45,7 @@ install_requires = blis>=0.4.0,<0.8.0 wasabi>=0.8.1,<1.1.0 srsly>=2.4.1,<3.0.0 - catalogue>=2.0.3,<2.1.0 + catalogue>=2.0.4,<2.1.0 typer>=0.3.0,<0.4.0 pathy>=0.3.5 # Third-party dependencies From 2320791f6dc42f7724cedc86a420572c90aa7a5c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 30 Apr 2021 12:21:31 +0200 Subject: [PATCH 016/176] Fix Transformer.initialize example (#7963) --- website/docs/api/transformer.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index 5aaa1d23e..6de2b0a87 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -175,7 +175,7 @@ by [`Language.initialize`](/api/language#initialize). > > ```python > trf = nlp.add_pipe("transformer") -> trf.initialize(lambda: [], nlp=nlp) +> trf.initialize(lambda: iter([]), nlp=nlp) > ``` | Name | Description | From 12d3d0feddc4f813d1cc63ab2465e31e9c8816cc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 3 May 2021 11:48:12 +1000 Subject: [PATCH 017/176] Fix quickstart default checked of conditional fields [ci skip] --- website/src/components/quickstart.js | 3 ++- website/src/widgets/quickstart-training.js | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/website/src/components/quickstart.js b/website/src/components/quickstart.js index 90a8e0983..a32db8975 100644 --- a/website/src/components/quickstart.js +++ b/website/src/components/quickstart.js @@ -105,12 +105,13 @@ const Quickstart = ({ multiple, other, help, + hidden, }) => { // Optional function that's called with the value const setterFunc = setters[id] || (() => {}) // Check if dropdown should be shown const dropdownGetter = showDropdown[id] || (() => true) - return ( + return hidden ? null : (