From 59271e887a81f02ac7b608bb71459dff5d49cfa9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Octavio=20Queiroz=20Dias?= Date: Sat, 6 Feb 2021 09:44:51 -0300 Subject: [PATCH 01/35] fix: TransformerListener with TextCatEnsemble (#6951) * bug: Regression test Issue #6946 * fix: Fix issue #6946 * chore: Remove regression test --- spacy/ml/models/textcat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index e0c11ed99..0234530e6 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -107,6 +107,7 @@ def init_ensemble_textcat(model, X, Y) -> Model: model.get_ref("maxout_layer").set_dim("nO", tok2vec_width) model.get_ref("maxout_layer").set_dim("nI", tok2vec_width) model.get_ref("norm_layer").set_dim("nI", tok2vec_width) + model.get_ref("norm_layer").set_dim("nO", tok2vec_width) init_chain(model, X, Y) return model From a7977b51434c0aacf0ba37467b5451a204b6a568 Mon Sep 17 00:00:00 2001 From: melonwater211 <78627222+melonwater211@users.noreply.github.com> Date: Sat, 6 Feb 2021 15:51:34 -0800 Subject: [PATCH 02/35] The test `spacy/tests/vocab_vectors/test_lexeme.py::test_vocab_lexeme_add_flag_auto_id` seems to fail occasionally when the test suite is run in a random order. (#6956) ```python def test_vocab_lexeme_add_flag_auto_id(en_vocab): is_len4 = en_vocab.add_flag(lambda string: len(string) == 4) assert en_vocab["1999"].check_flag(is_len4) is True assert en_vocab["1999"].check_flag(IS_DIGIT) is True assert en_vocab["199"].check_flag(is_len4) is False > assert en_vocab["199"].check_flag(IS_DIGIT) is True E assert False is True E + where False = (3) E + where = .check_flag spacy/tests/vocab_vectors/test_lexeme.py:49: AssertionError ``` > `pytest==6.1.1` > > `numpy==1.19.2` > > `Python version: 3.8.3` To reproduce the error, run `pytest --random-order-bucket=global --random-order-seed=170158 -v spacy/tests` If `test_vocab_lexeme_add_flag_auto_id` is run after `test_vocab_lexeme_add_flag_provided_id`, it fails. It seems like `test_vocab_lexeme_add_flag_provided_id` uses the `IS_DIGIT` bit for testing purposes but does not reset the bit. This solution seems to work but, if anyone has a better fix, please let me know and I will integrate it. 
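The fix in the diff below simply re-registers the default predicate for `IS_DIGIT` at the end of the test that borrowed it. A minimal sketch of that reset pattern, assuming spaCy's standard `en_vocab` test fixture (an illustration of the idea, not the full test file):

```python
from spacy.attrs import IS_DIGIT


def test_vocab_lexeme_add_flag_provided_id(en_vocab):
    # Borrow the built-in IS_DIGIT flag ID for a custom length-4 predicate.
    is_len4 = en_vocab.add_flag(lambda string: len(string) == 4, flag_id=IS_DIGIT)
    assert en_vocab["1999"].check_flag(is_len4) is True
    assert en_vocab["dogs"].check_flag(is_len4) is True
    # Restore the default IS_DIGIT behaviour so test order no longer matters.
    en_vocab.add_flag(lambda string: string.isdigit(), flag_id=IS_DIGIT)
```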
--- spacy/tests/vocab_vectors/test_lexeme.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py index 4288f427c..4eeff5175 100644 --- a/spacy/tests/vocab_vectors/test_lexeme.py +++ b/spacy/tests/vocab_vectors/test_lexeme.py @@ -55,7 +55,7 @@ def test_vocab_lexeme_add_flag_provided_id(en_vocab): assert en_vocab["199"].check_flag(IS_DIGIT) is False assert en_vocab["the"].check_flag(is_len4) is False assert en_vocab["dogs"].check_flag(is_len4) is True - + en_vocab.add_flag(lambda string: string.isdigit(), flag_id=IS_DIGIT) def test_vocab_lexeme_oov_rank(en_vocab): """Test that default rank is OOV_RANK.""" From a323ef90df7c5a72e9deb9272fdb39a40fa8a9b5 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sun, 7 Feb 2021 00:51:56 +0100 Subject: [PATCH 03/35] ensure the loss value is cast as float (#6928) --- spacy/pipeline/entity_linker.py | 2 +- spacy/pipeline/multitask.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 6ea735dde..262bcf677 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -273,7 +273,7 @@ class EntityLinker(TrainablePipe): gradients = self.distance.get_grad(sentence_encodings, entity_encodings) loss = self.distance.get_loss(sentence_encodings, entity_encodings) loss = loss / len(entity_encodings) - return loss, gradients + return float(loss), gradients def predict(self, docs: Iterable[Doc]) -> List[str]: """Apply the pipeline's model to a batch of docs, without modifying them. diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx index cfb492612..990b6a1de 100644 --- a/spacy/pipeline/multitask.pyx +++ b/spacy/pipeline/multitask.pyx @@ -197,7 +197,7 @@ class ClozeMultitask(TrainablePipe): target = vectors[ids] gradient = self.distance.get_grad(prediction, target) loss = self.distance.get_loss(prediction, target) - return loss, gradient + return float(loss), gradient def update(self, examples, *, drop=0., sgd=None, losses=None): pass From 6ed423c16c99206ff2b81176d9565d0e1c1b7071 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sun, 7 Feb 2021 01:05:43 +0100 Subject: [PATCH 04/35] reduce memory load when reading all vectors from file (#6945) * reduce memory load when reading all vectors from file * one more small typo fix --- spacy/lexeme.pyx | 2 +- spacy/training/initialize.py | 16 ++++++++++------ website/docs/api/top-level.md | 2 +- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 25461b4b7..c8e0f2965 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -451,7 +451,7 @@ cdef class Lexeme: Lexeme.c_set_flag(self.c, IS_QUOTE, x) property is_left_punct: - """RETURNS (bool): Whether the lexeme is left punctuation, e.g. ).""" + """RETURNS (bool): Whether the lexeme is left punctuation, e.g. 
(.""" def __get__(self): return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT) diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 7457dc359..25bb73c78 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -215,8 +215,7 @@ def convert_vectors( def read_vectors(vectors_loc: Path, truncate_vectors: int): - f = open_file(vectors_loc) - f = ensure_shape(f) + f = ensure_shape(vectors_loc) shape = tuple(int(size) for size in next(f).split()) if truncate_vectors >= 1: shape = (truncate_vectors, shape[1]) @@ -251,11 +250,12 @@ def open_file(loc: Union[str, Path]) -> IO: return loc.open("r", encoding="utf8") -def ensure_shape(lines): +def ensure_shape(vectors_loc): """Ensure that the first line of the data is the vectors shape. If it's not, we read in the data and output the shape as the first result, so that the reader doesn't have to deal with the problem. """ + lines = open_file(vectors_loc) first_line = next(lines) try: shape = tuple(int(size) for size in first_line.split()) @@ -269,7 +269,11 @@ def ensure_shape(lines): # Figure out the shape, make it the first value, and then give the # rest of the data. width = len(first_line.split()) - 1 - captured = [first_line] + list(lines) - length = len(captured) + length = 1 + for _ in lines: + length += 1 yield f"{length} {width}" - yield from captured + # Reading the lines in again from file. This to avoid having to + # store all the results in a list in memory + lines2 = open_file(vectors_loc) + yield from lines2 diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 3a2c65553..37f619f3e 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -727,7 +727,7 @@ capitalization by including a mix of capitalized and lowercase examples. See the Create a data augmentation callback that uses orth-variant replacement. The callback can be added to a corpus or other data iterator during training. It's -is especially useful for punctuation and case replacement, to help generalize +especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart quotes, or only have smart quotes etc. | Name | Description | From 6108dabdc8412f09499c586802a12bceba2fd516 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 8 Feb 2021 09:21:36 +0100 Subject: [PATCH 05/35] Rephrase error related to sample data initialization Now that the initialize step is fully implemented, the source of E923 is typically missing or improperly converted/formatted data rather than a bug in spaCy, so rephrase the error and message and remove the prompt to open an issue. --- spacy/errors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 54fd2d8b0..79a15fbaa 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -579,8 +579,8 @@ class Errors: E922 = ("Component '{name}' has been initialized with an output dimension of " "{nO} - cannot add any more labels.") E923 = ("It looks like there is no proper sample data to initialize the " - "Model of component '{name}'. This is likely a bug in spaCy, so " - "feel free to open an issue: https://github.com/explosion/spaCy/issues") + "Model of component '{name}'. To check your input data paths and " + "annotation, run: python -m spacy debug data config.cfg") E924 = ("The '{name}' component does not seem to be initialized properly. 
" "This is likely a bug in spaCy, so feel free to open an issue: " "https://github.com/explosion/spaCy/issues") From bb7482bef86bb2f6bfe68031ae0d51633bb17d6b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 8 Feb 2021 18:39:59 +0100 Subject: [PATCH 06/35] fix link --- website/docs/usage/training.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index ab3f9c4fd..1758f677a 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -185,7 +185,7 @@ sections of a config file are: For a full overview of spaCy's config format and settings, see the [data format documentation](/api/data-formats#config) and -[Thinc's config system docs](https://thinc.ai/usage/config). The settings +[Thinc's config system docs](https://thinc.ai/docs/usage-config). The settings available for the different architectures are documented with the [model architectures API](/api/architectures). See the Thinc documentation for [optimizers](https://thinc.ai/docs/api-optimizers) and From e897e7aaadd67ad8ac857b28181464f32bd9bb3b Mon Sep 17 00:00:00 2001 From: tarskiandhutch <45950680+tarskiandhutch@users.noreply.github.com> Date: Mon, 8 Feb 2021 15:24:57 -0500 Subject: [PATCH 07/35] Line 70: syntax error Original config definition treated dictionary key as a function argument. --- website/docs/api/lemmatizer.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index e838c75b2..f186535f7 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -67,7 +67,7 @@ data format used by the lookup and rule-based lemmatizers, see > lemmatizer = nlp.add_pipe("lemmatizer") > > # Construction via add_pipe with custom settings -> config = {"mode": "rule", overwrite=True} +> config = {"mode": "rule", "overwrite": True} > lemmatizer = nlp.add_pipe("lemmatizer", config=config) > ``` From 8ed788660b1103077d8a90eb4664772a040058d5 Mon Sep 17 00:00:00 2001 From: Koichi Yasuoka Date: Tue, 9 Feb 2021 14:43:02 +0900 Subject: [PATCH 08/35] Several callable objects do not have __qualname__ --- spacy/util.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/util.py b/spacy/util.py index f55b03db8..013c87acc 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -930,6 +930,8 @@ def is_same_func(func1: Callable, func2: Callable) -> bool: """ if not callable(func1) or not callable(func2): return False + if not hasattr(func1,"__qualname__") or not hasattr(func2,"__qualname__"): + return False same_name = func1.__qualname__ == func2.__qualname__ same_file = inspect.getfile(func1) == inspect.getfile(func2) same_code = inspect.getsourcelines(func1) == inspect.getsourcelines(func2) From 9a7f33c91672551ed75db09a224a9eb2fa34d7b2 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 9 Feb 2021 21:28:33 +0100 Subject: [PATCH 09/35] final 3.0 benchmark numbers --- website/docs/usage/_benchmarks-models.md | 6 +++--- website/docs/usage/facts-figures.md | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md index be49406bc..5bf9e63ca 100644 --- a/website/docs/usage/_benchmarks-models.md +++ b/website/docs/usage/_benchmarks-models.md @@ -4,8 +4,8 @@ import { Help } from 'components/typography'; import Link from 'components/link' | Pipeline | Parser | Tagger | NER | | ---------------------------------------------------------- | -----: | -----: | ---: | -| 
[`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | 95.2 | 97.8 | 89.9 |
-| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | 91.9 | 97.4 | 85.5 |
+| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | 95.1 | 97.8 | 89.8 |
+| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | 92.0 | 97.4 | 85.5 |
 | `en_core_web_lg` (spaCy v2) | 91.9 | 97.2 | 85.5 |
@@ -22,7 +22,7 @@ the development set). | Named Entity Recognition System | OntoNotes | CoNLL '03 | | -------------------------------- | --------: | --------: | -| spaCy RoBERTa (2020) | 89.7 | 91.6 | +| spaCy RoBERTa (2020) | 89.8 | 91.6 | | Stanza (StanfordNLP)1 | 88.8 | 92.1 | | Flair2 | 89.7 | 93.1 | diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md index e77f384b5..4bee31ed0 100644 --- a/website/docs/usage/facts-figures.md +++ b/website/docs/usage/facts-figures.md @@ -77,7 +77,7 @@ import Benchmarks from 'usage/\_benchmarks-models.md' | Dependency Parsing System | UAS | LAS | | ------------------------------------------------------------------------------ | ---: | ---: | -| spaCy RoBERTa (2020) | 95.5 | 94.3 | +| spaCy RoBERTa (2020) | 95.1 | 93.7 | | [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.4 | 96.3 | | [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.2 | 95.7 | From c08b3f294cdfb07c4f75e88f955339111074d025 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 10 Feb 2021 13:45:27 +1100 Subject: [PATCH 10/35] Support env vars and CLI overrides for project.yml --- spacy/cli/_util.py | 57 ++++++++++++++++++++++++---------- spacy/cli/project/run.py | 16 +++++++--- spacy/schemas.py | 1 + spacy/tests/test_cli.py | 17 ++++++++++ website/docs/usage/projects.md | 45 +++++++++++++++++++++------ 5 files changed, 105 insertions(+), 31 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index e66420024..86b3ab356 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -16,7 +16,7 @@ import os from ..schemas import ProjectConfigSchema, validate from ..util import import_file, run_command, make_tempdir, registry, logger -from ..util import is_compatible_version, ENV_VARS +from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS from .. import about if TYPE_CHECKING: @@ -111,26 +111,33 @@ def _parse_overrides(args: List[str], is_cli: bool = False) -> Dict[str, Any]: value = "true" else: value = args.pop(0) - # Just like we do in the config, we're calling json.loads on the - # values. But since they come from the CLI, it'd be unintuitive to - # explicitly mark strings with escaped quotes. So we're working - # around that here by falling back to a string if parsing fails. - # TODO: improve logic to handle simple types like list of strings? - try: - result[opt] = srsly.json_loads(value) - except ValueError: - result[opt] = str(value) + result[opt] = _parse_override(value) else: msg.fail(f"{err}: name should start with --", exits=1) return result -def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]: +def _parse_override(value: Any) -> Any: + # Just like we do in the config, we're calling json.loads on the + # values. But since they come from the CLI, it'd be unintuitive to + # explicitly mark strings with escaped quotes. So we're working + # around that here by falling back to a string if parsing fails. + # TODO: improve logic to handle simple types like list of strings? + try: + return srsly.json_loads(value) + except ValueError: + return str(value) + + +def load_project_config( + path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict() +) -> Dict[str, Any]: """Load the project.yml file from a directory and validate it. Also make sure that all directories defined in the config exist. path (Path): The path to the project directory. interpolate (bool): Whether to substitute project variables. 
+ overrides (Dict[str, Any]): Optional config overrides. RETURNS (Dict[str, Any]): The loaded project.yml. """ config_path = path / PROJECT_FILE @@ -154,20 +161,36 @@ def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]: if not dir_path.exists(): dir_path.mkdir(parents=True) if interpolate: - err = "project.yml validation error" + err = f"{PROJECT_FILE} validation error" with show_validation_error(title=err, hint_fill=False): - config = substitute_project_variables(config) + config = substitute_project_variables(config, overrides) return config -def substitute_project_variables(config: Dict[str, Any], overrides: Dict = {}): - key = "vars" +def substitute_project_variables( + config: Dict[str, Any], + overrides: Dict[str, Any] = SimpleFrozenDict(), + key: str = "vars", + env_key: str = "env", +) -> Dict[str, Any]: + """Interpolate variables in the project file using the config system. + + config (Dict[str, Any]): The project config. + overrides (Dict[str, Any]): Optional config overrides. + key (str): Key containing variables in project config. + env_key (str): Key containing environment variable mapping in project config. + RETURNS (Dict[str, Any]): The interpolated project config. + """ config.setdefault(key, {}) - config[key].update(overrides) + config.setdefault(env_key, {}) + # Substitute references to env vars with their values + for config_var, env_var in config[env_key].items(): + config[env_key][config_var] = _parse_override(os.environ.get(env_var, "")) # Need to put variables in the top scope again so we can have a top-level # section "project" (otherwise, a list of commands in the top scope wouldn't) # be allowed by Thinc's config system - cfg = Config({"project": config, key: config[key]}) + cfg = Config({"project": config, key: config[key], env_key: config[env_key]}) + cfg = Config().from_str(cfg.to_str(), overrides=overrides) interpolated = cfg.interpolate() return dict(interpolated["project"]) diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index 17c881595..5339d2a21 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -3,19 +3,23 @@ from pathlib import Path from wasabi import msg import sys import srsly +import typer from ... import about from ...git_info import GIT_VERSION from ...util import working_dir, run_command, split_command, is_cwd, join_command from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS -from ...util import check_bool_env_var +from ...util import check_bool_env_var, SimpleFrozenDict from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash -from .._util import get_checksum, project_cli, Arg, Opt, COMMAND +from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides -@project_cli.command("run") +@project_cli.command( + "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True} +) def project_run_cli( # fmt: off + ctx: typer.Context, # This is only used to read additional arguments subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"), project_dir: Path = Arg(Path.cwd(), help="Location of project directory. 
Defaults to current working directory.", exists=True, file_okay=False), force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"), @@ -33,13 +37,15 @@ def project_run_cli( if show_help or not subcommand: print_run_help(project_dir, subcommand) else: - project_run(project_dir, subcommand, force=force, dry=dry) + overrides = parse_config_overrides(ctx.args) + project_run(project_dir, subcommand, overrides=overrides, force=force, dry=dry) def project_run( project_dir: Path, subcommand: str, *, + overrides: Dict[str, Any] = SimpleFrozenDict(), force: bool = False, dry: bool = False, capture: bool = False, @@ -59,7 +65,7 @@ def project_run( when you want to turn over execution to the command, and capture=True when you want to run the command more like a function. """ - config = load_project_config(project_dir) + config = load_project_config(project_dir, overrides=overrides) commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} workflows = config.get("workflows", {}) validate_subcommand(commands.keys(), workflows.keys(), subcommand) diff --git a/spacy/schemas.py b/spacy/schemas.py index d041845f3..2f25c785f 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -446,6 +446,7 @@ class ProjectConfigCommand(BaseModel): class ProjectConfigSchema(BaseModel): # fmt: off vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands") + env: Dict[StrictStr, Any] = Field({}, title="Optional variable names to substitute in commands, mapped to environment variable names") assets: List[Union[ProjectConfigAssetURL, ProjectConfigAssetGit]] = Field([], title="Data assets") workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order") commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts") diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index bfbee677a..a3834f31a 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -325,6 +325,23 @@ def test_project_config_interpolation(): substitute_project_variables(project) +def test_project_config_interpolation_env(): + variables = {"a": 10} + env_var = "SPACY_TEST_FOO" + env_vars = {"foo": env_var} + commands = [{"name": "x", "script": ["hello ${vars.a} ${env.foo}"]}] + project = {"commands": commands, "vars": variables, "env": env_vars} + with make_tempdir() as d: + srsly.write_yaml(d / "project.yml", project) + cfg = load_project_config(d) + assert cfg["commands"][0]["script"][0] == "hello 10 " + os.environ[env_var] = "123" + with make_tempdir() as d: + srsly.write_yaml(d / "project.yml", project) + cfg = load_project_config(d) + assert cfg["commands"][0]["script"][0] == "hello 10 123" + + @pytest.mark.parametrize( "args,expected", [ diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 492345f2f..97b5b9f28 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -69,9 +69,9 @@ python -m spacy project clone pipelines/tagger_parser_ud By default, the project will be cloned into the current working directory. You can specify an optional second argument to define the output directory. The -`--repo` option lets you define a custom repo to clone from if you don't want -to use the spaCy [`projects`](https://github.com/explosion/projects) repo. You -can also use any private repo you have access to with Git. 
+`--repo` option lets you define a custom repo to clone from if you don't want to +use the spaCy [`projects`](https://github.com/explosion/projects) repo. You can +also use any private repo you have access to with Git. ### 2. Fetch the project assets {#assets} @@ -221,6 +221,7 @@ pipelines. | `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). | | `description` | An optional project description used in [auto-generated docs](#custom-docs). | | `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | +| `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. | | `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. | | `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. | | `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. | @@ -310,8 +311,8 @@ company-internal and not available over the internet. In that case, you can specify the destination paths and a checksum, and leave out the URL. When your teammates clone and run your project, they can place the files in the respective directory themselves. The [`project assets`](/api/cli#project-assets) command -will alert you about missing files and mismatched checksums, so you can ensure that -others are running your project with the same data. +will alert you about missing files and mismatched checksums, so you can ensure +that others are running your project with the same data. ### Dependencies and outputs {#deps-outputs} @@ -358,9 +359,10 @@ graphs based on the dependencies and outputs, and won't re-run previous steps automatically. For instance, if you only run the command `train` that depends on data created by `preprocess` and those files are missing, spaCy will show an error – it won't just re-run `preprocess`. If you're looking for more advanced -data management, check out the [Data Version Control (DVC) integration](#dvc). If you're planning on integrating your spaCy project with DVC, you -can also use `outputs_no_cache` instead of `outputs` to define outputs that -won't be cached or tracked. +data management, check out the [Data Version Control (DVC) integration](#dvc). +If you're planning on integrating your spaCy project with DVC, you can also use +`outputs_no_cache` instead of `outputs` to define outputs that won't be cached +or tracked. 
### Files and directory structure {#project-files} @@ -467,7 +469,9 @@ In your `project.yml`, you can then run the script by calling `python scripts/custom_evaluation.py` with the function arguments. You can also use the `vars` section to define reusable variables that will be substituted in commands, paths and URLs. In this example, the batch size is defined as a -variable will be added in place of `${vars.batch_size}` in the script. +variable will be added in place of `${vars.batch_size}` in the script. Just like +in the [training config](/usage/training##config-overrides), you can also +override settings on the command line – for example using `--vars.batch_size`. > #### Calling into Python > @@ -491,6 +495,29 @@ commands: - 'corpus/eval.json' ``` +You can also use the `env` section to reference **environment variables** and +make their values available to the commands. This can be useful for overriding +settings on the command line and passing through system-level settings. + +> #### Usage example +> +> ```bash +> export GPU_ID=1 +> BATCH_SIZE=128 python -m spacy project run evaluate +> ``` + +```yaml +### project.yml +env: + batch_size: BATCH_SIZE + gpu_id: GPU_ID + +commands: + - name: evaluate + script: + - 'python scripts/custom_evaluation.py ${env.batch_size}' +``` + ### Documenting your project {#custom-docs} > #### Readme Example From 21176c69b0c9743c5e107fed6637208f97f48fd0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 10 Feb 2021 14:12:00 +1100 Subject: [PATCH 11/35] Update and add test --- spacy/tests/pipeline/test_pipe_factories.py | 21 +++++++++++++++++---- spacy/util.py | 2 +- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index 2af4b1efb..e1706ffb1 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -451,13 +451,27 @@ def test_pipe_factories_from_source_config(): assert config["arg"] == "world" -def test_pipe_factories_decorator_idempotent(): +class PipeFactoriesIdempotent: + def __init__(self, nlp, name): + ... + + def __call__(self, doc): + ... + + +@pytest.mark.parametrize( + "i,func,func2", + [ + (0, lambda nlp, name: lambda doc: doc, lambda doc: doc), + (1, PipeFactoriesIdempotent, PipeFactoriesIdempotent(None, None)), + ], +) +def test_pipe_factories_decorator_idempotent(i, func, func2): """Check that decorator can be run multiple times if the function is the same. This is especially relevant for live reloading because we don't want spaCy to raise an error if a module registering components is reloaded. 
""" - name = "test_pipe_factories_decorator_idempotent" - func = lambda nlp, name: lambda doc: doc + name = f"test_pipe_factories_decorator_idempotent_{i}" for i in range(5): Language.factory(name, func=func) nlp = Language() @@ -466,7 +480,6 @@ def test_pipe_factories_decorator_idempotent(): # Make sure it also works for component decorator, which creates the # factory function name2 = f"{name}2" - func2 = lambda doc: doc for i in range(5): Language.component(name2, func=func2) nlp = Language() diff --git a/spacy/util.py b/spacy/util.py index 013c87acc..aa9bf301e 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -930,7 +930,7 @@ def is_same_func(func1: Callable, func2: Callable) -> bool: """ if not callable(func1) or not callable(func2): return False - if not hasattr(func1,"__qualname__") or not hasattr(func2,"__qualname__"): + if not hasattr(func1, "__qualname__") or not hasattr(func2, "__qualname__"): return False same_name = func1.__qualname__ == func2.__qualname__ same_file = inspect.getfile(func1) == inspect.getfile(func2) From 61b04a70d5a7ec8b6e22ccddf74f9654a261043d Mon Sep 17 00:00:00 2001 From: Peter Baumann <54291192+peter-exos@users.noreply.github.com> Date: Wed, 10 Feb 2021 07:43:32 -0500 Subject: [PATCH 12/35] Run PhraseMatcher on Spans (#6918) * Add regression test * Run PhraseMatcher on Spans * Add test for PhraseMatcher on Spans and Docs * Add SCA * Add test with 3 matches in Doc, 1 match in Span * Update docs * Use doc.length for find_matches in tokenizer Co-authored-by: Adriane Boyd --- .github/contributors/peter-exos.md | 106 +++++++++++++++++++++ spacy/matcher/phrasematcher.pxd | 2 +- spacy/matcher/phrasematcher.pyx | 28 ++++-- spacy/tests/matcher/test_phrase_matcher.py | 30 ++++++ spacy/tests/regression/test_issue6839.py | 15 +++ spacy/tokenizer.pyx | 2 +- website/docs/api/phrasematcher.md | 4 +- 7 files changed, 174 insertions(+), 13 deletions(-) create mode 100644 .github/contributors/peter-exos.md create mode 100644 spacy/tests/regression/test_issue6839.py diff --git a/.github/contributors/peter-exos.md b/.github/contributors/peter-exos.md new file mode 100644 index 000000000..e0ef1346e --- /dev/null +++ b/.github/contributors/peter-exos.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Peter Baumann | +| Company name (if applicable) | Exos Financial | +| Title or role (if applicable) | data scientist | +| Date | Feb 1st, 2021 | +| GitHub username | peter-exos | +| Website (optional) | | diff --git a/spacy/matcher/phrasematcher.pxd b/spacy/matcher/phrasematcher.pxd index 3b42f3fab..1bdc19012 100644 --- a/spacy/matcher/phrasematcher.pxd +++ b/spacy/matcher/phrasematcher.pxd @@ -18,4 +18,4 @@ cdef class PhraseMatcher: cdef Pool mem cdef key_t _terminal_hash - cdef void find_matches(self, Doc doc, vector[SpanC] *matches) nogil + cdef void find_matches(self, Doc doc, int start_idx, int end_idx, vector[SpanC] *matches) nogil diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 2baf2ffb7..088456b9a 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -230,10 +230,10 @@ cdef class PhraseMatcher: result = internal_node map_set(self.mem, result, self.vocab.strings[key], NULL) - def __call__(self, doc, *, as_spans=False): + def __call__(self, object doclike, *, as_spans=False): """Find all sequences matching the supplied patterns on the `Doc`. - doc (Doc): The document to match over. + doclike (Doc or Span): The document to match over. as_spans (bool): Return Span objects with labels instead of (match_id, start, end) tuples. RETURNS (list): A list of `(match_id, start, end)` tuples, @@ -244,12 +244,22 @@ cdef class PhraseMatcher: DOCS: https://spacy.io/api/phrasematcher#call """ matches = [] - if doc is None or len(doc) == 0: + if doclike is None or len(doclike) == 0: # if doc is empty or None just return empty list return matches + if isinstance(doclike, Doc): + doc = doclike + start_idx = 0 + end_idx = len(doc) + elif isinstance(doclike, Span): + doc = doclike.doc + start_idx = doclike.start + end_idx = doclike.end + else: + raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__)) cdef vector[SpanC] c_matches - self.find_matches(doc, &c_matches) + self.find_matches(doc, start_idx, end_idx, &c_matches) for i in range(c_matches.size()): matches.append((c_matches[i].label, c_matches[i].start, c_matches[i].end)) for i, (ent_id, start, end) in enumerate(matches): @@ -261,17 +271,17 @@ cdef class PhraseMatcher: else: return matches - cdef void find_matches(self, Doc doc, vector[SpanC] *matches) nogil: + cdef void find_matches(self, Doc doc, int start_idx, int end_idx, vector[SpanC] *matches) nogil: cdef MapStruct* current_node = self.c_map cdef int start = 0 - cdef int idx = 0 - cdef int idy = 0 + cdef int idx = start_idx + cdef int idy = start_idx cdef key_t key cdef void* value cdef int i = 0 cdef SpanC ms cdef void* result - while idx < doc.length: + while idx < end_idx: start = idx token = Token.get_struct_attr(&doc.c[idx], self.attr) # look for sequences from this position @@ -279,7 +289,7 @@ cdef class PhraseMatcher: if result: current_node = result idy = idx + 1 - while idy < doc.length: + while idy < end_idx: result = map_get(current_node, self._terminal_hash) if result: i = 0 diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index c7a3fef7d..230ca3b19 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -323,3 +323,33 @@ def test_phrase_matcher_deprecated(en_vocab): @pytest.mark.parametrize("attr", ["SENT_START", "IS_SENT_START"]) def 
test_phrase_matcher_sent_start(en_vocab, attr): _ = PhraseMatcher(en_vocab, attr=attr) # noqa: F841 + + +def test_span_in_phrasematcher(en_vocab): + """Ensure that PhraseMatcher accepts Span and Doc as input""" + doc = Doc(en_vocab, + words=["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."]) + span = doc[:8] + pattern = Doc(en_vocab, words=["Spans", "and", "Docs"]) + matcher = PhraseMatcher(en_vocab) + matcher.add("SPACY", [pattern]) + matches_doc = matcher(doc) + matches_span = matcher(span) + assert len(matches_doc) == 1 + assert len(matches_span) == 1 + + +def test_span_v_doc_in_phrasematcher(en_vocab): + """Ensure that PhraseMatcher only returns matches in input Span and not in entire Doc""" + doc = Doc(en_vocab, + words=["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", + "Spans", "and", "Docs", "in", "my", "matchers", "," + "and", "Spans", "and", "Docs", "everywhere" "."]) + span = doc[9:15] # second clause + pattern = Doc(en_vocab, words=["Spans", "and", "Docs"]) + matcher = PhraseMatcher(en_vocab) + matcher.add("SPACY", [pattern]) + matches_doc = matcher(doc) + matches_span = matcher(span) + assert len(matches_doc) == 3 + assert len(matches_span) == 1 diff --git a/spacy/tests/regression/test_issue6839.py b/spacy/tests/regression/test_issue6839.py new file mode 100644 index 000000000..2148cf867 --- /dev/null +++ b/spacy/tests/regression/test_issue6839.py @@ -0,0 +1,15 @@ +from spacy.tokens import Doc +from spacy.matcher import PhraseMatcher + + +def test_span_in_phrasematcher(en_vocab): + """Ensure that PhraseMatcher accepts Span as input""" + doc = Doc(en_vocab, + words=["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."]) + span = doc[:8] + pattern = Doc(en_vocab, words=["Spans", "and", "Docs"]) + matcher = PhraseMatcher(en_vocab) + matcher.add("SPACY", [pattern]) + matches = matcher(span) + assert matches + diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 8c620165e..5bd6e7aa3 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -245,7 +245,7 @@ cdef class Tokenizer: cdef int offset cdef int modified_doc_length # Find matches for special cases - self._special_matcher.find_matches(doc, &c_matches) + self._special_matcher.find_matches(doc, 0, doc.length, &c_matches) # Skip processing if no matches if c_matches.size() == 0: return True diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md index 47bbdcf6a..540476949 100644 --- a/website/docs/api/phrasematcher.md +++ b/website/docs/api/phrasematcher.md @@ -44,7 +44,7 @@ be shown. ## PhraseMatcher.\_\_call\_\_ {#call tag="method"} -Find all token sequences matching the supplied patterns on the `Doc`. +Find all token sequences matching the supplied patterns on the `Doc` or `Span`. > #### Example > @@ -59,7 +59,7 @@ Find all token sequences matching the supplied patterns on the `Doc`. | Name | Description | | ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | The document to match over. ~~Doc~~ | +| `doclike` | The `Doc` or `Span` to match over. 
~~Union[Doc, Span]~~ | | _keyword-only_ | | | `as_spans` 3 | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ | | **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ | From 24046fef17f211ec7e131c87f7371001f15fa625 Mon Sep 17 00:00:00 2001 From: Shumi <76557637+Shumie82@users.noreply.github.com> Date: Wed, 10 Feb 2021 20:12:33 +0200 Subject: [PATCH 13/35] South African Setswana language Please accept the additional of Setswana language --- spacy/lang/tn/__init__.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 spacy/lang/tn/__init__.py diff --git a/spacy/lang/tn/__init__.py b/spacy/lang/tn/__init__.py new file mode 100644 index 000000000..911214331 --- /dev/null +++ b/spacy/lang/tn/__init__.py @@ -0,0 +1,18 @@ +from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS +from .punctuation import TOKENIZER_INFIXES +from ...language import Language + + +class SetswanaDefaults(Language.Defaults): + suffixes = TOKENIZER_SUFFIXES + stop_words = STOP_WORDS + lex_attr_getters = LEX_ATTRS + + +class Setswana(Language): + lang = "tn" + Defaults = SetswanaDefaults + + +__all__ = ["Setswana"] From f6be28cfb231111a970d60b19efda2996c917373 Mon Sep 17 00:00:00 2001 From: Shumi <76557637+Shumie82@users.noreply.github.com> Date: Wed, 10 Feb 2021 20:15:13 +0200 Subject: [PATCH 14/35] Added files to Setswana Language Add South African Setswana Language --- spacy/lang/tn/examples.py | 19 ++++++ spacy/lang/tn/lex_attrs.py | 110 +++++++++++++++++++++++++++++++++++ spacy/lang/tn/punctuation.py | 19 ++++++ spacy/lang/tn/stop_words.py | 24 ++++++++ spacy/lang/tn/tag_map.py | 22 +++++++ 5 files changed, 194 insertions(+) create mode 100644 spacy/lang/tn/examples.py create mode 100644 spacy/lang/tn/lex_attrs.py create mode 100644 spacy/lang/tn/punctuation.py create mode 100644 spacy/lang/tn/stop_words.py create mode 100644 spacy/lang/tn/tag_map.py diff --git a/spacy/lang/tn/examples.py b/spacy/lang/tn/examples.py new file mode 100644 index 000000000..9039a1624 --- /dev/null +++ b/spacy/lang/tn/examples.py @@ -0,0 +1,19 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. 
+>>> from spacy.lang.en.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Apple e nyaka go reka JSE ka tlhwatlhwa ta R1 billion", + "Johannesburg ke toropo e kgolo mo Afrika Borwa.", + "O ko kae?", + "ke mang presidente ya Afrika Borwa?", + "ke eng toropo kgolo ya Afrika Borwa?", + "Nelson Mandela o belegwe leng?", +] \ No newline at end of file diff --git a/spacy/lang/tn/lex_attrs.py b/spacy/lang/tn/lex_attrs.py new file mode 100644 index 000000000..daef45d72 --- /dev/null +++ b/spacy/lang/tn/lex_attrs.py @@ -0,0 +1,110 @@ +coding: utf8 + +from __future__ import unicode_literals + +from ...attrs import LIKE_NUM + +_num_words = [ + "lefela", + "nngwe", + "pedi", + "tharo", + "nne", + "tlhano", + "thataro", + "supa", + "robedi", + "robongwe", + "lesome", + "lesomenngwe", + "lesomepedi", + "sometharo", + "somenne", + "sometlhano", + "somethataro", + "somesupa", + "somerobedi", + "somerobongwe", + "someamabedi", + "someamararo", + "someamane", + "someamatlhano", + "someamarataro", + "someamasupa", + "someamarobedi", + "someamarobongwe", + "lekgolo", + "sekete", + "milione", + "bilione", + "terilione", + "kwatirilione", + "gajillione", + "bazillione", +] + + +_ordinal_words = [ + "ntlha", + "bobedi", + "boraro", + "bone", + "botlhano", + "borataro", + "bosupa", + "borobedi ", + "borobongwe", + "bolesome", + "bolesomengwe", + "bolesomepedi", + "bolesometharo", + "bolesomenne", + "bolesometlhano", + "bolesomethataro", + "bolesomesupa", + "bolesomerobedi", + "bolesomerobongwe", + "somamabedi", + "someamararo", + "someamane", + "someamatlhano", + "someamarataro", + "someamasupa", + "someamarobedi", + "someamarobongwe", + "lekgolo", + "sekete", + "milione", + "bilione", + "terilione", + "kwatirilione", + "gajillione", + "bazillione", +] + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + + text_lower = text.lower() + if text_lower in _num_words: + return True + + # CHeck ordinal number + if text_lower in _ordinal_words: + return True + if text_lower.endswith("th"): + if text_lower[:-2].isdigit(): + return True + + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/tn/punctuation.py b/spacy/lang/tn/punctuation.py new file mode 100644 index 000000000..241ad39af --- /dev/null +++ b/spacy/lang/tn/punctuation.py @@ -0,0 +1,19 @@ +from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS +from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA + +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[0-9])[+\-\*^](?=[0-9-])", + r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES + ), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), + r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), + ] +) + + +TOKENIZER_INFIXES = _infixes \ No newline at end of file diff --git a/spacy/lang/tn/stop_words.py b/spacy/lang/tn/stop_words.py new file mode 100644 index 000000000..65681f6ee --- /dev/null +++ b/spacy/lang/tn/stop_words.py @@ -0,0 +1,24 @@ +coding: utf8 + +from __future__ import unicode_literals + + +# Stop words +STOP_WORDS = set(""" +ke gareng ga selekanyo tlhwatlhwa yo mongwe se +sengwe fa go le jalo gongwe ba na mo tikologong +jaaka kwa morago nna gonne ka sa pele nako teng +tlase fela ntle magareng tsona 
feta bobedi kgabaganya +moo gape kgatlhanong botlhe tsotlhe bokana e esi +setseng mororo dinako golo kgolo nnye wena gago +o ntse ntle tla goreng gangwe mang yotlhe gore +eo yona tseraganyo eng ne sentle re rona thata +godimo fitlha pedi masomamabedi lesomepedi mmogo +tharo tseo boraro tseno yone jaanong bobona bona +lesome tsaya tsamaiso nngwe masomethataro thataro +tsa mmatota tota sale thoko supa dira tshwanetse di mmalwa masisi +bonala e tshwanang bogolo tsenya tsweetswee karolo +sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa +tlhano lesometlhano botlalo lekgolo +""".split()) +print(STOP_WORDS) \ No newline at end of file diff --git a/spacy/lang/tn/tag_map.py b/spacy/lang/tn/tag_map.py new file mode 100644 index 000000000..1c7f0647f --- /dev/null +++ b/spacy/lang/tn/tag_map.py @@ -0,0 +1,22 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB +from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON + + +TAG_MAP = { + "INT": {POS: INTJ}, + "JUNC": {POS: CCONJ}, + "$": {POS: PUNCT}, + "PROPOSS": {POS: PRON}, + "PROQUANT": {POS: PRON}, + "PROEMP": {POS: PRON}, + "NUM": {POS: NUM}, + "N": {POS: NOUN}, + "AUX": {POS: VERB}, + "ADV": {POS: ADV}, + "ADJ": {POS: ADJ}, + "V": {POS: VERB}, + "VCOP": {POS: VERB}, +} From 7c8721b1bd3b12719a2db395e237d8b496a3414c Mon Sep 17 00:00:00 2001 From: Shumi <76557637+Shumie82@users.noreply.github.com> Date: Wed, 10 Feb 2021 20:21:22 +0200 Subject: [PATCH 15/35] Update tag_map.py Updated tag_map --- spacy/lang/tn/tag_map.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/tn/tag_map.py b/spacy/lang/tn/tag_map.py index 1c7f0647f..e26f4c4e1 100644 --- a/spacy/lang/tn/tag_map.py +++ b/spacy/lang/tn/tag_map.py @@ -1,4 +1,4 @@ -# coding: utf8 +coding: utf8 from __future__ import unicode_literals from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB From ed3397727e3cf3cc7b8ff9a89224fe894424392d Mon Sep 17 00:00:00 2001 From: Shumi <76557637+Shumie82@users.noreply.github.com> Date: Wed, 10 Feb 2021 20:41:18 +0200 Subject: [PATCH 16/35] Delete tag_map.py Tag map file is deleted. 
I will add it later because it was failing validations --- spacy/lang/tn/tag_map.py | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 spacy/lang/tn/tag_map.py diff --git a/spacy/lang/tn/tag_map.py b/spacy/lang/tn/tag_map.py deleted file mode 100644 index e26f4c4e1..000000000 --- a/spacy/lang/tn/tag_map.py +++ /dev/null @@ -1,22 +0,0 @@ -coding: utf8 -from __future__ import unicode_literals - -from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB -from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON - - -TAG_MAP = { - "INT": {POS: INTJ}, - "JUNC": {POS: CCONJ}, - "$": {POS: PUNCT}, - "PROPOSS": {POS: PRON}, - "PROQUANT": {POS: PRON}, - "PROEMP": {POS: PRON}, - "NUM": {POS: NUM}, - "N": {POS: NOUN}, - "AUX": {POS: VERB}, - "ADV": {POS: ADV}, - "ADJ": {POS: ADJ}, - "V": {POS: VERB}, - "VCOP": {POS: VERB}, -} From ad9ce3c8f607a4b5214f589b97f213e8a99724d4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 11 Feb 2021 11:37:39 +1100 Subject: [PATCH 17/35] Fix issue #6950: allow pickling Tok2Vec with listeners --- spacy/pipeline/tok2vec.py | 6 ++- spacy/tests/regression/test_issue6950.py | 59 ++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 spacy/tests/regression/test_issue6950.py diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 5e723c14c..61ba498c9 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -121,7 +121,7 @@ class Tok2Vec(TrainablePipe): tokvecs = self.model.predict(docs) batch_id = Tok2VecListener.get_batch_id(docs) for listener in self.listeners: - listener.receive(batch_id, tokvecs, lambda dX: []) + listener.receive(batch_id, tokvecs, _empty_backprop) return tokvecs def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None: @@ -300,3 +300,7 @@ def forward(model: Tok2VecListener, inputs, is_train: bool): else: outputs = [doc.tensor for doc in inputs] return outputs, lambda dX: [] + + +def _empty_backprop(dX): # for pickling + return [] diff --git a/spacy/tests/regression/test_issue6950.py b/spacy/tests/regression/test_issue6950.py new file mode 100644 index 000000000..f9d75a4ff --- /dev/null +++ b/spacy/tests/regression/test_issue6950.py @@ -0,0 +1,59 @@ +from spacy.lang.en import English +from spacy.training import Example +from spacy.util import load_config_from_str +import pickle + + +CONFIG = """ +[nlp] +lang = "en" +pipeline = ["tok2vec", "tagger"] + +[components] + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v1" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tok2vec.model.encode:width} +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = 96 +depth = 4 +window_size = 1 +maxout_pieces = 3 + +[components.ner] +factory = "ner" + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode:width} +upstream = "*" +""" + + +def test_issue6950(): + """Test that the nlp object with initialized tok2vec with listeners pickles + correctly (and doesn't have lambdas). 
+ """ + nlp = English.from_config(load_config_from_str(CONFIG)) + nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})]) + pickle.dumps(nlp) + nlp("hello") + pickle.dumps(nlp) From 26bf642afd6ddb496846dad9283b195897afe30b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 11 Feb 2021 16:45:23 +1100 Subject: [PATCH 18/35] Fix issue #7019: Handle None scores in evaluate printer (#7026) --- spacy/cli/evaluate.py | 16 +++++++++++----- spacy/tests/regression/test_issue7019.py | 12 ++++++++++++ 2 files changed, 23 insertions(+), 5 deletions(-) create mode 100644 spacy/tests/regression/test_issue7019.py diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 539a182f4..02f9b6528 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -175,10 +175,13 @@ def render_parses( def print_prf_per_type( msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str ) -> None: - data = [ - (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}") - for k, v in scores.items() - ] + data = [] + for key, value in scores.items(): + row = [key] + for k in ("p", "r", "f"): + v = value[k] + row.append(f"{v * 100:.2f}" if isinstance(v, (int, float)) else v) + data.append(row) msg.table( data, header=("", "P", "R", "F"), @@ -191,7 +194,10 @@ def print_textcats_auc_per_cat( msg: Printer, scores: Dict[str, Dict[str, float]] ) -> None: msg.table( - [(k, f"{v:.2f}") for k, v in scores.items()], + [ + (k, f"{v:.2f}" if isinstance(v, (float, int)) else v) + for k, v in scores.items() + ], header=("", "ROC AUC"), aligns=("l", "r"), title="Textcat ROC AUC (per label)", diff --git a/spacy/tests/regression/test_issue7019.py b/spacy/tests/regression/test_issue7019.py new file mode 100644 index 000000000..53958b594 --- /dev/null +++ b/spacy/tests/regression/test_issue7019.py @@ -0,0 +1,12 @@ +from spacy.cli.evaluate import print_textcats_auc_per_cat, print_prf_per_type +from wasabi import msg + + +def test_issue7019(): + scores = {"LABEL_A": 0.39829102, "LABEL_B": 0.938298329382, "LABEL_C": None} + print_textcats_auc_per_cat(msg, scores) + scores = { + "LABEL_A": {"p": 0.3420302, "r": 0.3929020, "f": 0.49823928932}, + "LABEL_B": {"p": None, "r": None, "f": None}, + } + print_prf_per_type(msg, scores, name="foo", type="bar") From 39eeba6760c6011e3372ea2a359bfc7b056bfa1e Mon Sep 17 00:00:00 2001 From: Shumi <76557637+Shumie82@users.noreply.github.com> Date: Thu, 11 Feb 2021 21:20:46 +0200 Subject: [PATCH 19/35] Update __init__.py Added infixes = TOKENIZER_INFIXES --- spacy/lang/tn/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/lang/tn/__init__.py b/spacy/lang/tn/__init__.py index 911214331..648772528 100644 --- a/spacy/lang/tn/__init__.py +++ b/spacy/lang/tn/__init__.py @@ -6,6 +6,7 @@ from ...language import Language class SetswanaDefaults(Language.Defaults): suffixes = TOKENIZER_SUFFIXES + infixes = TOKENIZER_INFIXES stop_words = STOP_WORDS lex_attr_getters = LEX_ATTRS From 37ec67f868ec803423cd76af28f8116c326ebedd Mon Sep 17 00:00:00 2001 From: Shumi <76557637+Shumie82@users.noreply.github.com> Date: Thu, 11 Feb 2021 21:25:58 +0200 Subject: [PATCH 20/35] Update examples.py I have removed two lines: # coding: utf8 from __future__ import unicode_literals And updated: >>> from spacy.lang.tn.examples import sentences --- spacy/lang/tn/examples.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/spacy/lang/tn/examples.py b/spacy/lang/tn/examples.py index 9039a1624..7b33fae5a 100644 --- a/spacy/lang/tn/examples.py +++ 
b/spacy/lang/tn/examples.py @@ -1,10 +1,6 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. ->>> from spacy.lang.en.examples import sentences +>>> from spacy.lang.tn.examples import sentences >>> docs = nlp.pipe(sentences) """ @@ -16,4 +12,4 @@ sentences = [ "ke mang presidente ya Afrika Borwa?", "ke eng toropo kgolo ya Afrika Borwa?", "Nelson Mandela o belegwe leng?", -] \ No newline at end of file +] From 0d57e84b7baa35aaadeba7346c63a98c07511869 Mon Sep 17 00:00:00 2001 From: Shumi <76557637+Shumie82@users.noreply.github.com> Date: Thu, 11 Feb 2021 21:28:23 +0200 Subject: [PATCH 21/35] Update lex_attrs.py I have removed line 1 to 4 --- spacy/lang/tn/lex_attrs.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/spacy/lang/tn/lex_attrs.py b/spacy/lang/tn/lex_attrs.py index daef45d72..33a16a09a 100644 --- a/spacy/lang/tn/lex_attrs.py +++ b/spacy/lang/tn/lex_attrs.py @@ -1,7 +1,3 @@ -coding: utf8 - -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = [ From 4e514f1ea8afcf341cc1d9b923eb7667b4b287c9 Mon Sep 17 00:00:00 2001 From: Shumi <76557637+Shumie82@users.noreply.github.com> Date: Thu, 11 Feb 2021 21:30:34 +0200 Subject: [PATCH 22/35] Update stop_words.py I have deleted line 1 to 5 and the statement print(STOP_WORDS) --- spacy/lang/tn/stop_words.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/spacy/lang/tn/stop_words.py b/spacy/lang/tn/stop_words.py index 65681f6ee..a627ef362 100644 --- a/spacy/lang/tn/stop_words.py +++ b/spacy/lang/tn/stop_words.py @@ -1,8 +1,3 @@ -coding: utf8 - -from __future__ import unicode_literals - - # Stop words STOP_WORDS = set(""" ke gareng ga selekanyo tlhwatlhwa yo mongwe se @@ -21,4 +16,3 @@ bonala e tshwanang bogolo tsenya tsweetswee karolo sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa tlhano lesometlhano botlalo lekgolo """.split()) -print(STOP_WORDS) \ No newline at end of file From a52d466bfcb13d4e15cd0cba945b7862209c2cee Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Feb 2021 20:50:55 +0100 Subject: [PATCH 23/35] any instead of all --- spacy/pipeline/tok2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 61ba498c9..4a396eaeb 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -291,7 +291,7 @@ def forward(model: Tok2VecListener, inputs, is_train: bool): # of data. # When the components batch differently, we don't receive a matching # prediction from the upstream, so we can't predict. - if not all(doc.tensor.size for doc in inputs): + if not any(doc.tensor.size for doc in inputs): # But we do need to do *something* if the tensor hasn't been set. # The compromise is to at least return data of the right shape, # so the output is valid. 
From ebeedfc70ba3f50cae9ecf98224de125cc6fa51a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Feb 2021 20:56:48 +0100 Subject: [PATCH 24/35] regression test for 7029 --- pretrain.cfg | 218 +++++++++++++++++++++++ pretrain_gpu.cfg | 217 ++++++++++++++++++++++ spacy/tests/regression/test_issue7029.py | 71 ++++++++ 3 files changed, 506 insertions(+) create mode 100644 pretrain.cfg create mode 100644 pretrain_gpu.cfg create mode 100644 spacy/tests/regression/test_issue7029.py diff --git a/pretrain.cfg b/pretrain.cfg new file mode 100644 index 000000000..50bd72350 --- /dev/null +++ b/pretrain.cfg @@ -0,0 +1,218 @@ +[paths] +train = null +dev = null +vectors = null +init_tok2vec = null +raw_text = null + +[system] +gpu_allocator = null +seed = 0 + +[nlp] +lang = "en" +pipeline = ["tok2vec","tagger","parser","ner"] +batch_size = 1000 +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + +[components] + +[components.ner] +factory = "ner" +moves = null +update_with_oracle_cut_size = 100 + +[components.ner.model] +@architectures = "spacy.TransitionBasedParser.v2" +state_type = "ner" +extra_state_tokens = false +hidden_width = 64 +maxout_pieces = 2 +use_upper = true +nO = null + +[components.ner.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} +upstream = "*" + +[components.parser] +factory = "parser" +learn_tokens = false +min_action_freq = 30 +moves = null +update_with_oracle_cut_size = 100 + +[components.parser.model] +@architectures = "spacy.TransitionBasedParser.v2" +state_type = "parser" +extra_state_tokens = false +hidden_width = 128 +maxout_pieces = 3 +use_upper = true +nO = null + +[components.parser.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} +upstream = "*" + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} +upstream = "*" + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v2" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tok2vec.model.encode.width} +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v2" +width = 96 +depth = 4 +window_size = 1 +maxout_pieces = 3 + +[corpora] + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[corpora.pretrain] +@readers = "spacy.JsonlCorpus.v1" +path = ${paths.raw_text} +min_length = 5 +max_length = 500 +limit = 0 + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +max_length = 2000 +gold_preproc = false +limit = 0 +augmenter = null + +[training] +dev_corpus = "corpora.dev" +train_corpus = "corpora.train" +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +dropout = 0.1 +accumulate_gradient = 1 +patience = 1600 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 200 +frozen_components = [] +before_to_disk = null + +[training.batcher] +@batchers = "spacy.batch_by_words.v1" +discard_oversize = false +tolerance = 0.2 +get_length = null + +[training.batcher.size] +@schedules = 
"compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 +t = 0.0 + +[training.logger] +@loggers = "spacy.ConsoleLogger.v1" +progress_bar = false + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = false +eps = 0.00000001 +learn_rate = 0.001 + +[training.score_weights] +dep_las_per_type = null +sents_p = null +sents_r = null +ents_per_type = null +tag_acc = 0.33 +dep_uas = 0.17 +dep_las = 0.17 +sents_f = 0.0 +ents_f = 0.33 +ents_p = 0.0 +ents_r = 0.0 + +[pretraining] +max_epochs = 1000 +dropout = 0.2 +n_save_every = null +component = "tok2vec" +layer = "" +corpus = "corpora.pretrain" + +[pretraining.batcher] +@batchers = "spacy.batch_by_words.v1" +size = 3000 +discard_oversize = false +tolerance = 0.2 +get_length = null + +[pretraining.objective] +@architectures = "spacy.PretrainCharacters.v1" +maxout_pieces = 3 +hidden_size = 300 +n_characters = 4 + +[pretraining.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 0.00000001 +learn_rate = 0.001 + +[initialize] +vectors = null +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.tokenizer] \ No newline at end of file diff --git a/pretrain_gpu.cfg b/pretrain_gpu.cfg new file mode 100644 index 000000000..6f9c9195d --- /dev/null +++ b/pretrain_gpu.cfg @@ -0,0 +1,217 @@ +[paths] +train = null +dev = null +vectors = null +init_tok2vec = null +raw_text = null + +[system] +gpu_allocator = "pytorch" +seed = 0 + +[nlp] +lang = "en" +pipeline = ["transformer","tagger","parser","ner"] +batch_size = 128 +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + +[components] + +[components.ner] +factory = "ner" +moves = null +update_with_oracle_cut_size = 100 + +[components.ner.model] +@architectures = "spacy.TransitionBasedParser.v2" +state_type = "ner" +extra_state_tokens = false +hidden_width = 64 +maxout_pieces = 2 +use_upper = false +nO = null + +[components.ner.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 +pooling = {"@layers":"reduce_mean.v1"} +upstream = "*" + +[components.parser] +factory = "parser" +learn_tokens = false +min_action_freq = 30 +moves = null +update_with_oracle_cut_size = 100 + +[components.parser.model] +@architectures = "spacy.TransitionBasedParser.v2" +state_type = "parser" +extra_state_tokens = false +hidden_width = 128 +maxout_pieces = 3 +use_upper = false +nO = null + +[components.parser.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 +pooling = {"@layers":"reduce_mean.v1"} +upstream = "*" + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.tagger.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 +pooling = {"@layers":"reduce_mean.v1"} +upstream = "*" + +[components.transformer] +factory = "transformer" +max_batch_items = 4096 +set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"} + +[components.transformer.model] +@architectures = "spacy-transformers.TransformerModel.v1" +name = "roberta-base" + +[components.transformer.model.get_spans] +@span_getters = 
"spacy-transformers.strided_spans.v1" +window = 128 +stride = 96 + +[components.transformer.model.tokenizer_config] +use_fast = true + +[corpora] + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[corpora.pretrain] +@readers = "spacy.JsonlCorpus.v1" +path = ${paths.raw_text} +min_length = 5 +max_length = 500 +limit = 0 + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +max_length = 500 +gold_preproc = false +limit = 0 +augmenter = null + +[training] +accumulate_gradient = 3 +dev_corpus = "corpora.dev" +train_corpus = "corpora.train" +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +dropout = 0.1 +patience = 1600 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 200 +frozen_components = [] +before_to_disk = null + +[training.batcher] +@batchers = "spacy.batch_by_padded.v1" +discard_oversize = true +size = 2000 +buffer = 256 +get_length = null + +[training.logger] +@loggers = "spacy.ConsoleLogger.v1" +progress_bar = false + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = false +eps = 0.00000001 + +[training.optimizer.learn_rate] +@schedules = "warmup_linear.v1" +warmup_steps = 250 +total_steps = 20000 +initial_rate = 0.00005 + +[training.score_weights] +dep_las_per_type = null +sents_p = null +sents_r = null +ents_per_type = null +tag_acc = 0.33 +dep_uas = 0.17 +dep_las = 0.17 +sents_f = 0.0 +ents_f = 0.33 +ents_p = 0.0 +ents_r = 0.0 + +[pretraining] +max_epochs = 1000 +dropout = 0.2 +n_save_every = null +component = "tok2vec" +layer = "" +corpus = "corpora.pretrain" + +[pretraining.batcher] +@batchers = "spacy.batch_by_words.v1" +size = 3000 +discard_oversize = false +tolerance = 0.2 +get_length = null + +[pretraining.objective] +@architectures = "spacy.PretrainCharacters.v1" +maxout_pieces = 3 +hidden_size = 300 +n_characters = 4 + +[pretraining.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 0.00000001 +learn_rate = 0.001 + +[initialize] +vectors = null +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.tokenizer] \ No newline at end of file diff --git a/spacy/tests/regression/test_issue7029.py b/spacy/tests/regression/test_issue7029.py new file mode 100644 index 000000000..4033bb725 --- /dev/null +++ b/spacy/tests/regression/test_issue7029.py @@ -0,0 +1,71 @@ +from spacy.lang.en import English +from spacy.training import Example +from spacy.util import load_config_from_str + + +CONFIG = """ +[nlp] +lang = "en" +pipeline = ["tok2vec", "tagger"] + +[components] + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v1" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tok2vec.model.encode:width} +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = 96 +depth = 4 +window_size = 1 +maxout_pieces = 3 + +[components.ner] +factory = "ner" + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.tagger.model.tok2vec] +@architectures = 
"spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode:width} +upstream = "*" +""" + + +TRAIN_DATA = [ + ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), + ("Eat blue ham", {"tags": ["V", "J", "N"]}), +] + + +def test_issue7029(): + """Test that an empty document doesn't mess up an entire batch. + """ + nlp = English.from_config(load_config_from_str(CONFIG)) + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(50): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + texts = ["first", "second", "thrid", "fourth", "and", "then", "some", ""] + nlp.select_pipes(enable=["tok2vec", "tagger"]) + docs1 = list(nlp.pipe(texts, batch_size=1)) + docs2 = list(nlp.pipe(texts, batch_size=4)) + assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]] From 967df5901d573835cbd5a341a1e30eeee5b8121d Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Feb 2021 20:57:43 +0100 Subject: [PATCH 25/35] cleanup --- pretrain.cfg | 218 ----------------------------------------------- pretrain_gpu.cfg | 217 ---------------------------------------------- 2 files changed, 435 deletions(-) delete mode 100644 pretrain.cfg delete mode 100644 pretrain_gpu.cfg diff --git a/pretrain.cfg b/pretrain.cfg deleted file mode 100644 index 50bd72350..000000000 --- a/pretrain.cfg +++ /dev/null @@ -1,218 +0,0 @@ -[paths] -train = null -dev = null -vectors = null -init_tok2vec = null -raw_text = null - -[system] -gpu_allocator = null -seed = 0 - -[nlp] -lang = "en" -pipeline = ["tok2vec","tagger","parser","ner"] -batch_size = 1000 -disabled = [] -before_creation = null -after_creation = null -after_pipeline_creation = null -tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} - -[components] - -[components.ner] -factory = "ner" -moves = null -update_with_oracle_cut_size = 100 - -[components.ner.model] -@architectures = "spacy.TransitionBasedParser.v2" -state_type = "ner" -extra_state_tokens = false -hidden_width = 64 -maxout_pieces = 2 -use_upper = true -nO = null - -[components.ner.model.tok2vec] -@architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode.width} -upstream = "*" - -[components.parser] -factory = "parser" -learn_tokens = false -min_action_freq = 30 -moves = null -update_with_oracle_cut_size = 100 - -[components.parser.model] -@architectures = "spacy.TransitionBasedParser.v2" -state_type = "parser" -extra_state_tokens = false -hidden_width = 128 -maxout_pieces = 3 -use_upper = true -nO = null - -[components.parser.model.tok2vec] -@architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode.width} -upstream = "*" - -[components.tagger] -factory = "tagger" - -[components.tagger.model] -@architectures = "spacy.Tagger.v1" -nO = null - -[components.tagger.model.tok2vec] -@architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode.width} -upstream = "*" - -[components.tok2vec] -factory = "tok2vec" - -[components.tok2vec.model] -@architectures = "spacy.Tok2Vec.v2" - -[components.tok2vec.model.embed] -@architectures = "spacy.MultiHashEmbed.v1" -width = ${components.tok2vec.model.encode.width} -attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] -rows = [5000,2500,2500,2500] -include_static_vectors = false - -[components.tok2vec.model.encode] -@architectures = "spacy.MaxoutWindowEncoder.v2" -width = 96 -depth = 4 -window_size = 1 -maxout_pieces = 3 - 
-[corpora] - -[corpora.dev] -@readers = "spacy.Corpus.v1" -path = ${paths.dev} -max_length = 0 -gold_preproc = false -limit = 0 -augmenter = null - -[corpora.pretrain] -@readers = "spacy.JsonlCorpus.v1" -path = ${paths.raw_text} -min_length = 5 -max_length = 500 -limit = 0 - -[corpora.train] -@readers = "spacy.Corpus.v1" -path = ${paths.train} -max_length = 2000 -gold_preproc = false -limit = 0 -augmenter = null - -[training] -dev_corpus = "corpora.dev" -train_corpus = "corpora.train" -seed = ${system.seed} -gpu_allocator = ${system.gpu_allocator} -dropout = 0.1 -accumulate_gradient = 1 -patience = 1600 -max_epochs = 0 -max_steps = 20000 -eval_frequency = 200 -frozen_components = [] -before_to_disk = null - -[training.batcher] -@batchers = "spacy.batch_by_words.v1" -discard_oversize = false -tolerance = 0.2 -get_length = null - -[training.batcher.size] -@schedules = "compounding.v1" -start = 100 -stop = 1000 -compound = 1.001 -t = 0.0 - -[training.logger] -@loggers = "spacy.ConsoleLogger.v1" -progress_bar = false - -[training.optimizer] -@optimizers = "Adam.v1" -beta1 = 0.9 -beta2 = 0.999 -L2_is_weight_decay = true -L2 = 0.01 -grad_clip = 1.0 -use_averages = false -eps = 0.00000001 -learn_rate = 0.001 - -[training.score_weights] -dep_las_per_type = null -sents_p = null -sents_r = null -ents_per_type = null -tag_acc = 0.33 -dep_uas = 0.17 -dep_las = 0.17 -sents_f = 0.0 -ents_f = 0.33 -ents_p = 0.0 -ents_r = 0.0 - -[pretraining] -max_epochs = 1000 -dropout = 0.2 -n_save_every = null -component = "tok2vec" -layer = "" -corpus = "corpora.pretrain" - -[pretraining.batcher] -@batchers = "spacy.batch_by_words.v1" -size = 3000 -discard_oversize = false -tolerance = 0.2 -get_length = null - -[pretraining.objective] -@architectures = "spacy.PretrainCharacters.v1" -maxout_pieces = 3 -hidden_size = 300 -n_characters = 4 - -[pretraining.optimizer] -@optimizers = "Adam.v1" -beta1 = 0.9 -beta2 = 0.999 -L2_is_weight_decay = true -L2 = 0.01 -grad_clip = 1.0 -use_averages = true -eps = 0.00000001 -learn_rate = 0.001 - -[initialize] -vectors = null -init_tok2vec = ${paths.init_tok2vec} -vocab_data = null -lookups = null -before_init = null -after_init = null - -[initialize.components] - -[initialize.tokenizer] \ No newline at end of file diff --git a/pretrain_gpu.cfg b/pretrain_gpu.cfg deleted file mode 100644 index 6f9c9195d..000000000 --- a/pretrain_gpu.cfg +++ /dev/null @@ -1,217 +0,0 @@ -[paths] -train = null -dev = null -vectors = null -init_tok2vec = null -raw_text = null - -[system] -gpu_allocator = "pytorch" -seed = 0 - -[nlp] -lang = "en" -pipeline = ["transformer","tagger","parser","ner"] -batch_size = 128 -disabled = [] -before_creation = null -after_creation = null -after_pipeline_creation = null -tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} - -[components] - -[components.ner] -factory = "ner" -moves = null -update_with_oracle_cut_size = 100 - -[components.ner.model] -@architectures = "spacy.TransitionBasedParser.v2" -state_type = "ner" -extra_state_tokens = false -hidden_width = 64 -maxout_pieces = 2 -use_upper = false -nO = null - -[components.ner.model.tok2vec] -@architectures = "spacy-transformers.TransformerListener.v1" -grad_factor = 1.0 -pooling = {"@layers":"reduce_mean.v1"} -upstream = "*" - -[components.parser] -factory = "parser" -learn_tokens = false -min_action_freq = 30 -moves = null -update_with_oracle_cut_size = 100 - -[components.parser.model] -@architectures = "spacy.TransitionBasedParser.v2" -state_type = "parser" -extra_state_tokens = false -hidden_width = 128 
-maxout_pieces = 3 -use_upper = false -nO = null - -[components.parser.model.tok2vec] -@architectures = "spacy-transformers.TransformerListener.v1" -grad_factor = 1.0 -pooling = {"@layers":"reduce_mean.v1"} -upstream = "*" - -[components.tagger] -factory = "tagger" - -[components.tagger.model] -@architectures = "spacy.Tagger.v1" -nO = null - -[components.tagger.model.tok2vec] -@architectures = "spacy-transformers.TransformerListener.v1" -grad_factor = 1.0 -pooling = {"@layers":"reduce_mean.v1"} -upstream = "*" - -[components.transformer] -factory = "transformer" -max_batch_items = 4096 -set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"} - -[components.transformer.model] -@architectures = "spacy-transformers.TransformerModel.v1" -name = "roberta-base" - -[components.transformer.model.get_spans] -@span_getters = "spacy-transformers.strided_spans.v1" -window = 128 -stride = 96 - -[components.transformer.model.tokenizer_config] -use_fast = true - -[corpora] - -[corpora.dev] -@readers = "spacy.Corpus.v1" -path = ${paths.dev} -max_length = 0 -gold_preproc = false -limit = 0 -augmenter = null - -[corpora.pretrain] -@readers = "spacy.JsonlCorpus.v1" -path = ${paths.raw_text} -min_length = 5 -max_length = 500 -limit = 0 - -[corpora.train] -@readers = "spacy.Corpus.v1" -path = ${paths.train} -max_length = 500 -gold_preproc = false -limit = 0 -augmenter = null - -[training] -accumulate_gradient = 3 -dev_corpus = "corpora.dev" -train_corpus = "corpora.train" -seed = ${system.seed} -gpu_allocator = ${system.gpu_allocator} -dropout = 0.1 -patience = 1600 -max_epochs = 0 -max_steps = 20000 -eval_frequency = 200 -frozen_components = [] -before_to_disk = null - -[training.batcher] -@batchers = "spacy.batch_by_padded.v1" -discard_oversize = true -size = 2000 -buffer = 256 -get_length = null - -[training.logger] -@loggers = "spacy.ConsoleLogger.v1" -progress_bar = false - -[training.optimizer] -@optimizers = "Adam.v1" -beta1 = 0.9 -beta2 = 0.999 -L2_is_weight_decay = true -L2 = 0.01 -grad_clip = 1.0 -use_averages = false -eps = 0.00000001 - -[training.optimizer.learn_rate] -@schedules = "warmup_linear.v1" -warmup_steps = 250 -total_steps = 20000 -initial_rate = 0.00005 - -[training.score_weights] -dep_las_per_type = null -sents_p = null -sents_r = null -ents_per_type = null -tag_acc = 0.33 -dep_uas = 0.17 -dep_las = 0.17 -sents_f = 0.0 -ents_f = 0.33 -ents_p = 0.0 -ents_r = 0.0 - -[pretraining] -max_epochs = 1000 -dropout = 0.2 -n_save_every = null -component = "tok2vec" -layer = "" -corpus = "corpora.pretrain" - -[pretraining.batcher] -@batchers = "spacy.batch_by_words.v1" -size = 3000 -discard_oversize = false -tolerance = 0.2 -get_length = null - -[pretraining.objective] -@architectures = "spacy.PretrainCharacters.v1" -maxout_pieces = 3 -hidden_size = 300 -n_characters = 4 - -[pretraining.optimizer] -@optimizers = "Adam.v1" -beta1 = 0.9 -beta2 = 0.999 -L2_is_weight_decay = true -L2 = 0.01 -grad_clip = 1.0 -use_averages = true -eps = 0.00000001 -learn_rate = 0.001 - -[initialize] -vectors = null -init_tok2vec = ${paths.init_tok2vec} -vocab_data = null -lookups = null -before_init = null -after_init = null - -[initialize.components] - -[initialize.tokenizer] \ No newline at end of file From 278e9eaa148799a3d92eaec8684267e3a10603f0 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Feb 2021 21:08:04 +0100 Subject: [PATCH 26/35] remove ner --- spacy/tests/regression/test_issue7029.py | 3 --- 1 file changed, 3 deletions(-) diff --git 
a/spacy/tests/regression/test_issue7029.py b/spacy/tests/regression/test_issue7029.py index 4033bb725..2ff730e29 100644 --- a/spacy/tests/regression/test_issue7029.py +++ b/spacy/tests/regression/test_issue7029.py @@ -30,9 +30,6 @@ depth = 4 window_size = 1 maxout_pieces = 3 -[components.ner] -factory = "ner" - [components.tagger] factory = "tagger" From aa3ad8825d8da3bce01265de7b3f87064bb16dd5 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 12 Feb 2021 13:14:30 +0100 Subject: [PATCH 27/35] loop instead of any --- spacy/pipeline/tok2vec.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 4a396eaeb..26a4c998c 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -291,14 +291,16 @@ def forward(model: Tok2VecListener, inputs, is_train: bool): # of data. # When the components batch differently, we don't receive a matching # prediction from the upstream, so we can't predict. - if not any(doc.tensor.size for doc in inputs): - # But we do need to do *something* if the tensor hasn't been set. - # The compromise is to at least return data of the right shape, - # so the output is valid. - width = model.get_dim("nO") - outputs = [model.ops.alloc2f(len(doc), width) for doc in inputs] - else: - outputs = [doc.tensor for doc in inputs] + outputs = [] + width = model.get_dim("nO") + for doc in inputs: + if doc.tensor.size == 0: + # But we do need to do *something* if the tensor hasn't been set. + # The compromise is to at least return data of the right shape, + # so the output is valid. + outputs.append(model.ops.alloc2f(len(doc), width)) + else: + outputs.append(doc.tensor) return outputs, lambda dX: [] From 5e47a54d29963eba24faecdcc510e6d957a5a707 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 12 Feb 2021 13:27:46 +0100 Subject: [PATCH 28/35] Include noun chunks method when pickling Vocab --- spacy/tests/test_pickles.py | 26 +++++++++++++++++++++++++- spacy/vocab.pyx | 6 ++++-- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/spacy/tests/test_pickles.py b/spacy/tests/test_pickles.py index e4c67b672..0c56ae0d2 100644 --- a/spacy/tests/test_pickles.py +++ b/spacy/tests/test_pickles.py @@ -1,7 +1,9 @@ import pytest import numpy import srsly +from spacy.lang.en import English from spacy.strings import StringStore +from spacy.tokens import Doc from spacy.vocab import Vocab from spacy.attrs import NORM @@ -20,7 +22,10 @@ def test_pickle_string_store(text1, text2): @pytest.mark.parametrize("text1,text2", [("dog", "cat")]) def test_pickle_vocab(text1, text2): - vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]}) + vocab = Vocab( + lex_attr_getters={int(NORM): lambda string: string[:-1]}, + get_noun_chunks=English.Defaults.syntax_iterators.get("noun_chunks"), + ) vocab.set_vector("dog", numpy.ones((5,), dtype="f")) lex1 = vocab[text1] lex2 = vocab[text2] @@ -34,4 +39,23 @@ def test_pickle_vocab(text1, text2): assert unpickled[text2].norm == lex2.norm assert unpickled[text1].norm != unpickled[text2].norm assert unpickled.vectors is not None + assert unpickled.get_noun_chunks is not None assert list(vocab["dog"].vector) == [1.0, 1.0, 1.0, 1.0, 1.0] + + +def test_pickle_doc(en_vocab): + words = ["a", "b", "c"] + deps = ["dep"] * len(words) + heads = [0] * len(words) + doc = Doc( + en_vocab, + words=words, + deps=deps, + heads=heads, + ) + data = srsly.pickle_dumps(doc) + unpickled = srsly.pickle_loads(data) + assert [t.text for t in unpickled] == words + 
assert [t.dep_ for t in unpickled] == deps + assert [t.head.i for t in unpickled] == heads + assert list(doc.noun_chunks) == [] diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 31f4e2422..db73e9d91 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -551,12 +551,13 @@ def pickle_vocab(vocab): data_dir = vocab.data_dir lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters) lookups = vocab.lookups + get_noun_chunks = vocab.get_noun_chunks return (unpickle_vocab, - (sstore, vectors, morph, data_dir, lex_attr_getters, lookups)) + (sstore, vectors, morph, data_dir, lex_attr_getters, lookups, get_noun_chunks)) def unpickle_vocab(sstore, vectors, morphology, data_dir, - lex_attr_getters, lookups): + lex_attr_getters, lookups, get_noun_chunks): cdef Vocab vocab = Vocab() vocab.vectors = vectors vocab.strings = sstore @@ -564,6 +565,7 @@ def unpickle_vocab(sstore, vectors, morphology, data_dir, vocab.data_dir = data_dir vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters) vocab.lookups = lookups + vocab.get_noun_chunks = get_noun_chunks return vocab From 03b4ec7d7fac8d3c2e5360d50075eddf0478396b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 12 Feb 2021 14:30:16 +0100 Subject: [PATCH 29/35] fix typo --- spacy/tests/regression/test_issue7029.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/regression/test_issue7029.py b/spacy/tests/regression/test_issue7029.py index 2ff730e29..dcfb8d9e7 100644 --- a/spacy/tests/regression/test_issue7029.py +++ b/spacy/tests/regression/test_issue7029.py @@ -61,7 +61,7 @@ def test_issue7029(): for i in range(50): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) - texts = ["first", "second", "thrid", "fourth", "and", "then", "some", ""] + texts = ["first", "second", "third", "fourth", "and", "then", "some", ""] nlp.select_pipes(enable=["tok2vec", "tagger"]) docs1 = list(nlp.pipe(texts, batch_size=1)) docs2 = list(nlp.pipe(texts, batch_size=4)) From 0ee2ae86bfd174f253541534282c682cb03af982 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 12 Feb 2021 15:55:17 +0100 Subject: [PATCH 30/35] Update trf quickstart recommendations Add/update trf recommendations for Bengali, Hindi, Sinhala, and Tamil based on #7044. 
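For context, this YAML feeds the training quickstart, so the updated entries change which transformer `init config` suggests for these languages. A minimal sketch of how that surfaces on the command line (flags assumed from the spaCy v3 CLI; with `--gpu` the generated config is filled in with the language's recommended transformer):

```
# hypothetical example: generate a GPU/transformer config for Tamil, which should
# now pick up ai4bharat/indic-bert from the updated recommendations
python -m spacy init config config.cfg --lang ta --pipeline ner --optimize accuracy --gpu
```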
--- .../quickstart_training_recommendations.yml | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/spacy/cli/templates/quickstart_training_recommendations.yml b/spacy/cli/templates/quickstart_training_recommendations.yml index b8c24245a..a3ec62421 100644 --- a/spacy/cli/templates/quickstart_training_recommendations.yml +++ b/spacy/cli/templates/quickstart_training_recommendations.yml @@ -28,6 +28,15 @@ bg: accuracy: name: iarfmoose/roberta-base-bulgarian size_factor: 3 +bn: + word_vectors: null + transformer: + efficiency: + name: sagorsarker/bangla-bert-base + size_factor: 3 + accuracy: + name: sagorsarker/bangla-bert-base + size_factor: 3 da: word_vectors: da_core_news_lg transformer: @@ -104,10 +113,10 @@ hi: word_vectors: null transformer: efficiency: - name: monsoon-nlp/hindi-tpu-electra + name: ai4bharat/indic-bert size_factor: 3 accuracy: - name: monsoon-nlp/hindi-tpu-electra + name: ai4bharat/indic-bert size_factor: 3 id: word_vectors: null @@ -185,10 +194,10 @@ si: word_vectors: null transformer: efficiency: - name: keshan/SinhalaBERTo + name: setu4993/LaBSE size_factor: 3 accuracy: - name: keshan/SinhalaBERTo + name: setu4993/LaBSE size_factor: 3 sv: word_vectors: null @@ -203,10 +212,10 @@ ta: word_vectors: null transformer: efficiency: - name: monsoon-nlp/tamillion + name: ai4bharat/indic-bert size_factor: 3 accuracy: - name: monsoon-nlp/tamillion + name: ai4bharat/indic-bert size_factor: 3 te: word_vectors: null From 6c450decfc01e2d82f0b7c8f799654d79158fa4c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 13 Feb 2021 11:51:21 +1100 Subject: [PATCH 31/35] Fix punctuation settings and add to initialize tests --- spacy/lang/tn/__init__.py | 3 +-- spacy/tests/lang/test_initialize.py | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/lang/tn/__init__.py b/spacy/lang/tn/__init__.py index 648772528..99907c28a 100644 --- a/spacy/lang/tn/__init__.py +++ b/spacy/lang/tn/__init__.py @@ -5,8 +5,7 @@ from ...language import Language class SetswanaDefaults(Language.Defaults): - suffixes = TOKENIZER_SUFFIXES - infixes = TOKENIZER_INFIXES + infixes = TOKENIZER_INFIXES stop_words = STOP_WORDS lex_attr_getters = LEX_ATTRS diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py index de1871e64..46f1f2bd1 100644 --- a/spacy/tests/lang/test_initialize.py +++ b/spacy/tests/lang/test_initialize.py @@ -8,7 +8,8 @@ from spacy.util import get_lang_class LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is", "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk", - "sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur", 'yo'] + "sl", "sq", "sr", "sv", "ta", "te", "tl", "tn", "tr", "tt", "ur", + "yo"] # fmt: on From 06e66d4ced1396818fc6fb8505e12a159370febd Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 13 Feb 2021 12:33:17 +1100 Subject: [PATCH 32/35] Update languages.json [ci skip] --- website/meta/languages.json | 1 + 1 file changed, 1 insertion(+) diff --git a/website/meta/languages.json b/website/meta/languages.json index 059639dba..579dca9fe 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -198,6 +198,7 @@ "has_examples": true }, { "code": "tl", "name": "Tagalog" }, + { "code": "tn", "name": "Setswana", "has_examples": true }, { "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true }, { "code": "tt", "name": "Tatar", "has_examples": 
true }, { From 9ba715ed16014afad499193c30e91baa719f2bda Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 13 Feb 2021 12:55:56 +1100 Subject: [PATCH 33/35] Tidy up and auto-format --- spacy/lang/am/stop_words.py | 46 ++-- spacy/lang/tn/lex_attrs.py | 3 +- spacy/lang/tn/punctuation.py | 2 +- spacy/lang/tn/stop_words.py | 28 ++- spacy/pipeline/functions.py | 2 +- spacy/tests/matcher/test_phrase_matcher.py | 18 +- spacy/tests/regression/test_issue6501-7000.py | 229 ++++++++++++++++++ spacy/tests/regression/test_issue6730.py | 23 -- spacy/tests/regression/test_issue6755.py | 5 - spacy/tests/regression/test_issue6815.py | 35 --- spacy/tests/regression/test_issue6839.py | 15 -- spacy/tests/regression/test_issue6908.py | 102 -------- spacy/tests/regression/test_issue6950.py | 59 ----- spacy/tests/regression/test_issue7029.py | 3 +- spacy/tests/vocab_vectors/test_lexeme.py | 1 + 15 files changed, 285 insertions(+), 286 deletions(-) create mode 100644 spacy/tests/regression/test_issue6501-7000.py delete mode 100644 spacy/tests/regression/test_issue6730.py delete mode 100644 spacy/tests/regression/test_issue6755.py delete mode 100644 spacy/tests/regression/test_issue6815.py delete mode 100644 spacy/tests/regression/test_issue6839.py delete mode 100644 spacy/tests/regression/test_issue6908.py delete mode 100644 spacy/tests/regression/test_issue6950.py diff --git a/spacy/lang/am/stop_words.py b/spacy/lang/am/stop_words.py index eaf318693..5487ada5a 100644 --- a/spacy/lang/am/stop_words.py +++ b/spacy/lang/am/stop_words.py @@ -4,30 +4,30 @@ STOP_WORDS = set( """ ግን አንቺ አንተ እናንተ ያንተ ያንቺ የናንተ ራስህን ራስሽን ራሳችሁን -ሁሉ ኋላ በሰሞኑ አሉ በኋላ ሁኔታ በኩል አስታውቀዋል ሆነ በውስጥ -አስታውሰዋል ሆኑ ባጣም እስካሁን ሆኖም በተለይ አሳሰበ ሁል በተመለከተ -አሳስበዋል ላይ በተመሳሳይ አስፈላጊ ሌላ የተለያየ አስገነዘቡ ሌሎች የተለያዩ -አስገንዝበዋል ልዩ ተባለ አብራርተዋል መሆኑ ተገለጸ አስረድተዋል ተገልጿል -ማለቱ ተጨማሪ እባክህ የሚገኝ ተከናወነ እባክሽ ማድረግ ችግር አንጻር ማን -ትናንት እስኪደርስ ነበረች እንኳ ሰሞኑን ነበሩ እንኳን ሲሆን ነበር እዚሁ ሲል -ነው እንደገለጹት አለ ና እንደተናገሩት ቢሆን ነገር እንዳስረዱት ብለዋል ነገሮች -እንደገና ብዙ ናት ወቅት ቦታ ናቸው እንዲሁም በርካታ አሁን እንጂ እስከ -ማለት የሚሆኑት ስለማናቸውም ውስጥ ይሆናሉ ሲባል ከሆነው ስለዚሁ ከአንድ -ያልሆነ ሳለ የነበረውን ከአንዳንድ በማናቸውም በሙሉ የሆነው ያሉ በእነዚሁ -ወር መሆናቸው ከሌሎች በዋና አንዲት ወይም -በላይ እንደ በማቀድ ለሌሎች በሆኑ ቢሆንም ጊዜና ይሆኑበታል በሆነ አንዱ -ለዚህ ለሆነው ለነዚህ ከዚህ የሌላውን ሶስተኛ አንዳንድ ለማንኛውም የሆነ ከሁለት -የነገሩ ሰኣት አንደኛ እንዲሆን እንደነዚህ ማንኛውም ካልሆነ የሆኑት ጋር ቢያንስ +ሁሉ ኋላ በሰሞኑ አሉ በኋላ ሁኔታ በኩል አስታውቀዋል ሆነ በውስጥ +አስታውሰዋል ሆኑ ባጣም እስካሁን ሆኖም በተለይ አሳሰበ ሁል በተመለከተ +አሳስበዋል ላይ በተመሳሳይ አስፈላጊ ሌላ የተለያየ አስገነዘቡ ሌሎች የተለያዩ +አስገንዝበዋል ልዩ ተባለ አብራርተዋል መሆኑ ተገለጸ አስረድተዋል ተገልጿል +ማለቱ ተጨማሪ እባክህ የሚገኝ ተከናወነ እባክሽ ማድረግ ችግር አንጻር ማን +ትናንት እስኪደርስ ነበረች እንኳ ሰሞኑን ነበሩ እንኳን ሲሆን ነበር እዚሁ ሲል +ነው እንደገለጹት አለ ና እንደተናገሩት ቢሆን ነገር እንዳስረዱት ብለዋል ነገሮች +እንደገና ብዙ ናት ወቅት ቦታ ናቸው እንዲሁም በርካታ አሁን እንጂ እስከ +ማለት የሚሆኑት ስለማናቸውም ውስጥ ይሆናሉ ሲባል ከሆነው ስለዚሁ ከአንድ +ያልሆነ ሳለ የነበረውን ከአንዳንድ በማናቸውም በሙሉ የሆነው ያሉ በእነዚሁ +ወር መሆናቸው ከሌሎች በዋና አንዲት ወይም +በላይ እንደ በማቀድ ለሌሎች በሆኑ ቢሆንም ጊዜና ይሆኑበታል በሆነ አንዱ +ለዚህ ለሆነው ለነዚህ ከዚህ የሌላውን ሶስተኛ አንዳንድ ለማንኛውም የሆነ ከሁለት +የነገሩ ሰኣት አንደኛ እንዲሆን እንደነዚህ ማንኛውም ካልሆነ የሆኑት ጋር ቢያንስ ይህንንም እነደሆነ እነዚህን ይኸው የማናቸውም -በሙሉም ይህችው በተለይም አንዱን የሚችለውን በነዚህ ከእነዚህ በሌላ -የዚሁ ከእነዚሁ ለዚሁ በሚገባ ለእያንዳንዱ የአንቀጹ ወደ ይህም ስለሆነ ወይ -ማናቸውንም ተብሎ እነዚህ መሆናቸውን የሆነችን ከአስር ሳይሆን ከዚያ የለውም -የማይበልጥ እንደሆነና እንዲሆኑ በሚችሉ ብቻ ብሎ ከሌላ የሌላቸውን -ለሆነ በሌሎች ሁለቱንም በቀር ይህ በታች አንደሆነ በነሱ -ይህን የሌላ እንዲህ ከሆነ ያላቸው በነዚሁ በሚል የዚህ ይህንኑ -በእንደዚህ ቁጥር ማናቸውም ሆነው ባሉ በዚህ በስተቀር ሲሆንና -በዚህም መሆን ምንጊዜም እነዚህም በዚህና ያለ ስም -ሲኖር ከዚህም መሆኑን በሁኔታው የማያንስ እነዚህኑ ማንም ከነዚሁ +በሙሉም ይህችው በተለይም አንዱን የሚችለውን በነዚህ ከእነዚህ በሌላ +የዚሁ ከእነዚሁ ለዚሁ በሚገባ ለእያንዳንዱ የአንቀጹ ወደ ይህም ስለሆነ ወይ +ማናቸውንም ተብሎ እነዚህ መሆናቸውን የሆነችን ከአስር ሳይሆን ከዚያ የለውም +የማይበልጥ እንደሆነና እንዲሆኑ በሚችሉ ብቻ ብሎ ከሌላ የሌላቸውን +ለሆነ በሌሎች ሁለቱንም 
በቀር ይህ በታች አንደሆነ በነሱ +ይህን የሌላ እንዲህ ከሆነ ያላቸው በነዚሁ በሚል የዚህ ይህንኑ +በእንደዚህ ቁጥር ማናቸውም ሆነው ባሉ በዚህ በስተቀር ሲሆንና +በዚህም መሆን ምንጊዜም እነዚህም በዚህና ያለ ስም +ሲኖር ከዚህም መሆኑን በሁኔታው የማያንስ እነዚህኑ ማንም ከነዚሁ ያላቸውን እጅግ ሲሆኑ ለሆኑ ሊሆን ለማናቸውም """.split() ) diff --git a/spacy/lang/tn/lex_attrs.py b/spacy/lang/tn/lex_attrs.py index 33a16a09a..c136d0ab2 100644 --- a/spacy/lang/tn/lex_attrs.py +++ b/spacy/lang/tn/lex_attrs.py @@ -78,6 +78,7 @@ _ordinal_words = [ "bazillione", ] + def like_num(text): if text.startswith(("+", "-", "±", "~")): text = text[1:] @@ -98,7 +99,7 @@ def like_num(text): return True if text_lower.endswith("th"): if text_lower[:-2].isdigit(): - return True + return True return False diff --git a/spacy/lang/tn/punctuation.py b/spacy/lang/tn/punctuation.py index 241ad39af..a52755564 100644 --- a/spacy/lang/tn/punctuation.py +++ b/spacy/lang/tn/punctuation.py @@ -16,4 +16,4 @@ _infixes = ( ) -TOKENIZER_INFIXES = _infixes \ No newline at end of file +TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/tn/stop_words.py b/spacy/lang/tn/stop_words.py index a627ef362..f614771dd 100644 --- a/spacy/lang/tn/stop_words.py +++ b/spacy/lang/tn/stop_words.py @@ -1,18 +1,20 @@ # Stop words -STOP_WORDS = set(""" -ke gareng ga selekanyo tlhwatlhwa yo mongwe se +STOP_WORDS = set( + """ +ke gareng ga selekanyo tlhwatlhwa yo mongwe se sengwe fa go le jalo gongwe ba na mo tikologong -jaaka kwa morago nna gonne ka sa pele nako teng +jaaka kwa morago nna gonne ka sa pele nako teng tlase fela ntle magareng tsona feta bobedi kgabaganya moo gape kgatlhanong botlhe tsotlhe bokana e esi -setseng mororo dinako golo kgolo nnye wena gago -o ntse ntle tla goreng gangwe mang yotlhe gore -eo yona tseraganyo eng ne sentle re rona thata -godimo fitlha pedi masomamabedi lesomepedi mmogo -tharo tseo boraro tseno yone jaanong bobona bona -lesome tsaya tsamaiso nngwe masomethataro thataro +setseng mororo dinako golo kgolo nnye wena gago +o ntse ntle tla goreng gangwe mang yotlhe gore +eo yona tseraganyo eng ne sentle re rona thata +godimo fitlha pedi masomamabedi lesomepedi mmogo +tharo tseo boraro tseno yone jaanong bobona bona +lesome tsaya tsamaiso nngwe masomethataro thataro tsa mmatota tota sale thoko supa dira tshwanetse di mmalwa masisi -bonala e tshwanang bogolo tsenya tsweetswee karolo -sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa -tlhano lesometlhano botlalo lekgolo -""".split()) +bonala e tshwanang bogolo tsenya tsweetswee karolo +sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa +tlhano lesometlhano botlalo lekgolo +""".split() +) diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py index cf8baf9da..03c7db422 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -76,7 +76,7 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc: retokenizes=True, ) def make_token_splitter( - nlp: Language, name: str, *, min_length=0, split_length=0, + nlp: Language, name: str, *, min_length: int = 0, split_length: int = 0 ): return TokenSplitter(min_length=min_length, split_length=split_length) diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 230ca3b19..478949601 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -327,8 +327,10 @@ def test_phrase_matcher_sent_start(en_vocab, attr): def test_span_in_phrasematcher(en_vocab): """Ensure that PhraseMatcher accepts Span and Doc as input""" - doc = Doc(en_vocab, - words=["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", 
"and", "nothing", "else", "."]) + # fmt: off + words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."] + # fmt: on + doc = Doc(en_vocab, words=words) span = doc[:8] pattern = Doc(en_vocab, words=["Spans", "and", "Docs"]) matcher = PhraseMatcher(en_vocab) @@ -341,10 +343,14 @@ def test_span_in_phrasematcher(en_vocab): def test_span_v_doc_in_phrasematcher(en_vocab): """Ensure that PhraseMatcher only returns matches in input Span and not in entire Doc""" - doc = Doc(en_vocab, - words=["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", - "Spans", "and", "Docs", "in", "my", "matchers", "," - "and", "Spans", "and", "Docs", "everywhere" "."]) + # fmt: off + words = [ + "I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "Spans", + "and", "Docs", "in", "my", "matchers", "," "and", "Spans", "and", "Docs", + "everywhere", "." + ] + # fmt: on + doc = Doc(en_vocab, words=words) span = doc[9:15] # second clause pattern = Doc(en_vocab, words=["Spans", "and", "Docs"]) matcher = PhraseMatcher(en_vocab) diff --git a/spacy/tests/regression/test_issue6501-7000.py b/spacy/tests/regression/test_issue6501-7000.py new file mode 100644 index 000000000..3007f1dc6 --- /dev/null +++ b/spacy/tests/regression/test_issue6501-7000.py @@ -0,0 +1,229 @@ +import pytest +from spacy.lang.en import English +import numpy as np +import spacy +from spacy.tokens import Doc +from spacy.matcher import PhraseMatcher +from spacy.tokens import DocBin +from spacy.util import load_config_from_str +from spacy.training import Example +from spacy.training.initialize import init_nlp +import pickle + +from ..util import make_tempdir + + +def test_issue6730(en_vocab): + """Ensure that the KB does not accept empty strings, but otherwise IO works fine.""" + from spacy.kb import KnowledgeBase + + kb = KnowledgeBase(en_vocab, entity_vector_length=3) + kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3]) + + with pytest.raises(ValueError): + kb.add_alias(alias="", entities=["1"], probabilities=[0.4]) + assert kb.contains_alias("") is False + + kb.add_alias(alias="x", entities=["1"], probabilities=[0.2]) + kb.add_alias(alias="y", entities=["1"], probabilities=[0.1]) + + with make_tempdir() as tmp_dir: + kb.to_disk(tmp_dir) + kb.from_disk(tmp_dir) + assert kb.get_size_aliases() == 2 + assert set(kb.get_alias_strings()) == {"x", "y"} + + +def test_issue6755(en_tokenizer): + doc = en_tokenizer("This is a magnificent sentence.") + span = doc[:0] + assert span.text_with_ws == "" + assert span.text == "" + + +@pytest.mark.parametrize( + "sentence, start_idx,end_idx,label", + [("Welcome to Mumbai, my friend", 11, 17, "GPE")], +) +def test_issue6815_1(sentence, start_idx, end_idx, label): + nlp = English() + doc = nlp(sentence) + span = doc[:].char_span(start_idx, end_idx, label=label) + assert span.label_ == label + + +@pytest.mark.parametrize( + "sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)] +) +def test_issue6815_2(sentence, start_idx, end_idx, kb_id): + nlp = English() + doc = nlp(sentence) + span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id) + assert span.kb_id == kb_id + + +@pytest.mark.parametrize( + "sentence, start_idx,end_idx,vector", + [("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))], +) +def test_issue6815_3(sentence, start_idx, end_idx, vector): + nlp = English() + doc = nlp(sentence) + span = doc[:].char_span(start_idx, end_idx, vector=vector) + assert (span.vector == vector).all() + + +def 
test_issue6839(en_vocab): + """Ensure that PhraseMatcher accepts Span as input""" + # fmt: off + words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."] + # fmt: on + doc = Doc(en_vocab, words=words) + span = doc[:8] + pattern = Doc(en_vocab, words=["Spans", "and", "Docs"]) + matcher = PhraseMatcher(en_vocab) + matcher.add("SPACY", [pattern]) + matches = matcher(span) + assert matches + + +CONFIG_ISSUE_6908 = """ +[paths] +train = "TRAIN_PLACEHOLDER" +raw = null +init_tok2vec = null +vectors = null + +[system] +seed = 0 +gpu_allocator = null + +[nlp] +lang = "en" +pipeline = ["textcat"] +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +batch_size = 1000 + +[components] + +[components.textcat] +factory = "TEXTCAT_PLACEHOLDER" + +[corpora] + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths:train} + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths:train} + + +[training] +train_corpus = "corpora.train" +dev_corpus = "corpora.dev" +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +frozen_components = [] +before_to_disk = null + +[pretraining] + +[initialize] +vectors = ${paths.vectors} +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.components.textcat] +labels = ['label1', 'label2'] + +[initialize.tokenizer] +""" + + +@pytest.mark.parametrize( + "component_name", ["textcat", "textcat_multilabel"], +) +def test_issue6908(component_name): + """Test intializing textcat with labels in a list""" + + def create_data(out_file): + nlp = spacy.blank("en") + doc = nlp.make_doc("Some text") + doc.cats = {"label1": 0, "label2": 1} + out_data = DocBin(docs=[doc]).to_bytes() + with out_file.open("wb") as file_: + file_.write(out_data) + + with make_tempdir() as tmp_path: + train_path = tmp_path / "train.spacy" + create_data(train_path) + config_str = CONFIG_ISSUE_6908.replace("TEXTCAT_PLACEHOLDER", component_name) + config_str = config_str.replace("TRAIN_PLACEHOLDER", train_path.as_posix()) + config = load_config_from_str(config_str) + init_nlp(config) + + +CONFIG_ISSUE_6950 = """ +[nlp] +lang = "en" +pipeline = ["tok2vec", "tagger"] + +[components] + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v1" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tok2vec.model.encode:width} +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = 96 +depth = 4 +window_size = 1 +maxout_pieces = 3 + +[components.ner] +factory = "ner" + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode:width} +upstream = "*" +""" + + +def test_issue6950(): + """Test that the nlp object with initialized tok2vec with listeners pickles + correctly (and doesn't have lambdas). 
+ """ + nlp = English.from_config(load_config_from_str(CONFIG_ISSUE_6950)) + nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})]) + pickle.dumps(nlp) + nlp("hello") + pickle.dumps(nlp) diff --git a/spacy/tests/regression/test_issue6730.py b/spacy/tests/regression/test_issue6730.py deleted file mode 100644 index 4c2979899..000000000 --- a/spacy/tests/regression/test_issue6730.py +++ /dev/null @@ -1,23 +0,0 @@ -import pytest -from ..util import make_tempdir - - -def test_issue6730(en_vocab): - """Ensure that the KB does not accept empty strings, but otherwise IO works fine.""" - from spacy.kb import KnowledgeBase - - kb = KnowledgeBase(en_vocab, entity_vector_length=3) - kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3]) - - with pytest.raises(ValueError): - kb.add_alias(alias="", entities=["1"], probabilities=[0.4]) - assert kb.contains_alias("") is False - - kb.add_alias(alias="x", entities=["1"], probabilities=[0.2]) - kb.add_alias(alias="y", entities=["1"], probabilities=[0.1]) - - with make_tempdir() as tmp_dir: - kb.to_disk(tmp_dir) - kb.from_disk(tmp_dir) - assert kb.get_size_aliases() == 2 - assert set(kb.get_alias_strings()) == {"x", "y"} diff --git a/spacy/tests/regression/test_issue6755.py b/spacy/tests/regression/test_issue6755.py deleted file mode 100644 index 15ddd6fbc..000000000 --- a/spacy/tests/regression/test_issue6755.py +++ /dev/null @@ -1,5 +0,0 @@ -def test_issue6755(en_tokenizer): - doc = en_tokenizer("This is a magnificent sentence.") - span = doc[:0] - assert span.text_with_ws == "" - assert span.text == "" diff --git a/spacy/tests/regression/test_issue6815.py b/spacy/tests/regression/test_issue6815.py deleted file mode 100644 index 7d523e00b..000000000 --- a/spacy/tests/regression/test_issue6815.py +++ /dev/null @@ -1,35 +0,0 @@ -import pytest -from spacy.lang.en import English -import numpy as np - - -@pytest.mark.parametrize( - "sentence, start_idx,end_idx,label", - [("Welcome to Mumbai, my friend", 11, 17, "GPE")], -) -def test_char_span_label(sentence, start_idx, end_idx, label): - nlp = English() - doc = nlp(sentence) - span = doc[:].char_span(start_idx, end_idx, label=label) - assert span.label_ == label - - -@pytest.mark.parametrize( - "sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)] -) -def test_char_span_kb_id(sentence, start_idx, end_idx, kb_id): - nlp = English() - doc = nlp(sentence) - span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id) - assert span.kb_id == kb_id - - -@pytest.mark.parametrize( - "sentence, start_idx,end_idx,vector", - [("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))], -) -def test_char_span_vector(sentence, start_idx, end_idx, vector): - nlp = English() - doc = nlp(sentence) - span = doc[:].char_span(start_idx, end_idx, vector=vector) - assert (span.vector == vector).all() diff --git a/spacy/tests/regression/test_issue6839.py b/spacy/tests/regression/test_issue6839.py deleted file mode 100644 index 2148cf867..000000000 --- a/spacy/tests/regression/test_issue6839.py +++ /dev/null @@ -1,15 +0,0 @@ -from spacy.tokens import Doc -from spacy.matcher import PhraseMatcher - - -def test_span_in_phrasematcher(en_vocab): - """Ensure that PhraseMatcher accepts Span as input""" - doc = Doc(en_vocab, - words=["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."]) - span = doc[:8] - pattern = Doc(en_vocab, words=["Spans", "and", "Docs"]) - matcher = PhraseMatcher(en_vocab) - matcher.add("SPACY", [pattern]) - 
matches = matcher(span) - assert matches - diff --git a/spacy/tests/regression/test_issue6908.py b/spacy/tests/regression/test_issue6908.py deleted file mode 100644 index a12ae9e13..000000000 --- a/spacy/tests/regression/test_issue6908.py +++ /dev/null @@ -1,102 +0,0 @@ -import pytest -import spacy -from spacy.language import Language -from spacy.tokens import DocBin -from spacy import util -from spacy.schemas import ConfigSchemaInit - -from spacy.training.initialize import init_nlp - -from ..util import make_tempdir - -TEXTCAT_WITH_LABELS_ARRAY_CONFIG = """ -[paths] -train = "TRAIN_PLACEHOLDER" -raw = null -init_tok2vec = null -vectors = null - -[system] -seed = 0 -gpu_allocator = null - -[nlp] -lang = "en" -pipeline = ["textcat"] -tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} -disabled = [] -before_creation = null -after_creation = null -after_pipeline_creation = null -batch_size = 1000 - -[components] - -[components.textcat] -factory = "TEXTCAT_PLACEHOLDER" - -[corpora] - -[corpora.train] -@readers = "spacy.Corpus.v1" -path = ${paths:train} - -[corpora.dev] -@readers = "spacy.Corpus.v1" -path = ${paths:train} - - -[training] -train_corpus = "corpora.train" -dev_corpus = "corpora.dev" -seed = ${system.seed} -gpu_allocator = ${system.gpu_allocator} -frozen_components = [] -before_to_disk = null - -[pretraining] - -[initialize] -vectors = ${paths.vectors} -init_tok2vec = ${paths.init_tok2vec} -vocab_data = null -lookups = null -before_init = null -after_init = null - -[initialize.components] - -[initialize.components.textcat] -labels = ['label1', 'label2'] - -[initialize.tokenizer] -""" - - -@pytest.mark.parametrize( - "component_name", - ["textcat", "textcat_multilabel"], -) -def test_textcat_initialize_labels_validation(component_name): - """Test intializing textcat with labels in a list""" - - def create_data(out_file): - nlp = spacy.blank("en") - doc = nlp.make_doc("Some text") - doc.cats = {"label1": 0, "label2": 1} - - out_data = DocBin(docs=[doc]).to_bytes() - with out_file.open("wb") as file_: - file_.write(out_data) - - with make_tempdir() as tmp_path: - train_path = tmp_path / "train.spacy" - create_data(train_path) - - config_str = TEXTCAT_WITH_LABELS_ARRAY_CONFIG.replace( - "TEXTCAT_PLACEHOLDER", component_name - ) - config_str = config_str.replace("TRAIN_PLACEHOLDER", train_path.as_posix()) - - config = util.load_config_from_str(config_str) - init_nlp(config) diff --git a/spacy/tests/regression/test_issue6950.py b/spacy/tests/regression/test_issue6950.py deleted file mode 100644 index f9d75a4ff..000000000 --- a/spacy/tests/regression/test_issue6950.py +++ /dev/null @@ -1,59 +0,0 @@ -from spacy.lang.en import English -from spacy.training import Example -from spacy.util import load_config_from_str -import pickle - - -CONFIG = """ -[nlp] -lang = "en" -pipeline = ["tok2vec", "tagger"] - -[components] - -[components.tok2vec] -factory = "tok2vec" - -[components.tok2vec.model] -@architectures = "spacy.Tok2Vec.v1" - -[components.tok2vec.model.embed] -@architectures = "spacy.MultiHashEmbed.v1" -width = ${components.tok2vec.model.encode:width} -attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] -rows = [5000,2500,2500,2500] -include_static_vectors = false - -[components.tok2vec.model.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" -width = 96 -depth = 4 -window_size = 1 -maxout_pieces = 3 - -[components.ner] -factory = "ner" - -[components.tagger] -factory = "tagger" - -[components.tagger.model] -@architectures = "spacy.Tagger.v1" -nO = null - -[components.tagger.model.tok2vec] 
-@architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode:width} -upstream = "*" -""" - - -def test_issue6950(): - """Test that the nlp object with initialized tok2vec with listeners pickles - correctly (and doesn't have lambdas). - """ - nlp = English.from_config(load_config_from_str(CONFIG)) - nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})]) - pickle.dumps(nlp) - nlp("hello") - pickle.dumps(nlp) diff --git a/spacy/tests/regression/test_issue7029.py b/spacy/tests/regression/test_issue7029.py index dcfb8d9e7..cee48522d 100644 --- a/spacy/tests/regression/test_issue7029.py +++ b/spacy/tests/regression/test_issue7029.py @@ -51,8 +51,7 @@ TRAIN_DATA = [ def test_issue7029(): - """Test that an empty document doesn't mess up an entire batch. - """ + """Test that an empty document doesn't mess up an entire batch.""" nlp = English.from_config(load_config_from_str(CONFIG)) train_examples = [] for t in TRAIN_DATA: diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py index 4eeff5175..b6fee6628 100644 --- a/spacy/tests/vocab_vectors/test_lexeme.py +++ b/spacy/tests/vocab_vectors/test_lexeme.py @@ -57,6 +57,7 @@ def test_vocab_lexeme_add_flag_provided_id(en_vocab): assert en_vocab["dogs"].check_flag(is_len4) is True en_vocab.add_flag(lambda string: string.isdigit(), flag_id=IS_DIGIT) + def test_vocab_lexeme_oov_rank(en_vocab): """Test that default rank is OOV_RANK.""" lex = en_vocab["word"] From b31471b5b8c1d16dd0960f4e121b32c5f1e7fefa Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 13 Feb 2021 23:50:00 +1100 Subject: [PATCH 34/35] Set version to v3.0.2 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 1787c2fcc..771105515 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.0.1" +__version__ = "3.0.2" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From 660642902ae5d73b910108a28aa6c461da48d9d9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 14 Feb 2021 13:36:13 +1100 Subject: [PATCH 35/35] Increment version [ci skip] --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 771105515..c19e1aeaa 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.0.2" +__version__ = "3.0.3" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects"