diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index 9d57219ca..cc0247b3a 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -10,6 +10,7 @@ steps:
     inputs:
       versionSpec: ${{ parameters.python_version }}
       architecture: ${{ parameters.architecture }}
+      allowUnstable: true

   - bash: |
       echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b959262e3..df59697b1 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,7 +6,7 @@ repos:
         language_version: python3.7
         additional_dependencies: ['click==8.0.4']
   - repo: https://gitlab.com/pycqa/flake8
-    rev: 3.9.2
+    rev: 5.0.4
     hooks:
       - id: flake8
         args:
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 2f5201614..357cce835 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -85,6 +85,15 @@ jobs:
         Python310Mac:
           imageName: "macos-latest"
           python.version: "3.10"
+        Python311Linux:
+          imageName: 'ubuntu-latest'
+          python.version: '3.11.0-rc.2'
+        Python311Windows:
+          imageName: 'windows-latest'
+          python.version: '3.11.0-rc.2'
+        Python311Mac:
+          imageName: 'macos-latest'
+          python.version: '3.11.0-rc.2'
       maxParallel: 4
     pool:
       vmImage: $(imageName)
diff --git a/requirements.txt b/requirements.txt
index 446560c06..9d6bbb2c4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,7 +15,7 @@ pathy>=0.3.5
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
-pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
+pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
 jinja2
 langcodes>=3.2.0,<4.0.0
 # Official Python utilities
@@ -28,7 +28,7 @@ cython>=0.25,<3.0
 pytest>=5.2.0,!=7.1.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
-flake8>=3.8.0,<3.10.0
+flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
 mypy>=0.980,<0.990; platform_machine != "aarch64" and python_version >= "3.7"
 types-dataclasses>=0.1.3; python_version < "3.7"
diff --git a/setup.cfg b/setup.cfg
index 2dc5e7042..c2653feba 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -56,7 +56,7 @@ install_requires =
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0
-    pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
+    pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
     jinja2
     # Official Python utilities
     setuptools
diff --git a/spacy/about.py b/spacy/about.py
index 843c15aba..ce86e6294 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.4.1"
+__version__ = "3.4.2"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
diff --git a/spacy/cli/project/dvc.py b/spacy/cli/project/dvc.py
index 83dc5efbf..a15353855 100644
--- a/spacy/cli/project/dvc.py
+++ b/spacy/cli/project/dvc.py
@@ -25,6 +25,7 @@ def project_update_dvc_cli(
     project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
     workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
     verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
+    quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
     force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
     # fmt: on
 ):
@@ -36,7 +37,7 @@ def project_update_dvc_cli(

     DOCS: https://spacy.io/api/cli#project-dvc
     """
-    project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
+    project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)


 def project_update_dvc(
@@ -44,6 +45,7 @@ def project_update_dvc(
     workflow: Optional[str] = None,
     *,
     verbose: bool = False,
+    quiet: bool = False,
     force: bool = False,
 ) -> None:
     """Update the auto-generated Data Version Control (DVC) config file. A DVC
@@ -54,11 +56,12 @@ def project_update_dvc(
     workflow (Optional[str]): Optional name of workflow defined in project.yml.
         If not set, the first workflow will be used.
     verbose (bool): Print more info.
+    quiet (bool): Print less info.
     force (bool): Force update DVC config.
     """
     config = load_project_config(project_dir)
     updated = update_dvc_config(
-        project_dir, config, workflow, verbose=verbose, force=force
+        project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
     )
     help_msg = "To execute the workflow with DVC, run: dvc repro"
     if updated:
@@ -72,7 +75,7 @@ def update_dvc_config(
     config: Dict[str, Any],
     workflow: Optional[str] = None,
     verbose: bool = False,
-    silent: bool = False,
+    quiet: bool = False,
     force: bool = False,
 ) -> bool:
     """Re-run the DVC commands in dry mode and update dvc.yaml file in the
@@ -83,7 +86,7 @@ def update_dvc_config(
     path (Path): The path to the project directory.
     config (Dict[str, Any]): The loaded project.yml.
     verbose (bool): Whether to print additional info (via DVC).
-    silent (bool): Don't output anything (via DVC).
+    quiet (bool): Don't output anything (via DVC).
     force (bool): Force update, even if hashes match.
     RETURNS (bool): Whether the DVC config file was updated.
     """
@@ -105,6 +108,14 @@ def update_dvc_config(
         dvc_config_path.unlink()
     dvc_commands = []
     config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
+
+    # some flags that apply to every command
+    flags = []
+    if verbose:
+        flags.append("--verbose")
+    if quiet:
+        flags.append("--quiet")
+
     for name in workflows[workflow]:
         command = config_commands[name]
         deps = command.get("deps", [])
@@ -118,14 +129,26 @@ def update_dvc_config(
         deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
         outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
         outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
-        dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
+
+        dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
         if command.get("no_skip"):
             dvc_cmd.append("--always-changed")
         full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
         dvc_commands.append(join_command(full_cmd))
+
+    if not dvc_commands:
+        # If we don't check for this, then there will be an error when reading the
+        # config, since DVC wouldn't create it.
+        msg.fail(
+            "No usable commands for DVC found. This can happen if none of your "
+            "commands have dependencies or outputs.",
+            exits=1,
+        )
+
     with working_dir(path):
-        dvc_flags = {"--verbose": verbose, "--quiet": silent}
-        run_dvc_commands(dvc_commands, flags=dvc_flags)
+        for c in dvc_commands:
+            dvc_command = "dvc " + c
+            run_command(dvc_command)
     with dvc_config_path.open("r+", encoding="utf8") as f:
         content = f.read()
         f.seek(0, 0)
@@ -133,26 +156,6 @@ def update_dvc_config(
     return True


-def run_dvc_commands(
-    commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
-) -> None:
-    """Run a sequence of DVC commands in a subprocess, in order.
-
-    commands (List[str]): The string commands without the leading "dvc".
-    flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
-        easier to pass flags like --quiet that depend on a variable or
-        command-line setting while avoiding lots of nested conditionals.
-    """
-    for c in commands:
-        command = split_command(c)
-        dvc_command = ["dvc", *command]
-        # Add the flags if they are set to True
-        for flag, is_active in flags.items():
-            if is_active:
-                dvc_command.append(flag)
-        run_command(dvc_command)
-
-
 def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
     """Validate workflows provided in project.yml and check that a given
     workflow can be used to generate a DVC config.
diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py
index 720d3a8cb..c37a3a91a 100644
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@@ -23,7 +23,7 @@ class RussianLemmatizer(Lemmatizer):
         overwrite: bool = False,
         scorer: Optional[Callable] = lemmatizer_score,
     ) -> None:
-        if mode == "pymorphy2":
+        if mode in {"pymorphy2", "pymorphy2_lookup"}:
             try:
                 from pymorphy2 import MorphAnalyzer
             except ImportError:
diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py
index 97ee80479..8337e7328 100644
--- a/spacy/lang/uk/lemmatizer.py
+++ b/spacy/lang/uk/lemmatizer.py
@@ -18,7 +18,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
         overwrite: bool = False,
         scorer: Optional[Callable] = lemmatizer_score,
     ) -> None:
-        if mode == "pymorphy2":
+        if mode in {"pymorphy2", "pymorphy2_lookup"}:
             try:
                 from pymorphy2 import MorphAnalyzer
             except ImportError:
diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index b7d615f6d..12f9b73a3 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -1,7 +1,6 @@
 from typing import cast, Any, Callable, Dict, Iterable, List, Optional
-from typing import Sequence, Tuple, Union
+from typing import Tuple
 from collections import Counter
-from copy import deepcopy
 from itertools import islice

 import numpy as np
@@ -149,9 +148,7 @@ class EditTreeLemmatizer(TrainablePipe):
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.
             n_labels = len(self.cfg["labels"])
-            guesses: List[Ints2d] = [
-                self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
-            ]
+            guesses: List[Ints2d] = [self.model.ops.alloc2i(0, n_labels) for _ in docs]
             assert len(guesses) == n_docs
             return guesses
         scores = self.model.predict(docs)
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index 5ede622c2..956bbb72c 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -133,6 +133,9 @@ def make_spancat(
     spans_key (str): Key of the doc.spans dict to save the spans under. During
         initialization and training, the component will look for spans on the
         reference document under the same key.
+    scorer (Optional[Callable]): The scoring method. Defaults to
+        Scorer.score_spans for the Doc.spans[spans_key] with overlapping
+        spans allowed.
     threshold (float): Minimum probability to consider a prediction positive.
         Spans with a positive prediction will be saved on the Doc. Defaults to
         0.5.
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index 10aef46aa..493c440c3 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -96,8 +96,8 @@ def make_multilabel_textcat(
     model: Model[List[Doc], List[Floats2d]],
     threshold: float,
     scorer: Optional[Callable],
-) -> "TextCategorizer":
-    """Create a TextCategorizer component. The text categorizer predicts categories
+) -> "MultiLabel_TextCategorizer":
+    """Create a MultiLabel_TextCategorizer component. The text categorizer predicts categories
     over a whole document. It can learn one or more labels, and the labels are
     considered to be non-mutually exclusive, which means that there can be zero or
     more labels per doc).
@@ -105,6 +105,7 @@ def make_multilabel_textcat(
     model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
         scores for each category.
     threshold (float): Cutoff to consider a prediction "positive".
+    scorer (Optional[Callable]): The scoring method.
     """
     return MultiLabel_TextCategorizer(
         nlp.vocab, model, name, threshold=threshold, scorer=scorer
@@ -147,6 +148,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
         name (str): The component instance name, used to add entries to the
             losses during training.
         threshold (float): Cutoff to consider a prediction "positive".
+        scorer (Optional[Callable]): The scoring method.

         DOCS: https://spacy.io/api/textcategorizer#init
         """
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 048082134..c824d76b9 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -181,12 +181,12 @@ class TokenPatternNumber(BaseModel):
     IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
     IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
     INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects")
-    EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
-    NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
-    GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
-    LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=")
-    GT: Union[StrictInt, StrictFloat] = Field(None, alias=">")
-    LT: Union[StrictInt, StrictFloat] = Field(None, alias="<")
+    EQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="==")
+    NEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="!=")
+    GEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">=")
+    LEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<=")
+    GT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">")
+    LT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<")

     class Config:
         extra = "forbid"
@@ -430,7 +430,7 @@ class ProjectConfigAssetURL(BaseModel):
     # fmt: off
     dest: StrictStr = Field(..., title="Destination of downloaded asset")
     url: Optional[StrictStr] = Field(None, title="URL of asset")
-    checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
+    checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
     description: StrictStr = Field("", title="Description of asset")
     # fmt: on

@@ -438,7 +438,7 @@ class ProjectConfigAssetURL(BaseModel):
 class ProjectConfigAssetGit(BaseModel):
     # fmt: off
     git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
-    checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
+    checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
     description: Optional[StrictStr] = Field(None, title="Description of asset")
     # fmt: on

@@ -508,9 +508,9 @@ class DocJSONSchema(BaseModel):
         None, title="Indices of sentences' start and end indices"
     )
     text: StrictStr = Field(..., title="Document text")
-    spans: Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]] = Field(
-        None, title="Span information - end/start indices, label, KB ID"
-    )
+    spans: Optional[
+        Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]]
+    ] = Field(None, title="Span information - end/start indices, label, KB ID")
     tokens: List[Dict[StrictStr, Union[StrictStr, StrictInt]]] = Field(
         ..., title="Token information - ID, start, annotations"
     )
@@ -519,9 +519,9 @@ class DocJSONSchema(BaseModel):
         title="Any custom data stored in the document's _ attribute",
         alias="_",
     )
-    underscore_token: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
+    underscore_token: Optional[Dict[StrictStr, List[Dict[StrictStr, Any]]]] = Field(
         None, title="Any custom data stored in the token's _ attribute"
     )
-    underscore_span: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
+    underscore_span: Optional[Dict[StrictStr, List[Dict[StrictStr, Any]]]] = Field(
         None, title="Any custom data stored in the span's _ attribute"
     )
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 3c1c1333a..0fc74243d 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -343,6 +343,14 @@ def ru_lemmatizer():
     return get_lang_class("ru")().add_pipe("lemmatizer")


+@pytest.fixture
+def ru_lookup_lemmatizer():
+    pytest.importorskip("pymorphy2")
+    return get_lang_class("ru")().add_pipe(
+        "lemmatizer", config={"mode": "pymorphy2_lookup"}
+    )
+
+
 @pytest.fixture(scope="session")
 def sa_tokenizer():
     return get_lang_class("sa")().tokenizer
@@ -422,6 +430,15 @@ def uk_lemmatizer():
     return get_lang_class("uk")().add_pipe("lemmatizer")


+@pytest.fixture
+def uk_lookup_lemmatizer():
+    pytest.importorskip("pymorphy2")
+    pytest.importorskip("pymorphy2_dicts_uk")
+    return get_lang_class("uk")().add_pipe(
+        "lemmatizer", config={"mode": "pymorphy2_lookup"}
+    )
+
+
 @pytest.fixture(scope="session")
 def ur_tokenizer():
     return get_lang_class("ur")().tokenizer
diff --git a/spacy/tests/doc/test_json_doc_conversion.py b/spacy/tests/doc/test_json_doc_conversion.py
index 0d7c061c9..19698cfb2 100644
--- a/spacy/tests/doc/test_json_doc_conversion.py
+++ b/spacy/tests/doc/test_json_doc_conversion.py
@@ -128,7 +128,9 @@ def test_doc_to_json_with_token_span_attributes(doc):
     doc._.json_test1 = "hello world"
     doc._.json_test2 = [1, 2, 3]
     doc[0:1]._.span_test = "span_attribute"
+    doc[0:2]._.span_test = "span_attribute_2"
     doc[0]._.token_test = 117
+    doc[1]._.token_test = 118
     doc.spans["span_group"] = [doc[0:1]]
     json_doc = doc.to_json(
         underscore=["json_test1", "json_test2", "token_test", "span_test"]
@@ -139,8 +141,10 @@ def test_doc_to_json_with_token_span_attributes(doc):
     assert json_doc["_"]["json_test2"] == [1, 2, 3]
     assert "underscore_token" in json_doc
     assert "underscore_span" in json_doc
-    assert json_doc["underscore_token"]["token_test"]["value"] == 117
-    assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
+    assert json_doc["underscore_token"]["token_test"][0]["value"] == 117
+    assert json_doc["underscore_token"]["token_test"][1]["value"] == 118
+    assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
+    assert json_doc["underscore_span"]["span_test"][1]["value"] == "span_attribute_2"
     assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
     assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc

@@ -161,8 +165,8 @@ def test_doc_to_json_with_custom_user_data(doc):
     assert json_doc["_"]["json_test"] == "hello world"
     assert "underscore_token" in json_doc
     assert "underscore_span" in json_doc
-    assert json_doc["underscore_token"]["token_test"]["value"] == 117
-    assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
+    assert json_doc["underscore_token"]["token_test"][0]["value"] == 117
+    assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
     assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
     assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc

@@ -181,8 +185,8 @@ def test_doc_to_json_with_token_span_same_identifier(doc):
     assert json_doc["_"]["my_ext"] == "hello world"
     assert "underscore_token" in json_doc
     assert "underscore_span" in json_doc
-    assert json_doc["underscore_token"]["my_ext"]["value"] == 117
-    assert json_doc["underscore_span"]["my_ext"]["value"] == "span_attribute"
+    assert json_doc["underscore_token"]["my_ext"][0]["value"] == 117
+    assert json_doc["underscore_span"]["my_ext"][0]["value"] == "span_attribute"
     assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
     assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc

@@ -195,10 +199,9 @@ def test_doc_to_json_with_token_attributes_missing(doc):
     doc[0]._.token_test = 117
     json_doc = doc.to_json(underscore=["span_test"])

-    assert "underscore_token" in json_doc
     assert "underscore_span" in json_doc
-    assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
-    assert "token_test" not in json_doc["underscore_token"]
+    assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
+    assert "underscore_token" not in json_doc
     assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0


@@ -283,7 +286,9 @@ def test_json_to_doc_with_token_span_attributes(doc):
     doc._.json_test1 = "hello world"
     doc._.json_test2 = [1, 2, 3]
     doc[0:1]._.span_test = "span_attribute"
+    doc[0:2]._.span_test = "span_attribute_2"
     doc[0]._.token_test = 117
+    doc[1]._.token_test = 118

     json_doc = doc.to_json(
         underscore=["json_test1", "json_test2", "token_test", "span_test"]
@@ -295,7 +300,9 @@ def test_json_to_doc_with_token_span_attributes(doc):
     assert new_doc._.json_test1 == "hello world"
     assert new_doc._.json_test2 == [1, 2, 3]
     assert new_doc[0]._.token_test == 117
+    assert new_doc[1]._.token_test == 118
     assert new_doc[0:1]._.span_test == "span_attribute"
+    assert new_doc[0:2]._.span_test == "span_attribute_2"
     assert new_doc.user_data == doc.user_data
     assert new_doc.to_bytes(exclude=["user_data"]) == doc.to_bytes(
         exclude=["user_data"]
diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py
index 9ca7f441b..e82fd4f8c 100644
--- a/spacy/tests/lang/ru/test_lemmatizer.py
+++ b/spacy/tests/lang/ru/test_lemmatizer.py
@@ -78,3 +78,17 @@ def test_ru_lemmatizer_punct(ru_lemmatizer):
     assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
     doc = Doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
     assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
+
+
+def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
+    words = ["мама", "мыла", "раму"]
+    pos = ["NOUN", "VERB", "NOUN"]
+    morphs = [
+        "Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
+        "Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
+        "Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
+    ]
+    doc = Doc(ru_lookup_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
+    doc = ru_lookup_lemmatizer(doc)
+    lemmas = [token.lemma_ for token in doc]
+    assert lemmas == ["мама", "мыла", "раму"]
diff --git a/spacy/tests/lang/uk/test_lemmatizer.py b/spacy/tests/lang/uk/test_lemmatizer.py
index 57dd4198a..788744aa1 100644
--- a/spacy/tests/lang/uk/test_lemmatizer.py
+++ b/spacy/tests/lang/uk/test_lemmatizer.py
@@ -9,3 +9,11 @@ def test_uk_lemmatizer(uk_lemmatizer):
     """Check that the default uk lemmatizer runs."""
     doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
     uk_lemmatizer(doc)
+    assert [token.lemma for token in doc]
+
+
+def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer):
+    """Check that the lookup uk lemmatizer runs."""
+    doc = Doc(uk_lookup_lemmatizer.vocab, words=["a", "b", "c"])
+    uk_lookup_lemmatizer(doc)
+    assert [token.lemma for token in doc]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index d7d2fd8e6..295f91c28 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1608,24 +1608,20 @@ cdef class Doc:
                         Doc.set_extension(attr)
                     self._.set(attr, doc_json["_"][attr])

-            if doc_json.get("underscore_token", {}):
-                for token_attr in doc_json["underscore_token"]:
-                    token_start = doc_json["underscore_token"][token_attr]["token_start"]
-                    value = doc_json["underscore_token"][token_attr]["value"]
-
-                    if not Token.has_extension(token_attr):
-                        Token.set_extension(token_attr)
-                    self[token_start]._.set(token_attr, value)
+            for token_attr in doc_json.get("underscore_token", {}):
+                if not Token.has_extension(token_attr):
+                    Token.set_extension(token_attr)
+                for token_data in doc_json["underscore_token"][token_attr]:
+                    start = token_by_char(self.c, self.length, token_data["start"])
+                    value = token_data["value"]
+                    self[start]._.set(token_attr, value)

-            if doc_json.get("underscore_span", {}):
-                for span_attr in doc_json["underscore_span"]:
-                    token_start = doc_json["underscore_span"][span_attr]["token_start"]
-                    token_end = doc_json["underscore_span"][span_attr]["token_end"]
-                    value = doc_json["underscore_span"][span_attr]["value"]
-
-                    if not Span.has_extension(span_attr):
-                        Span.set_extension(span_attr)
-                    self[token_start:token_end]._.set(span_attr, value)
+            for span_attr in doc_json.get("underscore_span", {}):
+                if not Span.has_extension(span_attr):
+                    Span.set_extension(span_attr)
+                for span_data in doc_json["underscore_span"][span_attr]:
+                    value = span_data["value"]
+                    self.char_span(span_data["start"], span_data["end"])._.set(span_attr, value)
         return self

     def to_json(self, underscore=None):
@@ -1673,30 +1669,34 @@ cdef class Doc:
         if underscore:
             user_keys = set()
             if self.user_data:
-                data["_"] = {}
-                data["underscore_token"] = {}
-                data["underscore_span"] = {}
-                for data_key in self.user_data:
+                for data_key, value in self.user_data.copy().items():
                     if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.":
                         attr = data_key[1]
                         start = data_key[2]
                         end = data_key[3]
                         if attr in underscore:
                             user_keys.add(attr)
-                            value = self.user_data[data_key]
                             if not srsly.is_json_serializable(value):
                                 raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
                             # Check if doc attribute
                             if start is None:
+                                if "_" not in data:
+                                    data["_"] = {}
                                 data["_"][attr] = value
                             # Check if token attribute
                             elif end is None:
+                                if "underscore_token" not in data:
+                                    data["underscore_token"] = {}
                                 if attr not in data["underscore_token"]:
-                                    data["underscore_token"][attr] = {"token_start": start, "value": value}
+                                    data["underscore_token"][attr] = []
+                                data["underscore_token"][attr].append({"start": start, "value": value})
                             # Else span attribute
                             else:
+                                if "underscore_span" not in data:
+                                    data["underscore_span"] = {}
                                 if attr not in data["underscore_span"]:
-                                    data["underscore_span"][attr] = {"token_start": start, "token_end": end, "value": value}
+                                    data["underscore_span"][attr] = []
+                                data["underscore_span"][attr].append({"start": start, "end": end, "value": value})

             for attr in underscore:
                 if attr not in user_keys:
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index e5cd3089b..fc2c46022 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -1482,7 +1482,7 @@ You'll also need to add the assets you want to track with

 ```cli
-$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
+$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose] [--quiet]
 ```

 > #### Example
@@ -1499,6 +1499,7 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
 | `workflow`        | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(option)~~ |
 | `--force`, `-F`   | Force-updating config file. ~~bool (flag)~~                                                                    |
 | `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~                                                            |
+| `--quiet`, `-q`   | Print no output generated by DVC. ~~bool (flag)~~                                                              |
 | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                    |
 | **CREATES**       | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow.                 |
diff --git a/website/meta/universe.json b/website/meta/universe.json
index a6a1a0fc7..637e9d6ce 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -2460,20 +2460,20 @@
         "import spacy",
         "from spacy_wordnet.wordnet_annotator import WordnetAnnotator ",
         "",
-        "# Load an spacy model (supported models are \"es\" and \"en\") ",
-        "nlp = spacy.load('en')",
-        "# Spacy 3.x",
-        "nlp.add_pipe(\"spacy_wordnet\", after='tagger', config={'lang': nlp.lang})",
-        "# Spacy 2.x",
+        "# Load a spaCy model (supported languages are \"es\" and \"en\") ",
+        "nlp = spacy.load('en_core_web_sm')",
+        "# spaCy 3.x",
+        "nlp.add_pipe(\"spacy_wordnet\", after='tagger')",
+        "# spaCy 2.x",
         "# nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')",
         "token = nlp('prices')[0]",
         "",
-        "# wordnet object link spacy token with nltk wordnet interface by giving acces to",
+        "# WordNet object links spaCy token with NLTK WordNet interface by giving access to",
         "# synsets and lemmas ",
         "token._.wordnet.synsets()",
         "token._.wordnet.lemmas()",
         "",
-        "# And automatically tags with wordnet domains",
+        "# And automatically add info about WordNet domains",
         "token._.wordnet.wordnet_domains()"
       ],
       "author": "recognai",