commit 103b24fb25
Author: Adriane Boyd
Date:   2022-10-21 09:13:32 +02:00

    Merge remote-tracking branch 'upstream/master' into chore/update-v4-from-master

22 changed files with 228 additions and 99 deletions

View File

@@ -10,6 +10,7 @@ steps:
     inputs:
       versionSpec: ${{ parameters.python_version }}
       architecture: ${{ parameters.architecture }}
+      allowUnstable: true
   - bash: |
       echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"

View File

@@ -6,7 +6,7 @@ repos:
       language_version: python3.7
       additional_dependencies: ['click==8.0.4']
   - repo: https://gitlab.com/pycqa/flake8
-    rev: 3.9.2
+    rev: 5.0.4
     hooks:
       - id: flake8
         args:

View File

@@ -85,6 +85,15 @@ jobs:
       Python310Mac:
         imageName: "macos-latest"
         python.version: "3.10"
+      Python311Linux:
+        imageName: 'ubuntu-latest'
+        python.version: '3.11.0-rc.2'
+      Python311Windows:
+        imageName: 'windows-latest'
+        python.version: '3.11.0-rc.2'
+      Python311Mac:
+        imageName: 'macos-latest'
+        python.version: '3.11.0-rc.2'
     maxParallel: 4
   pool:
     vmImage: $(imageName)

View File

@@ -15,7 +15,7 @@ pathy>=0.3.5
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
-pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
+pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
 jinja2
 langcodes>=3.2.0,<4.0.0
 # Official Python utilities
@@ -28,7 +28,7 @@ cython>=0.25,<3.0
 pytest>=5.2.0,!=7.1.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
-flake8>=3.8.0,<3.10.0
+flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
 mypy>=0.980,<0.990; platform_machine != "aarch64" and python_version >= "3.7"
 types-dataclasses>=0.1.3; python_version < "3.7"

View File

@@ -48,7 +48,7 @@ install_requires =
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0
-    pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
+    pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
     jinja2
     # Official Python utilities
     setuptools

View File

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.4.1"
+__version__ = "3.4.2"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

View File

@@ -25,6 +25,7 @@ def project_update_dvc_cli(
     project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
     workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
     verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
+    quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
     force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
     # fmt: on
 ):
@@ -36,7 +37,7 @@ def project_update_dvc_cli(
     DOCS: https://spacy.io/api/cli#project-dvc
     """
-    project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
+    project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
 
 
 def project_update_dvc(
@@ -44,6 +45,7 @@ def project_update_dvc(
     workflow: Optional[str] = None,
     *,
     verbose: bool = False,
+    quiet: bool = False,
     force: bool = False,
 ) -> None:
     """Update the auto-generated Data Version Control (DVC) config file. A DVC
@@ -54,11 +56,12 @@ def project_update_dvc(
     workflow (Optional[str]): Optional name of workflow defined in project.yml.
         If not set, the first workflow will be used.
     verbose (bool): Print more info.
+    quiet (bool): Print less info.
     force (bool): Force update DVC config.
     """
     config = load_project_config(project_dir)
     updated = update_dvc_config(
-        project_dir, config, workflow, verbose=verbose, force=force
+        project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
     )
     help_msg = "To execute the workflow with DVC, run: dvc repro"
     if updated:
@@ -72,7 +75,7 @@ def update_dvc_config(
     config: Dict[str, Any],
     workflow: Optional[str] = None,
     verbose: bool = False,
-    silent: bool = False,
+    quiet: bool = False,
     force: bool = False,
 ) -> bool:
     """Re-run the DVC commands in dry mode and update dvc.yaml file in the
@@ -83,7 +86,7 @@ def update_dvc_config(
     path (Path): The path to the project directory.
     config (Dict[str, Any]): The loaded project.yml.
     verbose (bool): Whether to print additional info (via DVC).
-    silent (bool): Don't output anything (via DVC).
+    quiet (bool): Don't output anything (via DVC).
     force (bool): Force update, even if hashes match.
     RETURNS (bool): Whether the DVC config file was updated.
     """
@@ -105,6 +108,14 @@ def update_dvc_config(
         dvc_config_path.unlink()
     dvc_commands = []
     config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
+
+    # some flags that apply to every command
+    flags = []
+    if verbose:
+        flags.append("--verbose")
+    if quiet:
+        flags.append("--quiet")
+
     for name in workflows[workflow]:
         command = config_commands[name]
         deps = command.get("deps", [])
@@ -118,14 +129,26 @@ def update_dvc_config(
         deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
         outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
         outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
-        dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
+        dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
         if command.get("no_skip"):
             dvc_cmd.append("--always-changed")
         full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
         dvc_commands.append(join_command(full_cmd))
+
+    if not dvc_commands:
+        # If we don't check for this, then there will be an error when reading the
+        # config, since DVC wouldn't create it.
+        msg.fail(
+            "No usable commands for DVC found. This can happen if none of your "
+            "commands have dependencies or outputs.",
+            exits=1,
+        )
+
     with working_dir(path):
-        dvc_flags = {"--verbose": verbose, "--quiet": silent}
-        run_dvc_commands(dvc_commands, flags=dvc_flags)
+        for c in dvc_commands:
+            dvc_command = "dvc " + c
+            run_command(dvc_command)
     with dvc_config_path.open("r+", encoding="utf8") as f:
         content = f.read()
         f.seek(0, 0)
@@ -133,26 +156,6 @@
     return True
 
 
-def run_dvc_commands(
-    commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
-) -> None:
-    """Run a sequence of DVC commands in a subprocess, in order.
-
-    commands (List[str]): The string commands without the leading "dvc".
-    flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
-        easier to pass flags like --quiet that depend on a variable or
-        command-line setting while avoiding lots of nested conditionals.
-    """
-    for c in commands:
-        command = split_command(c)
-        dvc_command = ["dvc", *command]
-        # Add the flags if they are set to True
-        for flag, is_active in flags.items():
-            if is_active:
-                dvc_command.append(flag)
-        run_command(dvc_command)
-
-
 def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
     """Validate workflows provided in project.yml and check that a given
     workflow can be used to generate a DVC config.
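
For a sense of what the rewritten loop produces, here is a minimal sketch; the command name and paths are hypothetical, and `shlex.join` stands in for spaCy's `join_command` helper:

```python
import shlex

# Hypothetical inputs standing in for what update_dvc_config computes.
verbose, quiet = False, True
name, path = "preprocess", "/path/to/project"

# Flags that apply to every command, as in the hunk above.
flags = []
if verbose:
    flags.append("--verbose")
if quiet:
    flags.append("--quiet")

dvc_cmd = ["run", *flags, "-n", name, "-w", path, "--no-exec"]
full_cmd = [*dvc_cmd, "-d", "assets/data.csv", "-o", "corpus/train.spacy", "python preprocess.py"]
print("dvc " + shlex.join(full_cmd))
# dvc run --quiet -n preprocess -w /path/to/project --no-exec -d assets/data.csv -o corpus/train.spacy 'python preprocess.py'
```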

View File

@@ -23,7 +23,7 @@ class RussianLemmatizer(Lemmatizer):
         overwrite: bool = False,
         scorer: Optional[Callable] = lemmatizer_score,
     ) -> None:
-        if mode == "pymorphy2":
+        if mode in {"pymorphy2", "pymorphy2_lookup"}:
             try:
                 from pymorphy2 import MorphAnalyzer
             except ImportError:

View File

@@ -18,7 +18,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
        overwrite: bool = False,
        scorer: Optional[Callable] = lemmatizer_score,
    ) -> None:
-        if mode == "pymorphy2":
+        if mode in {"pymorphy2", "pymorphy2_lookup"}:
            try:
                from pymorphy2 import MorphAnalyzer
            except ImportError:
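
Previously the pymorphy2 import guard only ran for `mode == "pymorphy2"`, so the lookup variant failed before reaching the analyzer. With this change the mode can be selected via the pipe config, as the new `conftest.py` fixtures below do. A minimal usage sketch (assumes `pymorphy2` is installed; Ukrainian additionally needs `pymorphy2-dicts-uk`):

```python
import spacy

# Lookup-mode lemmatization via pymorphy2; no tagger required.
nlp = spacy.blank("ru")
nlp.add_pipe("lemmatizer", config={"mode": "pymorphy2_lookup"})
doc = nlp("мама мыла раму")
print([token.lemma_ for token in doc])
```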

View File

@@ -1,7 +1,6 @@
-from typing import cast, Any, Callable, Dict, Iterable, List, Optional
-from typing import Sequence, Tuple, Union
+from typing import cast, Any, Callable, Dict, Iterable, List, Optional, Union
+from typing import Tuple
 from collections import Counter
-from copy import deepcopy
 from itertools import islice
 import numpy as np

View File

@@ -30,17 +30,17 @@ scorer = {"@layers": "spacy.LinearLogistic.v1"}
 hidden_size = 128
 
 [model.tok2vec]
-@architectures = "spacy.Tok2Vec.v1"
+@architectures = "spacy.Tok2Vec.v2"
 
 [model.tok2vec.embed]
-@architectures = "spacy.MultiHashEmbed.v1"
+@architectures = "spacy.MultiHashEmbed.v2"
 width = 96
 rows = [5000, 2000, 1000, 1000]
 attrs = ["ORTH", "PREFIX", "SUFFIX", "SHAPE"]
 include_static_vectors = false
 
 [model.tok2vec.encode]
-@architectures = "spacy.MaxoutWindowEncoder.v1"
+@architectures = "spacy.MaxoutWindowEncoder.v2"
 width = ${model.tok2vec.embed.width}
 window_size = 1
 maxout_pieces = 3
@@ -139,6 +139,9 @@ def make_spancat(
     spans_key (str): Key of the doc.spans dict to save the spans under. During
         initialization and training, the component will look for spans on the
         reference document under the same key.
+    scorer (Optional[Callable]): The scoring method. Defaults to
+        Scorer.score_spans for the Doc.spans[spans_key] with overlapping
+        spans allowed.
     threshold (float): Minimum probability to consider a prediction positive.
         Spans with a positive prediction will be saved on the Doc. Defaults to
         0.5.

View File

@@ -19,7 +19,7 @@ multi_label_default_config = """
 @architectures = "spacy.TextCatEnsemble.v2"
 
 [model.tok2vec]
-@architectures = "spacy.Tok2Vec.v1"
+@architectures = "spacy.Tok2Vec.v2"
 
 [model.tok2vec.embed]
 @architectures = "spacy.MultiHashEmbed.v2"
@@ -29,7 +29,7 @@ attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
 include_static_vectors = false
 
 [model.tok2vec.encode]
-@architectures = "spacy.MaxoutWindowEncoder.v1"
+@architectures = "spacy.MaxoutWindowEncoder.v2"
 width = ${model.tok2vec.embed.width}
 window_size = 1
 maxout_pieces = 3
@@ -98,7 +98,7 @@ def make_multilabel_textcat(
     threshold: float,
     scorer: Optional[Callable],
     save_activations: bool,
-) -> "TextCategorizer":
+) -> "MultiLabel_TextCategorizer":
     """Create a TextCategorizer component. The text categorizer predicts categories
     over a whole document. It can learn one or more labels, and the labels are considered
     to be non-mutually exclusive, which means that there can be zero or more labels
@@ -107,6 +107,7 @@ def make_multilabel_textcat(
     model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
         scores for each category.
     threshold (float): Cutoff to consider a prediction "positive".
+    scorer (Optional[Callable]): The scoring method.
     """
     return MultiLabel_TextCategorizer(
         nlp.vocab,
@@ -155,7 +156,11 @@ class MultiLabel_TextCategorizer(TextCategorizer):
         name (str): The component instance name, used to add entries to the
             losses during training.
         threshold (float): Cutoff to consider a prediction "positive".
+<<<<<<< HEAD
         save_activations (bool): save model activations in Doc when annotating.
+=======
+        scorer (Optional[Callable]): The scoring method.
+>>>>>>> upstream/master
 
         DOCS: https://spacy.io/api/textcategorizer#init
         """

View File

@@ -181,12 +181,12 @@ class TokenPatternNumber(BaseModel):
     IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
     IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
     INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects")
-    EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
-    NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
-    GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
-    LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=")
-    GT: Union[StrictInt, StrictFloat] = Field(None, alias=">")
-    LT: Union[StrictInt, StrictFloat] = Field(None, alias="<")
+    EQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="==")
+    NEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="!=")
+    GEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">=")
+    LEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<=")
+    GT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">")
+    LT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<")
 
     class Config:
         extra = "forbid"
@@ -430,7 +430,7 @@ class ProjectConfigAssetURL(BaseModel):
     # fmt: off
     dest: StrictStr = Field(..., title="Destination of downloaded asset")
     url: Optional[StrictStr] = Field(None, title="URL of asset")
-    checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
+    checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
     description: StrictStr = Field("", title="Description of asset")
     # fmt: on
@@ -438,7 +438,7 @@ class ProjectConfigAssetURL(BaseModel):
 class ProjectConfigAssetGit(BaseModel):
     # fmt: off
     git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
-    checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
+    checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
     description: Optional[StrictStr] = Field(None, title="Description of asset")
     # fmt: on
@@ -508,9 +508,9 @@ class DocJSONSchema(BaseModel):
         None, title="Indices of sentences' start and end indices"
     )
     text: StrictStr = Field(..., title="Document text")
-    spans: Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]] = Field(
-        None, title="Span information - end/start indices, label, KB ID"
-    )
+    spans: Optional[
+        Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]]
+    ] = Field(None, title="Span information - end/start indices, label, KB ID")
     tokens: List[Dict[StrictStr, Union[StrictStr, StrictInt]]] = Field(
         ..., title="Token information - ID, start, annotations"
     )
@@ -519,9 +519,9 @@ class DocJSONSchema(BaseModel):
         title="Any custom data stored in the document's _ attribute",
         alias="_",
     )
-    underscore_token: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
+    underscore_token: Optional[Dict[StrictStr, List[Dict[StrictStr, Any]]]] = Field(
         None, title="Any custom data stored in the token's _ attribute"
     )
-    underscore_span: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
+    underscore_span: Optional[Dict[StrictStr, List[Dict[StrictStr, Any]]]] = Field(
         None, title="Any custom data stored in the span's _ attribute"
     )
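
The `Optional[...]` wrappers are needed because these fields default to `None`: with strict typing, an annotation like `Union[StrictInt, StrictFloat]` does not admit a `None` default. A minimal sketch of the corrected pattern, assuming pydantic 1.x and a hypothetical one-field model:

```python
from typing import Optional, Union
from pydantic import BaseModel, Field, StrictFloat, StrictInt

class NumberComparisonDemo(BaseModel):
    # The None default requires Optional[...] around the strict value types.
    EQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="==")

    class Config:
        extra = "forbid"

print(NumberComparisonDemo.parse_obj({}).EQ)         # None
print(NumberComparisonDemo.parse_obj({"==": 5}).EQ)  # 5
```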

View File

@@ -357,6 +357,14 @@ def ru_lemmatizer():
     return get_lang_class("ru")().add_pipe("lemmatizer")
 
 
+@pytest.fixture
+def ru_lookup_lemmatizer():
+    pytest.importorskip("pymorphy2")
+    return get_lang_class("ru")().add_pipe(
+        "lemmatizer", config={"mode": "pymorphy2_lookup"}
+    )
+
+
 @pytest.fixture(scope="session")
 def sa_tokenizer():
     return get_lang_class("sa")().tokenizer
@@ -436,6 +444,15 @@ def uk_lemmatizer():
     return get_lang_class("uk")().add_pipe("lemmatizer")
 
 
+@pytest.fixture
+def uk_lookup_lemmatizer():
+    pytest.importorskip("pymorphy2")
+    pytest.importorskip("pymorphy2_dicts_uk")
+    return get_lang_class("uk")().add_pipe(
+        "lemmatizer", config={"mode": "pymorphy2_lookup"}
+    )
+
+
 @pytest.fixture(scope="session")
 def ur_tokenizer():
     return get_lang_class("ur")().tokenizer

View File

@@ -128,7 +128,9 @@ def test_doc_to_json_with_token_span_attributes(doc):
     doc._.json_test1 = "hello world"
     doc._.json_test2 = [1, 2, 3]
     doc[0:1]._.span_test = "span_attribute"
+    doc[0:2]._.span_test = "span_attribute_2"
     doc[0]._.token_test = 117
+    doc[1]._.token_test = 118
     doc.spans["span_group"] = [doc[0:1]]
     json_doc = doc.to_json(
         underscore=["json_test1", "json_test2", "token_test", "span_test"]
@@ -139,8 +141,10 @@ def test_doc_to_json_with_token_span_attributes(doc):
     assert json_doc["_"]["json_test2"] == [1, 2, 3]
     assert "underscore_token" in json_doc
     assert "underscore_span" in json_doc
-    assert json_doc["underscore_token"]["token_test"]["value"] == 117
-    assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
+    assert json_doc["underscore_token"]["token_test"][0]["value"] == 117
+    assert json_doc["underscore_token"]["token_test"][1]["value"] == 118
+    assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
+    assert json_doc["underscore_span"]["span_test"][1]["value"] == "span_attribute_2"
     assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
     assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
@@ -161,8 +165,8 @@ def test_doc_to_json_with_custom_user_data(doc):
     assert json_doc["_"]["json_test"] == "hello world"
     assert "underscore_token" in json_doc
     assert "underscore_span" in json_doc
-    assert json_doc["underscore_token"]["token_test"]["value"] == 117
-    assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
+    assert json_doc["underscore_token"]["token_test"][0]["value"] == 117
+    assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
     assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
     assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
@@ -181,8 +185,8 @@ def test_doc_to_json_with_token_span_same_identifier(doc):
     assert json_doc["_"]["my_ext"] == "hello world"
     assert "underscore_token" in json_doc
     assert "underscore_span" in json_doc
-    assert json_doc["underscore_token"]["my_ext"]["value"] == 117
-    assert json_doc["underscore_span"]["my_ext"]["value"] == "span_attribute"
+    assert json_doc["underscore_token"]["my_ext"][0]["value"] == 117
+    assert json_doc["underscore_span"]["my_ext"][0]["value"] == "span_attribute"
     assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
     assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
@@ -195,10 +199,9 @@ def test_doc_to_json_with_token_attributes_missing(doc):
     doc[0]._.token_test = 117
     json_doc = doc.to_json(underscore=["span_test"])
 
-    assert "underscore_token" in json_doc
     assert "underscore_span" in json_doc
-    assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
-    assert "token_test" not in json_doc["underscore_token"]
+    assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
+    assert "underscore_token" not in json_doc
     assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
@@ -283,7 +286,9 @@ def test_json_to_doc_with_token_span_attributes(doc):
     doc._.json_test1 = "hello world"
     doc._.json_test2 = [1, 2, 3]
     doc[0:1]._.span_test = "span_attribute"
+    doc[0:2]._.span_test = "span_attribute_2"
     doc[0]._.token_test = 117
+    doc[1]._.token_test = 118
 
     json_doc = doc.to_json(
         underscore=["json_test1", "json_test2", "token_test", "span_test"]
@@ -295,7 +300,9 @@ def test_json_to_doc_with_token_span_attributes(doc):
     assert new_doc._.json_test1 == "hello world"
     assert new_doc._.json_test2 == [1, 2, 3]
     assert new_doc[0]._.token_test == 117
+    assert new_doc[1]._.token_test == 118
     assert new_doc[0:1]._.span_test == "span_attribute"
+    assert new_doc[0:2]._.span_test == "span_attribute_2"
     assert new_doc.user_data == doc.user_data
     assert new_doc.to_bytes(exclude=["user_data"]) == doc.to_bytes(
         exclude=["user_data"]

View File

@@ -78,3 +78,17 @@ def test_ru_lemmatizer_punct(ru_lemmatizer):
     assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
     doc = Doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
     assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
+
+
+def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
+    words = ["мама", "мыла", "раму"]
+    pos = ["NOUN", "VERB", "NOUN"]
+    morphs = [
+        "Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
+        "Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
+        "Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
+    ]
+    doc = Doc(ru_lookup_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
+    doc = ru_lookup_lemmatizer(doc)
+    lemmas = [token.lemma_ for token in doc]
+    assert lemmas == ["мама", "мыла", "раму"]

View File

@@ -9,3 +9,11 @@ def test_uk_lemmatizer(uk_lemmatizer):
     """Check that the default uk lemmatizer runs."""
     doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
     uk_lemmatizer(doc)
+    assert [token.lemma for token in doc]
+
+
+def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer):
+    """Check that the lookup uk lemmatizer runs."""
+    doc = Doc(uk_lookup_lemmatizer.vocab, words=["a", "b", "c"])
+    uk_lookup_lemmatizer(doc)
+    assert [token.lemma for token in doc]

View File

@@ -1619,24 +1619,20 @@ cdef class Doc:
                 Doc.set_extension(attr)
             self._.set(attr, doc_json["_"][attr])
 
-        if doc_json.get("underscore_token", {}):
-            for token_attr in doc_json["underscore_token"]:
-                token_start = doc_json["underscore_token"][token_attr]["token_start"]
-                value = doc_json["underscore_token"][token_attr]["value"]
-
-                if not Token.has_extension(token_attr):
-                    Token.set_extension(token_attr)
-                self[token_start]._.set(token_attr, value)
-
-        if doc_json.get("underscore_span", {}):
-            for span_attr in doc_json["underscore_span"]:
-                token_start = doc_json["underscore_span"][span_attr]["token_start"]
-                token_end = doc_json["underscore_span"][span_attr]["token_end"]
-                value = doc_json["underscore_span"][span_attr]["value"]
-
-                if not Span.has_extension(span_attr):
-                    Span.set_extension(span_attr)
-                self[token_start:token_end]._.set(span_attr, value)
+        for token_attr in doc_json.get("underscore_token", {}):
+            if not Token.has_extension(token_attr):
+                Token.set_extension(token_attr)
+            for token_data in doc_json["underscore_token"][token_attr]:
+                start = token_by_char(self.c, self.length, token_data["start"])
+                value = token_data["value"]
+                self[start]._.set(token_attr, value)
+
+        for span_attr in doc_json.get("underscore_span", {}):
+            if not Span.has_extension(span_attr):
+                Span.set_extension(span_attr)
+            for span_data in doc_json["underscore_span"][span_attr]:
+                value = span_data["value"]
+                self.char_span(span_data["start"], span_data["end"])._.set(span_attr, value)
         return self
 
     def to_json(self, underscore=None):
@@ -1684,30 +1680,34 @@ cdef class Doc:
         if underscore:
             user_keys = set()
             if self.user_data:
-                data["_"] = {}
-                data["underscore_token"] = {}
-                data["underscore_span"] = {}
-                for data_key in self.user_data:
+                for data_key, value in self.user_data.copy().items():
                     if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.":
                         attr = data_key[1]
                         start = data_key[2]
                         end = data_key[3]
                         if attr in underscore:
                             user_keys.add(attr)
-                            value = self.user_data[data_key]
                             if not srsly.is_json_serializable(value):
                                 raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
                             # Check if doc attribute
                             if start is None:
+                                if "_" not in data:
+                                    data["_"] = {}
                                 data["_"][attr] = value
                             # Check if token attribute
                             elif end is None:
+                                if "underscore_token" not in data:
+                                    data["underscore_token"] = {}
                                 if attr not in data["underscore_token"]:
-                                    data["underscore_token"][attr] = {"token_start": start, "value": value}
+                                    data["underscore_token"][attr] = []
+                                data["underscore_token"][attr].append({"start": start, "value": value})
                             # Else span attribute
                             else:
+                                if "underscore_span" not in data:
+                                    data["underscore_span"] = {}
                                 if attr not in data["underscore_span"]:
-                                    data["underscore_span"][attr] = {"token_start": start, "token_end": end, "value": value}
+                                    data["underscore_span"][attr] = []
+                                data["underscore_span"][attr].append({"start": start, "end": end, "value": value})
 
             for attr in underscore:
                 if attr not in user_keys:
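
Taken together, the two hunks above change the JSON layout for custom attributes: each entry is now a list of dicts keyed by character offsets (`start`/`end`) rather than a single dict keyed by token indices, so multiple tokens or spans can carry the same extension. A sketch of the new shape (offsets hypothetical, structure matching the updated `DocJSONSchema` and the tests in this commit):

```python
# One list entry per annotated token/span; "start"/"end" are character offsets.
json_doc = {
    "underscore_token": {
        "token_test": [
            {"start": 0, "value": 117},  # hypothetical offsets
            {"start": 5, "value": 118},
        ],
    },
    "underscore_span": {
        "span_test": [
            {"start": 0, "end": 4, "value": "span_attribute"},
        ],
    },
}
assert json_doc["underscore_token"]["token_test"][1]["value"] == 118
```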

View File

@@ -1482,7 +1482,7 @@ You'll also need to add the assets you want to track with
 </Infobox>
 
 ```cli
-$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
+$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose] [--quiet]
 ```
 
 > #### Example
@@ -1499,6 +1499,7 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
 | `workflow`        | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(option)~~ |
 | `--force`, `-F`   | Force-updating config file. ~~bool (flag)~~ |
 | `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~ |
+| `--quiet`, `-q`   | Print no output generated by DVC. ~~bool (flag)~~ |
 | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~ |
 | **CREATES**       | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |

View File

@@ -21,9 +21,9 @@ Create the knowledge base.
 > #### Example
 >
 > ```python
-> from spacy.kb import KnowledgeBase
+> from spacy.kb import InMemoryLookupKB
 > vocab = nlp.vocab
-> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
+> kb = InMemoryLookupKB(vocab=vocab, entity_vector_length=64)
 > ```
 
 | Name | Description |

View File

@@ -243,6 +243,27 @@ pipelines.
 > python -m spacy project run test . --vars.foo bar
 > ```
 
+> #### Tip: Environment Variables
+>
+> Commands in a project file are not executed in a shell, so they don't have
+> direct access to environment variables. But you can insert environment
+> variables using the `env` dictionary to make values available for
+> interpolation, just like values in `vars`. Here's an example `env` dict that
+> makes `$PATH` available as `ENV_PATH`:
+>
+> ```yaml
+> env:
+>   ENV_PATH: PATH
+> ```
+>
+> This can be used in a project command like so:
+>
+> ```yaml
+> - name: "echo-path"
+>   script:
+>     - "echo ${env.ENV_PATH}"
+> ```
+
 | Section | Description |
 | ------- | ----------- |
 | `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |

View File

@@ -1,5 +1,46 @@
 {
     "resources": [
+        {
+            "id": "spacy-cleaner",
+            "title": "spacy-cleaner",
+            "slogan": "Easily clean text with spaCy!",
+            "description": "**spacy-cleaner** utilises spaCy `Language` models to replace, remove, and \n mutate spaCy tokens. Cleaning actions available are:\n\n* Remove/replace stopwords.\n* Remove/replace punctuation.\n* Remove/replace numbers.\n* Remove/replace emails.\n* Remove/replace URLs.\n* Perform lemmatisation.\n\nSee our [docs](https://ce11an.github.io/spacy-cleaner/) for more information.",
+            "github": "Ce11an/spacy-cleaner",
+            "pip": "spacy-cleaner",
+            "code_example": [
+                "import spacy",
+                "import spacy_cleaner",
+                "from spacy_cleaner.processing import removers, replacers, mutators",
+                "",
+                "model = spacy.load(\"en_core_web_sm\")",
+                "pipeline = spacy_cleaner.Pipeline(",
+                "    model,",
+                "    removers.remove_stopword_token,",
+                "    replacers.replace_punctuation_token,",
+                "    mutators.mutate_lemma_token,",
+                ")",
+                "",
+                "texts = [\"Hello, my name is Cellan! I love to swim!\"]",
+                "",
+                "pipeline.clean(texts)",
+                "# ['hello _IS_PUNCT_ Cellan _IS_PUNCT_ love swim _IS_PUNCT_']"
+            ],
+            "code_language": "python",
+            "url": "https://ce11an.github.io/spacy-cleaner/",
+            "image": "https://raw.githubusercontent.com/Ce11an/spacy-cleaner/main/docs/assets/images/spacemen.png",
+            "author": "Cellan Hall",
+            "author_links": {
+                "twitter": "Ce11an",
+                "github": "Ce11an",
+                "website": "https://www.linkedin.com/in/cellan-hall/"
+            },
+            "category": [
+                "extension"
+            ],
+            "tags": [
+                "text-processing"
+            ]
+        },
         {
             "id": "Zshot",
             "title": "Zshot",
@@ -2460,20 +2501,20 @@
                 "import spacy",
                 "from spacy_wordnet.wordnet_annotator import WordnetAnnotator ",
                 "",
-                "# Load an spacy model (supported models are \"es\" and \"en\") ",
-                "nlp = spacy.load('en')",
-                "# Spacy 3.x",
-                "nlp.add_pipe(\"spacy_wordnet\", after='tagger', config={'lang': nlp.lang})",
-                "# Spacy 2.x",
+                "# Load a spaCy model (supported languages are \"es\" and \"en\") ",
+                "nlp = spacy.load('en_core_web_sm')",
+                "# spaCy 3.x",
+                "nlp.add_pipe(\"spacy_wordnet\", after='tagger')",
+                "# spaCy 2.x",
                 "# nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')",
                 "token = nlp('prices')[0]",
                 "",
-                "# wordnet object link spacy token with nltk wordnet interface by giving acces to",
+                "# WordNet object links spaCy token with NLTK WordNet interface by giving access to",
                 "# synsets and lemmas ",
                 "token._.wordnet.synsets()",
                 "token._.wordnet.lemmas()",
                 "",
-                "# And automatically tags with wordnet domains",
+                "# And automatically add info about WordNet domains",
                 "token._.wordnet.wordnet_domains()"
             ],
             "author": "recognai",