Merge pull request #11686 from adrianeboyd/chore/update-v4-from-master

Update v4 from master
Adriane Boyd 2022-10-21 12:55:53 +02:00 committed by GitHub
commit a4bd890f32
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
22 changed files with 228 additions and 99 deletions

View File

@ -10,6 +10,7 @@ steps:
inputs:
versionSpec: ${{ parameters.python_version }}
architecture: ${{ parameters.architecture }}
allowUnstable: true
- bash: |
echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"

View File

@ -6,7 +6,7 @@ repos:
language_version: python3.7
additional_dependencies: ['click==8.0.4']
- repo: https://gitlab.com/pycqa/flake8
rev: 3.9.2
rev: 5.0.4
hooks:
- id: flake8
args:

View File

@ -85,6 +85,15 @@ jobs:
Python310Mac:
imageName: "macos-latest"
python.version: "3.10"
Python311Linux:
imageName: 'ubuntu-latest'
python.version: '3.11.0-rc.2'
Python311Windows:
imageName: 'windows-latest'
python.version: '3.11.0-rc.2'
Python311Mac:
imageName: 'macos-latest'
python.version: '3.11.0-rc.2'
maxParallel: 4
pool:
vmImage: $(imageName)

View File

@ -15,7 +15,7 @@ pathy>=0.3.5
numpy>=1.15.0
requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
jinja2
langcodes>=3.2.0,<4.0.0
# Official Python utilities
@ -28,7 +28,7 @@ cython>=0.25,<3.0
pytest>=5.2.0,!=7.1.0
pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.8.0,<3.10.0
flake8>=3.8.0,<6.0.0
hypothesis>=3.27.0,<7.0.0
mypy>=0.980,<0.990; platform_machine != "aarch64" and python_version >= "3.7"
types-dataclasses>=0.1.3; python_version < "3.7"

View File

@ -48,7 +48,7 @@ install_requires =
tqdm>=4.38.0,<5.0.0
numpy>=1.15.0
requests>=2.13.0,<3.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
jinja2
# Official Python utilities
setuptools

View File

@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "3.4.1"
__version__ = "3.4.2"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"

View File

@ -25,6 +25,7 @@ def project_update_dvc_cli(
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
# fmt: on
):
@ -36,7 +37,7 @@ def project_update_dvc_cli(
DOCS: https://spacy.io/api/cli#project-dvc
"""
project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
def project_update_dvc(
@ -44,6 +45,7 @@ def project_update_dvc(
workflow: Optional[str] = None,
*,
verbose: bool = False,
quiet: bool = False,
force: bool = False,
) -> None:
"""Update the auto-generated Data Version Control (DVC) config file. A DVC
@ -54,11 +56,12 @@ def project_update_dvc(
workflow (Optional[str]): Optional name of workflow defined in project.yml.
If not set, the first workflow will be used.
verbose (bool): Print more info.
quiet (bool): Print less info.
force (bool): Force update DVC config.
"""
config = load_project_config(project_dir)
updated = update_dvc_config(
project_dir, config, workflow, verbose=verbose, force=force
project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
)
help_msg = "To execute the workflow with DVC, run: dvc repro"
if updated:
@ -72,7 +75,7 @@ def update_dvc_config(
config: Dict[str, Any],
workflow: Optional[str] = None,
verbose: bool = False,
silent: bool = False,
quiet: bool = False,
force: bool = False,
) -> bool:
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
@ -83,7 +86,7 @@ def update_dvc_config(
path (Path): The path to the project directory.
config (Dict[str, Any]): The loaded project.yml.
verbose (bool): Whether to print additional info (via DVC).
silent (bool): Don't output anything (via DVC).
quiet (bool): Don't output anything (via DVC).
force (bool): Force update, even if hashes match.
RETURNS (bool): Whether the DVC config file was updated.
"""
@ -105,6 +108,14 @@ def update_dvc_config(
dvc_config_path.unlink()
dvc_commands = []
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
# some flags that apply to every command
flags = []
if verbose:
flags.append("--verbose")
if quiet:
flags.append("--quiet")
for name in workflows[workflow]:
command = config_commands[name]
deps = command.get("deps", [])
@ -118,14 +129,26 @@ def update_dvc_config(
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
if command.get("no_skip"):
dvc_cmd.append("--always-changed")
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
dvc_commands.append(join_command(full_cmd))
if not dvc_commands:
# If we don't check for this, then there will be an error when reading the
# config, since DVC wouldn't create it.
msg.fail(
"No usable commands for DVC found. This can happen if none of your "
"commands have dependencies or outputs.",
exits=1,
)
with working_dir(path):
dvc_flags = {"--verbose": verbose, "--quiet": silent}
run_dvc_commands(dvc_commands, flags=dvc_flags)
for c in dvc_commands:
dvc_command = "dvc " + c
run_command(dvc_command)
with dvc_config_path.open("r+", encoding="utf8") as f:
content = f.read()
f.seek(0, 0)
@ -133,26 +156,6 @@ def update_dvc_config(
return True
def run_dvc_commands(
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
) -> None:
"""Run a sequence of DVC commands in a subprocess, in order.
commands (List[str]): The string commands without the leading "dvc".
flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
easier to pass flags like --quiet that depend on a variable or
command-line setting while avoiding lots of nested conditionals.
"""
for c in commands:
command = split_command(c)
dvc_command = ["dvc", *command]
# Add the flags if they are set to True
for flag, is_active in flags.items():
if is_active:
dvc_command.append(flag)
run_command(dvc_command)
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
"""Validate workflows provided in project.yml and check that a given
workflow can be used to generate a DVC config.
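
A minimal sketch of exercising the new `--quiet` plumbing from Python, assuming the helper stays importable from `spacy.cli.project.dvc` (the project path here is illustrative):

```python
from pathlib import Path

# Import path assumed from the file changed in this commit.
from spacy.cli.project.dvc import project_update_dvc

# Regenerate dvc.yaml for the first workflow; quiet=True is forwarded
# as --quiet to every generated "dvc run" call via the new flags list.
project_update_dvc(Path("."), workflow=None, verbose=False, quiet=True, force=False)
```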

View File

@ -23,7 +23,7 @@ class RussianLemmatizer(Lemmatizer):
overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None:
if mode == "pymorphy2":
if mode in {"pymorphy2", "pymorphy2_lookup"}:
try:
from pymorphy2 import MorphAnalyzer
except ImportError:

View File

@ -18,7 +18,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
overwrite: bool = False,
scorer: Optional[Callable] = lemmatizer_score,
) -> None:
if mode == "pymorphy2":
if mode in {"pymorphy2", "pymorphy2_lookup"}:
try:
from pymorphy2 import MorphAnalyzer
except ImportError:
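
For reference, a small sketch of selecting the newly supported `pymorphy2_lookup` mode, mirroring the test fixtures added below (requires the `pymorphy2` package, plus `pymorphy2-dicts-uk` for Ukrainian; the example words are taken from the new Russian test):

```python
import spacy
from spacy.tokens import Doc

# Lookup-only pymorphy2 lemmatization for Russian; use spacy.blank("uk") for Ukrainian.
nlp = spacy.blank("ru")
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "pymorphy2_lookup"})
doc = Doc(nlp.vocab, words=["мама", "мыла", "раму"])
doc = lemmatizer(doc)
print([t.lemma_ for t in doc])
```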

View File

@ -1,7 +1,6 @@
from typing import cast, Any, Callable, Dict, Iterable, List, Optional
from typing import Sequence, Tuple, Union
from typing import cast, Any, Callable, Dict, Iterable, List, Optional, Union
from typing import Tuple
from collections import Counter
from copy import deepcopy
from itertools import islice
import numpy as np

View File

@ -30,17 +30,17 @@ scorer = {"@layers": "spacy.LinearLogistic.v1"}
hidden_size = 128
[model.tok2vec]
@architectures = "spacy.Tok2Vec.v1"
@architectures = "spacy.Tok2Vec.v2"
[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
@architectures = "spacy.MultiHashEmbed.v2"
width = 96
rows = [5000, 2000, 1000, 1000]
attrs = ["ORTH", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false
[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
@ -139,6 +139,9 @@ def make_spancat(
spans_key (str): Key of the doc.spans dict to save the spans under. During
initialization and training, the component will look for spans on the
reference document under the same key.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
spans allowed.
threshold (float): Minimum probability to consider a prediction positive.
Spans with a positive prediction will be saved on the Doc. Defaults to
0.5.
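
A hedged usage sketch of the component this default config feeds; the `spans_key` value and label are illustrative, not taken from this diff:

```python
import spacy

# Adding the span categorizer picks up the updated default model config
# (Tok2Vec.v2 / MultiHashEmbed.v2 / MaxoutWindowEncoder.v2).
nlp = spacy.blank("en")
spancat = nlp.add_pipe("spancat", config={"spans_key": "sc"})
spancat.add_label("PERSON")
```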

View File

@ -19,7 +19,7 @@ multi_label_default_config = """
@architectures = "spacy.TextCatEnsemble.v2"
[model.tok2vec]
@architectures = "spacy.Tok2Vec.v1"
@architectures = "spacy.Tok2Vec.v2"
[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
@ -29,7 +29,7 @@ attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
include_static_vectors = false
[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
@ -98,7 +98,7 @@ def make_multilabel_textcat(
threshold: float,
scorer: Optional[Callable],
save_activations: bool,
) -> "TextCategorizer":
) -> "MultiLabel_TextCategorizer":
"""Create a TextCategorizer component. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels are considered
to be non-mutually exclusive, which means that there can be zero or more labels
@ -107,6 +107,7 @@ def make_multilabel_textcat(
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
scores for each category.
threshold (float): Cutoff to consider a prediction "positive".
scorer (Optional[Callable]): The scoring method.
"""
return MultiLabel_TextCategorizer(
nlp.vocab,
@ -155,7 +156,11 @@ class MultiLabel_TextCategorizer(TextCategorizer):
name (str): The component instance name, used to add entries to the
losses during training.
threshold (float): Cutoff to consider a prediction "positive".
scorer (Optional[Callable]): The scoring method.
save_activations (bool): save model activations in Doc when annotating.
DOCS: https://spacy.io/api/textcategorizer#init
"""

View File

@ -181,12 +181,12 @@ class TokenPatternNumber(BaseModel):
IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects")
EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=")
GT: Union[StrictInt, StrictFloat] = Field(None, alias=">")
LT: Union[StrictInt, StrictFloat] = Field(None, alias="<")
EQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="==")
NEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="!=")
GEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">=")
LEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<=")
GT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">")
LT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<")
class Config:
extra = "forbid"
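
These fields back the numeric comparison operators in token match patterns; a minimal sketch with illustrative pattern values:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# {"LENGTH": {">=": 10}} is validated by the GEQ field above.
matcher.add("LONG_WORD", [[{"LENGTH": {">=": 10}}]])
doc = nlp("an unquestionably short sentence")
print([doc[start:end].text for _, start, end in matcher(doc)])
```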
@ -430,7 +430,7 @@ class ProjectConfigAssetURL(BaseModel):
# fmt: off
dest: StrictStr = Field(..., title="Destination of downloaded asset")
url: Optional[StrictStr] = Field(None, title="URL of asset")
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
description: StrictStr = Field("", title="Description of asset")
# fmt: on
@ -438,7 +438,7 @@ class ProjectConfigAssetURL(BaseModel):
class ProjectConfigAssetGit(BaseModel):
# fmt: off
git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
description: Optional[StrictStr] = Field(None, title="Description of asset")
# fmt: on
@ -508,9 +508,9 @@ class DocJSONSchema(BaseModel):
None, title="Indices of sentences' start and end indices"
)
text: StrictStr = Field(..., title="Document text")
spans: Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]] = Field(
None, title="Span information - end/start indices, label, KB ID"
)
spans: Optional[
Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]]
] = Field(None, title="Span information - end/start indices, label, KB ID")
tokens: List[Dict[StrictStr, Union[StrictStr, StrictInt]]] = Field(
..., title="Token information - ID, start, annotations"
)
@ -519,9 +519,9 @@ class DocJSONSchema(BaseModel):
title="Any custom data stored in the document's _ attribute",
alias="_",
)
underscore_token: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
underscore_token: Optional[Dict[StrictStr, List[Dict[StrictStr, Any]]]] = Field(
None, title="Any custom data stored in the token's _ attribute"
)
underscore_span: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
underscore_span: Optional[Dict[StrictStr, List[Dict[StrictStr, Any]]]] = Field(
None, title="Any custom data stored in the span's _ attribute"
)
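
The list-valued shape these two fields now accept looks roughly like this; the keys mirror the updated `Doc.to_json` output in this commit, while the concrete values are illustrative:

```python
# Sketch of the per-attribute lists the schema now validates.
doc_json_underscore = {
    "underscore_token": {
        "token_test": [
            {"start": 0, "value": 117},
            {"start": 6, "value": 118},
        ]
    },
    "underscore_span": {
        "span_test": [
            {"start": 0, "end": 5, "value": "span_attribute"},
        ]
    },
}
```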

View File

@ -357,6 +357,14 @@ def ru_lemmatizer():
return get_lang_class("ru")().add_pipe("lemmatizer")
@pytest.fixture
def ru_lookup_lemmatizer():
pytest.importorskip("pymorphy2")
return get_lang_class("ru")().add_pipe(
"lemmatizer", config={"mode": "pymorphy2_lookup"}
)
@pytest.fixture(scope="session")
def sa_tokenizer():
return get_lang_class("sa")().tokenizer
@ -436,6 +444,15 @@ def uk_lemmatizer():
return get_lang_class("uk")().add_pipe("lemmatizer")
@pytest.fixture
def uk_lookup_lemmatizer():
pytest.importorskip("pymorphy2")
pytest.importorskip("pymorphy2_dicts_uk")
return get_lang_class("uk")().add_pipe(
"lemmatizer", config={"mode": "pymorphy2_lookup"}
)
@pytest.fixture(scope="session")
def ur_tokenizer():
return get_lang_class("ur")().tokenizer

View File

@ -128,7 +128,9 @@ def test_doc_to_json_with_token_span_attributes(doc):
doc._.json_test1 = "hello world"
doc._.json_test2 = [1, 2, 3]
doc[0:1]._.span_test = "span_attribute"
doc[0:2]._.span_test = "span_attribute_2"
doc[0]._.token_test = 117
doc[1]._.token_test = 118
doc.spans["span_group"] = [doc[0:1]]
json_doc = doc.to_json(
underscore=["json_test1", "json_test2", "token_test", "span_test"]
@ -139,8 +141,10 @@ def test_doc_to_json_with_token_span_attributes(doc):
assert json_doc["_"]["json_test2"] == [1, 2, 3]
assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
assert json_doc["underscore_token"]["token_test"]["value"] == 117
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
assert json_doc["underscore_token"]["token_test"][0]["value"] == 117
assert json_doc["underscore_token"]["token_test"][1]["value"] == 118
assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
assert json_doc["underscore_span"]["span_test"][1]["value"] == "span_attribute_2"
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
@ -161,8 +165,8 @@ def test_doc_to_json_with_custom_user_data(doc):
assert json_doc["_"]["json_test"] == "hello world"
assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
assert json_doc["underscore_token"]["token_test"]["value"] == 117
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
assert json_doc["underscore_token"]["token_test"][0]["value"] == 117
assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
@ -181,8 +185,8 @@ def test_doc_to_json_with_token_span_same_identifier(doc):
assert json_doc["_"]["my_ext"] == "hello world"
assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
assert json_doc["underscore_token"]["my_ext"]["value"] == 117
assert json_doc["underscore_span"]["my_ext"]["value"] == "span_attribute"
assert json_doc["underscore_token"]["my_ext"][0]["value"] == 117
assert json_doc["underscore_span"]["my_ext"][0]["value"] == "span_attribute"
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
@ -195,10 +199,9 @@ def test_doc_to_json_with_token_attributes_missing(doc):
doc[0]._.token_test = 117
json_doc = doc.to_json(underscore=["span_test"])
assert "underscore_token" in json_doc
assert "underscore_span" in json_doc
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
assert "token_test" not in json_doc["underscore_token"]
assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
assert "underscore_token" not in json_doc
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
@ -283,7 +286,9 @@ def test_json_to_doc_with_token_span_attributes(doc):
doc._.json_test1 = "hello world"
doc._.json_test2 = [1, 2, 3]
doc[0:1]._.span_test = "span_attribute"
doc[0:2]._.span_test = "span_attribute_2"
doc[0]._.token_test = 117
doc[1]._.token_test = 118
json_doc = doc.to_json(
underscore=["json_test1", "json_test2", "token_test", "span_test"]
@ -295,7 +300,9 @@ def test_json_to_doc_with_token_span_attributes(doc):
assert new_doc._.json_test1 == "hello world"
assert new_doc._.json_test2 == [1, 2, 3]
assert new_doc[0]._.token_test == 117
assert new_doc[1]._.token_test == 118
assert new_doc[0:1]._.span_test == "span_attribute"
assert new_doc[0:2]._.span_test == "span_attribute_2"
assert new_doc.user_data == doc.user_data
assert new_doc.to_bytes(exclude=["user_data"]) == doc.to_bytes(
exclude=["user_data"]

View File

@ -78,3 +78,17 @@ def test_ru_lemmatizer_punct(ru_lemmatizer):
assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
doc = Doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
words = ["мама", "мыла", "раму"]
pos = ["NOUN", "VERB", "NOUN"]
morphs = [
"Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
"Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
"Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
]
doc = Doc(ru_lookup_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
doc = ru_lookup_lemmatizer(doc)
lemmas = [token.lemma_ for token in doc]
assert lemmas == ["мама", "мыла", "раму"]

View File

@ -9,3 +9,11 @@ def test_uk_lemmatizer(uk_lemmatizer):
"""Check that the default uk lemmatizer runs."""
doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
uk_lemmatizer(doc)
assert [token.lemma for token in doc]
def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer):
"""Check that the lookup uk lemmatizer runs."""
doc = Doc(uk_lookup_lemmatizer.vocab, words=["a", "b", "c"])
uk_lookup_lemmatizer(doc)
assert [token.lemma for token in doc]

View File

@ -1619,24 +1619,20 @@ cdef class Doc:
Doc.set_extension(attr)
self._.set(attr, doc_json["_"][attr])
if doc_json.get("underscore_token", {}):
for token_attr in doc_json["underscore_token"]:
token_start = doc_json["underscore_token"][token_attr]["token_start"]
value = doc_json["underscore_token"][token_attr]["value"]
if not Token.has_extension(token_attr):
Token.set_extension(token_attr)
self[token_start]._.set(token_attr, value)
for token_attr in doc_json.get("underscore_token", {}):
if not Token.has_extension(token_attr):
Token.set_extension(token_attr)
for token_data in doc_json["underscore_token"][token_attr]:
start = token_by_char(self.c, self.length, token_data["start"])
value = token_data["value"]
self[start]._.set(token_attr, value)
if doc_json.get("underscore_span", {}):
for span_attr in doc_json["underscore_span"]:
token_start = doc_json["underscore_span"][span_attr]["token_start"]
token_end = doc_json["underscore_span"][span_attr]["token_end"]
value = doc_json["underscore_span"][span_attr]["value"]
if not Span.has_extension(span_attr):
Span.set_extension(span_attr)
self[token_start:token_end]._.set(span_attr, value)
for span_attr in doc_json.get("underscore_span", {}):
if not Span.has_extension(span_attr):
Span.set_extension(span_attr)
for span_data in doc_json["underscore_span"][span_attr]:
value = span_data["value"]
self.char_span(span_data["start"], span_data["end"])._.set(span_attr, value)
return self
def to_json(self, underscore=None):
@ -1684,30 +1680,34 @@ cdef class Doc:
if underscore:
user_keys = set()
if self.user_data:
data["_"] = {}
data["underscore_token"] = {}
data["underscore_span"] = {}
for data_key in self.user_data:
for data_key, value in self.user_data.copy().items():
if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.":
attr = data_key[1]
start = data_key[2]
end = data_key[3]
if attr in underscore:
user_keys.add(attr)
value = self.user_data[data_key]
if not srsly.is_json_serializable(value):
raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
# Check if doc attribute
if start is None:
if "_" not in data:
data["_"] = {}
data["_"][attr] = value
# Check if token attribute
elif end is None:
if "underscore_token" not in data:
data["underscore_token"] = {}
if attr not in data["underscore_token"]:
data["underscore_token"][attr] = {"token_start": start, "value": value}
data["underscore_token"][attr] = []
data["underscore_token"][attr].append({"start": start, "value": value})
# Else span attribute
else:
if "underscore_span" not in data:
data["underscore_span"] = {}
if attr not in data["underscore_span"]:
data["underscore_span"][attr] = {"token_start": start, "token_end": end, "value": value}
data["underscore_span"][attr] = []
data["underscore_span"][attr].append({"start": start, "end": end, "value": value})
for attr in underscore:
if attr not in user_keys:

View File

@ -1482,7 +1482,7 @@ You'll also need to add the assets you want to track with
</Infobox>
```cli
$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose] [--quiet]
```
> #### Example
@ -1499,6 +1499,7 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
| `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(option)~~ |
| `--force`, `-F` | Force-updating config file. ~~bool (flag)~~ |
| `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~ |
| `--quiet`, `-q` | Print no output generated by DVC. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |

View File

@ -21,9 +21,9 @@ Create the knowledge base.
> #### Example
>
> ```python
> from spacy.kb import KnowledgeBase
> from spacy.kb import InMemoryLookupKB
> vocab = nlp.vocab
> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
> kb = InMemoryLookupKB(vocab=vocab, entity_vector_length=64)
> ```
| Name | Description |

View File

@ -243,6 +243,27 @@ pipelines.
> python -m spacy project run test . --vars.foo bar
> ```
> #### Tip: Environment Variables
>
> Commands in a project file are not executed in a shell, so they don't have
> direct access to environment variables. But you can insert environment
> variables using the `env` dictionary to make values available for
> interpolation, just like values in `vars`. Here's an example `env` dict that
> makes `$PATH` available as `ENV_PATH`:
>
> ```yaml
> env:
> ENV_PATH: PATH
> ```
>
> This can be used in a project command like so:
>
> ```yaml
> - name: "echo-path"
> script:
> - "echo ${env.ENV_PATH}"
> ```
| Section | Description |
| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |

View File

@ -1,5 +1,46 @@
{
"resources": [
{
"id": "spacy-cleaner",
"title": "spacy-cleaner",
"slogan": "Easily clean text with spaCy!",
"description": "**spacy-cleaner** utilises spaCy `Language` models to replace, remove, and \n mutate spaCy tokens. Cleaning actions available are:\n\n* Remove/replace stopwords.\n* Remove/replace punctuation.\n* Remove/replace numbers.\n* Remove/replace emails.\n* Remove/replace URLs.\n* Perform lemmatisation.\n\nSee our [docs](https://ce11an.github.io/spacy-cleaner/) for more information.",
"github": "Ce11an/spacy-cleaner",
"pip": "spacy-cleaner",
"code_example": [
"import spacy",
"import spacy_cleaner",
"from spacy_cleaner.processing import removers, replacers, mutators",
"",
"model = spacy.load(\"en_core_web_sm\")",
"pipeline = spacy_cleaner.Pipeline(",
" model,",
" removers.remove_stopword_token,",
" replacers.replace_punctuation_token,",
" mutators.mutate_lemma_token,",
")",
"",
"texts = [\"Hello, my name is Cellan! I love to swim!\"]",
"",
"pipeline.clean(texts)",
"# ['hello _IS_PUNCT_ Cellan _IS_PUNCT_ love swim _IS_PUNCT_']"
],
"code_language": "python",
"url": "https://ce11an.github.io/spacy-cleaner/",
"image": "https://raw.githubusercontent.com/Ce11an/spacy-cleaner/main/docs/assets/images/spacemen.png",
"author": "Cellan Hall",
"author_links": {
"twitter": "Ce11an",
"github": "Ce11an",
"website": "https://www.linkedin.com/in/cellan-hall/"
},
"category": [
"extension"
],
"tags": [
"text-processing"
]
},
{
"id": "Zshot",
"title": "Zshot",
@ -2460,20 +2501,20 @@
"import spacy",
"from spacy_wordnet.wordnet_annotator import WordnetAnnotator ",
"",
"# Load an spacy model (supported models are \"es\" and \"en\") ",
"nlp = spacy.load('en')",
"# Spacy 3.x",
"nlp.add_pipe(\"spacy_wordnet\", after='tagger', config={'lang': nlp.lang})",
"# Spacy 2.x",
"# Load a spaCy model (supported languages are \"es\" and \"en\") ",
"nlp = spacy.load('en_core_web_sm')",
"# spaCy 3.x",
"nlp.add_pipe(\"spacy_wordnet\", after='tagger')",
"# spaCy 2.x",
"# nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')",
"token = nlp('prices')[0]",
"",
"# wordnet object link spacy token with nltk wordnet interface by giving acces to",
"# WordNet object links spaCy token with NLTK WordNet interface by giving access to",
"# synsets and lemmas ",
"token._.wordnet.synsets()",
"token._.wordnet.lemmas()",
"",
"# And automatically tags with wordnet domains",
"# And automatically add info about WordNet domains",
"token._.wordnet.wordnet_domains()"
],
"author": "recognai",