Merge branch 'master' into feature/candidate-generation-by-docs

# Conflicts:
#	spacy/tests/pipeline/test_entity_linker.py
Raphael Mitsch 2022-10-28 10:38:18 +02:00
commit 4f7b535ebb
31 changed files with 318 additions and 128 deletions

View File

@@ -10,6 +10,7 @@ steps:
     inputs:
       versionSpec: ${{ parameters.python_version }}
       architecture: ${{ parameters.architecture }}
+      allowUnstable: true
   - bash: |
       echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"

View File

@@ -12,10 +12,10 @@ jobs:
     if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
        with:
          ref: ${{ github.head_ref }}
-      - uses: actions/setup-python@v2
+      - uses: actions/setup-python@v3
      - run: pip install black
      - name: Auto-format code if needed
        run: black spacy
@@ -23,10 +23,11 @@ jobs:
      # code and makes GitHub think the action failed
      - name: Check for modified files
        id: git-check
-        run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi)
+        run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
      - name: Create Pull Request
        if: steps.git-check.outputs.modified == 'true'
-        uses: peter-evans/create-pull-request@v3
+        uses: peter-evans/create-pull-request@v4
        with:
          title: Auto-format code with black
          labels: meta

View File

@@ -6,7 +6,7 @@ repos:
        language_version: python3.7
        additional_dependencies: ['click==8.0.4']
  - repo: https://gitlab.com/pycqa/flake8
-    rev: 3.9.2
+    rev: 5.0.4
    hooks:
      - id: flake8
        args:

View File

@@ -8,7 +8,7 @@ be used in real products.
 spaCy comes with
 [pretrained pipelines](https://spacy.io/models) and
-currently supports tokenization and training for **60+ languages**. It features
+currently supports tokenization and training for **70+ languages**. It features
 state-of-the-art speed and **neural network models** for tagging,
 parsing, **named entity recognition**, **text classification** and more,
 multi-task learning with pretrained **transformers** like BERT, as well as a
@@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
 open-source software, released under the MIT license.
-💫 **Version 3.4.0 out now!**
+💫 **Version 3.4 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
 [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@@ -79,7 +79,7 @@ more people can benefit from it.
 ## Features
-- Support for **60+ languages**
+- Support for **70+ languages**
 - **Trained pipelines** for different languages and tasks
 - Multi-task learning with pretrained **transformers** like BERT
 - Support for pretrained **word vectors** and embeddings

View File

@@ -76,15 +76,24 @@ jobs:
      #  Python39Mac:
      #    imageName: "macos-latest"
      #    python.version: "3.9"
-      Python310Linux:
-        imageName: "ubuntu-latest"
-        python.version: "3.10"
+      # Python310Linux:
+      #   imageName: "ubuntu-latest"
+      #   python.version: "3.10"
      Python310Windows:
        imageName: "windows-latest"
        python.version: "3.10"
-      Python310Mac:
-        imageName: "macos-latest"
-        python.version: "3.10"
+      # Python310Mac:
+      #   imageName: "macos-latest"
+      #   python.version: "3.10"
+      Python311Linux:
+        imageName: 'ubuntu-latest'
+        python.version: '3.11.0-rc.2'
+      Python311Windows:
+        imageName: 'windows-latest'
+        python.version: '3.11.0-rc.2'
+      Python311Mac:
+        imageName: 'macos-latest'
+        python.version: '3.11.0-rc.2'
      maxParallel: 4
    pool:
      vmImage: $(imageName)

View File

@@ -15,7 +15,7 @@ pathy>=0.3.5
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
-pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
+pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
 jinja2
 langcodes>=3.2.0,<4.0.0
 # Official Python utilities
@@ -28,7 +28,7 @@ cython>=0.25,<3.0
 pytest>=5.2.0,!=7.1.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
-flake8>=3.8.0,<3.10.0
+flake8>=3.8.0,<6.0.0
 hypothesis>=3.27.0,<7.0.0
 mypy>=0.980,<0.990; platform_machine != "aarch64" and python_version >= "3.7"
 types-dataclasses>=0.1.3; python_version < "3.7"

View File

@@ -56,7 +56,7 @@ install_requires =
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
     requests>=2.13.0,<3.0.0
-    pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
+    pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
     jinja2
     # Official Python utilities
     setuptools

View File

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.4.1"
+__version__ = "3.4.2"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

View File

@@ -25,6 +25,7 @@ def project_update_dvc_cli(
     project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
     workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
     verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
+    quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
     force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
     # fmt: on
 ):
@@ -36,7 +37,7 @@ def project_update_dvc_cli(
     DOCS: https://spacy.io/api/cli#project-dvc
     """
-    project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
+    project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)

 def project_update_dvc(
@@ -44,6 +45,7 @@ def project_update_dvc(
     workflow: Optional[str] = None,
     *,
     verbose: bool = False,
+    quiet: bool = False,
     force: bool = False,
 ) -> None:
     """Update the auto-generated Data Version Control (DVC) config file. A DVC
@@ -54,11 +56,12 @@ def project_update_dvc(
     workflow (Optional[str]): Optional name of workflow defined in project.yml.
         If not set, the first workflow will be used.
     verbose (bool): Print more info.
+    quiet (bool): Print less info.
     force (bool): Force update DVC config.
     """
     config = load_project_config(project_dir)
     updated = update_dvc_config(
-        project_dir, config, workflow, verbose=verbose, force=force
+        project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
     )
     help_msg = "To execute the workflow with DVC, run: dvc repro"
     if updated:
@@ -72,7 +75,7 @@ def update_dvc_config(
     config: Dict[str, Any],
     workflow: Optional[str] = None,
     verbose: bool = False,
-    silent: bool = False,
+    quiet: bool = False,
     force: bool = False,
 ) -> bool:
     """Re-run the DVC commands in dry mode and update dvc.yaml file in the
@@ -83,7 +86,7 @@ def update_dvc_config(
     path (Path): The path to the project directory.
     config (Dict[str, Any]): The loaded project.yml.
     verbose (bool): Whether to print additional info (via DVC).
-    silent (bool): Don't output anything (via DVC).
+    quiet (bool): Don't output anything (via DVC).
     force (bool): Force update, even if hashes match.
     RETURNS (bool): Whether the DVC config file was updated.
     """
@@ -105,6 +108,14 @@ def update_dvc_config(
         dvc_config_path.unlink()
     dvc_commands = []
     config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
+    # some flags that apply to every command
+    flags = []
+    if verbose:
+        flags.append("--verbose")
+    if quiet:
+        flags.append("--quiet")
     for name in workflows[workflow]:
         command = config_commands[name]
         deps = command.get("deps", [])
@@ -118,14 +129,26 @@ def update_dvc_config(
         deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
         outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
         outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
-        dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
+        dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
         if command.get("no_skip"):
             dvc_cmd.append("--always-changed")
         full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
         dvc_commands.append(join_command(full_cmd))
+    if not dvc_commands:
+        # If we don't check for this, then there will be an error when reading the
+        # config, since DVC wouldn't create it.
+        msg.fail(
+            "No usable commands for DVC found. This can happen if none of your "
+            "commands have dependencies or outputs.",
+            exits=1,
+        )
     with working_dir(path):
-        dvc_flags = {"--verbose": verbose, "--quiet": silent}
-        run_dvc_commands(dvc_commands, flags=dvc_flags)
+        for c in dvc_commands:
+            dvc_command = "dvc " + c
+            run_command(dvc_command)
     with dvc_config_path.open("r+", encoding="utf8") as f:
         content = f.read()
         f.seek(0, 0)
@@ -133,26 +156,6 @@ def update_dvc_config(
     return True

-def run_dvc_commands(
-    commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
-) -> None:
-    """Run a sequence of DVC commands in a subprocess, in order.
-    commands (List[str]): The string commands without the leading "dvc".
-    flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
-        easier to pass flags like --quiet that depend on a variable or
-        command-line setting while avoiding lots of nested conditionals.
-    """
-    for c in commands:
-        command = split_command(c)
-        dvc_command = ["dvc", *command]
-        # Add the flags if they are set to True
-        for flag, is_active in flags.items():
-            if is_active:
-                dvc_command.append(flag)
-        run_command(dvc_command)

 def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
     """Validate workflows provided in project.yml and check that a given
     workflow can be used to generate a DVC config.
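A minimal, self-contained sketch (not the actual spaCy module; the helper name `build_dvc_run_cmd` is hypothetical) of how the new `--verbose`/`--quiet` handling above folds per-command flags into each generated `dvc run` invocation:

```python
# Hypothetical helper mirroring the flag logic added to update_dvc_config().
def build_dvc_run_cmd(name: str, path: str, verbose: bool = False, quiet: bool = False) -> list:
    # Flags that apply to every generated command.
    flags = []
    if verbose:
        flags.append("--verbose")
    if quiet:
        flags.append("--quiet")
    return ["run", *flags, "-n", name, "-w", path, "--no-exec"]


print(build_dvc_run_cmd("preprocess", ".", quiet=True))
# ['run', '--quiet', '-n', 'preprocess', '-w', '.', '--no-exec']
```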

View File

@@ -23,7 +23,7 @@ class RussianLemmatizer(Lemmatizer):
         overwrite: bool = False,
         scorer: Optional[Callable] = lemmatizer_score,
     ) -> None:
-        if mode == "pymorphy2":
+        if mode in {"pymorphy2", "pymorphy2_lookup"}:
             try:
                 from pymorphy2 import MorphAnalyzer
             except ImportError:

View File

@@ -18,7 +18,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
         overwrite: bool = False,
         scorer: Optional[Callable] = lemmatizer_score,
     ) -> None:
-        if mode == "pymorphy2":
+        if mode in {"pymorphy2", "pymorphy2_lookup"}:
             try:
                 from pymorphy2 import MorphAnalyzer
             except ImportError:
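With these two changes the Russian and Ukrainian lemmatizers accept `pymorphy2_lookup` as a mode; a rough usage sketch (assuming `pymorphy2` is installed, plus `pymorphy2-dicts-uk` for Ukrainian, as in the test fixtures added below):

```python
import spacy

# Russian lookup-mode lemmatizer (requires the pymorphy2 package).
nlp_ru = spacy.blank("ru")
nlp_ru.add_pipe("lemmatizer", config={"mode": "pymorphy2_lookup"})

# Ukrainian equivalent (additionally requires pymorphy2-dicts-uk).
nlp_uk = spacy.blank("uk")
nlp_uk.add_pipe("lemmatizer", config={"mode": "pymorphy2_lookup"})
```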

View File

@@ -71,11 +71,10 @@ def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callab
             cands.append((start_token, end_token))
         candidates.append(ops.asarray2i(cands))
-    candlens = ops.asarray1i([len(cands) for cands in candidates])
-    candidates = ops.xp.concatenate(candidates)
-    outputs = Ragged(candidates, candlens)
+    lengths = model.ops.asarray1i([len(cands) for cands in candidates])
+    out = Ragged(model.ops.flatten(candidates), lengths)
     # because this is just rearranging docs, the backprop does nothing
-    return outputs, lambda x: []
+    return out, lambda x: []

 @registry.misc("spacy.KBFromFile.v1")

View File

@@ -1,7 +1,6 @@
 from typing import cast, Any, Callable, Dict, Iterable, List, Optional
-from typing import Sequence, Tuple, Union
+from typing import Tuple
 from collections import Counter
-from copy import deepcopy
 from itertools import islice

 import numpy as np
@@ -149,9 +148,7 @@ class EditTreeLemmatizer(TrainablePipe):
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.
             n_labels = len(self.cfg["labels"])
-            guesses: List[Ints2d] = [
-                self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
-            ]
+            guesses: List[Ints2d] = [self.model.ops.alloc2i(0, n_labels) for _ in docs]
             assert len(guesses) == n_docs
             return guesses
         scores = self.model.predict(docs)
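For reference, `alloc2i` is Thinc's typed shorthand for allocating a 2D integer array, which is what the tightened list comprehension relies on; a small sketch assuming Thinc's `NumpyOps`:

```python
from thinc.api import NumpyOps

ops = NumpyOps()
# Roughly equivalent to ops.alloc((0, n_labels), dtype="i"): an empty integer
# array with zero rows (no tokens) and one column per label.
guesses = ops.alloc2i(0, 5)
print(guesses.shape, guesses.dtype)  # expected: (0, 5) int32
```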

View File

@@ -133,6 +133,9 @@ def make_spancat(
     spans_key (str): Key of the doc.spans dict to save the spans under. During
         initialization and training, the component will look for spans on the
         reference document under the same key.
+    scorer (Optional[Callable]): The scoring method. Defaults to
+        Scorer.score_spans for the Doc.spans[spans_key] with overlapping
+        spans allowed.
     threshold (float): Minimum probability to consider a prediction positive.
         Spans with a positive prediction will be saved on the Doc. Defaults to
         0.5.

View File

@@ -24,8 +24,8 @@ single_label_default_config = """
 [model.tok2vec.embed]
 @architectures = "spacy.MultiHashEmbed.v2"
 width = 64
-rows = [2000, 2000, 1000, 1000, 1000, 1000]
-attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+rows = [2000, 2000, 500, 1000, 500]
+attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
 include_static_vectors = false

 [model.tok2vec.encode]

View File

@@ -24,8 +24,8 @@ multi_label_default_config = """
 [model.tok2vec.embed]
 @architectures = "spacy.MultiHashEmbed.v2"
 width = 64
-rows = [2000, 2000, 1000, 1000, 1000, 1000]
-attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
+rows = [2000, 2000, 500, 1000, 500]
+attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
 include_static_vectors = false

 [model.tok2vec.encode]
@@ -96,8 +96,8 @@ def make_multilabel_textcat(
     model: Model[List[Doc], List[Floats2d]],
     threshold: float,
     scorer: Optional[Callable],
-) -> "TextCategorizer":
-    """Create a TextCategorizer component. The text categorizer predicts categories
+) -> "MultiLabel_TextCategorizer":
+    """Create a MultiLabel_TextCategorizer component. The text categorizer predicts categories
     over a whole document. It can learn one or more labels, and the labels are considered
     to be non-mutually exclusive, which means that there can be zero or more labels
     per doc).
@@ -105,6 +105,7 @@ def make_multilabel_textcat(
     model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
         scores for each category.
     threshold (float): Cutoff to consider a prediction "positive".
+    scorer (Optional[Callable]): The scoring method.
     """
     return MultiLabel_TextCategorizer(
         nlp.vocab, model, name, threshold=threshold, scorer=scorer
@@ -147,6 +148,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
     name (str): The component instance name, used to add entries to the
         losses during training.
     threshold (float): Cutoff to consider a prediction "positive".
+    scorer (Optional[Callable]): The scoring method.
     DOCS: https://spacy.io/api/textcategorizer#init
     """

View File

@@ -181,12 +181,12 @@ class TokenPatternNumber(BaseModel):
     IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
     IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
     INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects")
-    EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
-    NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
-    GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
-    LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=")
-    GT: Union[StrictInt, StrictFloat] = Field(None, alias=">")
-    LT: Union[StrictInt, StrictFloat] = Field(None, alias="<")
+    EQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="==")
+    NEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="!=")
+    GEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">=")
+    LEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<=")
+    GT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">")
+    LT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<")

     class Config:
         extra = "forbid"
@@ -430,7 +430,7 @@ class ProjectConfigAssetURL(BaseModel):
     # fmt: off
     dest: StrictStr = Field(..., title="Destination of downloaded asset")
     url: Optional[StrictStr] = Field(None, title="URL of asset")
-    checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
+    checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
     description: StrictStr = Field("", title="Description of asset")
     # fmt: on
@@ -438,7 +438,7 @@ class ProjectConfigAssetURL(BaseModel):
 class ProjectConfigAssetGit(BaseModel):
     # fmt: off
     git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
-    checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
+    checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
     description: Optional[StrictStr] = Field(None, title="Description of asset")
     # fmt: on
@@ -508,9 +508,9 @@ class DocJSONSchema(BaseModel):
         None, title="Indices of sentences' start and end indices"
     )
     text: StrictStr = Field(..., title="Document text")
-    spans: Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]] = Field(
-        None, title="Span information - end/start indices, label, KB ID"
-    )
+    spans: Optional[
+        Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]]
+    ] = Field(None, title="Span information - end/start indices, label, KB ID")
     tokens: List[Dict[StrictStr, Union[StrictStr, StrictInt]]] = Field(
         ..., title="Token information - ID, start, annotations"
     )
@@ -519,9 +519,9 @@ class DocJSONSchema(BaseModel):
         title="Any custom data stored in the document's _ attribute",
         alias="_",
     )
-    underscore_token: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
+    underscore_token: Optional[Dict[StrictStr, List[Dict[StrictStr, Any]]]] = Field(
         None, title="Any custom data stored in the token's _ attribute"
     )
-    underscore_span: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
+    underscore_span: Optional[Dict[StrictStr, List[Dict[StrictStr, Any]]]] = Field(
         None, title="Any custom data stored in the span's _ attribute"
     )
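The `==`/`!=`/`>=`/`<=`/`>`/`<` aliases made `Optional` above are the numeric comparison operators validated for `Matcher` token patterns; an illustrative pattern (not taken from this diff) using one of them:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# Match any token whose length is at least 10 characters.
matcher.add("LONG_TOKEN", [[{"LENGTH": {">=": 10}}]])

doc = nlp("unquestionably short")
print([doc[start:end].text for _, start, end in matcher(doc)])  # ['unquestionably']
```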

View File

@@ -343,6 +343,14 @@ def ru_lemmatizer():
     return get_lang_class("ru")().add_pipe("lemmatizer")

+@pytest.fixture
+def ru_lookup_lemmatizer():
+    pytest.importorskip("pymorphy2")
+    return get_lang_class("ru")().add_pipe(
+        "lemmatizer", config={"mode": "pymorphy2_lookup"}
+    )

 @pytest.fixture(scope="session")
 def sa_tokenizer():
     return get_lang_class("sa")().tokenizer
@@ -422,6 +430,15 @@ def uk_lemmatizer():
     return get_lang_class("uk")().add_pipe("lemmatizer")

+@pytest.fixture
+def uk_lookup_lemmatizer():
+    pytest.importorskip("pymorphy2")
+    pytest.importorskip("pymorphy2_dicts_uk")
+    return get_lang_class("uk")().add_pipe(
+        "lemmatizer", config={"mode": "pymorphy2_lookup"}
+    )

 @pytest.fixture(scope="session")
 def ur_tokenizer():
     return get_lang_class("ur")().tokenizer

View File

@@ -128,7 +128,9 @@ def test_doc_to_json_with_token_span_attributes(doc):
     doc._.json_test1 = "hello world"
     doc._.json_test2 = [1, 2, 3]
     doc[0:1]._.span_test = "span_attribute"
+    doc[0:2]._.span_test = "span_attribute_2"
     doc[0]._.token_test = 117
+    doc[1]._.token_test = 118
     doc.spans["span_group"] = [doc[0:1]]
     json_doc = doc.to_json(
         underscore=["json_test1", "json_test2", "token_test", "span_test"]
@@ -139,8 +141,10 @@ def test_doc_to_json_with_token_span_attributes(doc):
     assert json_doc["_"]["json_test2"] == [1, 2, 3]
     assert "underscore_token" in json_doc
     assert "underscore_span" in json_doc
-    assert json_doc["underscore_token"]["token_test"]["value"] == 117
-    assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
+    assert json_doc["underscore_token"]["token_test"][0]["value"] == 117
+    assert json_doc["underscore_token"]["token_test"][1]["value"] == 118
+    assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
+    assert json_doc["underscore_span"]["span_test"][1]["value"] == "span_attribute_2"
     assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
     assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
@@ -161,8 +165,8 @@ def test_doc_to_json_with_custom_user_data(doc):
     assert json_doc["_"]["json_test"] == "hello world"
     assert "underscore_token" in json_doc
     assert "underscore_span" in json_doc
-    assert json_doc["underscore_token"]["token_test"]["value"] == 117
-    assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
+    assert json_doc["underscore_token"]["token_test"][0]["value"] == 117
+    assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
     assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
     assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
@@ -181,8 +185,8 @@ def test_doc_to_json_with_token_span_same_identifier(doc):
     assert json_doc["_"]["my_ext"] == "hello world"
     assert "underscore_token" in json_doc
     assert "underscore_span" in json_doc
-    assert json_doc["underscore_token"]["my_ext"]["value"] == 117
-    assert json_doc["underscore_span"]["my_ext"]["value"] == "span_attribute"
+    assert json_doc["underscore_token"]["my_ext"][0]["value"] == 117
+    assert json_doc["underscore_span"]["my_ext"][0]["value"] == "span_attribute"
     assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
     assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
@@ -195,10 +199,9 @@ def test_doc_to_json_with_token_attributes_missing(doc):
     doc[0]._.token_test = 117
     json_doc = doc.to_json(underscore=["span_test"])

-    assert "underscore_token" in json_doc
     assert "underscore_span" in json_doc
-    assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
-    assert "token_test" not in json_doc["underscore_token"]
+    assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
+    assert "underscore_token" not in json_doc
     assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
@@ -283,7 +286,9 @@ def test_json_to_doc_with_token_span_attributes(doc):
     doc._.json_test1 = "hello world"
     doc._.json_test2 = [1, 2, 3]
     doc[0:1]._.span_test = "span_attribute"
+    doc[0:2]._.span_test = "span_attribute_2"
     doc[0]._.token_test = 117
+    doc[1]._.token_test = 118
     json_doc = doc.to_json(
         underscore=["json_test1", "json_test2", "token_test", "span_test"]
@@ -295,7 +300,9 @@ def test_json_to_doc_with_token_span_attributes(doc):
     assert new_doc._.json_test1 == "hello world"
     assert new_doc._.json_test2 == [1, 2, 3]
     assert new_doc[0]._.token_test == 117
+    assert new_doc[1]._.token_test == 118
     assert new_doc[0:1]._.span_test == "span_attribute"
+    assert new_doc[0:2]._.span_test == "span_attribute_2"
     assert new_doc.user_data == doc.user_data
     assert new_doc.to_bytes(exclude=["user_data"]) == doc.to_bytes(
         exclude=["user_data"]

View File

@@ -78,3 +78,17 @@ def test_ru_lemmatizer_punct(ru_lemmatizer):
     assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
     doc = Doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
     assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']

+def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
+    words = ["мама", "мыла", "раму"]
+    pos = ["NOUN", "VERB", "NOUN"]
+    morphs = [
+        "Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
+        "Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
+        "Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
+    ]
+    doc = Doc(ru_lookup_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
+    doc = ru_lookup_lemmatizer(doc)
+    lemmas = [token.lemma_ for token in doc]
+    assert lemmas == ["мама", "мыла", "раму"]

View File

@@ -9,3 +9,11 @@ def test_uk_lemmatizer(uk_lemmatizer):
     """Check that the default uk lemmatizer runs."""
     doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
     uk_lemmatizer(doc)
+    assert [token.lemma for token in doc]

+def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer):
+    """Check that the lookup uk lemmatizer runs."""
+    doc = Doc(uk_lookup_lemmatizer.vocab, words=["a", "b", "c"])
+    uk_lookup_lemmatizer(doc)
+    assert [token.lemma for token in doc]

View File

@@ -9,6 +9,7 @@ from spacy.compat import pickle
 from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
 from spacy.lang.en import English
 from spacy.ml import load_kb
+from spacy.ml.models.entity_linker import build_span_maker
 from spacy.pipeline import EntityLinker
 from spacy.pipeline.legacy import EntityLinker_v1
 from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
@@ -728,7 +729,11 @@
     ("Russ Cochran was a member of University of Kentucky's golf team.",
      {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
       "entities": [(0, 12, "PERSON"), (43, 51, "LOC")],
-      "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]})
+      "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}),
+    # having a blank instance shouldn't break things
+    ("The weather is nice today.",
+     {"links": {}, "entities": [],
+      "sent_starts": [1, -1, 0, 0, 0, 0]})
 ]
 GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
 # fmt: on
@@ -1211,6 +1216,21 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]):
     assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL

+def test_span_maker_forward_with_empty():
+    """The forward pass of the span maker may have a doc with no entities."""
+    nlp = English()
+    doc1 = nlp("a b c")
+    ent = doc1[0:1]
+    ent.label_ = "X"
+    doc1.ents = [ent]
+    # no entities
+    doc2 = nlp("x y z")
+    # just to get a model
+    span_maker = build_span_maker()
+    span_maker([doc1, doc2], False)

 def test_nel_candidate_processing():
     """Test that NEL handles candidate streams correctly in a set of documents with & without entities as well as empty
     documents.

View File

@@ -231,7 +231,7 @@ def test_tok2vec_listener_callback():

 def test_tok2vec_listener_overfitting():
-    """ Test that a pipeline with a listener properly overfits, even if 'tok2vec' is in the annotating components """
+    """Test that a pipeline with a listener properly overfits, even if 'tok2vec' is in the annotating components"""
     orig_config = Config().from_str(cfg_string)
     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
     train_examples = []
@@ -264,7 +264,7 @@ def test_tok2vec_listener_overfitting():

 def test_tok2vec_frozen_not_annotating():
-    """ Test that a pipeline with a frozen tok2vec raises an error when the tok2vec is not annotating """
+    """Test that a pipeline with a frozen tok2vec raises an error when the tok2vec is not annotating"""
     orig_config = Config().from_str(cfg_string)
     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
     train_examples = []
@@ -274,12 +274,16 @@ def test_tok2vec_frozen_not_annotating():
     for i in range(2):
         losses = {}
-        with pytest.raises(ValueError, match=r"the tok2vec embedding layer is not updated"):
-            nlp.update(train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"])
+        with pytest.raises(
+            ValueError, match=r"the tok2vec embedding layer is not updated"
+        ):
+            nlp.update(
+                train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"]
+            )

 def test_tok2vec_frozen_overfitting():
-    """ Test that a pipeline with a frozen & annotating tok2vec can still overfit """
+    """Test that a pipeline with a frozen & annotating tok2vec can still overfit"""
     orig_config = Config().from_str(cfg_string)
     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
     train_examples = []
@@ -289,7 +293,13 @@ def test_tok2vec_frozen_overfitting():
     for i in range(100):
         losses = {}
-        nlp.update(train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"], annotates=["tok2vec"])
+        nlp.update(
+            train_examples,
+            sgd=optimizer,
+            losses=losses,
+            exclude=["tok2vec"],
+            annotates=["tok2vec"],
+        )
     assert losses["tagger"] < 0.0001

     # test the trained model

View File

@@ -23,7 +23,7 @@ def get_textcat_bow_kwargs():

 def get_textcat_cnn_kwargs():
-    return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
+    return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}

 def get_all_params(model):
@@ -65,7 +65,7 @@ def get_tok2vec_kwargs():
     }

-def test_tok2vec():
+def make_test_tok2vec():
     return build_Tok2Vec_model(**get_tok2vec_kwargs())

View File

@@ -1608,24 +1608,20 @@ cdef class Doc:
                     Doc.set_extension(attr)
                 self._.set(attr, doc_json["_"][attr])

-        if doc_json.get("underscore_token", {}):
-            for token_attr in doc_json["underscore_token"]:
-                token_start = doc_json["underscore_token"][token_attr]["token_start"]
-                value = doc_json["underscore_token"][token_attr]["value"]
+        for token_attr in doc_json.get("underscore_token", {}):
             if not Token.has_extension(token_attr):
                 Token.set_extension(token_attr)
-                self[token_start]._.set(token_attr, value)
-
-        if doc_json.get("underscore_span", {}):
-            for span_attr in doc_json["underscore_span"]:
-                token_start = doc_json["underscore_span"][span_attr]["token_start"]
-                token_end = doc_json["underscore_span"][span_attr]["token_end"]
-                value = doc_json["underscore_span"][span_attr]["value"]
+            for token_data in doc_json["underscore_token"][token_attr]:
+                start = token_by_char(self.c, self.length, token_data["start"])
+                value = token_data["value"]
+                self[start]._.set(token_attr, value)
+
+        for span_attr in doc_json.get("underscore_span", {}):
             if not Span.has_extension(span_attr):
                 Span.set_extension(span_attr)
-                self[token_start:token_end]._.set(span_attr, value)
+            for span_data in doc_json["underscore_span"][span_attr]:
+                value = span_data["value"]
+                self.char_span(span_data["start"], span_data["end"])._.set(span_attr, value)
         return self

     def to_json(self, underscore=None):
@@ -1673,30 +1669,34 @@ cdef class Doc:
         if underscore:
             user_keys = set()
             if self.user_data:
-                data["_"] = {}
-                data["underscore_token"] = {}
-                data["underscore_span"] = {}
-                for data_key in self.user_data:
+                for data_key, value in self.user_data.copy().items():
                     if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.":
                         attr = data_key[1]
                         start = data_key[2]
                         end = data_key[3]
                         if attr in underscore:
                             user_keys.add(attr)
-                            value = self.user_data[data_key]
                             if not srsly.is_json_serializable(value):
                                 raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
                             # Check if doc attribute
                             if start is None:
+                                if "_" not in data:
+                                    data["_"] = {}
                                 data["_"][attr] = value
                             # Check if token attribute
                             elif end is None:
+                                if "underscore_token" not in data:
+                                    data["underscore_token"] = {}
                                 if attr not in data["underscore_token"]:
-                                    data["underscore_token"][attr] = {"token_start": start, "value": value}
+                                    data["underscore_token"][attr] = []
+                                data["underscore_token"][attr].append({"start": start, "value": value})
                             # Else span attribute
                             else:
+                                if "underscore_span" not in data:
+                                    data["underscore_span"] = {}
                                 if attr not in data["underscore_span"]:
-                                    data["underscore_span"][attr] = {"token_start": start, "token_end": end, "value": value}
+                                    data["underscore_span"][attr] = []
+                                data["underscore_span"][attr].append({"start": start, "end": end, "value": value})

             for attr in underscore:
                 if attr not in user_keys:
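Under the new layout above, `underscore_token`/`underscore_span` map each attribute to a list of entries keyed by character offsets, so several tokens or spans can carry the same extension; a rough round-trip sketch (offsets and printed output are illustrative):

```python
import spacy
from spacy.tokens import Doc, Token

Token.set_extension("token_test", default=None)
nlp = spacy.blank("en")
doc = nlp("hello world")
doc[0]._.token_test = 117
doc[1]._.token_test = 118

json_doc = doc.to_json(underscore=["token_test"])
# Each entry records the character offset and value, roughly:
# [{'start': 0, 'value': 117}, {'start': 6, 'value': 118}]
print(json_doc["underscore_token"]["token_test"])

# Doc.from_json restores the attributes on the corresponding tokens.
new_doc = Doc(nlp.vocab).from_json(json_doc, validate=True)
assert new_doc[1]._.token_test == 118
```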

View File

@@ -1482,7 +1482,7 @@ You'll also need to add the assets you want to track with
 </Infobox>

 ```cli
-$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
+$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose] [--quiet]
 ```

 > #### Example
@@ -1499,6 +1499,7 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
 | `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(option)~~ |
 | `--force`, `-F` | Force-updating config file. ~~bool (flag)~~ |
 | `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~ |
+| `--quiet`, `-q` | Print no output generated by DVC. ~~bool (flag)~~ |
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
 | **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |

View File

@@ -243,6 +243,27 @@ pipelines.
 > python -m spacy project run test . --vars.foo bar
 > ```

+> #### Tip: Environment Variables
+>
+> Commands in a project file are not executed in a shell, so they don't have
+> direct access to environment variables. But you can insert environment
+> variables using the `env` dictionary to make values available for
+> interpolation, just like values in `vars`. Here's an example `env` dict that
+> makes `$PATH` available as `ENV_PATH`:
+>
+> ```yaml
+> env:
+>     ENV_PATH: PATH
+> ```
+>
+> This can be used in a project command like so:
+>
+> ```yaml
+> - name: "echo-path"
+>   script:
+>     - "echo ${env.ENV_PATH}"
+> ```

 | Section | Description |
 | ------- | ----------- |
 | `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |

View File

@@ -4,12 +4,22 @@
         "code": "af",
         "name": "Afrikaans"
     },
+    {
+        "code": "am",
+        "name": "Amharic",
+        "has_examples": true
+    },
     {
         "code": "ar",
         "name": "Arabic",
         "example": "هذه جملة",
         "has_examples": true
     },
+    {
+        "code": "az",
+        "name": "Azerbaijani",
+        "has_examples": true
+    },
     {
         "code": "bg",
         "name": "Bulgarian",
@@ -142,6 +152,11 @@
         "code": "ga",
         "name": "Irish"
     },
+    {
+        "code": "grc",
+        "name": "Ancient Greek",
+        "has_examples": true
+    },
     {
         "code": "gu",
         "name": "Gujarati",
@@ -260,6 +275,10 @@
         "example": "Адамга эң кыйыны — күн сайын адам болуу",
         "has_examples": true
     },
+    {
+        "code": "la",
+        "name": "Latin"
+    },
     {
         "code": "lb",
         "name": "Luxembourgish",
@@ -448,6 +467,11 @@
         "example": "นี่คือประโยค",
         "has_examples": true
     },
+    {
+        "code": "ti",
+        "name": "Tigrinya",
+        "has_examples": true
+    },
     {
         "code": "tl",
         "name": "Tagalog"

View File

@@ -1,5 +1,46 @@
 {
     "resources": [
+        {
+            "id": "spacy-cleaner",
+            "title": "spacy-cleaner",
+            "slogan": "Easily clean text with spaCy!",
+            "description": "**spacy-cleaner** utilises spaCy `Language` models to replace, remove, and \n mutate spaCy tokens. Cleaning actions available are:\n\n* Remove/replace stopwords.\n* Remove/replace punctuation.\n* Remove/replace numbers.\n* Remove/replace emails.\n* Remove/replace URLs.\n* Perform lemmatisation.\n\nSee our [docs](https://ce11an.github.io/spacy-cleaner/) for more information.",
+            "github": "Ce11an/spacy-cleaner",
+            "pip": "spacy-cleaner",
+            "code_example": [
+                "import spacy",
+                "import spacy_cleaner",
+                "from spacy_cleaner.processing import removers, replacers, mutators",
+                "",
+                "model = spacy.load(\"en_core_web_sm\")",
+                "pipeline = spacy_cleaner.Pipeline(",
+                "    model,",
+                "    removers.remove_stopword_token,",
+                "    replacers.replace_punctuation_token,",
+                "    mutators.mutate_lemma_token,",
+                ")",
+                "",
+                "texts = [\"Hello, my name is Cellan! I love to swim!\"]",
+                "",
+                "pipeline.clean(texts)",
+                "# ['hello _IS_PUNCT_ Cellan _IS_PUNCT_ love swim _IS_PUNCT_']"
+            ],
+            "code_language": "python",
+            "url": "https://ce11an.github.io/spacy-cleaner/",
+            "image": "https://raw.githubusercontent.com/Ce11an/spacy-cleaner/main/docs/assets/images/spacemen.png",
+            "author": "Cellan Hall",
+            "author_links": {
+                "twitter": "Ce11an",
+                "github": "Ce11an",
+                "website": "https://www.linkedin.com/in/cellan-hall/"
+            },
+            "category": [
+                "extension"
+            ],
+            "tags": [
+                "text-processing"
+            ]
+        },
         {
             "id": "Zshot",
             "title": "Zshot",
@@ -2460,20 +2501,20 @@
             "import spacy",
             "from spacy_wordnet.wordnet_annotator import WordnetAnnotator ",
             "",
-            "# Load an spacy model (supported models are \"es\" and \"en\") ",
-            "nlp = spacy.load('en')",
-            "# Spacy 3.x",
-            "nlp.add_pipe(\"spacy_wordnet\", after='tagger', config={'lang': nlp.lang})",
-            "# Spacy 2.x",
+            "# Load a spaCy model (supported languages are \"es\" and \"en\") ",
+            "nlp = spacy.load('en_core_web_sm')",
+            "# spaCy 3.x",
+            "nlp.add_pipe(\"spacy_wordnet\", after='tagger')",
+            "# spaCy 2.x",
             "# nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')",
             "token = nlp('prices')[0]",
             "",
-            "# wordnet object link spacy token with nltk wordnet interface by giving acces to",
+            "# WordNet object links spaCy token with NLTK WordNet interface by giving access to",
            "# synsets and lemmas ",
             "token._.wordnet.synsets()",
             "token._.wordnet.lemmas()",
             "",
-            "# And automatically tags with wordnet domains",
+            "# And automatically add info about WordNet domains",
             "token._.wordnet.wordnet_domains()"
         ],
         "author": "recognai",

View File

@@ -149,6 +149,9 @@
     & > span
         display: block

+    a
+        text-decoration: underline

 .small
     font-size: var(--font-size-code)
     line-height: 1.65

View File

@@ -159,6 +159,9 @@ const QuickstartInstall = ({ id, title }) => {
             setters={setters}
             showDropdown={showDropdown}
         >
+            <QS os="mac" hardware="gpu" platform="arm">
+                # Note M1 GPU support is experimental, see <a href="https://github.com/explosion/thinc/issues/792">Thinc issue #792</a>
+            </QS>
             <QS package="pip" config="venv">
                 python -m venv .env
             </QS>
@@ -198,7 +201,13 @@ const QuickstartInstall = ({ id, title }) => {
                 {nightly ? ' --pre' : ''}
             </QS>
             <QS package="conda">conda install -c conda-forge spacy</QS>
-            <QS package="conda" hardware="gpu">
+            <QS package="conda" hardware="gpu" os="windows">
+                conda install -c conda-forge cupy
+            </QS>
+            <QS package="conda" hardware="gpu" os="linux">
+                conda install -c conda-forge cupy
+            </QS>
+            <QS package="conda" hardware="gpu" os="mac" platform="x86">
                 conda install -c conda-forge cupy
             </QS>
             <QS package="conda" config="train">