mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-14 18:22:27 +03:00
Merge branch 'master' into feature/candidate-generation-by-docs
# Conflicts: # spacy/tests/pipeline/test_entity_linker.py
This commit is contained in:
commit
4f7b535ebb
1
.github/azure-steps.yml
vendored
1
.github/azure-steps.yml
vendored
|
@ -10,6 +10,7 @@ steps:
|
|||
inputs:
|
||||
versionSpec: ${{ parameters.python_version }}
|
||||
architecture: ${{ parameters.architecture }}
|
||||
allowUnstable: true
|
||||
|
||||
- bash: |
|
||||
echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
|
||||
|
|
9
.github/workflows/autoblack.yml
vendored
9
.github/workflows/autoblack.yml
vendored
|
@ -12,10 +12,10 @@ jobs:
|
|||
if: github.repository_owner == 'explosion'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: ${{ github.head_ref }}
|
||||
- uses: actions/setup-python@v2
|
||||
- uses: actions/setup-python@v3
|
||||
- run: pip install black
|
||||
- name: Auto-format code if needed
|
||||
run: black spacy
|
||||
|
@ -23,10 +23,11 @@ jobs:
|
|||
# code and makes GitHub think the action failed
|
||||
- name: Check for modified files
|
||||
id: git-check
|
||||
run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi)
|
||||
run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Create Pull Request
|
||||
if: steps.git-check.outputs.modified == 'true'
|
||||
uses: peter-evans/create-pull-request@v3
|
||||
uses: peter-evans/create-pull-request@v4
|
||||
with:
|
||||
title: Auto-format code with black
|
||||
labels: meta
|
||||
|
|
|
@ -6,7 +6,7 @@ repos:
|
|||
language_version: python3.7
|
||||
additional_dependencies: ['click==8.0.4']
|
||||
- repo: https://gitlab.com/pycqa/flake8
|
||||
rev: 3.9.2
|
||||
rev: 5.0.4
|
||||
hooks:
|
||||
- id: flake8
|
||||
args:
|
||||
|
|
|
@ -8,7 +8,7 @@ be used in real products.
|
|||
|
||||
spaCy comes with
|
||||
[pretrained pipelines](https://spacy.io/models) and
|
||||
currently supports tokenization and training for **60+ languages**. It features
|
||||
currently supports tokenization and training for **70+ languages**. It features
|
||||
state-of-the-art speed and **neural network models** for tagging,
|
||||
parsing, **named entity recognition**, **text classification** and more,
|
||||
multi-task learning with pretrained **transformers** like BERT, as well as a
|
||||
|
@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
|
|||
model packaging, deployment and workflow management. spaCy is commercial
|
||||
open-source software, released under the MIT license.
|
||||
|
||||
💫 **Version 3.4.0 out now!**
|
||||
💫 **Version 3.4 out now!**
|
||||
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
|
||||
|
||||
[](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
|
||||
|
@ -79,7 +79,7 @@ more people can benefit from it.
|
|||
|
||||
## Features
|
||||
|
||||
- Support for **60+ languages**
|
||||
- Support for **70+ languages**
|
||||
- **Trained pipelines** for different languages and tasks
|
||||
- Multi-task learning with pretrained **transformers** like BERT
|
||||
- Support for pretrained **word vectors** and embeddings
|
||||
|
|
|
@ -76,15 +76,24 @@ jobs:
|
|||
# Python39Mac:
|
||||
# imageName: "macos-latest"
|
||||
# python.version: "3.9"
|
||||
Python310Linux:
|
||||
imageName: "ubuntu-latest"
|
||||
python.version: "3.10"
|
||||
# Python310Linux:
|
||||
# imageName: "ubuntu-latest"
|
||||
# python.version: "3.10"
|
||||
Python310Windows:
|
||||
imageName: "windows-latest"
|
||||
python.version: "3.10"
|
||||
Python310Mac:
|
||||
imageName: "macos-latest"
|
||||
python.version: "3.10"
|
||||
# Python310Mac:
|
||||
# imageName: "macos-latest"
|
||||
# python.version: "3.10"
|
||||
Python311Linux:
|
||||
imageName: 'ubuntu-latest'
|
||||
python.version: '3.11.0-rc.2'
|
||||
Python311Windows:
|
||||
imageName: 'windows-latest'
|
||||
python.version: '3.11.0-rc.2'
|
||||
Python311Mac:
|
||||
imageName: 'macos-latest'
|
||||
python.version: '3.11.0-rc.2'
|
||||
maxParallel: 4
|
||||
pool:
|
||||
vmImage: $(imageName)
|
||||
|
|
|
@ -15,7 +15,7 @@ pathy>=0.3.5
|
|||
numpy>=1.15.0
|
||||
requests>=2.13.0,<3.0.0
|
||||
tqdm>=4.38.0,<5.0.0
|
||||
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
|
||||
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
|
||||
jinja2
|
||||
langcodes>=3.2.0,<4.0.0
|
||||
# Official Python utilities
|
||||
|
@ -28,7 +28,7 @@ cython>=0.25,<3.0
|
|||
pytest>=5.2.0,!=7.1.0
|
||||
pytest-timeout>=1.3.0,<2.0.0
|
||||
mock>=2.0.0,<3.0.0
|
||||
flake8>=3.8.0,<3.10.0
|
||||
flake8>=3.8.0,<6.0.0
|
||||
hypothesis>=3.27.0,<7.0.0
|
||||
mypy>=0.980,<0.990; platform_machine != "aarch64" and python_version >= "3.7"
|
||||
types-dataclasses>=0.1.3; python_version < "3.7"
|
||||
|
|
|
@ -56,7 +56,7 @@ install_requires =
|
|||
tqdm>=4.38.0,<5.0.0
|
||||
numpy>=1.15.0
|
||||
requests>=2.13.0,<3.0.0
|
||||
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0
|
||||
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
|
||||
jinja2
|
||||
# Official Python utilities
|
||||
setuptools
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# fmt: off
|
||||
__title__ = "spacy"
|
||||
__version__ = "3.4.1"
|
||||
__version__ = "3.4.2"
|
||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
__projects__ = "https://github.com/explosion/projects"
|
||||
|
|
|
@ -25,6 +25,7 @@ def project_update_dvc_cli(
|
|||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
|
||||
quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
|
||||
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
|
||||
# fmt: on
|
||||
):
|
||||
|
@ -36,7 +37,7 @@ def project_update_dvc_cli(
|
|||
|
||||
DOCS: https://spacy.io/api/cli#project-dvc
|
||||
"""
|
||||
project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
|
||||
project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
|
||||
|
||||
|
||||
def project_update_dvc(
|
||||
|
@ -44,6 +45,7 @@ def project_update_dvc(
|
|||
workflow: Optional[str] = None,
|
||||
*,
|
||||
verbose: bool = False,
|
||||
quiet: bool = False,
|
||||
force: bool = False,
|
||||
) -> None:
|
||||
"""Update the auto-generated Data Version Control (DVC) config file. A DVC
|
||||
|
@ -54,11 +56,12 @@ def project_update_dvc(
|
|||
workflow (Optional[str]): Optional name of workflow defined in project.yml.
|
||||
If not set, the first workflow will be used.
|
||||
verbose (bool): Print more info.
|
||||
quiet (bool): Print less info.
|
||||
force (bool): Force update DVC config.
|
||||
"""
|
||||
config = load_project_config(project_dir)
|
||||
updated = update_dvc_config(
|
||||
project_dir, config, workflow, verbose=verbose, force=force
|
||||
project_dir, config, workflow, verbose=verbose, quiet=quiet, force=force
|
||||
)
|
||||
help_msg = "To execute the workflow with DVC, run: dvc repro"
|
||||
if updated:
|
||||
|
@ -72,7 +75,7 @@ def update_dvc_config(
|
|||
config: Dict[str, Any],
|
||||
workflow: Optional[str] = None,
|
||||
verbose: bool = False,
|
||||
silent: bool = False,
|
||||
quiet: bool = False,
|
||||
force: bool = False,
|
||||
) -> bool:
|
||||
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
|
||||
|
@ -83,7 +86,7 @@ def update_dvc_config(
|
|||
path (Path): The path to the project directory.
|
||||
config (Dict[str, Any]): The loaded project.yml.
|
||||
verbose (bool): Whether to print additional info (via DVC).
|
||||
silent (bool): Don't output anything (via DVC).
|
||||
quiet (bool): Don't output anything (via DVC).
|
||||
force (bool): Force update, even if hashes match.
|
||||
RETURNS (bool): Whether the DVC config file was updated.
|
||||
"""
|
||||
|
@ -105,6 +108,14 @@ def update_dvc_config(
|
|||
dvc_config_path.unlink()
|
||||
dvc_commands = []
|
||||
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
||||
|
||||
# some flags that apply to every command
|
||||
flags = []
|
||||
if verbose:
|
||||
flags.append("--verbose")
|
||||
if quiet:
|
||||
flags.append("--quiet")
|
||||
|
||||
for name in workflows[workflow]:
|
||||
command = config_commands[name]
|
||||
deps = command.get("deps", [])
|
||||
|
@ -118,14 +129,26 @@ def update_dvc_config(
|
|||
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
|
||||
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
|
||||
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
|
||||
dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
|
||||
|
||||
dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
|
||||
if command.get("no_skip"):
|
||||
dvc_cmd.append("--always-changed")
|
||||
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
|
||||
dvc_commands.append(join_command(full_cmd))
|
||||
|
||||
if not dvc_commands:
|
||||
# If we don't check for this, then there will be an error when reading the
|
||||
# config, since DVC wouldn't create it.
|
||||
msg.fail(
|
||||
"No usable commands for DVC found. This can happen if none of your "
|
||||
"commands have dependencies or outputs.",
|
||||
exits=1,
|
||||
)
|
||||
|
||||
with working_dir(path):
|
||||
dvc_flags = {"--verbose": verbose, "--quiet": silent}
|
||||
run_dvc_commands(dvc_commands, flags=dvc_flags)
|
||||
for c in dvc_commands:
|
||||
dvc_command = "dvc " + c
|
||||
run_command(dvc_command)
|
||||
with dvc_config_path.open("r+", encoding="utf8") as f:
|
||||
content = f.read()
|
||||
f.seek(0, 0)
|
||||
|
@ -133,26 +156,6 @@ def update_dvc_config(
|
|||
return True
|
||||
|
||||
|
||||
def run_dvc_commands(
|
||||
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {}
|
||||
) -> None:
|
||||
"""Run a sequence of DVC commands in a subprocess, in order.
|
||||
|
||||
commands (List[str]): The string commands without the leading "dvc".
|
||||
flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
|
||||
easier to pass flags like --quiet that depend on a variable or
|
||||
command-line setting while avoiding lots of nested conditionals.
|
||||
"""
|
||||
for c in commands:
|
||||
command = split_command(c)
|
||||
dvc_command = ["dvc", *command]
|
||||
# Add the flags if they are set to True
|
||||
for flag, is_active in flags.items():
|
||||
if is_active:
|
||||
dvc_command.append(flag)
|
||||
run_command(dvc_command)
|
||||
|
||||
|
||||
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
|
||||
"""Validate workflows provided in project.yml and check that a given
|
||||
workflow can be used to generate a DVC config.
|
||||
|
|
|
@ -23,7 +23,7 @@ class RussianLemmatizer(Lemmatizer):
|
|||
overwrite: bool = False,
|
||||
scorer: Optional[Callable] = lemmatizer_score,
|
||||
) -> None:
|
||||
if mode == "pymorphy2":
|
||||
if mode in {"pymorphy2", "pymorphy2_lookup"}:
|
||||
try:
|
||||
from pymorphy2 import MorphAnalyzer
|
||||
except ImportError:
|
||||
|
|
|
@ -18,7 +18,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
|
|||
overwrite: bool = False,
|
||||
scorer: Optional[Callable] = lemmatizer_score,
|
||||
) -> None:
|
||||
if mode == "pymorphy2":
|
||||
if mode in {"pymorphy2", "pymorphy2_lookup"}:
|
||||
try:
|
||||
from pymorphy2 import MorphAnalyzer
|
||||
except ImportError:
|
||||
|
|
|
@ -71,11 +71,10 @@ def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callab
|
|||
cands.append((start_token, end_token))
|
||||
|
||||
candidates.append(ops.asarray2i(cands))
|
||||
candlens = ops.asarray1i([len(cands) for cands in candidates])
|
||||
candidates = ops.xp.concatenate(candidates)
|
||||
outputs = Ragged(candidates, candlens)
|
||||
lengths = model.ops.asarray1i([len(cands) for cands in candidates])
|
||||
out = Ragged(model.ops.flatten(candidates), lengths)
|
||||
# because this is just rearranging docs, the backprop does nothing
|
||||
return outputs, lambda x: []
|
||||
return out, lambda x: []
|
||||
|
||||
|
||||
@registry.misc("spacy.KBFromFile.v1")
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
from typing import cast, Any, Callable, Dict, Iterable, List, Optional
|
||||
from typing import Sequence, Tuple, Union
|
||||
from typing import Tuple
|
||||
from collections import Counter
|
||||
from copy import deepcopy
|
||||
from itertools import islice
|
||||
import numpy as np
|
||||
|
||||
|
@ -149,9 +148,7 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
if not any(len(doc) for doc in docs):
|
||||
# Handle cases where there are no tokens in any docs.
|
||||
n_labels = len(self.cfg["labels"])
|
||||
guesses: List[Ints2d] = [
|
||||
self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs
|
||||
]
|
||||
guesses: List[Ints2d] = [self.model.ops.alloc2i(0, n_labels) for _ in docs]
|
||||
assert len(guesses) == n_docs
|
||||
return guesses
|
||||
scores = self.model.predict(docs)
|
||||
|
|
|
@ -133,6 +133,9 @@ def make_spancat(
|
|||
spans_key (str): Key of the doc.spans dict to save the spans under. During
|
||||
initialization and training, the component will look for spans on the
|
||||
reference document under the same key.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
|
||||
spans allowed.
|
||||
threshold (float): Minimum probability to consider a prediction positive.
|
||||
Spans with a positive prediction will be saved on the Doc. Defaults to
|
||||
0.5.
|
||||
|
|
|
@ -24,8 +24,8 @@ single_label_default_config = """
|
|||
[model.tok2vec.embed]
|
||||
@architectures = "spacy.MultiHashEmbed.v2"
|
||||
width = 64
|
||||
rows = [2000, 2000, 1000, 1000, 1000, 1000]
|
||||
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
|
||||
rows = [2000, 2000, 500, 1000, 500]
|
||||
attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
|
||||
include_static_vectors = false
|
||||
|
||||
[model.tok2vec.encode]
|
||||
|
|
|
@ -24,8 +24,8 @@ multi_label_default_config = """
|
|||
[model.tok2vec.embed]
|
||||
@architectures = "spacy.MultiHashEmbed.v2"
|
||||
width = 64
|
||||
rows = [2000, 2000, 1000, 1000, 1000, 1000]
|
||||
attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
|
||||
rows = [2000, 2000, 500, 1000, 500]
|
||||
attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
|
||||
include_static_vectors = false
|
||||
|
||||
[model.tok2vec.encode]
|
||||
|
@ -96,8 +96,8 @@ def make_multilabel_textcat(
|
|||
model: Model[List[Doc], List[Floats2d]],
|
||||
threshold: float,
|
||||
scorer: Optional[Callable],
|
||||
) -> "TextCategorizer":
|
||||
"""Create a TextCategorizer component. The text categorizer predicts categories
|
||||
) -> "MultiLabel_TextCategorizer":
|
||||
"""Create a MultiLabel_TextCategorizer component. The text categorizer predicts categories
|
||||
over a whole document. It can learn one or more labels, and the labels are considered
|
||||
to be non-mutually exclusive, which means that there can be zero or more labels
|
||||
per doc).
|
||||
|
@ -105,6 +105,7 @@ def make_multilabel_textcat(
|
|||
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
|
||||
scores for each category.
|
||||
threshold (float): Cutoff to consider a prediction "positive".
|
||||
scorer (Optional[Callable]): The scoring method.
|
||||
"""
|
||||
return MultiLabel_TextCategorizer(
|
||||
nlp.vocab, model, name, threshold=threshold, scorer=scorer
|
||||
|
@ -147,6 +148,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
|
|||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
threshold (float): Cutoff to consider a prediction "positive".
|
||||
scorer (Optional[Callable]): The scoring method.
|
||||
|
||||
DOCS: https://spacy.io/api/textcategorizer#init
|
||||
"""
|
||||
|
|
|
@ -181,12 +181,12 @@ class TokenPatternNumber(BaseModel):
|
|||
IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
|
||||
IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
|
||||
INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects")
|
||||
EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
|
||||
NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
|
||||
GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
|
||||
LEQ: Union[StrictInt, StrictFloat] = Field(None, alias="<=")
|
||||
GT: Union[StrictInt, StrictFloat] = Field(None, alias=">")
|
||||
LT: Union[StrictInt, StrictFloat] = Field(None, alias="<")
|
||||
EQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="==")
|
||||
NEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="!=")
|
||||
GEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">=")
|
||||
LEQ: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<=")
|
||||
GT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias=">")
|
||||
LT: Optional[Union[StrictInt, StrictFloat]] = Field(None, alias="<")
|
||||
|
||||
class Config:
|
||||
extra = "forbid"
|
||||
|
@ -430,7 +430,7 @@ class ProjectConfigAssetURL(BaseModel):
|
|||
# fmt: off
|
||||
dest: StrictStr = Field(..., title="Destination of downloaded asset")
|
||||
url: Optional[StrictStr] = Field(None, title="URL of asset")
|
||||
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
|
||||
checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
|
||||
description: StrictStr = Field("", title="Description of asset")
|
||||
# fmt: on
|
||||
|
||||
|
@ -438,7 +438,7 @@ class ProjectConfigAssetURL(BaseModel):
|
|||
class ProjectConfigAssetGit(BaseModel):
|
||||
# fmt: off
|
||||
git: ProjectConfigAssetGitItem = Field(..., title="Git repo information")
|
||||
checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
|
||||
checksum: Optional[str] = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
|
||||
description: Optional[StrictStr] = Field(None, title="Description of asset")
|
||||
# fmt: on
|
||||
|
||||
|
@ -508,9 +508,9 @@ class DocJSONSchema(BaseModel):
|
|||
None, title="Indices of sentences' start and end indices"
|
||||
)
|
||||
text: StrictStr = Field(..., title="Document text")
|
||||
spans: Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]] = Field(
|
||||
None, title="Span information - end/start indices, label, KB ID"
|
||||
)
|
||||
spans: Optional[
|
||||
Dict[StrictStr, List[Dict[StrictStr, Union[StrictStr, StrictInt]]]]
|
||||
] = Field(None, title="Span information - end/start indices, label, KB ID")
|
||||
tokens: List[Dict[StrictStr, Union[StrictStr, StrictInt]]] = Field(
|
||||
..., title="Token information - ID, start, annotations"
|
||||
)
|
||||
|
@ -519,9 +519,9 @@ class DocJSONSchema(BaseModel):
|
|||
title="Any custom data stored in the document's _ attribute",
|
||||
alias="_",
|
||||
)
|
||||
underscore_token: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
|
||||
underscore_token: Optional[Dict[StrictStr, List[Dict[StrictStr, Any]]]] = Field(
|
||||
None, title="Any custom data stored in the token's _ attribute"
|
||||
)
|
||||
underscore_span: Optional[Dict[StrictStr, Dict[StrictStr, Any]]] = Field(
|
||||
underscore_span: Optional[Dict[StrictStr, List[Dict[StrictStr, Any]]]] = Field(
|
||||
None, title="Any custom data stored in the span's _ attribute"
|
||||
)
|
||||
|
|
|
@ -343,6 +343,14 @@ def ru_lemmatizer():
|
|||
return get_lang_class("ru")().add_pipe("lemmatizer")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def ru_lookup_lemmatizer():
|
||||
pytest.importorskip("pymorphy2")
|
||||
return get_lang_class("ru")().add_pipe(
|
||||
"lemmatizer", config={"mode": "pymorphy2_lookup"}
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def sa_tokenizer():
|
||||
return get_lang_class("sa")().tokenizer
|
||||
|
@ -422,6 +430,15 @@ def uk_lemmatizer():
|
|||
return get_lang_class("uk")().add_pipe("lemmatizer")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def uk_lookup_lemmatizer():
|
||||
pytest.importorskip("pymorphy2")
|
||||
pytest.importorskip("pymorphy2_dicts_uk")
|
||||
return get_lang_class("uk")().add_pipe(
|
||||
"lemmatizer", config={"mode": "pymorphy2_lookup"}
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def ur_tokenizer():
|
||||
return get_lang_class("ur")().tokenizer
|
||||
|
|
|
@ -128,7 +128,9 @@ def test_doc_to_json_with_token_span_attributes(doc):
|
|||
doc._.json_test1 = "hello world"
|
||||
doc._.json_test2 = [1, 2, 3]
|
||||
doc[0:1]._.span_test = "span_attribute"
|
||||
doc[0:2]._.span_test = "span_attribute_2"
|
||||
doc[0]._.token_test = 117
|
||||
doc[1]._.token_test = 118
|
||||
doc.spans["span_group"] = [doc[0:1]]
|
||||
json_doc = doc.to_json(
|
||||
underscore=["json_test1", "json_test2", "token_test", "span_test"]
|
||||
|
@ -139,8 +141,10 @@ def test_doc_to_json_with_token_span_attributes(doc):
|
|||
assert json_doc["_"]["json_test2"] == [1, 2, 3]
|
||||
assert "underscore_token" in json_doc
|
||||
assert "underscore_span" in json_doc
|
||||
assert json_doc["underscore_token"]["token_test"]["value"] == 117
|
||||
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
|
||||
assert json_doc["underscore_token"]["token_test"][0]["value"] == 117
|
||||
assert json_doc["underscore_token"]["token_test"][1]["value"] == 118
|
||||
assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
|
||||
assert json_doc["underscore_span"]["span_test"][1]["value"] == "span_attribute_2"
|
||||
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
|
||||
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
|
||||
|
||||
|
@ -161,8 +165,8 @@ def test_doc_to_json_with_custom_user_data(doc):
|
|||
assert json_doc["_"]["json_test"] == "hello world"
|
||||
assert "underscore_token" in json_doc
|
||||
assert "underscore_span" in json_doc
|
||||
assert json_doc["underscore_token"]["token_test"]["value"] == 117
|
||||
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
|
||||
assert json_doc["underscore_token"]["token_test"][0]["value"] == 117
|
||||
assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
|
||||
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
|
||||
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
|
||||
|
||||
|
@ -181,8 +185,8 @@ def test_doc_to_json_with_token_span_same_identifier(doc):
|
|||
assert json_doc["_"]["my_ext"] == "hello world"
|
||||
assert "underscore_token" in json_doc
|
||||
assert "underscore_span" in json_doc
|
||||
assert json_doc["underscore_token"]["my_ext"]["value"] == 117
|
||||
assert json_doc["underscore_span"]["my_ext"]["value"] == "span_attribute"
|
||||
assert json_doc["underscore_token"]["my_ext"][0]["value"] == 117
|
||||
assert json_doc["underscore_span"]["my_ext"][0]["value"] == "span_attribute"
|
||||
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
|
||||
assert srsly.json_loads(srsly.json_dumps(json_doc)) == json_doc
|
||||
|
||||
|
@ -195,10 +199,9 @@ def test_doc_to_json_with_token_attributes_missing(doc):
|
|||
doc[0]._.token_test = 117
|
||||
json_doc = doc.to_json(underscore=["span_test"])
|
||||
|
||||
assert "underscore_token" in json_doc
|
||||
assert "underscore_span" in json_doc
|
||||
assert json_doc["underscore_span"]["span_test"]["value"] == "span_attribute"
|
||||
assert "token_test" not in json_doc["underscore_token"]
|
||||
assert json_doc["underscore_span"]["span_test"][0]["value"] == "span_attribute"
|
||||
assert "underscore_token" not in json_doc
|
||||
assert len(schemas.validate(schemas.DocJSONSchema, json_doc)) == 0
|
||||
|
||||
|
||||
|
@ -283,7 +286,9 @@ def test_json_to_doc_with_token_span_attributes(doc):
|
|||
doc._.json_test1 = "hello world"
|
||||
doc._.json_test2 = [1, 2, 3]
|
||||
doc[0:1]._.span_test = "span_attribute"
|
||||
doc[0:2]._.span_test = "span_attribute_2"
|
||||
doc[0]._.token_test = 117
|
||||
doc[1]._.token_test = 118
|
||||
|
||||
json_doc = doc.to_json(
|
||||
underscore=["json_test1", "json_test2", "token_test", "span_test"]
|
||||
|
@ -295,7 +300,9 @@ def test_json_to_doc_with_token_span_attributes(doc):
|
|||
assert new_doc._.json_test1 == "hello world"
|
||||
assert new_doc._.json_test2 == [1, 2, 3]
|
||||
assert new_doc[0]._.token_test == 117
|
||||
assert new_doc[1]._.token_test == 118
|
||||
assert new_doc[0:1]._.span_test == "span_attribute"
|
||||
assert new_doc[0:2]._.span_test == "span_attribute_2"
|
||||
assert new_doc.user_data == doc.user_data
|
||||
assert new_doc.to_bytes(exclude=["user_data"]) == doc.to_bytes(
|
||||
exclude=["user_data"]
|
||||
|
|
|
@ -78,3 +78,17 @@ def test_ru_lemmatizer_punct(ru_lemmatizer):
|
|||
assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
|
||||
doc = Doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"])
|
||||
assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"']
|
||||
|
||||
|
||||
def test_ru_doc_lookup_lemmatization(ru_lookup_lemmatizer):
|
||||
words = ["мама", "мыла", "раму"]
|
||||
pos = ["NOUN", "VERB", "NOUN"]
|
||||
morphs = [
|
||||
"Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
|
||||
"Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
|
||||
"Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
|
||||
]
|
||||
doc = Doc(ru_lookup_lemmatizer.vocab, words=words, pos=pos, morphs=morphs)
|
||||
doc = ru_lookup_lemmatizer(doc)
|
||||
lemmas = [token.lemma_ for token in doc]
|
||||
assert lemmas == ["мама", "мыла", "раму"]
|
||||
|
|
|
@ -9,3 +9,11 @@ def test_uk_lemmatizer(uk_lemmatizer):
|
|||
"""Check that the default uk lemmatizer runs."""
|
||||
doc = Doc(uk_lemmatizer.vocab, words=["a", "b", "c"])
|
||||
uk_lemmatizer(doc)
|
||||
assert [token.lemma for token in doc]
|
||||
|
||||
|
||||
def test_uk_lookup_lemmatizer(uk_lookup_lemmatizer):
|
||||
"""Check that the lookup uk lemmatizer runs."""
|
||||
doc = Doc(uk_lookup_lemmatizer.vocab, words=["a", "b", "c"])
|
||||
uk_lookup_lemmatizer(doc)
|
||||
assert [token.lemma for token in doc]
|
||||
|
|
|
@ -9,6 +9,7 @@ from spacy.compat import pickle
|
|||
from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
|
||||
from spacy.lang.en import English
|
||||
from spacy.ml import load_kb
|
||||
from spacy.ml.models.entity_linker import build_span_maker
|
||||
from spacy.pipeline import EntityLinker
|
||||
from spacy.pipeline.legacy import EntityLinker_v1
|
||||
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
||||
|
@ -728,7 +729,11 @@ TRAIN_DATA = [
|
|||
("Russ Cochran was a member of University of Kentucky's golf team.",
|
||||
{"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}},
|
||||
"entities": [(0, 12, "PERSON"), (43, 51, "LOC")],
|
||||
"sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]})
|
||||
"sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}),
|
||||
# having a blank instance shouldn't break things
|
||||
("The weather is nice today.",
|
||||
{"links": {}, "entities": [],
|
||||
"sent_starts": [1, -1, 0, 0, 0, 0]})
|
||||
]
|
||||
GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
|
||||
# fmt: on
|
||||
|
@ -1211,6 +1216,21 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]):
|
|||
assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL
|
||||
|
||||
|
||||
def test_span_maker_forward_with_empty():
|
||||
"""The forward pass of the span maker may have a doc with no entities."""
|
||||
nlp = English()
|
||||
doc1 = nlp("a b c")
|
||||
ent = doc1[0:1]
|
||||
ent.label_ = "X"
|
||||
doc1.ents = [ent]
|
||||
# no entities
|
||||
doc2 = nlp("x y z")
|
||||
|
||||
# just to get a model
|
||||
span_maker = build_span_maker()
|
||||
span_maker([doc1, doc2], False)
|
||||
|
||||
|
||||
def test_nel_candidate_processing():
|
||||
"""Test that NEL handles candidate streams correctly in a set of documents with & without entities as well as empty
|
||||
documents.
|
||||
|
|
|
@ -231,7 +231,7 @@ def test_tok2vec_listener_callback():
|
|||
|
||||
|
||||
def test_tok2vec_listener_overfitting():
|
||||
""" Test that a pipeline with a listener properly overfits, even if 'tok2vec' is in the annotating components """
|
||||
"""Test that a pipeline with a listener properly overfits, even if 'tok2vec' is in the annotating components"""
|
||||
orig_config = Config().from_str(cfg_string)
|
||||
nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
|
||||
train_examples = []
|
||||
|
@ -264,7 +264,7 @@ def test_tok2vec_listener_overfitting():
|
|||
|
||||
|
||||
def test_tok2vec_frozen_not_annotating():
|
||||
""" Test that a pipeline with a frozen tok2vec raises an error when the tok2vec is not annotating """
|
||||
"""Test that a pipeline with a frozen tok2vec raises an error when the tok2vec is not annotating"""
|
||||
orig_config = Config().from_str(cfg_string)
|
||||
nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
|
||||
train_examples = []
|
||||
|
@ -274,12 +274,16 @@ def test_tok2vec_frozen_not_annotating():
|
|||
|
||||
for i in range(2):
|
||||
losses = {}
|
||||
with pytest.raises(ValueError, match=r"the tok2vec embedding layer is not updated"):
|
||||
nlp.update(train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"])
|
||||
with pytest.raises(
|
||||
ValueError, match=r"the tok2vec embedding layer is not updated"
|
||||
):
|
||||
nlp.update(
|
||||
train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"]
|
||||
)
|
||||
|
||||
|
||||
def test_tok2vec_frozen_overfitting():
|
||||
""" Test that a pipeline with a frozen & annotating tok2vec can still overfit """
|
||||
"""Test that a pipeline with a frozen & annotating tok2vec can still overfit"""
|
||||
orig_config = Config().from_str(cfg_string)
|
||||
nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
|
||||
train_examples = []
|
||||
|
@ -289,7 +293,13 @@ def test_tok2vec_frozen_overfitting():
|
|||
|
||||
for i in range(100):
|
||||
losses = {}
|
||||
nlp.update(train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"], annotates=["tok2vec"])
|
||||
nlp.update(
|
||||
train_examples,
|
||||
sgd=optimizer,
|
||||
losses=losses,
|
||||
exclude=["tok2vec"],
|
||||
annotates=["tok2vec"],
|
||||
)
|
||||
assert losses["tagger"] < 0.0001
|
||||
|
||||
# test the trained model
|
||||
|
|
|
@ -23,7 +23,7 @@ def get_textcat_bow_kwargs():
|
|||
|
||||
|
||||
def get_textcat_cnn_kwargs():
|
||||
return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13}
|
||||
return {"tok2vec": make_test_tok2vec(), "exclusive_classes": False, "nO": 13}
|
||||
|
||||
|
||||
def get_all_params(model):
|
||||
|
@ -65,7 +65,7 @@ def get_tok2vec_kwargs():
|
|||
}
|
||||
|
||||
|
||||
def test_tok2vec():
|
||||
def make_test_tok2vec():
|
||||
return build_Tok2Vec_model(**get_tok2vec_kwargs())
|
||||
|
||||
|
||||
|
|
|
@ -1608,24 +1608,20 @@ cdef class Doc:
|
|||
Doc.set_extension(attr)
|
||||
self._.set(attr, doc_json["_"][attr])
|
||||
|
||||
if doc_json.get("underscore_token", {}):
|
||||
for token_attr in doc_json["underscore_token"]:
|
||||
token_start = doc_json["underscore_token"][token_attr]["token_start"]
|
||||
value = doc_json["underscore_token"][token_attr]["value"]
|
||||
|
||||
for token_attr in doc_json.get("underscore_token", {}):
|
||||
if not Token.has_extension(token_attr):
|
||||
Token.set_extension(token_attr)
|
||||
self[token_start]._.set(token_attr, value)
|
||||
|
||||
if doc_json.get("underscore_span", {}):
|
||||
for span_attr in doc_json["underscore_span"]:
|
||||
token_start = doc_json["underscore_span"][span_attr]["token_start"]
|
||||
token_end = doc_json["underscore_span"][span_attr]["token_end"]
|
||||
value = doc_json["underscore_span"][span_attr]["value"]
|
||||
for token_data in doc_json["underscore_token"][token_attr]:
|
||||
start = token_by_char(self.c, self.length, token_data["start"])
|
||||
value = token_data["value"]
|
||||
self[start]._.set(token_attr, value)
|
||||
|
||||
for span_attr in doc_json.get("underscore_span", {}):
|
||||
if not Span.has_extension(span_attr):
|
||||
Span.set_extension(span_attr)
|
||||
self[token_start:token_end]._.set(span_attr, value)
|
||||
for span_data in doc_json["underscore_span"][span_attr]:
|
||||
value = span_data["value"]
|
||||
self.char_span(span_data["start"], span_data["end"])._.set(span_attr, value)
|
||||
return self
|
||||
|
||||
def to_json(self, underscore=None):
|
||||
|
@ -1673,30 +1669,34 @@ cdef class Doc:
|
|||
if underscore:
|
||||
user_keys = set()
|
||||
if self.user_data:
|
||||
data["_"] = {}
|
||||
data["underscore_token"] = {}
|
||||
data["underscore_span"] = {}
|
||||
for data_key in self.user_data:
|
||||
for data_key, value in self.user_data.copy().items():
|
||||
if type(data_key) == tuple and len(data_key) >= 4 and data_key[0] == "._.":
|
||||
attr = data_key[1]
|
||||
start = data_key[2]
|
||||
end = data_key[3]
|
||||
if attr in underscore:
|
||||
user_keys.add(attr)
|
||||
value = self.user_data[data_key]
|
||||
if not srsly.is_json_serializable(value):
|
||||
raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
|
||||
# Check if doc attribute
|
||||
if start is None:
|
||||
if "_" not in data:
|
||||
data["_"] = {}
|
||||
data["_"][attr] = value
|
||||
# Check if token attribute
|
||||
elif end is None:
|
||||
if "underscore_token" not in data:
|
||||
data["underscore_token"] = {}
|
||||
if attr not in data["underscore_token"]:
|
||||
data["underscore_token"][attr] = {"token_start": start, "value": value}
|
||||
data["underscore_token"][attr] = []
|
||||
data["underscore_token"][attr].append({"start": start, "value": value})
|
||||
# Else span attribute
|
||||
else:
|
||||
if "underscore_span" not in data:
|
||||
data["underscore_span"] = {}
|
||||
if attr not in data["underscore_span"]:
|
||||
data["underscore_span"][attr] = {"token_start": start, "token_end": end, "value": value}
|
||||
data["underscore_span"][attr] = []
|
||||
data["underscore_span"][attr].append({"start": start, "end": end, "value": value})
|
||||
|
||||
for attr in underscore:
|
||||
if attr not in user_keys:
|
||||
|
|
|
@ -1482,7 +1482,7 @@ You'll also need to add the assets you want to track with
|
|||
</Infobox>
|
||||
|
||||
```cli
|
||||
$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
|
||||
$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose] [--quiet]
|
||||
```
|
||||
|
||||
> #### Example
|
||||
|
@ -1499,6 +1499,7 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
|
|||
| `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(option)~~ |
|
||||
| `--force`, `-F` | Force-updating config file. ~~bool (flag)~~ |
|
||||
| `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~ |
|
||||
| `--quiet`, `-q` | Print no output generated by DVC. ~~bool (flag)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |
|
||||
|
||||
|
|
|
@ -243,6 +243,27 @@ pipelines.
|
|||
> python -m spacy project run test . --vars.foo bar
|
||||
> ```
|
||||
|
||||
> #### Tip: Environment Variables
|
||||
>
|
||||
> Commands in a project file are not executed in a shell, so they don't have
|
||||
> direct access to environment variables. But you can insert environment
|
||||
> variables using the `env` dictionary to make values available for
|
||||
> interpolation, just like values in `vars`. Here's an example `env` dict that
|
||||
> makes `$PATH` available as `ENV_PATH`:
|
||||
>
|
||||
> ```yaml
|
||||
> env:
|
||||
> ENV_PATH: PATH
|
||||
> ```
|
||||
>
|
||||
> This can be used in a project command like so:
|
||||
>
|
||||
> ```yaml
|
||||
> - name: "echo-path"
|
||||
> script:
|
||||
> - "echo ${env.ENV_PATH}"
|
||||
> ```
|
||||
|
||||
| Section | Description |
|
||||
| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
|
||||
|
|
|
@ -4,12 +4,22 @@
|
|||
"code": "af",
|
||||
"name": "Afrikaans"
|
||||
},
|
||||
{
|
||||
"code": "am",
|
||||
"name": "Amharic",
|
||||
"has_examples": true
|
||||
},
|
||||
{
|
||||
"code": "ar",
|
||||
"name": "Arabic",
|
||||
"example": "هذه جملة",
|
||||
"has_examples": true
|
||||
},
|
||||
{
|
||||
"code": "az",
|
||||
"name": "Azerbaijani",
|
||||
"has_examples": true
|
||||
},
|
||||
{
|
||||
"code": "bg",
|
||||
"name": "Bulgarian",
|
||||
|
@ -142,6 +152,11 @@
|
|||
"code": "ga",
|
||||
"name": "Irish"
|
||||
},
|
||||
{
|
||||
"code": "grc",
|
||||
"name": "Ancient Greek",
|
||||
"has_examples": true
|
||||
},
|
||||
{
|
||||
"code": "gu",
|
||||
"name": "Gujarati",
|
||||
|
@ -260,6 +275,10 @@
|
|||
"example": "Адамга эң кыйыны — күн сайын адам болуу",
|
||||
"has_examples": true
|
||||
},
|
||||
{
|
||||
"code": "la",
|
||||
"name": "Latin"
|
||||
},
|
||||
{
|
||||
"code": "lb",
|
||||
"name": "Luxembourgish",
|
||||
|
@ -448,6 +467,11 @@
|
|||
"example": "นี่คือประโยค",
|
||||
"has_examples": true
|
||||
},
|
||||
{
|
||||
"code": "ti",
|
||||
"name": "Tigrinya",
|
||||
"has_examples": true
|
||||
},
|
||||
{
|
||||
"code": "tl",
|
||||
"name": "Tagalog"
|
||||
|
|
|
@ -1,5 +1,46 @@
|
|||
{
|
||||
"resources": [
|
||||
{
|
||||
"id": "spacy-cleaner",
|
||||
"title": "spacy-cleaner",
|
||||
"slogan": "Easily clean text with spaCy!",
|
||||
"description": "**spacy-cleaner** utilises spaCy `Language` models to replace, remove, and \n mutate spaCy tokens. Cleaning actions available are:\n\n* Remove/replace stopwords.\n* Remove/replace punctuation.\n* Remove/replace numbers.\n* Remove/replace emails.\n* Remove/replace URLs.\n* Perform lemmatisation.\n\nSee our [docs](https://ce11an.github.io/spacy-cleaner/) for more information.",
|
||||
"github": "Ce11an/spacy-cleaner",
|
||||
"pip": "spacy-cleaner",
|
||||
"code_example": [
|
||||
"import spacy",
|
||||
"import spacy_cleaner",
|
||||
"from spacy_cleaner.processing import removers, replacers, mutators",
|
||||
"",
|
||||
"model = spacy.load(\"en_core_web_sm\")",
|
||||
"pipeline = spacy_cleaner.Pipeline(",
|
||||
" model,",
|
||||
" removers.remove_stopword_token,",
|
||||
" replacers.replace_punctuation_token,",
|
||||
" mutators.mutate_lemma_token,",
|
||||
")",
|
||||
"",
|
||||
"texts = [\"Hello, my name is Cellan! I love to swim!\"]",
|
||||
"",
|
||||
"pipeline.clean(texts)",
|
||||
"# ['hello _IS_PUNCT_ Cellan _IS_PUNCT_ love swim _IS_PUNCT_']"
|
||||
],
|
||||
"code_language": "python",
|
||||
"url": "https://ce11an.github.io/spacy-cleaner/",
|
||||
"image": "https://raw.githubusercontent.com/Ce11an/spacy-cleaner/main/docs/assets/images/spacemen.png",
|
||||
"author": "Cellan Hall",
|
||||
"author_links": {
|
||||
"twitter": "Ce11an",
|
||||
"github": "Ce11an",
|
||||
"website": "https://www.linkedin.com/in/cellan-hall/"
|
||||
},
|
||||
"category": [
|
||||
"extension"
|
||||
],
|
||||
"tags": [
|
||||
"text-processing"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "Zshot",
|
||||
"title": "Zshot",
|
||||
|
@ -2460,20 +2501,20 @@
|
|||
"import spacy",
|
||||
"from spacy_wordnet.wordnet_annotator import WordnetAnnotator ",
|
||||
"",
|
||||
"# Load an spacy model (supported models are \"es\" and \"en\") ",
|
||||
"nlp = spacy.load('en')",
|
||||
"# Spacy 3.x",
|
||||
"nlp.add_pipe(\"spacy_wordnet\", after='tagger', config={'lang': nlp.lang})",
|
||||
"# Spacy 2.x",
|
||||
"# Load a spaCy model (supported languages are \"es\" and \"en\") ",
|
||||
"nlp = spacy.load('en_core_web_sm')",
|
||||
"# spaCy 3.x",
|
||||
"nlp.add_pipe(\"spacy_wordnet\", after='tagger')",
|
||||
"# spaCy 2.x",
|
||||
"# nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')",
|
||||
"token = nlp('prices')[0]",
|
||||
"",
|
||||
"# wordnet object link spacy token with nltk wordnet interface by giving acces to",
|
||||
"# WordNet object links spaCy token with NLTK WordNet interface by giving access to",
|
||||
"# synsets and lemmas ",
|
||||
"token._.wordnet.synsets()",
|
||||
"token._.wordnet.lemmas()",
|
||||
"",
|
||||
"# And automatically tags with wordnet domains",
|
||||
"# And automatically add info about WordNet domains",
|
||||
"token._.wordnet.wordnet_domains()"
|
||||
],
|
||||
"author": "recognai",
|
||||
|
|
|
@ -149,6 +149,9 @@
|
|||
& > span
|
||||
display: block
|
||||
|
||||
a
|
||||
text-decoration: underline
|
||||
|
||||
.small
|
||||
font-size: var(--font-size-code)
|
||||
line-height: 1.65
|
||||
|
|
|
@ -159,6 +159,9 @@ const QuickstartInstall = ({ id, title }) => {
|
|||
setters={setters}
|
||||
showDropdown={showDropdown}
|
||||
>
|
||||
<QS os="mac" hardware="gpu" platform="arm">
|
||||
# Note M1 GPU support is experimental, see <a href="https://github.com/explosion/thinc/issues/792">Thinc issue #792</a>
|
||||
</QS>
|
||||
<QS package="pip" config="venv">
|
||||
python -m venv .env
|
||||
</QS>
|
||||
|
@ -198,7 +201,13 @@ const QuickstartInstall = ({ id, title }) => {
|
|||
{nightly ? ' --pre' : ''}
|
||||
</QS>
|
||||
<QS package="conda">conda install -c conda-forge spacy</QS>
|
||||
<QS package="conda" hardware="gpu">
|
||||
<QS package="conda" hardware="gpu" os="windows">
|
||||
conda install -c conda-forge cupy
|
||||
</QS>
|
||||
<QS package="conda" hardware="gpu" os="linux">
|
||||
conda install -c conda-forge cupy
|
||||
</QS>
|
||||
<QS package="conda" hardware="gpu" os="mac" platform="x86">
|
||||
conda install -c conda-forge cupy
|
||||
</QS>
|
||||
<QS package="conda" config="train">
|
||||
|
|
Loading…
Reference in New Issue
Block a user