mirror of
https://github.com/explosion/spaCy.git
synced 2025-04-18 08:01:58 +03:00
Merge branch 'v3.5.x' into spacy.io
This commit is contained in:
commit
4d0fb098ee
.github
CONTRIBUTING.md
azure-pipelines.yml
pyproject.toml
requirements.txt
setup.cfg
spacy
about.py
cli
errors.py
kb
language.py
lexeme.pyi
lexeme.pyx
matcher
ml/models
pipeline
tests
tokens
training
util.py
website
5
.github/azure-steps.yml
vendored
5
.github/azure-steps.yml
vendored
|
@ -59,6 +59,11 @@ steps:
|
|||
displayName: 'Test download CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -W error -m spacy info ca_core_news_sm | grep -q download_url
|
||||
displayName: 'Test download_url in info CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
|
||||
displayName: 'Test no warnings on load (#11713)'
|
||||
|
|
2
.github/workflows/autoblack.yml
vendored
2
.github/workflows/autoblack.yml
vendored
|
@ -16,7 +16,7 @@ jobs:
|
|||
with:
|
||||
ref: ${{ github.head_ref }}
|
||||
- uses: actions/setup-python@v4
|
||||
- run: pip install black
|
||||
- run: pip install black -c requirements.txt
|
||||
- name: Auto-format code if needed
|
||||
run: black spacy
|
||||
# We can't run black --check here because that returns a non-zero exit
|
||||
|
|
|
@ -173,6 +173,11 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its
|
|||
Python modules. If you've built spaCy from source, you'll already have both
|
||||
tools installed.
|
||||
|
||||
As a general rule of thumb, we use f-strings for any formatting of strings.
|
||||
One exception is calls to Python's `logging` functionality.
|
||||
To avoid unnecessary string conversions in these cases, we use string formatting
|
||||
templates with `%s` and `%d` etc.
|
||||
|
||||
**⚠️ Note that formatting and linting is currently only possible for Python
|
||||
modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**
|
||||
|
||||
|
|
|
@ -11,18 +11,28 @@ trigger:
|
|||
exclude:
|
||||
- "website/*"
|
||||
- "*.md"
|
||||
- "*.mdx"
|
||||
- ".github/workflows/*"
|
||||
pr:
|
||||
paths:
|
||||
exclude:
|
||||
- "*.md"
|
||||
- "*.mdx"
|
||||
- "website/docs/*"
|
||||
- "website/src/*"
|
||||
- "website/meta/*.tsx"
|
||||
- "website/meta/*.mjs"
|
||||
- "website/meta/languages.json"
|
||||
- "website/meta/site.json"
|
||||
- "website/meta/sidebars.json"
|
||||
- "website/meta/type-annotations.json"
|
||||
- "website/pages/*"
|
||||
- ".github/workflows/*"
|
||||
|
||||
jobs:
|
||||
# Perform basic checks for most important errors (syntax etc.) Uses the config
|
||||
# defined in .flake8 and overwrites the selected codes.
|
||||
# Check formatting and linting. Perform basic checks for most important errors
|
||||
# (syntax etc.) Uses the config defined in setup.cfg and overwrites the
|
||||
# selected codes.
|
||||
- job: "Validate"
|
||||
pool:
|
||||
vmImage: "ubuntu-latest"
|
||||
|
@ -30,6 +40,10 @@ jobs:
|
|||
- task: UsePythonVersion@0
|
||||
inputs:
|
||||
versionSpec: "3.7"
|
||||
- script: |
|
||||
pip install black -c requirements.txt
|
||||
python -m black spacy --check
|
||||
displayName: "black"
|
||||
- script: |
|
||||
pip install flake8==5.0.4
|
||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
|
||||
|
|
|
@ -5,7 +5,7 @@ requires = [
|
|||
"cymem>=2.0.2,<2.1.0",
|
||||
"preshed>=3.0.2,<3.1.0",
|
||||
"murmurhash>=0.28.0,<1.1.0",
|
||||
"thinc>=8.1.0,<8.2.0",
|
||||
"thinc>=8.1.8,<8.2.0",
|
||||
"numpy>=1.15.0",
|
||||
]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
|
|
@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0
|
|||
spacy-loggers>=1.0.0,<2.0.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.1.0,<8.2.0
|
||||
thinc>=8.1.8,<8.2.0
|
||||
ml_datasets>=0.2.0,<0.3.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
wasabi>=0.9.1,<1.2.0
|
||||
|
@ -31,10 +31,10 @@ pytest-timeout>=1.3.0,<2.0.0
|
|||
mock>=2.0.0,<3.0.0
|
||||
flake8>=3.8.0,<6.0.0
|
||||
hypothesis>=3.27.0,<7.0.0
|
||||
mypy>=0.990,<0.1000; platform_machine != "aarch64" and python_version >= "3.7"
|
||||
mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7"
|
||||
types-dataclasses>=0.1.3; python_version < "3.7"
|
||||
types-mock>=0.1.1
|
||||
types-setuptools>=57.0.0
|
||||
types-requests
|
||||
types-setuptools>=57.0.0
|
||||
black>=22.0,<23.0
|
||||
black==22.3.0
|
||||
|
|
|
@ -39,7 +39,7 @@ setup_requires =
|
|||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
thinc>=8.1.0,<8.2.0
|
||||
thinc>=8.1.8,<8.2.0
|
||||
install_requires =
|
||||
# Our libraries
|
||||
spacy-legacy>=3.0.11,<3.1.0
|
||||
|
@ -47,7 +47,7 @@ install_requires =
|
|||
murmurhash>=0.28.0,<1.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.1.0,<8.2.0
|
||||
thinc>=8.1.8,<8.2.0
|
||||
wasabi>=0.9.1,<1.2.0
|
||||
srsly>=2.4.3,<3.0.0
|
||||
catalogue>=2.0.6,<2.1.0
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# fmt: off
|
||||
__title__ = "spacy"
|
||||
__version__ = "3.5.0"
|
||||
__version__ = "3.5.1"
|
||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
__projects__ = "https://github.com/explosion/projects"
|
||||
|
|
|
@ -90,9 +90,9 @@ def parse_config_overrides(
|
|||
cli_overrides = _parse_overrides(args, is_cli=True)
|
||||
if cli_overrides:
|
||||
keys = [k for k in cli_overrides if k not in env_overrides]
|
||||
logger.debug(f"Config overrides from CLI: {keys}")
|
||||
logger.debug("Config overrides from CLI: %s", keys)
|
||||
if env_overrides:
|
||||
logger.debug(f"Config overrides from env variables: {list(env_overrides)}")
|
||||
logger.debug("Config overrides from env variables: %s", list(env_overrides))
|
||||
return {**cli_overrides, **env_overrides}
|
||||
|
||||
|
||||
|
|
|
@ -17,6 +17,7 @@ from ..pipeline import TrainablePipe
|
|||
from ..pipeline._parser_internals import nonproj
|
||||
from ..pipeline._parser_internals.nonproj import DELIMITER
|
||||
from ..pipeline import Morphologizer, SpanCategorizer
|
||||
from ..pipeline._edit_tree_internals.edit_trees import EditTrees
|
||||
from ..morphology import Morphology
|
||||
from ..language import Language
|
||||
from ..util import registry, resolve_dot_names
|
||||
|
@ -671,6 +672,59 @@ def debug_data(
|
|||
f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
|
||||
)
|
||||
|
||||
if "trainable_lemmatizer" in factory_names:
|
||||
msg.divider("Trainable Lemmatizer")
|
||||
trees_train: Set[str] = gold_train_data["lemmatizer_trees"]
|
||||
trees_dev: Set[str] = gold_dev_data["lemmatizer_trees"]
|
||||
# This is necessary context when someone is attempting to interpret whether the
|
||||
# number of trees exclusively in the dev set is meaningful.
|
||||
msg.info(f"{len(trees_train)} lemmatizer trees generated from training data")
|
||||
msg.info(f"{len(trees_dev)} lemmatizer trees generated from dev data")
|
||||
dev_not_train = trees_dev - trees_train
|
||||
|
||||
if len(dev_not_train) != 0:
|
||||
pct = len(dev_not_train) / len(trees_dev)
|
||||
msg.info(
|
||||
f"{len(dev_not_train)} lemmatizer trees ({pct*100:.1f}% of dev trees)"
|
||||
" were found exclusively in the dev data."
|
||||
)
|
||||
else:
|
||||
# Would we ever expect this case? It seems like it would be pretty rare,
|
||||
# and we might actually want a warning?
|
||||
msg.info("All trees in dev data present in training data.")
|
||||
|
||||
if gold_train_data["n_low_cardinality_lemmas"] > 0:
|
||||
n = gold_train_data["n_low_cardinality_lemmas"]
|
||||
msg.warn(f"{n} training docs with 0 or 1 unique lemmas.")
|
||||
|
||||
if gold_dev_data["n_low_cardinality_lemmas"] > 0:
|
||||
n = gold_dev_data["n_low_cardinality_lemmas"]
|
||||
msg.warn(f"{n} dev docs with 0 or 1 unique lemmas.")
|
||||
|
||||
if gold_train_data["no_lemma_annotations"] > 0:
|
||||
n = gold_train_data["no_lemma_annotations"]
|
||||
msg.warn(f"{n} training docs with no lemma annotations.")
|
||||
else:
|
||||
msg.good("All training docs have lemma annotations.")
|
||||
|
||||
if gold_dev_data["no_lemma_annotations"] > 0:
|
||||
n = gold_dev_data["no_lemma_annotations"]
|
||||
msg.warn(f"{n} dev docs with no lemma annotations.")
|
||||
else:
|
||||
msg.good("All dev docs have lemma annotations.")
|
||||
|
||||
if gold_train_data["partial_lemma_annotations"] > 0:
|
||||
n = gold_train_data["partial_lemma_annotations"]
|
||||
msg.info(f"{n} training docs with partial lemma annotations.")
|
||||
else:
|
||||
msg.good("All training docs have complete lemma annotations.")
|
||||
|
||||
if gold_dev_data["partial_lemma_annotations"] > 0:
|
||||
n = gold_dev_data["partial_lemma_annotations"]
|
||||
msg.info(f"{n} dev docs with partial lemma annotations.")
|
||||
else:
|
||||
msg.good("All dev docs have complete lemma annotations.")
|
||||
|
||||
msg.divider("Summary")
|
||||
good_counts = msg.counts[MESSAGES.GOOD]
|
||||
warn_counts = msg.counts[MESSAGES.WARN]
|
||||
|
@ -732,7 +786,13 @@ def _compile_gold(
|
|||
"n_cats_multilabel": 0,
|
||||
"n_cats_bad_values": 0,
|
||||
"texts": set(),
|
||||
"lemmatizer_trees": set(),
|
||||
"no_lemma_annotations": 0,
|
||||
"partial_lemma_annotations": 0,
|
||||
"n_low_cardinality_lemmas": 0,
|
||||
}
|
||||
if "trainable_lemmatizer" in factory_names:
|
||||
trees = EditTrees(nlp.vocab.strings)
|
||||
for eg in examples:
|
||||
gold = eg.reference
|
||||
doc = eg.predicted
|
||||
|
@ -862,6 +922,25 @@ def _compile_gold(
|
|||
data["n_nonproj"] += 1
|
||||
if nonproj.contains_cycle(aligned_heads):
|
||||
data["n_cycles"] += 1
|
||||
if "trainable_lemmatizer" in factory_names:
|
||||
# from EditTreeLemmatizer._labels_from_data
|
||||
if all(token.lemma == 0 for token in gold):
|
||||
data["no_lemma_annotations"] += 1
|
||||
continue
|
||||
if any(token.lemma == 0 for token in gold):
|
||||
data["partial_lemma_annotations"] += 1
|
||||
lemma_set = set()
|
||||
for token in gold:
|
||||
if token.lemma != 0:
|
||||
lemma_set.add(token.lemma)
|
||||
tree_id = trees.add(token.text, token.lemma_)
|
||||
tree_str = trees.tree_to_str(tree_id)
|
||||
data["lemmatizer_trees"].add(tree_str)
|
||||
# We want to identify cases where lemmas aren't assigned
|
||||
# or are all assigned the same value, as this would indicate
|
||||
# an issue since we're expecting a large set of lemmas
|
||||
if len(lemma_set) < 2 and len(gold) > 1:
|
||||
data["n_low_cardinality_lemmas"] += 1
|
||||
return data
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
from typing import Optional, Dict, Any, Union, List
|
||||
import platform
|
||||
import pkg_resources
|
||||
import json
|
||||
from pathlib import Path
|
||||
from wasabi import Printer, MarkdownRenderer
|
||||
|
@ -10,6 +9,7 @@ from ._util import app, Arg, Opt, string_to_list
|
|||
from .download import get_model_filename, get_latest_version
|
||||
from .. import util
|
||||
from .. import about
|
||||
from ..compat import importlib_metadata
|
||||
|
||||
|
||||
@app.command("info")
|
||||
|
@ -137,15 +137,14 @@ def info_installed_model_url(model: str) -> Optional[str]:
|
|||
dist-info available.
|
||||
"""
|
||||
try:
|
||||
dist = pkg_resources.get_distribution(model)
|
||||
data = json.loads(dist.get_metadata("direct_url.json"))
|
||||
return data["url"]
|
||||
except pkg_resources.DistributionNotFound:
|
||||
# no such package
|
||||
return None
|
||||
dist = importlib_metadata.distribution(model)
|
||||
text = dist.read_text("direct_url.json")
|
||||
if isinstance(text, str):
|
||||
data = json.loads(text)
|
||||
return data["url"]
|
||||
except Exception:
|
||||
# something else, like no file or invalid JSON
|
||||
return None
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def info_model_url(model: str) -> Dict[str, Any]:
|
||||
|
|
|
@ -252,7 +252,7 @@ def get_third_party_dependencies(
|
|||
raise regerr from None
|
||||
module_name = func_info.get("module") # type: ignore[attr-defined]
|
||||
if module_name: # the code is part of a module, not a --code file
|
||||
modules.add(func_info["module"].split(".")[0]) # type: ignore[index]
|
||||
modules.add(func_info["module"].split(".")[0]) # type: ignore[union-attr]
|
||||
dependencies = []
|
||||
for module_name in modules:
|
||||
if module_name in distributions:
|
||||
|
|
|
@ -39,14 +39,17 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
|
|||
# in the list.
|
||||
while commands:
|
||||
for i, cmd in enumerate(list(commands)):
|
||||
logger.debug(f"CMD: {cmd['name']}.")
|
||||
logger.debug("CMD: %s.", cmd["name"])
|
||||
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
||||
if all(dep.exists() for dep in deps):
|
||||
cmd_hash = get_command_hash("", "", deps, cmd["script"])
|
||||
for output_path in cmd.get("outputs", []):
|
||||
url = storage.pull(output_path, command_hash=cmd_hash)
|
||||
logger.debug(
|
||||
f"URL: {url} for {output_path} with command hash {cmd_hash}"
|
||||
"URL: %s for %s with command hash %s",
|
||||
url,
|
||||
output_path,
|
||||
cmd_hash,
|
||||
)
|
||||
yield url, output_path
|
||||
|
||||
|
@ -58,7 +61,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
|
|||
commands.pop(i)
|
||||
break
|
||||
else:
|
||||
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.")
|
||||
logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
|
||||
else:
|
||||
# If we didn't break the for loop, break the while loop.
|
||||
break
|
||||
|
|
|
@ -37,15 +37,15 @@ def project_push(project_dir: Path, remote: str):
|
|||
remote = config["remotes"][remote]
|
||||
storage = RemoteStorage(project_dir, remote)
|
||||
for cmd in config.get("commands", []):
|
||||
logger.debug(f"CMD: cmd['name']")
|
||||
logger.debug("CMD: %s", cmd["name"])
|
||||
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
||||
if any(not dep.exists() for dep in deps):
|
||||
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs")
|
||||
logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
|
||||
continue
|
||||
cmd_hash = get_command_hash(
|
||||
"", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
|
||||
)
|
||||
logger.debug(f"CMD_HASH: {cmd_hash}")
|
||||
logger.debug("CMD_HASH: %s", cmd_hash)
|
||||
for output_path in cmd.get("outputs", []):
|
||||
output_loc = project_dir / output_path
|
||||
if output_loc.exists() and _is_not_empty_dir(output_loc):
|
||||
|
@ -55,7 +55,7 @@ def project_push(project_dir: Path, remote: str):
|
|||
content_hash=get_content_hash(output_loc),
|
||||
)
|
||||
logger.debug(
|
||||
f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}"
|
||||
"URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
|
||||
)
|
||||
yield output_path, url
|
||||
|
||||
|
|
|
@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
|
|||
import os.path
|
||||
from pathlib import Path
|
||||
|
||||
import pkg_resources
|
||||
from wasabi import msg
|
||||
from wasabi.util import locale_escape
|
||||
import sys
|
||||
|
@ -331,6 +330,7 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
|
|||
RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts
|
||||
exist.
|
||||
"""
|
||||
import pkg_resources
|
||||
|
||||
failed_pkgs_msgs: List[str] = []
|
||||
conflicting_pkgs_msgs: List[str] = []
|
||||
|
|
|
@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and
|
|||
can help generate the best possible configuration, given a user's requirements. #}
|
||||
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
|
||||
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
|
||||
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
|
||||
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
|
||||
[paths]
|
||||
train = null
|
||||
dev = null
|
||||
|
@ -24,8 +24,11 @@ gpu_allocator = null
|
|||
lang = "{{ lang }}"
|
||||
{%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
|
||||
{%- set with_accuracy = optimize == "accuracy" -%}
|
||||
{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
|
||||
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%}
|
||||
{# The BOW textcat doesn't need a source of features, so it can omit the
|
||||
tok2vec/transformer. #}
|
||||
{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
|
||||
{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
|
||||
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
|
||||
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
|
||||
{%- else -%}
|
||||
{%- set full_pipeline = components -%}
|
||||
|
@ -156,6 +159,36 @@ grad_factor = 1.0
|
|||
sizes = [1,2,3]
|
||||
{% endif -%}
|
||||
|
||||
{% if "spancat_singlelabel" in components %}
|
||||
[components.spancat_singlelabel]
|
||||
factory = "spancat_singlelabel"
|
||||
negative_weight = 1.0
|
||||
allow_overlap = true
|
||||
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
|
||||
spans_key = "sc"
|
||||
|
||||
[components.spancat_singlelabel.model]
|
||||
@architectures = "spacy.SpanCategorizer.v1"
|
||||
|
||||
[components.spancat_singlelabel.model.reducer]
|
||||
@layers = "spacy.mean_max_reducer.v1"
|
||||
hidden_size = 128
|
||||
|
||||
[components.spancat_singlelabel.model.scorer]
|
||||
@layers = "Softmax.v2"
|
||||
|
||||
[components.spancat_singlelabel.model.tok2vec]
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.spancat_singlelabel.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
|
||||
[components.spancat_singlelabel.suggester]
|
||||
@misc = "spacy.ngram_suggester.v1"
|
||||
sizes = [1,2,3]
|
||||
{% endif %}
|
||||
|
||||
{% if "trainable_lemmatizer" in components -%}
|
||||
[components.trainable_lemmatizer]
|
||||
factory = "trainable_lemmatizer"
|
||||
|
@ -221,10 +254,16 @@ no_output_layer = false
|
|||
|
||||
{% else -%}
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
@architectures = "spacy.TextCatCNN.v2"
|
||||
exclusive_classes = true
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
nO = null
|
||||
|
||||
[components.textcat.model.tok2vec]
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.textcat.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
|
||||
|
@ -252,10 +291,16 @@ no_output_layer = false
|
|||
|
||||
{% else -%}
|
||||
[components.textcat_multilabel.model]
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
@architectures = "spacy.TextCatCNN.v2"
|
||||
exclusive_classes = false
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
nO = null
|
||||
|
||||
[components.textcat_multilabel.model.tok2vec]
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.textcat_multilabel.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
|
||||
|
@ -374,6 +419,33 @@ width = ${components.tok2vec.model.encode.width}
|
|||
sizes = [1,2,3]
|
||||
{% endif %}
|
||||
|
||||
{% if "spancat_singlelabel" in components %}
|
||||
[components.spancat_singlelabel]
|
||||
factory = "spancat_singlelabel"
|
||||
negative_weight = 1.0
|
||||
allow_overlap = true
|
||||
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
|
||||
spans_key = "sc"
|
||||
|
||||
[components.spancat_singlelabel.model]
|
||||
@architectures = "spacy.SpanCategorizer.v1"
|
||||
|
||||
[components.spancat_singlelabel.model.reducer]
|
||||
@layers = "spacy.mean_max_reducer.v1"
|
||||
hidden_size = 128
|
||||
|
||||
[components.spancat_singlelabel.model.scorer]
|
||||
@layers = "Softmax.v2"
|
||||
|
||||
[components.spancat_singlelabel.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
|
||||
[components.spancat_singlelabel.suggester]
|
||||
@misc = "spacy.ngram_suggester.v1"
|
||||
sizes = [1,2,3]
|
||||
{% endif %}
|
||||
|
||||
{% if "trainable_lemmatizer" in components -%}
|
||||
[components.trainable_lemmatizer]
|
||||
factory = "trainable_lemmatizer"
|
||||
|
|
|
@ -444,8 +444,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E133 = ("The sum of prior probabilities for alias '{alias}' should not "
|
||||
"exceed 1, but found {sum}.")
|
||||
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
|
||||
E139 = ("Knowledge base for component '{name}' is empty. Use the methods "
|
||||
"`kb.add_entity` and `kb.add_alias` to add entries.")
|
||||
E139 = ("Knowledge base for component '{name}' is empty.")
|
||||
E140 = ("The list of entities, prior probabilities and entity vectors "
|
||||
"should be of equal length.")
|
||||
E141 = ("Entity vectors should be of length {required} instead of the "
|
||||
|
@ -550,6 +549,8 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"during training, make sure to include it in 'annotating components'")
|
||||
|
||||
# New errors added in v3.x
|
||||
E850 = ("The PretrainVectors objective currently only supports default "
|
||||
"vectors, not {mode} vectors.")
|
||||
E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
|
||||
"but found value of '{val}'.")
|
||||
E852 = ("The tar file pulled from the remote attempted an unsafe path "
|
||||
|
@ -967,7 +968,8 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
|
||||
"with `displacy.serve(doc, port=port)`")
|
||||
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
|
||||
"or use `auto_switch_port=True` to pick an available port automatically.")
|
||||
"or use `auto_select_port=True` to pick an available port automatically.")
|
||||
E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")
|
||||
|
||||
|
||||
# Deprecated model shortcuts, only used in errors and warnings
|
||||
|
|
|
@ -46,6 +46,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
self._alias_index = PreshMap(nr_aliases + 1)
|
||||
self._aliases_table = alias_vec(nr_aliases + 1)
|
||||
|
||||
def is_empty(self):
|
||||
return len(self) == 0
|
||||
|
||||
def __len__(self):
|
||||
return self.get_size_entities()
|
||||
|
||||
|
|
|
@ -104,7 +104,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
|
|||
|
||||
@registry.misc("spacy.LookupsDataLoader.v1")
|
||||
def load_lookups_data(lang, tables):
|
||||
util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
|
||||
util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
|
||||
lookups = load_lookups(lang=lang, tables=tables)
|
||||
return lookups
|
||||
|
||||
|
@ -1969,7 +1969,7 @@ class Language:
|
|||
pipe = self.get_pipe(pipe_name)
|
||||
pipe_cfg = self._pipe_configs[pipe_name]
|
||||
if listeners:
|
||||
util.logger.debug(f"Replacing listeners of component '{pipe_name}'")
|
||||
util.logger.debug("Replacing listeners of component '%s'", pipe_name)
|
||||
if len(list(listeners)) != len(pipe_listeners):
|
||||
# The number of listeners defined in the component model doesn't
|
||||
# match the listeners to replace, so we won't be able to update
|
||||
|
|
|
@ -25,7 +25,8 @@ class Lexeme:
|
|||
def orth_(self) -> str: ...
|
||||
@property
|
||||
def text(self) -> str: ...
|
||||
lower: str
|
||||
orth: int
|
||||
lower: int
|
||||
norm: int
|
||||
shape: int
|
||||
prefix: int
|
||||
|
|
|
@ -199,7 +199,7 @@ cdef class Lexeme:
|
|||
return self.orth_
|
||||
|
||||
property lower:
|
||||
"""RETURNS (str): Lowercase form of the lexeme."""
|
||||
"""RETURNS (uint64): Lowercase form of the lexeme."""
|
||||
def __get__(self):
|
||||
return self.c.lower
|
||||
|
||||
|
|
|
@ -82,8 +82,12 @@ cdef class DependencyMatcher:
|
|||
"$-": self._imm_left_sib,
|
||||
"$++": self._right_sib,
|
||||
"$--": self._left_sib,
|
||||
">+": self._imm_right_child,
|
||||
">-": self._imm_left_child,
|
||||
">++": self._right_child,
|
||||
">--": self._left_child,
|
||||
"<+": self._imm_right_parent,
|
||||
"<-": self._imm_left_parent,
|
||||
"<++": self._right_parent,
|
||||
"<--": self._left_parent,
|
||||
}
|
||||
|
@ -427,12 +431,34 @@ cdef class DependencyMatcher:
|
|||
def _left_sib(self, doc, node):
|
||||
return [doc[child.i] for child in doc[node].head.children if child.i < node]
|
||||
|
||||
def _imm_right_child(self, doc, node):
|
||||
for child in doc[node].children:
|
||||
if child.i == node + 1:
|
||||
return [doc[child.i]]
|
||||
return []
|
||||
|
||||
def _imm_left_child(self, doc, node):
|
||||
for child in doc[node].children:
|
||||
if child.i == node - 1:
|
||||
return [doc[child.i]]
|
||||
return []
|
||||
|
||||
def _right_child(self, doc, node):
|
||||
return [doc[child.i] for child in doc[node].children if child.i > node]
|
||||
|
||||
def _left_child(self, doc, node):
|
||||
return [doc[child.i] for child in doc[node].children if child.i < node]
|
||||
|
||||
def _imm_right_parent(self, doc, node):
|
||||
if doc[node].head.i == node + 1:
|
||||
return [doc[node].head]
|
||||
return []
|
||||
|
||||
def _imm_left_parent(self, doc, node):
|
||||
if doc[node].head.i == node - 1:
|
||||
return [doc[node].head]
|
||||
return []
|
||||
|
||||
def _right_parent(self, doc, node):
|
||||
if doc[node].head.i > node:
|
||||
return [doc[node].head]
|
||||
|
|
|
@ -828,6 +828,11 @@ def _get_attr_values(spec, string_store):
|
|||
return attr_values
|
||||
|
||||
|
||||
def _predicate_cache_key(attr, predicate, value, *, regex=False, fuzzy=None):
|
||||
# tuple order affects performance
|
||||
return (attr, regex, fuzzy, predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
|
||||
|
||||
# These predicate helper classes are used to match the REGEX, IN, >= etc
|
||||
# extensions to the matcher introduced in #3173.
|
||||
|
||||
|
@ -847,7 +852,7 @@ class _FuzzyPredicate:
|
|||
fuzz = self.predicate[len("FUZZY"):] # number after prefix
|
||||
self.fuzzy = int(fuzz) if fuzz else -1
|
||||
self.fuzzy_compare = fuzzy_compare
|
||||
self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
self.key = _predicate_cache_key(self.attr, self.predicate, value, fuzzy=self.fuzzy)
|
||||
|
||||
def __call__(self, Token token):
|
||||
if self.is_extension:
|
||||
|
@ -869,7 +874,7 @@ class _RegexPredicate:
|
|||
self.value = re.compile(value)
|
||||
self.predicate = predicate
|
||||
self.is_extension = is_extension
|
||||
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
self.key = _predicate_cache_key(self.attr, self.predicate, value)
|
||||
if self.predicate not in self.operators:
|
||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||
|
||||
|
@ -905,7 +910,7 @@ class _SetPredicate:
|
|||
self.value = set(get_string_id(v) for v in value)
|
||||
self.predicate = predicate
|
||||
self.is_extension = is_extension
|
||||
self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
self.key = _predicate_cache_key(self.attr, self.predicate, value, regex=self.regex, fuzzy=self.fuzzy)
|
||||
if self.predicate not in self.operators:
|
||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||
|
||||
|
@ -977,7 +982,7 @@ class _ComparisonPredicate:
|
|||
self.value = value
|
||||
self.predicate = predicate
|
||||
self.is_extension = is_extension
|
||||
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
self.key = _predicate_cache_key(self.attr, self.predicate, value)
|
||||
if self.predicate not in self.operators:
|
||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||
|
||||
|
@ -1092,7 +1097,7 @@ def _get_extension_extra_predicates(spec, extra_predicates, predicate_types,
|
|||
if isinstance(value, dict):
|
||||
for type_, cls in predicate_types.items():
|
||||
if type_ in value:
|
||||
key = (attr, type_, srsly.json_dumps(value[type_], sort_keys=True))
|
||||
key = _predicate_cache_key(attr, type_, value[type_])
|
||||
if key in seen_predicates:
|
||||
output.append(seen_predicates[key])
|
||||
else:
|
||||
|
|
|
@ -89,6 +89,14 @@ def load_kb(
|
|||
return kb_from_file
|
||||
|
||||
|
||||
@registry.misc("spacy.EmptyKB.v2")
|
||||
def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
|
||||
def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
|
||||
return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
|
||||
|
||||
return empty_kb_factory
|
||||
|
||||
|
||||
@registry.misc("spacy.EmptyKB.v1")
|
||||
def empty_kb(
|
||||
entity_vector_length: int,
|
||||
|
|
|
@ -8,6 +8,7 @@ from thinc.loss import Loss
|
|||
from ...util import registry, OOV_RANK
|
||||
from ...errors import Errors
|
||||
from ...attrs import ID
|
||||
from ...vectors import Mode as VectorsMode
|
||||
|
||||
import numpy
|
||||
from functools import partial
|
||||
|
@ -23,6 +24,8 @@ def create_pretrain_vectors(
|
|||
maxout_pieces: int, hidden_size: int, loss: str
|
||||
) -> Callable[["Vocab", Model], Model]:
|
||||
def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
|
||||
if vocab.vectors.mode != VectorsMode.default:
|
||||
raise ValueError(Errors.E850.format(mode=vocab.vectors.mode))
|
||||
if vocab.vectors.shape[1] == 0:
|
||||
raise ValueError(Errors.E875)
|
||||
model = build_cloze_multi_task_model(
|
||||
|
|
|
@ -5,8 +5,8 @@ from itertools import islice
|
|||
import numpy as np
|
||||
|
||||
import srsly
|
||||
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
|
||||
from thinc.types import Floats2d, Ints1d, Ints2d
|
||||
from thinc.api import Config, Model, SequenceCategoricalCrossentropy, NumpyOps
|
||||
from thinc.types import Floats2d, Ints2d
|
||||
|
||||
from ._edit_tree_internals.edit_trees import EditTrees
|
||||
from ._edit_tree_internals.schemas import validate_edit_tree
|
||||
|
@ -20,6 +20,10 @@ from ..vocab import Vocab
|
|||
from .. import util
|
||||
|
||||
|
||||
# The cutoff value of *top_k* above which an alternative method is used to process guesses.
|
||||
TOP_K_GUARDRAIL = 20
|
||||
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
|
@ -115,6 +119,7 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
|
||||
self.cfg: Dict[str, Any] = {"labels": []}
|
||||
self.scorer = scorer
|
||||
self.numpy_ops = NumpyOps()
|
||||
|
||||
def get_loss(
|
||||
self, examples: Iterable[Example], scores: List[Floats2d]
|
||||
|
@ -144,6 +149,18 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
return float(loss), d_scores
|
||||
|
||||
def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
|
||||
if self.top_k == 1:
|
||||
scores2guesses = self._scores2guesses_top_k_equals_1
|
||||
elif self.top_k <= TOP_K_GUARDRAIL:
|
||||
scores2guesses = self._scores2guesses_top_k_greater_1
|
||||
else:
|
||||
scores2guesses = self._scores2guesses_top_k_guardrail
|
||||
# The behaviour of *_scores2guesses_top_k_greater_1()* is efficient for values
|
||||
# of *top_k>1* that are likely to be useful when the edit tree lemmatizer is used
|
||||
# for its principal purpose of lemmatizing tokens. However, the code could also
|
||||
# be used for other purposes, and with very large values of *top_k* the method
|
||||
# becomes inefficient. In such cases, *_scores2guesses_top_k_guardrail()* is used
|
||||
# instead.
|
||||
n_docs = len(list(docs))
|
||||
if not any(len(doc) for doc in docs):
|
||||
# Handle cases where there are no tokens in any docs.
|
||||
|
@ -153,20 +170,52 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
return guesses
|
||||
scores = self.model.predict(docs)
|
||||
assert len(scores) == n_docs
|
||||
guesses = self._scores2guesses(docs, scores)
|
||||
guesses = scores2guesses(docs, scores)
|
||||
assert len(guesses) == n_docs
|
||||
return guesses
|
||||
|
||||
def _scores2guesses(self, docs, scores):
|
||||
def _scores2guesses_top_k_equals_1(self, docs, scores):
|
||||
guesses = []
|
||||
for doc, doc_scores in zip(docs, scores):
|
||||
if self.top_k == 1:
|
||||
doc_guesses = doc_scores.argmax(axis=1).reshape(-1, 1)
|
||||
else:
|
||||
doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
|
||||
doc_guesses = doc_scores.argmax(axis=1)
|
||||
doc_guesses = self.numpy_ops.asarray(doc_guesses)
|
||||
|
||||
if not isinstance(doc_guesses, np.ndarray):
|
||||
doc_guesses = doc_guesses.get()
|
||||
doc_compat_guesses = []
|
||||
for i, token in enumerate(doc):
|
||||
tree_id = self.cfg["labels"][doc_guesses[i]]
|
||||
if self.trees.apply(tree_id, token.text) is not None:
|
||||
doc_compat_guesses.append(tree_id)
|
||||
else:
|
||||
doc_compat_guesses.append(-1)
|
||||
guesses.append(np.array(doc_compat_guesses))
|
||||
|
||||
return guesses
|
||||
|
||||
def _scores2guesses_top_k_greater_1(self, docs, scores):
    """Pick, per token, the best-scoring edit tree that applies to it.

    For every token the highest-scoring label is tried first. If its edit
    tree cannot be applied to the token text, that score is masked out and
    the next-best label is tried, for at most min(top_k, number of labels)
    attempts.

    docs: The documents whose tokens are being lemmatized.
    scores: One score matrix per doc, indexed as [token, label].
    RETURNS (list): One numpy array per doc holding the chosen tree id for
        each token, or -1 where no candidate tree could be applied.
    """
    # Value written over a rejected candidate so argmax moves on to the next one.
    masked = np.finfo(np.float32).min
    n_tries = min(self.top_k, len(self.labels))
    all_guesses = []
    for doc, doc_scores in zip(docs, scores):
        # NOTE(review): rejected candidates are masked in place below; whether
        # asarray() copies the incoming scores is not visible here.
        doc_scores = self.numpy_ops.asarray(doc_scores)
        tree_ids = []
        for row, token in enumerate(doc):
            chosen = -1
            for _ in range(n_tries):
                best = int(doc_scores[row].argmax())
                tree_id = self.cfg["labels"][best]
                if self.trees.apply(tree_id, token.text) is not None:
                    chosen = tree_id
                    break
                doc_scores[row, best] = masked
            tree_ids.append(chosen)
        all_guesses.append(np.array(tree_ids))
    return all_guesses
|
||||
|
||||
def _scores2guesses_top_k_guardrail(self, docs, scores):
|
||||
guesses = []
|
||||
for doc, doc_scores in zip(docs, scores):
|
||||
doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
|
||||
doc_guesses = self.numpy_ops.asarray(doc_guesses)
|
||||
|
||||
doc_compat_guesses = []
|
||||
for token, candidates in zip(doc, doc_guesses):
|
||||
|
|
|
@ -54,6 +54,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"entity_vector_length": 64,
|
||||
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
|
||||
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
|
||||
"generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
|
||||
"overwrite": True,
|
||||
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
|
||||
"use_gold_ents": True,
|
||||
|
@ -80,6 +81,7 @@ def make_entity_linker(
|
|||
get_candidates_batch: Callable[
|
||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||
],
|
||||
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
use_gold_ents: bool,
|
||||
|
@ -101,6 +103,7 @@ def make_entity_linker(
|
|||
get_candidates_batch (
|
||||
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
|
||||
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
||||
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
|
||||
scorer (Optional[Callable]): The scoring method.
|
||||
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
||||
component must provide entity annotations.
|
||||
|
@ -135,6 +138,7 @@ def make_entity_linker(
|
|||
entity_vector_length=entity_vector_length,
|
||||
get_candidates=get_candidates,
|
||||
get_candidates_batch=get_candidates_batch,
|
||||
generate_empty_kb=generate_empty_kb,
|
||||
overwrite=overwrite,
|
||||
scorer=scorer,
|
||||
use_gold_ents=use_gold_ents,
|
||||
|
@ -175,6 +179,7 @@ class EntityLinker(TrainablePipe):
|
|||
get_candidates_batch: Callable[
|
||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||
],
|
||||
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
|
||||
overwrite: bool = BACKWARD_OVERWRITE,
|
||||
scorer: Optional[Callable] = entity_linker_score,
|
||||
use_gold_ents: bool,
|
||||
|
@ -198,6 +203,7 @@ class EntityLinker(TrainablePipe):
|
|||
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
|
||||
Iterable[Candidate]]
|
||||
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
||||
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
|
||||
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
||||
component must provide entity annotations.
|
||||
|
@ -220,6 +226,7 @@ class EntityLinker(TrainablePipe):
|
|||
self.model = model
|
||||
self.name = name
|
||||
self.labels_discard = list(labels_discard)
|
||||
# how many neighbour sentences to take into account
|
||||
self.n_sents = n_sents
|
||||
self.incl_prior = incl_prior
|
||||
self.incl_context = incl_context
|
||||
|
@ -227,9 +234,7 @@ class EntityLinker(TrainablePipe):
|
|||
self.get_candidates_batch = get_candidates_batch
|
||||
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
|
||||
self.distance = CosineDistance(normalize=False)
|
||||
# how many neighbour sentences to take into account
|
||||
# create an empty KB by default
|
||||
self.kb = empty_kb(entity_vector_length)(self.vocab)
|
||||
self.kb = generate_empty_kb(self.vocab, entity_vector_length)
|
||||
self.scorer = scorer
|
||||
self.use_gold_ents = use_gold_ents
|
||||
self.candidates_batch_size = candidates_batch_size
|
||||
|
@ -250,7 +255,7 @@ class EntityLinker(TrainablePipe):
|
|||
# Raise an error if the knowledge base is not initialized.
|
||||
if self.kb is None:
|
||||
raise ValueError(Errors.E1018.format(name=self.name))
|
||||
if len(self.kb) == 0:
|
||||
if hasattr(self.kb, "is_empty") and self.kb.is_empty():
|
||||
raise ValueError(Errors.E139.format(name=self.name))
|
||||
|
||||
def initialize(
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any
|
||||
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast, Union
|
||||
from dataclasses import dataclass
|
||||
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
|
||||
from thinc.api import Optimizer
|
||||
from thinc.types import Ragged, Ints2d, Floats2d
|
||||
|
@ -43,7 +44,36 @@ maxout_pieces = 3
|
|||
depth = 4
|
||||
"""
|
||||
|
||||
# Default model config for the single-label span categorizer. The
# "spacy.SpanCategorizer.v1" architecture is configured with a "Softmax.v2"
# scorer layer and a Tok2Vec.v2 embedding/encoding stack; the parsed [model]
# section is exposed below as DEFAULT_SPANCAT_SINGLELABEL_MODEL.
spancat_singlelabel_default_config = """
[model]
@architectures = "spacy.SpanCategorizer.v1"
scorer = {"@layers": "Softmax.v2"}

[model.reducer]
@layers = spacy.mean_max_reducer.v1
hidden_size = 128

[model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"
[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = 96
rows = [5000, 1000, 2500, 1000]
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false

[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
depth = 4
"""

# Parsed [model] sections used as the "model" defaults of the factories.
DEFAULT_SPANCAT_MODEL = Config().from_str(spancat_default_config)["model"]
DEFAULT_SPANCAT_SINGLELABEL_MODEL = Config().from_str(
    spancat_singlelabel_default_config
)["model"]
|
||||
|
||||
|
||||
@runtime_checkable
|
||||
|
@ -119,10 +149,14 @@ def make_spancat(
|
|||
threshold: float,
|
||||
max_positive: Optional[int],
|
||||
) -> "SpanCategorizer":
|
||||
"""Create a SpanCategorizer component. The span categorizer consists of two
|
||||
"""Create a SpanCategorizer component and configure it for multi-label
|
||||
classification to be able to assign multiple labels for each span.
|
||||
The span categorizer consists of two
|
||||
parts: a suggester function that proposes candidate spans, and a labeller
|
||||
model that predicts one or more labels for each span.
|
||||
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
|
||||
Spans are returned as a ragged array with two integer columns, for the
|
||||
start and end positions.
|
||||
|
@ -144,12 +178,80 @@ def make_spancat(
|
|||
"""
|
||||
return SpanCategorizer(
|
||||
nlp.vocab,
|
||||
suggester=suggester,
|
||||
model=model,
|
||||
spans_key=spans_key,
|
||||
threshold=threshold,
|
||||
max_positive=max_positive,
|
||||
suggester=suggester,
|
||||
name=name,
|
||||
spans_key=spans_key,
|
||||
negative_weight=None,
|
||||
allow_overlap=True,
|
||||
max_positive=max_positive,
|
||||
threshold=threshold,
|
||||
scorer=scorer,
|
||||
add_negative_label=False,
|
||||
)
|
||||
|
||||
|
||||
@Language.factory(
    "spancat_singlelabel",
    assigns=["doc.spans"],
    default_config={
        "spans_key": "sc",
        "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
        "negative_weight": 1.0,
        "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
        "scorer": {"@scorers": "spacy.spancat_scorer.v1"},
        "allow_overlap": True,
    },
    default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
)
def make_spancat_singlelabel(
    nlp: Language,
    name: str,
    suggester: Suggester,
    model: Model[Tuple[List[Doc], Ragged], Floats2d],
    spans_key: str,
    negative_weight: float,
    allow_overlap: bool,
    scorer: Optional[Callable],
) -> "SpanCategorizer":
    """Create a SpanCategorizer component configured for multi-class
    classification, i.e. every suggested span receives at most one label.

    The component is built from a suggester function that proposes candidate
    spans and a labeller model that scores the categories for each span. This
    factory fixes max_positive to 1, enables the extra negative label and
    disables the probability threshold.

    nlp (Language): The pipeline object; its vocab is shared with the component.
    name (str): The component instance name, used to add entries to the
        losses during training.
    suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function
        that suggests spans. Spans are returned as a ragged array with two
        integer columns, for the start and end positions.
    model (Model[Tuple[List[Doc], Ragged], Floats2d]): A model instance that
        is given a list of documents and (start, end) indices representing
        candidate span offsets. The model predicts a probability for each
        category for each span.
    spans_key (str): Key of the doc.spans dict to save the spans under. During
        initialization and training, the component will look for spans on the
        reference document under the same key.
    negative_weight (float): Multiplier for the loss terms. Can be used to
        downweight the negative samples if there are too many.
    allow_overlap (bool): If True the data is assumed to contain overlapping
        spans. Otherwise it produces non-overlapping spans greedily
        prioritizing higher assigned label scores.
    scorer (Optional[Callable]): The scoring method. Defaults to
        Scorer.score_spans for the Doc.spans[spans_key] with overlapping
        spans allowed.
    RETURNS (SpanCategorizer): The configured component.
    """
    return SpanCategorizer(
        nlp.vocab,
        name=name,
        model=model,
        suggester=suggester,
        scorer=scorer,
        spans_key=spans_key,
        allow_overlap=allow_overlap,
        negative_weight=negative_weight,
        # Settings that make the component single-label.
        add_negative_label=True,
        max_positive=1,
        threshold=None,
    )
|
||||
|
||||
|
@ -172,6 +274,27 @@ def make_spancat_scorer():
|
|||
return spancat_score
|
||||
|
||||
|
||||
@dataclass
|
||||
class _Intervals:
|
||||
"""
|
||||
Helper class to avoid storing overlapping spans.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.ranges = set()
|
||||
|
||||
def add(self, i, j):
|
||||
for e in range(i, j):
|
||||
self.ranges.add(e)
|
||||
|
||||
def __contains__(self, rang):
|
||||
i, j = rang
|
||||
for e in range(i, j):
|
||||
if e in self.ranges:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
class SpanCategorizer(TrainablePipe):
|
||||
"""Pipeline component to label spans of text.
|
||||
|
||||
|
@ -185,25 +308,43 @@ class SpanCategorizer(TrainablePipe):
|
|||
suggester: Suggester,
|
||||
name: str = "spancat",
|
||||
*,
|
||||
add_negative_label: bool = False,
|
||||
spans_key: str = "spans",
|
||||
threshold: float = 0.5,
|
||||
negative_weight: Optional[float] = 1.0,
|
||||
allow_overlap: Optional[bool] = True,
|
||||
max_positive: Optional[int] = None,
|
||||
threshold: Optional[float] = 0.5,
|
||||
scorer: Optional[Callable] = spancat_score,
|
||||
) -> None:
|
||||
"""Initialize the span categorizer.
|
||||
"""Initialize the multi-label or multi-class span categorizer.
|
||||
|
||||
vocab (Vocab): The shared vocabulary.
|
||||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||
For multi-class classification (single label per span) we recommend
|
||||
using a Softmax classifier as the final layer, while for multi-label
|
||||
classification (multiple possible labels per span) we recommend Logistic.
|
||||
suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
|
||||
Spans are returned as a ragged array with two integer columns, for the
|
||||
start and end positions.
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
spans_key (str): Key of the Doc.spans dict to save the spans under.
|
||||
During initialization and training, the component will look for
|
||||
spans on the reference document under the same key. Defaults to
|
||||
`"spans"`.
|
||||
threshold (float): Minimum probability to consider a prediction
|
||||
positive. Spans with a positive prediction will be saved on the Doc.
|
||||
Defaults to 0.5.
|
||||
add_negative_label (bool): Learn to predict a special 'negative_label'
|
||||
when a Span is not annotated.
|
||||
threshold (Optional[float]): Minimum probability to consider a prediction
|
||||
positive. Defaults to 0.5. Spans with a positive prediction will be saved
|
||||
on the Doc.
|
||||
max_positive (Optional[int]): Maximum number of labels to consider
|
||||
positive per span. Defaults to None, indicating no limit.
|
||||
negative_weight (float): Multiplier for the loss terms.
|
||||
Can be used to downweight the negative samples if there are too many
|
||||
when add_negative_label is True. Otherwise it's unused.
|
||||
allow_overlap (bool): If True the data is assumed to contain overlapping spans.
|
||||
Otherwise it produces non-overlapping spans greedily prioritizing
|
||||
higher assigned label scores. Only used when max_positive is 1.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
|
||||
spans allowed.
|
||||
|
@ -215,12 +356,17 @@ class SpanCategorizer(TrainablePipe):
|
|||
"spans_key": spans_key,
|
||||
"threshold": threshold,
|
||||
"max_positive": max_positive,
|
||||
"negative_weight": negative_weight,
|
||||
"allow_overlap": allow_overlap,
|
||||
}
|
||||
self.vocab = vocab
|
||||
self.suggester = suggester
|
||||
self.model = model
|
||||
self.name = name
|
||||
self.scorer = scorer
|
||||
self.add_negative_label = add_negative_label
|
||||
if not allow_overlap and max_positive is not None and max_positive > 1:
|
||||
raise ValueError(Errors.E1051.format(max_positive=max_positive))
|
||||
|
||||
@property
|
||||
def key(self) -> str:
|
||||
|
@ -230,6 +376,21 @@ class SpanCategorizer(TrainablePipe):
|
|||
"""
|
||||
return str(self.cfg["spans_key"])
|
||||
|
||||
def _allow_extra_label(self) -> None:
    """Raise an error if the component can not add any more labels.

    The output width is read from the model's "nO" dimension or, failing
    that, from the "nO" dimension of its "output_layer" reference. If that
    width already equals the current number of labels and the component is
    not resizable, no further label fits.

    RAISES (ValueError): E922 when the output layer is full and the
        component cannot be resized.
    """
    nO = None
    if self.model.has_dim("nO"):
        nO = self.model.get_dim("nO")
    elif self.model.has_ref("output_layer"):
        output_layer = self.model.get_ref("output_layer")
        if output_layer.has_dim("nO"):
            nO = output_layer.get_dim("nO")
    if nO is not None and nO == self._n_labels and not self.is_resizable:
        # Report the width that was actually resolved above: asking the outer
        # model for "nO" again can itself fail when the value came from the
        # "output_layer" reference.
        raise ValueError(Errors.E922.format(name=self.name, nO=nO))
|
||||
|
||||
def add_label(self, label: str) -> int:
|
||||
"""Add a new label to the pipe.
|
||||
|
||||
|
@ -263,6 +424,27 @@ class SpanCategorizer(TrainablePipe):
|
|||
"""
|
||||
return list(self.labels)
|
||||
|
||||
@property
def _label_map(self) -> Dict[str, int]:
    """RETURNS (Dict[str, int]): Mapping from each label to its position
    in self.labels."""
    mapping: Dict[str, int] = {}
    for index, label in enumerate(self.labels):
        mapping[label] = index
    return mapping
|
||||
|
||||
@property
def _n_labels(self) -> int:
    """RETURNS (int): Number of labels, counting one extra slot for the
    negative label when add_negative_label is set."""
    extra = 1 if self.add_negative_label else 0
    return len(self.labels) + extra
|
||||
|
||||
@property
def _negative_label_i(self) -> Union[int, None]:
    """RETURNS (Union[int, None]): Index of the negative label, which sits
    directly after the regular labels, or None if no negative label is used."""
    if not self.add_negative_label:
        return None
    return len(self.label_data)
|
||||
|
||||
def predict(self, docs: Iterable[Doc]):
|
||||
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
||||
|
||||
|
@ -304,14 +486,24 @@ class SpanCategorizer(TrainablePipe):
|
|||
|
||||
DOCS: https://spacy.io/api/spancategorizer#set_annotations
|
||||
"""
|
||||
labels = self.labels
|
||||
indices, scores = indices_scores
|
||||
offset = 0
|
||||
for i, doc in enumerate(docs):
|
||||
indices_i = indices[i].dataXd
|
||||
doc.spans[self.key] = self._make_span_group(
|
||||
doc, indices_i, scores[offset : offset + indices.lengths[i]], labels # type: ignore[arg-type]
|
||||
)
|
||||
allow_overlap = cast(bool, self.cfg["allow_overlap"])
|
||||
if self.cfg["max_positive"] == 1:
|
||||
doc.spans[self.key] = self._make_span_group_singlelabel(
|
||||
doc,
|
||||
indices_i,
|
||||
scores[offset : offset + indices.lengths[i]],
|
||||
allow_overlap,
|
||||
)
|
||||
else:
|
||||
doc.spans[self.key] = self._make_span_group_multilabel(
|
||||
doc,
|
||||
indices_i,
|
||||
scores[offset : offset + indices.lengths[i]],
|
||||
)
|
||||
offset += indices.lengths[i]
|
||||
|
||||
def update(
|
||||
|
@ -371,9 +563,11 @@ class SpanCategorizer(TrainablePipe):
|
|||
spans = Ragged(
|
||||
self.model.ops.to_numpy(spans.data), self.model.ops.to_numpy(spans.lengths)
|
||||
)
|
||||
label_map = {label: i for i, label in enumerate(self.labels)}
|
||||
target = numpy.zeros(scores.shape, dtype=scores.dtype)
|
||||
if self.add_negative_label:
|
||||
negative_spans = numpy.ones((scores.shape[0]))
|
||||
offset = 0
|
||||
label_map = self._label_map
|
||||
for i, eg in enumerate(examples):
|
||||
# Map (start, end) offset of spans to the row in the d_scores array,
|
||||
# so that we can adjust the gradient for predictions that were
|
||||
|
@ -390,10 +584,16 @@ class SpanCategorizer(TrainablePipe):
|
|||
row = spans_index[key]
|
||||
k = label_map[gold_span.label_]
|
||||
target[row, k] = 1.0
|
||||
if self.add_negative_label:
|
||||
# delete negative label target.
|
||||
negative_spans[row] = 0.0
|
||||
# The target is a flat array for all docs. Track the position
|
||||
# we're at within the flat array.
|
||||
offset += spans.lengths[i]
|
||||
target = self.model.ops.asarray(target, dtype="f") # type: ignore
|
||||
if self.add_negative_label:
|
||||
negative_samples = numpy.nonzero(negative_spans)[0]
|
||||
target[negative_samples, self._negative_label_i] = 1.0 # type: ignore
|
||||
# The target will have the values 0 (for untrue predictions) or 1
|
||||
# (for true predictions).
|
||||
# The scores should be in the range [0, 1].
|
||||
|
@ -402,6 +602,10 @@ class SpanCategorizer(TrainablePipe):
|
|||
# If the prediction is 0.9 and it's false, the gradient will be
|
||||
# 0.9 (0.9 - 0.0)
|
||||
d_scores = scores - target
|
||||
if self.add_negative_label:
|
||||
neg_weight = cast(float, self.cfg["negative_weight"])
|
||||
if neg_weight != 1.0:
|
||||
d_scores[negative_samples] *= neg_weight
|
||||
loss = float((d_scores**2).sum())
|
||||
return loss, d_scores
|
||||
|
||||
|
@ -438,7 +642,7 @@ class SpanCategorizer(TrainablePipe):
|
|||
if subbatch:
|
||||
docs = [eg.x for eg in subbatch]
|
||||
spans = build_ngram_suggester(sizes=[1])(docs)
|
||||
Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels))
|
||||
Y = self.model.ops.alloc2f(spans.dataXd.shape[0], self._n_labels)
|
||||
self.model.initialize(X=(docs, spans), Y=Y)
|
||||
else:
|
||||
self.model.initialize()
|
||||
|
@ -452,31 +656,96 @@ class SpanCategorizer(TrainablePipe):
|
|||
eg.reference.spans.get(self.key, []), allow_overlap=True
|
||||
)
|
||||
|
||||
def _make_span_group(
|
||||
self, doc: Doc, indices: Ints2d, scores: Floats2d, labels: List[str]
|
||||
def _make_span_group_multilabel(
|
||||
self,
|
||||
doc: Doc,
|
||||
indices: Ints2d,
|
||||
scores: Floats2d,
|
||||
) -> SpanGroup:
|
||||
"""Find the top-k labels for each span (k=max_positive)."""
|
||||
spans = SpanGroup(doc, name=self.key)
|
||||
max_positive = self.cfg["max_positive"]
|
||||
if scores.size == 0:
|
||||
return spans
|
||||
scores = self.model.ops.to_numpy(scores)
|
||||
indices = self.model.ops.to_numpy(indices)
|
||||
threshold = self.cfg["threshold"]
|
||||
max_positive = self.cfg["max_positive"]
|
||||
|
||||
keeps = scores >= threshold
|
||||
ranked = (scores * -1).argsort() # type: ignore
|
||||
if max_positive is not None:
|
||||
assert isinstance(max_positive, int)
|
||||
if self.add_negative_label:
|
||||
negative_scores = numpy.copy(scores[:, self._negative_label_i])
|
||||
scores[:, self._negative_label_i] = -numpy.inf
|
||||
ranked = (scores * -1).argsort() # type: ignore
|
||||
scores[:, self._negative_label_i] = negative_scores
|
||||
else:
|
||||
ranked = (scores * -1).argsort() # type: ignore
|
||||
span_filter = ranked[:, max_positive:]
|
||||
for i, row in enumerate(span_filter):
|
||||
keeps[i, row] = False
|
||||
spans.attrs["scores"] = scores[keeps].flatten()
|
||||
|
||||
indices = self.model.ops.to_numpy(indices)
|
||||
keeps = self.model.ops.to_numpy(keeps)
|
||||
|
||||
attrs_scores = []
|
||||
for i in range(indices.shape[0]):
|
||||
start = indices[i, 0]
|
||||
end = indices[i, 1]
|
||||
|
||||
for j, keep in enumerate(keeps[i]):
|
||||
if keep:
|
||||
spans.append(Span(doc, start, end, label=labels[j]))
|
||||
if j != self._negative_label_i:
|
||||
spans.append(Span(doc, start, end, label=self.labels[j]))
|
||||
attrs_scores.append(scores[i, j])
|
||||
spans.attrs["scores"] = numpy.array(attrs_scores)
|
||||
return spans
|
||||
|
||||
def _make_span_group_singlelabel(
    self,
    doc: Doc,
    indices: Ints2d,
    scores: Floats2d,
    allow_overlap: bool = True,
) -> SpanGroup:
    """Find the argmax label for each span.

    doc (Doc): The document the suggested spans belong to.
    indices (Ints2d): One (start, end) token offset pair per suggested span.
    scores (Floats2d): One row of label scores per suggested span.
    allow_overlap (bool): If False, spans are visited from the highest to the
        lowest argmax score and a span is dropped when it shares a token
        with an already accepted span.
    RETURNS (SpanGroup): The accepted spans, labelled with their argmax
        label. The score of each accepted span is stored, in the same
        order, under spans.attrs["scores"].
    """
    # Handle cases when there are zero suggestions
    if scores.size == 0:
        return SpanGroup(doc, name=self.key)
    scores = self.model.ops.to_numpy(scores)
    indices = self.model.ops.to_numpy(indices)
    predicted = scores.argmax(axis=1)
    # Keep the winning score of every span as a flat vector aligned with
    # `predicted`, so all per-span arrays can be indexed the same way.
    argmax_scores = numpy.take_along_axis(
        scores, numpy.expand_dims(predicted, 1), axis=1
    )[:, 0]
    keeps = numpy.ones(predicted.shape, dtype=bool)
    # Remove samples where the negative label is the argmax.
    if self.add_negative_label:
        keeps = numpy.logical_and(keeps, predicted != self._negative_label_i)
    # Filter samples according to threshold.
    threshold = self.cfg["threshold"]
    if threshold is not None:
        keeps = numpy.logical_and(keeps, argmax_scores >= threshold)
    # Sort spans according to argmax probability
    if not allow_overlap:
        sort_idx = (argmax_scores * -1).argsort()
        # Every per-span array has to be reordered together; leaving the
        # scores unsorted would pair spans with the wrong score below.
        argmax_scores = argmax_scores[sort_idx]
        predicted = predicted[sort_idx]
        indices = indices[sort_idx]
        keeps = keeps[sort_idx]
    seen = _Intervals()
    spans = SpanGroup(doc, name=self.key)
    attrs_scores = []
    for i in range(indices.shape[0]):
        if not keeps[i]:
            continue

        label = predicted[i]
        start = indices[i, 0]
        end = indices[i, 1]

        if not allow_overlap:
            if (start, end) in seen:
                continue
            seen.add(start, end)
        attrs_scores.append(argmax_scores[i])
        spans.append(Span(doc, start, end, label=self.labels[label]))

    # Expose the scores alongside the spans, as the multilabel path does.
    spans.attrs["scores"] = numpy.array(attrs_scores)
    return spans
|
||||
|
|
|
@ -163,6 +163,18 @@ def test_char_span(doc, i_sent, i, j, text):
|
|||
assert span.text == text
|
||||
|
||||
|
||||
def test_char_span_attributes(doc):
    """Check that Doc.char_span and Span.char_span both set the label, KB ID
    and span ID passed to them, and that the two calls used here are expected
    to produce the same text."""
    attrs = {"label": "LABEL", "kb_id": "KB_ID", "span_id": "SPAN_ID"}
    from_doc = doc.char_span(20, 45, **attrs)
    from_span = doc[1:].char_span(15, 40, **attrs)
    assert from_doc.text == from_span.text
    assert from_doc.label_ == from_span.label_ == attrs["label"]
    assert from_doc.kb_id_ == from_span.kb_id_ == attrs["kb_id"]
    assert from_doc.id_ == from_span.id_ == attrs["span_id"]
|
||||
|
||||
|
||||
def test_spans_sent_spans(doc):
|
||||
sents = list(doc.sents)
|
||||
assert sents[0].start == 0
|
||||
|
@ -367,6 +379,14 @@ def test_spans_by_character(doc):
|
|||
span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk"
|
||||
)
|
||||
|
||||
# Span.char_span + alignment mode "contract"
|
||||
span2 = doc[0:2].char_span(
|
||||
span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract"
|
||||
)
|
||||
assert span1.start_char == span2.start_char
|
||||
assert span1.end_char == span2.end_char
|
||||
assert span2.label_ == "GPE"
|
||||
|
||||
|
||||
def test_span_to_array(doc):
|
||||
span = doc[1:-2]
|
||||
|
|
|
@ -316,16 +316,32 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
|
|||
("the", "brown", "$--", 0),
|
||||
("brown", "the", "$--", 1),
|
||||
("brown", "brown", "$--", 0),
|
||||
("over", "jumped", "<+", 0),
|
||||
("quick", "fox", "<+", 0),
|
||||
("the", "quick", "<+", 0),
|
||||
("brown", "fox", "<+", 1),
|
||||
("quick", "fox", "<++", 1),
|
||||
("quick", "over", "<++", 0),
|
||||
("over", "jumped", "<++", 0),
|
||||
("the", "fox", "<++", 2),
|
||||
("brown", "fox", "<-", 0),
|
||||
("fox", "over", "<-", 0),
|
||||
("the", "over", "<-", 0),
|
||||
("over", "jumped", "<-", 1),
|
||||
("brown", "fox", "<--", 0),
|
||||
("fox", "jumped", "<--", 0),
|
||||
("fox", "over", "<--", 1),
|
||||
("fox", "brown", ">+", 0),
|
||||
("over", "fox", ">+", 0),
|
||||
("over", "the", ">+", 0),
|
||||
("jumped", "over", ">+", 1),
|
||||
("jumped", "over", ">++", 1),
|
||||
("fox", "lazy", ">++", 0),
|
||||
("over", "the", ">++", 0),
|
||||
("jumped", "over", ">-", 0),
|
||||
("fox", "quick", ">-", 0),
|
||||
("brown", "quick", ">-", 0),
|
||||
("fox", "brown", ">-", 1),
|
||||
("brown", "fox", ">--", 0),
|
||||
("fox", "brown", ">--", 1),
|
||||
("jumped", "fox", ">--", 1),
|
||||
|
|
|
@ -9,6 +9,8 @@ from spacy.lang.en import English
|
|||
from spacy.lang.it import Italian
|
||||
from spacy.language import Language
|
||||
from spacy.lookups import Lookups
|
||||
from spacy.pipeline import EntityRecognizer
|
||||
from spacy.pipeline.ner import DEFAULT_NER_MODEL
|
||||
from spacy.pipeline._parser_internals.ner import BiluoPushDown
|
||||
from spacy.training import Example, iob_to_biluo, split_bilu_label
|
||||
from spacy.tokens import Doc, Span
|
||||
|
@ -16,8 +18,6 @@ from spacy.vocab import Vocab
|
|||
import logging
|
||||
|
||||
from ..util import make_tempdir
|
||||
from ...pipeline import EntityRecognizer
|
||||
from ...pipeline.ner import DEFAULT_NER_MODEL
|
||||
|
||||
TRAIN_DATA = [
|
||||
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
|
||||
|
|
|
@ -8,11 +8,11 @@ from spacy.lang.en import English
|
|||
from spacy.tokens import Doc
|
||||
from spacy.training import Example
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.pipeline import DependencyParser
|
||||
from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
|
||||
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
||||
|
||||
from ...pipeline import DependencyParser
|
||||
from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL
|
||||
from ..util import apply_transition_sequence, make_tempdir
|
||||
from ...pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
||||
|
||||
TRAIN_DATA = [
|
||||
(
|
||||
|
|
|
@ -101,14 +101,15 @@ def test_initialize_from_labels():
|
|||
}
|
||||
|
||||
|
||||
def test_no_data():
|
||||
@pytest.mark.parametrize("top_k", (1, 5, 30))
|
||||
def test_no_data(top_k):
|
||||
# Test that the lemmatizer provides a nice error when there's no tagging data / labels
|
||||
TEXTCAT_DATA = [
|
||||
("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
|
||||
("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
|
||||
]
|
||||
nlp = English()
|
||||
nlp.add_pipe("trainable_lemmatizer")
|
||||
nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
|
||||
nlp.add_pipe("textcat")
|
||||
|
||||
train_examples = []
|
||||
|
@ -119,10 +120,11 @@ def test_no_data():
|
|||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
|
||||
|
||||
def test_incomplete_data():
|
||||
@pytest.mark.parametrize("top_k", (1, 5, 30))
|
||||
def test_incomplete_data(top_k):
|
||||
# Test that the lemmatizer works with incomplete information
|
||||
nlp = English()
|
||||
lemmatizer = nlp.add_pipe("trainable_lemmatizer")
|
||||
lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
|
||||
lemmatizer.min_tree_freq = 1
|
||||
train_examples = []
|
||||
for t in PARTIAL_DATA:
|
||||
|
@ -154,9 +156,10 @@ def test_incomplete_data():
|
|||
assert xp.count_nonzero(dX[1][1]) == 0
|
||||
|
||||
|
||||
def test_overfitting_IO():
|
||||
@pytest.mark.parametrize("top_k", (1, 5, 30))
|
||||
def test_overfitting_IO(top_k):
|
||||
nlp = English()
|
||||
lemmatizer = nlp.add_pipe("trainable_lemmatizer")
|
||||
lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
|
||||
lemmatizer.min_tree_freq = 1
|
||||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
|
@ -189,7 +192,7 @@ def test_overfitting_IO():
|
|||
# Check model after a {to,from}_bytes roundtrip
|
||||
nlp_bytes = nlp.to_bytes()
|
||||
nlp3 = English()
|
||||
nlp3.add_pipe("trainable_lemmatizer")
|
||||
nlp3.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
|
||||
nlp3.from_bytes(nlp_bytes)
|
||||
doc3 = nlp3(test_text)
|
||||
assert doc3[0].lemma_ == "she"
|
||||
|
|
|
@ -353,6 +353,9 @@ def test_kb_default(nlp):
|
|||
"""Test that the default (empty) KB is loaded upon construction"""
|
||||
entity_linker = nlp.add_pipe("entity_linker", config={})
|
||||
assert len(entity_linker.kb) == 0
|
||||
with pytest.raises(ValueError, match="E139"):
|
||||
# this raises an error because the KB is empty
|
||||
entity_linker.validate_kb()
|
||||
assert entity_linker.kb.get_size_entities() == 0
|
||||
assert entity_linker.kb.get_size_aliases() == 0
|
||||
# 64 is the default value from pipeline.entity_linker
|
||||
|
|
|
@ -15,6 +15,8 @@ OPS = get_current_ops()
|
|||
|
||||
SPAN_KEY = "labeled_spans"
|
||||
|
||||
SPANCAT_COMPONENTS = ["spancat", "spancat_singlelabel"]
|
||||
|
||||
TRAIN_DATA = [
|
||||
("Who is Shaka Khan?", {"spans": {SPAN_KEY: [(7, 17, "PERSON")]}}),
|
||||
(
|
||||
|
@ -41,38 +43,42 @@ def make_examples(nlp, data=TRAIN_DATA):
|
|||
return train_examples
|
||||
|
||||
|
||||
def test_no_label():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_no_label(name):
|
||||
nlp = Language()
|
||||
nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
||||
nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||
with pytest.raises(ValueError):
|
||||
nlp.initialize()
|
||||
|
||||
|
||||
def test_no_resize():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_no_resize(name):
|
||||
nlp = Language()
|
||||
spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
||||
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||
spancat.add_label("Thing")
|
||||
spancat.add_label("Phrase")
|
||||
assert spancat.labels == ("Thing", "Phrase")
|
||||
nlp.initialize()
|
||||
assert spancat.model.get_dim("nO") == 2
|
||||
assert spancat.model.get_dim("nO") == spancat._n_labels
|
||||
# this throws an error because the spancat can't be resized after initialization
|
||||
with pytest.raises(ValueError):
|
||||
spancat.add_label("Stuff")
|
||||
|
||||
|
||||
def test_implicit_labels():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_implicit_labels(name):
|
||||
nlp = Language()
|
||||
spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
||||
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||
assert len(spancat.labels) == 0
|
||||
train_examples = make_examples(nlp)
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
assert spancat.labels == ("PERSON", "LOC")
|
||||
|
||||
|
||||
def test_explicit_labels():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_explicit_labels(name):
|
||||
nlp = Language()
|
||||
spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
||||
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||
assert len(spancat.labels) == 0
|
||||
spancat.add_label("PERSON")
|
||||
spancat.add_label("LOC")
|
||||
|
@ -102,13 +108,13 @@ def test_doc_gc():
|
|||
# XXX This fails with length 0 sometimes
|
||||
assert len(spangroup) > 0
|
||||
with pytest.raises(RuntimeError):
|
||||
span = spangroup[0]
|
||||
spangroup[0]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"max_positive,nr_results", [(None, 4), (1, 2), (2, 3), (3, 4), (4, 4)]
|
||||
)
|
||||
def test_make_spangroup(max_positive, nr_results):
|
||||
def test_make_spangroup_multilabel(max_positive, nr_results):
|
||||
fix_random_seed(0)
|
||||
nlp = Language()
|
||||
spancat = nlp.add_pipe(
|
||||
|
@ -120,10 +126,12 @@ def test_make_spangroup(max_positive, nr_results):
|
|||
indices = ngram_suggester([doc])[0].dataXd
|
||||
assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
|
||||
labels = ["Thing", "City", "Person", "GreatCity"]
|
||||
for label in labels:
|
||||
spancat.add_label(label)
|
||||
scores = numpy.asarray(
|
||||
[[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f"
|
||||
)
|
||||
spangroup = spancat._make_span_group(doc, indices, scores, labels)
|
||||
spangroup = spancat._make_span_group_multilabel(doc, indices, scores)
|
||||
assert len(spangroup) == nr_results
|
||||
|
||||
# first span is always the second token "London"
|
||||
|
@ -154,6 +162,118 @@ def test_make_spangroup(max_positive, nr_results):
|
|||
assert_almost_equal(0.9, spangroup.attrs["scores"][-1], 5)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "threshold,allow_overlap,nr_results",
    [(0.05, True, 3), (0.05, False, 1), (0.5, True, 2), (0.5, False, 1)],
)
def test_make_spangroup_singlelabel(threshold, allow_overlap, nr_results):
    """Check the spans built by ``_make_span_group_singlelabel``.

    A spancat with ``max_positive=1`` is given a hand-written score matrix for
    the three n-gram suggestions of "Greater London"; the resulting span group
    is checked for each threshold / allow_overlap combination.
    """
    fix_random_seed(0)
    nlp = Language()
    pipe_config = {
        "spans_key": SPAN_KEY,
        "threshold": threshold,
        "max_positive": 1,
    }
    spancat = nlp.add_pipe("spancat", config=pipe_config)
    doc = nlp.make_doc("Greater London")
    make_suggester = registry.misc.get("spacy.ngram_suggester.v1")
    indices = make_suggester(sizes=[1, 2])([doc])[0].dataXd
    # Suggested spans, in order: "Greater", "London", "Greater London"
    assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
    for label in ["Thing", "City", "Person", "GreatCity"]:
        spancat.add_label(label)
    # One row of scores per suggested span, one column per label
    scores = numpy.asarray(
        [[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f"
    )
    spangroup = spancat._make_span_group_singlelabel(
        doc, indices, scores, allow_overlap
    )
    assert len(spangroup) == nr_results

    # Expected (text, label) per position; a label of None is not checked.
    if threshold > 0.4:
        if allow_overlap:
            expected = [("London", "City"), ("Greater London", "GreatCity")]
        else:
            expected = [("Greater London", "GreatCity")]
    elif allow_overlap:
        expected = [
            ("Greater", "City"),
            ("London", "City"),
            ("Greater London", "GreatCity"),
        ]
    else:
        expected = [("Greater London", None)]

    for position, (text, label) in enumerate(expected):
        assert spangroup[position].text == text
        if label is not None:
            assert spangroup[position].label_ == label
|
||||
|
||||
|
||||
def test_make_spangroup_negative_label():
    """Check span-group construction with ``add_negative_label`` switched on.

    Two spancat pipes (``max_positive`` 1 and 2) receive the same hand-written
    score matrix, which has five columns for the four added labels. The spans
    from the single-label and the multi-label code path are then checked.
    """
    fix_random_seed(0)
    nlp_single = Language()
    nlp_multi = Language()

    def spancat_config(max_positive):
        return {
            "spans_key": SPAN_KEY,
            "threshold": 0.1,
            "max_positive": max_positive,
        }

    spancat_single = nlp_single.add_pipe("spancat", config=spancat_config(1))
    spancat_multi = nlp_multi.add_pipe("spancat", config=spancat_config(2))
    spancat_single.add_negative_label = True
    spancat_multi.add_negative_label = True
    doc = nlp_single.make_doc("Greater London")
    for label in ["Thing", "City", "Person", "GreatCity"]:
        spancat_multi.add_label(label)
        spancat_single.add_label(label)
    make_suggester = registry.misc.get("spacy.ngram_suggester.v1")
    indices = make_suggester(sizes=[1, 2])([doc])[0].dataXd
    # Suggested spans, in order: "Greater", "London", "Greater London"
    assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
    # NOTE(review): the fifth column presumably scores the negative label - confirm
    score_rows = [
        [0.2, 0.4, 0.3, 0.1, 0.1],
        [0.1, 0.6, 0.2, 0.4, 0.9],
        [0.8, 0.7, 0.3, 0.9, 0.1],
    ]
    scores = numpy.asarray(score_rows, dtype="f")
    spangroup_multi = spancat_multi._make_span_group_multilabel(doc, indices, scores)
    spangroup_single = spancat_single._make_span_group_singlelabel(doc, indices, scores)

    expected_single = [("Greater", "City"), ("Greater London", "GreatCity")]
    assert len(spangroup_single) == 2
    for position, (text, label) in enumerate(expected_single):
        assert spangroup_single[position].text == text
        assert spangroup_single[position].label_ == label

    expected_multi = [
        ("Greater", "City"),
        ("Greater", "Person"),
        ("London", "City"),
        ("London", "GreatCity"),
        ("Greater London", "Thing"),
        ("Greater London", "GreatCity"),
    ]
    assert len(spangroup_multi) == 6
    for position, (text, label) in enumerate(expected_multi):
        assert spangroup_multi[position].text == text
        assert spangroup_multi[position].label_ == label
|
||||
|
||||
|
||||
def test_ngram_suggester(en_tokenizer):
|
||||
# test different n-gram lengths
|
||||
for size in [1, 2, 3]:
|
||||
|
@ -371,9 +491,9 @@ def test_overfitting_IO_overlapping():
|
|||
assert set([span.label_ for span in spans2]) == {"LOC", "DOUBLE_LOC"}
|
||||
|
||||
|
||||
def test_zero_suggestions():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_zero_suggestions(name):
|
||||
# Test with a suggester that can return 0 suggestions
|
||||
|
||||
@registry.misc("test_mixed_zero_suggester")
|
||||
def make_mixed_zero_suggester():
|
||||
def mixed_zero_suggester(docs, *, ops=None):
|
||||
|
@ -400,7 +520,7 @@ def test_zero_suggestions():
|
|||
fix_random_seed(0)
|
||||
nlp = English()
|
||||
spancat = nlp.add_pipe(
|
||||
"spancat",
|
||||
name,
|
||||
config={
|
||||
"suggester": {"@misc": "test_mixed_zero_suggester"},
|
||||
"spans_key": SPAN_KEY,
|
||||
|
@ -408,7 +528,7 @@ def test_zero_suggestions():
|
|||
)
|
||||
train_examples = make_examples(nlp)
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
assert spancat.model.get_dim("nO") == 2
|
||||
assert spancat.model.get_dim("nO") == spancat._n_labels
|
||||
assert set(spancat.labels) == {"LOC", "PERSON"}
|
||||
|
||||
nlp.update(train_examples, sgd=optimizer)
|
||||
|
@ -424,9 +544,10 @@ def test_zero_suggestions():
|
|||
list(nlp.pipe(["", "one", "three three three"]))
|
||||
|
||||
|
||||
def test_set_candidates():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_set_candidates(name):
|
||||
nlp = Language()
|
||||
spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
||||
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||
train_examples = make_examples(nlp)
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
texts = [
|
||||
|
|
|
@ -1,7 +1,10 @@
|
|||
from typing import Callable
|
||||
from pathlib import Path
|
||||
from typing import Callable, Iterable, Any, Dict
|
||||
|
||||
from spacy import util
|
||||
from spacy.util import ensure_path, registry, load_model_from_config
|
||||
import srsly
|
||||
|
||||
from spacy import util, Errors
|
||||
from spacy.util import ensure_path, registry, load_model_from_config, SimpleFrozenList
|
||||
from spacy.kb.kb_in_memory import InMemoryLookupKB
|
||||
from spacy.vocab import Vocab
|
||||
from thinc.api import Config
|
||||
|
@ -91,7 +94,10 @@ def test_serialize_subclassed_kb():
|
|||
|
||||
[components.entity_linker]
|
||||
factory = "entity_linker"
|
||||
|
||||
|
||||
[components.entity_linker.generate_empty_kb]
|
||||
@misc = "kb_test.CustomEmptyKB.v1"
|
||||
|
||||
[initialize]
|
||||
|
||||
[initialize.components]
|
||||
|
@ -99,7 +105,7 @@ def test_serialize_subclassed_kb():
|
|||
[initialize.components.entity_linker]
|
||||
|
||||
[initialize.components.entity_linker.kb_loader]
|
||||
@misc = "spacy.CustomKB.v1"
|
||||
@misc = "kb_test.CustomKB.v1"
|
||||
entity_vector_length = 342
|
||||
custom_field = 666
|
||||
"""
|
||||
|
@ -109,10 +115,57 @@ def test_serialize_subclassed_kb():
|
|||
super().__init__(vocab, entity_vector_length)
|
||||
self.custom_field = custom_field
|
||||
|
||||
@registry.misc("spacy.CustomKB.v1")
|
||||
def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
    """Save the KB to the directory *path*, including ``self.custom_field``.

    Replaces InMemoryLookupKB.to_disk() so that the extra field is written
    next to the regular contents and strings. The directory is created when
    missing; a ValueError (E928) is raised if *path* is not a directory.
    """
    path = ensure_path(path)
    if not path.exists():
        path.mkdir(parents=True)
    if not path.is_dir():
        raise ValueError(Errors.E928.format(loc=path))

    def write_custom_fields(file_path: Path) -> None:
        payload = {"custom_field": self.custom_field}
        srsly.write_json(file_path, payload)

    # One writer per entry; util.to_disk decides which ones run given `exclude`.
    writers = {}
    writers["contents"] = lambda p: self.write_contents(p)
    writers["strings.json"] = lambda p: self.vocab.strings.to_disk(p)
    writers["custom_fields"] = write_custom_fields
    util.to_disk(path, writers, exclude)
|
||||
|
||||
def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
    """Load the KB from the directory *path*, restoring ``self.custom_field``.

    Replaces InMemoryLookupKB.from_disk() so that the extra field is read
    back as well. Raises ValueError with E929 if *path* does not exist and
    with E928 if it is not a directory.
    """
    path = ensure_path(path)
    if not path.exists():
        raise ValueError(Errors.E929.format(loc=path))
    if not path.is_dir():
        raise ValueError(Errors.E928.format(loc=path))

    def read_custom_fields(file_path: Path) -> None:
        stored = srsly.read_json(file_path)
        self.custom_field = stored["custom_field"]

    # One reader per entry; util.from_disk decides which ones run given `exclude`.
    readers: Dict[str, Callable[[Any], Any]] = {}
    readers["contents"] = lambda p: self.read_contents(p)
    readers["strings.json"] = lambda p: self.vocab.strings.from_disk(p)
    readers["custom_fields"] = read_custom_fields
    util.from_disk(path, readers, exclude)
|
||||
|
||||
@registry.misc("kb_test.CustomEmptyKB.v1")
def empty_custom_kb() -> Callable[[Vocab, int], SubInMemoryLookupKB]:
    """Return a factory that builds a SubInMemoryLookupKB with ``custom_field=0``."""

    def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
        kb = SubInMemoryLookupKB(
            vocab=vocab,
            entity_vector_length=entity_vector_length,
            custom_field=0,
        )
        return kb

    return empty_kb_factory
|
||||
|
||||
@registry.misc("kb_test.CustomKB.v1")
|
||||
def custom_kb(
|
||||
entity_vector_length: int, custom_field: int
|
||||
) -> Callable[[Vocab], InMemoryLookupKB]:
|
||||
) -> Callable[[Vocab], SubInMemoryLookupKB]:
|
||||
def custom_kb_factory(vocab):
|
||||
kb = SubInMemoryLookupKB(
|
||||
vocab=vocab,
|
||||
|
@ -139,6 +192,6 @@ def test_serialize_subclassed_kb():
|
|||
nlp2 = util.load_model_from_path(tmp_dir)
|
||||
entity_linker2 = nlp2.get_pipe("entity_linker")
|
||||
# After IO, the KB is still the custom subclass and its custom field is restored
|
||||
assert type(entity_linker2.kb) == InMemoryLookupKB
|
||||
assert type(entity_linker2.kb) == SubInMemoryLookupKB
|
||||
assert entity_linker2.kb.entity_vector_length == 342
|
||||
assert not hasattr(entity_linker2.kb, "custom_field")
|
||||
assert entity_linker2.kb.custom_field == 666
|
||||
|
|
|
@ -2,7 +2,6 @@ import os
|
|||
import math
|
||||
from collections import Counter
|
||||
from typing import Tuple, List, Dict, Any
|
||||
import pkg_resources
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
|
@ -29,6 +28,7 @@ from spacy.cli.debug_data import _print_span_characteristics
|
|||
from spacy.cli.debug_data import _get_spans_length_freq_dist
|
||||
from spacy.cli.download import get_compatibility, get_version
|
||||
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
|
||||
from spacy.cli.init_pipeline import _init_labels
|
||||
from spacy.cli.package import get_third_party_dependencies
|
||||
from spacy.cli.package import _is_permitted_package_name
|
||||
from spacy.cli.project.remote_storage import RemoteStorage
|
||||
|
@ -47,7 +47,6 @@ from spacy.training.converters import conll_ner_to_docs, conllu_to_docs
|
|||
from spacy.training.converters import iob_to_docs
|
||||
from spacy.util import ENV_VARS, get_minor_version, load_model_from_config, load_config
|
||||
|
||||
from ..cli.init_pipeline import _init_labels
|
||||
from .util import make_tempdir
|
||||
|
||||
|
||||
|
@ -553,7 +552,14 @@ def test_parse_cli_overrides():
|
|||
|
||||
@pytest.mark.parametrize("lang", ["en", "nl"])
|
||||
@pytest.mark.parametrize(
|
||||
"pipeline", [["tagger", "parser", "ner"], [], ["ner", "textcat", "sentencizer"]]
|
||||
"pipeline",
|
||||
[
|
||||
["tagger", "parser", "ner"],
|
||||
[],
|
||||
["ner", "textcat", "sentencizer"],
|
||||
["morphologizer", "spancat", "entity_linker"],
|
||||
["spancat_singlelabel", "textcat_multilabel"],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("optimize", ["efficiency", "accuracy"])
|
||||
@pytest.mark.parametrize("pretraining", [True, False])
|
||||
|
@ -1017,8 +1023,6 @@ def test_local_remote_storage_pull_missing():
|
|||
|
||||
|
||||
def test_cli_find_threshold(capsys):
|
||||
thresholds = numpy.linspace(0, 1, 10)
|
||||
|
||||
def make_examples(nlp: Language) -> List[Example]:
|
||||
docs: List[Example] = []
|
||||
|
||||
|
@ -1074,7 +1078,7 @@ def test_cli_find_threshold(capsys):
|
|||
)
|
||||
with make_tempdir() as nlp_dir:
|
||||
nlp.to_disk(nlp_dir)
|
||||
res = find_threshold(
|
||||
best_threshold, best_score, res = find_threshold(
|
||||
model=nlp_dir,
|
||||
data_path=docs_dir / "docs.spacy",
|
||||
pipe_name="tc_multi",
|
||||
|
@ -1082,16 +1086,14 @@ def test_cli_find_threshold(capsys):
|
|||
scores_key="cats_macro_f",
|
||||
silent=True,
|
||||
)
|
||||
assert res[0] != thresholds[0]
|
||||
assert thresholds[0] < res[0] < thresholds[9]
|
||||
assert res[1] == 1.0
|
||||
assert res[2][1.0] == 0.0
|
||||
assert best_score == max(res.values())
|
||||
assert res[1.0] == 0.0
|
||||
|
||||
# Test with spancat.
|
||||
nlp, _ = init_nlp((("spancat", {}),))
|
||||
with make_tempdir() as nlp_dir:
|
||||
nlp.to_disk(nlp_dir)
|
||||
res = find_threshold(
|
||||
best_threshold, best_score, res = find_threshold(
|
||||
model=nlp_dir,
|
||||
data_path=docs_dir / "docs.spacy",
|
||||
pipe_name="spancat",
|
||||
|
@ -1099,10 +1101,8 @@ def test_cli_find_threshold(capsys):
|
|||
scores_key="spans_sc_f",
|
||||
silent=True,
|
||||
)
|
||||
assert res[0] != thresholds[0]
|
||||
assert thresholds[0] < res[0] < thresholds[8]
|
||||
assert res[1] >= 0.6
|
||||
assert res[2][1.0] == 0.0
|
||||
assert best_score == max(res.values())
|
||||
assert res[1.0] == 0.0
|
||||
|
||||
# Having multiple textcat_multilabel components should work, since the name has to be specified.
|
||||
nlp, _ = init_nlp((("textcat_multilabel", {}),))
|
||||
|
@ -1132,6 +1132,7 @@ def test_cli_find_threshold(capsys):
|
|||
)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
|
||||
@pytest.mark.parametrize(
|
||||
"reqs,output",
|
||||
[
|
||||
|
@ -1164,6 +1165,8 @@ def test_cli_find_threshold(capsys):
|
|||
],
|
||||
)
|
||||
def test_project_check_requirements(reqs, output):
|
||||
import pkg_resources
|
||||
|
||||
# excessive guard against unlikely package name
|
||||
try:
|
||||
pkg_resources.require("spacyunknowndoesnotexist12345")
|
||||
|
@ -1207,3 +1210,69 @@ def test_walk_directory():
|
|||
assert (len(walk_directory(d, suffix="iob"))) == 2
|
||||
assert (len(walk_directory(d, suffix="conll"))) == 3
|
||||
assert (len(walk_directory(d, suffix="pdf"))) == 0
|
||||
|
||||
|
||||
def test_debug_data_trainable_lemmatizer_basic():
    """Check the lemmatizer tree count that ``_compile_gold`` reports for fully annotated examples."""
    annotated_texts = [
        ("She likes green eggs", {"lemmas": ["she", "like", "green", "egg"]}),
        ("Eat blue ham", {"lemmas": ["eat", "blue", "ham"]}),
    ]
    nlp = Language()
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in annotated_texts
    ]

    data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
    # ref test_edit_tree_lemmatizer::test_initialize_from_labels
    # this results in 4 trees
    assert len(data["lemmatizer_trees"]) == 4
|
||||
|
||||
|
||||
def test_debug_data_trainable_lemmatizer_partial():
    """Check that ``_compile_gold`` counts both examples as partially lemma-annotated."""
    # partial annotation
    partially_annotated = (
        "She likes green eggs",
        {"lemmas": ["", "like", "green", ""]},
    )
    # misaligned partial annotation
    misaligned = (
        "He hates green eggs",
        {
            "words": ["He", "hat", "es", "green", "eggs"],
            "lemmas": ["", "hat", "e", "green", ""],
        },
    )
    nlp = Language()
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in (partially_annotated, misaligned)
    ]

    data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
    assert data["partial_lemma_annotations"] == 2
|
||||
|
||||
|
||||
def test_debug_data_trainable_lemmatizer_low_cardinality():
    """Check the low-cardinality count from ``_compile_gold`` when every lemma is "no"."""
    annotated_texts = [
        ("She likes green eggs", {"lemmas": ["no", "no", "no", "no"]}),
        ("Eat blue ham", {"lemmas": ["no", "no", "no"]}),
    ]
    nlp = Language()
    train_examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in annotated_texts
    ]

    data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
    assert data["n_low_cardinality_lemmas"] == 2
|
||||
|
||||
|
||||
def test_debug_data_trainable_lemmatizer_not_annotated():
    """Check that ``_compile_gold`` counts both examples as having no lemma annotations."""
    texts = ["She likes green eggs", "Eat blue ham"]
    nlp = Language()
    # Every example gets an empty annotation dict, i.e. no "lemmas" key at all.
    train_examples = [Example.from_dict(nlp.make_doc(text), {}) for text in texts]

    data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
    assert data["no_lemma_annotations"] == 2
|
||||
|
|
|
@ -1,9 +1,20 @@
|
|||
import os
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
import srsly
|
||||
from typer.testing import CliRunner
|
||||
from spacy.tokens import DocBin, Doc
|
||||
|
||||
from spacy.cli._util import app
|
||||
from .util import make_tempdir
|
||||
from spacy.cli._util import app, get_git_version
|
||||
from .util import make_tempdir, normalize_whitespace
|
||||
|
||||
|
||||
def has_git():
    """Return True if ``get_git_version()`` succeeds, False if it raises RuntimeError."""
    try:
        get_git_version()
    except RuntimeError:
        return False
    else:
        return True
|
||||
|
||||
|
||||
def test_convert_auto():
|
||||
|
@ -37,6 +48,190 @@ def test_benchmark_accuracy_alias():
|
|||
# Verify that the `evaluate` alias works correctly.
|
||||
result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
|
||||
result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"])
|
||||
assert result_benchmark.stdout == result_evaluate.stdout.replace(
|
||||
"spacy evaluate", "spacy benchmark accuracy"
|
||||
assert normalize_whitespace(result_benchmark.stdout) == normalize_whitespace(
|
||||
result_evaluate.stdout.replace("spacy evaluate", "spacy benchmark accuracy")
|
||||
)
|
||||
|
||||
|
||||
def test_debug_data_trainable_lemmatizer_cli(en_vocab):
    """Run ``init config`` and ``debug data`` through the CLI for a trainable lemmatizer.

    Small train/dev DocBins with lemma annotations are written to a temporary
    directory, a config is generated there, and the ``debug data`` output is
    checked for the trainable lemmatizer section.
    """
    train_docs = [
        Doc(en_vocab, words=["I", "like", "cats"], lemmas=["I", "like", "cat"]),
        Doc(
            en_vocab,
            words=["Dogs", "are", "great", "too"],
            lemmas=["dog", "be", "great", "too"],
        ),
    ]
    dev_docs = [
        Doc(en_vocab, words=["Cats", "are", "cute"], lemmas=["cat", "be", "cute"]),
        Doc(en_vocab, words=["Pets", "are", "great"], lemmas=["pet", "be", "great"]),
    ]
    with make_tempdir() as d_in:
        train_bin = DocBin(docs=train_docs)
        train_bin.to_disk(d_in / "train.spacy")
        dev_bin = DocBin(docs=dev_docs)
        dev_bin.to_disk(d_in / "dev.spacy")
        # `debug data` requires an input pipeline config
        result_init_config = CliRunner().invoke(
            app,
            [
                "init",
                "config",
                f"{d_in}/config.cfg",
                "--lang",
                "en",
                "--pipeline",
                "trainable_lemmatizer",
            ],
        )
        # Fail right here, with the CLI output attached, if the config could
        # not be generated; otherwise the problem only shows up below as a
        # missing output section.
        assert result_init_config.exit_code == 0, result_init_config.stdout
        result_debug_data = CliRunner().invoke(
            app,
            [
                "debug",
                "data",
                f"{d_in}/config.cfg",
                "--paths.train",
                f"{d_in}/train.spacy",
                "--paths.dev",
                f"{d_in}/dev.spacy",
            ],
        )
        # Instead of checking specific wording of the output, which may change,
        # we'll check that this section of the debug output is present.
        assert "= Trainable Lemmatizer =" in result_debug_data.stdout
|
||||
|
||||
|
||||
# project tests
|
||||
|
||||
# Project definition used by the project CLI tests below. It is serialized to
# YAML (SAMPLE_PROJECT_TEXT) and written to `project.yml` by the `project_dir`
# fixture.
SAMPLE_PROJECT = {
    "title": "Sample project",
    "description": "This is a project for testing",
    "assets": [
        {
            # URL is pinned to a specific commit
            "dest": "assets/spacy-readme.md",
            "url": "https://github.com/explosion/spaCy/raw/dec81508d28b47f09a06203c472b37f00db6c869/README.md",
            "checksum": "411b2c89ccf34288fae8ed126bf652f7",
        },
        {
            # Marked as extra: test_project_assets fetches it with `--extra`.
            # NOTE(review): this URL tracks `master` while the checksum is
            # fixed - presumably the asset check fails once CITATION.cff
            # changes upstream; confirm.
            "dest": "assets/citation.cff",
            "url": "https://github.com/explosion/spaCy/raw/master/CITATION.cff",
            "checksum": "c996bfd80202d480eb2e592369714e5e",
            "extra": True,
        },
    ],
    "commands": [
        {
            "name": "ok",
            "help": "print ok",
            "script": ["python -c \"print('okokok')\""],
        },
        {
            # Creates abc.txt and declares it as an output
            "name": "create",
            "help": "make a file",
            "script": ["touch abc.txt"],
            "outputs": ["abc.txt"],
        },
        {
            "name": "clean",
            "help": "remove test file",
            "script": ["rm abc.txt"],
        },
    ],
}

# YAML text of SAMPLE_PROJECT
SAMPLE_PROJECT_TEXT = srsly.yaml_dumps(SAMPLE_PROJECT)
|
||||
|
||||
|
||||
@pytest.fixture
def project_dir():
    """Yield a temporary directory containing a ``project.yml`` built from SAMPLE_PROJECT_TEXT."""
    with make_tempdir() as pdir:
        project_file = pdir / "project.yml"
        project_file.write_text(SAMPLE_PROJECT_TEXT)
        yield pdir
|
||||
|
||||
|
||||
def test_project_document(project_dir):
    """``project document`` writes a README that contains the project description."""
    readme_path = project_dir / "README.md"
    assert not readme_path.exists(), "README already exists"
    cli_args = ["project", "document", str(project_dir), "-o", str(readme_path)]
    result = CliRunner().invoke(app, cli_args)
    assert result.exit_code == 0
    assert readme_path.is_file()
    readme_text = readme_path.read_text("utf-8")
    assert SAMPLE_PROJECT["description"] in readme_text
|
||||
|
||||
|
||||
def test_project_assets(project_dir):
    """``project assets`` fetches the regular asset, and the extra one with ``--extra``."""

    def fetch_assets(*flags):
        # Fresh runner per invocation; flags go before the project directory.
        return CliRunner().invoke(app, ["project", "assets", *flags, str(project_dir)])

    asset_dir = project_dir / "assets"
    assert not asset_dir.exists(), "Assets dir is already present"
    result = fetch_assets()
    assert result.exit_code == 0
    assert (asset_dir / "spacy-readme.md").is_file(), "Assets not downloaded"
    # check that extras work
    result = fetch_assets("--extra")
    assert result.exit_code == 0
    assert (asset_dir / "citation.cff").is_file(), "Extras not downloaded"
|
||||
|
||||
|
||||
def test_project_run(project_dir):
    """``project run`` executes commands, except in ``--dry`` mode."""

    def run_command(*args):
        # Fresh runner per invocation; args go before the project directory.
        return CliRunner().invoke(app, ["project", "run", *args, str(project_dir)])

    test_file = project_dir / "abc.txt"
    # make sure dry run works
    result = run_command("--dry", "create")
    assert result.exit_code == 0
    assert not test_file.is_file()
    result = run_command("create")
    assert result.exit_code == 0
    assert test_file.is_file()
    result = run_command("ok")
    assert result.exit_code == 0
    assert "okokok" in result.stdout
|
||||
|
||||
|
||||
@pytest.mark.skipif(not has_git(), reason="git not installed")
@pytest.mark.parametrize(
    "options",
    [
        "",
        # "--sparse",
        "--branch v3",
        "--repo https://github.com/explosion/projects --branch v3",
    ],
)
def test_project_clone(options):
    """``project clone`` of a benchmark project yields a directory with a README.md."""
    # The option string is split on whitespace into separate CLI arguments.
    options = options.split() if options else []
    target = "benchmarks/ner_conll03"
    with make_tempdir() as workspace:
        out = workspace / "project"
        cli_args = ["project", "clone", target, *options, str(out)]
        result = CliRunner().invoke(app, cli_args)
        assert result.exit_code == 0
        assert (out / "README.md").is_file()
|
||||
|
||||
|
||||
def test_project_push_pull(project_dir):
    """Push a command output to a directory remote, delete it locally, and pull it back."""

    def project_cli(*args):
        # Fresh runner per invocation; args go before the project directory.
        return CliRunner().invoke(app, ["project", *args, str(project_dir)])

    remote = "xyz"
    proj = dict(SAMPLE_PROJECT)

    with make_tempdir() as remote_dir:
        # Rewrite project.yml so that it declares the temporary remote.
        proj["remotes"] = {remote: str(remote_dir)}
        (project_dir / "project.yml").write_text(srsly.yaml_dumps(proj))

        test_file = project_dir / "abc.txt"
        result = project_cli("run", "create")
        assert result.exit_code == 0
        assert test_file.is_file()
        result = project_cli("push", remote)
        assert result.exit_code == 0
        result = project_cli("run", "clean")
        assert result.exit_code == 0
        assert not test_file.exists()
        result = project_cli("pull", remote)
        assert result.exit_code == 0
        assert test_file.is_file()
|
||||
|
|
|
@ -46,7 +46,7 @@ def assert_sents_error(doc):
|
|||
|
||||
def warn_error(proc_name, proc, docs, e):
|
||||
logger = logging.getLogger("spacy")
|
||||
logger.warning(f"Trouble with component {proc_name}.")
|
||||
logger.warning("Trouble with component %s.", proc_name)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
|
78
spacy/tests/training/test_corpus.py
Normal file
78
spacy/tests/training/test_corpus.py
Normal file
|
@ -0,0 +1,78 @@
|
|||
from typing import IO, Generator, Iterable, List, TextIO, Tuple
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
import tempfile
|
||||
|
||||
from spacy.lang.en import English
|
||||
from spacy.training import Example, PlainTextCorpus
|
||||
from spacy.util import make_tempdir
|
||||
|
||||
# Intentional newlines to check that they are skipped.
|
||||
PLAIN_TEXT_DOC = """
|
||||
|
||||
This is a doc. It contains two sentences.
|
||||
This is another doc.
|
||||
|
||||
A third doc.
|
||||
|
||||
"""
|
||||
|
||||
PLAIN_TEXT_DOC_TOKENIZED = [
|
||||
[
|
||||
"This",
|
||||
"is",
|
||||
"a",
|
||||
"doc",
|
||||
".",
|
||||
"It",
|
||||
"contains",
|
||||
"two",
|
||||
"sentences",
|
||||
".",
|
||||
],
|
||||
["This", "is", "another", "doc", "."],
|
||||
["A", "third", "doc", "."],
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("min_length", [0, 5])
@pytest.mark.parametrize("max_length", [0, 5])
def test_plain_text_reader(min_length, max_length):
    """PlainTextCorpus yields the tokenized docs that pass the length limits.

    The expected docs are those from PLAIN_TEXT_DOC_TOKENIZED with at least
    ``min_length`` tokens and, unless ``max_length`` is 0, at most
    ``max_length`` tokens.
    """

    def within_limits(tokens):
        if len(tokens) < min_length:
            return False
        return max_length == 0 or len(tokens) <= max_length

    nlp = English()
    with _string_to_tmp_file(PLAIN_TEXT_DOC) as file_path:
        corpus = PlainTextCorpus(
            file_path, min_length=min_length, max_length=max_length
        )

        check = [tokens for tokens in PLAIN_TEXT_DOC_TOKENIZED if within_limits(tokens)]
        reference, predicted = _examples_to_tokens(corpus(nlp))

        assert reference == check
        assert predicted == check
|
||||
|
||||
|
||||
@contextmanager
def _string_to_tmp_file(s: str) -> Generator[Path, None, None]:
    """Write *s* as UTF-8 to ``string.txt`` in a temporary directory and yield its path.

    The path is yielded from inside the ``make_tempdir()`` context, so it
    should only be used while this context manager is open.
    """
    with make_tempdir() as tmp_dir:
        target = Path(tmp_dir) / "string.txt"
        target.write_text(s, encoding="utf-8")
        yield target
|
||||
|
||||
|
||||
def _examples_to_tokens(
    examples: Iterable[Example],
) -> Tuple[List[List[str]], List[List[str]]]:
    """Collect the token texts of every example's reference and predicted docs.

    Returns a ``(reference, predicted)`` pair; each is a list holding one list
    of token texts per example, in iteration order. *examples* is traversed
    exactly once, so a generator can be passed.
    """
    token_pairs = [
        ([token.text for token in eg.reference], [token.text for token in eg.predicted])
        for eg in examples
    ]
    reference = [ref_tokens for ref_tokens, _ in token_pairs]
    predicted = [pred_tokens for _, pred_tokens in token_pairs]
    return reference, predicted
|
|
@ -2,17 +2,19 @@ from pathlib import Path
|
|||
import numpy as np
|
||||
import pytest
|
||||
import srsly
|
||||
from spacy.vocab import Vocab
|
||||
from thinc.api import Config
|
||||
from thinc.api import Config, get_current_ops
|
||||
|
||||
from spacy import util
|
||||
from spacy.lang.en import English
|
||||
from spacy.training.initialize import init_nlp
|
||||
from spacy.training.loop import train
|
||||
from spacy.training.pretrain import pretrain
|
||||
from spacy.tokens import Doc, DocBin
|
||||
from spacy.language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH
|
||||
from spacy.ml.models.multi_task import create_pretrain_vectors
|
||||
from spacy.vectors import Vectors
|
||||
from spacy.vocab import Vocab
|
||||
from ..util import make_tempdir
|
||||
from ... import util
|
||||
from ...lang.en import English
|
||||
from ...training.initialize import init_nlp
|
||||
from ...training.loop import train
|
||||
from ...training.pretrain import pretrain
|
||||
from ...tokens import Doc, DocBin
|
||||
from ...language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH
|
||||
|
||||
pretrain_string_listener = """
|
||||
[nlp]
|
||||
|
@ -346,3 +348,30 @@ def write_vectors_model(tmp_dir):
|
|||
nlp = English(vocab)
|
||||
nlp.to_disk(nlp_path)
|
||||
return str(nlp_path)
|
||||
|
||||
|
||||
def test_pretrain_default_vectors():
    """Check which kinds of vector tables the pretraining vectors objective
    accepts: default vectors work, while an empty table and floret vectors
    are expected to raise E875 and E850 respectively.
    """
    nlp = English()
    nlp.add_pipe("tok2vec")
    nlp.initialize()

    def build_objective():
        # Create the objective from scratch on every call so each scenario
        # sees the vectors currently assigned to the vocab.
        factory = create_pretrain_vectors(1, 1, "cosine")
        return factory(nlp.vocab, nlp.get_pipe("tok2vec").model)

    # default vectors are supported
    nlp.vocab.vectors = Vectors(shape=(10, 10))
    build_objective()

    # error for no vectors
    with pytest.raises(ValueError, match="E875"):
        nlp.vocab.vectors = Vectors()
        build_objective()

    # error for floret vectors
    with pytest.raises(ValueError, match="E850"):
        ops = get_current_ops()
        floret_data = ops.xp.zeros((10, 10))
        nlp.vocab.vectors = Vectors(data=floret_data, mode="floret", hash_count=1)
        build_objective()
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import numpy
|
||||
import tempfile
|
||||
import contextlib
|
||||
import re
|
||||
import srsly
|
||||
from spacy.tokens import Doc
|
||||
from spacy.vocab import Vocab
|
||||
|
@ -95,3 +96,7 @@ def assert_packed_msg_equal(b1, b2):
|
|||
for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
|
||||
assert k1 == k2
|
||||
assert v1 == v2
|
||||
|
||||
|
||||
def normalize_whitespace(s):
    """Collapse every run of whitespace in a string into a single space.

    Leading and trailing whitespace is not stripped; it is reduced to one
    space like any other run.

    s (str): The text to normalize.
    RETURNS (str): The text with whitespace runs replaced by " ".
    """
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", s)
|
||||
|
|
|
@ -108,6 +108,7 @@ class Doc:
|
|||
kb_id: Union[int, str] = ...,
|
||||
vector: Optional[Floats1d] = ...,
|
||||
alignment_mode: str = ...,
|
||||
span_id: Union[int, str] = ...,
|
||||
) -> Span: ...
|
||||
def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
|
||||
@property
|
||||
|
|
|
@ -528,9 +528,9 @@ cdef class Doc:
|
|||
doc (Doc): The parent document.
|
||||
start_idx (int): The index of the first character of the span.
|
||||
end_idx (int): The index of the first character after the span.
|
||||
label (uint64 or string): A label to attach to the Span, e.g. for
|
||||
label (Union[int, str]): A label to attach to the Span, e.g. for
|
||||
named entities.
|
||||
kb_id (uint64 or string): An ID from a KB to capture the meaning of a
|
||||
kb_id (Union[int, str]): An ID from a KB to capture the meaning of a
|
||||
named entity.
|
||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
|
||||
the span.
|
||||
|
@ -539,6 +539,7 @@ cdef class Doc:
|
|||
with token boundaries), "contract" (span of all tokens completely
|
||||
within the character span), "expand" (span of all tokens at least
|
||||
partially covered by the character span). Defaults to "strict".
|
||||
span_id (Union[int, str]): An identifier to associate with the span.
|
||||
RETURNS (Span): The newly constructed object.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#char_span
|
||||
|
|
|
@ -98,6 +98,9 @@ class Span:
|
|||
label: Union[int, str] = ...,
|
||||
kb_id: Union[int, str] = ...,
|
||||
vector: Optional[Floats1d] = ...,
|
||||
id: Union[int, str] = ...,
|
||||
alignment_mode: str = ...,
|
||||
span_id: Union[int, str] = ...,
|
||||
) -> Span: ...
|
||||
@property
|
||||
def conjuncts(self) -> Tuple[Token]: ...
|
||||
|
|
|
@ -362,7 +362,7 @@ cdef class Span:
|
|||
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
|
||||
return result.item()
|
||||
|
||||
|
||||
cpdef np.ndarray to_array(self, object py_attr_ids):
|
||||
"""Given a list of M attribute IDs, export the tokens to a numpy
|
||||
`ndarray` of shape `(N, M)`, where `N` is the length of the document.
|
||||
|
@ -639,21 +639,28 @@ cdef class Span:
|
|||
else:
|
||||
return self.doc[root]
|
||||
|
||||
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0):
|
||||
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict", span_id=0):
|
||||
"""Create a `Span` object from the slice `span.text[start : end]`.
|
||||
|
||||
start (int): The index of the first character of the span.
|
||||
end (int): The index of the first character after the span.
|
||||
label (uint64 or string): A label to attach to the Span, e.g. for
|
||||
label (Union[int, str]): A label to attach to the Span, e.g. for
|
||||
named entities.
|
||||
kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
|
||||
kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity.
|
||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
|
||||
the span.
|
||||
id (Union[int, str]): Unused.
|
||||
alignment_mode (str): How character indices are aligned to token
|
||||
boundaries. Options: "strict" (character indices must be aligned
|
||||
with token boundaries), "contract" (span of all tokens completely
|
||||
within the character span), "expand" (span of all tokens at least
|
||||
partially covered by the character span). Defaults to "strict".
|
||||
span_id (Union[int, str]): An identifier to associate with the span.
|
||||
RETURNS (Span): The newly constructed object.
|
||||
"""
|
||||
start_idx += self.c.start_char
|
||||
end_idx += self.c.start_char
|
||||
return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector)
|
||||
return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode, span_id=span_id)
|
||||
|
||||
@property
|
||||
def conjuncts(self):
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from .corpus import Corpus, JsonlCorpus # noqa: F401
|
||||
from .corpus import Corpus, JsonlCorpus, PlainTextCorpus # noqa: F401
|
||||
from .example import Example, validate_examples, validate_get_examples # noqa: F401
|
||||
from .alignment import Alignment # noqa: F401
|
||||
from .augment import dont_augment, orth_variants_augmenter # noqa: F401
|
||||
|
|
|
@ -11,7 +11,7 @@ def create_copy_from_base_model(
|
|||
) -> Callable[[Language], Language]:
|
||||
def copy_from_base_model(nlp):
|
||||
if tokenizer:
|
||||
logger.info(f"Copying tokenizer from: {tokenizer}")
|
||||
logger.info("Copying tokenizer from: %s", tokenizer)
|
||||
base_nlp = load_model(tokenizer)
|
||||
if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]:
|
||||
nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"]))
|
||||
|
@ -23,7 +23,7 @@ def create_copy_from_base_model(
|
|||
)
|
||||
)
|
||||
if vocab:
|
||||
logger.info(f"Copying vocab from: {vocab}")
|
||||
logger.info("Copying vocab from: %s", vocab)
|
||||
# only reload if the vocab is from a different model
|
||||
if tokenizer != vocab:
|
||||
base_nlp = load_model(vocab)
|
||||
|
|
|
@ -29,7 +29,7 @@ def create_docbin_reader(
|
|||
) -> Callable[["Language"], Iterable[Example]]:
|
||||
if path is None:
|
||||
raise ValueError(Errors.E913)
|
||||
util.logger.debug(f"Loading corpus from path: {path}")
|
||||
util.logger.debug("Loading corpus from path: %s", path)
|
||||
return Corpus(
|
||||
path,
|
||||
gold_preproc=gold_preproc,
|
||||
|
@ -58,6 +58,28 @@ def read_labels(path: Path, *, require: bool = False):
|
|||
return srsly.read_json(path)
|
||||
|
||||
|
||||
@util.registry.readers("spacy.PlainTextCorpus.v1")
def create_plain_text_reader(
    path: Optional[Path],
    min_length: int = 0,
    max_length: int = 0,
) -> Callable[["Language"], Iterable[Example]]:
    """Create a reader that iterates Example objects from a file or
    directory of plain text UTF-8 files with one line per doc.

    path (Path): The directory or filename to read from.
    min_length (int): Minimum document length (in tokens). Shorter documents
        will be skipped. Defaults to 0, which indicates no limit.
    max_length (int): Maximum document length (in tokens). Longer documents will
        be skipped. Defaults to 0, which indicates no limit.
    RETURNS (Callable[[Language], Iterable[Example]]): The PlainTextCorpus
        reader configured with the given path and length limits.
    RAISES (ValueError): E913 if `path` is None.

    DOCS: https://spacy.io/api/corpus#plaintextcorpus
    """
    # The path is typed as Optional so that a missing config value surfaces
    # as a dedicated error here rather than later inside the corpus.
    if path is None:
        raise ValueError(Errors.E913)
    return PlainTextCorpus(path, min_length=min_length, max_length=max_length)
|
||||
|
||||
|
||||
def walk_corpus(path: Union[str, Path], file_type) -> List[Path]:
|
||||
path = util.ensure_path(path)
|
||||
if not path.is_dir() and path.parts[-1].endswith(file_type):
|
||||
|
@ -257,3 +279,52 @@ class JsonlCorpus:
|
|||
# We don't *need* an example here, but it seems nice to
|
||||
# make it match the Corpus signature.
|
||||
yield Example(doc, Doc(nlp.vocab, words=words, spaces=spaces))
|
||||
|
||||
|
||||
class PlainTextCorpus:
    """Reader that produces Example objects from plain text: either a single
    UTF-8 file or a directory of such files, where each line is one doc.

    path (Path): The directory or filename to read from.
    min_length (int): Minimum document length (in tokens). Shorter documents
        will be skipped. Defaults to 0, which indicates no limit.
    max_length (int): Maximum document length (in tokens). Longer documents will
        be skipped. Defaults to 0, which indicates no limit.

    DOCS: https://spacy.io/api/corpus#plaintextcorpus
    """

    file_type = "txt"

    def __init__(
        self,
        path: Optional[Union[str, Path]],
        *,
        min_length: int = 0,
        max_length: int = 0,
    ) -> None:
        self.path = util.ensure_path(path)
        self.min_length = min_length
        self.max_length = max_length

    def __call__(self, nlp: "Language") -> Iterator[Example]:
        """Read the corpus and yield one example per non-empty line.

        nlp (Language): The current nlp object.
        YIELDS (Example): The example objects.

        DOCS: https://spacy.io/api/corpus#plaintextcorpus-call
        """
        for loc in walk_corpus(self.path, ".txt"):
            with open(loc, encoding="utf-8") as f:
                for line in f:
                    # Only the line terminator is removed; other whitespace
                    # is part of the document text.
                    text = line.rstrip("\r\n")
                    if not text:
                        continue
                    doc = nlp.make_doc(text)
                    n_tokens = len(doc)
                    # A limit below 1 means that the limit is disabled.
                    too_short = self.min_length >= 1 and n_tokens < self.min_length
                    too_long = self.max_length >= 1 and n_tokens > self.max_length
                    if too_short or too_long:
                        continue
                    # An Example isn't strictly required for raw text, but
                    # yielding one keeps the output in line with Corpus.
                    yield Example(doc, doc.copy())
|
||||
|
|
|
@ -62,10 +62,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
|
|||
frozen_components = T["frozen_components"]
|
||||
# Sourced components that require resume_training
|
||||
resume_components = [p for p in sourced if p not in frozen_components]
|
||||
logger.info(f"Pipeline: {nlp.pipe_names}")
|
||||
logger.info("Pipeline: %s", nlp.pipe_names)
|
||||
if resume_components:
|
||||
with nlp.select_pipes(enable=resume_components):
|
||||
logger.info(f"Resuming training for: {resume_components}")
|
||||
logger.info("Resuming training for: %s", resume_components)
|
||||
nlp.resume_training(sgd=optimizer)
|
||||
# Make sure that listeners are defined before initializing further
|
||||
nlp._link_components()
|
||||
|
@ -73,16 +73,17 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
|
|||
if T["max_epochs"] == -1:
|
||||
sample_size = 100
|
||||
logger.debug(
|
||||
f"Due to streamed train corpus, using only first {sample_size} "
|
||||
f"examples for initialization. If necessary, provide all labels "
|
||||
f"in [initialize]. More info: https://spacy.io/api/cli#init_labels"
|
||||
"Due to streamed train corpus, using only first %s examples for initialization. "
|
||||
"If necessary, provide all labels in [initialize]. "
|
||||
"More info: https://spacy.io/api/cli#init_labels",
|
||||
sample_size,
|
||||
)
|
||||
nlp.initialize(
|
||||
lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer
|
||||
)
|
||||
else:
|
||||
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
|
||||
logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
|
||||
logger.info("Initialized pipeline components: %s", nlp.pipe_names)
|
||||
# Detect components with listeners that are not frozen consistently
|
||||
for name, proc in nlp.pipeline:
|
||||
for listener in getattr(
|
||||
|
@ -109,7 +110,7 @@ def init_vocab(
|
|||
) -> None:
|
||||
if lookups:
|
||||
nlp.vocab.lookups = lookups
|
||||
logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
|
||||
logger.info("Added vocab lookups: %s", ", ".join(lookups.tables))
|
||||
data_path = ensure_path(data)
|
||||
if data_path is not None:
|
||||
lex_attrs = srsly.read_jsonl(data_path)
|
||||
|
@ -125,11 +126,11 @@ def init_vocab(
|
|||
else:
|
||||
oov_prob = DEFAULT_OOV_PROB
|
||||
nlp.vocab.cfg.update({"oov_prob": oov_prob})
|
||||
logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
|
||||
logger.info("Added %d lexical entries to the vocab", len(nlp.vocab))
|
||||
logger.info("Created vocabulary")
|
||||
if vectors is not None:
|
||||
load_vectors_into_model(nlp, vectors)
|
||||
logger.info(f"Added vectors: {vectors}")
|
||||
logger.info("Added vectors: %s", vectors)
|
||||
# warn if source model vectors are not identical
|
||||
sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
|
||||
vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
|
||||
|
@ -191,7 +192,7 @@ def init_tok2vec(
|
|||
if weights_data is not None:
|
||||
layer = get_tok2vec_ref(nlp, P)
|
||||
layer.from_bytes(weights_data)
|
||||
logger.info(f"Loaded pretrained weights from {init_tok2vec}")
|
||||
logger.info("Loaded pretrained weights from %s", init_tok2vec)
|
||||
return True
|
||||
return False
|
||||
|
||||
|
@ -216,13 +217,13 @@ def convert_vectors(
|
|||
nlp.vocab.deduplicate_vectors()
|
||||
else:
|
||||
if vectors_loc:
|
||||
logger.info(f"Reading vectors from {vectors_loc}")
|
||||
logger.info("Reading vectors from %s", vectors_loc)
|
||||
vectors_data, vector_keys, floret_settings = read_vectors(
|
||||
vectors_loc,
|
||||
truncate,
|
||||
mode=mode,
|
||||
)
|
||||
logger.info(f"Loaded vectors from {vectors_loc}")
|
||||
logger.info("Loaded vectors from %s", vectors_loc)
|
||||
else:
|
||||
vectors_data, vector_keys = (None, None)
|
||||
if vector_keys is not None and mode != VectorsMode.floret:
|
||||
|
|
|
@ -370,6 +370,6 @@ def clean_output_dir(path: Optional[Path]) -> None:
|
|||
if subdir.exists():
|
||||
try:
|
||||
shutil.rmtree(str(subdir))
|
||||
logger.debug(f"Removed existing output directory: {subdir}")
|
||||
logger.debug("Removed existing output directory: %s", subdir)
|
||||
except Exception as e:
|
||||
raise IOError(Errors.E901.format(path=path)) from e
|
||||
|
|
|
@ -32,6 +32,7 @@ import inspect
|
|||
import pkgutil
|
||||
import logging
|
||||
import socket
|
||||
import stat
|
||||
|
||||
try:
|
||||
import cupy.random
|
||||
|
@ -60,7 +61,7 @@ if TYPE_CHECKING:
|
|||
# fmt: off
|
||||
OOV_RANK = numpy.iinfo(numpy.uint64).max
|
||||
DEFAULT_OOV_PROB = -20
|
||||
LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
|
||||
LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
|
||||
|
||||
# Default order of sections in the config file. Not all sections need to exist,
|
||||
# and additional sections are added at the end, in alphabetical order.
|
||||
|
@ -144,8 +145,17 @@ class registry(thinc.registry):
|
|||
return func
|
||||
|
||||
@classmethod
|
||||
def find(cls, registry_name: str, func_name: str) -> Callable:
|
||||
"""Get info about a registered function from the registry."""
|
||||
def find(
|
||||
cls, registry_name: str, func_name: str
|
||||
) -> Dict[str, Optional[Union[str, int]]]:
|
||||
"""Find information about a registered function, including the
|
||||
module and path to the file it's defined in, the line number and the
|
||||
docstring, if available.
|
||||
|
||||
registry_name (str): Name of the catalogue registry.
|
||||
func_name (str): Name of the registered function.
|
||||
RETURNS (Dict[str, Optional[Union[str, int]]]): The function info.
|
||||
"""
|
||||
# We're overwriting this classmethod so we're able to provide more
|
||||
# specific error messages and implement a fallback to spacy-legacy.
|
||||
if not hasattr(cls, registry_name):
|
||||
|
@ -1041,8 +1051,15 @@ def make_tempdir() -> Generator[Path, None, None]:
|
|||
"""
|
||||
d = Path(tempfile.mkdtemp())
|
||||
yield d
|
||||
|
||||
# On Windows, git clones use read-only files, which cause permission errors
# when being deleted. This forcibly fixes permissions.
def force_remove(rmfunc, path, ex):
    """Error callback for `shutil.rmtree`: make the path writable and retry.

    rmfunc (Callable): The removal function that failed on `path`.
    path (str): The path that could not be removed.
    ex: Error information supplied by `shutil.rmtree` (not used).
    """
    writable = stat.S_IWRITE
    os.chmod(path, writable)
    rmfunc(path)
|
||||
|
||||
try:
|
||||
shutil.rmtree(str(d))
|
||||
shutil.rmtree(str(d), onerror=force_remove)
|
||||
except PermissionError as e:
|
||||
warnings.warn(Warnings.W091.format(dir=d, msg=e))
|
||||
|
||||
|
|
|
@ -899,15 +899,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a
|
|||
| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ |
|
||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
|
||||
|
||||
### spacy.EmptyKB.v1 {id="EmptyKB"}
|
||||
### spacy.EmptyKB.v1 {id="EmptyKB.v1"}
|
||||
|
||||
A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
|
||||
instance. This is the default when a new entity linker component is created.
|
||||
instance.
|
||||
|
||||
| Name | Description |
|
||||
| ---------------------- | ----------------------------------------------------------------------------------- |
|
||||
| `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ |
|
||||
|
||||
### spacy.EmptyKB.v2 {id="EmptyKB"}
|
||||
|
||||
A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab)
|
||||
instance. This is the default when a new entity linker component is created. It
|
||||
returns a `Callable[[Vocab, int], InMemoryLookupKB]`.
|
||||
|
||||
### spacy.KBFromFile.v1 {id="KBFromFile"}
|
||||
|
||||
A function that reads an existing `KnowledgeBase` from file.
|
||||
|
|
|
@ -175,3 +175,68 @@ Yield examples from the data.
|
|||
| ---------- | -------------------------------------- |
|
||||
| `nlp` | The current `nlp` object. ~~Language~~ |
|
||||
| **YIELDS** | The examples. ~~Example~~ |
|
||||
|
||||
## PlainTextCorpus {id="plaintextcorpus",tag="class",version="3.5.1"}
|
||||
|
||||
Iterate over documents from a plain text file. Can be used to read the raw text
|
||||
corpus for language model
|
||||
[pretraining](/usage/embeddings-transformers#pretraining). The expected file
|
||||
format is:
|
||||
|
||||
- UTF-8 encoding
|
||||
- One document per line
|
||||
- Blank lines are ignored.
|
||||
|
||||
```text {title="Example"}
|
||||
Can I ask where you work now and what you do, and if you enjoy it?
|
||||
They may just pull out of the Seattle market completely, at least until they have autonomous vehicles.
|
||||
My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in.
|
||||
```
|
||||
|
||||
### PlainTextCorpus.\_\_init\_\_ {id="plaintextcorpus-init",tag="method"}
|
||||
|
||||
Initialize the reader.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.training import PlainTextCorpus
|
||||
>
|
||||
> corpus = PlainTextCorpus("./data/docs.txt")
|
||||
> ```
|
||||
>
|
||||
> ```ini
|
||||
> ### Example config
|
||||
> [corpora.pretrain]
|
||||
> @readers = "spacy.PlainTextCorpus.v1"
|
||||
> path = "corpus/raw_text.txt"
|
||||
> min_length = 0
|
||||
> max_length = 0
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | -------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | The directory or filename to read from. Expects newline-delimited documents in UTF8 format. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `min_length` | Minimum document length (in tokens). Shorter documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ |
|
||||
| `max_length` | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ |
|
||||
|
||||
### PlainTextCorpus.\_\_call\_\_ {id="plaintextcorpus-call",tag="method"}
|
||||
|
||||
Yield examples from the data.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.training import PlainTextCorpus
|
||||
> import spacy
|
||||
>
|
||||
> corpus = PlainTextCorpus("./docs.txt")
|
||||
> nlp = spacy.blank("en")
|
||||
> data = corpus(nlp)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------- | -------------------------------------- |
|
||||
| `nlp` | The current `nlp` object. ~~Language~~ |
|
||||
| **YIELDS** | The examples. ~~Example~~ |
|
||||
|
|
|
@ -68,24 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
|
|||
come directly from
|
||||
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
|
||||
|
||||
| Symbol | Description |
|
||||
| --------- | -------------------------------------------------------------------------------------------------------------------- |
|
||||
| `A < B` | `A` is the immediate dependent of `B`. |
|
||||
| `A > B` | `A` is the immediate head of `B`. |
|
||||
| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
|
||||
| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
|
||||
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
|
||||
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
|
||||
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
|
||||
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
|
||||
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
|
||||
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
||||
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
||||
| Symbol | Description |
|
||||
| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
|
||||
| `A < B` | `A` is the immediate dependent of `B`. |
|
||||
| `A > B` | `A` is the immediate head of `B`. |
|
||||
| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
|
||||
| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
|
||||
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
|
||||
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
|
||||
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
|
||||
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
|
||||
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
|
||||
| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||
| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
||||
| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||
| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
||||
|
||||
## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"}
|
||||
|
||||
|
|
|
@ -209,15 +209,16 @@ alignment mode `"strict".
|
|||
> assert span.text == "New York"
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `start` | The index of the first character of the span. ~~int~~ |
|
||||
| `end` | The index of the last character after the span. ~~int~~ |
|
||||
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
|
||||
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
|
||||
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
|
||||
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
|
||||
| Name | Description |
|
||||
| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `start` | The index of the first character of the span. ~~int~~ |
|
||||
| `end` | The index of the first character after the span. ~~int~~ |
|
||||
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
|
||||
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
|
||||
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
|
||||
| `span_id` <Tag variant="new">3.3.1</Tag> | An identifier to associate with the span. ~~Union[int, str]~~ |
|
||||
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
|
||||
|
||||
## Doc.set_ents {id="set_ents",tag="method",version="3"}
|
||||
|
||||
|
|
|
@ -53,19 +53,21 @@ architectures and their arguments and hyperparameters.
|
|||
> nlp.add_pipe("entity_linker", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Description |
|
||||
| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
|
||||
| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
|
||||
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
|
||||
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
|
||||
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
|
||||
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
|
||||
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
|
||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
||||
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
||||
| Setting | Description |
|
||||
| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
|
||||
| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ |
|
||||
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
|
||||
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
|
||||
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
|
||||
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ |
|
||||
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
|
||||
| `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
|
||||
| `generate_empty_kb` <Tag variant="new">3.6</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
|
||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
||||
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
||||
|
||||
```python
|
||||
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py
|
||||
|
|
|
@ -186,14 +186,17 @@ the character indices don't map to a valid span.
|
|||
> assert span.text == "New York"
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------------------------------------- |
|
||||
| `start` | The index of the first character of the span. ~~int~~ |
|
||||
| `end` | The index of the last character after the span. ~~int~~ |
|
||||
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
|
||||
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
|
||||
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
|
||||
| Name | Description |
|
||||
| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `start` | The index of the first character of the span. ~~int~~ |
|
||||
| `end` | The index of the first character after the span. ~~int~~ |
|
||||
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
|
||||
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
|
||||
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
| `id` | Unused. ~~Union[int, str]~~ |
|
||||
| `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
|
||||
| `span_id` <Tag variant="new">3.5.1</Tag> | An identifier to associate with the span. ~~Union[int, str]~~ |
|
||||
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
|
||||
|
||||
## Span.similarity {id="similarity",tag="method",model="vectors"}
|
||||
|
||||
|
|
|
@ -13,6 +13,13 @@ A span categorizer consists of two parts: a [suggester function](#suggesters)
|
|||
that proposes candidate spans, which may or may not overlap, and a labeler model
|
||||
that predicts zero or more labels for each candidate.
|
||||
|
||||
This component comes in two forms: `spancat` and `spancat_singlelabel` (added in
|
||||
spaCy v3.5.1). When you need to perform multi-label classification on your
|
||||
spans, use `spancat`. The `spancat` component uses a `Logistic` layer where the
|
||||
output class probabilities are independent for each class. However, if you need
|
||||
to predict at most one true class for a span, then use `spancat_singlelabel`. It
|
||||
uses a `Softmax` layer and treats the task as a multi-class problem.
|
||||
|
||||
Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc.
|
||||
Individual span scores can be found in `spangroup.attrs["scores"]`.
|
||||
|
||||
|
@ -38,7 +45,7 @@ how the component should be configured. You can override its settings via the
|
|||
[model architectures](/api/architectures) documentation for details on the
|
||||
architectures and their arguments and hyperparameters.
|
||||
|
||||
> #### Example
|
||||
> #### Example (spancat)
|
||||
>
|
||||
> ```python
|
||||
> from spacy.pipeline.spancat import DEFAULT_SPANCAT_MODEL
|
||||
|
@ -52,14 +59,33 @@ architectures and their arguments and hyperparameters.
|
|||
> nlp.add_pipe("spancat", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
|
||||
| `model` | A model instance that is given a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
|
||||
| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
|
||||
| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ |
|
||||
| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ |
|
||||
| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
|
||||
> #### Example (spancat_singlelabel)
|
||||
>
|
||||
> ```python
|
||||
> from spacy.pipeline.spancat import DEFAULT_SPANCAT_SINGLELABEL_MODEL
|
||||
> config = {
|
||||
> "threshold": 0.5,
|
||||
> "spans_key": "labeled_spans",
|
||||
> "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
|
||||
> "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
|
||||
> # Additional spancat_singlelabel parameters
|
||||
> "negative_weight": 0.8,
|
||||
> "allow_overlap": True,
|
||||
> }
|
||||
> nlp.add_pipe("spancat_singlelabel", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Description |
|
||||
| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
|
||||
| `model` | A model instance that is given a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
|
||||
| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
|
||||
| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Meant to be used in combination with the multi-label `spancat` component with a `Logistic` scoring layer. Defaults to `0.5`. ~~float~~ |
|
||||
| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. Meant to be used together with the `spancat` component and defaults to 0 with `spancat_singlelabel`. ~~Optional[int]~~ |
|
||||
| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
|
||||
| `add_negative_label` <Tag variant="new">3.5.1</Tag> | Whether to learn to predict a special negative label for each unannotated `Span`. This should be `True` when using a `Softmax` classifier layer and so it's `True` by default for `spancat_singlelabel`. Spans with negative labels and their scores are not stored as annotations. ~~bool~~ |
|
||||
| `negative_weight` <Tag variant="new">3.5.1</Tag> | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~ |
|
||||
| `allow_overlap` <Tag variant="new">3.5.1</Tag> | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~ |
|
||||
|
||||
```python
|
||||
%%GITHUB_SPACY/spacy/pipeline/spancat.py
|
||||
|
@ -71,6 +97,7 @@ architectures and their arguments and hyperparameters.
|
|||
>
|
||||
> ```python
|
||||
> # Construction via add_pipe with default model
|
||||
> # Replace 'spancat' with 'spancat_singlelabel' for exclusive classes
|
||||
> spancat = nlp.add_pipe("spancat")
|
||||
>
|
||||
> # Construction via add_pipe with custom model
|
||||
|
@ -86,16 +113,19 @@ Create a new pipeline instance. In your application, you would normally use a
|
|||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | A model instance that is given a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
|
||||
| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
|
||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
|
||||
| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ |
|
||||
| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ |
|
||||
| Name | Description |
|
||||
| --------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | A model instance that is given a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
|
||||
| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
|
||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
|
||||
| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ |
|
||||
| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ |
|
||||
| `allow_overlap` <Tag variant="new">3.5.1</Tag> | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~ |
|
||||
| `add_negative_label` <Tag variant="new">3.5.1</Tag> | Whether to learn to predict a special negative label for each unannotated `Span`. This should be `True` when using a `Softmax` classifier layer and so it's `True` by default for `spancat_singlelabel`. Spans with negative labels and their scores are not stored as annotations. ~~bool~~ |
|
||||
| `negative_weight` <Tag variant="new">3.5.1</Tag> | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~ |
|
||||
|
||||
## SpanCategorizer.\_\_call\_\_ {id="call",tag="method"}
|
||||
|
||||
|
|
|
@ -1096,20 +1096,28 @@ The following operators are supported by the `DependencyMatcher`, most of which
|
|||
come directly from
|
||||
[Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html):
|
||||
|
||||
| Symbol | Description |
|
||||
| --------- | -------------------------------------------------------------------------------------------------------------------- |
|
||||
| `A < B` | `A` is the immediate dependent of `B`. |
|
||||
| `A > B` | `A` is the immediate head of `B`. |
|
||||
| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
|
||||
| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
|
||||
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
|
||||
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
|
||||
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
|
||||
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
|
||||
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
|
||||
| Symbol | Description |
|
||||
| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
|
||||
| `A < B` | `A` is the immediate dependent of `B`. |
|
||||
| `A > B` | `A` is the immediate head of `B`. |
|
||||
| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. |
|
||||
| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. |
|
||||
| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. |
|
||||
| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. |
|
||||
| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. |
|
||||
| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. |
|
||||
| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. |
|
||||
| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. |
|
||||
| `A >+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||
| `A >- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||
| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||
| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
||||
| `A <+ B` <Tag variant="new">3.5.1</Tag> | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. |
|
||||
| `A <- B` <Tag variant="new">3.5.1</Tag> | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. |
|
||||
| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. |
|
||||
| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. |
|
||||
|
||||
### Designing dependency matcher patterns {id="dependencymatcher-patterns"}
|
||||
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
"dev": "next dev",
|
||||
"build": "next build && npm run sitemap && next export",
|
||||
"prebuild": "pip install -r setup/requirements.txt && sh setup/setup.sh",
|
||||
"predev": "npm run prebuild",
|
||||
"sitemap": "next-sitemap --config next-sitemap.config.mjs",
|
||||
"start": "next start",
|
||||
"lint": "next lint",
|
||||
|
|
Loading…
Reference in New Issue
Block a user