Merge branch 'master' into spacy.io

This commit is contained in:
Adriane Boyd 2022-02-11 11:26:08 +01:00
commit d43082289c
64 changed files with 877 additions and 202 deletions

.github/workflows/gputests.yml (new file)

@@ -0,0 +1,21 @@
name: Weekly GPU tests

on:
  schedule:
    - cron: '0 1 * * MON'

jobs:
  weekly-gputests:
    strategy:
      fail-fast: false
      matrix:
        branch: [master, develop, v4]
    runs-on: ubuntu-latest
    steps:
      - name: Trigger buildkite build
        uses: buildkite/trigger-pipeline-action@v1.2.0
        env:
          PIPELINE: explosion-ai/spacy-slow-gpu-tests
          BRANCH: ${{ matrix.branch }}
          MESSAGE: ":github: Weekly GPU + slow tests - triggered from a GitHub Action"
          BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }}

.github/workflows/slowtests.yml (new file)

@@ -0,0 +1,35 @@
name: Daily slow tests

on:
  schedule:
    - cron: '0 0 * * *'

jobs:
  daily-slowtests:
    strategy:
      fail-fast: false
      matrix:
        branch: [master, develop, v4]
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v1
      - name: Get commits from past 24 hours
        id: check_commits
        run: |
          today=$(date '+%Y-%m-%d %H:%M:%S')
          yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
          if git log --after=$yesterday --before=$today | grep commit ; then
            echo "::set-output name=run_tests::true"
          else
            echo "::set-output name=run_tests::false"
          fi

      - name: Trigger buildkite build
        if: steps.check_commits.outputs.run_tests == 'true'
        uses: buildkite/trigger-pipeline-action@v1.2.0
        env:
          PIPELINE: explosion-ai/spacy-slow-tests
          BRANCH: ${{ matrix.branch }}
          MESSAGE: ":github: Daily slow tests - triggered from a GitHub Action"
          BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }}


@@ -1,6 +1,6 @@
The MIT License (MIT)

-Copyright (C) 2016-2021 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
Copyright (C) 2016-2022 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal


@@ -1,11 +1,8 @@
-recursive-include include *.h
recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml
include LICENSE
include README.md
include pyproject.toml
include spacy/py.typed
-recursive-exclude spacy/lang *.json
-recursive-include spacy/lang *.json.gz
-recursive-include spacy/cli *.json *.yml
recursive-include spacy/cli *.yml
recursive-include licenses *
recursive-exclude spacy *.cpp


@@ -31,7 +31,8 @@ pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.8.0,<3.10.0
hypothesis>=3.27.0,<7.0.0
-mypy>=0.910
mypy==0.910
types-dataclasses>=0.1.3; python_version < "3.7"
types-mock>=0.1.1
types-requests
black>=22.0,<23.0


@@ -77,37 +77,39 @@ transformers =
ray =
spacy_ray>=0.1.0,<1.0.0
cuda =
-cupy>=5.0.0b4,<10.0.0
cupy>=5.0.0b4,<11.0.0
cuda80 =
-cupy-cuda80>=5.0.0b4,<10.0.0
cupy-cuda80>=5.0.0b4,<11.0.0
cuda90 =
-cupy-cuda90>=5.0.0b4,<10.0.0
cupy-cuda90>=5.0.0b4,<11.0.0
cuda91 =
-cupy-cuda91>=5.0.0b4,<10.0.0
cupy-cuda91>=5.0.0b4,<11.0.0
cuda92 =
-cupy-cuda92>=5.0.0b4,<10.0.0
cupy-cuda92>=5.0.0b4,<11.0.0
cuda100 =
-cupy-cuda100>=5.0.0b4,<10.0.0
cupy-cuda100>=5.0.0b4,<11.0.0
cuda101 =
-cupy-cuda101>=5.0.0b4,<10.0.0
cupy-cuda101>=5.0.0b4,<11.0.0
cuda102 =
-cupy-cuda102>=5.0.0b4,<10.0.0
cupy-cuda102>=5.0.0b4,<11.0.0
cuda110 =
-cupy-cuda110>=5.0.0b4,<10.0.0
cupy-cuda110>=5.0.0b4,<11.0.0
cuda111 =
-cupy-cuda111>=5.0.0b4,<10.0.0
cupy-cuda111>=5.0.0b4,<11.0.0
cuda112 =
-cupy-cuda112>=5.0.0b4,<10.0.0
cupy-cuda112>=5.0.0b4,<11.0.0
cuda113 =
-cupy-cuda113>=5.0.0b4,<10.0.0
cupy-cuda113>=5.0.0b4,<11.0.0
cuda114 =
-cupy-cuda114>=5.0.0b4,<10.0.0
cupy-cuda114>=5.0.0b4,<11.0.0
cuda115 =
cupy-cuda115>=5.0.0b4,<11.0.0
apple =
thinc-apple-ops>=0.0.4,<1.0.0
# Language tokenizers with external dependencies
ja =
-sudachipy>=0.4.9
-sudachidict_core>=20200330
sudachipy>=0.5.2,!=0.6.1
sudachidict_core>=20211220
ko =
natto-py==0.9.0
th =


@@ -1,3 +1,6 @@
from .errors import Errors
IOB_STRINGS = ("", "I", "O", "B")
IDS = {
"": NULL_ATTR,
@@ -64,7 +67,6 @@ IDS = {
"FLAG61": FLAG61,
"FLAG62": FLAG62,
"FLAG63": FLAG63,
"ID": ID,
"ORTH": ORTH,
"LOWER": LOWER,
@@ -72,7 +74,6 @@ IDS = {
"SHAPE": SHAPE,
"PREFIX": PREFIX,
"SUFFIX": SUFFIX,
"LENGTH": LENGTH,
"LEMMA": LEMMA,
"POS": POS,
@@ -87,7 +88,7 @@ IDS = {
"SPACY": SPACY,
"LANG": LANG,
"MORPH": MORPH,
-"IDX": IDX
"IDX": IDX,
}
@@ -109,28 +110,66 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
"""
inty_attrs = {}
if _do_deprecated:
-if 'F' in stringy_attrs:
if "F" in stringy_attrs:
stringy_attrs["ORTH"] = stringy_attrs.pop("F")
-if 'L' in stringy_attrs:
if "L" in stringy_attrs:
stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
-if 'pos' in stringy_attrs:
if "pos" in stringy_attrs:
stringy_attrs["TAG"] = stringy_attrs.pop("pos")
-if 'morph' in stringy_attrs:
-morphs = stringy_attrs.pop('morph')
-if 'number' in stringy_attrs:
-stringy_attrs.pop('number')
-if 'tenspect' in stringy_attrs:
-stringy_attrs.pop('tenspect')
if "morph" in stringy_attrs:
morphs = stringy_attrs.pop("morph")
if "number" in stringy_attrs:
stringy_attrs.pop("number")
if "tenspect" in stringy_attrs:
stringy_attrs.pop("tenspect")
morph_keys = [
-'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
-'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
-'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
-'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr',
-'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm',
-'NumValue', 'PartType', 'Polite', 'StyleVariant',
-'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
-'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
-'Polarity', 'PrepCase', 'Animacy' # U20
"PunctType",
"PunctSide",
"Other",
"Degree",
"AdvType",
"Number",
"VerbForm",
"PronType",
"Aspect",
"Tense",
"PartType",
"Poss",
"Hyph",
"ConjType",
"NumType",
"Foreign",
"VerbType",
"NounType",
"Gender",
"Mood",
"Negative",
"Tense",
"Voice",
"Abbr",
"Derivation",
"Echo",
"Foreign",
"NameType",
"NounType",
"NumForm",
"NumValue",
"PartType",
"Polite",
"StyleVariant",
"PronType",
"AdjType",
"Person",
"Variant",
"AdpType",
"Reflex",
"Negative",
"Mood",
"Aspect",
"Case",
"Polarity",
"PrepCase",
"Animacy", # U20
]
for key in morph_keys:
if key in stringy_attrs:
@@ -142,8 +181,13 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
for name, value in stringy_attrs.items():
int_key = intify_attr(name)
if int_key is not None:
if int_key == ENT_IOB:
if value in IOB_STRINGS:
value = IOB_STRINGS.index(value)
elif isinstance(value, str):
raise ValueError(Errors.E1025.format(value=value))
if strings_map is not None and isinstance(value, str):
-if hasattr(strings_map, 'add'):
if hasattr(strings_map, "add"):
value = strings_map.add(value)
else:
value = strings_map[value]
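
As a side note (not part of the commit), a minimal sketch of what the new ENT_IOB handling in intify_attrs does, using only values covered by the tests added later in this commit:

from spacy.attrs import ENT_IOB, intify_attrs

# IOB strings are mapped to their integer codes: "" -> 0, "I" -> 1, "O" -> 2, "B" -> 3
assert intify_attrs({"ENT_IOB": "B"}) == {ENT_IOB: 3}

# Any other string now raises E1025 instead of being interned as an arbitrary symbol
try:
    intify_attrs({"ENT_IOB": "XX"})
except ValueError as err:
    print(err)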


@@ -14,7 +14,7 @@ from ..training.initialize import get_sourced_components
from ..schemas import ConfigSchemaTraining
from ..pipeline._parser_internals import nonproj
from ..pipeline._parser_internals.nonproj import DELIMITER
-from ..pipeline import Morphologizer
from ..pipeline import Morphologizer, SpanCategorizer
from ..morphology import Morphology
from ..language import Language
from ..util import registry, resolve_dot_names
@@ -193,6 +193,70 @@ def debug_data(
else:
msg.info("No word vectors present in the package")
if "spancat" in factory_names:
model_labels_spancat = _get_labels_from_spancat(nlp)
has_low_data_warning = False
has_no_neg_warning = False
msg.divider("Span Categorization")
msg.table(model_labels_spancat, header=["Spans Key", "Labels"], divider=True)
msg.text("Label counts in train data: ", show=verbose)
for spans_key, data_labels in gold_train_data["spancat"].items():
msg.text(
f"Key: {spans_key}, {_format_labels(data_labels.items(), counts=True)}",
show=verbose,
)
# Data checks: only take the spans keys in the actual spancat components
data_labels_in_component = {
spans_key: gold_train_data["spancat"][spans_key]
for spans_key in model_labels_spancat.keys()
}
for spans_key, data_labels in data_labels_in_component.items():
for label, count in data_labels.items():
# Check for missing labels
spans_key_in_model = spans_key in model_labels_spancat.keys()
if (spans_key_in_model) and (
label not in model_labels_spancat[spans_key]
):
msg.warn(
f"Label '{label}' is not present in the model labels of key '{spans_key}'. "
"Performance may degrade after training."
)
# Check for low number of examples per label
if count <= NEW_LABEL_THRESHOLD:
msg.warn(
f"Low number of examples for label '{label}' in key '{spans_key}' ({count})"
)
has_low_data_warning = True
# Check for negative examples
with msg.loading("Analyzing label distribution..."):
neg_docs = _get_examples_without_label(
train_dataset, label, "spancat", spans_key
)
if neg_docs == 0:
msg.warn(f"No examples for texts WITHOUT new label '{label}'")
has_no_neg_warning = True
if has_low_data_warning:
msg.text(
f"To train a new span type, your data should include at "
f"least {NEW_LABEL_THRESHOLD} instances of the new label",
show=verbose,
)
else:
msg.good("Good amount of examples for all labels")
if has_no_neg_warning:
msg.text(
"Training data should always include examples of spans "
"in context, as well as examples without a given span "
"type.",
show=verbose,
)
else:
msg.good("Examples without ocurrences available for all labels")
if "ner" in factory_names: if "ner" in factory_names:
# Get all unique NER labels present in the data # Get all unique NER labels present in the data
labels = set( labels = set(
@ -203,6 +267,7 @@ def debug_data(
has_low_data_warning = False has_low_data_warning = False
has_no_neg_warning = False has_no_neg_warning = False
has_ws_ents_error = False has_ws_ents_error = False
has_boundary_cross_ents_warning = False
msg.divider("Named Entity Recognition") msg.divider("Named Entity Recognition")
msg.info(f"{len(model_labels)} label(s)") msg.info(f"{len(model_labels)} label(s)")
@@ -237,17 +302,25 @@ def debug_data(
has_low_data_warning = True
with msg.loading("Analyzing label distribution..."):
-neg_docs = _get_examples_without_label(train_dataset, label)
neg_docs = _get_examples_without_label(train_dataset, label, "ner")
if neg_docs == 0:
msg.warn(f"No examples for texts WITHOUT new label '{label}'")
has_no_neg_warning = True
if gold_train_data["boundary_cross_ents"]:
msg.warn(
f"{gold_train_data['boundary_cross_ents']} entity span(s) crossing sentence boundaries"
)
has_boundary_cross_ents_warning = True
if not has_low_data_warning:
msg.good("Good amount of examples for all labels")
if not has_no_neg_warning:
msg.good("Examples without occurrences available for all labels")
if not has_ws_ents_error:
msg.good("No entities consisting of or starting/ending with whitespace")
if not has_boundary_cross_ents_warning:
msg.good("No entities crossing sentence boundaries")
if has_low_data_warning:
msg.text(
@@ -564,7 +637,9 @@ def _compile_gold(
"deps": Counter(),
"words": Counter(),
"roots": Counter(),
"spancat": dict(),
"ws_ents": 0, "ws_ents": 0,
"boundary_cross_ents": 0,
"n_words": 0, "n_words": 0,
"n_misaligned_words": 0, "n_misaligned_words": 0,
"words_missing_vectors": Counter(), "words_missing_vectors": Counter(),
@ -593,6 +668,7 @@ def _compile_gold(
if nlp.vocab.strings[word] not in nlp.vocab.vectors: if nlp.vocab.strings[word] not in nlp.vocab.vectors:
data["words_missing_vectors"].update([word]) data["words_missing_vectors"].update([word])
if "ner" in factory_names: if "ner" in factory_names:
sent_starts = eg.get_aligned_sent_starts()
for i, label in enumerate(eg.get_aligned_ner()):
if label is None:
continue
@@ -602,8 +678,19 @@ def _compile_gold(
if label.startswith(("B-", "U-")):
combined_label = label.split("-")[1]
data["ner"][combined_label] += 1
if sent_starts[i] == True and label.startswith(("I-", "L-")):
data["boundary_cross_ents"] += 1
elif label == "-": elif label == "-":
data["ner"]["-"] += 1 data["ner"]["-"] += 1
if "spancat" in factory_names:
for span_key in list(eg.reference.spans.keys()):
if span_key not in data["spancat"]:
data["spancat"][span_key] = Counter()
for i, span in enumerate(eg.reference.spans[span_key]):
if span.label_ is None:
continue
else:
data["spancat"][span_key][span.label_] += 1
if "textcat" in factory_names or "textcat_multilabel" in factory_names: if "textcat" in factory_names or "textcat_multilabel" in factory_names:
data["cats"].update(gold.cats) data["cats"].update(gold.cats)
if any(val not in (0, 1) for val in gold.cats.values()): if any(val not in (0, 1) for val in gold.cats.values()):
@ -674,21 +761,57 @@ def _format_labels(
return ", ".join([f"'{l}'" for l in cast(Iterable[str], labels)]) return ", ".join([f"'{l}'" for l in cast(Iterable[str], labels)])
def _get_examples_without_label(data: Sequence[Example], label: str) -> int: def _get_examples_without_label(
data: Sequence[Example],
label: str,
component: Literal["ner", "spancat"] = "ner",
spans_key: Optional[str] = "sc",
) -> int:
count = 0
for eg in data:
if component == "ner":
labels = [
label.split("-")[1]
for label in eg.get_aligned_ner()
if label not in ("O", "-", None)
]
if component == "spancat":
labels = (
[span.label_ for span in eg.reference.spans[spans_key]]
if spans_key in eg.reference.spans
else []
)
if label not in labels:
count += 1
return count
-def _get_labels_from_model(nlp: Language, pipe_name: str) -> Set[str]:
-if pipe_name not in nlp.pipe_names:
-return set()
def _get_labels_from_model(nlp: Language, factory_name: str) -> Set[str]:
pipe_names = [
pipe_name
for pipe_name in nlp.pipe_names
if nlp.get_pipe_meta(pipe_name).factory == factory_name
]
labels: Set[str] = set()
for pipe_name in pipe_names:
pipe = nlp.get_pipe(pipe_name)
-return set(pipe.labels)
labels.update(pipe.labels)
return labels
def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
pipe_names = [
pipe_name
for pipe_name in nlp.pipe_names
if nlp.get_pipe_meta(pipe_name).factory == "spancat"
]
labels: Dict[str, Set[str]] = {}
for pipe_name in pipe_names:
pipe = nlp.get_pipe(pipe_name)
assert isinstance(pipe, SpanCategorizer)
if pipe.key not in labels:
labels[pipe.key] = set()
labels[pipe.key].update(pipe.labels)
return labels


@@ -7,6 +7,7 @@ from collections import defaultdict
from catalogue import RegistryError
import srsly
import sys
import re
from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
from ..schemas import validate, ModelMetaSchema
@@ -109,6 +110,24 @@ def package(
", ".join(meta["requirements"]),
)
if name is not None:
if not name.isidentifier():
msg.fail(
f"Model name ('{name}') is not a valid module name. "
"This is required so it can be imported as a module.",
"We recommend names that use ASCII A-Z, a-z, _ (underscore), "
"and 0-9. "
"For specific details see: https://docs.python.org/3/reference/lexical_analysis.html#identifiers",
exits=1,
)
if not _is_permitted_package_name(name):
msg.fail(
f"Model name ('{name}') is not a permitted package name. "
"This is required to correctly load the model with spacy.load.",
"We recommend names that use ASCII A-Z, a-z, _ (underscore), "
"and 0-9. "
"For specific details see: https://www.python.org/dev/peps/pep-0426/#name",
exits=1,
)
meta["name"] = name meta["name"] = name
if version is not None: if version is not None:
meta["version"] = version meta["version"] = version
@ -162,7 +181,7 @@ def package(
imports="\n".join(f"from . import {m}" for m in imports) imports="\n".join(f"from . import {m}" for m in imports)
) )
create_file(package_path / "__init__.py", init_py) create_file(package_path / "__init__.py", init_py)
msg.good(f"Successfully created package '{model_name_v}'", main_path) msg.good(f"Successfully created package directory '{model_name_v}'", main_path)
if create_sdist: if create_sdist:
with util.working_dir(main_path): with util.working_dir(main_path):
util.run_command([sys.executable, "setup.py", "sdist"], capture=False) util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
@@ -171,8 +190,14 @@ def package(
if create_wheel:
with util.working_dir(main_path):
util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False)
-wheel = main_path / "dist" / f"{model_name_v}{WHEEL_SUFFIX}"
wheel_name_squashed = re.sub("_+", "_", model_name_v)
wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}"
msg.good(f"Successfully created binary wheel", wheel)
if "__" in model_name:
msg.warn(
f"Model name ('{model_name}') contains a run of underscores. "
"Runs of underscores are not significant in installed package names.",
)
def has_wheel() -> bool:
@@ -422,6 +447,14 @@ def _format_label_scheme(data: Dict[str, Any]) -> str:
return md.text
def _is_permitted_package_name(package_name: str) -> bool:
# regex from: https://www.python.org/dev/peps/pep-0426/#name
permitted_match = re.search(
r"^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$", package_name, re.IGNORECASE
)
return permitted_match is not None
TEMPLATE_SETUP = """
#!/usr/bin/env python
import io
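
To make the intent of the two new name checks concrete, here is a small illustrative snippet (not part of the commit); it reuses the _is_permitted_package_name helper added above, and the sample names are made up:

import re

def _is_permitted_package_name(package_name: str) -> bool:
    # same regex as above, from PEP 426
    permitted_match = re.search(
        r"^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$", package_name, re.IGNORECASE
    )
    return permitted_match is not None

# "en_core-web" passes the PEP 426 check but is not a valid module name,
# "_en_core_web" is a valid identifier but not a permitted package name
for name in ["en_core_web", "en_core-web", "_en_core_web"]:
    print(name, name.isidentifier(), _is_permitted_package_name(name))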


@@ -1,6 +1,7 @@
from typing import Any, Dict, Optional
from pathlib import Path
from wasabi import msg
import os
import re
import shutil
import requests
@@ -129,10 +130,17 @@ def fetch_asset(
the asset failed.
"""
dest_path = (project_path / dest).resolve()
-if dest_path.exists() and checksum:
if dest_path.exists():
# If there's already a file, check for checksum
if checksum:
if checksum == get_checksum(dest_path):
msg.good(f"Skipping download with matching checksum: {dest}")
return
else:
# If there's not a checksum, make sure the file is a possibly valid size
if os.path.getsize(dest_path) == 0:
msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}")
os.remove(dest_path)
# We might as well support the user here and create parent directories in
# case the asset dir isn't listed as a dir to create in the project.yml
if not dest_path.parent.exists():


@@ -6,6 +6,11 @@ can help generate the best possible configuration, given a user's requirements.
[paths]
train = null
dev = null
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = null
{% else -%}
vectors = "{{ word_vectors }}"
{% endif -%}
[system]
{% if use_transformer -%}
@@ -421,8 +426,4 @@ compound = 1.001
{% endif %}
[initialize]
-{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = ${paths.vectors}
-{% else -%}
-vectors = "{{ word_vectors }}"
-{% endif -%}


@@ -68,12 +68,14 @@ seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
-# Controls early-stopping. 0 disables early stopping.
# Controls early-stopping, i.e., the number of steps to continue without
# improvement before stopping. 0 disables early stopping.
patience = 1600
# Number of epochs. 0 means unlimited. If >= 0, train corpus is loaded once in
# memory and shuffled within the training loop. -1 means stream train corpus
# rather than loading in memory with no shuffling within the training loop.
max_epochs = 0
# Maximum number of update steps to train for. 0 means an unlimited number of steps.
max_steps = 20000
eval_frequency = 200
# Control how scores are printed and checkpoints are evaluated.


@@ -18,7 +18,7 @@ DEFAULT_LABEL_COLORS = {
"LOC": "#ff9561",
"PERSON": "#aa9cfc",
"NORP": "#c887fb",
-"FACILITY": "#9cc9cc",
"FAC": "#9cc9cc",
"EVENT": "#ffeb80",
"LAW": "#ff8197",
"LANGUAGE": "#ff8197",


@@ -483,7 +483,7 @@ class Errors(metaclass=ErrorsWithCodes):
"components, since spans are only views of the Doc. Use Doc and "
"Token attributes (or custom extension attributes) only and remove "
"the following: {attrs}")
-E181 = ("Received invalid attributes for unkown object {obj}: {attrs}. "
E181 = ("Received invalid attributes for unknown object {obj}: {attrs}. "
"Only Doc and Token attributes are supported.")
E182 = ("Received invalid attribute declaration: {attr}\nDid you forget "
"to define the attribute? For example: `{attr}.???`")
@@ -888,9 +888,12 @@ class Errors(metaclass=ErrorsWithCodes):
E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
"Non-UD tags should use the `tag` property.")
E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
-E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't exist.")
E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
"exist.")
-E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler patterns.")
E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler "
"patterns.")
E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
"supported values are: 'I', 'O', 'B' and ''")
# Deprecated model shortcuts, only used in errors and warnings


@@ -310,7 +310,6 @@ GLOSSARY = {
"re": "repeated element",
"rs": "reported speech",
"sb": "subject",
-"sb": "subject",
"sbp": "passivized subject (PP)",
"sp": "subject or predicate",
"svp": "separable verb prefix",


@@ -90,7 +90,7 @@ _eleven_to_beyond = [
"अड़सठ",
"उनहत्तर",
"सत्तर",
-"इकहत्तर"
"इकहत्तर",
"बहत्तर",
"तिहत्तर",
"चौहत्तर",


@@ -59,7 +59,7 @@ sentences = [
"Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
"Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”.",
"Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares.",
-"Carros autônomos empurram a responsabilidade do seguro para os fabricantes.."
"Carros autônomos empurram a responsabilidade do seguro para os fabricantes..",
"São Francisco considera banir os robôs de entrega que andam pelas calçadas.",
"Londres é a maior cidade do Reino Unido.",
# Translations from English:


@@ -131,7 +131,7 @@ class Language:
self,
vocab: Union[Vocab, bool] = True,
*,
-max_length: int = 10 ** 6,
max_length: int = 10**6,
meta: Dict[str, Any] = {},
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
batch_size: int = 1000,
@@ -354,12 +354,15 @@ class Language:
@property
def pipe_labels(self) -> Dict[str, List[str]]:
"""Get the labels set by the pipeline components, if available (if
-the component exposes a labels property).
the component exposes a labels property and the labels are not
hidden).
RETURNS (Dict[str, List[str]]): Labels keyed by component name.
"""
labels = {}
for name, pipe in self._components:
if hasattr(pipe, "hide_labels") and pipe.hide_labels is True:
continue
if hasattr(pipe, "labels"):
labels[name] = list(pipe.labels)
return SimpleFrozenDict(labels)
@@ -522,7 +525,7 @@ class Language:
requires: Iterable[str] = SimpleFrozenList(),
retokenizes: bool = False,
func: Optional["Pipe"] = None,
-) -> Callable:
) -> Callable[..., Any]:
"""Register a new pipeline component. Can be used for stateless function
components that don't require a separate factory. Can be used as a
decorator on a function or classmethod, or called as a function with the
@@ -1285,9 +1288,9 @@ class Language:
)
except IOError:
raise IOError(Errors.E884.format(vectors=I["vectors"]))
-if self.vocab.vectors.data.shape[1] >= 1:
if self.vocab.vectors.shape[1] >= 1:
ops = get_current_ops()
-self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
self.vocab.vectors.to_ops(ops)
if hasattr(self.tokenizer, "initialize"):
tok_settings = validate_init_settings(
self.tokenizer.initialize, # type: ignore[union-attr]
@@ -1332,8 +1335,8 @@ class Language:
DOCS: https://spacy.io/api/language#resume_training
"""
ops = get_current_ops()
-if self.vocab.vectors.data.shape[1] >= 1:
-self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
if self.vocab.vectors.shape[1] >= 1:
self.vocab.vectors.to_ops(ops)
for name, proc in self.pipeline:
if hasattr(proc, "_rehearsal_model"):
proc._rehearsal_model = deepcopy(proc.model) # type: ignore[attr-defined]
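
For illustration only (not part of the commit), the effect of the new hide_labels hook on Language.pipe_labels, assuming a blank English pipeline with a senter and an ner component:

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("senter")  # SentenceRecognizer sets hide_labels = True
nlp.add_pipe("ner")

# The component itself still reports its internal labels ...
print(nlp.get_pipe("senter").labels)  # ('I', 'S')
# ... but Language.pipe_labels now skips components that hide their labels
print("senter" in nlp.pipe_labels)  # False
print("ner" in nlp.pipe_labels)  # True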


@@ -19,7 +19,7 @@ class Lexeme:
@property
def vector_norm(self) -> float: ...
vector: Floats1d
-rank: str
rank: int
sentiment: float
@property
def orth_(self) -> str: ...


@@ -130,7 +130,9 @@ cdef class Lexeme:
return 0.0
vector = self.vector
xp = get_array_module(vector)
-return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item()
@property
def has_vector(self):


@@ -0,0 +1,66 @@
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from .matcher import Matcher
from ..vocab import Vocab
from ..tokens.doc import Doc
from ..tokens.span import Span
class DependencyMatcher:
"""Match dependency parse tree based on pattern rules."""
_patterns: Dict[str, List[Any]]
_raw_patterns: Dict[str, List[Any]]
_tokens_to_key: Dict[str, List[Any]]
_root: Dict[str, List[Any]]
_tree: Dict[str, List[Any]]
_callbacks: Dict[
Any, Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
]
_ops: Dict[str, Any]
vocab: Vocab
_matcher: Matcher
def __init__(self, vocab: Vocab, *, validate: bool = ...) -> None: ...
def __reduce__(
self,
) -> Tuple[
Callable[
[Vocab, Dict[str, Any], Dict[str, Callable[..., Any]]], DependencyMatcher
],
Tuple[
Vocab,
Dict[str, List[Any]],
Dict[
str,
Callable[
[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any
],
],
],
None,
None,
]: ...
def __len__(self) -> int: ...
def __contains__(self, key: Union[str, int]) -> bool: ...
def add(
self,
key: Union[str, int],
patterns: List[List[Dict[str, Any]]],
*,
on_match: Optional[
Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
] = ...
) -> None: ...
def has_key(self, key: Union[str, int]) -> bool: ...
def get(
self, key: Union[str, int], default: Optional[Any] = ...
) -> Tuple[
Optional[
Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
],
List[List[Dict[str, Any]]],
]: ...
def remove(self, key: Union[str, int]) -> None: ...
def __call__(self, doclike: Union[Doc, Span]) -> List[Tuple[int, List[int]]]: ...
def unpickle_matcher(
vocab: Vocab, patterns: Dict[str, Any], callbacks: Dict[str, Callable[..., Any]]
) -> DependencyMatcher: ...


@@ -1,4 +1,6 @@
-from typing import Any, List, Dict, Tuple, Optional, Callable, Union, Iterator, Iterable
from typing import Any, List, Dict, Tuple, Optional, Callable, Union
from typing import Iterator, Iterable, overload
from ..compat import Literal
from ..vocab import Vocab
from ..tokens import Doc, Span
@@ -31,12 +33,22 @@ class Matcher:
) -> Union[
Iterator[Tuple[Tuple[Doc, Any], Any]], Iterator[Tuple[Doc, Any]], Iterator[Doc]
]: ...
@overload
def __call__(
self,
doclike: Union[Doc, Span],
*,
-as_spans: bool = ...,
as_spans: Literal[False] = ...,
allow_missing: bool = ...,
with_alignments: bool = ...
-) -> Union[List[Tuple[int, int, int]], List[Span]]: ...
) -> List[Tuple[int, int, int]]: ...
@overload
def __call__(
self,
doclike: Union[Doc, Span],
*,
as_spans: Literal[True],
allow_missing: bool = ...,
with_alignments: bool = ...
) -> List[Span]: ...
def _normalize_key(self, key: Any) -> Any: ...
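
A short usage sketch (not from the commit) of what the new overloads express for type checkers: with as_spans=False the matcher returns (match_id, start, end) tuples, with as_spans=True it returns Span objects.

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("HELLO", [[{"LOWER": "hello"}]])
doc = nlp("Hello world")

triples = matcher(doc)               # List[Tuple[int, int, int]]
spans = matcher(doc, as_spans=True)  # List[Span]
print(triples, [s.text for s in spans])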


@@ -18,7 +18,7 @@ from ..tokens.doc cimport Doc, get_token_attr_for_matcher
from ..tokens.span cimport Span
from ..tokens.token cimport Token
from ..tokens.morphanalysis cimport MorphAnalysis
-from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
from ..schemas import validate_token_pattern
from ..errors import Errors, MatchPatternError, Warnings
@@ -798,6 +798,9 @@ def _get_attr_values(spec, string_store):
attr = "SENT_START"
attr = IDS.get(attr)
if isinstance(value, str):
if attr == ENT_IOB and value in Token.iob_strings():
value = Token.iob_strings().index(value)
else:
value = string_store.add(value)
elif isinstance(value, bool):
value = int(value)


@@ -1,6 +1,6 @@
-from typing import List, Tuple, Union, Optional, Callable, Any, Dict
from typing import List, Tuple, Union, Optional, Callable, Any, Dict, overload
from ..compat import Literal
-from . import Matcher
from .matcher import Matcher
from ..vocab import Vocab
from ..tokens import Doc, Span
@@ -14,16 +14,24 @@ class PhraseMatcher:
def add(
self,
key: str,
-docs: List[List[Dict[str, Any]]],
docs: List[Doc],
*,
on_match: Optional[
Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
] = ...,
) -> None: ...
def remove(self, key: str) -> None: ...
@overload
def __call__(
self,
doclike: Union[Doc, Span],
*,
-as_spans: bool = ...,
as_spans: Literal[False] = ...,
-) -> Union[List[Tuple[int, int, int]], List[Span]]: ...
) -> List[Tuple[int, int, int]]: ...
@overload
def __call__(
self,
doclike: Union[Doc, Span],
*,
as_spans: Literal[True],
) -> List[Span]: ...


@@ -23,7 +23,7 @@ def create_pretrain_vectors(
maxout_pieces: int, hidden_size: int, loss: str
) -> Callable[["Vocab", Model], Model]:
def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
-if vocab.vectors.data.shape[1] == 0:
if vocab.vectors.shape[1] == 0:
raise ValueError(Errors.E875)
model = build_cloze_multi_task_model(
vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
@@ -85,7 +85,7 @@ def get_characters_loss(ops, docs, prediction, nr_char):
target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
target = target.reshape((-1, 256 * nr_char))
diff = prediction - target
-loss = (diff ** 2).sum()
loss = (diff**2).sum()
d_target = diff / float(prediction.shape[0])
return loss, d_target
@@ -116,7 +116,7 @@ def build_multi_task_model(
def build_cloze_multi_task_model(
vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int
) -> Model:
-nO = vocab.vectors.data.shape[1]
nO = vocab.vectors.shape[1]
output_layer = chain(
cast(Model[List["Floats2d"], Floats2d], list2array()),
Maxout(


@@ -94,7 +94,7 @@ def init(
nM = model.get_dim("nM") if model.has_dim("nM") else None
nO = model.get_dim("nO") if model.has_dim("nO") else None
if X is not None and len(X):
-nM = X[0].vocab.vectors.data.shape[1]
nM = X[0].vocab.vectors.shape[1]
if Y is not None:
nO = Y.data.shape[1]


@@ -1,3 +1,4 @@
from cython.operator cimport dereference as deref, preincrement as incr
from libc.string cimport memcpy, memset
from libc.stdlib cimport calloc, free
from libc.stdint cimport uint32_t, uint64_t
@@ -184,16 +185,20 @@ cdef cppclass StateC:
int L(int head, int idx) nogil const:
if idx < 1 or this._left_arcs.size() == 0:
return -1
-cdef vector[int] lefts
-for i in range(this._left_arcs.size()):
-arc = this._left_arcs.at(i)
# Work backwards through left-arcs to find the arc at the
# requested index more quickly.
cdef size_t child_index = 0
it = this._left_arcs.const_rbegin()
while it != this._left_arcs.rend():
arc = deref(it)
if arc.head == head and arc.child != -1 and arc.child < head:
-lefts.push_back(arc.child)
-idx = (<int>lefts.size()) - idx
-if idx < 0:
child_index += 1
if child_index == idx:
return arc.child
incr(it)
return -1
-else:
-return lefts.at(idx)
int R(int head, int idx) nogil const:
if idx < 1 or this._right_arcs.size() == 0:
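
For intuition (not part of the commit, and plain Python rather than the Cython above), the new lookup walks the recorded left-arcs from the end instead of first materialising a `lefts` list; `left_arcs` here is an assumed stand-in for the internal arc vector:

def nth_left_child(left_arcs, head, idx):
    # Return the idx-th left child of `head`, counted from the most recently
    # added left-arc backwards; -1 if there is no such child.
    if idx < 1 or not left_arcs:
        return -1
    child_index = 0
    for arc_head, child in reversed(left_arcs):
        if arc_head == head and child != -1 and child < head:
            child_index += 1
            if child_index == idx:
                return child
    return -1

print(nth_left_child([(5, 1), (5, 3), (5, 4)], head=5, idx=1))  # -> 4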


@@ -604,7 +604,7 @@ cdef class ArcEager(TransitionSystem):
actions[SHIFT][''] += 1
if min_freq is not None:
for action, label_freqs in actions.items():
-for label, freq in list(label_freqs.items()):
for label, freq in label_freqs.copy().items():
if freq < min_freq:
label_freqs.pop(label)
# Ensure these actions are present


@@ -26,6 +26,8 @@ class Pipe:
@property
def labels(self) -> Tuple[str, ...]: ...
@property
def hide_labels(self) -> bool: ...
@property
def label_data(self) -> Any: ...
def _require_labels(self) -> None: ...
def set_error_handler(


@@ -102,6 +102,10 @@ cdef class Pipe:
def labels(self) -> Tuple[str, ...]:
return tuple()
@property
def hide_labels(self) -> bool:
return False
@property
def label_data(self):
"""Optional JSON-serializable data that would be sufficient to recreate


@@ -99,6 +99,10 @@ class SentenceRecognizer(Tagger):
# are 0
return tuple(["I", "S"])
@property
def hide_labels(self):
return True
@property
def label_data(self):
return None


@@ -377,7 +377,7 @@ class SpanCategorizer(TrainablePipe):
# If the prediction is 0.9 and it's false, the gradient will be
# 0.9 (0.9 - 0.0)
d_scores = scores - target
-loss = float((d_scores ** 2).sum())
loss = float((d_scores**2).sum())
return loss, d_scores
def initialize(
@@ -412,7 +412,7 @@ class SpanCategorizer(TrainablePipe):
self._require_labels()
if subbatch:
docs = [eg.x for eg in subbatch]
-spans = self.suggester(docs)
spans = build_ngram_suggester(sizes=[1])(docs)
Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels))
self.model.initialize(X=(docs, spans), Y=Y)
else:


@@ -281,7 +281,7 @@ class TextCategorizer(TrainablePipe):
bp_scores(gradient)
if sgd is not None:
self.finish_update(sgd)
-losses[self.name] += (gradient ** 2).sum()
losses[self.name] += (gradient**2).sum()
return losses
def _examples_to_truth(
@@ -315,7 +315,7 @@ class TextCategorizer(TrainablePipe):
not_missing = self.model.ops.asarray(not_missing) # type: ignore
d_scores = (scores - truths) / scores.shape[0]
d_scores *= not_missing
-mean_square_error = (d_scores ** 2).sum(axis=1).mean()
mean_square_error = (d_scores**2).sum(axis=1).mean()
return float(mean_square_error), d_scores
def add_label(self, label: str) -> int:


@@ -1,5 +1,6 @@
from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
from typing import Iterable, TypeVar, TYPE_CHECKING
from .compat import Literal
from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator, create_model
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
@@ -209,6 +210,7 @@ NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
UnderscoreValue = Union[
TokenPatternString, TokenPatternNumber, str, int, float, list, bool
]
IobValue = Literal["", "I", "O", "B", 0, 1, 2, 3]
class TokenPattern(BaseModel):
@@ -222,6 +224,7 @@ class TokenPattern(BaseModel):
lemma: Optional[StringValue] = None
shape: Optional[StringValue] = None
ent_type: Optional[StringValue] = None
ent_iob: Optional[IobValue] = None
ent_id: Optional[StringValue] = None
ent_kb_id: Optional[StringValue] = None
norm: Optional[StringValue] = None


@@ -567,6 +567,7 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
"Merging the docs is fun.",
"",
"They don't think alike. ",
"",
"Another doc.",
]
en_texts_without_empty = [t for t in en_texts if len(t)]
@@ -574,9 +575,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
en_docs = [en_tokenizer(text) for text in en_texts]
en_docs[0].spans["group"] = [en_docs[0][1:4]]
en_docs[2].spans["group"] = [en_docs[2][1:4]]
-en_docs[3].spans["group"] = [en_docs[3][0:1]]
en_docs[4].spans["group"] = [en_docs[4][0:1]]
span_group_texts = sorted(
-[en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
[en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text]
)
de_doc = de_tokenizer(de_text)
Token.set_extension("is_ambiguous", default=False)
@@ -683,6 +684,7 @@ def test_has_annotation(en_vocab):
attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")
for attr in attrs:
assert not doc.has_annotation(attr)
assert not doc.has_annotation(attr, require_complete=True)
doc[0].tag_ = "A"
doc[0].pos_ = "X"
@@ -708,6 +710,27 @@ def test_has_annotation(en_vocab):
assert doc.has_annotation(attr, require_complete=True)
def test_has_annotation_sents(en_vocab):
doc = Doc(en_vocab, words=["Hello", "beautiful", "world"])
attrs = ("SENT_START", "IS_SENT_START", "IS_SENT_END")
for attr in attrs:
assert not doc.has_annotation(attr)
assert not doc.has_annotation(attr, require_complete=True)
# The first token (index 0) is always assumed to be a sentence start,
# and ignored by the check in doc.has_annotation
doc[1].is_sent_start = False
for attr in attrs:
assert doc.has_annotation(attr)
assert not doc.has_annotation(attr, require_complete=True)
doc[2].is_sent_start = False
for attr in attrs:
assert doc.has_annotation(attr)
assert doc.has_annotation(attr, require_complete=True)
def test_is_flags_deprecated(en_tokenizer):
doc = en_tokenizer("test")
with pytest.deprecated_call():


@@ -1,4 +1,5 @@
import pytest
from spacy.attrs import intify_attrs, ENT_IOB
from spacy.attrs import IS_ALPHA, LEMMA, NORM, ORTH, intify_attrs
from spacy.lang.en.stop_words import STOP_WORDS
@@ -33,6 +34,38 @@ def test_attrs_do_deprecated(text):
assert int_attrs == {ORTH: 10, IS_ALPHA: True}
def test_attrs_ent_iob_intify():
int_attrs = intify_attrs({"ENT_IOB": ""})
assert int_attrs == {ENT_IOB: 0}
int_attrs = intify_attrs({"ENT_IOB": "I"})
assert int_attrs == {ENT_IOB: 1}
int_attrs = intify_attrs({"ENT_IOB": "O"})
assert int_attrs == {ENT_IOB: 2}
int_attrs = intify_attrs({"ENT_IOB": "B"})
assert int_attrs == {ENT_IOB: 3}
int_attrs = intify_attrs({ENT_IOB: ""})
assert int_attrs == {ENT_IOB: 0}
int_attrs = intify_attrs({ENT_IOB: "I"})
assert int_attrs == {ENT_IOB: 1}
int_attrs = intify_attrs({ENT_IOB: "O"})
assert int_attrs == {ENT_IOB: 2}
int_attrs = intify_attrs({ENT_IOB: "B"})
assert int_attrs == {ENT_IOB: 3}
with pytest.raises(ValueError):
int_attrs = intify_attrs({"ENT_IOB": "XX"})
with pytest.raises(ValueError):
int_attrs = intify_attrs({ENT_IOB: "XX"})
@pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)]) @pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)])
def test_lex_attrs_is_punct(text, match): def test_lex_attrs_is_punct(text, match):
assert is_punct(text) == match assert is_punct(text) == match


@@ -642,3 +642,30 @@ def test_matcher_no_zero_length(en_vocab):
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
assert len(matcher(doc)) == 0
def test_matcher_ent_iob_key(en_vocab):
"""Test that patterns with ent_iob works correctly."""
matcher = Matcher(en_vocab)
matcher.add("Rule", [[{"ENT_IOB": "I"}]])
doc1 = Doc(en_vocab, words=["I", "visited", "New", "York", "and", "California"])
doc1.ents = [Span(doc1, 2, 4, label="GPE"), Span(doc1, 5, 6, label="GPE")]
doc2 = Doc(en_vocab, words=["I", "visited", "my", "friend", "Alicia"])
doc2.ents = [Span(doc2, 4, 5, label="PERSON")]
matches1 = [doc1[start:end].text for _, start, end in matcher(doc1)]
matches2 = [doc2[start:end].text for _, start, end in matcher(doc2)]
assert len(matches1) == 1
assert matches1[0] == "York"
assert len(matches2) == 0
matcher = Matcher(en_vocab) # Test iob pattern with operators
matcher.add("Rule", [[{"ENT_IOB": "I", "OP": "+"}]])
doc = Doc(
en_vocab, words=["I", "visited", "my", "friend", "Anna", "Maria", "Esperanza"]
)
doc.ents = [Span(doc, 4, 7, label="PERSON")]
matches = [doc[start:end].text for _, start, end in matcher(doc)]
assert len(matches) == 3
assert matches[0] == "Maria"
assert matches[1] == "Maria Esperanza"
assert matches[2] == "Esperanza"


@@ -12,6 +12,7 @@ TEST_PATTERNS = [
([{"IS_PUNCT": True, "OP": "$"}], 1, 1),
([{"_": "foo"}], 1, 1),
('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
([{"ENT_IOB": "foo"}], 1, 1),
([1, 2, 3], 3, 1),
# Bad patterns flagged outside of Matcher
([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0)


@@ -12,6 +12,7 @@ def test_build_dependencies():
"flake8",
"hypothesis",
"pre-commit",
"black",
"mypy",
"types-dataclasses",
"types-mock",


@@ -97,3 +97,7 @@ def test_overfitting_IO():
]
assert_equal(batch_deps_1, batch_deps_2)
assert_equal(batch_deps_1, no_batch_deps)
# test internal pipe labels vs. Language.pipe_labels with hidden labels
assert nlp.get_pipe("senter").labels == ("I", "S")
assert "senter" not in nlp.pipe_labels


@@ -80,6 +80,8 @@ def test_explicit_labels():
assert spancat.labels == ("PERSON", "LOC")
# TODO figure out why this is flaky
@pytest.mark.skip(reason="Test is unreliable for unknown reason")
def test_doc_gc():
# If the Doc object is garbage collected, the spans won't be functional afterwards
nlp = Language()
@@ -97,6 +99,7 @@ def test_doc_gc():
assert isinstance(spangroups, SpanGroups)
for key, spangroup in spangroups.items():
assert isinstance(spangroup, SpanGroup)
# XXX This fails with length 0 sometimes
assert len(spangroup) > 0
with pytest.raises(RuntimeError):
span = spangroup[0]


@@ -12,14 +12,18 @@ from spacy.cli._util import is_subpath_of, load_project_config
from spacy.cli._util import parse_config_overrides, string_to_list
from spacy.cli._util import substitute_project_variables
from spacy.cli._util import validate_project_commands
from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
from spacy.cli.debug_data import _get_labels_from_spancat
from spacy.cli.download import get_compatibility, get_version
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
from spacy.cli.package import get_third_party_dependencies
from spacy.cli.package import _is_permitted_package_name
from spacy.cli.validate import get_model_pkgs
from spacy.lang.en import English
from spacy.lang.nl import Dutch
from spacy.language import Language
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
from spacy.tokens import Doc
from spacy.training import Example, docs_to_json, offsets_to_biluo_tags
from spacy.training.converters import conll_ner_to_docs, conllu_to_docs
from spacy.training.converters import iob_to_docs
@@ -665,3 +669,54 @@ def test_get_third_party_dependencies():
)
def test_is_subpath_of(parent, child, expected):
assert is_subpath_of(parent, child) == expected
@pytest.mark.slow
@pytest.mark.parametrize(
"factory_name,pipe_name",
[
("ner", "ner"),
("ner", "my_ner"),
("spancat", "spancat"),
("spancat", "my_spancat"),
],
)
def test_get_labels_from_model(factory_name, pipe_name):
labels = ("A", "B")
nlp = English()
pipe = nlp.add_pipe(factory_name, name=pipe_name)
for label in labels:
pipe.add_label(label)
nlp.initialize()
assert nlp.get_pipe(pipe_name).labels == labels
if factory_name == "spancat":
assert _get_labels_from_spancat(nlp)[pipe.key] == set(labels)
else:
assert _get_labels_from_model(nlp, factory_name) == set(labels)
def test_permitted_package_names():
# https://www.python.org/dev/peps/pep-0426/#name
assert _is_permitted_package_name("Meine_Bäume") == False
assert _is_permitted_package_name("_package") == False
assert _is_permitted_package_name("package_") == False
assert _is_permitted_package_name(".package") == False
assert _is_permitted_package_name("package.") == False
assert _is_permitted_package_name("-package") == False
assert _is_permitted_package_name("package-") == False
def test_debug_data_compile_gold():
nlp = English()
pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "O", "B-ENT"])
eg = Example(pred, ref)
data = _compile_gold([eg], ["ner"], nlp, True)
assert data["boundary_cross_ents"] == 0
pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "B-ENT", "I-ENT"])
eg = Example(pred, ref)
data = _compile_gold([eg], ["ner"], nlp, True)
assert data["boundary_cross_ents"] == 1

View File

@ -9,6 +9,7 @@ from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.training import Example from spacy.training import Example
from spacy.util import compile_prefix_regex, compile_suffix_regex, ensure_path from spacy.util import compile_prefix_regex, compile_suffix_regex, ensure_path
from spacy.util import compile_infix_regex
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.symbols import ORTH from spacy.symbols import ORTH
@ -503,3 +504,20 @@ def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
assert tokens == ["a", "10", "."] assert tokens == ["a", "10", "."]
explain_tokens = [t[1] for t in tokenizer.explain("a10.")] explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
assert tokens == explain_tokens assert tokens == explain_tokens
def test_tokenizer_infix_prefix(en_vocab):
# an infix match at the start of the string should not be split off as a separate infix token
infixes = ["±"]
suffixes = ["%"]
infix_re = compile_infix_regex(infixes)
suffix_re = compile_suffix_regex(suffixes)
tokenizer = Tokenizer(
en_vocab,
infix_finditer=infix_re.finditer,
suffix_search=suffix_re.search,
)
tokens = [t.text for t in tokenizer("±10%")]
assert tokens == ["±10", "%"]
explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
assert tokens == explain_tokens

View File

@ -35,6 +35,7 @@ def test_vectors_similarity_LL(vocab, vectors):
assert lex1.vector_norm != 0 assert lex1.vector_norm != 0
assert lex2.vector_norm != 0 assert lex2.vector_norm != 0
assert lex1.vector[0] != lex2.vector[0] and lex1.vector[1] != lex2.vector[1] assert lex1.vector[0] != lex2.vector[0] and lex1.vector[1] != lex2.vector[1]
assert isinstance(lex1.similarity(lex2), float)
assert numpy.isclose(lex1.similarity(lex2), get_cosine(vec1, vec2)) assert numpy.isclose(lex1.similarity(lex2), get_cosine(vec1, vec2))
assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1)) assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))
@ -47,25 +48,46 @@ def test_vectors_similarity_TT(vocab, vectors):
assert doc[0].vector_norm != 0 assert doc[0].vector_norm != 0
assert doc[1].vector_norm != 0 assert doc[1].vector_norm != 0
assert doc[0].vector[0] != doc[1].vector[0] and doc[0].vector[1] != doc[1].vector[1] assert doc[0].vector[0] != doc[1].vector[0] and doc[0].vector[1] != doc[1].vector[1]
assert isinstance(doc[0].similarity(doc[1]), float)
assert numpy.isclose(doc[0].similarity(doc[1]), get_cosine(vec1, vec2)) assert numpy.isclose(doc[0].similarity(doc[1]), get_cosine(vec1, vec2))
assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1])) assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))
def test_vectors_similarity_SS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
assert isinstance(doc[0:1].similarity(doc[0:2]), float)
assert doc[0:1].similarity(doc[0:2]) == doc[0:2].similarity(doc[0:1])
def test_vectors_similarity_DD(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc1 = Doc(vocab, words=[word1, word2])
doc2 = Doc(vocab, words=[word2, word1])
assert isinstance(doc1.similarity(doc2), float)
assert doc1.similarity(doc2) == doc2.similarity(doc1)
def test_vectors_similarity_TD(vocab, vectors): def test_vectors_similarity_TD(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors [(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2]) doc = Doc(vocab, words=[word1, word2])
with pytest.warns(UserWarning): with pytest.warns(UserWarning):
assert isinstance(doc.similarity(doc[0]), float)
assert isinstance(doc[0].similarity(doc), float)
assert doc.similarity(doc[0]) == doc[0].similarity(doc) assert doc.similarity(doc[0]) == doc[0].similarity(doc)
def test_vectors_similarity_DS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
def test_vectors_similarity_TS(vocab, vectors): def test_vectors_similarity_TS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors [(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2]) doc = Doc(vocab, words=[word1, word2])
with pytest.warns(UserWarning): with pytest.warns(UserWarning):
assert isinstance(doc[:2].similarity(doc[0]), float)
assert isinstance(doc[0].similarity(doc[-2]), float)
assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2]) assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
def test_vectors_similarity_DS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
assert isinstance(doc.similarity(doc[:2]), float)
assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)

View File

@ -421,7 +421,7 @@ def test_vector_is_oov():
def test_init_vectors_unset(): def test_init_vectors_unset():
v = Vectors(shape=(10, 10)) v = Vectors(shape=(10, 10))
assert v.is_full is False assert v.is_full is False
assert v.data.shape == (10, 10) assert v.shape == (10, 10)
with pytest.raises(ValueError): with pytest.raises(ValueError):
v = Vectors(shape=(10, 10), mode="floret") v = Vectors(shape=(10, 10), mode="floret")
@ -514,7 +514,7 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
# rows: 2 rows per ngram # rows: 2 rows per ngram
rows = OPS.xp.asarray( rows = OPS.xp.asarray(
[ [
h % nlp.vocab.vectors.data.shape[0] h % nlp.vocab.vectors.shape[0]
for ngram in ngrams for ngram in ngrams
for h in nlp.vocab.vectors._get_ngram_hashes(ngram) for h in nlp.vocab.vectors._get_ngram_hashes(ngram)
], ],
@ -544,17 +544,17 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
# an empty key returns 0s # an empty key returns 0s
assert_equal( assert_equal(
OPS.to_numpy(nlp.vocab[""].vector), OPS.to_numpy(nlp.vocab[""].vector),
numpy.zeros((nlp.vocab.vectors.data.shape[0],)), numpy.zeros((nlp.vocab.vectors.shape[0],)),
) )
# an empty batch returns 0s # an empty batch returns 0s
assert_equal( assert_equal(
OPS.to_numpy(nlp.vocab.vectors.get_batch([""])), OPS.to_numpy(nlp.vocab.vectors.get_batch([""])),
numpy.zeros((1, nlp.vocab.vectors.data.shape[0])), numpy.zeros((1, nlp.vocab.vectors.shape[0])),
) )
# an empty key within a batch returns 0s # an empty key within a batch returns 0s
assert_equal( assert_equal(
OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]), OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]),
numpy.zeros((nlp.vocab.vectors.data.shape[0],)), numpy.zeros((nlp.vocab.vectors.shape[0],)),
) )
# the loaded ngram vector table cannot be modified # the loaded ngram vector table cannot be modified

View File

@ -45,10 +45,12 @@ cdef class Tokenizer:
`re.compile(string).search` to match suffixes. `re.compile(string).search` to match suffixes.
`infix_finditer` (callable): A function matching the signature of `infix_finditer` (callable): A function matching the signature of
`re.compile(string).finditer` to find infixes. `re.compile(string).finditer` to find infixes.
token_match (callable): A boolean function matching strings to be token_match (callable): A function matching the signature of
`re.compile(string).match`, for matching strings to be
recognized as tokens. recognized as tokens.
url_match (callable): A boolean function matching strings to be url_match (callable): A function matching the signature of
recognized as tokens after considering prefixes and suffixes. `re.compile(string).match`, for matching strings to be
recognized as urls.
EXAMPLE: EXAMPLE:
>>> tokenizer = Tokenizer(nlp.vocab) >>> tokenizer = Tokenizer(nlp.vocab)
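For illustration only (not part of the diff): a minimal sketch of passing `token_match` and `url_match` callables that follow the `re.compile(string).match` signature described in the updated docstring. The patterns below are hypothetical.

```python
import re
import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.blank("en")
# hypothetical patterns; both callables follow the re.compile(string).match signature
token_match_re = re.compile(r"^:[a-z_]+:$")         # e.g. shortcodes like :smile:
url_match_re = re.compile(r"^[\w.-]+\.(com|org)$")  # very rough domain check
tokenizer = Tokenizer(
    nlp.vocab,
    token_match=token_match_re.match,
    url_match=url_match_re.match,
)
```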
@ -681,6 +683,8 @@ cdef class Tokenizer:
infixes = infix_finditer(substring) infixes = infix_finditer(substring)
offset = 0 offset = 0
for match in infixes: for match in infixes:
if offset == 0 and match.start() == 0:
continue
if substring[offset : match.start()]: if substring[offset : match.start()]:
tokens.append(("TOKEN", substring[offset : match.start()])) tokens.append(("TOKEN", substring[offset : match.start()]))
if substring[match.start() : match.end()]: if substring[match.start() : match.end()]:

View File

@ -10,7 +10,7 @@ from ..lexeme import Lexeme
from ..vocab import Vocab from ..vocab import Vocab
from .underscore import Underscore from .underscore import Underscore
from pathlib import Path from pathlib import Path
import numpy import numpy as np
class DocMethod(Protocol): class DocMethod(Protocol):
def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc] def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc]
@ -26,7 +26,7 @@ class Doc:
user_hooks: Dict[str, Callable[..., Any]] user_hooks: Dict[str, Callable[..., Any]]
user_token_hooks: Dict[str, Callable[..., Any]] user_token_hooks: Dict[str, Callable[..., Any]]
user_span_hooks: Dict[str, Callable[..., Any]] user_span_hooks: Dict[str, Callable[..., Any]]
tensor: numpy.ndarray tensor: np.ndarray[Any, np.dtype[np.float_]]
user_data: Dict[str, Any] user_data: Dict[str, Any]
has_unknown_spaces: bool has_unknown_spaces: bool
_context: Any _context: Any
@ -144,7 +144,7 @@ class Doc:
) -> Doc: ... ) -> Doc: ...
def to_array( def to_array(
self, py_attr_ids: Union[int, str, List[Union[int, str]]] self, py_attr_ids: Union[int, str, List[Union[int, str]]]
) -> numpy.ndarray: ... ) -> np.ndarray[Any, np.dtype[np.float_]]: ...
@staticmethod @staticmethod
def from_docs( def from_docs(
docs: List[Doc], docs: List[Doc],

View File

@ -420,6 +420,8 @@ cdef class Doc:
cdef int range_start = 0 cdef int range_start = 0
if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]: if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]:
attr = SENT_START attr = SENT_START
elif attr == "IS_SENT_END" or attr == self.vocab.strings["IS_SENT_END"]:
attr = SENT_START
attr = intify_attr(attr) attr = intify_attr(attr)
# adjust attributes # adjust attributes
if attr == HEAD: if attr == HEAD:
@ -616,7 +618,7 @@ cdef class Doc:
""" """
if "has_vector" in self.user_hooks: if "has_vector" in self.user_hooks:
return self.user_hooks["has_vector"](self) return self.user_hooks["has_vector"](self)
elif self.vocab.vectors.data.size: elif self.vocab.vectors.size:
return True return True
elif self.tensor.size: elif self.tensor.size:
return True return True
@ -641,7 +643,7 @@ cdef class Doc:
if not len(self): if not len(self):
self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f") self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f")
return self._vector return self._vector
elif self.vocab.vectors.data.size > 0: elif self.vocab.vectors.size > 0:
self._vector = sum(t.vector for t in self) / len(self) self._vector = sum(t.vector for t in self) / len(self)
return self._vector return self._vector
elif self.tensor.size > 0: elif self.tensor.size > 0:
@ -1183,7 +1185,7 @@ cdef class Doc:
token_offset = -1 token_offset = -1
for doc in docs[:-1]: for doc in docs[:-1]:
token_offset += len(doc) token_offset += len(doc)
if not (len(doc) > 0 and doc[-1].is_space): if len(doc) > 0 and not doc[-1].is_space:
concat_spaces[token_offset] = True concat_spaces[token_offset] = True
concat_array = numpy.concatenate(arrays) concat_array = numpy.concatenate(arrays)

View File

@ -364,7 +364,9 @@ cdef class Span:
return 0.0 return 0.0
vector = self.vector vector = self.vector
xp = get_array_module(vector) xp = get_array_module(vector)
return xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item()
cpdef np.ndarray to_array(self, object py_attr_ids): cpdef np.ndarray to_array(self, object py_attr_ids):
"""Given a list of M attribute IDs, export the tokens to a numpy """Given a list of M attribute IDs, export the tokens to a numpy
@ -497,7 +499,7 @@ cdef class Span:
""" """
if "has_vector" in self.doc.user_span_hooks: if "has_vector" in self.doc.user_span_hooks:
return self.doc.user_span_hooks["has_vector"](self) return self.doc.user_span_hooks["has_vector"](self)
elif self.vocab.vectors.data.size > 0: elif self.vocab.vectors.size > 0:
return any(token.has_vector for token in self) return any(token.has_vector for token in self)
elif self.doc.tensor.size > 0: elif self.doc.tensor.size > 0:
return True return True

View File

@ -20,6 +20,7 @@ from .doc cimport set_children_from_heads
from .. import parts_of_speech from .. import parts_of_speech
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from ..attrs import IOB_STRINGS
from .underscore import Underscore, get_ext_args from .underscore import Underscore, get_ext_args
@ -209,7 +210,9 @@ cdef class Token:
return 0.0 return 0.0
vector = self.vector vector = self.vector
xp = get_array_module(vector) xp = get_array_module(vector)
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)) result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item()
def has_morph(self): def has_morph(self):
"""Check whether the token has annotated morph information. """Check whether the token has annotated morph information.
@ -484,8 +487,6 @@ cdef class Token:
RETURNS (bool / None): Whether the token starts a sentence. RETURNS (bool / None): Whether the token starts a sentence.
None if unknown. None if unknown.
DOCS: https://spacy.io/api/token#is_sent_start
""" """
def __get__(self): def __get__(self):
if self.c.sent_start == 0: if self.c.sent_start == 0:
@ -743,7 +744,7 @@ cdef class Token:
@classmethod @classmethod
def iob_strings(cls): def iob_strings(cls):
return ("", "I", "O", "B") return IOB_STRINGS
@property @property
def ent_iob_(self): def ent_iob_(self):

View File

@ -1,17 +1,31 @@
from typing import Dict, Any from typing import Dict, Any, List, Optional, Tuple, Union, TYPE_CHECKING
import functools import functools
import copy import copy
from ..errors import Errors from ..errors import Errors
if TYPE_CHECKING:
from .doc import Doc
from .span import Span
from .token import Token
class Underscore: class Underscore:
mutable_types = (dict, list, set) mutable_types = (dict, list, set)
doc_extensions: Dict[Any, Any] = {} doc_extensions: Dict[Any, Any] = {}
span_extensions: Dict[Any, Any] = {} span_extensions: Dict[Any, Any] = {}
token_extensions: Dict[Any, Any] = {} token_extensions: Dict[Any, Any] = {}
_extensions: Dict[str, Any]
_obj: Union["Doc", "Span", "Token"]
_start: Optional[int]
_end: Optional[int]
def __init__(self, extensions, obj, start=None, end=None): def __init__(
self,
extensions: Dict[str, Any],
obj: Union["Doc", "Span", "Token"],
start: Optional[int] = None,
end: Optional[int] = None,
):
object.__setattr__(self, "_extensions", extensions) object.__setattr__(self, "_extensions", extensions)
object.__setattr__(self, "_obj", obj) object.__setattr__(self, "_obj", obj)
# Assumption is that for doc values, _start and _end will both be None # Assumption is that for doc values, _start and _end will both be None
@ -23,12 +37,12 @@ class Underscore:
object.__setattr__(self, "_start", start) object.__setattr__(self, "_start", start)
object.__setattr__(self, "_end", end) object.__setattr__(self, "_end", end)
def __dir__(self): def __dir__(self) -> List[str]:
# Hack to enable autocomplete on custom extensions # Hack to enable autocomplete on custom extensions
extensions = list(self._extensions.keys()) extensions = list(self._extensions.keys())
return ["set", "get", "has"] + extensions return ["set", "get", "has"] + extensions
def __getattr__(self, name): def __getattr__(self, name: str) -> Any:
if name not in self._extensions: if name not in self._extensions:
raise AttributeError(Errors.E046.format(name=name)) raise AttributeError(Errors.E046.format(name=name))
default, method, getter, setter = self._extensions[name] default, method, getter, setter = self._extensions[name]
@ -56,7 +70,7 @@ class Underscore:
return new_default return new_default
return default return default
def __setattr__(self, name, value): def __setattr__(self, name: str, value: Any):
if name not in self._extensions: if name not in self._extensions:
raise AttributeError(Errors.E047.format(name=name)) raise AttributeError(Errors.E047.format(name=name))
default, method, getter, setter = self._extensions[name] default, method, getter, setter = self._extensions[name]
@ -65,28 +79,30 @@ class Underscore:
else: else:
self._doc.user_data[self._get_key(name)] = value self._doc.user_data[self._get_key(name)] = value
def set(self, name, value): def set(self, name: str, value: Any):
return self.__setattr__(name, value) return self.__setattr__(name, value)
def get(self, name): def get(self, name: str) -> Any:
return self.__getattr__(name) return self.__getattr__(name)
def has(self, name): def has(self, name: str) -> bool:
return name in self._extensions return name in self._extensions
def _get_key(self, name): def _get_key(self, name: str) -> Tuple[str, str, Optional[int], Optional[int]]:
return ("._.", name, self._start, self._end) return ("._.", name, self._start, self._end)
@classmethod @classmethod
def get_state(cls): def get_state(cls) -> Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]:
return cls.token_extensions, cls.span_extensions, cls.doc_extensions return cls.token_extensions, cls.span_extensions, cls.doc_extensions
@classmethod @classmethod
def load_state(cls, state): def load_state(
cls, state: Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]
) -> None:
cls.token_extensions, cls.span_extensions, cls.doc_extensions = state cls.token_extensions, cls.span_extensions, cls.doc_extensions = state
def get_ext_args(**kwargs): def get_ext_args(**kwargs: Any):
"""Validate and convert arguments. Reused in Doc, Token and Span.""" """Validate and convert arguments. Reused in Doc, Token and Span."""
default = kwargs.get("default") default = kwargs.get("default")
getter = kwargs.get("getter") getter = kwargs.get("getter")

View File

@ -164,7 +164,7 @@ def load_vectors_into_model(
len(vectors_nlp.vocab.vectors.keys()) == 0 len(vectors_nlp.vocab.vectors.keys()) == 0
and vectors_nlp.vocab.vectors.mode != VectorsMode.floret and vectors_nlp.vocab.vectors.mode != VectorsMode.floret
) or ( ) or (
vectors_nlp.vocab.vectors.data.shape[0] == 0 vectors_nlp.vocab.vectors.shape[0] == 0
and vectors_nlp.vocab.vectors.mode == VectorsMode.floret and vectors_nlp.vocab.vectors.mode == VectorsMode.floret
): ):
logger.warning(Warnings.W112.format(name=name)) logger.warning(Warnings.W112.format(name=name))

View File

@ -871,7 +871,6 @@ def get_package_path(name: str) -> Path:
name (str): Package name. name (str): Package name.
RETURNS (Path): Path to installed package. RETURNS (Path): Path to installed package.
""" """
name = name.lower() # use lowercase version to be safe
# Here we're importing the module just to find it. This is worryingly # Here we're importing the module just to find it. This is worryingly
# indirect, but it's otherwise very difficult to find the package. # indirect, but it's otherwise very difficult to find the package.
pkg = importlib.import_module(name) pkg = importlib.import_module(name)

View File

@ -1,5 +1,5 @@
cimport numpy as np cimport numpy as np
from libc.stdint cimport uint32_t from libc.stdint cimport uint32_t, uint64_t
from cython.operator cimport dereference as deref from cython.operator cimport dereference as deref
from libcpp.set cimport set as cppset from libcpp.set cimport set as cppset
from murmurhash.mrmr cimport hash128_x64 from murmurhash.mrmr cimport hash128_x64
@ -10,7 +10,7 @@ from typing import cast
import warnings import warnings
from enum import Enum from enum import Enum
import srsly import srsly
from thinc.api import get_array_module, get_current_ops from thinc.api import Ops, get_array_module, get_current_ops
from thinc.backends import get_array_ops from thinc.backends import get_array_ops
from thinc.types import Floats2d from thinc.types import Floats2d
@ -146,7 +146,7 @@ cdef class Vectors:
DOCS: https://spacy.io/api/vectors#size DOCS: https://spacy.io/api/vectors#size
""" """
return self.data.shape[0] * self.data.shape[1] return self.data.size
@property @property
def is_full(self): def is_full(self):
@ -274,7 +274,7 @@ cdef class Vectors:
self.data = resized_array self.data = resized_array
self._sync_unset() self._sync_unset()
removed_items = [] removed_items = []
for key, row in list(self.key2row.items()): for key, row in self.key2row.copy().items():
if row >= shape[0]: if row >= shape[0]:
self.key2row.pop(key) self.key2row.pop(key)
removed_items.append((key, row)) removed_items.append((key, row))
@ -353,12 +353,18 @@ cdef class Vectors:
key (str): The string key. key (str): The string key.
RETURNS: A list of the integer hashes. RETURNS: A list of the integer hashes.
""" """
cdef uint32_t[4] out # MurmurHash3_x64_128 returns an array of 2 uint64_t values.
cdef uint64_t[2] out
chars = s.encode("utf8") chars = s.encode("utf8")
cdef char* utf8_string = chars cdef char* utf8_string = chars
hash128_x64(utf8_string, len(chars), self.hash_seed, &out) hash128_x64(utf8_string, len(chars), self.hash_seed, &out)
rows = [out[i] for i in range(min(self.hash_count, 4))] rows = [
return rows out[0] & 0xffffffffu,
out[0] >> 32,
out[1] & 0xffffffffu,
out[1] >> 32,
]
return rows[:min(self.hash_count, 4)]
def _get_ngrams(self, unicode key): def _get_ngrams(self, unicode key):
"""Get all padded ngram strings using the ngram settings. """Get all padded ngram strings using the ngram settings.
@ -511,6 +517,9 @@ cdef class Vectors:
for i in range(len(queries)) ], dtype="uint64") for i in range(len(queries)) ], dtype="uint64")
return (keys, best_rows, scores) return (keys, best_rows, scores)
def to_ops(self, ops: Ops):
self.data = ops.asarray(self.data)
def _get_cfg(self): def _get_cfg(self):
if self.mode == Mode.default: if self.mode == Mode.default:
return { return {

View File

@ -283,7 +283,7 @@ cdef class Vocab:
@property @property
def vectors_length(self): def vectors_length(self):
return self.vectors.data.shape[1] return self.vectors.shape[1]
def reset_vectors(self, *, width=None, shape=None): def reset_vectors(self, *, width=None, shape=None):
"""Drop the current vector table. Because all vectors must be the same """Drop the current vector table. Because all vectors must be the same
@ -294,7 +294,7 @@ cdef class Vocab:
elif shape is not None: elif shape is not None:
self.vectors = Vectors(strings=self.strings, shape=shape) self.vectors = Vectors(strings=self.strings, shape=shape)
else: else:
width = width if width is not None else self.vectors.data.shape[1] width = width if width is not None else self.vectors.shape[1]
self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width)) self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width))
def prune_vectors(self, nr_row, batch_size=1024): def prune_vectors(self, nr_row, batch_size=1024):

View File

@ -79,6 +79,7 @@ train/test skew.
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ | | `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | | `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
| `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ | | `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ |
| `shuffle` | Whether to shuffle the examples. Defaults to `False`. ~~bool~~ |
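A minimal usage sketch for the new `shuffle` setting, assuming a serialized training file at `./train.spacy` (illustrative, not part of the commit):

```python
import spacy
from spacy.training import Corpus

nlp = spacy.blank("en")
# hypothetical path; shuffle=True reorders the examples on each pass
corpus = Corpus("./train.spacy", shuffle=True)
examples = list(corpus(nlp))
```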
## Corpus.\_\_call\_\_ {#call tag="method"} ## Corpus.\_\_call\_\_ {#call tag="method"}

View File

@ -304,7 +304,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
## Doc.has_annotation {#has_annotation tag="method"} ## Doc.has_annotation {#has_annotation tag="method"}
Check whether the doc contains annotation on a token attribute. Check whether the doc contains annotation on a [`Token` attribute](/api/token#attributes).
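For context, a small hedged example of the attribute-based check, assuming a blank pipeline so no parser has assigned dependency labels:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("This is a sentence.")
# only the tokenizer has run, so no dependency labels are present
assert not doc.has_annotation("DEP")
```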
<Infobox title="Changed in v3.0" variant="warning"> <Infobox title="Changed in v3.0" variant="warning">

View File

@ -44,6 +44,7 @@ rule-based matching are:
| `SPACY` | Token has a trailing space. ~~bool~~ | | `SPACY` | Token has a trailing space. ~~bool~~ |
|  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ | |  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
| `ENT_TYPE` | The token's entity label. ~~str~~ | | `ENT_TYPE` | The token's entity label. ~~str~~ |
| `ENT_IOB` | The IOB part of the token's entity tag. ~~str~~ |
| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ | | `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ |
| `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ | | `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ |
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ | | `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
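A hedged sketch of matching on the newly documented `ENT_IOB` attribute, assuming a pretrained pipeline such as `en_core_web_sm` is installed:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")  # assumed to be installed
matcher = Matcher(nlp.vocab)
# match tokens that begin an entity span via the IOB part of the entity tag
matcher.add("ENT_START", [[{"ENT_IOB": "B"}]])
doc = nlp("Apple is looking at buying a U.K. startup.")
matches = matcher(doc)
```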

View File

@ -349,23 +349,6 @@ A sequence containing the token and all the token's syntactic descendants.
| ---------- | ------------------------------------------------------------------------------------ | | ---------- | ------------------------------------------------------------------------------------ |
| **YIELDS** | A descendant token such that `self.is_ancestor(token)` or `token == self`. ~~Token~~ | | **YIELDS** | A descendant token such that `self.is_ancestor(token)` or `token == self`. ~~Token~~ |
## Token.is_sent_start {#is_sent_start tag="property" new="2"}
A boolean value indicating whether the token starts a sentence. `None` if
unknown. Defaults to `True` for the first token in the `Doc`.
> #### Example
>
> ```python
> doc = nlp("Give it back! He pleaded.")
> assert doc[4].is_sent_start
> assert not doc[5].is_sent_start
> ```
| Name | Description |
| ----------- | ------------------------------------------------------- |
| **RETURNS** | Whether the token starts a sentence. ~~Optional[bool]~~ |
## Token.has_vector {#has_vector tag="property" model="vectors"} ## Token.has_vector {#has_vector tag="property" model="vectors"}
A boolean value indicating whether a word vector is associated with the token. A boolean value indicating whether a word vector is associated with the token.
@ -465,6 +448,8 @@ The L2 norm of the token's vector representation.
| `is_punct` | Is the token punctuation? ~~bool~~ | | `is_punct` | Is the token punctuation? ~~bool~~ |
| `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~ | | `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~ |
| `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ | | `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ |
| `is_sent_start` | Does the token start a sentence? `None` if unknown. Defaults to `True` for the first token in the `Doc`. ~~Optional[bool]~~ |
| `is_sent_end` | Does the token end a sentence? `None` if unknown. ~~Optional[bool]~~ |
| `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ | | `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ |
| `is_bracket` | Is the token a bracket? ~~bool~~ | | `is_bracket` | Is the token a bracket? ~~bool~~ |
| `is_quote` | Is the token a quotation mark? ~~bool~~ | | `is_quote` | Is the token a quotation mark? ~~bool~~ |
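A short sketch of the two sentence-boundary attributes listed above, assuming a `sentencizer` provides the boundaries (illustrative only):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
doc = nlp("Give it back! He pleaded.")
assert doc[4].is_sent_start  # "He" starts the second sentence
assert doc[3].is_sent_end    # "!" ends the first sentence
```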

View File

@ -371,6 +371,23 @@ Get the vectors for the provided keys efficiently as a batch.
| ------ | --------------------------------------- | | ------ | --------------------------------------- |
| `keys` | The keys. ~~Iterable[Union[int, str]]~~ | | `keys` | The keys. ~~Iterable[Union[int, str]]~~ |
## Vectors.to_ops {#to_ops tag="method"}
Change the embedding matrix to use different Thinc ops.
> #### Example
>
> ```python
> from thinc.api import NumpyOps
>
> vectors.to_ops(NumpyOps())
>
> ```
| Name | Description |
| ----- | --------------------------------------------------------- |
| `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~ |
## Vectors.to_disk {#to_disk tag="method"} ## Vectors.to_disk {#to_disk tag="method"}
Save the current state to a directory. Save the current state to a directory.

View File

@ -831,6 +831,8 @@ def tokenizer_pseudo_code(
infixes = infix_finditer(substring) infixes = infix_finditer(substring)
offset = 0 offset = 0
for match in infixes: for match in infixes:
if offset == 0 and match.start() == 0:
continue
tokens.append(substring[offset : match.start()]) tokens.append(substring[offset : match.start()])
tokens.append(substring[match.start() : match.end()]) tokens.append(substring[match.start() : match.end()])
offset = match.end() offset = match.end()
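To illustrate the effect of the early `continue` (a sketch based on the tokenizer test added in this commit): an infix match at the start of the string is no longer split off on its own.

```python
import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex, compile_suffix_regex

nlp = spacy.blank("en")
infix_re = compile_infix_regex(["±"])
suffix_re = compile_suffix_regex(["%"])
tokenizer = Tokenizer(
    nlp.vocab,
    infix_finditer=infix_re.finditer,
    suffix_search=suffix_re.search,
)
# the string-initial "±" stays attached instead of being split as an infix
assert [t.text for t in tokenizer("±10%")] == ["±10", "%"]
```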

View File

@ -213,6 +213,12 @@ format, train a pipeline, evaluate it and export metrics, package it and spin up
a quick web demo. It looks pretty similar to a config file used to define CI a quick web demo. It looks pretty similar to a config file used to define CI
pipelines. pipelines.
> #### Tip: Multi-line YAML syntax for long values
>
> YAML has [multi-line syntax](https://yaml-multiline.info/) that can be
> helpful for readability with longer values such as project descriptions or
> commands that take several arguments.
```yaml ```yaml
%%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml %%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml
``` ```

View File

@ -141,7 +141,8 @@
"website": "https://www.nr.no/~plison" "website": "https://www.nr.no/~plison"
}, },
"category": ["pipeline", "standalone", "research", "training"], "category": ["pipeline", "standalone", "research", "training"],
"tags": [] "tags": [],
"spacy_version": 3
}, },
{ {
"id": "numerizer", "id": "numerizer",
@ -952,6 +953,37 @@
"category": ["pipeline"], "category": ["pipeline"],
"tags": ["lemmatizer", "danish"] "tags": ["lemmatizer", "danish"]
}, },
{
"id": "augmenty",
"title": "Augmenty",
"slogan": "The cherry on top of your NLP pipeline",
"description": "Augmenty is an augmentation library based on spaCy for augmenting texts. Augmenty differs from other augmentation libraries in that it corrects (as far as possible) the token, sentence and document labels under the augmentation.",
"github": "kennethenevoldsen/augmenty",
"pip": "augmenty",
"code_example": [
"import spacy",
"import augmenty",
"",
"nlp = spacy.load('en_core_web_md')",
"",
"docs = nlp.pipe(['Augmenty is a great tool for text augmentation'])",
"",
"ent_dict = {'ORG': [['spaCy'], ['spaCy', 'Universe']]}",
"entity_augmenter = augmenty.load('ents_replace.v1',",
" ent_dict = ent_dict, level=1)",
"",
"for doc in augmenty.docs(docs, augmenter=entity_augmenter, nlp=nlp):",
" print(doc)"
],
"thumb": "https://github.com/KennethEnevoldsen/augmenty/blob/master/img/icon.png?raw=true",
"author": "Kenneth Enevoldsen",
"author_links": {
"github": "kennethenevoldsen",
"website": "https://www.kennethenevoldsen.com"
},
"category": ["training", "research"],
"tags": ["training", "research", "augmentation"]
},
{ {
"id": "dacy", "id": "dacy",
"title": "DaCy", "title": "DaCy",

View File

@ -8,10 +8,11 @@ import Title from '../components/title'
import Grid from '../components/grid' import Grid from '../components/grid'
import Button from '../components/button' import Button from '../components/button'
import Icon from '../components/icon' import Icon from '../components/icon'
import Tag from '../components/tag'
import CodeBlock, { InlineCode } from '../components/code' import CodeBlock, { InlineCode } from '../components/code'
import Aside from '../components/aside' import Aside from '../components/aside'
import Sidebar from '../components/sidebar' import Sidebar from '../components/sidebar'
import Section from '../components/section' import Section, { Hr } from '../components/section'
import Main from '../components/main' import Main from '../components/main'
import Footer from '../components/footer' import Footer from '../components/footer'
import { H3, H5, Label, InlineList } from '../components/typography' import { H3, H5, Label, InlineList } from '../components/typography'
@ -121,6 +122,18 @@ const UniverseContent = ({ content = [], categories, theme, pageContext, mdxComp
</Grid> </Grid>
</Section> </Section>
)} )}
<section className="search-exclude">
<H3>Found a mistake or something isn't working?</H3>
<p>
If you've come across a universe project that isn't working or is
incompatible with the reported spaCy version, let us know by{' '}
<Link to="https://github.com/explosion/spaCy/discussions/new">
opening a discussion thread
</Link>
.
</p>
</section>
<Hr />
<section className="search-exclude"> <section className="search-exclude">
<H3>Submit your project</H3> <H3>Submit your project</H3>
<p> <p>
@ -168,11 +181,22 @@ UniverseContent.propTypes = {
mdxComponents: PropTypes.object, mdxComponents: PropTypes.object,
} }
const SpaCyVersion = ({ version }) => {
const versions = !Array.isArray(version) ? [version] : version
return versions.map((v, i) => (
<>
<Tag tooltip={`This project is compatible with spaCy v${v}`}>spaCy v{v}</Tag>{' '}
</>
))
}
const Project = ({ data, components }) => ( const Project = ({ data, components }) => (
<> <>
<Title title={data.title || data.id} teaser={data.slogan} image={data.thumb}> <Title title={data.title || data.id} teaser={data.slogan} image={data.thumb}>
{data.github && ( {(data.github || data.spacy_version) && (
<p> <p>
{data.spacy_version && <SpaCyVersion version={data.spacy_version} />}
{data.github && (
<Link to={`https://github.com/${data.github}`} hidden> <Link to={`https://github.com/${data.github}`} hidden>
{[ {[
`release/${data.github}/all.svg?style=flat-square`, `release/${data.github}/all.svg?style=flat-square`,
@ -180,13 +204,18 @@ const Project = ({ data, components }) => (
`stars/${data.github}.svg?style=social&label=Stars`, `stars/${data.github}.svg?style=social&label=Stars`,
].map((url, i) => ( ].map((url, i) => (
<img <img
style={{ borderRadius: '1em', marginRight: '0.5rem' }} style={{
borderRadius: '1em',
marginRight: '0.5rem',
verticalAlign: 'middle',
}}
key={i} key={i}
src={`https://img.shields.io/github/${url}`} src={`https://img.shields.io/github/${url}`}
alt="" alt=""
/> />
))} ))}
</Link> </Link>
)}
</p> </p>
)} )}
</Title> </Title>
@ -335,6 +364,7 @@ const query = graphql`
url url
github github
description description
spacy_version
pip pip
cran cran
category category