Merge branch 'master' into spacy.io

Adriane Boyd 2022-02-11 11:26:08 +01:00
commit d43082289c
64 changed files with 877 additions and 202 deletions

.github/workflows/gputests.yml (new file)

@ -0,0 +1,21 @@
name: Weekly GPU tests
on:
schedule:
- cron: '0 1 * * MON'
jobs:
weekly-gputests:
strategy:
fail-fast: false
matrix:
branch: [master, develop, v4]
runs-on: ubuntu-latest
steps:
- name: Trigger buildkite build
uses: buildkite/trigger-pipeline-action@v1.2.0
env:
PIPELINE: explosion-ai/spacy-slow-gpu-tests
BRANCH: ${{ matrix.branch }}
MESSAGE: ":github: Weekly GPU + slow tests - triggered from a GitHub Action"
BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }}

.github/workflows/slowtests.yml (new file)

@ -0,0 +1,35 @@
name: Daily slow tests
on:
schedule:
- cron: '0 0 * * *'
jobs:
daily-slowtests:
strategy:
fail-fast: false
matrix:
branch: [master, develop, v4]
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v1
- name: Get commits from past 24 hours
id: check_commits
run: |
today=$(date '+%Y-%m-%d %H:%M:%S')
yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
if git log --after="$yesterday" --before="$today" | grep commit ; then
echo "::set-output name=run_tests::true"
else
echo "::set-output name=run_tests::false"
fi
- name: Trigger buildkite build
if: steps.check_commits.outputs.run_tests == 'true'
uses: buildkite/trigger-pipeline-action@v1.2.0
env:
PIPELINE: explosion-ai/spacy-slow-tests
BRANCH: ${{ matrix.branch }}
MESSAGE: ":github: Daily slow tests - triggered from a GitHub Action"
BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
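
As a rough illustration of the commit check above, here is the same 24-hour test as a standalone Python sketch (the helper name and the use of `subprocess` are my own; the workflow itself runs the shell step shown):

```python
import subprocess
from datetime import datetime, timedelta

def has_recent_commits(hours: int = 24) -> bool:
    # Mirror the workflow's `git log --after=... --before=...` check:
    # any commit in the window makes the slow tests run.
    after = (datetime.now() - timedelta(hours=hours)).strftime("%Y-%m-%d %H:%M:%S")
    log = subprocess.run(
        ["git", "log", f"--after={after}"],
        capture_output=True, text=True, check=True,
    ).stdout
    return "commit" in log

if __name__ == "__main__":
    # The workflow exports this as the `run_tests` step output.
    print(f"::set-output name=run_tests::{str(has_recent_commits()).lower()}")
```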


@ -1,6 +1,6 @@
The MIT License (MIT)
Copyright (C) 2016-2021 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
Copyright (C) 2016-2022 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal


@ -1,11 +1,8 @@
recursive-include include *.h
recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml
include LICENSE
include README.md
include pyproject.toml
include spacy/py.typed
recursive-exclude spacy/lang *.json
recursive-include spacy/lang *.json.gz
recursive-include spacy/cli *.json *.yml
recursive-include spacy/cli *.yml
recursive-include licenses *
recursive-exclude spacy *.cpp


@ -31,7 +31,8 @@ pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.8.0,<3.10.0
hypothesis>=3.27.0,<7.0.0
mypy>=0.910
mypy==0.910
types-dataclasses>=0.1.3; python_version < "3.7"
types-mock>=0.1.1
types-requests
black>=22.0,<23.0


@ -77,37 +77,39 @@ transformers =
ray =
spacy_ray>=0.1.0,<1.0.0
cuda =
cupy>=5.0.0b4,<10.0.0
cupy>=5.0.0b4,<11.0.0
cuda80 =
cupy-cuda80>=5.0.0b4,<10.0.0
cupy-cuda80>=5.0.0b4,<11.0.0
cuda90 =
cupy-cuda90>=5.0.0b4,<10.0.0
cupy-cuda90>=5.0.0b4,<11.0.0
cuda91 =
cupy-cuda91>=5.0.0b4,<10.0.0
cupy-cuda91>=5.0.0b4,<11.0.0
cuda92 =
cupy-cuda92>=5.0.0b4,<10.0.0
cupy-cuda92>=5.0.0b4,<11.0.0
cuda100 =
cupy-cuda100>=5.0.0b4,<10.0.0
cupy-cuda100>=5.0.0b4,<11.0.0
cuda101 =
cupy-cuda101>=5.0.0b4,<10.0.0
cupy-cuda101>=5.0.0b4,<11.0.0
cuda102 =
cupy-cuda102>=5.0.0b4,<10.0.0
cupy-cuda102>=5.0.0b4,<11.0.0
cuda110 =
cupy-cuda110>=5.0.0b4,<10.0.0
cupy-cuda110>=5.0.0b4,<11.0.0
cuda111 =
cupy-cuda111>=5.0.0b4,<10.0.0
cupy-cuda111>=5.0.0b4,<11.0.0
cuda112 =
cupy-cuda112>=5.0.0b4,<10.0.0
cupy-cuda112>=5.0.0b4,<11.0.0
cuda113 =
cupy-cuda113>=5.0.0b4,<10.0.0
cupy-cuda113>=5.0.0b4,<11.0.0
cuda114 =
cupy-cuda114>=5.0.0b4,<10.0.0
cupy-cuda114>=5.0.0b4,<11.0.0
cuda115 =
cupy-cuda115>=5.0.0b4,<11.0.0
apple =
thinc-apple-ops>=0.0.4,<1.0.0
# Language tokenizers with external dependencies
ja =
sudachipy>=0.4.9
sudachidict_core>=20200330
sudachipy>=0.5.2,!=0.6.1
sudachidict_core>=20211220
ko =
natto-py==0.9.0
th =


@ -1,3 +1,6 @@
from .errors import Errors
IOB_STRINGS = ("", "I", "O", "B")
IDS = {
"": NULL_ATTR,
@ -64,7 +67,6 @@ IDS = {
"FLAG61": FLAG61,
"FLAG62": FLAG62,
"FLAG63": FLAG63,
"ID": ID,
"ORTH": ORTH,
"LOWER": LOWER,
@ -72,7 +74,6 @@ IDS = {
"SHAPE": SHAPE,
"PREFIX": PREFIX,
"SUFFIX": SUFFIX,
"LENGTH": LENGTH,
"LEMMA": LEMMA,
"POS": POS,
@ -87,7 +88,7 @@ IDS = {
"SPACY": SPACY,
"LANG": LANG,
"MORPH": MORPH,
"IDX": IDX
"IDX": IDX,
}
@ -109,28 +110,66 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
"""
inty_attrs = {}
if _do_deprecated:
if 'F' in stringy_attrs:
if "F" in stringy_attrs:
stringy_attrs["ORTH"] = stringy_attrs.pop("F")
if 'L' in stringy_attrs:
if "L" in stringy_attrs:
stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
if 'pos' in stringy_attrs:
if "pos" in stringy_attrs:
stringy_attrs["TAG"] = stringy_attrs.pop("pos")
if 'morph' in stringy_attrs:
morphs = stringy_attrs.pop('morph')
if 'number' in stringy_attrs:
stringy_attrs.pop('number')
if 'tenspect' in stringy_attrs:
stringy_attrs.pop('tenspect')
if "morph" in stringy_attrs:
morphs = stringy_attrs.pop("morph")
if "number" in stringy_attrs:
stringy_attrs.pop("number")
if "tenspect" in stringy_attrs:
stringy_attrs.pop("tenspect")
morph_keys = [
'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr',
'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm',
'NumValue', 'PartType', 'Polite', 'StyleVariant',
'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
'Polarity', 'PrepCase', 'Animacy' # U20
"PunctType",
"PunctSide",
"Other",
"Degree",
"AdvType",
"Number",
"VerbForm",
"PronType",
"Aspect",
"Tense",
"PartType",
"Poss",
"Hyph",
"ConjType",
"NumType",
"Foreign",
"VerbType",
"NounType",
"Gender",
"Mood",
"Negative",
"Tense",
"Voice",
"Abbr",
"Derivation",
"Echo",
"Foreign",
"NameType",
"NounType",
"NumForm",
"NumValue",
"PartType",
"Polite",
"StyleVariant",
"PronType",
"AdjType",
"Person",
"Variant",
"AdpType",
"Reflex",
"Negative",
"Mood",
"Aspect",
"Case",
"Polarity",
"PrepCase",
"Animacy", # U20
]
for key in morph_keys:
if key in stringy_attrs:
@ -142,8 +181,13 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
for name, value in stringy_attrs.items():
int_key = intify_attr(name)
if int_key is not None:
if int_key == ENT_IOB:
if value in IOB_STRINGS:
value = IOB_STRINGS.index(value)
elif isinstance(value, str):
raise ValueError(Errors.E1025.format(value=value))
if strings_map is not None and isinstance(value, str):
if hasattr(strings_map, 'add'):
if hasattr(strings_map, "add"):
value = strings_map.add(value)
else:
value = strings_map[value]
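
For reference, the new `ENT_IOB` handling converts IOB strings to their integer codes and rejects anything else with the new E1025 error; a minimal usage sketch mirroring the tests added further down:

```python
from spacy.attrs import ENT_IOB, intify_attrs

# IOB strings map to their index in IOB_STRINGS = ("", "I", "O", "B")
assert intify_attrs({"ENT_IOB": ""}) == {ENT_IOB: 0}
assert intify_attrs({"ENT_IOB": "I"}) == {ENT_IOB: 1}
assert intify_attrs({"ENT_IOB": "O"}) == {ENT_IOB: 2}
assert intify_attrs({"ENT_IOB": "B"}) == {ENT_IOB: 3}

# Any other string for ENT_IOB raises E1025
try:
    intify_attrs({"ENT_IOB": "XX"})
except ValueError as err:
    print(err)
```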


@ -14,7 +14,7 @@ from ..training.initialize import get_sourced_components
from ..schemas import ConfigSchemaTraining
from ..pipeline._parser_internals import nonproj
from ..pipeline._parser_internals.nonproj import DELIMITER
from ..pipeline import Morphologizer
from ..pipeline import Morphologizer, SpanCategorizer
from ..morphology import Morphology
from ..language import Language
from ..util import registry, resolve_dot_names
@ -193,6 +193,70 @@ def debug_data(
else:
msg.info("No word vectors present in the package")
if "spancat" in factory_names:
model_labels_spancat = _get_labels_from_spancat(nlp)
has_low_data_warning = False
has_no_neg_warning = False
msg.divider("Span Categorization")
msg.table(model_labels_spancat, header=["Spans Key", "Labels"], divider=True)
msg.text("Label counts in train data: ", show=verbose)
for spans_key, data_labels in gold_train_data["spancat"].items():
msg.text(
f"Key: {spans_key}, {_format_labels(data_labels.items(), counts=True)}",
show=verbose,
)
# Data checks: only take the spans keys in the actual spancat components
data_labels_in_component = {
spans_key: gold_train_data["spancat"][spans_key]
for spans_key in model_labels_spancat.keys()
}
for spans_key, data_labels in data_labels_in_component.items():
for label, count in data_labels.items():
# Check for missing labels
spans_key_in_model = spans_key in model_labels_spancat.keys()
if (spans_key_in_model) and (
label not in model_labels_spancat[spans_key]
):
msg.warn(
f"Label '{label}' is not present in the model labels of key '{spans_key}'. "
"Performance may degrade after training."
)
# Check for low number of examples per label
if count <= NEW_LABEL_THRESHOLD:
msg.warn(
f"Low number of examples for label '{label}' in key '{spans_key}' ({count})"
)
has_low_data_warning = True
# Check for negative examples
with msg.loading("Analyzing label distribution..."):
neg_docs = _get_examples_without_label(
train_dataset, label, "spancat", spans_key
)
if neg_docs == 0:
msg.warn(f"No examples for texts WITHOUT new label '{label}'")
has_no_neg_warning = True
if has_low_data_warning:
msg.text(
f"To train a new span type, your data should include at "
f"least {NEW_LABEL_THRESHOLD} instances of the new label",
show=verbose,
)
else:
msg.good("Good amount of examples for all labels")
if has_no_neg_warning:
msg.text(
"Training data should always include examples of spans "
"in context, as well as examples without a given span "
"type.",
show=verbose,
)
else:
msg.good("Examples without ocurrences available for all labels")
if "ner" in factory_names:
# Get all unique NER labels present in the data
labels = set(
@ -203,6 +267,7 @@ def debug_data(
has_low_data_warning = False
has_no_neg_warning = False
has_ws_ents_error = False
has_boundary_cross_ents_warning = False
msg.divider("Named Entity Recognition")
msg.info(f"{len(model_labels)} label(s)")
@ -237,17 +302,25 @@ def debug_data(
has_low_data_warning = True
with msg.loading("Analyzing label distribution..."):
neg_docs = _get_examples_without_label(train_dataset, label)
neg_docs = _get_examples_without_label(train_dataset, label, "ner")
if neg_docs == 0:
msg.warn(f"No examples for texts WITHOUT new label '{label}'")
has_no_neg_warning = True
if gold_train_data["boundary_cross_ents"]:
msg.warn(
f"{gold_train_data['boundary_cross_ents']} entity span(s) crossing sentence boundaries"
)
has_boundary_cross_ents_warning = True
if not has_low_data_warning:
msg.good("Good amount of examples for all labels")
if not has_no_neg_warning:
msg.good("Examples without occurrences available for all labels")
if not has_ws_ents_error:
msg.good("No entities consisting of or starting/ending with whitespace")
if not has_boundary_cross_ents_warning:
msg.good("No entities crossing sentence boundaries")
if has_low_data_warning:
msg.text(
@ -564,7 +637,9 @@ def _compile_gold(
"deps": Counter(),
"words": Counter(),
"roots": Counter(),
"spancat": dict(),
"ws_ents": 0,
"boundary_cross_ents": 0,
"n_words": 0,
"n_misaligned_words": 0,
"words_missing_vectors": Counter(),
@ -593,6 +668,7 @@ def _compile_gold(
if nlp.vocab.strings[word] not in nlp.vocab.vectors:
data["words_missing_vectors"].update([word])
if "ner" in factory_names:
sent_starts = eg.get_aligned_sent_starts()
for i, label in enumerate(eg.get_aligned_ner()):
if label is None:
continue
@ -602,8 +678,19 @@ def _compile_gold(
if label.startswith(("B-", "U-")):
combined_label = label.split("-")[1]
data["ner"][combined_label] += 1
if sent_starts[i] == True and label.startswith(("I-", "L-")):
data["boundary_cross_ents"] += 1
elif label == "-":
data["ner"]["-"] += 1
if "spancat" in factory_names:
for span_key in list(eg.reference.spans.keys()):
if span_key not in data["spancat"]:
data["spancat"][span_key] = Counter()
for i, span in enumerate(eg.reference.spans[span_key]):
if span.label_ is None:
continue
else:
data["spancat"][span_key][span.label_] += 1
if "textcat" in factory_names or "textcat_multilabel" in factory_names:
data["cats"].update(gold.cats)
if any(val not in (0, 1) for val in gold.cats.values()):
@ -674,21 +761,57 @@ def _format_labels(
return ", ".join([f"'{l}'" for l in cast(Iterable[str], labels)])
def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
def _get_examples_without_label(
data: Sequence[Example],
label: str,
component: Literal["ner", "spancat"] = "ner",
spans_key: Optional[str] = "sc",
) -> int:
count = 0
for eg in data:
if component == "ner":
labels = [
label.split("-")[1]
for label in eg.get_aligned_ner()
if label not in ("O", "-", None)
]
if component == "spancat":
labels = (
[span.label_ for span in eg.reference.spans[spans_key]]
if spans_key in eg.reference.spans
else []
)
if label not in labels:
count += 1
return count
def _get_labels_from_model(nlp: Language, pipe_name: str) -> Set[str]:
if pipe_name not in nlp.pipe_names:
return set()
def _get_labels_from_model(nlp: Language, factory_name: str) -> Set[str]:
pipe_names = [
pipe_name
for pipe_name in nlp.pipe_names
if nlp.get_pipe_meta(pipe_name).factory == factory_name
]
labels: Set[str] = set()
for pipe_name in pipe_names:
pipe = nlp.get_pipe(pipe_name)
return set(pipe.labels)
labels.update(pipe.labels)
return labels
def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
pipe_names = [
pipe_name
for pipe_name in nlp.pipe_names
if nlp.get_pipe_meta(pipe_name).factory == "spancat"
]
labels: Dict[str, Set[str]] = {}
for pipe_name in pipe_names:
pipe = nlp.get_pipe(pipe_name)
assert isinstance(pipe, SpanCategorizer)
if pipe.key not in labels:
labels[pipe.key] = set()
labels[pipe.key].update(pipe.labels)
return labels
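
A short sketch of the new factory-based label lookup, based on the test added in `test_cli.py` below (the pipe name `my_spancat` is just an example):

```python
from spacy.cli.debug_data import _get_labels_from_spancat
from spacy.lang.en import English

nlp = English()
spancat = nlp.add_pipe("spancat", name="my_spancat")
for label in ("A", "B"):
    spancat.add_label(label)
nlp.initialize()

# Labels are looked up by factory name (so renamed components are found)
# and grouped by each component's spans key (default "sc").
assert _get_labels_from_spancat(nlp) == {spancat.key: {"A", "B"}}
```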


@ -7,6 +7,7 @@ from collections import defaultdict
from catalogue import RegistryError
import srsly
import sys
import re
from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
from ..schemas import validate, ModelMetaSchema
@ -109,6 +110,24 @@ def package(
", ".join(meta["requirements"]),
)
if name is not None:
if not name.isidentifier():
msg.fail(
f"Model name ('{name}') is not a valid module name. "
"This is required so it can be imported as a module.",
"We recommend names that use ASCII A-Z, a-z, _ (underscore), "
"and 0-9. "
"For specific details see: https://docs.python.org/3/reference/lexical_analysis.html#identifiers",
exits=1,
)
if not _is_permitted_package_name(name):
msg.fail(
f"Model name ('{name}') is not a permitted package name. "
"This is required to correctly load the model with spacy.load.",
"We recommend names that use ASCII A-Z, a-z, _ (underscore), "
"and 0-9. "
"For specific details see: https://www.python.org/dev/peps/pep-0426/#name",
exits=1,
)
meta["name"] = name
if version is not None:
meta["version"] = version
@ -162,7 +181,7 @@ def package(
imports="\n".join(f"from . import {m}" for m in imports)
)
create_file(package_path / "__init__.py", init_py)
msg.good(f"Successfully created package '{model_name_v}'", main_path)
msg.good(f"Successfully created package directory '{model_name_v}'", main_path)
if create_sdist:
with util.working_dir(main_path):
util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
@ -171,8 +190,14 @@ def package(
if create_wheel:
with util.working_dir(main_path):
util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False)
wheel = main_path / "dist" / f"{model_name_v}{WHEEL_SUFFIX}"
wheel_name_squashed = re.sub("_+", "_", model_name_v)
wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}"
msg.good(f"Successfully created binary wheel", wheel)
if "__" in model_name:
msg.warn(
f"Model name ('{model_name}') contains a run of underscores. "
"Runs of underscores are not significant in installed package names.",
)
def has_wheel() -> bool:
@ -422,6 +447,14 @@ def _format_label_scheme(data: Dict[str, Any]) -> str:
return md.text
def _is_permitted_package_name(package_name: str) -> bool:
# regex from: https://www.python.org/dev/peps/pep-0426/#name
permitted_match = re.search(
r"^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$", package_name, re.IGNORECASE
)
return permitted_match is not None
TEMPLATE_SETUP = """
#!/usr/bin/env python
import io
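
A few illustrative cases for the new name check (the first name is only an example; the rejected ones are taken from the tests added below):

```python
from spacy.cli.package import _is_permitted_package_name

assert _is_permitted_package_name("en_core_web_sm")   # plain ASCII identifier-style name
assert not _is_permitted_package_name("Meine_Bäume")  # non-ASCII characters
assert not _is_permitted_package_name("_package")     # leading separator
assert not _is_permitted_package_name("package-")     # trailing separator
```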


@ -1,6 +1,7 @@
from typing import Any, Dict, Optional
from pathlib import Path
from wasabi import msg
import os
import re
import shutil
import requests
@ -129,10 +130,17 @@ def fetch_asset(
the asset failed.
"""
dest_path = (project_path / dest).resolve()
if dest_path.exists() and checksum:
if dest_path.exists():
# If there's already a file, check for checksum
if checksum:
if checksum == get_checksum(dest_path):
msg.good(f"Skipping download with matching checksum: {dest}")
return
else:
# If there's not a checksum, make sure the file is a possibly valid size
if os.path.getsize(dest_path) == 0:
msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}")
os.remove(dest_path)
# We might as well support the user here and create parent directories in
# case the asset dir isn't listed as a dir to create in the project.yml
if not dest_path.parent.exists():


@ -6,6 +6,11 @@ can help generate the best possible configuration, given a user's requirements.
[paths]
train = null
dev = null
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = null
{% else -%}
vectors = "{{ word_vectors }}"
{% endif -%}
[system]
{% if use_transformer -%}
@ -421,8 +426,4 @@ compound = 1.001
{% endif %}
[initialize]
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = ${paths.vectors}
{% else -%}
vectors = "{{ word_vectors }}"
{% endif -%}


@ -68,12 +68,14 @@ seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
# Controls early-stopping. 0 disables early stopping.
# Controls early-stopping, i.e., the number of steps to continue without
# improvement before stopping. 0 disables early stopping.
patience = 1600
# Number of epochs. 0 means unlimited. If >= 0, train corpus is loaded once in
# memory and shuffled within the training loop. -1 means stream train corpus
# rather than loading in memory with no shuffling within the training loop.
max_epochs = 0
# Maximum number of update steps to train for. 0 means an unlimited number of steps.
max_steps = 20000
eval_frequency = 200
# Control how scores are printed and checkpoints are evaluated.


@ -18,7 +18,7 @@ DEFAULT_LABEL_COLORS = {
"LOC": "#ff9561",
"PERSON": "#aa9cfc",
"NORP": "#c887fb",
"FACILITY": "#9cc9cc",
"FAC": "#9cc9cc",
"EVENT": "#ffeb80",
"LAW": "#ff8197",
"LANGUAGE": "#ff8197",


@ -483,7 +483,7 @@ class Errors(metaclass=ErrorsWithCodes):
"components, since spans are only views of the Doc. Use Doc and "
"Token attributes (or custom extension attributes) only and remove "
"the following: {attrs}")
E181 = ("Received invalid attributes for unkown object {obj}: {attrs}. "
E181 = ("Received invalid attributes for unknown object {obj}: {attrs}. "
"Only Doc and Token attributes are supported.")
E182 = ("Received invalid attribute declaration: {attr}\nDid you forget "
"to define the attribute? For example: `{attr}.???`")
@ -888,9 +888,12 @@ class Errors(metaclass=ErrorsWithCodes):
E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
"Non-UD tags should use the `tag` property.")
E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't exist.")
E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler patterns.")
E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
"exist.")
E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler "
"patterns.")
E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
"supported values are: 'I', 'O', 'B' and ''")
# Deprecated model shortcuts, only used in errors and warnings


@ -310,7 +310,6 @@ GLOSSARY = {
"re": "repeated element",
"rs": "reported speech",
"sb": "subject",
"sb": "subject",
"sbp": "passivized subject (PP)",
"sp": "subject or predicate",
"svp": "separable verb prefix",


@ -90,7 +90,7 @@ _eleven_to_beyond = [
"अड़सठ",
"उनहत्तर",
"सत्तर",
"इकहत्तर"
"इकहत्तर",
"बहत्तर",
"तिहत्तर",
"चौहत्तर",


@ -59,7 +59,7 @@ sentences = [
"Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
"Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”.",
"Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares.",
"Carros autônomos empurram a responsabilidade do seguro para os fabricantes.."
"Carros autônomos empurram a responsabilidade do seguro para os fabricantes..",
"São Francisco considera banir os robôs de entrega que andam pelas calçadas.",
"Londres é a maior cidade do Reino Unido.",
# Translations from English:


@ -131,7 +131,7 @@ class Language:
self,
vocab: Union[Vocab, bool] = True,
*,
max_length: int = 10 ** 6,
max_length: int = 10**6,
meta: Dict[str, Any] = {},
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
batch_size: int = 1000,
@ -354,12 +354,15 @@ class Language:
@property
def pipe_labels(self) -> Dict[str, List[str]]:
"""Get the labels set by the pipeline components, if available (if
the component exposes a labels property).
the component exposes a labels property and the labels are not
hidden).
RETURNS (Dict[str, List[str]]): Labels keyed by component name.
"""
labels = {}
for name, pipe in self._components:
if hasattr(pipe, "hide_labels") and pipe.hide_labels is True:
continue
if hasattr(pipe, "labels"):
labels[name] = list(pipe.labels)
return SimpleFrozenDict(labels)
@ -522,7 +525,7 @@ class Language:
requires: Iterable[str] = SimpleFrozenList(),
retokenizes: bool = False,
func: Optional["Pipe"] = None,
) -> Callable:
) -> Callable[..., Any]:
"""Register a new pipeline component. Can be used for stateless function
components that don't require a separate factory. Can be used as a
decorator on a function or classmethod, or called as a function with the
@ -1285,9 +1288,9 @@ class Language:
)
except IOError:
raise IOError(Errors.E884.format(vectors=I["vectors"]))
if self.vocab.vectors.data.shape[1] >= 1:
if self.vocab.vectors.shape[1] >= 1:
ops = get_current_ops()
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
self.vocab.vectors.to_ops(ops)
if hasattr(self.tokenizer, "initialize"):
tok_settings = validate_init_settings(
self.tokenizer.initialize, # type: ignore[union-attr]
@ -1332,8 +1335,8 @@ class Language:
DOCS: https://spacy.io/api/language#resume_training
"""
ops = get_current_ops()
if self.vocab.vectors.data.shape[1] >= 1:
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
if self.vocab.vectors.shape[1] >= 1:
self.vocab.vectors.to_ops(ops)
for name, proc in self.pipeline:
if hasattr(proc, "_rehearsal_model"):
proc._rehearsal_model = deepcopy(proc.model) # type: ignore[attr-defined]


@ -19,7 +19,7 @@ class Lexeme:
@property
def vector_norm(self) -> float: ...
vector: Floats1d
rank: str
rank: int
sentiment: float
@property
def orth_(self) -> str: ...


@ -130,7 +130,9 @@ cdef class Lexeme:
return 0.0
vector = self.vector
xp = get_array_module(vector)
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item()
@property
def has_vector(self):
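
With `.item()`, `Lexeme.similarity` now returns a plain Python float on both numpy and cupy backends (matching the `isinstance(..., float)` assertions added to the vector tests below); a tiny sketch with made-up vectors:

```python
import numpy
from spacy.vocab import Vocab

vocab = Vocab()
vocab.set_vector("apple", numpy.asarray([1.0, 2.0, 3.0], dtype="f"))
vocab.set_vector("orange", numpy.asarray([1.0, 2.0, 2.0], dtype="f"))

sim = vocab["apple"].similarity(vocab["orange"])
assert isinstance(sim, float)  # a scalar, not a 0-d array
```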


@ -0,0 +1,66 @@
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from .matcher import Matcher
from ..vocab import Vocab
from ..tokens.doc import Doc
from ..tokens.span import Span
class DependencyMatcher:
"""Match dependency parse tree based on pattern rules."""
_patterns: Dict[str, List[Any]]
_raw_patterns: Dict[str, List[Any]]
_tokens_to_key: Dict[str, List[Any]]
_root: Dict[str, List[Any]]
_tree: Dict[str, List[Any]]
_callbacks: Dict[
Any, Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
]
_ops: Dict[str, Any]
vocab: Vocab
_matcher: Matcher
def __init__(self, vocab: Vocab, *, validate: bool = ...) -> None: ...
def __reduce__(
self,
) -> Tuple[
Callable[
[Vocab, Dict[str, Any], Dict[str, Callable[..., Any]]], DependencyMatcher
],
Tuple[
Vocab,
Dict[str, List[Any]],
Dict[
str,
Callable[
[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any
],
],
],
None,
None,
]: ...
def __len__(self) -> int: ...
def __contains__(self, key: Union[str, int]) -> bool: ...
def add(
self,
key: Union[str, int],
patterns: List[List[Dict[str, Any]]],
*,
on_match: Optional[
Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
] = ...
) -> None: ...
def has_key(self, key: Union[str, int]) -> bool: ...
def get(
self, key: Union[str, int], default: Optional[Any] = ...
) -> Tuple[
Optional[
Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
],
List[List[Dict[str, Any]]],
]: ...
def remove(self, key: Union[str, int]) -> None: ...
def __call__(self, doclike: Union[Doc, Span]) -> List[Tuple[int, List[int]]]: ...
def unpickle_matcher(
vocab: Vocab, patterns: Dict[str, Any], callbacks: Dict[str, Callable[..., Any]]
) -> DependencyMatcher: ...


@ -1,4 +1,6 @@
from typing import Any, List, Dict, Tuple, Optional, Callable, Union, Iterator, Iterable
from typing import Any, List, Dict, Tuple, Optional, Callable, Union
from typing import Iterator, Iterable, overload
from ..compat import Literal
from ..vocab import Vocab
from ..tokens import Doc, Span
@ -31,12 +33,22 @@ class Matcher:
) -> Union[
Iterator[Tuple[Tuple[Doc, Any], Any]], Iterator[Tuple[Doc, Any]], Iterator[Doc]
]: ...
@overload
def __call__(
self,
doclike: Union[Doc, Span],
*,
as_spans: bool = ...,
as_spans: Literal[False] = ...,
allow_missing: bool = ...,
with_alignments: bool = ...
) -> Union[List[Tuple[int, int, int]], List[Span]]: ...
) -> List[Tuple[int, int, int]]: ...
@overload
def __call__(
self,
doclike: Union[Doc, Span],
*,
as_spans: Literal[True],
allow_missing: bool = ...,
with_alignments: bool = ...
) -> List[Span]: ...
def _normalize_key(self, key: Any) -> Any: ...


@ -18,7 +18,7 @@ from ..tokens.doc cimport Doc, get_token_attr_for_matcher
from ..tokens.span cimport Span
from ..tokens.token cimport Token
from ..tokens.morphanalysis cimport MorphAnalysis
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
from ..schemas import validate_token_pattern
from ..errors import Errors, MatchPatternError, Warnings
@ -798,6 +798,9 @@ def _get_attr_values(spec, string_store):
attr = "SENT_START"
attr = IDS.get(attr)
if isinstance(value, str):
if attr == ENT_IOB and value in Token.iob_strings():
value = Token.iob_strings().index(value)
else:
value = string_store.add(value)
elif isinstance(value, bool):
value = int(value)
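
In practice this lets token patterns match on the IOB part of the entity tag; a condensed version of the matcher test added further down:

```python
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.tokens import Span

nlp = English()
doc = nlp("I visited New York")
doc.ents = [Span(doc, 2, 4, label="GPE")]

matcher = Matcher(nlp.vocab)
matcher.add("INSIDE_ENT", [[{"ENT_IOB": "I"}]])
# "New" begins the entity (B); only "York" is inside it (I).
assert [doc[s:e].text for _, s, e in matcher(doc)] == ["York"]
```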


@ -1,6 +1,6 @@
from typing import List, Tuple, Union, Optional, Callable, Any, Dict
from . import Matcher
from typing import List, Tuple, Union, Optional, Callable, Any, Dict, overload
from ..compat import Literal
from .matcher import Matcher
from ..vocab import Vocab
from ..tokens import Doc, Span
@ -14,16 +14,24 @@ class PhraseMatcher:
def add(
self,
key: str,
docs: List[List[Dict[str, Any]]],
docs: List[Doc],
*,
on_match: Optional[
Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
] = ...,
) -> None: ...
def remove(self, key: str) -> None: ...
@overload
def __call__(
self,
doclike: Union[Doc, Span],
*,
as_spans: bool = ...,
) -> Union[List[Tuple[int, int, int]], List[Span]]: ...
as_spans: Literal[False] = ...,
) -> List[Tuple[int, int, int]]: ...
@overload
def __call__(
self,
doclike: Union[Doc, Span],
*,
as_spans: Literal[True],
) -> List[Span]: ...


@ -23,7 +23,7 @@ def create_pretrain_vectors(
maxout_pieces: int, hidden_size: int, loss: str
) -> Callable[["Vocab", Model], Model]:
def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
if vocab.vectors.data.shape[1] == 0:
if vocab.vectors.shape[1] == 0:
raise ValueError(Errors.E875)
model = build_cloze_multi_task_model(
vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
@ -85,7 +85,7 @@ def get_characters_loss(ops, docs, prediction, nr_char):
target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
target = target.reshape((-1, 256 * nr_char))
diff = prediction - target
loss = (diff ** 2).sum()
loss = (diff**2).sum()
d_target = diff / float(prediction.shape[0])
return loss, d_target
@ -116,7 +116,7 @@ def build_multi_task_model(
def build_cloze_multi_task_model(
vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int
) -> Model:
nO = vocab.vectors.data.shape[1]
nO = vocab.vectors.shape[1]
output_layer = chain(
cast(Model[List["Floats2d"], Floats2d], list2array()),
Maxout(


@ -94,7 +94,7 @@ def init(
nM = model.get_dim("nM") if model.has_dim("nM") else None
nO = model.get_dim("nO") if model.has_dim("nO") else None
if X is not None and len(X):
nM = X[0].vocab.vectors.data.shape[1]
nM = X[0].vocab.vectors.shape[1]
if Y is not None:
nO = Y.data.shape[1]


@ -1,3 +1,4 @@
from cython.operator cimport dereference as deref, preincrement as incr
from libc.string cimport memcpy, memset
from libc.stdlib cimport calloc, free
from libc.stdint cimport uint32_t, uint64_t
@ -184,16 +185,20 @@ cdef cppclass StateC:
int L(int head, int idx) nogil const:
if idx < 1 or this._left_arcs.size() == 0:
return -1
cdef vector[int] lefts
for i in range(this._left_arcs.size()):
arc = this._left_arcs.at(i)
# Work backwards through left-arcs to find the arc at the
# requested index more quickly.
cdef size_t child_index = 0
it = this._left_arcs.const_rbegin()
while it != this._left_arcs.rend():
arc = deref(it)
if arc.head == head and arc.child != -1 and arc.child < head:
lefts.push_back(arc.child)
idx = (<int>lefts.size()) - idx
if idx < 0:
child_index += 1
if child_index == idx:
return arc.child
incr(it)
return -1
else:
return lefts.at(idx)
int R(int head, int idx) nogil const:
if idx < 1 or this._right_arcs.size() == 0:


@ -604,7 +604,7 @@ cdef class ArcEager(TransitionSystem):
actions[SHIFT][''] += 1
if min_freq is not None:
for action, label_freqs in actions.items():
for label, freq in list(label_freqs.items()):
for label, freq in label_freqs.copy().items():
if freq < min_freq:
label_freqs.pop(label)
# Ensure these actions are present


@ -26,6 +26,8 @@ class Pipe:
@property
def labels(self) -> Tuple[str, ...]: ...
@property
def hide_labels(self) -> bool: ...
@property
def label_data(self) -> Any: ...
def _require_labels(self) -> None: ...
def set_error_handler(


@ -102,6 +102,10 @@ cdef class Pipe:
def labels(self) -> Tuple[str, ...]:
return tuple()
@property
def hide_labels(self) -> bool:
return False
@property
def label_data(self):
"""Optional JSON-serializable data that would be sufficient to recreate


@ -99,6 +99,10 @@ class SentenceRecognizer(Tagger):
# are 0
return tuple(["I", "S"])
@property
def hide_labels(self):
return True
@property
def label_data(self):
return None
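
The practical effect, also asserted in the senter tests below: the component keeps reporting its internal labels, but they no longer leak into `Language.pipe_labels`.

```python
from spacy.lang.en import English

nlp = English()
nlp.add_pipe("senter")

assert nlp.get_pipe("senter").labels == ("I", "S")  # still exposed on the pipe
assert "senter" not in nlp.pipe_labels              # hidden from pipe_labels
```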


@ -377,7 +377,7 @@ class SpanCategorizer(TrainablePipe):
# If the prediction is 0.9 and it's false, the gradient will be
# 0.9 (0.9 - 0.0)
d_scores = scores - target
loss = float((d_scores ** 2).sum())
loss = float((d_scores**2).sum())
return loss, d_scores
def initialize(
@ -412,7 +412,7 @@ class SpanCategorizer(TrainablePipe):
self._require_labels()
if subbatch:
docs = [eg.x for eg in subbatch]
spans = self.suggester(docs)
spans = build_ngram_suggester(sizes=[1])(docs)
Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels))
self.model.initialize(X=(docs, spans), Y=Y)
else:


@ -281,7 +281,7 @@ class TextCategorizer(TrainablePipe):
bp_scores(gradient)
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += (gradient ** 2).sum()
losses[self.name] += (gradient**2).sum()
return losses
def _examples_to_truth(
@ -315,7 +315,7 @@ class TextCategorizer(TrainablePipe):
not_missing = self.model.ops.asarray(not_missing) # type: ignore
d_scores = (scores - truths) / scores.shape[0]
d_scores *= not_missing
mean_square_error = (d_scores ** 2).sum(axis=1).mean()
mean_square_error = (d_scores**2).sum(axis=1).mean()
return float(mean_square_error), d_scores
def add_label(self, label: str) -> int:


@ -1,5 +1,6 @@
from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
from typing import Iterable, TypeVar, TYPE_CHECKING
from .compat import Literal
from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator, create_model
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
@ -209,6 +210,7 @@ NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
UnderscoreValue = Union[
TokenPatternString, TokenPatternNumber, str, int, float, list, bool
]
IobValue = Literal["", "I", "O", "B", 0, 1, 2, 3]
class TokenPattern(BaseModel):
@ -222,6 +224,7 @@ class TokenPattern(BaseModel):
lemma: Optional[StringValue] = None
shape: Optional[StringValue] = None
ent_type: Optional[StringValue] = None
ent_iob: Optional[IobValue] = None
ent_id: Optional[StringValue] = None
ent_kb_id: Optional[StringValue] = None
norm: Optional[StringValue] = None


@ -567,6 +567,7 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
"Merging the docs is fun.",
"",
"They don't think alike. ",
"",
"Another doc.",
]
en_texts_without_empty = [t for t in en_texts if len(t)]
@ -574,9 +575,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
en_docs = [en_tokenizer(text) for text in en_texts]
en_docs[0].spans["group"] = [en_docs[0][1:4]]
en_docs[2].spans["group"] = [en_docs[2][1:4]]
en_docs[3].spans["group"] = [en_docs[3][0:1]]
en_docs[4].spans["group"] = [en_docs[4][0:1]]
span_group_texts = sorted(
[en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
[en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text]
)
de_doc = de_tokenizer(de_text)
Token.set_extension("is_ambiguous", default=False)
@ -683,6 +684,7 @@ def test_has_annotation(en_vocab):
attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")
for attr in attrs:
assert not doc.has_annotation(attr)
assert not doc.has_annotation(attr, require_complete=True)
doc[0].tag_ = "A"
doc[0].pos_ = "X"
@ -708,6 +710,27 @@ def test_has_annotation(en_vocab):
assert doc.has_annotation(attr, require_complete=True)
def test_has_annotation_sents(en_vocab):
doc = Doc(en_vocab, words=["Hello", "beautiful", "world"])
attrs = ("SENT_START", "IS_SENT_START", "IS_SENT_END")
for attr in attrs:
assert not doc.has_annotation(attr)
assert not doc.has_annotation(attr, require_complete=True)
# The first token (index 0) is always assumed to be a sentence start,
# and ignored by the check in doc.has_annotation
doc[1].is_sent_start = False
for attr in attrs:
assert doc.has_annotation(attr)
assert not doc.has_annotation(attr, require_complete=True)
doc[2].is_sent_start = False
for attr in attrs:
assert doc.has_annotation(attr)
assert doc.has_annotation(attr, require_complete=True)
def test_is_flags_deprecated(en_tokenizer):
doc = en_tokenizer("test")
with pytest.deprecated_call():


@ -1,4 +1,5 @@
import pytest
from spacy.attrs import intify_attrs, ENT_IOB
from spacy.attrs import IS_ALPHA, LEMMA, NORM, ORTH, intify_attrs
from spacy.lang.en.stop_words import STOP_WORDS
@ -33,6 +34,38 @@ def test_attrs_do_deprecated(text):
assert int_attrs == {ORTH: 10, IS_ALPHA: True}
def test_attrs_ent_iob_intify():
int_attrs = intify_attrs({"ENT_IOB": ""})
assert int_attrs == {ENT_IOB: 0}
int_attrs = intify_attrs({"ENT_IOB": "I"})
assert int_attrs == {ENT_IOB: 1}
int_attrs = intify_attrs({"ENT_IOB": "O"})
assert int_attrs == {ENT_IOB: 2}
int_attrs = intify_attrs({"ENT_IOB": "B"})
assert int_attrs == {ENT_IOB: 3}
int_attrs = intify_attrs({ENT_IOB: ""})
assert int_attrs == {ENT_IOB: 0}
int_attrs = intify_attrs({ENT_IOB: "I"})
assert int_attrs == {ENT_IOB: 1}
int_attrs = intify_attrs({ENT_IOB: "O"})
assert int_attrs == {ENT_IOB: 2}
int_attrs = intify_attrs({ENT_IOB: "B"})
assert int_attrs == {ENT_IOB: 3}
with pytest.raises(ValueError):
int_attrs = intify_attrs({"ENT_IOB": "XX"})
with pytest.raises(ValueError):
int_attrs = intify_attrs({ENT_IOB: "XX"})
@pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)])
def test_lex_attrs_is_punct(text, match):
assert is_punct(text) == match


@ -642,3 +642,30 @@ def test_matcher_no_zero_length(en_vocab):
matcher = Matcher(en_vocab)
matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
assert len(matcher(doc)) == 0
def test_matcher_ent_iob_key(en_vocab):
"""Test that patterns with ent_iob works correctly."""
matcher = Matcher(en_vocab)
matcher.add("Rule", [[{"ENT_IOB": "I"}]])
doc1 = Doc(en_vocab, words=["I", "visited", "New", "York", "and", "California"])
doc1.ents = [Span(doc1, 2, 4, label="GPE"), Span(doc1, 5, 6, label="GPE")]
doc2 = Doc(en_vocab, words=["I", "visited", "my", "friend", "Alicia"])
doc2.ents = [Span(doc2, 4, 5, label="PERSON")]
matches1 = [doc1[start:end].text for _, start, end in matcher(doc1)]
matches2 = [doc2[start:end].text for _, start, end in matcher(doc2)]
assert len(matches1) == 1
assert matches1[0] == "York"
assert len(matches2) == 0
matcher = Matcher(en_vocab) # Test iob pattern with operators
matcher.add("Rule", [[{"ENT_IOB": "I", "OP": "+"}]])
doc = Doc(
en_vocab, words=["I", "visited", "my", "friend", "Anna", "Maria", "Esperanza"]
)
doc.ents = [Span(doc, 4, 7, label="PERSON")]
matches = [doc[start:end].text for _, start, end in matcher(doc)]
assert len(matches) == 3
assert matches[0] == "Maria"
assert matches[1] == "Maria Esperanza"
assert matches[2] == "Esperanza"


@ -12,6 +12,7 @@ TEST_PATTERNS = [
([{"IS_PUNCT": True, "OP": "$"}], 1, 1),
([{"_": "foo"}], 1, 1),
('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
([{"ENT_IOB": "foo"}], 1, 1),
([1, 2, 3], 3, 1),
# Bad patterns flagged outside of Matcher
([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0)


@ -12,6 +12,7 @@ def test_build_dependencies():
"flake8",
"hypothesis",
"pre-commit",
"black",
"mypy",
"types-dataclasses",
"types-mock",


@ -97,3 +97,7 @@ def test_overfitting_IO():
]
assert_equal(batch_deps_1, batch_deps_2)
assert_equal(batch_deps_1, no_batch_deps)
# test internal pipe labels vs. Language.pipe_labels with hidden labels
assert nlp.get_pipe("senter").labels == ("I", "S")
assert "senter" not in nlp.pipe_labels


@ -80,6 +80,8 @@ def test_explicit_labels():
assert spancat.labels == ("PERSON", "LOC")
# TODO figure out why this is flaky
@pytest.mark.skip(reason="Test is unreliable for unknown reason")
def test_doc_gc():
# If the Doc object is garbage collected, the spans won't be functional afterwards
nlp = Language()
@ -97,6 +99,7 @@ def test_doc_gc():
assert isinstance(spangroups, SpanGroups)
for key, spangroup in spangroups.items():
assert isinstance(spangroup, SpanGroup)
# XXX This fails with length 0 sometimes
assert len(spangroup) > 0
with pytest.raises(RuntimeError):
span = spangroup[0]


@ -12,14 +12,18 @@ from spacy.cli._util import is_subpath_of, load_project_config
from spacy.cli._util import parse_config_overrides, string_to_list
from spacy.cli._util import substitute_project_variables
from spacy.cli._util import validate_project_commands
from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
from spacy.cli.debug_data import _get_labels_from_spancat
from spacy.cli.download import get_compatibility, get_version
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
from spacy.cli.package import get_third_party_dependencies
from spacy.cli.package import _is_permitted_package_name
from spacy.cli.validate import get_model_pkgs
from spacy.lang.en import English
from spacy.lang.nl import Dutch
from spacy.language import Language
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
from spacy.tokens import Doc
from spacy.training import Example, docs_to_json, offsets_to_biluo_tags
from spacy.training.converters import conll_ner_to_docs, conllu_to_docs
from spacy.training.converters import iob_to_docs
@ -665,3 +669,54 @@ def test_get_third_party_dependencies():
)
def test_is_subpath_of(parent, child, expected):
assert is_subpath_of(parent, child) == expected
@pytest.mark.slow
@pytest.mark.parametrize(
"factory_name,pipe_name",
[
("ner", "ner"),
("ner", "my_ner"),
("spancat", "spancat"),
("spancat", "my_spancat"),
],
)
def test_get_labels_from_model(factory_name, pipe_name):
labels = ("A", "B")
nlp = English()
pipe = nlp.add_pipe(factory_name, name=pipe_name)
for label in labels:
pipe.add_label(label)
nlp.initialize()
assert nlp.get_pipe(pipe_name).labels == labels
if factory_name == "spancat":
assert _get_labels_from_spancat(nlp)[pipe.key] == set(labels)
else:
assert _get_labels_from_model(nlp, factory_name) == set(labels)
def test_permitted_package_names():
# https://www.python.org/dev/peps/pep-0426/#name
assert _is_permitted_package_name("Meine_Bäume") == False
assert _is_permitted_package_name("_package") == False
assert _is_permitted_package_name("package_") == False
assert _is_permitted_package_name(".package") == False
assert _is_permitted_package_name("package.") == False
assert _is_permitted_package_name("-package") == False
assert _is_permitted_package_name("package-") == False
def test_debug_data_compile_gold():
nlp = English()
pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "O", "B-ENT"])
eg = Example(pred, ref)
data = _compile_gold([eg], ["ner"], nlp, True)
assert data["boundary_cross_ents"] == 0
pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "B-ENT", "I-ENT"])
eg = Example(pred, ref)
data = _compile_gold([eg], ["ner"], nlp, True)
assert data["boundary_cross_ents"] == 1


@ -9,6 +9,7 @@ from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc
from spacy.training import Example
from spacy.util import compile_prefix_regex, compile_suffix_regex, ensure_path
from spacy.util import compile_infix_regex
from spacy.vocab import Vocab
from spacy.symbols import ORTH
@ -503,3 +504,20 @@ def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
assert tokens == ["a", "10", "."]
explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
assert tokens == explain_tokens
def test_tokenizer_infix_prefix(en_vocab):
# the prefix and suffix matches overlap in the suffix lookbehind
infixes = ["±"]
suffixes = ["%"]
infix_re = compile_infix_regex(infixes)
suffix_re = compile_suffix_regex(suffixes)
tokenizer = Tokenizer(
en_vocab,
infix_finditer=infix_re.finditer,
suffix_search=suffix_re.search,
)
tokens = [t.text for t in tokenizer("±10%")]
assert tokens == ["±10", "%"]
explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
assert tokens == explain_tokens


@ -35,6 +35,7 @@ def test_vectors_similarity_LL(vocab, vectors):
assert lex1.vector_norm != 0
assert lex2.vector_norm != 0
assert lex1.vector[0] != lex2.vector[0] and lex1.vector[1] != lex2.vector[1]
assert isinstance(lex1.similarity(lex2), float)
assert numpy.isclose(lex1.similarity(lex2), get_cosine(vec1, vec2))
assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))
@ -47,25 +48,46 @@ def test_vectors_similarity_TT(vocab, vectors):
assert doc[0].vector_norm != 0
assert doc[1].vector_norm != 0
assert doc[0].vector[0] != doc[1].vector[0] and doc[0].vector[1] != doc[1].vector[1]
assert isinstance(doc[0].similarity(doc[1]), float)
assert numpy.isclose(doc[0].similarity(doc[1]), get_cosine(vec1, vec2))
assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))
def test_vectors_similarity_SS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
assert isinstance(doc[0:1].similarity(doc[0:2]), float)
assert doc[0:1].similarity(doc[0:2]) == doc[0:2].similarity(doc[0:1])
def test_vectors_similarity_DD(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc1 = Doc(vocab, words=[word1, word2])
doc2 = Doc(vocab, words=[word2, word1])
assert isinstance(doc1.similarity(doc2), float)
assert doc1.similarity(doc2) == doc2.similarity(doc1)
def test_vectors_similarity_TD(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
with pytest.warns(UserWarning):
assert isinstance(doc.similarity(doc[0]), float)
assert isinstance(doc[0].similarity(doc), float)
assert doc.similarity(doc[0]) == doc[0].similarity(doc)
def test_vectors_similarity_DS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
def test_vectors_similarity_TS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
with pytest.warns(UserWarning):
assert isinstance(doc[:2].similarity(doc[0]), float)
assert isinstance(doc[0].similarity(doc[-2]), float)
assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])
def test_vectors_similarity_DS(vocab, vectors):
[(word1, vec1), (word2, vec2)] = vectors
doc = Doc(vocab, words=[word1, word2])
assert isinstance(doc.similarity(doc[:2]), float)
assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)


@ -421,7 +421,7 @@ def test_vector_is_oov():
def test_init_vectors_unset():
v = Vectors(shape=(10, 10))
assert v.is_full is False
assert v.data.shape == (10, 10)
assert v.shape == (10, 10)
with pytest.raises(ValueError):
v = Vectors(shape=(10, 10), mode="floret")
@ -514,7 +514,7 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
# rows: 2 rows per ngram
rows = OPS.xp.asarray(
[
h % nlp.vocab.vectors.data.shape[0]
h % nlp.vocab.vectors.shape[0]
for ngram in ngrams
for h in nlp.vocab.vectors._get_ngram_hashes(ngram)
],
@ -544,17 +544,17 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
# an empty key returns 0s
assert_equal(
OPS.to_numpy(nlp.vocab[""].vector),
numpy.zeros((nlp.vocab.vectors.data.shape[0],)),
numpy.zeros((nlp.vocab.vectors.shape[0],)),
)
# an empty batch returns 0s
assert_equal(
OPS.to_numpy(nlp.vocab.vectors.get_batch([""])),
numpy.zeros((1, nlp.vocab.vectors.data.shape[0])),
numpy.zeros((1, nlp.vocab.vectors.shape[0])),
)
# an empty key within a batch returns 0s
assert_equal(
OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]),
numpy.zeros((nlp.vocab.vectors.data.shape[0],)),
numpy.zeros((nlp.vocab.vectors.shape[0],)),
)
# the loaded ngram vector table cannot be modified


@ -45,10 +45,12 @@ cdef class Tokenizer:
`re.compile(string).search` to match suffixes.
`infix_finditer` (callable): A function matching the signature of
`re.compile(string).finditer` to find infixes.
token_match (callable): A boolean function matching strings to be
token_match (callable): A function matching the signature of
`re.compile(string).match`, for matching strings to be
recognized as tokens.
url_match (callable): A boolean function matching strings to be
recognized as tokens after considering prefixes and suffixes.
url_match (callable): A function matching the signature of
`re.compile(string).match`, for matching strings to be
recognized as urls.
EXAMPLE:
>>> tokenizer = Tokenizer(nlp.vocab)
@ -681,6 +683,8 @@ cdef class Tokenizer:
infixes = infix_finditer(substring)
offset = 0
for match in infixes:
if offset == 0 and match.start() == 0:
continue
if substring[offset : match.start()]:
tokens.append(("TOKEN", substring[offset : match.start()]))
if substring[match.start() : match.end()]:


@ -10,7 +10,7 @@ from ..lexeme import Lexeme
from ..vocab import Vocab
from .underscore import Underscore
from pathlib import Path
import numpy
import numpy as np
class DocMethod(Protocol):
def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc]
@ -26,7 +26,7 @@ class Doc:
user_hooks: Dict[str, Callable[..., Any]]
user_token_hooks: Dict[str, Callable[..., Any]]
user_span_hooks: Dict[str, Callable[..., Any]]
tensor: numpy.ndarray
tensor: np.ndarray[Any, np.dtype[np.float_]]
user_data: Dict[str, Any]
has_unknown_spaces: bool
_context: Any
@ -144,7 +144,7 @@ class Doc:
) -> Doc: ...
def to_array(
self, py_attr_ids: Union[int, str, List[Union[int, str]]]
) -> numpy.ndarray: ...
) -> np.ndarray[Any, np.dtype[np.float_]]: ...
@staticmethod
def from_docs(
docs: List[Doc],


@ -420,6 +420,8 @@ cdef class Doc:
cdef int range_start = 0
if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]:
attr = SENT_START
elif attr == "IS_SENT_END" or attr == self.vocab.strings["IS_SENT_END"]:
attr = SENT_START
attr = intify_attr(attr)
# adjust attributes
if attr == HEAD:
@ -616,7 +618,7 @@ cdef class Doc:
"""
if "has_vector" in self.user_hooks:
return self.user_hooks["has_vector"](self)
elif self.vocab.vectors.data.size:
elif self.vocab.vectors.size:
return True
elif self.tensor.size:
return True
@ -641,7 +643,7 @@ cdef class Doc:
if not len(self):
self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f")
return self._vector
elif self.vocab.vectors.data.size > 0:
elif self.vocab.vectors.size > 0:
self._vector = sum(t.vector for t in self) / len(self)
return self._vector
elif self.tensor.size > 0:
@ -1183,7 +1185,7 @@ cdef class Doc:
token_offset = -1
for doc in docs[:-1]:
token_offset += len(doc)
if not (len(doc) > 0 and doc[-1].is_space):
if len(doc) > 0 and not doc[-1].is_space:
concat_spaces[token_offset] = True
concat_array = numpy.concatenate(arrays)


@ -364,7 +364,9 @@ cdef class Span:
return 0.0
vector = self.vector
xp = get_array_module(vector)
return xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item()
cpdef np.ndarray to_array(self, object py_attr_ids):
"""Given a list of M attribute IDs, export the tokens to a numpy
@ -497,7 +499,7 @@ cdef class Span:
"""
if "has_vector" in self.doc.user_span_hooks:
return self.doc.user_span_hooks["has_vector"](self)
elif self.vocab.vectors.data.size > 0:
elif self.vocab.vectors.size > 0:
return any(token.has_vector for token in self)
elif self.doc.tensor.size > 0:
return True


@ -20,6 +20,7 @@ from .doc cimport set_children_from_heads
from .. import parts_of_speech
from ..errors import Errors, Warnings
from ..attrs import IOB_STRINGS
from .underscore import Underscore, get_ext_args
@ -209,7 +210,9 @@ cdef class Token:
return 0.0
vector = self.vector
xp = get_array_module(vector)
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item()
def has_morph(self):
"""Check whether the token has annotated morph information.
@ -484,8 +487,6 @@ cdef class Token:
RETURNS (bool / None): Whether the token starts a sentence.
None if unknown.
DOCS: https://spacy.io/api/token#is_sent_start
"""
def __get__(self):
if self.c.sent_start == 0:
@ -743,7 +744,7 @@ cdef class Token:
@classmethod
def iob_strings(cls):
return ("", "I", "O", "B")
return IOB_STRINGS
@property
def ent_iob_(self):


@ -1,17 +1,31 @@
from typing import Dict, Any
from typing import Dict, Any, List, Optional, Tuple, Union, TYPE_CHECKING
import functools
import copy
from ..errors import Errors
if TYPE_CHECKING:
from .doc import Doc
from .span import Span
from .token import Token
class Underscore:
mutable_types = (dict, list, set)
doc_extensions: Dict[Any, Any] = {}
span_extensions: Dict[Any, Any] = {}
token_extensions: Dict[Any, Any] = {}
_extensions: Dict[str, Any]
_obj: Union["Doc", "Span", "Token"]
_start: Optional[int]
_end: Optional[int]
def __init__(self, extensions, obj, start=None, end=None):
def __init__(
self,
extensions: Dict[str, Any],
obj: Union["Doc", "Span", "Token"],
start: Optional[int] = None,
end: Optional[int] = None,
):
object.__setattr__(self, "_extensions", extensions)
object.__setattr__(self, "_obj", obj)
# Assumption is that for doc values, _start and _end will both be None
@ -23,12 +37,12 @@ class Underscore:
object.__setattr__(self, "_start", start)
object.__setattr__(self, "_end", end)
def __dir__(self):
def __dir__(self) -> List[str]:
# Hack to enable autocomplete on custom extensions
extensions = list(self._extensions.keys())
return ["set", "get", "has"] + extensions
def __getattr__(self, name):
def __getattr__(self, name: str) -> Any:
if name not in self._extensions:
raise AttributeError(Errors.E046.format(name=name))
default, method, getter, setter = self._extensions[name]
@ -56,7 +70,7 @@ class Underscore:
return new_default
return default
def __setattr__(self, name, value):
def __setattr__(self, name: str, value: Any):
if name not in self._extensions:
raise AttributeError(Errors.E047.format(name=name))
default, method, getter, setter = self._extensions[name]
@ -65,28 +79,30 @@ class Underscore:
else:
self._doc.user_data[self._get_key(name)] = value
def set(self, name, value):
def set(self, name: str, value: Any):
return self.__setattr__(name, value)
def get(self, name):
def get(self, name: str) -> Any:
return self.__getattr__(name)
def has(self, name):
def has(self, name: str) -> bool:
return name in self._extensions
def _get_key(self, name):
def _get_key(self, name: str) -> Tuple[str, str, Optional[int], Optional[int]]:
return ("._.", name, self._start, self._end)
@classmethod
def get_state(cls):
def get_state(cls) -> Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]:
return cls.token_extensions, cls.span_extensions, cls.doc_extensions
@classmethod
def load_state(cls, state):
def load_state(
cls, state: Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]
) -> None:
cls.token_extensions, cls.span_extensions, cls.doc_extensions = state
def get_ext_args(**kwargs):
def get_ext_args(**kwargs: Any):
"""Validate and convert arguments. Reused in Doc, Token and Span."""
default = kwargs.get("default")
getter = kwargs.get("getter")


@ -164,7 +164,7 @@ def load_vectors_into_model(
len(vectors_nlp.vocab.vectors.keys()) == 0
and vectors_nlp.vocab.vectors.mode != VectorsMode.floret
) or (
vectors_nlp.vocab.vectors.data.shape[0] == 0
vectors_nlp.vocab.vectors.shape[0] == 0
and vectors_nlp.vocab.vectors.mode == VectorsMode.floret
):
logger.warning(Warnings.W112.format(name=name))


@ -871,7 +871,6 @@ def get_package_path(name: str) -> Path:
name (str): Package name.
RETURNS (Path): Path to installed package.
"""
name = name.lower() # use lowercase version to be safe
# Here we're importing the module just to find it. This is worryingly
# indirect, but it's otherwise very difficult to find the package.
pkg = importlib.import_module(name)


@ -1,5 +1,5 @@
cimport numpy as np
from libc.stdint cimport uint32_t
from libc.stdint cimport uint32_t, uint64_t
from cython.operator cimport dereference as deref
from libcpp.set cimport set as cppset
from murmurhash.mrmr cimport hash128_x64
@ -10,7 +10,7 @@ from typing import cast
import warnings
from enum import Enum
import srsly
from thinc.api import get_array_module, get_current_ops
from thinc.api import Ops, get_array_module, get_current_ops
from thinc.backends import get_array_ops
from thinc.types import Floats2d
@ -146,7 +146,7 @@ cdef class Vectors:
DOCS: https://spacy.io/api/vectors#size
"""
return self.data.shape[0] * self.data.shape[1]
return self.data.size
@property
def is_full(self):
@ -274,7 +274,7 @@ cdef class Vectors:
self.data = resized_array
self._sync_unset()
removed_items = []
for key, row in list(self.key2row.items()):
for key, row in self.key2row.copy().items():
if row >= shape[0]:
self.key2row.pop(key)
removed_items.append((key, row))
@ -353,12 +353,18 @@ cdef class Vectors:
key (str): The string key.
RETURNS: A list of the integer hashes.
"""
cdef uint32_t[4] out
# MurmurHash3_x64_128 returns an array of 2 uint64_t values.
cdef uint64_t[2] out
chars = s.encode("utf8")
cdef char* utf8_string = chars
hash128_x64(utf8_string, len(chars), self.hash_seed, &out)
rows = [out[i] for i in range(min(self.hash_count, 4))]
return rows
rows = [
out[0] & 0xffffffffu,
out[0] >> 32,
out[1] & 0xffffffffu,
out[1] >> 32,
]
return rows[:min(self.hash_count, 4)]
def _get_ngrams(self, unicode key):
"""Get all padded ngram strings using the ngram settings.
@ -511,6 +517,9 @@ cdef class Vectors:
for i in range(len(queries)) ], dtype="uint64")
return (keys, best_rows, scores)
def to_ops(self, ops: Ops):
self.data = ops.asarray(self.data)
def _get_cfg(self):
if self.mode == Mode.default:
return {
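
`to_ops` is what `Language.initialize` and `resume_training` now call instead of assigning to `vectors.data` directly (see the `language.py` hunk above); a minimal sketch:

```python
from thinc.api import get_current_ops
from spacy.lang.en import English

nlp = English()
ops = get_current_ops()
# Move the vector table to the current backend (numpy, or cupy on GPU).
nlp.vocab.vectors.to_ops(ops)
```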


@ -283,7 +283,7 @@ cdef class Vocab:
@property
def vectors_length(self):
return self.vectors.data.shape[1]
return self.vectors.shape[1]
def reset_vectors(self, *, width=None, shape=None):
"""Drop the current vector table. Because all vectors must be the same
@ -294,7 +294,7 @@ cdef class Vocab:
elif shape is not None:
self.vectors = Vectors(strings=self.strings, shape=shape)
else:
width = width if width is not None else self.vectors.data.shape[1]
width = width if width is not None else self.vectors.shape[1]
self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width))
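A minimal usage sketch of `reset_vectors` with the updated shape handling, assuming a blank pipeline (the table size is chosen arbitrarily):

```python
import spacy

nlp = spacy.blank("en")
# Drop whatever table exists and start over with 100 rows of 300-dim vectors.
nlp.vocab.reset_vectors(shape=(100, 300))
print(nlp.vocab.vectors.shape)   # (100, 300)
```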
def prune_vectors(self, nr_row, batch_size=1024):

View File

@ -79,6 +79,7 @@ train/test skew.
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
| `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ |
| `shuffle` | Whether to shuffle the examples. Defaults to `False`. ~~bool~~ |
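A minimal sketch of reading a corpus with the new `shuffle` flag documented above; the path is a placeholder and the pipeline is just a blank one:

```python
import spacy
from spacy.training import Corpus

nlp = spacy.blank("en")
# shuffle=True randomizes the example order on each pass over the data.
corpus = Corpus("./corpus/train.spacy", shuffle=True)
examples = corpus(nlp)           # lazily yields Example objects
```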
## Corpus.\_\_call\_\_ {#call tag="method"}

View File

@ -304,7 +304,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
## Doc.has_annotation {#has_annotation tag="method"}
Check whether the doc contains annotation on a token attribute.
Check whether the doc contains annotation on a [`Token` attribute](/api/token#attributes).
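For example, a small sketch using only a blank pipeline plus a sentencizer, so no trained components are required:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
doc = nlp("This is a sentence. This is another one.")
print(doc.has_annotation("SENT_START"))   # True, set by the sentencizer
print(doc.has_annotation("DEP"))          # False, no parser in this pipeline
```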
<Infobox title="Changed in v3.0" variant="warning">

View File

@ -44,6 +44,7 @@ rule-based matching are:
| `SPACY` | Token has a trailing space. ~~bool~~ |
| `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
| `ENT_TYPE` | The token's entity label. ~~str~~ |
| `ENT_IOB` | The IOB part of the token's entity tag. ~~str~~ |
| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ |
| `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ |
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
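A sketch of the new `ENT_IOB` attribute in a token pattern; it assumes a pipeline with a trained entity recognizer such as `en_core_web_sm`, which has to be installed separately:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# A token that begins an entity, followed by one or more tokens inside it.
matcher.add("MULTI_TOKEN_ENT", [[{"ENT_IOB": "B"}, {"ENT_IOB": "I", "OP": "+"}]])

doc = nlp("Apple is opening a store in New York City.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)   # e.g. "New York", "New York City"
```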

View File

@ -349,23 +349,6 @@ A sequence containing the token and all the token's syntactic descendants.
| ---------- | ------------------------------------------------------------------------------------ |
| **YIELDS** | A descendant token such that `self.is_ancestor(token)` or `token == self`. ~~Token~~ |
## Token.is_sent_start {#is_sent_start tag="property" new="2"}
A boolean value indicating whether the token starts a sentence. `None` if
unknown. Defaults to `True` for the first token in the `Doc`.
> #### Example
>
> ```python
> doc = nlp("Give it back! He pleaded.")
> assert doc[4].is_sent_start
> assert not doc[5].is_sent_start
> ```
| Name | Description |
| ----------- | ------------------------------------------------------- |
| **RETURNS** | Whether the token starts a sentence. ~~Optional[bool]~~ |
## Token.has_vector {#has_vector tag="property" model="vectors"}
A boolean value indicating whether a word vector is associated with the token.
@ -465,6 +448,8 @@ The L2 norm of the token's vector representation.
| `is_punct` | Is the token punctuation? ~~bool~~ |
| `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~ |
| `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ |
| `is_sent_start` | Does the token start a sentence? Defaults to `True` for the first token in the `Doc`, `None` if unknown. ~~Optional[bool]~~ |
| `is_sent_end` | Does the token end a sentence? `None` if unknown. ~~Optional[bool]~~ |
| `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ |
| `is_bracket` | Is the token a bracket? ~~bool~~ |
| `is_quote` | Is the token a quotation mark? ~~bool~~ |
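The standalone `is_sent_start` section above is folded into the attributes table; as a sketch, the behavior it documented can still be checked with just a sentencizer:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
doc = nlp("Give it back! He pleaded.")
assert doc[4].is_sent_start           # "He" opens the second sentence
assert doc[3].is_sent_end             # "!" closes the first sentence
assert doc[5].is_sent_start is False  # "pleaded" does not start a sentence
```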

View File

@ -371,6 +371,23 @@ Get the vectors for the provided keys efficiently as a batch.
| ------ | --------------------------------------- |
| `keys` | The keys. ~~Iterable[Union[int, str]]~~ |
## Vectors.to_ops {#to_ops tag="method"}
Change the embedding matrix to use different Thinc ops.
> #### Example
>
> ```python
> from thinc.api import NumpyOps
>
> vectors.to_ops(NumpyOps())
> vectors.to_ops(NumpyOps())
> ```
| Name | Description |
| ------ | --------------------------------------------------------- |
| `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~ |
## Vectors.to_disk {#to_disk tag="method"}
Save the current state to a directory.

View File

@ -831,6 +831,8 @@ def tokenizer_pseudo_code(
infixes = infix_finditer(substring)
offset = 0
for match in infixes:
if offset == 0 and match.start() == 0:
continue
tokens.append(substring[offset : match.start()])
tokens.append(substring[match.start() : match.end()])
offset = match.end()
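A self-contained sketch of the loop above with a toy infix pattern, showing what the new guard prevents: without it, an infix match at position 0 would emit an empty token before the leading infix character.

```python
import re

infix_finditer = re.compile(r"-").finditer   # toy infix rule: hyphens only

def split_infixes(substring: str):
    tokens = []
    offset = 0
    for match in infix_finditer(substring):
        if offset == 0 and match.start() == 0:
            continue   # the guard: skip an infix match at the very start
        tokens.append(substring[offset : match.start()])
        tokens.append(substring[match.start() : match.end()])
        offset = match.end()
    if offset < len(substring):
        tokens.append(substring[offset:])    # trailing remainder
    return tokens

print(split_infixes("-well-known"))   # ['-well', '-', 'known'], no empty token
```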

View File

@ -213,6 +213,12 @@ format, train a pipeline, evaluate it and export metrics, package it and spin up
a quick web demo. It looks pretty similar to a config file used to define CI
pipelines.
> #### Tip: Multi-line YAML syntax for long values
>
> YAML has [multi-line syntax](https://yaml-multiline.info/) that can be
> helpful for readability with longer values such as project descriptions or
> commands that take several arguments.
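As a small demonstration of the folded block style (this sketch assumes PyYAML is available; a `project.yml` itself only needs the YAML):

```python
import yaml

config = yaml.safe_load(
    """
description: >
  This pipeline tags and parses Universal Dependencies data,
  then packages the trained artifacts.
"""
)
print(config["description"])   # the folded scalar collapses into a single line
```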
```yaml
%%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml
```

View File

@ -141,7 +141,8 @@
"website": "https://www.nr.no/~plison"
},
"category": ["pipeline", "standalone", "research", "training"],
"tags": []
"tags": [],
"spacy_version": 3
},
{
"id": "numerizer",
@ -952,6 +953,37 @@
"category": ["pipeline"],
"tags": ["lemmatizer", "danish"]
},
{
"id": "augmenty",
"title": "Augmenty",
"slogan": "The cherry on top of your NLP pipeline",
"description": "Augmenty is an augmentation library based on spaCy for augmenting texts. Augmenty differs from other augmentation libraries in that it corrects (as far as possible) the token, sentence and document labels under the augmentation.",
"github": "kennethenevoldsen/augmenty",
"pip": "augmenty",
"code_example": [
"import spacy",
"import augmenty",
"",
"nlp = spacy.load('en_core_web_md')",
"",
"docs = nlp.pipe(['Augmenty is a great tool for text augmentation'])",
"",
"ent_dict = {'ORG': [['spaCy'], ['spaCy', 'Universe']]}",
"entity_augmenter = augmenty.load('ents_replace.v1',",
" ent_dict = ent_dict, level=1)",
"",
"for doc in augmenty.docs(docs, augmenter=entity_augmenter, nlp=nlp):",
" print(doc)"
],
"thumb": "https://github.com/KennethEnevoldsen/augmenty/blob/master/img/icon.png?raw=true",
"author": "Kenneth Enevoldsen",
"author_links": {
"github": "kennethenevoldsen",
"website": "https://www.kennethenevoldsen.com"
},
"category": ["training", "research"],
"tags": ["training", "research", "augmentation"]
},
{
"id": "dacy",
"title": "DaCy",

View File

@ -8,10 +8,11 @@ import Title from '../components/title'
import Grid from '../components/grid'
import Button from '../components/button'
import Icon from '../components/icon'
import Tag from '../components/tag'
import CodeBlock, { InlineCode } from '../components/code'
import Aside from '../components/aside'
import Sidebar from '../components/sidebar'
import Section from '../components/section'
import Section, { Hr } from '../components/section'
import Main from '../components/main'
import Footer from '../components/footer'
import { H3, H5, Label, InlineList } from '../components/typography'
@ -121,6 +122,18 @@ const UniverseContent = ({ content = [], categories, theme, pageContext, mdxComp
</Grid>
</Section>
)}
<section className="search-exclude">
<H3>Found a mistake or something isn't working?</H3>
<p>
If you've come across a universe project that isn't working or is
incompatible with the reported spaCy version, let us know by{' '}
<Link to="https://github.com/explosion/spaCy/discussions/new">
opening a discussion thread
</Link>
.
</p>
</section>
<Hr />
<section className="search-exclude">
<H3>Submit your project</H3>
<p>
@ -168,11 +181,22 @@ UniverseContent.propTypes = {
mdxComponents: PropTypes.object,
}
const SpaCyVersion = ({ version }) => {
const versions = !Array.isArray(version) ? [version] : version
return versions.map((v, i) => (
<>
<Tag tooltip={`This project is compatible with spaCy v${v}`}>spaCy v{v}</Tag>{' '}
</>
))
}
const Project = ({ data, components }) => (
<>
<Title title={data.title || data.id} teaser={data.slogan} image={data.thumb}>
{data.github && (
{(data.github || data.spacy_version) && (
<p>
{data.spacy_version && <SpaCyVersion version={data.spacy_version} />}
{data.github && (
<Link to={`https://github.com/${data.github}`} hidden>
{[
`release/${data.github}/all.svg?style=flat-square`,
@ -180,13 +204,18 @@ const Project = ({ data, components }) => (
`stars/${data.github}.svg?style=social&label=Stars`,
].map((url, i) => (
<img
style={{ borderRadius: '1em', marginRight: '0.5rem' }}
style={{
borderRadius: '1em',
marginRight: '0.5rem',
verticalAlign: 'middle',
}}
key={i}
src={`https://img.shields.io/github/${url}`}
alt=""
/>
))}
</Link>
)}
</p>
)}
</Title>
@ -335,6 +364,7 @@ const query = graphql`
url
github
description
spacy_version
pip
cran
category