Merge branch 'master' into spacy.io

Commit d43082289c in https://github.com/explosion/spaCy
.github/workflows/gputests.yml (vendored, new file, 21 lines)

@@ -0,0 +1,21 @@
name: Weekly GPU tests

on:
  schedule:
    - cron: '0 1 * * MON'

jobs:
  weekly-gputests:
    strategy:
      fail-fast: false
      matrix:
        branch: [master, develop, v4]
    runs-on: ubuntu-latest
    steps:
      - name: Trigger buildkite build
        uses: buildkite/trigger-pipeline-action@v1.2.0
        env:
          PIPELINE: explosion-ai/spacy-slow-gpu-tests
          BRANCH: ${{ matrix.branch }}
          MESSAGE: ":github: Weekly GPU + slow tests - triggered from a GitHub Action"
          BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
.github/workflows/slowtests.yml (vendored, new file, 35 lines)

@@ -0,0 +1,35 @@
name: Daily slow tests

on:
  schedule:
    - cron: '0 0 * * *'

jobs:
  daily-slowtests:
    strategy:
      fail-fast: false
      matrix:
        branch: [master, develop, v4]
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v1
      - name: Get commits from past 24 hours
        id: check_commits
        run: |
          today=$(date '+%Y-%m-%d %H:%M:%S')
          yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
          if git log --after=$yesterday --before=$today | grep commit ; then
            echo "::set-output name=run_tests::true"
          else
            echo "::set-output name=run_tests::false"
          fi

      - name: Trigger buildkite build
        if: steps.check_commits.outputs.run_tests == 'true'
        uses: buildkite/trigger-pipeline-action@v1.2.0
        env:
          PIPELINE: explosion-ai/spacy-slow-tests
          BRANCH: ${{ matrix.branch }}
          MESSAGE: ":github: Daily slow tests - triggered from a GitHub Action"
          BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
LICENSE (2 changes)

@@ -1,6 +1,6 @@
The MIT License (MIT)

Copyright (C) 2016-2021 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
Copyright (C) 2016-2022 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -1,11 +1,8 @@
recursive-include include *.h
recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml
include LICENSE
include README.md
include pyproject.toml
include spacy/py.typed
recursive-exclude spacy/lang *.json
recursive-include spacy/lang *.json.gz
recursive-include spacy/cli *.json *.yml
recursive-include spacy/cli *.yml
recursive-include licenses *
recursive-exclude spacy *.cpp
@@ -31,7 +31,8 @@ pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.8.0,<3.10.0
hypothesis>=3.27.0,<7.0.0
mypy>=0.910
mypy==0.910
types-dataclasses>=0.1.3; python_version < "3.7"
types-mock>=0.1.1
types-requests
black>=22.0,<23.0
setup.cfg (32 changes)

@@ -77,37 +77,39 @@ transformers =
ray =
    spacy_ray>=0.1.0,<1.0.0
cuda =
    cupy>=5.0.0b4,<10.0.0
    cupy>=5.0.0b4,<11.0.0
cuda80 =
    cupy-cuda80>=5.0.0b4,<10.0.0
    cupy-cuda80>=5.0.0b4,<11.0.0
cuda90 =
    cupy-cuda90>=5.0.0b4,<10.0.0
    cupy-cuda90>=5.0.0b4,<11.0.0
cuda91 =
    cupy-cuda91>=5.0.0b4,<10.0.0
    cupy-cuda91>=5.0.0b4,<11.0.0
cuda92 =
    cupy-cuda92>=5.0.0b4,<10.0.0
    cupy-cuda92>=5.0.0b4,<11.0.0
cuda100 =
    cupy-cuda100>=5.0.0b4,<10.0.0
    cupy-cuda100>=5.0.0b4,<11.0.0
cuda101 =
    cupy-cuda101>=5.0.0b4,<10.0.0
    cupy-cuda101>=5.0.0b4,<11.0.0
cuda102 =
    cupy-cuda102>=5.0.0b4,<10.0.0
    cupy-cuda102>=5.0.0b4,<11.0.0
cuda110 =
    cupy-cuda110>=5.0.0b4,<10.0.0
    cupy-cuda110>=5.0.0b4,<11.0.0
cuda111 =
    cupy-cuda111>=5.0.0b4,<10.0.0
    cupy-cuda111>=5.0.0b4,<11.0.0
cuda112 =
    cupy-cuda112>=5.0.0b4,<10.0.0
    cupy-cuda112>=5.0.0b4,<11.0.0
cuda113 =
    cupy-cuda113>=5.0.0b4,<10.0.0
    cupy-cuda113>=5.0.0b4,<11.0.0
cuda114 =
    cupy-cuda114>=5.0.0b4,<10.0.0
    cupy-cuda114>=5.0.0b4,<11.0.0
cuda115 =
    cupy-cuda115>=5.0.0b4,<11.0.0
apple =
    thinc-apple-ops>=0.0.4,<1.0.0
# Language tokenizers with external dependencies
ja =
    sudachipy>=0.4.9
    sudachidict_core>=20200330
    sudachipy>=0.5.2,!=0.6.1
    sudachidict_core>=20211220
ko =
    natto-py==0.9.0
th =
@@ -1,3 +1,6 @@
from .errors import Errors

IOB_STRINGS = ("", "I", "O", "B")

IDS = {
    "": NULL_ATTR,

@@ -64,7 +67,6 @@ IDS = {
    "FLAG61": FLAG61,
    "FLAG62": FLAG62,
    "FLAG63": FLAG63,

    "ID": ID,
    "ORTH": ORTH,
    "LOWER": LOWER,

@@ -72,7 +74,6 @@ IDS = {
    "SHAPE": SHAPE,
    "PREFIX": PREFIX,
    "SUFFIX": SUFFIX,

    "LENGTH": LENGTH,
    "LEMMA": LEMMA,
    "POS": POS,

@@ -87,7 +88,7 @@ IDS = {
    "SPACY": SPACY,
    "LANG": LANG,
    "MORPH": MORPH,
    "IDX": IDX
    "IDX": IDX,
}


@@ -109,28 +110,66 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
    """
    inty_attrs = {}
    if _do_deprecated:
        if 'F' in stringy_attrs:
        if "F" in stringy_attrs:
            stringy_attrs["ORTH"] = stringy_attrs.pop("F")
        if 'L' in stringy_attrs:
        if "L" in stringy_attrs:
            stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
        if 'pos' in stringy_attrs:
        if "pos" in stringy_attrs:
            stringy_attrs["TAG"] = stringy_attrs.pop("pos")
        if 'morph' in stringy_attrs:
            morphs = stringy_attrs.pop('morph')
        if 'number' in stringy_attrs:
            stringy_attrs.pop('number')
        if 'tenspect' in stringy_attrs:
            stringy_attrs.pop('tenspect')
        if "morph" in stringy_attrs:
            morphs = stringy_attrs.pop("morph")
        if "number" in stringy_attrs:
            stringy_attrs.pop("number")
        if "tenspect" in stringy_attrs:
            stringy_attrs.pop("tenspect")
        morph_keys = [
            'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
            'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
            'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
            'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr',
            'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm',
            'NumValue', 'PartType', 'Polite', 'StyleVariant',
            'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
            'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
            'Polarity', 'PrepCase', 'Animacy'  # U20
            "PunctType",
            "PunctSide",
            "Other",
            "Degree",
            "AdvType",
            "Number",
            "VerbForm",
            "PronType",
            "Aspect",
            "Tense",
            "PartType",
            "Poss",
            "Hyph",
            "ConjType",
            "NumType",
            "Foreign",
            "VerbType",
            "NounType",
            "Gender",
            "Mood",
            "Negative",
            "Tense",
            "Voice",
            "Abbr",
            "Derivation",
            "Echo",
            "Foreign",
            "NameType",
            "NounType",
            "NumForm",
            "NumValue",
            "PartType",
            "Polite",
            "StyleVariant",
            "PronType",
            "AdjType",
            "Person",
            "Variant",
            "AdpType",
            "Reflex",
            "Negative",
            "Mood",
            "Aspect",
            "Case",
            "Polarity",
            "PrepCase",
            "Animacy",  # U20
        ]
        for key in morph_keys:
            if key in stringy_attrs:

@@ -142,8 +181,13 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
    for name, value in stringy_attrs.items():
        int_key = intify_attr(name)
        if int_key is not None:
            if int_key == ENT_IOB:
                if value in IOB_STRINGS:
                    value = IOB_STRINGS.index(value)
                elif isinstance(value, str):
                    raise ValueError(Errors.E1025.format(value=value))
            if strings_map is not None and isinstance(value, str):
                if hasattr(strings_map, 'add'):
                if hasattr(strings_map, "add"):
                    value = strings_map.add(value)
                else:
                    value = strings_map[value]
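Note: the intify_attrs change above maps IOB strings to their integer codes via IOB_STRINGS. A minimal sketch of the resulting behavior (it matches the tests added later in this commit):

from spacy.attrs import ENT_IOB, intify_attrs

# "" -> 0, "I" -> 1, "O" -> 2, "B" -> 3, following IOB_STRINGS above
print(intify_attrs({"ENT_IOB": "B"}))  # {ENT_IOB: 3}

# Any other string for ENT_IOB raises E1025
try:
    intify_attrs({"ENT_IOB": "XX"})
except ValueError as err:
    print(err)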
@@ -14,7 +14,7 @@ from ..training.initialize import get_sourced_components
from ..schemas import ConfigSchemaTraining
from ..pipeline._parser_internals import nonproj
from ..pipeline._parser_internals.nonproj import DELIMITER
from ..pipeline import Morphologizer
from ..pipeline import Morphologizer, SpanCategorizer
from ..morphology import Morphology
from ..language import Language
from ..util import registry, resolve_dot_names

@@ -193,6 +193,70 @@ def debug_data(
    else:
        msg.info("No word vectors present in the package")

    if "spancat" in factory_names:
        model_labels_spancat = _get_labels_from_spancat(nlp)
        has_low_data_warning = False
        has_no_neg_warning = False

        msg.divider("Span Categorization")
        msg.table(model_labels_spancat, header=["Spans Key", "Labels"], divider=True)

        msg.text("Label counts in train data: ", show=verbose)
        for spans_key, data_labels in gold_train_data["spancat"].items():
            msg.text(
                f"Key: {spans_key}, {_format_labels(data_labels.items(), counts=True)}",
                show=verbose,
            )
        # Data checks: only take the spans keys in the actual spancat components
        data_labels_in_component = {
            spans_key: gold_train_data["spancat"][spans_key]
            for spans_key in model_labels_spancat.keys()
        }
        for spans_key, data_labels in data_labels_in_component.items():
            for label, count in data_labels.items():
                # Check for missing labels
                spans_key_in_model = spans_key in model_labels_spancat.keys()
                if (spans_key_in_model) and (
                    label not in model_labels_spancat[spans_key]
                ):
                    msg.warn(
                        f"Label '{label}' is not present in the model labels of key '{spans_key}'. "
                        "Performance may degrade after training."
                    )
                # Check for low number of examples per label
                if count <= NEW_LABEL_THRESHOLD:
                    msg.warn(
                        f"Low number of examples for label '{label}' in key '{spans_key}' ({count})"
                    )
                    has_low_data_warning = True
                # Check for negative examples
                with msg.loading("Analyzing label distribution..."):
                    neg_docs = _get_examples_without_label(
                        train_dataset, label, "spancat", spans_key
                    )
                if neg_docs == 0:
                    msg.warn(f"No examples for texts WITHOUT new label '{label}'")
                    has_no_neg_warning = True

        if has_low_data_warning:
            msg.text(
                f"To train a new span type, your data should include at "
                f"least {NEW_LABEL_THRESHOLD} instances of the new label",
                show=verbose,
            )
        else:
            msg.good("Good amount of examples for all labels")

        if has_no_neg_warning:
            msg.text(
                "Training data should always include examples of spans "
                "in context, as well as examples without a given span "
                "type.",
                show=verbose,
            )
        else:
            msg.good("Examples without ocurrences available for all labels")

    if "ner" in factory_names:
        # Get all unique NER labels present in the data
        labels = set(

@@ -203,6 +267,7 @@ def debug_data(
        has_low_data_warning = False
        has_no_neg_warning = False
        has_ws_ents_error = False
        has_boundary_cross_ents_warning = False

        msg.divider("Named Entity Recognition")
        msg.info(f"{len(model_labels)} label(s)")

@@ -237,17 +302,25 @@ def debug_data(
                has_low_data_warning = True

            with msg.loading("Analyzing label distribution..."):
                neg_docs = _get_examples_without_label(train_dataset, label)
                neg_docs = _get_examples_without_label(train_dataset, label, "ner")
            if neg_docs == 0:
                msg.warn(f"No examples for texts WITHOUT new label '{label}'")
                has_no_neg_warning = True

        if gold_train_data["boundary_cross_ents"]:
            msg.warn(
                f"{gold_train_data['boundary_cross_ents']} entity span(s) crossing sentence boundaries"
            )
            has_boundary_cross_ents_warning = True

        if not has_low_data_warning:
            msg.good("Good amount of examples for all labels")
        if not has_no_neg_warning:
            msg.good("Examples without occurrences available for all labels")
        if not has_ws_ents_error:
            msg.good("No entities consisting of or starting/ending with whitespace")
        if not has_boundary_cross_ents_warning:
            msg.good("No entities crossing sentence boundaries")

        if has_low_data_warning:
            msg.text(

@@ -564,7 +637,9 @@ def _compile_gold(
        "deps": Counter(),
        "words": Counter(),
        "roots": Counter(),
        "spancat": dict(),
        "ws_ents": 0,
        "boundary_cross_ents": 0,
        "n_words": 0,
        "n_misaligned_words": 0,
        "words_missing_vectors": Counter(),

@@ -593,6 +668,7 @@ def _compile_gold(
                if nlp.vocab.strings[word] not in nlp.vocab.vectors:
                    data["words_missing_vectors"].update([word])
        if "ner" in factory_names:
            sent_starts = eg.get_aligned_sent_starts()
            for i, label in enumerate(eg.get_aligned_ner()):
                if label is None:
                    continue

@@ -602,8 +678,19 @@ def _compile_gold(
                if label.startswith(("B-", "U-")):
                    combined_label = label.split("-")[1]
                    data["ner"][combined_label] += 1
                if sent_starts[i] == True and label.startswith(("I-", "L-")):
                    data["boundary_cross_ents"] += 1
                elif label == "-":
                    data["ner"]["-"] += 1
        if "spancat" in factory_names:
            for span_key in list(eg.reference.spans.keys()):
                if span_key not in data["spancat"]:
                    data["spancat"][span_key] = Counter()
                for i, span in enumerate(eg.reference.spans[span_key]):
                    if span.label_ is None:
                        continue
                    else:
                        data["spancat"][span_key][span.label_] += 1
        if "textcat" in factory_names or "textcat_multilabel" in factory_names:
            data["cats"].update(gold.cats)
            if any(val not in (0, 1) for val in gold.cats.values()):

@@ -674,21 +761,57 @@ def _format_labels(
    return ", ".join([f"'{l}'" for l in cast(Iterable[str], labels)])


def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
def _get_examples_without_label(
    data: Sequence[Example],
    label: str,
    component: Literal["ner", "spancat"] = "ner",
    spans_key: Optional[str] = "sc",
) -> int:
    count = 0
    for eg in data:
        if component == "ner":
            labels = [
                label.split("-")[1]
                for label in eg.get_aligned_ner()
                if label not in ("O", "-", None)
            ]

        if component == "spancat":
            labels = (
                [span.label_ for span in eg.reference.spans[spans_key]]
                if spans_key in eg.reference.spans
                else []
            )

        if label not in labels:
            count += 1
    return count


def _get_labels_from_model(nlp: Language, pipe_name: str) -> Set[str]:
    if pipe_name not in nlp.pipe_names:
        return set()
def _get_labels_from_model(nlp: Language, factory_name: str) -> Set[str]:
    pipe_names = [
        pipe_name
        for pipe_name in nlp.pipe_names
        if nlp.get_pipe_meta(pipe_name).factory == factory_name
    ]
    labels: Set[str] = set()
    for pipe_name in pipe_names:
        pipe = nlp.get_pipe(pipe_name)
    return set(pipe.labels)
        labels.update(pipe.labels)
    return labels


def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
    pipe_names = [
        pipe_name
        for pipe_name in nlp.pipe_names
        if nlp.get_pipe_meta(pipe_name).factory == "spancat"
    ]
    labels: Dict[str, Set[str]] = {}
    for pipe_name in pipe_names:
        pipe = nlp.get_pipe(pipe_name)
        assert isinstance(pipe, SpanCategorizer)
        if pipe.key not in labels:
            labels[pipe.key] = set()
        labels[pipe.key].update(pipe.labels)
    return labels
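Note: the new spancat branch of _compile_gold above tallies gold span labels per spans key. The counting logic boils down to the following standalone sketch (a simplified illustration, not the actual helper):

from collections import Counter
from typing import Dict

def count_spancat_labels(examples) -> Dict[str, Counter]:
    # One Counter per spans key, incremented for every labelled gold span
    counts: Dict[str, Counter] = {}
    for eg in examples:
        for spans_key in eg.reference.spans:
            counts.setdefault(spans_key, Counter())
            for span in eg.reference.spans[spans_key]:
                if span.label_:
                    counts[spans_key][span.label_] += 1
    return counts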
@@ -7,6 +7,7 @@ from collections import defaultdict
from catalogue import RegistryError
import srsly
import sys
import re

from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
from ..schemas import validate, ModelMetaSchema

@@ -109,6 +110,24 @@ def package(
            ", ".join(meta["requirements"]),
        )
    if name is not None:
        if not name.isidentifier():
            msg.fail(
                f"Model name ('{name}') is not a valid module name. "
                "This is required so it can be imported as a module.",
                "We recommend names that use ASCII A-Z, a-z, _ (underscore), "
                "and 0-9. "
                "For specific details see: https://docs.python.org/3/reference/lexical_analysis.html#identifiers",
                exits=1,
            )
        if not _is_permitted_package_name(name):
            msg.fail(
                f"Model name ('{name}') is not a permitted package name. "
                "This is required to correctly load the model with spacy.load.",
                "We recommend names that use ASCII A-Z, a-z, _ (underscore), "
                "and 0-9. "
                "For specific details see: https://www.python.org/dev/peps/pep-0426/#name",
                exits=1,
            )
        meta["name"] = name
    if version is not None:
        meta["version"] = version

@@ -162,7 +181,7 @@ def package(
        imports="\n".join(f"from . import {m}" for m in imports)
    )
    create_file(package_path / "__init__.py", init_py)
    msg.good(f"Successfully created package '{model_name_v}'", main_path)
    msg.good(f"Successfully created package directory '{model_name_v}'", main_path)
    if create_sdist:
        with util.working_dir(main_path):
            util.run_command([sys.executable, "setup.py", "sdist"], capture=False)

@@ -171,8 +190,14 @@ def package(
    if create_wheel:
        with util.working_dir(main_path):
            util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False)
        wheel = main_path / "dist" / f"{model_name_v}{WHEEL_SUFFIX}"
        wheel_name_squashed = re.sub("_+", "_", model_name_v)
        wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}"
        msg.good(f"Successfully created binary wheel", wheel)
    if "__" in model_name:
        msg.warn(
            f"Model name ('{model_name}') contains a run of underscores. "
            "Runs of underscores are not significant in installed package names.",
        )


def has_wheel() -> bool:

@@ -422,6 +447,14 @@ def _format_label_scheme(data: Dict[str, Any]) -> str:
    return md.text


def _is_permitted_package_name(package_name: str) -> bool:
    # regex from: https://www.python.org/dev/peps/pep-0426/#name
    permitted_match = re.search(
        r"^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$", package_name, re.IGNORECASE
    )
    return permitted_match is not None


TEMPLATE_SETUP = """
#!/usr/bin/env python
import io
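Note: the name validation added above combines Python's identifier check with the PEP 426 package-name regex. The same check can be reproduced standalone (an illustrative sketch, not the spaCy helper itself):

import re

def is_permitted_package_name(package_name: str) -> bool:
    # Same regex as _is_permitted_package_name above,
    # from https://www.python.org/dev/peps/pep-0426/#name
    match = re.search(
        r"^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$", package_name, re.IGNORECASE
    )
    return match is not None

print(is_permitted_package_name("en_core_web_sm"))  # True
print(is_permitted_package_name("_package"))        # False: leading underscore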
@@ -1,6 +1,7 @@
from typing import Any, Dict, Optional
from pathlib import Path
from wasabi import msg
import os
import re
import shutil
import requests

@@ -129,10 +130,17 @@ def fetch_asset(
        the asset failed.
    """
    dest_path = (project_path / dest).resolve()
    if dest_path.exists() and checksum:
    if dest_path.exists():
        # If there's already a file, check for checksum
        if checksum:
            if checksum == get_checksum(dest_path):
                msg.good(f"Skipping download with matching checksum: {dest}")
                return
        else:
            # If there's not a checksum, make sure the file is a possibly valid size
            if os.path.getsize(dest_path) == 0:
                msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}")
                os.remove(dest_path)
    # We might as well support the user here and create parent directories in
    # case the asset dir isn't listed as a dir to create in the project.yml
    if not dest_path.parent.exists():
@@ -6,6 +6,11 @@ can help generate the best possible configuration, given a user's requirements.
[paths]
train = null
dev = null
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = null
{% else -%}
vectors = "{{ word_vectors }}"
{% endif -%}

[system]
{% if use_transformer -%}

@@ -421,8 +426,4 @@ compound = 1.001
{% endif %}

[initialize]
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = ${paths.vectors}
{% else -%}
vectors = "{{ word_vectors }}"
{% endif -%}
@@ -68,12 +68,14 @@ seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
# Controls early-stopping. 0 disables early stopping.
# Controls early-stopping, i.e., the number of steps to continue without
# improvement before stopping. 0 disables early stopping.
patience = 1600
# Number of epochs. 0 means unlimited. If >= 0, train corpus is loaded once in
# memory and shuffled within the training loop. -1 means stream train corpus
# rather than loading in memory with no shuffling within the training loop.
max_epochs = 0
# Maximum number of update steps to train for. 0 means an unlimited number of steps.
max_steps = 20000
eval_frequency = 200
# Control how scores are printed and checkpoints are evaluated.
@@ -18,7 +18,7 @@ DEFAULT_LABEL_COLORS = {
    "LOC": "#ff9561",
    "PERSON": "#aa9cfc",
    "NORP": "#c887fb",
    "FACILITY": "#9cc9cc",
    "FAC": "#9cc9cc",
    "EVENT": "#ffeb80",
    "LAW": "#ff8197",
    "LANGUAGE": "#ff8197",
@@ -483,7 +483,7 @@ class Errors(metaclass=ErrorsWithCodes):
            "components, since spans are only views of the Doc. Use Doc and "
            "Token attributes (or custom extension attributes) only and remove "
            "the following: {attrs}")
    E181 = ("Received invalid attributes for unkown object {obj}: {attrs}. "
    E181 = ("Received invalid attributes for unknown object {obj}: {attrs}. "
            "Only Doc and Token attributes are supported.")
    E182 = ("Received invalid attribute declaration: {attr}\nDid you forget "
            "to define the attribute? For example: `{attr}.???`")

@@ -888,9 +888,12 @@ class Errors(metaclass=ErrorsWithCodes):
    E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
             "Non-UD tags should use the `tag` property.")
    E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
    E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't exist.")
    E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler patterns.")

    E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
             "exist.")
    E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler "
             "patterns.")
    E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
             "supported values are: 'I', 'O', 'B' and ''")


    # Deprecated model shortcuts, only used in errors and warnings
@@ -310,7 +310,6 @@ GLOSSARY = {
    "re": "repeated element",
    "rs": "reported speech",
    "sb": "subject",
    "sb": "subject",
    "sbp": "passivized subject (PP)",
    "sp": "subject or predicate",
    "svp": "separable verb prefix",

@@ -90,7 +90,7 @@ _eleven_to_beyond = [
    "अड़सठ",
    "उनहत्तर",
    "सत्तर",
    "इकहत्तर"
    "इकहत्तर",
    "बहत्तर",
    "तिहत्तर",
    "चौहत्तर",

@@ -59,7 +59,7 @@ sentences = [
    "Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
    "Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”.",
    "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares.",
    "Carros autônomos empurram a responsabilidade do seguro para os fabricantes.."
    "Carros autônomos empurram a responsabilidade do seguro para os fabricantes..",
    "São Francisco considera banir os robôs de entrega que andam pelas calçadas.",
    "Londres é a maior cidade do Reino Unido.",
    # Translations from English:
@@ -131,7 +131,7 @@ class Language:
        self,
        vocab: Union[Vocab, bool] = True,
        *,
        max_length: int = 10 ** 6,
        max_length: int = 10**6,
        meta: Dict[str, Any] = {},
        create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
        batch_size: int = 1000,

@@ -354,12 +354,15 @@ class Language:
    @property
    def pipe_labels(self) -> Dict[str, List[str]]:
        """Get the labels set by the pipeline components, if available (if
        the component exposes a labels property).
        the component exposes a labels property and the labels are not
        hidden).

        RETURNS (Dict[str, List[str]]): Labels keyed by component name.
        """
        labels = {}
        for name, pipe in self._components:
            if hasattr(pipe, "hide_labels") and pipe.hide_labels is True:
                continue
            if hasattr(pipe, "labels"):
                labels[name] = list(pipe.labels)
        return SimpleFrozenDict(labels)

@@ -522,7 +525,7 @@ class Language:
        requires: Iterable[str] = SimpleFrozenList(),
        retokenizes: bool = False,
        func: Optional["Pipe"] = None,
    ) -> Callable:
    ) -> Callable[..., Any]:
        """Register a new pipeline component. Can be used for stateless function
        components that don't require a separate factory. Can be used as a
        decorator on a function or classmethod, or called as a function with the

@@ -1285,9 +1288,9 @@ class Language:
                )
            except IOError:
                raise IOError(Errors.E884.format(vectors=I["vectors"]))
        if self.vocab.vectors.data.shape[1] >= 1:
        if self.vocab.vectors.shape[1] >= 1:
            ops = get_current_ops()
            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
            self.vocab.vectors.to_ops(ops)
        if hasattr(self.tokenizer, "initialize"):
            tok_settings = validate_init_settings(
                self.tokenizer.initialize,  # type: ignore[union-attr]

@@ -1332,8 +1335,8 @@ class Language:
        DOCS: https://spacy.io/api/language#resume_training
        """
        ops = get_current_ops()
        if self.vocab.vectors.data.shape[1] >= 1:
            self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
        if self.vocab.vectors.shape[1] >= 1:
            self.vocab.vectors.to_ops(ops)
        for name, proc in self.pipeline:
            if hasattr(proc, "_rehearsal_model"):
                proc._rehearsal_model = deepcopy(proc.model)  # type: ignore[attr-defined]
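Note: with pipe_labels now skipping components that set hide_labels (see the Language change above and the Pipe/senter changes further down), internal labels such as the sentence recognizer's "I"/"S" no longer leak into Language.pipe_labels. A quick illustration, assuming a blank English pipeline and a spaCy build that includes these changes:

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("senter")
nlp.add_pipe("ner")

print(nlp.get_pipe("senter").labels)  # ('I', 'S') - the internal labels still exist
print("senter" in nlp.pipe_labels)    # False - hidden from pipe_labels
print("ner" in nlp.pipe_labels)       # True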
@@ -19,7 +19,7 @@ class Lexeme:
    @property
    def vector_norm(self) -> float: ...
    vector: Floats1d
    rank: str
    rank: int
    sentiment: float
    @property
    def orth_(self) -> str: ...

@@ -130,7 +130,9 @@ cdef class Lexeme:
            return 0.0
        vector = self.vector
        xp = get_array_module(vector)
        return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
        result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
        # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
        return result.item()

    @property
    def has_vector(self):
spacy/matcher/dependencymatcher.pyi (new file, 66 lines)

@@ -0,0 +1,66 @@
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from .matcher import Matcher
from ..vocab import Vocab
from ..tokens.doc import Doc
from ..tokens.span import Span

class DependencyMatcher:
    """Match dependency parse tree based on pattern rules."""

    _patterns: Dict[str, List[Any]]
    _raw_patterns: Dict[str, List[Any]]
    _tokens_to_key: Dict[str, List[Any]]
    _root: Dict[str, List[Any]]
    _tree: Dict[str, List[Any]]
    _callbacks: Dict[
        Any, Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
    ]
    _ops: Dict[str, Any]
    vocab: Vocab
    _matcher: Matcher
    def __init__(self, vocab: Vocab, *, validate: bool = ...) -> None: ...
    def __reduce__(
        self,
    ) -> Tuple[
        Callable[
            [Vocab, Dict[str, Any], Dict[str, Callable[..., Any]]], DependencyMatcher
        ],
        Tuple[
            Vocab,
            Dict[str, List[Any]],
            Dict[
                str,
                Callable[
                    [DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any
                ],
            ],
        ],
        None,
        None,
    ]: ...
    def __len__(self) -> int: ...
    def __contains__(self, key: Union[str, int]) -> bool: ...
    def add(
        self,
        key: Union[str, int],
        patterns: List[List[Dict[str, Any]]],
        *,
        on_match: Optional[
            Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
        ] = ...
    ) -> None: ...
    def has_key(self, key: Union[str, int]) -> bool: ...
    def get(
        self, key: Union[str, int], default: Optional[Any] = ...
    ) -> Tuple[
        Optional[
            Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any]
        ],
        List[List[Dict[str, Any]]],
    ]: ...
    def remove(self, key: Union[str, int]) -> None: ...
    def __call__(self, doclike: Union[Doc, Span]) -> List[Tuple[int, List[int]]]: ...

def unpickle_matcher(
    vocab: Vocab, patterns: Dict[str, Any], callbacks: Dict[str, Callable[..., Any]]
) -> DependencyMatcher: ...
@@ -1,4 +1,6 @@
from typing import Any, List, Dict, Tuple, Optional, Callable, Union, Iterator, Iterable
from typing import Any, List, Dict, Tuple, Optional, Callable, Union
from typing import Iterator, Iterable, overload
from ..compat import Literal
from ..vocab import Vocab
from ..tokens import Doc, Span

@@ -31,12 +33,22 @@ class Matcher:
    ) -> Union[
        Iterator[Tuple[Tuple[Doc, Any], Any]], Iterator[Tuple[Doc, Any]], Iterator[Doc]
    ]: ...
    @overload
    def __call__(
        self,
        doclike: Union[Doc, Span],
        *,
        as_spans: bool = ...,
        as_spans: Literal[False] = ...,
        allow_missing: bool = ...,
        with_alignments: bool = ...
    ) -> Union[List[Tuple[int, int, int]], List[Span]]: ...
    ) -> List[Tuple[int, int, int]]: ...
    @overload
    def __call__(
        self,
        doclike: Union[Doc, Span],
        *,
        as_spans: Literal[True],
        allow_missing: bool = ...,
        with_alignments: bool = ...
    ) -> List[Span]: ...
    def _normalize_key(self, key: Any) -> Any: ...
@@ -18,7 +18,7 @@ from ..tokens.doc cimport Doc, get_token_attr_for_matcher
from ..tokens.span cimport Span
from ..tokens.token cimport Token
from ..tokens.morphanalysis cimport MorphAnalysis
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB

from ..schemas import validate_token_pattern
from ..errors import Errors, MatchPatternError, Warnings

@@ -798,6 +798,9 @@ def _get_attr_values(spec, string_store):
            attr = "SENT_START"
        attr = IDS.get(attr)
        if isinstance(value, str):
            if attr == ENT_IOB and value in Token.iob_strings():
                value = Token.iob_strings().index(value)
            else:
                value = string_store.add(value)
        elif isinstance(value, bool):
            value = int(value)
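Note: with ENT_IOB wired into the token matcher above, patterns can now specify a token's entity IOB position as a string. A short usage sketch that mirrors the new test added later in this commit:

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# Match tokens that sit inside an entity (IOB code "I")
matcher.add("INSIDE_ENT", [[{"ENT_IOB": "I"}]])

doc = Doc(nlp.vocab, words=["I", "visited", "New", "York", "and", "California"])
doc.ents = [Span(doc, 2, 4, label="GPE"), Span(doc, 5, 6, label="GPE")]
print([doc[start:end].text for _, start, end in matcher(doc)])  # ['York']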
@@ -1,6 +1,6 @@
from typing import List, Tuple, Union, Optional, Callable, Any, Dict

from . import Matcher
from typing import List, Tuple, Union, Optional, Callable, Any, Dict, overload
from ..compat import Literal
from .matcher import Matcher
from ..vocab import Vocab
from ..tokens import Doc, Span

@@ -14,16 +14,24 @@ class PhraseMatcher:
    def add(
        self,
        key: str,
        docs: List[List[Dict[str, Any]]],
        docs: List[Doc],
        *,
        on_match: Optional[
            Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any]
        ] = ...,
    ) -> None: ...
    def remove(self, key: str) -> None: ...
    @overload
    def __call__(
        self,
        doclike: Union[Doc, Span],
        *,
        as_spans: bool = ...,
    ) -> Union[List[Tuple[int, int, int]], List[Span]]: ...
        as_spans: Literal[False] = ...,
    ) -> List[Tuple[int, int, int]]: ...
    @overload
    def __call__(
        self,
        doclike: Union[Doc, Span],
        *,
        as_spans: Literal[True],
    ) -> List[Span]: ...
@@ -23,7 +23,7 @@ def create_pretrain_vectors(
    maxout_pieces: int, hidden_size: int, loss: str
) -> Callable[["Vocab", Model], Model]:
    def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
        if vocab.vectors.data.shape[1] == 0:
        if vocab.vectors.shape[1] == 0:
            raise ValueError(Errors.E875)
        model = build_cloze_multi_task_model(
            vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces

@@ -85,7 +85,7 @@ def get_characters_loss(ops, docs, prediction, nr_char):
    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
    target = target.reshape((-1, 256 * nr_char))
    diff = prediction - target
    loss = (diff ** 2).sum()
    loss = (diff**2).sum()
    d_target = diff / float(prediction.shape[0])
    return loss, d_target

@@ -116,7 +116,7 @@ def build_multi_task_model(
def build_cloze_multi_task_model(
    vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int
) -> Model:
    nO = vocab.vectors.data.shape[1]
    nO = vocab.vectors.shape[1]
    output_layer = chain(
        cast(Model[List["Floats2d"], Floats2d], list2array()),
        Maxout(

@@ -94,7 +94,7 @@ def init(
    nM = model.get_dim("nM") if model.has_dim("nM") else None
    nO = model.get_dim("nO") if model.has_dim("nO") else None
    if X is not None and len(X):
        nM = X[0].vocab.vectors.data.shape[1]
        nM = X[0].vocab.vectors.shape[1]
    if Y is not None:
        nO = Y.data.shape[1]
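Note: several call sites above move from vocab.vectors.data.shape to vocab.vectors.shape, reading the table dimensions through the Vectors API rather than the raw array. For example (a sketch; en_core_web_md is only a stand-in for any pipeline that ships word vectors):

import spacy

nlp = spacy.load("en_core_web_md")
n_rows, n_dims = nlp.vocab.vectors.shape
print(n_rows, n_dims)  # e.g. 20000 300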
@@ -1,3 +1,4 @@
from cython.operator cimport dereference as deref, preincrement as incr
from libc.string cimport memcpy, memset
from libc.stdlib cimport calloc, free
from libc.stdint cimport uint32_t, uint64_t

@@ -184,16 +185,20 @@ cdef cppclass StateC:
    int L(int head, int idx) nogil const:
        if idx < 1 or this._left_arcs.size() == 0:
            return -1
        cdef vector[int] lefts
        for i in range(this._left_arcs.size()):
            arc = this._left_arcs.at(i)

        # Work backwards through left-arcs to find the arc at the
        # requested index more quickly.
        cdef size_t child_index = 0
        it = this._left_arcs.const_rbegin()
        while it != this._left_arcs.rend():
            arc = deref(it)
            if arc.head == head and arc.child != -1 and arc.child < head:
                lefts.push_back(arc.child)
        idx = (<int>lefts.size()) - idx
        if idx < 0:
                child_index += 1
                if child_index == idx:
                    return arc.child
            incr(it)

        return -1
        else:
            return lefts.at(idx)

    int R(int head, int idx) nogil const:
        if idx < 1 or this._right_arcs.size() == 0:
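Note: the reworked L(head, idx) above walks the left-arc vector from the back and returns as soon as it reaches the idx-th matching child, instead of first collecting every left child. The same idea in plain Python (illustrative only, not the Cython code):

def nth_left_child_from_end(arcs, head, idx):
    # arcs: (head, child) pairs in the order they were added
    count = 0
    for arc_head, child in reversed(arcs):
        if arc_head == head and child != -1 and child < head:
            count += 1
            if count == idx:
                return child
    return -1

print(nth_left_child_from_end([(5, 1), (5, 3), (5, 4)], head=5, idx=1))  # 4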
@@ -604,7 +604,7 @@ cdef class ArcEager(TransitionSystem):
        actions[SHIFT][''] += 1
        if min_freq is not None:
            for action, label_freqs in actions.items():
                for label, freq in list(label_freqs.items()):
                for label, freq in label_freqs.copy().items():
                    if freq < min_freq:
                        label_freqs.pop(label)
        # Ensure these actions are present

@@ -26,6 +26,8 @@ class Pipe:
    @property
    def labels(self) -> Tuple[str, ...]: ...
    @property
    def hide_labels(self) -> bool: ...
    @property
    def label_data(self) -> Any: ...
    def _require_labels(self) -> None: ...
    def set_error_handler(

@@ -102,6 +102,10 @@ cdef class Pipe:
    def labels(self) -> Tuple[str, ...]:
        return tuple()

    @property
    def hide_labels(self) -> bool:
        return False

    @property
    def label_data(self):
        """Optional JSON-serializable data that would be sufficient to recreate

@@ -99,6 +99,10 @@ class SentenceRecognizer(Tagger):
        # are 0
        return tuple(["I", "S"])

    @property
    def hide_labels(self):
        return True

    @property
    def label_data(self):
        return None
@@ -377,7 +377,7 @@ class SpanCategorizer(TrainablePipe):
        # If the prediction is 0.9 and it's false, the gradient will be
        # 0.9 (0.9 - 0.0)
        d_scores = scores - target
        loss = float((d_scores ** 2).sum())
        loss = float((d_scores**2).sum())
        return loss, d_scores

    def initialize(

@@ -412,7 +412,7 @@ class SpanCategorizer(TrainablePipe):
        self._require_labels()
        if subbatch:
            docs = [eg.x for eg in subbatch]
            spans = self.suggester(docs)
            spans = build_ngram_suggester(sizes=[1])(docs)
            Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels))
            self.model.initialize(X=(docs, spans), Y=Y)
        else:

@@ -281,7 +281,7 @@ class TextCategorizer(TrainablePipe):
        bp_scores(gradient)
        if sgd is not None:
            self.finish_update(sgd)
        losses[self.name] += (gradient ** 2).sum()
        losses[self.name] += (gradient**2).sum()
        return losses

    def _examples_to_truth(

@@ -315,7 +315,7 @@ class TextCategorizer(TrainablePipe):
        not_missing = self.model.ops.asarray(not_missing)  # type: ignore
        d_scores = (scores - truths) / scores.shape[0]
        d_scores *= not_missing
        mean_square_error = (d_scores ** 2).sum(axis=1).mean()
        mean_square_error = (d_scores**2).sum(axis=1).mean()
        return float(mean_square_error), d_scores

    def add_label(self, label: str) -> int:
@@ -1,5 +1,6 @@
from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
from typing import Iterable, TypeVar, TYPE_CHECKING
from .compat import Literal
from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator, create_model
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool

@@ -209,6 +210,7 @@ NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
UnderscoreValue = Union[
    TokenPatternString, TokenPatternNumber, str, int, float, list, bool
]
IobValue = Literal["", "I", "O", "B", 0, 1, 2, 3]


class TokenPattern(BaseModel):

@@ -222,6 +224,7 @@ class TokenPattern(BaseModel):
    lemma: Optional[StringValue] = None
    shape: Optional[StringValue] = None
    ent_type: Optional[StringValue] = None
    ent_iob: Optional[IobValue] = None
    ent_id: Optional[StringValue] = None
    ent_kb_id: Optional[StringValue] = None
    norm: Optional[StringValue] = None
@@ -567,6 +567,7 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
        "Merging the docs is fun.",
        "",
        "They don't think alike. ",
        "",
        "Another doc.",
    ]
    en_texts_without_empty = [t for t in en_texts if len(t)]

@@ -574,9 +575,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
    en_docs = [en_tokenizer(text) for text in en_texts]
    en_docs[0].spans["group"] = [en_docs[0][1:4]]
    en_docs[2].spans["group"] = [en_docs[2][1:4]]
    en_docs[3].spans["group"] = [en_docs[3][0:1]]
    en_docs[4].spans["group"] = [en_docs[4][0:1]]
    span_group_texts = sorted(
        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text]
    )
    de_doc = de_tokenizer(de_text)
    Token.set_extension("is_ambiguous", default=False)

@@ -683,6 +684,7 @@ def test_has_annotation(en_vocab):
    attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")
    for attr in attrs:
        assert not doc.has_annotation(attr)
        assert not doc.has_annotation(attr, require_complete=True)

    doc[0].tag_ = "A"
    doc[0].pos_ = "X"

@@ -708,6 +710,27 @@ def test_has_annotation(en_vocab):
        assert doc.has_annotation(attr, require_complete=True)


def test_has_annotation_sents(en_vocab):
    doc = Doc(en_vocab, words=["Hello", "beautiful", "world"])
    attrs = ("SENT_START", "IS_SENT_START", "IS_SENT_END")
    for attr in attrs:
        assert not doc.has_annotation(attr)
        assert not doc.has_annotation(attr, require_complete=True)

    # The first token (index 0) is always assumed to be a sentence start,
    # and ignored by the check in doc.has_annotation

    doc[1].is_sent_start = False
    for attr in attrs:
        assert doc.has_annotation(attr)
        assert not doc.has_annotation(attr, require_complete=True)

    doc[2].is_sent_start = False
    for attr in attrs:
        assert doc.has_annotation(attr)
        assert doc.has_annotation(attr, require_complete=True)


def test_is_flags_deprecated(en_tokenizer):
    doc = en_tokenizer("test")
    with pytest.deprecated_call():
@@ -1,4 +1,5 @@
import pytest
from spacy.attrs import intify_attrs, ENT_IOB

from spacy.attrs import IS_ALPHA, LEMMA, NORM, ORTH, intify_attrs
from spacy.lang.en.stop_words import STOP_WORDS

@@ -33,6 +34,38 @@ def test_attrs_do_deprecated(text):
    assert int_attrs == {ORTH: 10, IS_ALPHA: True}


def test_attrs_ent_iob_intify():
    int_attrs = intify_attrs({"ENT_IOB": ""})
    assert int_attrs == {ENT_IOB: 0}

    int_attrs = intify_attrs({"ENT_IOB": "I"})
    assert int_attrs == {ENT_IOB: 1}

    int_attrs = intify_attrs({"ENT_IOB": "O"})
    assert int_attrs == {ENT_IOB: 2}

    int_attrs = intify_attrs({"ENT_IOB": "B"})
    assert int_attrs == {ENT_IOB: 3}

    int_attrs = intify_attrs({ENT_IOB: ""})
    assert int_attrs == {ENT_IOB: 0}

    int_attrs = intify_attrs({ENT_IOB: "I"})
    assert int_attrs == {ENT_IOB: 1}

    int_attrs = intify_attrs({ENT_IOB: "O"})
    assert int_attrs == {ENT_IOB: 2}

    int_attrs = intify_attrs({ENT_IOB: "B"})
    assert int_attrs == {ENT_IOB: 3}

    with pytest.raises(ValueError):
        int_attrs = intify_attrs({"ENT_IOB": "XX"})

    with pytest.raises(ValueError):
        int_attrs = intify_attrs({ENT_IOB: "XX"})


@pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)])
def test_lex_attrs_is_punct(text, match):
    assert is_punct(text) == match
@@ -642,3 +642,30 @@ def test_matcher_no_zero_length(en_vocab):
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
    assert len(matcher(doc)) == 0


def test_matcher_ent_iob_key(en_vocab):
    """Test that patterns with ent_iob works correctly."""
    matcher = Matcher(en_vocab)
    matcher.add("Rule", [[{"ENT_IOB": "I"}]])
    doc1 = Doc(en_vocab, words=["I", "visited", "New", "York", "and", "California"])
    doc1.ents = [Span(doc1, 2, 4, label="GPE"), Span(doc1, 5, 6, label="GPE")]
    doc2 = Doc(en_vocab, words=["I", "visited", "my", "friend", "Alicia"])
    doc2.ents = [Span(doc2, 4, 5, label="PERSON")]
    matches1 = [doc1[start:end].text for _, start, end in matcher(doc1)]
    matches2 = [doc2[start:end].text for _, start, end in matcher(doc2)]
    assert len(matches1) == 1
    assert matches1[0] == "York"
    assert len(matches2) == 0

    matcher = Matcher(en_vocab)  # Test iob pattern with operators
    matcher.add("Rule", [[{"ENT_IOB": "I", "OP": "+"}]])
    doc = Doc(
        en_vocab, words=["I", "visited", "my", "friend", "Anna", "Maria", "Esperanza"]
    )
    doc.ents = [Span(doc, 4, 7, label="PERSON")]
    matches = [doc[start:end].text for _, start, end in matcher(doc)]
    assert len(matches) == 3
    assert matches[0] == "Maria"
    assert matches[1] == "Maria Esperanza"
    assert matches[2] == "Esperanza"
@@ -12,6 +12,7 @@ TEST_PATTERNS = [
    ([{"IS_PUNCT": True, "OP": "$"}], 1, 1),
    ([{"_": "foo"}], 1, 1),
    ('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
    ([{"ENT_IOB": "foo"}], 1, 1),
    ([1, 2, 3], 3, 1),
    # Bad patterns flagged outside of Matcher
    ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0),  # prev: (1, 0)

@@ -12,6 +12,7 @@ def test_build_dependencies():
        "flake8",
        "hypothesis",
        "pre-commit",
        "black",
        "mypy",
        "types-dataclasses",
        "types-mock",

@@ -97,3 +97,7 @@ def test_overfitting_IO():
    ]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)

    # test internal pipe labels vs. Language.pipe_labels with hidden labels
    assert nlp.get_pipe("senter").labels == ("I", "S")
    assert "senter" not in nlp.pipe_labels
@@ -80,6 +80,8 @@ def test_explicit_labels():
    assert spancat.labels == ("PERSON", "LOC")


# TODO figure out why this is flaky
@pytest.mark.skip(reason="Test is unreliable for unknown reason")
def test_doc_gc():
    # If the Doc object is garbage collected, the spans won't be functional afterwards
    nlp = Language()

@@ -97,6 +99,7 @@ def test_doc_gc():
    assert isinstance(spangroups, SpanGroups)
    for key, spangroup in spangroups.items():
        assert isinstance(spangroup, SpanGroup)
        # XXX This fails with length 0 sometimes
        assert len(spangroup) > 0
        with pytest.raises(RuntimeError):
            span = spangroup[0]
@@ -12,14 +12,18 @@ from spacy.cli._util import is_subpath_of, load_project_config
from spacy.cli._util import parse_config_overrides, string_to_list
from spacy.cli._util import substitute_project_variables
from spacy.cli._util import validate_project_commands
from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
from spacy.cli.debug_data import _get_labels_from_spancat
from spacy.cli.download import get_compatibility, get_version
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
from spacy.cli.package import get_third_party_dependencies
from spacy.cli.package import _is_permitted_package_name
from spacy.cli.validate import get_model_pkgs
from spacy.lang.en import English
from spacy.lang.nl import Dutch
from spacy.language import Language
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
from spacy.tokens import Doc
from spacy.training import Example, docs_to_json, offsets_to_biluo_tags
from spacy.training.converters import conll_ner_to_docs, conllu_to_docs
from spacy.training.converters import iob_to_docs

@@ -665,3 +669,54 @@ def test_get_third_party_dependencies():
)
def test_is_subpath_of(parent, child, expected):
    assert is_subpath_of(parent, child) == expected


@pytest.mark.slow
@pytest.mark.parametrize(
    "factory_name,pipe_name",
    [
        ("ner", "ner"),
        ("ner", "my_ner"),
        ("spancat", "spancat"),
        ("spancat", "my_spancat"),
    ],
)
def test_get_labels_from_model(factory_name, pipe_name):
    labels = ("A", "B")

    nlp = English()
    pipe = nlp.add_pipe(factory_name, name=pipe_name)
    for label in labels:
        pipe.add_label(label)
    nlp.initialize()
    assert nlp.get_pipe(pipe_name).labels == labels
    if factory_name == "spancat":
        assert _get_labels_from_spancat(nlp)[pipe.key] == set(labels)
    else:
        assert _get_labels_from_model(nlp, factory_name) == set(labels)


def test_permitted_package_names():
    # https://www.python.org/dev/peps/pep-0426/#name
    assert _is_permitted_package_name("Meine_Bäume") == False
    assert _is_permitted_package_name("_package") == False
    assert _is_permitted_package_name("package_") == False
    assert _is_permitted_package_name(".package") == False
    assert _is_permitted_package_name("package.") == False
    assert _is_permitted_package_name("-package") == False
    assert _is_permitted_package_name("package-") == False


def test_debug_data_compile_gold():
    nlp = English()
    pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
    ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "O", "B-ENT"])
    eg = Example(pred, ref)
    data = _compile_gold([eg], ["ner"], nlp, True)
    assert data["boundary_cross_ents"] == 0

    pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
    ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "B-ENT", "I-ENT"])
    eg = Example(pred, ref)
    data = _compile_gold([eg], ["ner"], nlp, True)
    assert data["boundary_cross_ents"] == 1
@@ -9,6 +9,7 @@ from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc
from spacy.training import Example
from spacy.util import compile_prefix_regex, compile_suffix_regex, ensure_path
from spacy.util import compile_infix_regex
from spacy.vocab import Vocab
from spacy.symbols import ORTH

@@ -503,3 +504,20 @@ def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
    assert tokens == ["a", "10", "."]
    explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
    assert tokens == explain_tokens


def test_tokenizer_infix_prefix(en_vocab):
    # the prefix and suffix matches overlap in the suffix lookbehind
    infixes = ["±"]
    suffixes = ["%"]
    infix_re = compile_infix_regex(infixes)
    suffix_re = compile_suffix_regex(suffixes)
    tokenizer = Tokenizer(
        en_vocab,
        infix_finditer=infix_re.finditer,
        suffix_search=suffix_re.search,
    )
    tokens = [t.text for t in tokenizer("±10%")]
    assert tokens == ["±10", "%"]
    explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
    assert tokens == explain_tokens
@@ -35,6 +35,7 @@ def test_vectors_similarity_LL(vocab, vectors):
    assert lex1.vector_norm != 0
    assert lex2.vector_norm != 0
    assert lex1.vector[0] != lex2.vector[0] and lex1.vector[1] != lex2.vector[1]
    assert isinstance(lex1.similarity(lex2), float)
    assert numpy.isclose(lex1.similarity(lex2), get_cosine(vec1, vec2))
    assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))


@@ -47,25 +48,46 @@ def test_vectors_similarity_TT(vocab, vectors):
    assert doc[0].vector_norm != 0
    assert doc[1].vector_norm != 0
    assert doc[0].vector[0] != doc[1].vector[0] and doc[0].vector[1] != doc[1].vector[1]
    assert isinstance(doc[0].similarity(doc[1]), float)
    assert numpy.isclose(doc[0].similarity(doc[1]), get_cosine(vec1, vec2))
    assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))


def test_vectors_similarity_SS(vocab, vectors):
    [(word1, vec1), (word2, vec2)] = vectors
    doc = Doc(vocab, words=[word1, word2])
    assert isinstance(doc[0:1].similarity(doc[0:2]), float)
    assert doc[0:1].similarity(doc[0:2]) == doc[0:2].similarity(doc[0:1])


def test_vectors_similarity_DD(vocab, vectors):
    [(word1, vec1), (word2, vec2)] = vectors
    doc1 = Doc(vocab, words=[word1, word2])
    doc2 = Doc(vocab, words=[word2, word1])
    assert isinstance(doc1.similarity(doc2), float)
    assert doc1.similarity(doc2) == doc2.similarity(doc1)


def test_vectors_similarity_TD(vocab, vectors):
    [(word1, vec1), (word2, vec2)] = vectors
    doc = Doc(vocab, words=[word1, word2])
    with pytest.warns(UserWarning):
        assert isinstance(doc.similarity(doc[0]), float)
        assert isinstance(doc[0].similarity(doc), float)
        assert doc.similarity(doc[0]) == doc[0].similarity(doc)


def test_vectors_similarity_DS(vocab, vectors):
    [(word1, vec1), (word2, vec2)] = vectors
    doc = Doc(vocab, words=[word1, word2])
    assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)


def test_vectors_similarity_TS(vocab, vectors):
    [(word1, vec1), (word2, vec2)] = vectors
    doc = Doc(vocab, words=[word1, word2])
    with pytest.warns(UserWarning):
        assert isinstance(doc[:2].similarity(doc[0]), float)
        assert isinstance(doc[0].similarity(doc[-2]), float)
        assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2])


def test_vectors_similarity_DS(vocab, vectors):
    [(word1, vec1), (word2, vec2)] = vectors
    doc = Doc(vocab, words=[word1, word2])
    assert isinstance(doc.similarity(doc[:2]), float)
    assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
@@ -421,7 +421,7 @@ def test_vector_is_oov():
def test_init_vectors_unset():
    v = Vectors(shape=(10, 10))
    assert v.is_full is False
    assert v.data.shape == (10, 10)
    assert v.shape == (10, 10)

    with pytest.raises(ValueError):
        v = Vectors(shape=(10, 10), mode="floret")

@@ -514,7 +514,7 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
    # rows: 2 rows per ngram
    rows = OPS.xp.asarray(
        [
            h % nlp.vocab.vectors.data.shape[0]
            h % nlp.vocab.vectors.shape[0]
            for ngram in ngrams
            for h in nlp.vocab.vectors._get_ngram_hashes(ngram)
        ],

@@ -544,17 +544,17 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
    # an empty key returns 0s
    assert_equal(
        OPS.to_numpy(nlp.vocab[""].vector),
        numpy.zeros((nlp.vocab.vectors.data.shape[0],)),
        numpy.zeros((nlp.vocab.vectors.shape[0],)),
    )
    # an empty batch returns 0s
    assert_equal(
        OPS.to_numpy(nlp.vocab.vectors.get_batch([""])),
        numpy.zeros((1, nlp.vocab.vectors.data.shape[0])),
        numpy.zeros((1, nlp.vocab.vectors.shape[0])),
    )
    # an empty key within a batch returns 0s
    assert_equal(
        OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]),
        numpy.zeros((nlp.vocab.vectors.data.shape[0],)),
        numpy.zeros((nlp.vocab.vectors.shape[0],)),
    )

    # the loaded ngram vector table cannot be modified
@@ -45,10 +45,12 @@ cdef class Tokenizer:
            `re.compile(string).search` to match suffixes.
        `infix_finditer` (callable): A function matching the signature of
            `re.compile(string).finditer` to find infixes.
        token_match (callable): A boolean function matching strings to be
        token_match (callable): A function matching the signature of
            `re.compile(string).match`, for matching strings to be
            recognized as tokens.
        url_match (callable): A boolean function matching strings to be
            recognized as tokens after considering prefixes and suffixes.
        url_match (callable): A function matching the signature of
            `re.compile(string).match`, for matching strings to be
            recognized as urls.

        EXAMPLE:
            >>> tokenizer = Tokenizer(nlp.vocab)
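
The clarified docstring says `token_match` and `url_match` expect compiled-regex-style match functions rather than arbitrary boolean predicates. A minimal sketch of passing such callables (the regex patterns are illustrative, not spaCy's defaults):

```python
import re
import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.blank("en")
# Both arguments expect something with the signature of re.compile(...).match.
token_match = re.compile(r"^\d+(?:\.\d+)+$").match   # e.g. version strings like 3.2.1
url_match = re.compile(r"^https?://\S+$").match      # deliberately rough URL pattern

tokenizer = Tokenizer(nlp.vocab, token_match=token_match, url_match=url_match)
doc = tokenizer("See https://spacy.io for spaCy 3.2.1")
print([token.text for token in doc])
```
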
@@ -681,6 +683,8 @@ cdef class Tokenizer:
                infixes = infix_finditer(substring)
                offset = 0
                for match in infixes:
                    if offset == 0 and match.start() == 0:
                        continue
                    if substring[offset : match.start()]:
                        tokens.append(("TOKEN", substring[offset : match.start()]))
                    if substring[match.start() : match.end()]:
@@ -10,7 +10,7 @@ from ..lexeme import Lexeme
from ..vocab import Vocab
from .underscore import Underscore
from pathlib import Path
import numpy
import numpy as np

class DocMethod(Protocol):
    def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ...  # type: ignore[misc]

@@ -26,7 +26,7 @@ class Doc:
    user_hooks: Dict[str, Callable[..., Any]]
    user_token_hooks: Dict[str, Callable[..., Any]]
    user_span_hooks: Dict[str, Callable[..., Any]]
    tensor: numpy.ndarray
    tensor: np.ndarray[Any, np.dtype[np.float_]]
    user_data: Dict[str, Any]
    has_unknown_spaces: bool
    _context: Any

@@ -144,7 +144,7 @@ class Doc:
    ) -> Doc: ...
    def to_array(
        self, py_attr_ids: Union[int, str, List[Union[int, str]]]
    ) -> numpy.ndarray: ...
    ) -> np.ndarray[Any, np.dtype[np.float_]]: ...
    @staticmethod
    def from_docs(
        docs: List[Doc],
@@ -420,6 +420,8 @@ cdef class Doc:
        cdef int range_start = 0
        if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]:
            attr = SENT_START
        elif attr == "IS_SENT_END" or attr == self.vocab.strings["IS_SENT_END"]:
            attr = SENT_START
        attr = intify_attr(attr)
        # adjust attributes
        if attr == HEAD:
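
With this change, `Doc.has_annotation` also accepts the `IS_SENT_START`/`IS_SENT_END` aliases, both of which resolve to the underlying `SENT_START` flag (sentence ends are derived from sentence starts). A small usage sketch, assuming the stock `sentencizer` component:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
doc = nlp("One sentence. Another one.")

# Both spellings check the same underlying SENT_START annotation.
print(doc.has_annotation("SENT_START"))     # True
print(doc.has_annotation("IS_SENT_START"))  # True
```
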
@@ -616,7 +618,7 @@ cdef class Doc:
        """
        if "has_vector" in self.user_hooks:
            return self.user_hooks["has_vector"](self)
        elif self.vocab.vectors.data.size:
        elif self.vocab.vectors.size:
            return True
        elif self.tensor.size:
            return True
@@ -641,7 +643,7 @@ cdef class Doc:
        if not len(self):
            self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f")
            return self._vector
        elif self.vocab.vectors.data.size > 0:
        elif self.vocab.vectors.size > 0:
            self._vector = sum(t.vector for t in self) / len(self)
            return self._vector
        elif self.tensor.size > 0:
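
As the branch above shows, when static vectors are available `Doc.vector` is simply the mean of the token vectors. That can be reproduced from user code, a sketch assuming a pipeline with vectors such as `en_core_web_md` is installed:

```python
import numpy
import spacy

nlp = spacy.load("en_core_web_md")
doc = nlp("apples and oranges")

# Doc.vector averages the individual token vectors.
manual = numpy.mean([token.vector for token in doc], axis=0)
assert numpy.allclose(doc.vector, manual)
```
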
@@ -1183,7 +1185,7 @@ cdef class Doc:
        token_offset = -1
        for doc in docs[:-1]:
            token_offset += len(doc)
            if not (len(doc) > 0 and doc[-1].is_space):
            if len(doc) > 0 and not doc[-1].is_space:
                concat_spaces[token_offset] = True

        concat_array = numpy.concatenate(arrays)
@@ -364,7 +364,9 @@ cdef class Span:
            return 0.0
        vector = self.vector
        xp = get_array_module(vector)
        return xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
        result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
        # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
        return result.item()

    cpdef np.ndarray to_array(self, object py_attr_ids):
        """Given a list of M attribute IDs, export the tokens to a numpy
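
The added `.item()` call is about return types: on CPU the dot product comes back as a NumPy scalar, but on GPU cupy returns a 0-dimensional array, and `.item()` turns either into a plain Python float. A minimal illustration with NumPy (the cupy behaviour is what the change guards against):

```python
import numpy

a = numpy.asarray([1.0, 2.0, 3.0])
b = numpy.asarray([4.0, 5.0, 6.0])

result = numpy.dot(a, b) / (numpy.linalg.norm(a) * numpy.linalg.norm(b))
# .item() converts a NumPy scalar (or a cupy 0-d array on GPU) to a Python float.
print(type(result), type(result.item()))
```
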
@@ -497,7 +499,7 @@ cdef class Span:
        """
        if "has_vector" in self.doc.user_span_hooks:
            return self.doc.user_span_hooks["has_vector"](self)
        elif self.vocab.vectors.data.size > 0:
        elif self.vocab.vectors.size > 0:
            return any(token.has_vector for token in self)
        elif self.doc.tensor.size > 0:
            return True
@@ -20,6 +20,7 @@ from .doc cimport set_children_from_heads

from .. import parts_of_speech
from ..errors import Errors, Warnings
from ..attrs import IOB_STRINGS
from .underscore import Underscore, get_ext_args
@@ -209,7 +210,9 @@ cdef class Token:
            return 0.0
        vector = self.vector
        xp = get_array_module(vector)
        return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
        result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
        # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
        return result.item()

    def has_morph(self):
        """Check whether the token has annotated morph information.
@@ -484,8 +487,6 @@ cdef class Token:

        RETURNS (bool / None): Whether the token starts a sentence.
            None if unknown.

        DOCS: https://spacy.io/api/token#is_sent_start
        """
        def __get__(self):
            if self.c.sent_start == 0:
@@ -743,7 +744,7 @@ cdef class Token:

    @classmethod
    def iob_strings(cls):
        return ("", "I", "O", "B")
        return IOB_STRINGS

    @property
    def ent_iob_(self):
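
The hard-coded tuple is replaced by the shared `IOB_STRINGS` constant from `spacy.attrs`; the values themselves are unchanged. Quick check:

```python
from spacy.tokens import Token

# Index 0 is the "unset" value, followed by the I/O/B scheme strings.
print(Token.iob_strings())  # ('', 'I', 'O', 'B')
```
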
@@ -1,17 +1,31 @@
from typing import Dict, Any
from typing import Dict, Any, List, Optional, Tuple, Union, TYPE_CHECKING
import functools
import copy

from ..errors import Errors

if TYPE_CHECKING:
    from .doc import Doc
    from .span import Span
    from .token import Token


class Underscore:
    mutable_types = (dict, list, set)
    doc_extensions: Dict[Any, Any] = {}
    span_extensions: Dict[Any, Any] = {}
    token_extensions: Dict[Any, Any] = {}
    _extensions: Dict[str, Any]
    _obj: Union["Doc", "Span", "Token"]
    _start: Optional[int]
    _end: Optional[int]

    def __init__(self, extensions, obj, start=None, end=None):
    def __init__(
        self,
        extensions: Dict[str, Any],
        obj: Union["Doc", "Span", "Token"],
        start: Optional[int] = None,
        end: Optional[int] = None,
    ):
        object.__setattr__(self, "_extensions", extensions)
        object.__setattr__(self, "_obj", obj)
        # Assumption is that for doc values, _start and _end will both be None

@@ -23,12 +37,12 @@ class Underscore:
        object.__setattr__(self, "_start", start)
        object.__setattr__(self, "_end", end)

    def __dir__(self):
    def __dir__(self) -> List[str]:
        # Hack to enable autocomplete on custom extensions
        extensions = list(self._extensions.keys())
        return ["set", "get", "has"] + extensions

    def __getattr__(self, name):
    def __getattr__(self, name: str) -> Any:
        if name not in self._extensions:
            raise AttributeError(Errors.E046.format(name=name))
        default, method, getter, setter = self._extensions[name]

@@ -56,7 +70,7 @@ class Underscore:
            return new_default
        return default

    def __setattr__(self, name, value):
    def __setattr__(self, name: str, value: Any):
        if name not in self._extensions:
            raise AttributeError(Errors.E047.format(name=name))
        default, method, getter, setter = self._extensions[name]

@@ -65,28 +79,30 @@ class Underscore:
        else:
            self._doc.user_data[self._get_key(name)] = value

    def set(self, name, value):
    def set(self, name: str, value: Any):
        return self.__setattr__(name, value)

    def get(self, name):
    def get(self, name: str) -> Any:
        return self.__getattr__(name)

    def has(self, name):
    def has(self, name: str) -> bool:
        return name in self._extensions

    def _get_key(self, name):
    def _get_key(self, name: str) -> Tuple[str, str, Optional[int], Optional[int]]:
        return ("._.", name, self._start, self._end)

    @classmethod
    def get_state(cls):
    def get_state(cls) -> Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]:
        return cls.token_extensions, cls.span_extensions, cls.doc_extensions

    @classmethod
    def load_state(cls, state):
    def load_state(
        cls, state: Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]
    ) -> None:
        cls.token_extensions, cls.span_extensions, cls.doc_extensions = state


def get_ext_args(**kwargs):
def get_ext_args(**kwargs: Any):
    """Validate and convert arguments. Reused in Doc, Token and Span."""
    default = kwargs.get("default")
    getter = kwargs.get("getter")
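
`Underscore` is the machinery behind the user-facing `._` attributes, so the typing changes above don't affect the public extension API. For context, a minimal sketch of that API (`is_greeting` is an illustrative extension name):

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
# Registering an extension stores its definition in Underscore.doc_extensions;
# reading and writing doc._.<name> goes through __getattr__/__setattr__ above.
Doc.set_extension("is_greeting", default=False)

doc = nlp("hello world")
doc._.is_greeting = True
print(doc._.is_greeting, doc._.has("is_greeting"))
```
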
@@ -164,7 +164,7 @@ def load_vectors_into_model(
        len(vectors_nlp.vocab.vectors.keys()) == 0
        and vectors_nlp.vocab.vectors.mode != VectorsMode.floret
    ) or (
        vectors_nlp.vocab.vectors.data.shape[0] == 0
        vectors_nlp.vocab.vectors.shape[0] == 0
        and vectors_nlp.vocab.vectors.mode == VectorsMode.floret
    ):
        logger.warning(Warnings.W112.format(name=name))

@@ -871,7 +871,6 @@ def get_package_path(name: str) -> Path:
    name (str): Package name.
    RETURNS (Path): Path to installed package.
    """
    name = name.lower()  # use lowercase version to be safe
    # Here we're importing the module just to find it. This is worryingly
    # indirect, but it's otherwise very difficult to find the package.
    pkg = importlib.import_module(name)
@@ -1,5 +1,5 @@
cimport numpy as np
from libc.stdint cimport uint32_t
from libc.stdint cimport uint32_t, uint64_t
from cython.operator cimport dereference as deref
from libcpp.set cimport set as cppset
from murmurhash.mrmr cimport hash128_x64

@@ -10,7 +10,7 @@ from typing import cast
import warnings
from enum import Enum
import srsly
from thinc.api import get_array_module, get_current_ops
from thinc.api import Ops, get_array_module, get_current_ops
from thinc.backends import get_array_ops
from thinc.types import Floats2d
@@ -146,7 +146,7 @@ cdef class Vectors:

        DOCS: https://spacy.io/api/vectors#size
        """
        return self.data.shape[0] * self.data.shape[1]
        return self.data.size

    @property
    def is_full(self):
@@ -274,7 +274,7 @@ cdef class Vectors:
        self.data = resized_array
        self._sync_unset()
        removed_items = []
        for key, row in list(self.key2row.items()):
        for key, row in self.key2row.copy().items():
            if row >= shape[0]:
                self.key2row.pop(key)
                removed_items.append((key, row))
@@ -353,12 +353,18 @@ cdef class Vectors:
        key (str): The string key.
        RETURNS: A list of the integer hashes.
        """
        cdef uint32_t[4] out
        # MurmurHash3_x64_128 returns an array of 2 uint64_t values.
        cdef uint64_t[2] out
        chars = s.encode("utf8")
        cdef char* utf8_string = chars
        hash128_x64(utf8_string, len(chars), self.hash_seed, &out)
        rows = [out[i] for i in range(min(self.hash_count, 4))]
        return rows
        rows = [
            out[0] & 0xffffffffu,
            out[0] >> 32,
            out[1] & 0xffffffffu,
            out[1] >> 32,
        ]
        return rows[:min(self.hash_count, 4)]

    def _get_ngrams(self, unicode key):
        """Get all padded ngram strings using the ngram settings.
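
The rewritten hunk takes the two 64-bit halves that MurmurHash3_x64_128 actually produces and splits each into a low and a high 32-bit row by masking and shifting. The same arithmetic in plain Python, with illustrative hash values:

```python
# Two 64-bit halves of a 128-bit hash (illustrative values, not real MurmurHash output).
out = [0x1122334455667788, 0x99AABBCCDDEEFF00]

rows = [
    out[0] & 0xFFFFFFFF,  # low 32 bits of the first half
    out[0] >> 32,         # high 32 bits of the first half
    out[1] & 0xFFFFFFFF,  # low 32 bits of the second half
    out[1] >> 32,         # high 32 bits of the second half
]
print([hex(row) for row in rows])
```
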
@@ -511,6 +517,9 @@ cdef class Vectors:
            for i in range(len(queries)) ], dtype="uint64")
        return (keys, best_rows, scores)

    def to_ops(self, ops: Ops):
        self.data = ops.asarray(self.data)

    def _get_cfg(self):
        if self.mode == Mode.default:
            return {
@@ -283,7 +283,7 @@ cdef class Vocab:

    @property
    def vectors_length(self):
        return self.vectors.data.shape[1]
        return self.vectors.shape[1]

    def reset_vectors(self, *, width=None, shape=None):
        """Drop the current vector table. Because all vectors must be the same

@@ -294,7 +294,7 @@ cdef class Vocab:
        elif shape is not None:
            self.vectors = Vectors(strings=self.strings, shape=shape)
        else:
            width = width if width is not None else self.vectors.data.shape[1]
            width = width if width is not None else self.vectors.shape[1]
            self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width))

    def prune_vectors(self, nr_row, batch_size=1024):
@@ -79,6 +79,7 @@ train/test skew.
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
| `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ |
| `shuffle` | Whether to shuffle the examples. Defaults to `False`. ~~bool~~ |

## Corpus.\_\_call\_\_ {#call tag="method"}
@@ -304,7 +304,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor.

## Doc.has_annotation {#has_annotation tag="method"}

Check whether the doc contains annotation on a token attribute.
Check whether the doc contains annotation on a [`Token` attribute](/api/token#attributes).

<Infobox title="Changed in v3.0" variant="warning">
@@ -44,6 +44,7 @@ rule-based matching are:
| `SPACY` | Token has a trailing space. ~~bool~~ |
| `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
| `ENT_TYPE` | The token's entity label. ~~str~~ |
| `ENT_IOB` | The IOB part of the token's entity tag. ~~str~~ |
| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ |
| `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ |
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
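
A short sketch of using one of these entity attributes in a token pattern (the entity labels come from the small English pipeline, which is an assumption of the example):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# Match any token that sits inside a GPE entity, via the ENT_TYPE attribute.
matcher.add("GPE_TOKEN", [[{"ENT_TYPE": "GPE"}]])

doc = nlp("She lives in Berlin.")
print([doc[start:end].text for _, start, end in matcher(doc)])
```
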
@@ -349,23 +349,6 @@ A sequence containing the token and all the token's syntactic descendants.
| ---------- | ------------------------------------------------------------------------------------ |
| **YIELDS** | A descendant token such that `self.is_ancestor(token)` or `token == self`. ~~Token~~ |

## Token.is_sent_start {#is_sent_start tag="property" new="2"}

A boolean value indicating whether the token starts a sentence. `None` if
unknown. Defaults to `True` for the first token in the `Doc`.

> #### Example
>
> ```python
> doc = nlp("Give it back! He pleaded.")
> assert doc[4].is_sent_start
> assert not doc[5].is_sent_start
> ```

| Name | Description |
| ----------- | ------------------------------------------------------- |
| **RETURNS** | Whether the token starts a sentence. ~~Optional[bool]~~ |

## Token.has_vector {#has_vector tag="property" model="vectors"}

A boolean value indicating whether a word vector is associated with the token.
@@ -465,6 +448,8 @@ The L2 norm of the token's vector representation.
| `is_punct` | Is the token punctuation? ~~bool~~ |
| `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~ |
| `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ |
| `is_sent_start` | Does the token start a sentence? ~~bool~~ or `None` if unknown. Defaults to `True` for the first token in the `Doc`. |
| `is_sent_end` | Does the token end a sentence? ~~bool~~ or `None` if unknown. |
| `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ |
| `is_bracket` | Is the token a bracket? ~~bool~~ |
| `is_quote` | Is the token a quotation mark? ~~bool~~ |
@@ -371,6 +371,23 @@ Get the vectors for the provided keys efficiently as a batch.
| ------ | --------------------------------------- |
| `keys` | The keys. ~~Iterable[Union[int, str]]~~ |

## Vectors.to_ops {#to_ops tag="method"}

Change the embedding matrix to use different Thinc ops.

> #### Example
>
> ```python
> from thinc.api import NumpyOps
>
> vectors.to_ops(NumpyOps())
> ```

| Name  | Description                                                |
| ----- | ---------------------------------------------------------- |
| `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~  |

## Vectors.to_disk {#to_disk tag="method"}

Save the current state to a directory.
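
A possible companion usage, moving a loaded pipeline's vectors to whichever backend is currently active; `en_core_web_md` and a spaCy version that already includes the new `Vectors.to_ops` method are assumptions of this sketch:

```python
import spacy
from thinc.api import get_current_ops

spacy.prefer_gpu()  # switches to GPU ops if a GPU is available
nlp = spacy.load("en_core_web_md")

# Move the static vectors table to the active ops backend
# (CupyOps on GPU, NumpyOps on CPU).
nlp.vocab.vectors.to_ops(get_current_ops())
```
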
@@ -831,6 +831,8 @@ def tokenizer_pseudo_code(
            infixes = infix_finditer(substring)
            offset = 0
            for match in infixes:
                if offset == 0 and match.start() == 0:
                    continue
                tokens.append(substring[offset : match.start()])
                tokens.append(substring[match.start() : match.end()])
                offset = match.end()
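
Pulled out of the pseudo-code, the offset bookkeeping splits a substring on infix matches while skipping a match that sits at the very start. A standalone sketch of just that loop (the hyphen pattern is illustrative):

```python
import re


def split_on_infixes(substring, infix_finditer):
    # Emit the text before each infix match, then the infix itself; a match
    # at position 0 is skipped so the substring isn't split on a leading infix.
    tokens = []
    offset = 0
    for match in infix_finditer(substring):
        if offset == 0 and match.start() == 0:
            continue
        tokens.append(substring[offset : match.start()])
        tokens.append(substring[match.start() : match.end()])
        offset = match.end()
    if substring[offset:]:
        tokens.append(substring[offset:])
    return tokens


print(split_on_infixes("well-known", re.compile(r"-").finditer))  # ['well', '-', 'known']
```
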
@@ -213,6 +213,12 @@ format, train a pipeline, evaluate it and export metrics, package it and spin up
a quick web demo. It looks pretty similar to a config file used to define CI
pipelines.

> #### Tip: Multi-line YAML syntax for long values
>
> YAML has [multi-line syntax](https://yaml-multiline.info/) that can be
> helpful for readability with longer values such as project descriptions or
> commands that take several arguments.

```yaml
%%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml
```
@@ -141,7 +141,8 @@
            "website": "https://www.nr.no/~plison"
        },
        "category": ["pipeline", "standalone", "research", "training"],
        "tags": []
        "tags": [],
        "spacy_version": 3
    },
    {
        "id": "numerizer",
@@ -952,6 +953,37 @@
        "category": ["pipeline"],
        "tags": ["lemmatizer", "danish"]
    },
    {
        "id": "augmenty",
        "title": "Augmenty",
        "slogan": "The cherry on top of your NLP pipeline",
        "description": "Augmenty is an augmentation library based on spaCy for augmenting texts. Augmenty differs from other augmentation libraries in that it corrects (as far as possible) the token, sentence and document labels under the augmentation.",
        "github": "kennethenevoldsen/augmenty",
        "pip": "augmenty",
        "code_example": [
            "import spacy",
            "import augmenty",
            "",
            "nlp = spacy.load('en_core_web_md')",
            "",
            "docs = nlp.pipe(['Augmenty is a great tool for text augmentation'])",
            "",
            "ent_dict = {'ORG': [['spaCy'], ['spaCy', 'Universe']]}",
            "entity_augmenter = augmenty.load('ents_replace.v1',",
            "  ent_dict = ent_dict, level=1)",
            "",
            "for doc in augmenty.docs(docs, augmenter=entity_augmenter, nlp=nlp):",
            "  print(doc)"
        ],
        "thumb": "https://github.com/KennethEnevoldsen/augmenty/blob/master/img/icon.png?raw=true",
        "author": "Kenneth Enevoldsen",
        "author_links": {
            "github": "kennethenevoldsen",
            "website": "https://www.kennethenevoldsen.com"
        },
        "category": ["training", "research"],
        "tags": ["training", "research", "augmentation"]
    },
    {
        "id": "dacy",
        "title": "DaCy",
@@ -8,10 +8,11 @@ import Title from '../components/title'
import Grid from '../components/grid'
import Button from '../components/button'
import Icon from '../components/icon'
import Tag from '../components/tag'
import CodeBlock, { InlineCode } from '../components/code'
import Aside from '../components/aside'
import Sidebar from '../components/sidebar'
import Section from '../components/section'
import Section, { Hr } from '../components/section'
import Main from '../components/main'
import Footer from '../components/footer'
import { H3, H5, Label, InlineList } from '../components/typography'
@@ -121,6 +122,18 @@ const UniverseContent = ({ content = [], categories, theme, pageContext, mdxComp
                    </Grid>
                </Section>
            )}
            <section className="search-exclude">
                <H3>Found a mistake or something isn't working?</H3>
                <p>
                    If you've come across a universe project that isn't working or is
                    incompatible with the reported spaCy version, let us know by{' '}
                    <Link to="https://github.com/explosion/spaCy/discussions/new">
                        opening a discussion thread
                    </Link>
                    .
                </p>
            </section>
            <Hr />
            <section className="search-exclude">
                <H3>Submit your project</H3>
                <p>
@@ -168,11 +181,22 @@ UniverseContent.propTypes = {
    mdxComponents: PropTypes.object,
}

const SpaCyVersion = ({ version }) => {
    const versions = !Array.isArray(version) ? [version] : version
    return versions.map((v, i) => (
        <>
            <Tag tooltip={`This project is compatible with spaCy v${v}`}>spaCy v{v}</Tag>{' '}
        </>
    ))
}

const Project = ({ data, components }) => (
    <>
        <Title title={data.title || data.id} teaser={data.slogan} image={data.thumb}>
            {data.github && (
            {(data.github || data.spacy_version) && (
                <p>
                    {data.spacy_version && <SpaCyVersion version={data.spacy_version} />}
                    {data.github && (
                        <Link to={`https://github.com/${data.github}`} hidden>
                            {[
                                `release/${data.github}/all.svg?style=flat-square`,
@@ -180,13 +204,18 @@ const Project = ({ data, components }) => (
                                `stars/${data.github}.svg?style=social&label=Stars`,
                            ].map((url, i) => (
                                <img
                                    style={{ borderRadius: '1em', marginRight: '0.5rem' }}
                                    style={{
                                        borderRadius: '1em',
                                        marginRight: '0.5rem',
                                        verticalAlign: 'middle',
                                    }}
                                    key={i}
                                    src={`https://img.shields.io/github/${url}`}
                                    alt=""
                                />
                            ))}
                        </Link>
                    )}
                </p>
            )}
        </Title>
@@ -335,6 +364,7 @@ const query = graphql`
                        url
                        github
                        description
                        spacy_version
                        pip
                        cran
                        category