Merge remote-tracking branch 'upstream/master' into chore/update-develop-from-master-v3.1-3

This commit is contained in:
Adriane Boyd 2021-10-14 09:41:46 +02:00
commit d98d525bc8
39 changed files with 382 additions and 136 deletions

View File

@ -1,8 +1,11 @@
blank_issues_enabled: false
contact_links:
- name: ⚠️ Python 3.10 Support
url: https://github.com/explosion/spaCy/discussions/9418
about: Python 3.10 wheels haven't been released yet; see the link for details.
- name: 🗯 Discussions Forum
url: https://github.com/explosion/spaCy/discussions
about: Usage questions, general discussion and anything else that isn't a bug report.
about: Install issues, usage questions, general discussion and anything else that isn't a bug report.
- name: 📖 spaCy FAQ & Troubleshooting
url: https://github.com/explosion/spaCy/discussions/8226
about: Before you post, check out the FAQ for answers to common community questions!

View File

@ -100,3 +100,8 @@ steps:
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
displayName: 'Test assemble CLI vectors warning'
condition: eq(variables['python_version'], '3.8')
- script: |
python .github/validate_universe_json.py website/meta/universe.json
displayName: 'Test website/meta/universe.json'
condition: eq(variables['python_version'], '3.8')

.github/validate_universe_json.py
View File

@ -0,0 +1,19 @@
import json
import re
import sys
from pathlib import Path
def validate_json(document):
universe_file = Path(document)
with universe_file.open() as f:
universe_data = json.load(f)
for entry in universe_data["resources"]:
if "github" in entry:
assert not re.match(
r"^(http:)|^(https:)", entry["github"]
), "Github field should be user/repo, not a url"
if __name__ == "__main__":
validate_json(str(sys.argv[1]))

.github/workflows/explosionbot.yml
View File

@ -0,0 +1,27 @@
name: Explosion Bot
on:
issue_comment:
types:
- created
- edited
jobs:
explosion-bot:
runs-on: ubuntu-18.04
steps:
- name: Dump GitHub context
env:
GITHUB_CONTEXT: ${{ toJson(github) }}
run: echo "$GITHUB_CONTEXT"
- uses: actions/checkout@v1
- uses: actions/setup-python@v1
- name: Install and run explosion-bot
run: |
pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot
python -m explosionbot
env:
INPUT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }}
INPUT_BK_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
ENABLED_COMMANDS: "test_gpu"
ALLOWED_TEAMS: "spaCy"

View File

@ -1,8 +0,0 @@
@software{spacy,
author = {Honnibal, Matthew and Montani, Ines and Van Landeghem, Sofie and Boyd, Adriane},
title = {{spaCy: Industrial-strength Natural Language Processing in Python}},
year = 2020,
publisher = {Zenodo},
doi = {10.5281/zenodo.1212303},
url = {https://doi.org/10.5281/zenodo.1212303}
}

CITATION.cff
View File

@ -0,0 +1,16 @@
cff-version: 1.2.0
preferred-citation:
type: article
message: "If you use spaCy, please cite it as below."
authors:
- family-names: "Honnibal"
given-names: "Matthew"
- family-names: "Montani"
given-names: "Ines"
- family-names: "Van Landeghem"
given-names: "Sofie"
- family-names: "Boyd"
given-names: "Adriane"
title: "spaCy: Industrial-strength Natural Language Processing in Python"
doi: "10.5281/zenodo.1212303"
year: 2020

View File

@ -1,5 +1,5 @@
recursive-include include *.h
recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja
recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja *.toml
include LICENSE
include README.md
include pyproject.toml

View File

@ -16,6 +16,8 @@ pr:
exclude:
- "website/*"
- "*.md"
include:
- "website/meta/universe.json"
jobs:
# Perform basic checks for most important errors (syntax etc.) Uses the config

View File

@ -124,7 +124,8 @@ exclude =
[tool:pytest]
markers =
slow
slow: mark a test as slow
issue: reference specific issue
[mypy]
ignore_missing_imports = True

View File

@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Dict, Any
from pathlib import Path
from wasabi import msg
import typer
@ -7,7 +7,7 @@ import sys
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu
from ..training.loop import train
from ..training.loop import train as train_nlp
from ..training.initialize import init_nlp
from .. import util
@ -40,6 +40,18 @@ def train_cli(
DOCS: https://spacy.io/api/cli#train
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
def train(
config_path: Path,
output_path: Optional[Path] = None,
*,
use_gpu: int = -1,
overrides: Dict[str, Any] = util.SimpleFrozenDict(),
):
# Make sure all files and paths exist if they are needed
if not config_path or (str(config_path) != "-" and not config_path.exists()):
msg.fail("Config file not found", config_path, exits=1)
@ -50,8 +62,6 @@ def train_cli(
output_path.mkdir(parents=True)
msg.good(f"Created output directory: {output_path}")
msg.info(f"Saving to output directory: {output_path}")
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
setup_gpu(use_gpu)
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides, interpolate=False)
@ -60,4 +70,4 @@ def train_cli(
nlp = init_nlp(config, use_gpu=use_gpu)
msg.good("Initialized pipeline")
msg.divider("Training pipeline")
train(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
train_nlp(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)
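
This hunk splits the Typer entry point (`train_cli`) from a plain `train` function, so training can be driven from Python as well as from the CLI. A minimal sketch of the programmatic call (the config path, output directory and override keys are placeholders):

```python
from pathlib import Path
from spacy.cli.train import train

# Roughly equivalent to:
#   python -m spacy train config.cfg --output ./output \
#       --paths.train ./train.spacy --paths.dev ./dev.spacy
train(
    Path("config.cfg"),
    Path("output"),
    use_gpu=-1,  # -1 = CPU; pass a GPU id to train on GPU
    overrides={"paths.train": "train.spacy", "paths.dev": "dev.spacy"},
)
```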

View File

@ -25,7 +25,7 @@ def setup_default_warnings():
filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
# warn once about lemmatizer without required POS
filter_warning("once", error_msg="[W108]")
filter_warning("once", error_msg=Warnings.W108)
def filter_warning(action: str, error_msg: str):
@ -170,8 +170,8 @@ class Warnings:
"call the {matcher} on each Doc object.")
W107 = ("The property `Doc.{prop}` is deprecated. Use "
"`Doc.has_annotation(\"{attr}\")` instead.")
W108 = ("The rule-based lemmatizer did not find POS annotation for the "
"token '{text}'. Check that your pipeline includes components that "
W108 = ("The rule-based lemmatizer did not find POS annotation for one or "
"more tokens. Check that your pipeline includes components that "
"assign token.pos, typically 'tagger'+'attribute_ruler' or "
"'morphologizer'.")
W109 = ("Unable to save user hooks while serializing the doc. Re-add any "
@ -658,7 +658,9 @@ class Errors:
"{nO} - cannot add any more labels.")
E923 = ("It looks like there is no proper sample data to initialize the "
"Model of component '{name}'. To check your input data paths and "
"annotation, run: python -m spacy debug data config.cfg")
"annotation, run: python -m spacy debug data config.cfg "
"and include the same config override values you would specify "
"for the 'spacy train' command.")
E924 = ("The '{name}' component does not seem to be initialized properly. "
"This is likely a bug in spaCy, so feel free to open an issue: "
"https://github.com/explosion/spaCy/issues")
@ -793,7 +795,7 @@ class Errors:
"to token boundaries.")
E982 = ("The `Token.ent_iob` attribute should be an integer indexing "
"into {values}, but found {value}.")
E983 = ("Invalid key for '{dict}': {key}. Available keys: "
E983 = ("Invalid key(s) for '{dict}': {key}. Available keys: "
"{keys}")
E984 = ("Invalid component config for '{name}': component block needs either "
"a key `factory` specifying the registered function used to "

View File

@ -76,6 +76,6 @@ class CatalanLemmatizer(Lemmatizer):
forms.append(self.lookup_lemmatize(token)[0])
if not forms:
forms.append(string)
forms = list(set(forms))
forms = list(dict.fromkeys(forms))
self.cache[cache_key] = forms
return forms
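
The switch from `set()` to `dict.fromkeys()` here (and in the other lemmatizers and `get_arg_names` further down) keeps the deduplication but preserves the order in which candidate forms were produced, instead of the arbitrary ordering of a set. A small standalone illustration:

```python
forms = ["cases", "case", "cases", "casing"]

# set() deduplicates, but the resulting order is unspecified
unordered = list(set(forms))

# dict.fromkeys() deduplicates while keeping first-seen order
# (dicts are insertion-ordered in Python 3.7+)
ordered = list(dict.fromkeys(forms))
assert ordered == ["cases", "case", "casing"]
```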

View File

@ -75,6 +75,6 @@ class FrenchLemmatizer(Lemmatizer):
forms.append(self.lookup_lemmatize(token)[0])
if not forms:
forms.append(string)
forms = list(set(forms))
forms = list(dict.fromkeys(forms))
self.cache[cache_key] = forms
return forms

View File

@ -97,7 +97,7 @@ class DutchLemmatizer(Lemmatizer):
return forms
else:
oov_forms.append(form)
forms = list(set(oov_forms))
forms = list(dict.fromkeys(oov_forms))
# Back-off through remaining return value candidates.
if forms:
for form in forms:

View File

@ -58,7 +58,7 @@ class RussianLemmatizer(Lemmatizer):
if not len(filtered_analyses):
return [string.lower()]
if morphology is None or (len(morphology) == 1 and POS in morphology):
return list(set([analysis.normal_form for analysis in filtered_analyses]))
return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]))
if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
features_to_compare = ["Case", "Number", "Gender"]
elif univ_pos == "NUM":
@ -89,7 +89,7 @@ class RussianLemmatizer(Lemmatizer):
filtered_analyses.append(analysis)
if not len(filtered_analyses):
return [string.lower()]
return list(set([analysis.normal_form for analysis in filtered_analyses]))
return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]))
def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
string = token.text

View File

@ -707,8 +707,9 @@ class Language:
source_config = source.config.interpolate()
pipe_config = util.copy_config(source_config["components"][source_name])
self._pipe_configs[name] = pipe_config
for s in source.vocab.strings:
self.vocab.strings.add(s)
if self.vocab.strings != source.vocab.strings:
for s in source.vocab.strings:
self.vocab.strings.add(s)
return pipe, pipe_config["factory"]
def add_pipe(
@ -1379,6 +1380,9 @@ class Language:
scorer = Scorer(**kwargs)
# reset annotation in predicted docs and time tokenization
start_time = timer()
# this is purely for timing
for eg in examples:
self.make_doc(eg.reference.text)
# apply all pipeline components
for name, pipe in self.pipeline:
kwargs = component_cfg.get(name, {})
@ -1708,6 +1712,7 @@ class Language:
# them here so they're only loaded once
source_nlps = {}
source_nlp_vectors_hashes = {}
vocab_b = None
for pipe_name in config["nlp"]["pipeline"]:
if pipe_name not in pipeline:
opts = ", ".join(pipeline.keys())
@ -1730,14 +1735,22 @@ class Language:
raw_config=raw_config,
)
else:
# We need the sourced components to reference the same
# vocab without modifying the current vocab state **AND**
# we still want to load the source model vectors to perform
# the vectors check. Since the source vectors clobber the
# current ones, we save the original vocab state and
# restore after this loop. Existing strings are preserved
# during deserialization, so they do not need any
# additional handling.
if vocab_b is None:
vocab_b = nlp.vocab.to_bytes(exclude=["lookups", "strings"])
model = pipe_cfg["source"]
if model not in source_nlps:
# We only need the components here and we intentionally
# do not load the model with the same vocab because
# this would cause the vectors to be copied into the
# current nlp object (all the strings will be added in
# create_pipe_from_source)
source_nlps[model] = util.load_model(model)
# Load with the same vocab, adding any strings
source_nlps[model] = util.load_model(
model, vocab=nlp.vocab, exclude=["lookups"]
)
source_name = pipe_cfg.get("component", pipe_name)
listeners_replaced = False
if "replace_listeners" in pipe_cfg:
@ -1764,6 +1777,9 @@ class Language:
# Delete from cache if listeners were replaced
if listeners_replaced:
del source_nlps[model]
# Restore the original vocab after sourcing if necessary
if vocab_b is not None:
nlp.vocab.from_bytes(vocab_b)
disabled_pipes = [*config["nlp"]["disabled"], *disable]
nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
nlp.batch_size = config["nlp"]["batch_size"]
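
The comment block above describes how sourced components are now loaded with the current vocab while the original vocab state (minus strings and lookups) is saved and restored around the loop. From the user side, sourcing a component looks roughly like this (a sketch; `en_core_web_sm` must be installed and the component name is an example):

```python
import spacy

source_nlp = spacy.load("en_core_web_sm")

nlp = spacy.blank("en")
# Copy the trained NER component from the source pipeline;
# the strings it needs end up in nlp.vocab.
nlp.add_pipe("ner", source=source_nlp)
assert "ner" in nlp.pipe_names
```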

View File

@ -177,13 +177,14 @@ cdef class DependencyMatcher:
# Add 'RIGHT_ATTRS' to self._patterns[key]
_patterns = [[[pat["RIGHT_ATTRS"]] for pat in pattern] for pattern in patterns]
pattern_offset = len(self._patterns[key])
self._patterns[key].extend(_patterns)
# Add each node pattern of all the input patterns individually to the
# matcher. This enables only a single instance of Matcher to be used.
# Multiple adds are required to track each node pattern.
tokens_to_key_list = []
for i, current_patterns in enumerate(_patterns):
for i, current_patterns in enumerate(_patterns, start=pattern_offset):
# Preallocate list space
tokens_to_key = [None] * len(current_patterns)
@ -263,7 +264,9 @@ cdef class DependencyMatcher:
self._raw_patterns.pop(key)
self._tree.pop(key)
self._root.pop(key)
self._tokens_to_key.pop(key)
for mklist in self._tokens_to_key.pop(key):
for mkey in mklist:
self._matcher.remove(mkey)
def _get_keys_to_position_maps(self, doc):
"""

View File

@ -208,7 +208,7 @@ class Lemmatizer(Pipe):
univ_pos = token.pos_.lower()
if univ_pos in ("", "eol", "space"):
if univ_pos == "":
warnings.warn(Warnings.W108.format(text=string))
warnings.warn(Warnings.W108)
return [string.lower()]
# See Issue #435 for example of where this logic is required.
if self.is_base_form(token):

View File

@ -4,6 +4,7 @@ from spacy.util import get_lang_class
def pytest_addoption(parser):
parser.addoption("--slow", action="store_true", help="include slow tests")
parser.addoption("--issue", action="store", help="test specific issues")
def pytest_runtest_setup(item):
@ -16,10 +17,24 @@ def pytest_runtest_setup(item):
# options weren't given.
return item.config.getoption(f"--{opt}", False)
# Integration of boolean flags
for opt in ["slow"]:
if opt in item.keywords and not getopt(opt):
pytest.skip(f"need --{opt} option to run")
# Special integration to mark tests with issue numbers
issues = getopt("issue")
if isinstance(issues, str):
if "issue" in item.keywords:
# Convert issues provided on the CLI to list of ints
issue_nos = [int(issue.strip()) for issue in issues.split(",")]
# Get all issues specified by decorators and check if they're provided
issue_refs = [mark.args[0] for mark in item.iter_markers(name="issue")]
if not any([ref in issue_nos for ref in issue_refs]):
pytest.skip(f"not referencing specified issues: {issue_nos}")
else:
pytest.skip("not referencing any issues")
# Fixtures for language tokenizers (languages sorted alphabetically)
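
Together with the `issue` marker registered in `setup.cfg` above, this hook lets regression tests be tagged with the issue numbers they cover and selected from the command line. A usage sketch (the issue numbers and test name are examples):

```python
import pytest

# Tag a regression test with the issue it covers:
@pytest.mark.issue(8168)
def test_some_regression():
    assert True

# Running:  pytest spacy/tests --issue 8168,9263
# runs only tests marked with one of those issue numbers; other tests are
# skipped as "not referencing specified issues" / "not referencing any issues".
```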

View File

@ -368,3 +368,87 @@ def test_dependency_matcher_span_user_data(en_tokenizer):
assert doc_match[0] == span_match[0]
for doc_t_i, span_t_i in zip(doc_match[1], span_match[1]):
assert doc_t_i == span_t_i + offset
def test_dependency_matcher_order_issue(en_tokenizer):
# issue from #9263
doc = en_tokenizer("I like text")
doc[2].head = doc[1]
# this matches on attrs but not rel op
pattern1 = [
{"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}},
{
"LEFT_ID": "root",
"RIGHT_ID": "r",
"RIGHT_ATTRS": {"ORTH": "text"},
"REL_OP": "<",
},
]
# this matches on rel op but not attrs
pattern2 = [
{"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}},
{
"LEFT_ID": "root",
"RIGHT_ID": "r",
"RIGHT_ATTRS": {"ORTH": "fish"},
"REL_OP": ">",
},
]
matcher = DependencyMatcher(en_tokenizer.vocab)
# This should behave the same as adding the patterns one at a time below
matcher.add("check", [pattern1, pattern2])
matches = matcher(doc)
assert matches == []
# use a new matcher
matcher = DependencyMatcher(en_tokenizer.vocab)
# adding the patterns one at a time under the same label should also produce no matches
matcher.add("check", [pattern1])
matcher.add("check", [pattern2])
matches = matcher(doc)
assert matches == []
def test_dependency_matcher_remove(en_tokenizer):
# issue from #9263
doc = en_tokenizer("The red book")
doc[1].head = doc[2]
# this matches
pattern1 = [
{"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "book"}},
{
"LEFT_ID": "root",
"RIGHT_ID": "r",
"RIGHT_ATTRS": {"ORTH": "red"},
"REL_OP": ">",
},
]
# add and then remove it
matcher = DependencyMatcher(en_tokenizer.vocab)
matcher.add("check", [pattern1])
matcher.remove("check")
# this matches on rel op but not attrs
pattern2 = [
{"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "flag"}},
{
"LEFT_ID": "root",
"RIGHT_ID": "r",
"RIGHT_ATTRS": {"ORTH": "blue"},
"REL_OP": ">",
},
]
# Add the new pattern under the same label; it should not match
matcher.add("check", [pattern2])
matches = matcher(doc)
assert matches == []

View File

@ -114,7 +114,7 @@ def test_make_spangroup(max_positive, nr_results):
doc = nlp.make_doc("Greater London")
ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2])
indices = ngram_suggester([doc])[0].dataXd
assert_array_equal(indices, numpy.asarray([[0, 1], [1, 2], [0, 2]]))
assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
labels = ["Thing", "City", "Person", "GreatCity"]
scores = numpy.asarray(
[[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f"

View File

@ -49,8 +49,8 @@ def test_issue5551(textcat_config):
# All results should be the same because of the fixed seed
assert len(results) == 3
ops = get_current_ops()
assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]))
assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]))
assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]), decimal=5)
assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5)
def test_issue5838():
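
The pattern here, converting results with `ops.to_numpy()` and loosening the tolerance to `decimal=5`, makes the assertions work when the current backend returns GPU (cupy) arrays and float32 math drifts slightly. A minimal sketch of the idea:

```python
import numpy
from numpy.testing import assert_almost_equal
from thinc.api import get_current_ops

ops = get_current_ops()
arr = ops.asarray([0.1, 0.2, 0.3])  # may be a cupy array on GPU

# Convert to numpy before comparing so the assertion works on any backend,
# with a slightly looser tolerance for float32 arithmetic.
assert_almost_equal(ops.to_numpy(arr), numpy.asarray([0.1, 0.2, 0.3]), decimal=5)
```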

View File

@ -0,0 +1,54 @@
import pytest
from thinc.api import Adam
from spacy.attrs import NORM
from spacy.vocab import Vocab
from spacy import registry
from spacy.training import Example
from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
from spacy.tokens import Doc
from spacy.pipeline import DependencyParser
@pytest.fixture
def vocab():
return Vocab(lex_attr_getters={NORM: lambda s: s})
def _parser_example(parser):
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
return Example.from_dict(doc, gold)
@pytest.fixture
def parser(vocab):
vocab.strings.add("ROOT")
cfg = {"model": DEFAULT_PARSER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
parser = DependencyParser(vocab, model)
parser.cfg["token_vector_width"] = 4
parser.cfg["hidden_width"] = 32
# parser.add_label('right')
parser.add_label("left")
parser.initialize(lambda: [_parser_example(parser)])
sgd = Adam(0.001)
for i in range(10):
losses = {}
doc = Doc(vocab, words=["a", "b", "c", "d"])
example = Example.from_dict(
doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]}
)
parser.update([example], sgd=sgd, losses=losses)
return parser
@pytest.mark.xfail(reason="Not fixed yet")
def test_partial_annotation(parser):
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
doc[2].is_sent_start = False
# Note that if the following line is used, then doc[2].is_sent_start == False
# doc[3].is_sent_start = False
doc = parser(doc)
assert doc[2].is_sent_start == False

View File

@ -1,6 +1,8 @@
import pytest
from spacy.lang.en import English
@pytest.mark.issue(8168)
def test_issue8168():
nlp = English()
ruler = nlp.add_pipe("entity_ruler")

View File

@ -193,6 +193,7 @@ def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X):
assert_array_almost_equal(
model1.ops.to_numpy(get_all_params(model1)),
model2.ops.to_numpy(get_all_params(model2)),
decimal=5,
)

View File

@ -82,15 +82,15 @@ def test_cat_readers(reader, additional_config):
[nlp]
lang = "en"
pipeline = ["tok2vec", "textcat"]
pipeline = ["tok2vec", "textcat_multilabel"]
[components]
[components.tok2vec]
factory = "tok2vec"
[components.textcat]
factory = "textcat"
[components.textcat_multilabel]
factory = "textcat_multilabel"
"""
config = Config().from_str(nlp_config_string)
config["corpora"]["@readers"] = reader

View File

@ -8,7 +8,7 @@ from thinc.api import NumpyOps
from .doc import Doc
from ..vocab import Vocab
from ..compat import copy_reg
from ..attrs import SPACY, ORTH, intify_attr
from ..attrs import SPACY, ORTH, intify_attr, IDS
from ..errors import Errors
from ..util import ensure_path, SimpleFrozenList
@ -64,7 +64,13 @@ class DocBin:
DOCS: https://spacy.io/api/docbin#init
"""
attrs = sorted([intify_attr(attr) for attr in attrs])
int_attrs = [intify_attr(attr) for attr in attrs]
if None in int_attrs:
non_valid = [attr for attr in attrs if intify_attr(attr) is None]
raise KeyError(
Errors.E983.format(dict="attrs", key=non_valid, keys=IDS.keys())
) from None
attrs = sorted(int_attrs)
self.version = "0.1"
self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0]
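
With this change, unknown attribute names raise a `KeyError` built from `E983` (listing the valid keys) when the `DocBin` is constructed, rather than failing later inside `sorted()` with a less helpful `TypeError`. A quick sketch of the new behavior (the misspelling is deliberate):

```python
from spacy.tokens import DocBin

# Valid attribute names are accepted as before
doc_bin = DocBin(attrs=["ORTH", "LEMMA", "ENT_IOB", "ENT_TYPE"])

# A misspelled attribute now raises a KeyError listing the available keys
try:
    DocBin(attrs=["ORTH", "LEMMMA"])
except KeyError as err:
    print(err)
```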

View File

@ -1,4 +1,4 @@
from .corpus import Corpus # noqa: F401
from .corpus import Corpus, JsonlCorpus # noqa: F401
from .example import Example, validate_examples, validate_get_examples # noqa: F401
from .alignment import Alignment # noqa: F401
from .augment import dont_augment, orth_variants_augmenter # noqa: F401

View File

@ -144,7 +144,12 @@ def load_vectors_into_model(
) -> None:
"""Load word vectors from an installed model or path into a model instance."""
try:
vectors_nlp = load_model(name)
# Load with the same vocab, which automatically adds the vectors to
# the current nlp object. Exclude lookups so they are not modified.
exclude = ["lookups"]
if not add_strings:
exclude.append("strings")
vectors_nlp = load_model(name, vocab=nlp.vocab, exclude=exclude)
except ConfigValidationError as e:
title = f"Config validation error for vectors {name}"
desc = (
@ -158,15 +163,8 @@ def load_vectors_into_model(
if len(vectors_nlp.vocab.vectors.keys()) == 0:
logger.warning(Warnings.W112.format(name=name))
nlp.vocab.vectors = vectors_nlp.vocab.vectors
for lex in nlp.vocab:
lex.rank = nlp.vocab.vectors.key2row.get(lex.orth, OOV_RANK)
if add_strings:
# I guess we should add the strings from the vectors_nlp model?
# E.g. if someone does a similarity query, they might expect the strings.
for key in nlp.vocab.vectors.key2row:
if key in vectors_nlp.vocab.strings:
nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
def init_tok2vec(
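
`load_vectors_into_model` now loads the source pipeline with the current vocab, so the vectors (and, unless `add_strings` is disabled, their strings) end up directly in `nlp.vocab` without the manual rank/string copying removed above. A sketch, assuming a vectors-bearing package such as `en_core_web_md` is installed:

```python
import spacy
from spacy.training.initialize import load_vectors_into_model

nlp = spacy.blank("en")
# Pull the vectors from an installed pipeline (or a path) into nlp.vocab
load_vectors_into_model(nlp, "en_core_web_md")
print(nlp.vocab.vectors.shape)  # (number of vectors, vector width)
```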

View File

@ -1475,7 +1475,7 @@ def get_arg_names(func: Callable) -> List[str]:
RETURNS (List[str]): The argument names.
"""
argspec = inspect.getfullargspec(func)
return list(set([*argspec.args, *argspec.kwonlyargs]))
return list(dict.fromkeys([*argspec.args, *argspec.kwonlyargs]))
def combine_score_weights(

View File

@ -530,7 +530,6 @@ cdef class Vocab:
setters = {
"strings": lambda b: self.strings.from_bytes(b),
"lexemes": lambda b: self.lexemes_from_bytes(b),
"vectors": lambda b: serialize_vectors(b),
"lookups": lambda b: self.lookups.from_bytes(b),
}

View File

@ -260,16 +260,18 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type]
| Name | Description |
| ------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- |
| `input_file` | Input file. ~~Path (positional)~~ |
| `input_path` | Input file or directory. ~~Path (positional)~~ |
| `output_dir` | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(option)~~ |
| `--converter`, `-c` <Tag variant="new">2</Tag> | Name of converter to use (see below). ~~str (option)~~ |
| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ |
| `--n-sents`, `-n` | Number of sentences per document. Supported for: `conll`, `conllu`, `iob`, `ner` ~~int (option)~~ |
| `--seg-sents`, `-s` <Tag variant="new">2.2</Tag> | Segment sentences. Supported for: `conll`, `ner` ~~bool (flag)~~ |
| `--base`, `-b` | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str](option)~~ |
| `--base`, `-b`, `--model` | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str](option)~~ |
| `--morphology`, `-m` | Enable appending morphology to tags. Supported for: `conllu` ~~bool (flag)~~ |
| `--merge-subtokens`, `-T` | Merge CoNLL-U subtokens ~~bool (flag)~~ |
| `--ner-map`, `-nm` | NER tag mapping (as JSON-encoded dict of entity types). Supported for: `conllu` ~~Optional[Path](option)~~ |
| `--lang`, `-l` <Tag variant="new">2.1</Tag> | Language code (if tokenizer required). ~~Optional[str] \(option)~~ |
| `--concatenate`, `-C` | Concatenate output to a single file ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train). |

View File

@ -474,8 +474,8 @@ The L2 norm of the token's vector representation.
| `like_email` | Does the token resemble an email address? ~~bool~~ |
| `is_oov` | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~ |
| `is_stop` | Is the token part of a "stop list"? ~~bool~~ |
| `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~int~~ |
| `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~str~~ |
| `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~int~~ |
| `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~str~~ |
| `tag` | Fine-grained part-of-speech. ~~int~~ |
| `tag_` | Fine-grained part-of-speech. ~~str~~ |
| `morph` <Tag variant="new">3</Tag> | Morphological analysis. ~~MorphAnalysis~~ |

View File

@ -325,6 +325,5 @@ serialization by passing in the string names via the `exclude` argument.
| Name | Description |
| --------- | ----------------------------------------------------- |
| `strings` | The strings in the [`StringStore`](/api/stringstore). |
| `lexemes` | The lexeme data. |
| `vectors` | The word vectors, if available. |
| `lookups` | The lookup tables, if available. |

View File

@ -25,7 +25,7 @@ for token in doc:
> - **Text:** The original word text.
> - **Lemma:** The base form of the word.
> - **POS:** The simple [UPOS](https://universaldependencies.org/docs/u/pos/)
> - **POS:** The simple [UPOS](https://universaldependencies.org/u/pos/)
> part-of-speech tag.
> - **Tag:** The detailed part-of-speech tag.
> - **Dep:** Syntactic dependency, i.e. the relation between tokens.

View File

@ -284,7 +284,9 @@ $ python -m pytest --pyargs %%SPACY_PKG_NAME --slow # basic and slow test
## Troubleshooting guide {#troubleshooting}
This section collects some of the most common errors you may come across when
installing, loading and using spaCy, as well as their solutions.
installing, loading and using spaCy, as well as their solutions. Also see the
[Discussions FAQ Thread](https://github.com/explosion/spaCy/discussions/8226),
which is updated more frequently and covers more transitory issues.
> #### Help us improve this guide
>
@ -311,62 +313,6 @@ language's `Language` class instead, for example
</Accordion>
<Accordion title="No such option: --no-cache-dir" id="no-cache-dir">
```
no such option: --no-cache-dir
```
The `download` command uses pip to install the pipeline packages and sets the
`--no-cache-dir` flag to prevent it from requiring too much memory.
[This setting](https://pip.pypa.io/en/stable/reference/pip_install/#caching)
requires pip v6.0 or newer. Run `pip install -U pip` to upgrade to the latest
version of pip. To see which version you have installed, run `pip --version`.
</Accordion>
<Accordion title="sre_constants.error: bad character range" id="narrow-unicode">
```
sre_constants.error: bad character range
```
In [v2.1](/usage/v2-1), spaCy changed its implementation of regular expressions
for tokenization to make it up to 2-3 times faster. But this also means that
it's very important now that you run spaCy with a wide unicode build of Python.
This means that the build has 1114111 unicode characters available, instead of
only 65535 in a narrow unicode build. You can check this by running the
following command:
```bash
$ python -c "import sys; print(sys.maxunicode)"
```
If you're running a narrow unicode build, reinstall Python and use a wide
unicode build instead. You can also rebuild Python and set the
`--enable-unicode=ucs4` flag.
</Accordion>
<Accordion title="Unknown locale: UTF-8" id="unknown-locale">
```
ValueError: unknown locale: UTF-8
```
This error can sometimes occur on OSX and is likely related to a still
unresolved [Python bug](https://bugs.python.org/issue18378). However, it's easy
to fix: just add the following to your `~/.bash_profile` or `~/.zshrc` and then
run `source ~/.bash_profile` or `source ~/.zshrc`. Make sure to add **both
lines** for `LC_ALL` and `LANG`.
```bash
$ export LC_ALL=en_US.UTF-8
$ export LANG=en_US.UTF-8
```
</Accordion>
<Accordion title="Import error: No module named spacy" id="import-error">
```

View File

@ -1363,20 +1363,19 @@
"url": "https://explosion.ai/demos/sense2vec",
"code_example": [
"import spacy",
"from sense2vec import Sense2VecComponent",
"",
"nlp = spacy.load('en')",
"s2v = Sense2VecComponent('/path/to/reddit_vectors-1.1.0')",
"nlp.add_pipe(s2v)",
"nlp = spacy.load(\"en_core_web_sm\")",
"s2v = nlp.add_pipe(\"sense2vec\")",
"s2v.from_disk(\"/path/to/s2v_reddit_2015_md\")",
"",
"doc = nlp(\"A sentence about natural language processing.\")",
"assert doc[3].text == 'natural language processing'",
"freq = doc[3]._.s2v_freq",
"vector = doc[3]._.s2v_vec",
"most_similar = doc[3]._.s2v_most_similar(3)",
"# [(('natural language processing', 'NOUN'), 1.0),",
"# (('machine learning', 'NOUN'), 0.8986966609954834),",
"# (('computer vision', 'NOUN'), 0.8636297583580017)]"
"assert doc[3:6].text == \"natural language processing\"",
"freq = doc[3:6]._.s2v_freq",
"vector = doc[3:6]._.s2v_vec",
"most_similar = doc[3:6]._.s2v_most_similar(3)",
"# [(('machine learning', 'NOUN'), 0.8986967),",
"# (('computer vision', 'NOUN'), 0.8636297),",
"# (('deep learning', 'NOUN'), 0.8573361)]"
],
"category": ["pipeline", "standalone", "visualizers"],
"tags": ["vectors"],
@ -2970,11 +2969,10 @@
"github": "thomasthiebaud/spacy-fastlang",
"pip": "spacy_fastlang",
"code_example": [
"import spacy",
"from spacy_fastlang import LanguageDetector",
"import spacy_fastlang",
"",
"nlp = spacy.load('en_core_web_sm')",
"nlp.add_pipe(LanguageDetector())",
"nlp = spacy.load(\"en_core_web_sm\")",
"nlp.add_pipe(\"language_detector\")",
"doc = nlp('Life is like a box of chocolates. You never know what you are gonna get.')",
"",
"assert doc._.language == 'en'",
@ -3476,7 +3474,51 @@
"github": "bbieniek"
},
"category": ["apis"]
}
},
{
"id": "phruzz_matcher",
"title": "phruzz-matcher",
"slogan": "Phrase matcher using RapidFuzz",
"description": "Combination of the RapidFuzz library with Spacy PhraseMatcher The goal of this component is to find matches when there were NO \"perfect matches\" due to typos or abbreviations between a Spacy doc and a list of phrases.",
"github": "mjvallone/phruzz-matcher",
"pip": "phruzz_matcher",
"code_example": [
"import spacy",
"from spacy.language import Language",
"from phruzz_matcher.phrase_matcher import PhruzzMatcher",
"",
"famous_people = [",
" \"Brad Pitt\",",
" \"Demi Moore\",",
" \"Bruce Willis\",",
" \"Jim Carrey\",",
"]",
"",
"@Language.factory(\"phrase_matcher\")",
"def phrase_matcher(nlp: Language, name: str):",
" return PhruzzMatcher(nlp, famous_people, \"FAMOUS_PEOPLE\", 85)",
"",
"nlp = spacy.blank('es')",
"nlp.add_pipe(\"phrase_matcher\")",
"",
"doc = nlp(\"El otro día fui a un bar donde vi a brad pit y a Demi Moore, estaban tomando unas cervezas mientras charlaban de sus asuntos.\")",
"print(f\"doc.ents: {doc.ents}\")",
"",
"#OUTPUT",
"#doc.ents: (brad pit, Demi Moore)"
],
"thumb": "https://avatars.githubusercontent.com/u/961296?v=4",
"image": "",
"code_language": "python",
"author": "Martin Vallone",
"author_links": {
"github": "mjvallone",
"twitter": "vallotin",
"website": "https://fiqus.coop/"
},
"category": ["pipeline", "research", "standalone"],
"tags": ["spacy", "python", "nlp", "ner"]
}
],
"categories": [

View File

@ -34,6 +34,7 @@ const MODEL_META = {
core_sm: 'Vocabulary, syntax, entities',
dep: 'Vocabulary, syntax',
ent: 'Named entities',
sent: 'Sentence boundaries',
pytt: 'PyTorch Transformers',
trf: 'Transformers',
vectors: 'Word vectors',
@ -195,6 +196,7 @@ const Model = ({
const [isError, setIsError] = useState(true)
const [meta, setMeta] = useState({})
const { type, genre, size } = getModelComponents(name)
const display_type = type === 'core' && size === 'sm' ? 'core_sm' : type
const version = useMemo(() => getLatestVersion(name, compatibility, prereleases), [
name,
compatibility,
@ -231,7 +233,7 @@ const Model = ({
const rows = [
{ label: 'Language', tag: langId, content: langName },
{ label: 'Type', tag: type, content: MODEL_META[type] },
{ label: 'Type', tag: type, content: MODEL_META[display_type] },
{ label: 'Genre', tag: genre, content: MODEL_META[genre] },
{ label: 'Size', tag: size, content: meta.sizeFull },
{ label: 'Components', content: components, help: MODEL_META.components },