Merge remote-tracking branch 'upstream/master' into chore/update-develop-from-master-v3.1-3

This commit is contained in:
Adriane Boyd 2021-10-14 09:41:46 +02:00
commit d98d525bc8
39 changed files with 382 additions and 136 deletions

View File

@ -1,8 +1,11 @@
blank_issues_enabled: false blank_issues_enabled: false
contact_links: contact_links:
- name: ⚠️ Python 3.10 Support
url: https://github.com/explosion/spaCy/discussions/9418
about: Python 3.10 wheels haven't been released yet, see the link for details.
- name: 🗯 Discussions Forum - name: 🗯 Discussions Forum
url: https://github.com/explosion/spaCy/discussions url: https://github.com/explosion/spaCy/discussions
about: Usage questions, general discussion and anything else that isn't a bug report. about: Install issues, usage questions, general discussion and anything else that isn't a bug report.
- name: 📖 spaCy FAQ & Troubleshooting - name: 📖 spaCy FAQ & Troubleshooting
url: https://github.com/explosion/spaCy/discussions/8226 url: https://github.com/explosion/spaCy/discussions/8226
about: Before you post, check out the FAQ for answers to common community questions! about: Before you post, check out the FAQ for answers to common community questions!

View File

@ -100,3 +100,8 @@ steps:
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
displayName: 'Test assemble CLI vectors warning' displayName: 'Test assemble CLI vectors warning'
condition: eq(variables['python_version'], '3.8') condition: eq(variables['python_version'], '3.8')
- script: |
python .github/validate_universe_json.py website/meta/universe.json
displayName: 'Test website/meta/universe.json'
condition: eq(variables['python_version'], '3.8')

19
.github/validate_universe_json.py vendored Normal file
View File

@ -0,0 +1,19 @@
import json
import re
import sys
from pathlib import Path
def validate_json(document):
    """Check that every ``github`` field in a universe JSON file is ``user/repo``.

    document: Path (string or path-like) of the JSON file to validate. The
        file must contain an object with a ``"resources"`` list of entries.
    RAISES (AssertionError): If an entry's ``github`` value starts with
        ``http:`` or ``https:``, i.e. it is a URL rather than ``user/repo``.
    """
    universe_file = Path(document)
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # locale encoding, and release the file handle before validating.
    with universe_file.open(encoding="utf8") as f:
        universe_data = json.load(f)
    for entry in universe_data["resources"]:
        if "github" not in entry:
            continue
        if re.match(r"^(http:)|^(https:)", entry["github"]):
            # Raised explicitly (not via an ``assert`` statement) so the check
            # is not stripped when Python runs with ``-O``. The exception type
            # and message are the same as before.
            raise AssertionError("Github field should be user/repo, not a url")
if __name__ == "__main__":
    # The first command-line argument is the path of the JSON file to check.
    universe_path = str(sys.argv[1])
    validate_json(universe_path)

27
.github/workflows/explosionbot.yml vendored Normal file
View File

@ -0,0 +1,27 @@
name: Explosion Bot
on:
issue_comment:
types:
- created
- edited
jobs:
explosion-bot:
runs-on: ubuntu-18.04
steps:
- name: Dump GitHub context
env:
GITHUB_CONTEXT: ${{ toJson(github) }}
run: echo "$GITHUB_CONTEXT"
- uses: actions/checkout@v1
- uses: actions/setup-python@v1
- name: Install and run explosion-bot
run: |
pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot
python -m explosionbot
env:
INPUT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }}
INPUT_BK_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
ENABLED_COMMANDS: "test_gpu"
ALLOWED_TEAMS: "spaCy"

View File

@ -1,8 +0,0 @@
@software{spacy,
author = {Honnibal, Matthew and Montani, Ines and Van Landeghem, Sofie and Boyd, Adriane},
title = {{spaCy: Industrial-strength Natural Language Processing in Python}},
year = 2020,
publisher = {Zenodo},
doi = {10.5281/zenodo.1212303},
url = {https://doi.org/10.5281/zenodo.1212303}
}

16
CITATION.cff Normal file
View File

@ -0,0 +1,16 @@
cff-version: 1.2.0
preferred-citation:
type: article
message: "If you use spaCy, please cite it as below."
authors:
- family-names: "Honnibal"
given-names: "Matthew"
- family-names: "Montani"
given-names: "Ines"
- family-names: "Van Landeghem"
given-names: "Sofie"
- family-names: "Boyd"
given-names: "Adriane"
title: "spaCy: Industrial-strength Natural Language Processing in Python"
doi: "10.5281/zenodo.1212303"
year: 2020

View File

@ -1,5 +1,5 @@
recursive-include include *.h recursive-include include *.h
recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja *.toml
include LICENSE include LICENSE
include README.md include README.md
include pyproject.toml include pyproject.toml

View File

@ -16,6 +16,8 @@ pr:
exclude: exclude:
- "website/*" - "website/*"
- "*.md" - "*.md"
include:
- "website/meta/universe.json"
jobs: jobs:
# Perform basic checks for most important errors (syntax etc.) Uses the config # Perform basic checks for most important errors (syntax etc.) Uses the config

View File

@ -124,7 +124,8 @@ exclude =
[tool:pytest] [tool:pytest]
markers = markers =
slow slow: mark a test as slow
issue: reference specific issue
[mypy] [mypy]
ignore_missing_imports = True ignore_missing_imports = True

View File

@ -1,4 +1,4 @@
from typing import Optional from typing import Optional, Dict, Any
from pathlib import Path from pathlib import Path
from wasabi import msg from wasabi import msg
import typer import typer
@ -7,7 +7,7 @@ import sys
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu from ._util import import_code, setup_gpu
from ..training.loop import train from ..training.loop import train as train_nlp
from ..training.initialize import init_nlp from ..training.initialize import init_nlp
from .. import util from .. import util
@ -40,6 +40,18 @@ def train_cli(
DOCS: https://spacy.io/api/cli#train DOCS: https://spacy.io/api/cli#train
""" """
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
def train(
config_path: Path,
output_path: Optional[Path] = None,
*,
use_gpu: int = -1,
overrides: Dict[str, Any] = util.SimpleFrozenDict(),
):
# Make sure all files and paths exists if they are needed # Make sure all files and paths exists if they are needed
if not config_path or (str(config_path) != "-" and not config_path.exists()): if not config_path or (str(config_path) != "-" and not config_path.exists()):
msg.fail("Config file not found", config_path, exits=1) msg.fail("Config file not found", config_path, exits=1)
@ -50,8 +62,6 @@ def train_cli(
output_path.mkdir(parents=True) output_path.mkdir(parents=True)
msg.good(f"Created output directory: {output_path}") msg.good(f"Created output directory: {output_path}")
msg.info(f"Saving to output directory: {output_path}") msg.info(f"Saving to output directory: {output_path}")
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
setup_gpu(use_gpu) setup_gpu(use_gpu)
with show_validation_error(config_path): with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides, interpolate=False) config = util.load_config(config_path, overrides=overrides, interpolate=False)
@ -60,4 +70,4 @@ def train_cli(
nlp = init_nlp(config, use_gpu=use_gpu) nlp = init_nlp(config, use_gpu=use_gpu)
msg.good("Initialized pipeline") msg.good("Initialized pipeline")
msg.divider("Training pipeline") msg.divider("Training pipeline")
train(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr) train_nlp(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr)

View File

@ -25,7 +25,7 @@ def setup_default_warnings():
filter_warning("once", error_msg=Warnings.W036.format(name=pipe)) filter_warning("once", error_msg=Warnings.W036.format(name=pipe))
# warn once about lemmatizer without required POS # warn once about lemmatizer without required POS
filter_warning("once", error_msg="[W108]") filter_warning("once", error_msg=Warnings.W108)
def filter_warning(action: str, error_msg: str): def filter_warning(action: str, error_msg: str):
@ -170,8 +170,8 @@ class Warnings:
"call the {matcher} on each Doc object.") "call the {matcher} on each Doc object.")
W107 = ("The property `Doc.{prop}` is deprecated. Use " W107 = ("The property `Doc.{prop}` is deprecated. Use "
"`Doc.has_annotation(\"{attr}\")` instead.") "`Doc.has_annotation(\"{attr}\")` instead.")
W108 = ("The rule-based lemmatizer did not find POS annotation for the " W108 = ("The rule-based lemmatizer did not find POS annotation for one or "
"token '{text}'. Check that your pipeline includes components that " "more tokens. Check that your pipeline includes components that "
"assign token.pos, typically 'tagger'+'attribute_ruler' or " "assign token.pos, typically 'tagger'+'attribute_ruler' or "
"'morphologizer'.") "'morphologizer'.")
W109 = ("Unable to save user hooks while serializing the doc. Re-add any " W109 = ("Unable to save user hooks while serializing the doc. Re-add any "
@ -658,7 +658,9 @@ class Errors:
"{nO} - cannot add any more labels.") "{nO} - cannot add any more labels.")
E923 = ("It looks like there is no proper sample data to initialize the " E923 = ("It looks like there is no proper sample data to initialize the "
"Model of component '{name}'. To check your input data paths and " "Model of component '{name}'. To check your input data paths and "
"annotation, run: python -m spacy debug data config.cfg") "annotation, run: python -m spacy debug data config.cfg "
"and include the same config override values you would specify "
"for the 'spacy train' command.")
E924 = ("The '{name}' component does not seem to be initialized properly. " E924 = ("The '{name}' component does not seem to be initialized properly. "
"This is likely a bug in spaCy, so feel free to open an issue: " "This is likely a bug in spaCy, so feel free to open an issue: "
"https://github.com/explosion/spaCy/issues") "https://github.com/explosion/spaCy/issues")
@ -793,7 +795,7 @@ class Errors:
"to token boundaries.") "to token boundaries.")
E982 = ("The `Token.ent_iob` attribute should be an integer indexing " E982 = ("The `Token.ent_iob` attribute should be an integer indexing "
"into {values}, but found {value}.") "into {values}, but found {value}.")
E983 = ("Invalid key for '{dict}': {key}. Available keys: " E983 = ("Invalid key(s) for '{dict}': {key}. Available keys: "
"{keys}") "{keys}")
E984 = ("Invalid component config for '{name}': component block needs either " E984 = ("Invalid component config for '{name}': component block needs either "
"a key `factory` specifying the registered function used to " "a key `factory` specifying the registered function used to "

View File

@ -76,6 +76,6 @@ class CatalanLemmatizer(Lemmatizer):
forms.append(self.lookup_lemmatize(token)[0]) forms.append(self.lookup_lemmatize(token)[0])
if not forms: if not forms:
forms.append(string) forms.append(string)
forms = list(set(forms)) forms = list(dict.fromkeys(forms))
self.cache[cache_key] = forms self.cache[cache_key] = forms
return forms return forms

View File

@ -75,6 +75,6 @@ class FrenchLemmatizer(Lemmatizer):
forms.append(self.lookup_lemmatize(token)[0]) forms.append(self.lookup_lemmatize(token)[0])
if not forms: if not forms:
forms.append(string) forms.append(string)
forms = list(set(forms)) forms = list(dict.fromkeys(forms))
self.cache[cache_key] = forms self.cache[cache_key] = forms
return forms return forms

View File

@ -97,7 +97,7 @@ class DutchLemmatizer(Lemmatizer):
return forms return forms
else: else:
oov_forms.append(form) oov_forms.append(form)
forms = list(set(oov_forms)) forms = list(dict.fromkeys(oov_forms))
# Back-off through remaining return value candidates. # Back-off through remaining return value candidates.
if forms: if forms:
for form in forms: for form in forms:

View File

@ -58,7 +58,7 @@ class RussianLemmatizer(Lemmatizer):
if not len(filtered_analyses): if not len(filtered_analyses):
return [string.lower()] return [string.lower()]
if morphology is None or (len(morphology) == 1 and POS in morphology): if morphology is None or (len(morphology) == 1 and POS in morphology):
return list(set([analysis.normal_form for analysis in filtered_analyses])) return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]))
if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"): if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
features_to_compare = ["Case", "Number", "Gender"] features_to_compare = ["Case", "Number", "Gender"]
elif univ_pos == "NUM": elif univ_pos == "NUM":
@ -89,7 +89,7 @@ class RussianLemmatizer(Lemmatizer):
filtered_analyses.append(analysis) filtered_analyses.append(analysis)
if not len(filtered_analyses): if not len(filtered_analyses):
return [string.lower()] return [string.lower()]
return list(set([analysis.normal_form for analysis in filtered_analyses])) return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses]))
def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]: def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
string = token.text string = token.text

View File

@ -707,8 +707,9 @@ class Language:
source_config = source.config.interpolate() source_config = source.config.interpolate()
pipe_config = util.copy_config(source_config["components"][source_name]) pipe_config = util.copy_config(source_config["components"][source_name])
self._pipe_configs[name] = pipe_config self._pipe_configs[name] = pipe_config
for s in source.vocab.strings: if self.vocab.strings != source.vocab.strings:
self.vocab.strings.add(s) for s in source.vocab.strings:
self.vocab.strings.add(s)
return pipe, pipe_config["factory"] return pipe, pipe_config["factory"]
def add_pipe( def add_pipe(
@ -1379,6 +1380,9 @@ class Language:
scorer = Scorer(**kwargs) scorer = Scorer(**kwargs)
# reset annotation in predicted docs and time tokenization # reset annotation in predicted docs and time tokenization
start_time = timer() start_time = timer()
# this is purely for timing
for eg in examples:
self.make_doc(eg.reference.text)
# apply all pipeline components # apply all pipeline components
for name, pipe in self.pipeline: for name, pipe in self.pipeline:
kwargs = component_cfg.get(name, {}) kwargs = component_cfg.get(name, {})
@ -1708,6 +1712,7 @@ class Language:
# them here so they're only loaded once # them here so they're only loaded once
source_nlps = {} source_nlps = {}
source_nlp_vectors_hashes = {} source_nlp_vectors_hashes = {}
vocab_b = None
for pipe_name in config["nlp"]["pipeline"]: for pipe_name in config["nlp"]["pipeline"]:
if pipe_name not in pipeline: if pipe_name not in pipeline:
opts = ", ".join(pipeline.keys()) opts = ", ".join(pipeline.keys())
@ -1730,14 +1735,22 @@ class Language:
raw_config=raw_config, raw_config=raw_config,
) )
else: else:
# We need the sourced components to reference the same
# vocab without modifying the current vocab state **AND**
# we still want to load the source model vectors to perform
# the vectors check. Since the source vectors clobber the
# current ones, we save the original vocab state and
# restore after this loop. Existing strings are preserved
# during deserialization, so they do not need any
# additional handling.
if vocab_b is None:
vocab_b = nlp.vocab.to_bytes(exclude=["lookups", "strings"])
model = pipe_cfg["source"] model = pipe_cfg["source"]
if model not in source_nlps: if model not in source_nlps:
# We only need the components here and we intentionally # Load with the same vocab, adding any strings
# do not load the model with the same vocab because source_nlps[model] = util.load_model(
# this would cause the vectors to be copied into the model, vocab=nlp.vocab, exclude=["lookups"]
# current nlp object (all the strings will be added in )
# create_pipe_from_source)
source_nlps[model] = util.load_model(model)
source_name = pipe_cfg.get("component", pipe_name) source_name = pipe_cfg.get("component", pipe_name)
listeners_replaced = False listeners_replaced = False
if "replace_listeners" in pipe_cfg: if "replace_listeners" in pipe_cfg:
@ -1764,6 +1777,9 @@ class Language:
# Delete from cache if listeners were replaced # Delete from cache if listeners were replaced
if listeners_replaced: if listeners_replaced:
del source_nlps[model] del source_nlps[model]
# Restore the original vocab after sourcing if necessary
if vocab_b is not None:
nlp.vocab.from_bytes(vocab_b)
disabled_pipes = [*config["nlp"]["disabled"], *disable] disabled_pipes = [*config["nlp"]["disabled"], *disable]
nlp._disabled = set(p for p in disabled_pipes if p not in exclude) nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
nlp.batch_size = config["nlp"]["batch_size"] nlp.batch_size = config["nlp"]["batch_size"]

View File

@ -177,13 +177,14 @@ cdef class DependencyMatcher:
# Add 'RIGHT_ATTRS' to self._patterns[key] # Add 'RIGHT_ATTRS' to self._patterns[key]
_patterns = [[[pat["RIGHT_ATTRS"]] for pat in pattern] for pattern in patterns] _patterns = [[[pat["RIGHT_ATTRS"]] for pat in pattern] for pattern in patterns]
pattern_offset = len(self._patterns[key])
self._patterns[key].extend(_patterns) self._patterns[key].extend(_patterns)
# Add each node pattern of all the input patterns individually to the # Add each node pattern of all the input patterns individually to the
# matcher. This enables only a single instance of Matcher to be used. # matcher. This enables only a single instance of Matcher to be used.
# Multiple adds are required to track each node pattern. # Multiple adds are required to track each node pattern.
tokens_to_key_list = [] tokens_to_key_list = []
for i, current_patterns in enumerate(_patterns): for i, current_patterns in enumerate(_patterns, start=pattern_offset):
# Preallocate list space # Preallocate list space
tokens_to_key = [None] * len(current_patterns) tokens_to_key = [None] * len(current_patterns)
@ -263,7 +264,9 @@ cdef class DependencyMatcher:
self._raw_patterns.pop(key) self._raw_patterns.pop(key)
self._tree.pop(key) self._tree.pop(key)
self._root.pop(key) self._root.pop(key)
self._tokens_to_key.pop(key) for mklist in self._tokens_to_key.pop(key):
for mkey in mklist:
self._matcher.remove(mkey)
def _get_keys_to_position_maps(self, doc): def _get_keys_to_position_maps(self, doc):
""" """

View File

@ -208,7 +208,7 @@ class Lemmatizer(Pipe):
univ_pos = token.pos_.lower() univ_pos = token.pos_.lower()
if univ_pos in ("", "eol", "space"): if univ_pos in ("", "eol", "space"):
if univ_pos == "": if univ_pos == "":
warnings.warn(Warnings.W108.format(text=string)) warnings.warn(Warnings.W108)
return [string.lower()] return [string.lower()]
# See Issue #435 for example of where this logic is requied. # See Issue #435 for example of where this logic is requied.
if self.is_base_form(token): if self.is_base_form(token):

View File

@ -4,6 +4,7 @@ from spacy.util import get_lang_class
def pytest_addoption(parser): def pytest_addoption(parser):
parser.addoption("--slow", action="store_true", help="include slow tests") parser.addoption("--slow", action="store_true", help="include slow tests")
parser.addoption("--issue", action="store", help="test specific issues")
def pytest_runtest_setup(item): def pytest_runtest_setup(item):
@ -16,10 +17,24 @@ def pytest_runtest_setup(item):
# options weren't given. # options weren't given.
return item.config.getoption(f"--{opt}", False) return item.config.getoption(f"--{opt}", False)
# Integration of boolean flags
for opt in ["slow"]: for opt in ["slow"]:
if opt in item.keywords and not getopt(opt): if opt in item.keywords and not getopt(opt):
pytest.skip(f"need --{opt} option to run") pytest.skip(f"need --{opt} option to run")
# Special integration to mark tests with issue numbers
issues = getopt("issue")
if isinstance(issues, str):
if "issue" in item.keywords:
# Convert issues provided on the CLI to list of ints
issue_nos = [int(issue.strip()) for issue in issues.split(",")]
# Get all issues specified by decorators and check if they're provided
issue_refs = [mark.args[0] for mark in item.iter_markers(name="issue")]
if not any([ref in issue_nos for ref in issue_refs]):
pytest.skip(f"not referencing specified issues: {issue_nos}")
else:
pytest.skip("not referencing any issues")
# Fixtures for language tokenizers (languages sorted alphabetically) # Fixtures for language tokenizers (languages sorted alphabetically)

View File

@ -368,3 +368,87 @@ def test_dependency_matcher_span_user_data(en_tokenizer):
assert doc_match[0] == span_match[0] assert doc_match[0] == span_match[0]
for doc_t_i, span_t_i in zip(doc_match[1], span_match[1]): for doc_t_i, span_t_i in zip(doc_match[1], span_match[1]):
assert doc_t_i == span_t_i + offset assert doc_t_i == span_t_i + offset
def test_dependency_matcher_order_issue(en_tokenizer):
    """Regression test for issue #9263.

    Neither pattern matches the doc completely, so no match may be reported,
    regardless of whether the two patterns are registered under the key in a
    single ``add`` call or in two consecutive ``add`` calls.
    """
    doc = en_tokenizer("I like text")
    doc[2].head = doc[1]

    # Token attributes fit the doc, but the relation operator does not.
    attrs_only = [
        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}},
        {
            "LEFT_ID": "root",
            "RIGHT_ID": "r",
            "RIGHT_ATTRS": {"ORTH": "text"},
            "REL_OP": "<",
        },
    ]
    # The relation operator fits the doc, but the token attributes do not.
    rel_op_only = [
        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}},
        {
            "LEFT_ID": "root",
            "RIGHT_ID": "r",
            "RIGHT_ATTRS": {"ORTH": "fish"},
            "REL_OP": ">",
        },
    ]

    # Case 1: both patterns registered together.
    batch_matcher = DependencyMatcher(en_tokenizer.vocab)
    batch_matcher.add("check", [attrs_only, rel_op_only])
    assert batch_matcher(doc) == []

    # Case 2: a fresh matcher, patterns registered one at a time under the
    # same key. This must give the same (empty) result as case 1.
    stepwise_matcher = DependencyMatcher(en_tokenizer.vocab)
    for single_pattern in (attrs_only, rel_op_only):
        stepwise_matcher.add("check", [single_pattern])
    assert stepwise_matcher(doc) == []
def test_dependency_matcher_remove(en_tokenizer):
    """Regression test for issue #9263.

    After a key is removed, a different pattern added under the same key
    must not produce matches that belong to the removed pattern.
    """
    doc = en_tokenizer("The red book")
    doc[1].head = doc[2]

    matcher = DependencyMatcher(en_tokenizer.vocab)

    # A pattern that matches the doc: "book" with the child "red".
    matching_pattern = [
        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "book"}},
        {
            "LEFT_ID": "root",
            "RIGHT_ID": "r",
            "RIGHT_ATTRS": {"ORTH": "red"},
            "REL_OP": ">",
        },
    ]
    # Register it and drop it again straight away.
    matcher.add("check", [matching_pattern])
    matcher.remove("check")

    # Same relation operator, but token texts that do not occur in the doc.
    non_matching_pattern = [
        {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "flag"}},
        {
            "LEFT_ID": "root",
            "RIGHT_ID": "r",
            "RIGHT_ATTRS": {"ORTH": "blue"},
            "REL_OP": ">",
        },
    ]
    # Reusing the key for the new pattern must not yield any match.
    matcher.add("check", [non_matching_pattern])
    assert matcher(doc) == []

View File

View File

@ -114,7 +114,7 @@ def test_make_spangroup(max_positive, nr_results):
doc = nlp.make_doc("Greater London") doc = nlp.make_doc("Greater London")
ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2]) ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2])
indices = ngram_suggester([doc])[0].dataXd indices = ngram_suggester([doc])[0].dataXd
assert_array_equal(indices, numpy.asarray([[0, 1], [1, 2], [0, 2]])) assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
labels = ["Thing", "City", "Person", "GreatCity"] labels = ["Thing", "City", "Person", "GreatCity"]
scores = numpy.asarray( scores = numpy.asarray(
[[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f" [[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f"

View File

@ -49,8 +49,8 @@ def test_issue5551(textcat_config):
# All results should be the same because of the fixed seed # All results should be the same because of the fixed seed
assert len(results) == 3 assert len(results) == 3
ops = get_current_ops() ops = get_current_ops()
assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1])) assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]), decimal=5)
assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2])) assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5)
def test_issue5838(): def test_issue5838():

View File

@ -0,0 +1,54 @@
import pytest
from thinc.api import Adam
from spacy.attrs import NORM
from spacy.vocab import Vocab
from spacy import registry
from spacy.training import Example
from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
from spacy.tokens import Doc
from spacy.pipeline import DependencyParser
@pytest.fixture
def vocab():
    """Provide a new ``Vocab`` whose NORM getter returns its input unchanged."""

    def keep_as_is(s):
        return s

    return Vocab(lex_attr_getters={NORM: keep_as_is})
def _parser_example(parser):
    """Build the four-token ``Example`` that is used to initialize the parser.

    parser: Object whose ``vocab`` attribute is used to create the ``Doc``.
    RETURNS: The value of ``Example.from_dict`` for the words a-d together
        with the gold heads and dependency labels below.
    """
    words = ["a", "b", "c", "d"]
    annotations = {
        "heads": [1, 1, 3, 3],
        "deps": ["right", "ROOT", "left", "ROOT"],
    }
    return Example.from_dict(Doc(parser.vocab, words=words), annotations)
@pytest.fixture
def parser(vocab):
    """Provide a ``DependencyParser`` that was updated ten times on a toy example."""
    vocab.strings.add("ROOT")
    resolved = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)
    dep_parser = DependencyParser(vocab, resolved["model"])
    dep_parser.cfg["token_vector_width"] = 4
    dep_parser.cfg["hidden_width"] = 32
    # Only "left" is added by hand; "right" occurs solely in the example
    # produced by _parser_example that is passed to initialize().
    dep_parser.add_label("left")

    def get_examples():
        return [_parser_example(dep_parser)]

    dep_parser.initialize(get_examples)
    optimizer = Adam(0.001)
    for _ in range(10):
        # A fresh Doc, gold dict and losses dict for every update step.
        training_doc = Doc(vocab, words=["a", "b", "c", "d"])
        example = Example.from_dict(
            training_doc,
            {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]},
        )
        dep_parser.update([example], sgd=optimizer, losses={})
    return dep_parser
@pytest.mark.xfail(reason="Not fixed yet")
def test_partial_annotation(parser):
    """A sentence-start value set before parsing should still hold afterwards.

    Marked as an expected failure because the behaviour is not fixed yet.
    """
    unparsed = Doc(parser.vocab, words=["a", "b", "c", "d"])
    unparsed[2].is_sent_start = False
    # Note from the original author: when `doc[3].is_sent_start = False` is
    # set at this point, doc[2].is_sent_start == False holds after parsing.
    parsed = parser(unparsed)
    # Compared with `==` on purpose so that the check stays exactly as written.
    assert parsed[2].is_sent_start == False  # noqa: E712

View File

@ -1,6 +1,8 @@
import pytest
from spacy.lang.en import English from spacy.lang.en import English
@pytest.mark.issue(8168)
def test_issue8168(): def test_issue8168():
nlp = English() nlp = English()
ruler = nlp.add_pipe("entity_ruler") ruler = nlp.add_pipe("entity_ruler")

View File

@ -193,6 +193,7 @@ def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X):
assert_array_almost_equal( assert_array_almost_equal(
model1.ops.to_numpy(get_all_params(model1)), model1.ops.to_numpy(get_all_params(model1)),
model2.ops.to_numpy(get_all_params(model2)), model2.ops.to_numpy(get_all_params(model2)),
decimal=5,
) )

View File

@ -82,15 +82,15 @@ def test_cat_readers(reader, additional_config):
[nlp] [nlp]
lang = "en" lang = "en"
pipeline = ["tok2vec", "textcat"] pipeline = ["tok2vec", "textcat_multilabel"]
[components] [components]
[components.tok2vec] [components.tok2vec]
factory = "tok2vec" factory = "tok2vec"
[components.textcat] [components.textcat_multilabel]
factory = "textcat" factory = "textcat_multilabel"
""" """
config = Config().from_str(nlp_config_string) config = Config().from_str(nlp_config_string)
config["corpora"]["@readers"] = reader config["corpora"]["@readers"] = reader

View File

@ -8,7 +8,7 @@ from thinc.api import NumpyOps
from .doc import Doc from .doc import Doc
from ..vocab import Vocab from ..vocab import Vocab
from ..compat import copy_reg from ..compat import copy_reg
from ..attrs import SPACY, ORTH, intify_attr from ..attrs import SPACY, ORTH, intify_attr, IDS
from ..errors import Errors from ..errors import Errors
from ..util import ensure_path, SimpleFrozenList from ..util import ensure_path, SimpleFrozenList
@ -64,7 +64,13 @@ class DocBin:
DOCS: https://spacy.io/api/docbin#init DOCS: https://spacy.io/api/docbin#init
""" """
attrs = sorted([intify_attr(attr) for attr in attrs]) int_attrs = [intify_attr(attr) for attr in attrs]
if None in int_attrs:
non_valid = [attr for attr in attrs if intify_attr(attr) is None]
raise KeyError(
Errors.E983.format(dict="attrs", key=non_valid, keys=IDS.keys())
) from None
attrs = sorted(int_attrs)
self.version = "0.1" self.version = "0.1"
self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY] self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0] self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0]

View File

@ -1,4 +1,4 @@
from .corpus import Corpus # noqa: F401 from .corpus import Corpus, JsonlCorpus # noqa: F401
from .example import Example, validate_examples, validate_get_examples # noqa: F401 from .example import Example, validate_examples, validate_get_examples # noqa: F401
from .alignment import Alignment # noqa: F401 from .alignment import Alignment # noqa: F401
from .augment import dont_augment, orth_variants_augmenter # noqa: F401 from .augment import dont_augment, orth_variants_augmenter # noqa: F401

View File

@ -144,7 +144,12 @@ def load_vectors_into_model(
) -> None: ) -> None:
"""Load word vectors from an installed model or path into a model instance.""" """Load word vectors from an installed model or path into a model instance."""
try: try:
vectors_nlp = load_model(name) # Load with the same vocab, which automatically adds the vectors to
# the current nlp object. Exclude lookups so they are not modified.
exclude = ["lookups"]
if not add_strings:
exclude.append("strings")
vectors_nlp = load_model(name, vocab=nlp.vocab, exclude=exclude)
except ConfigValidationError as e: except ConfigValidationError as e:
title = f"Config validation error for vectors {name}" title = f"Config validation error for vectors {name}"
desc = ( desc = (
@ -158,15 +163,8 @@ def load_vectors_into_model(
if len(vectors_nlp.vocab.vectors.keys()) == 0: if len(vectors_nlp.vocab.vectors.keys()) == 0:
logger.warning(Warnings.W112.format(name=name)) logger.warning(Warnings.W112.format(name=name))
nlp.vocab.vectors = vectors_nlp.vocab.vectors
for lex in nlp.vocab: for lex in nlp.vocab:
lex.rank = nlp.vocab.vectors.key2row.get(lex.orth, OOV_RANK) lex.rank = nlp.vocab.vectors.key2row.get(lex.orth, OOV_RANK)
if add_strings:
# I guess we should add the strings from the vectors_nlp model?
# E.g. if someone does a similarity query, they might expect the strings.
for key in nlp.vocab.vectors.key2row:
if key in vectors_nlp.vocab.strings:
nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
def init_tok2vec( def init_tok2vec(

View File

@ -1475,7 +1475,7 @@ def get_arg_names(func: Callable) -> List[str]:
RETURNS (List[str]): The argument names. RETURNS (List[str]): The argument names.
""" """
argspec = inspect.getfullargspec(func) argspec = inspect.getfullargspec(func)
return list(set([*argspec.args, *argspec.kwonlyargs])) return list(dict.fromkeys([*argspec.args, *argspec.kwonlyargs]))
def combine_score_weights( def combine_score_weights(

View File

@ -530,7 +530,6 @@ cdef class Vocab:
setters = { setters = {
"strings": lambda b: self.strings.from_bytes(b), "strings": lambda b: self.strings.from_bytes(b),
"lexemes": lambda b: self.lexemes_from_bytes(b),
"vectors": lambda b: serialize_vectors(b), "vectors": lambda b: serialize_vectors(b),
"lookups": lambda b: self.lookups.from_bytes(b), "lookups": lambda b: self.lookups.from_bytes(b),
} }

View File

@ -260,16 +260,18 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type]
| Name | Description | | Name | Description |
| ------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- | | ------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- |
| `input_file` | Input file. ~~Path (positional)~~ | | `input_path` | Input file or directory. ~~Path (positional)~~ |
| `output_dir` | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(option)~~ | | `output_dir` | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(option)~~ |
| `--converter`, `-c` <Tag variant="new">2</Tag> | Name of converter to use (see below). ~~str (option)~~ | | `--converter`, `-c` <Tag variant="new">2</Tag> | Name of converter to use (see below). ~~str (option)~~ |
| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ | | `--file-type`, `-t` <Tag variant="new">2.1</Tag> | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ |
| `--n-sents`, `-n` | Number of sentences per document. Supported for: `conll`, `conllu`, `iob`, `ner` ~~int (option)~~ | | `--n-sents`, `-n` | Number of sentences per document. Supported for: `conll`, `conllu`, `iob`, `ner` ~~int (option)~~ |
| `--seg-sents`, `-s` <Tag variant="new">2.2</Tag> | Segment sentences. Supported for: `conll`, `ner` ~~bool (flag)~~ | | `--seg-sents`, `-s` <Tag variant="new">2.2</Tag> | Segment sentences. Supported for: `conll`, `ner` ~~bool (flag)~~ |
| `--base`, `-b` | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str](option)~~ | | `--base`, `-b`, `--model` | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str](option)~~ |
| `--morphology`, `-m` | Enable appending morphology to tags. Supported for: `conllu` ~~bool (flag)~~ | | `--morphology`, `-m` | Enable appending morphology to tags. Supported for: `conllu` ~~bool (flag)~~ |
| `--merge-subtokens`, `-T` | Merge CoNLL-U subtokens ~~bool (flag)~~ |
| `--ner-map`, `-nm` | NER tag mapping (as JSON-encoded dict of entity types). Supported for: `conllu` ~~Optional[Path](option)~~ | | `--ner-map`, `-nm` | NER tag mapping (as JSON-encoded dict of entity types). Supported for: `conllu` ~~Optional[Path](option)~~ |
| `--lang`, `-l` <Tag variant="new">2.1</Tag> | Language code (if tokenizer required). ~~Optional[str] \(option)~~ | | `--lang`, `-l` <Tag variant="new">2.1</Tag> | Language code (if tokenizer required). ~~Optional[str] \(option)~~ |
| `--concatenate`, `-C` | Concatenate output to a single file ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train). | | **CREATES** | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train). |

View File

@ -474,8 +474,8 @@ The L2 norm of the token's vector representation.
| `like_email` | Does the token resemble an email address? ~~bool~~ | | `like_email` | Does the token resemble an email address? ~~bool~~ |
| `is_oov` | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~ | | `is_oov` | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~ |
| `is_stop` | Is the token part of a "stop list"? ~~bool~~ | | `is_stop` | Is the token part of a "stop list"? ~~bool~~ |
| `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~int~~ | | `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~int~~ |
| `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~str~~ | | `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~str~~ |
| `tag` | Fine-grained part-of-speech. ~~int~~ | | `tag` | Fine-grained part-of-speech. ~~int~~ |
| `tag_` | Fine-grained part-of-speech. ~~str~~ | | `tag_` | Fine-grained part-of-speech. ~~str~~ |
| `morph` <Tag variant="new">3</Tag> | Morphological analysis. ~~MorphAnalysis~~ | | `morph` <Tag variant="new">3</Tag> | Morphological analysis. ~~MorphAnalysis~~ |

View File

@ -325,6 +325,5 @@ serialization by passing in the string names via the `exclude` argument.
| Name | Description | | Name | Description |
| --------- | ----------------------------------------------------- | | --------- | ----------------------------------------------------- |
| `strings` | The strings in the [`StringStore`](/api/stringstore). | | `strings` | The strings in the [`StringStore`](/api/stringstore). |
| `lexemes` | The lexeme data. |
| `vectors` | The word vectors, if available. | | `vectors` | The word vectors, if available. |
| `lookups` | The lookup tables, if available. | | `lookups` | The lookup tables, if available. |

View File

@ -25,7 +25,7 @@ for token in doc:
> - **Text:** The original word text. > - **Text:** The original word text.
> - **Lemma:** The base form of the word. > - **Lemma:** The base form of the word.
> - **POS:** The simple [UPOS](https://universaldependencies.org/docs/u/pos/) > - **POS:** The simple [UPOS](https://universaldependencies.org/u/pos/)
> part-of-speech tag. > part-of-speech tag.
> - **Tag:** The detailed part-of-speech tag. > - **Tag:** The detailed part-of-speech tag.
> - **Dep:** Syntactic dependency, i.e. the relation between tokens. > - **Dep:** Syntactic dependency, i.e. the relation between tokens.

View File

@ -284,7 +284,9 @@ $ python -m pytest --pyargs %%SPACY_PKG_NAME --slow # basic and slow test
## Troubleshooting guide {#troubleshooting} ## Troubleshooting guide {#troubleshooting}
This section collects some of the most common errors you may come across when This section collects some of the most common errors you may come across when
installing, loading and using spaCy, as well as their solutions. installing, loading and using spaCy, as well as their solutions. Also see the
[Discussions FAQ Thread](https://github.com/explosion/spaCy/discussions/8226),
which is updated more frequently and covers more transitory issues.
> #### Help us improve this guide > #### Help us improve this guide
> >
@ -311,62 +313,6 @@ language's `Language` class instead, for example
</Accordion> </Accordion>
<Accordion title="No such option: --no-cache-dir" id="no-cache-dir">
```
no such option: --no-cache-dir
```
The `download` command uses pip to install the pipeline packages and sets the
`--no-cache-dir` flag to prevent it from requiring too much memory.
[This setting](https://pip.pypa.io/en/stable/reference/pip_install/#caching)
requires pip v6.0 or newer. Run `pip install -U pip` to upgrade to the latest
version of pip. To see which version you have installed, run `pip --version`.
</Accordion>
<Accordion title="sre_constants.error: bad character range" id="narrow-unicode">
```
sre_constants.error: bad character range
```
In [v2.1](/usage/v2-1), spaCy changed its implementation of regular expressions
for tokenization to make it up to 2-3 times faster. But this also means that
it's very important now that you run spaCy with a wide unicode build of Python.
This means that the build has 1114111 unicode characters available, instead of
only 65535 in a narrow unicode build. You can check this by running the
following command:
```bash
$ python -c "import sys; print(sys.maxunicode)"
```
If you're running a narrow unicode build, reinstall Python and use a wide
unicode build instead. You can also rebuild Python and set the
`--enable-unicode=ucs4` flag.
</Accordion>
<Accordion title="Unknown locale: UTF-8" id="unknown-locale">
```
ValueError: unknown locale: UTF-8
```
This error can sometimes occur on OSX and is likely related to a still
unresolved [Python bug](https://bugs.python.org/issue18378). However, it's easy
to fix: just add the following to your `~/.bash_profile` or `~/.zshrc` and then
run `source ~/.bash_profile` or `source ~/.zshrc`. Make sure to add **both
lines** for `LC_ALL` and `LANG`.
```bash
$ export LC_ALL=en_US.UTF-8
$ export LANG=en_US.UTF-8
```
</Accordion>
<Accordion title="Import error: No module named spacy" id="import-error"> <Accordion title="Import error: No module named spacy" id="import-error">
``` ```

View File

@ -1363,20 +1363,19 @@
"url": "https://explosion.ai/demos/sense2vec", "url": "https://explosion.ai/demos/sense2vec",
"code_example": [ "code_example": [
"import spacy", "import spacy",
"from sense2vec import Sense2VecComponent",
"", "",
"nlp = spacy.load('en')", "nlp = spacy.load(\"en_core_web_sm\")",
"s2v = Sense2VecComponent('/path/to/reddit_vectors-1.1.0')", "s2v = nlp.add_pipe(\"sense2vec\")",
"nlp.add_pipe(s2v)", "s2v.from_disk(\"/path/to/s2v_reddit_2015_md\")",
"", "",
"doc = nlp(\"A sentence about natural language processing.\")", "doc = nlp(\"A sentence about natural language processing.\")",
"assert doc[3].text == 'natural language processing'", "assert doc[3:6].text == \"natural language processing\"",
"freq = doc[3]._.s2v_freq", "freq = doc[3:6]._.s2v_freq",
"vector = doc[3]._.s2v_vec", "vector = doc[3:6]._.s2v_vec",
"most_similar = doc[3]._.s2v_most_similar(3)", "most_similar = doc[3:6]._.s2v_most_similar(3)",
"# [(('natural language processing', 'NOUN'), 1.0),", "# [(('machine learning', 'NOUN'), 0.8986967),",
"# (('machine learning', 'NOUN'), 0.8986966609954834),", "# (('computer vision', 'NOUN'), 0.8636297),",
"# (('computer vision', 'NOUN'), 0.8636297583580017)]" "# (('deep learning', 'NOUN'), 0.8573361)]"
], ],
"category": ["pipeline", "standalone", "visualizers"], "category": ["pipeline", "standalone", "visualizers"],
"tags": ["vectors"], "tags": ["vectors"],
@ -2970,11 +2969,10 @@
"github": "thomasthiebaud/spacy-fastlang", "github": "thomasthiebaud/spacy-fastlang",
"pip": "spacy_fastlang", "pip": "spacy_fastlang",
"code_example": [ "code_example": [
"import spacy", "import spacy_fastlang",
"from spacy_fastlang import LanguageDetector",
"", "",
"nlp = spacy.load('en_core_web_sm')", "nlp = spacy.load(\"en_core_web_sm\")",
"nlp.add_pipe(LanguageDetector())", "nlp.add_pipe(\"language_detector\")",
"doc = nlp('Life is like a box of chocolates. You never know what you are gonna get.')", "doc = nlp('Life is like a box of chocolates. You never know what you are gonna get.')",
"", "",
"assert doc._.language == 'en'", "assert doc._.language == 'en'",
@ -3476,6 +3474,50 @@
"github": "bbieniek" "github": "bbieniek"
}, },
"category": ["apis"] "category": ["apis"]
},
{
"id": "phruzz_matcher",
"title": "phruzz-matcher",
"slogan": "Phrase matcher using RapidFuzz",
"description": "Combination of the RapidFuzz library with the spaCy PhraseMatcher. The goal of this component is to find matches when there were NO \"perfect matches\" due to typos or abbreviations between a spaCy doc and a list of phrases.",
"github": "mjvallone/phruzz-matcher",
"pip": "phruzz_matcher",
"code_example": [
"import spacy",
"from spacy.language import Language",
"from phruzz_matcher.phrase_matcher import PhruzzMatcher",
"",
"famous_people = [",
" \"Brad Pitt\",",
" \"Demi Moore\",",
" \"Bruce Willis\",",
" \"Jim Carrey\",",
"]",
"",
"@Language.factory(\"phrase_matcher\")",
"def phrase_matcher(nlp: Language, name: str):",
" return PhruzzMatcher(nlp, famous_people, \"FAMOUS_PEOPLE\", 85)",
"",
"nlp = spacy.blank('es')",
"nlp.add_pipe(\"phrase_matcher\")",
"",
"doc = nlp(\"El otro día fui a un bar donde vi a brad pit y a Demi Moore, estaban tomando unas cervezas mientras charlaban de sus asuntos.\")",
"print(f\"doc.ents: {doc.ents}\")",
"",
"#OUTPUT",
"#doc.ents: (brad pit, Demi Moore)"
],
"thumb": "https://avatars.githubusercontent.com/u/961296?v=4",
"image": "",
"code_language": "python",
"author": "Martin Vallone",
"author_links": {
"github": "mjvallone",
"twitter": "vallotin",
"website": "https://fiqus.coop/"
},
"category": ["pipeline", "research", "standalone"],
"tags": ["spacy", "python", "nlp", "ner"]
} }
], ],

View File

@ -34,6 +34,7 @@ const MODEL_META = {
core_sm: 'Vocabulary, syntax, entities', core_sm: 'Vocabulary, syntax, entities',
dep: 'Vocabulary, syntax', dep: 'Vocabulary, syntax',
ent: 'Named entities', ent: 'Named entities',
sent: 'Sentence boundaries',
pytt: 'PyTorch Transformers', pytt: 'PyTorch Transformers',
trf: 'Transformers', trf: 'Transformers',
vectors: 'Word vectors', vectors: 'Word vectors',
@ -195,6 +196,7 @@ const Model = ({
const [isError, setIsError] = useState(true) const [isError, setIsError] = useState(true)
const [meta, setMeta] = useState({}) const [meta, setMeta] = useState({})
const { type, genre, size } = getModelComponents(name) const { type, genre, size } = getModelComponents(name)
const display_type = type === 'core' && size === 'sm' ? 'core_sm' : type
const version = useMemo(() => getLatestVersion(name, compatibility, prereleases), [ const version = useMemo(() => getLatestVersion(name, compatibility, prereleases), [
name, name,
compatibility, compatibility,
@ -231,7 +233,7 @@ const Model = ({
const rows = [ const rows = [
{ label: 'Language', tag: langId, content: langName }, { label: 'Language', tag: langId, content: langName },
{ label: 'Type', tag: type, content: MODEL_META[type] }, { label: 'Type', tag: type, content: MODEL_META[display_type] },
{ label: 'Genre', tag: genre, content: MODEL_META[genre] }, { label: 'Genre', tag: genre, content: MODEL_META[genre] },
{ label: 'Size', tag: size, content: meta.sizeFull }, { label: 'Size', tag: size, content: meta.sizeFull },
{ label: 'Components', content: components, help: MODEL_META.components }, { label: 'Components', content: components, help: MODEL_META.components },