diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 09de1cd05..fce1a1064 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,8 +1,11 @@ blank_issues_enabled: false contact_links: + - name: ⚠️ Python 3.10 Support + url: https://github.com/explosion/spaCy/discussions/9418 + about: Python 3.10 wheels haven't been released yet, see the link for details. - name: 🗯 Discussions Forum url: https://github.com/explosion/spaCy/discussions - about: Usage questions, general discussion and anything else that isn't a bug report. + about: Install issues, usage questions, general discussion and anything else that isn't a bug report. - name: 📖 spaCy FAQ & Troubleshooting url: https://github.com/explosion/spaCy/discussions/8226 about: Before you post, check out the FAQ for answers to common community questions! diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index 50e81799e..543804b9f 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -100,3 +100,8 @@ steps: python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 displayName: 'Test assemble CLI vectors warning' condition: eq(variables['python_version'], '3.8') + + - script: | + python .github/validate_universe_json.py website/meta/universe.json + displayName: 'Test website/meta/universe.json' + condition: eq(variables['python_version'], '3.8') diff --git a/.github/validate_universe_json.py b/.github/validate_universe_json.py new file mode 100644 index 000000000..b96b7b347 --- /dev/null +++ b/.github/validate_universe_json.py @@ -0,0 +1,19 @@ +import json +import re +import sys +from pathlib import Path + + +def validate_json(document): + universe_file = Path(document) + with universe_file.open() as f: + universe_data = json.load(f) + for entry in universe_data["resources"]: + if "github" in entry: + assert not re.match( + r"^(http:)|^(https:)", entry["github"] + ), "Github field should be user/repo, not a 
url" + + +if __name__ == "__main__": + validate_json(str(sys.argv[1])) diff --git a/.github/workflows/explosionbot.yml b/.github/workflows/explosionbot.yml new file mode 100644 index 000000000..7d9ee45e9 --- /dev/null +++ b/.github/workflows/explosionbot.yml @@ -0,0 +1,27 @@ +name: Explosion Bot + +on: + issue_comment: + types: + - created + - edited + +jobs: + explosion-bot: + runs-on: ubuntu-18.04 + steps: + - name: Dump GitHub context + env: + GITHUB_CONTEXT: ${{ toJson(github) }} + run: echo "$GITHUB_CONTEXT" + - uses: actions/checkout@v1 + - uses: actions/setup-python@v1 + - name: Install and run explosion-bot + run: | + pip install git+https://${{ secrets.EXPLOSIONBOT_TOKEN }}@github.com/explosion/explosion-bot + python -m explosionbot + env: + INPUT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }} + INPUT_BK_TOKEN: ${{ secrets.BUILDKITE_SECRET }} + ENABLED_COMMANDS: "test_gpu" + ALLOWED_TEAMS: "spaCy" \ No newline at end of file diff --git a/CITATION b/CITATION deleted file mode 100644 index bdaa90677..000000000 --- a/CITATION +++ /dev/null @@ -1,8 +0,0 @@ -@software{spacy, - author = {Honnibal, Matthew and Montani, Ines and Van Landeghem, Sofie and Boyd, Adriane}, - title = {{spaCy: Industrial-strength Natural Language Processing in Python}}, - year = 2020, - publisher = {Zenodo}, - doi = {10.5281/zenodo.1212303}, - url = {https://doi.org/10.5281/zenodo.1212303} -} diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 000000000..88c05b2a3 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,16 @@ +cff-version: 1.2.0 +preferred-citation: + type: article + message: "If you use spaCy, please cite it as below." 
+ authors: + - family-names: "Honnibal" + given-names: "Matthew" + - family-names: "Montani" + given-names: "Ines" + - family-names: "Van Landeghem" + given-names: "Sofie" + - family-names: "Boyd" + given-names: "Adriane" + title: "spaCy: Industrial-strength Natural Language Processing in Python" + doi: "10.5281/zenodo.1212303" + year: 2020 diff --git a/MANIFEST.in b/MANIFEST.in index 99fc174bd..d022223cd 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,5 @@ recursive-include include *.h -recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja +recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja *.toml include LICENSE include README.md include pyproject.toml diff --git a/azure-pipelines.yml b/azure-pipelines.yml index ac80b8a10..844946845 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -16,6 +16,8 @@ pr: exclude: - "website/*" - "*.md" + include: + - "website/meta/universe.json" jobs: # Perform basic checks for most important errors (syntax etc.) Uses the config diff --git a/setup.cfg b/setup.cfg index 4313612d4..e3a9af5c1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -124,7 +124,8 @@ exclude = [tool:pytest] markers = - slow + slow: mark a test as slow + issue: reference specific issue [mypy] ignore_missing_imports = True diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 9fd87dbc1..664fc2aaf 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Dict, Any from pathlib import Path from wasabi import msg import typer @@ -7,7 +7,7 @@ import sys from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error from ._util import import_code, setup_gpu -from ..training.loop import train +from ..training.loop import train as train_nlp from ..training.initialize import init_nlp from .. 
import util @@ -40,6 +40,18 @@ def train_cli( DOCS: https://spacy.io/api/cli#train """ util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) + overrides = parse_config_overrides(ctx.args) + import_code(code_path) + train(config_path, output_path, use_gpu=use_gpu, overrides=overrides) + + +def train( + config_path: Path, + output_path: Optional[Path] = None, + *, + use_gpu: int = -1, + overrides: Dict[str, Any] = util.SimpleFrozenDict(), +): # Make sure all files and paths exists if they are needed if not config_path or (str(config_path) != "-" and not config_path.exists()): msg.fail("Config file not found", config_path, exits=1) @@ -50,8 +62,6 @@ def train_cli( output_path.mkdir(parents=True) msg.good(f"Created output directory: {output_path}") msg.info(f"Saving to output directory: {output_path}") - overrides = parse_config_overrides(ctx.args) - import_code(code_path) setup_gpu(use_gpu) with show_validation_error(config_path): config = util.load_config(config_path, overrides=overrides, interpolate=False) @@ -60,4 +70,4 @@ def train_cli( nlp = init_nlp(config, use_gpu=use_gpu) msg.good("Initialized pipeline") msg.divider("Training pipeline") - train(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr) + train_nlp(nlp, output_path, use_gpu=use_gpu, stdout=sys.stdout, stderr=sys.stderr) diff --git a/spacy/errors.py b/spacy/errors.py index 120828fd6..4b617ecf3 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -25,7 +25,7 @@ def setup_default_warnings(): filter_warning("once", error_msg=Warnings.W036.format(name=pipe)) # warn once about lemmatizer without required POS - filter_warning("once", error_msg="[W108]") + filter_warning("once", error_msg=Warnings.W108) def filter_warning(action: str, error_msg: str): @@ -170,8 +170,8 @@ class Warnings: "call the {matcher} on each Doc object.") W107 = ("The property `Doc.{prop}` is deprecated. 
Use " "`Doc.has_annotation(\"{attr}\")` instead.") - W108 = ("The rule-based lemmatizer did not find POS annotation for the " - "token '{text}'. Check that your pipeline includes components that " + W108 = ("The rule-based lemmatizer did not find POS annotation for one or " + "more tokens. Check that your pipeline includes components that " "assign token.pos, typically 'tagger'+'attribute_ruler' or " "'morphologizer'.") W109 = ("Unable to save user hooks while serializing the doc. Re-add any " @@ -658,7 +658,9 @@ class Errors: "{nO} - cannot add any more labels.") E923 = ("It looks like there is no proper sample data to initialize the " "Model of component '{name}'. To check your input data paths and " - "annotation, run: python -m spacy debug data config.cfg") + "annotation, run: python -m spacy debug data config.cfg " + "and include the same config override values you would specify " + "for the 'spacy train' command.") E924 = ("The '{name}' component does not seem to be initialized properly. " "This is likely a bug in spaCy, so feel free to open an issue: " "https://github.com/explosion/spaCy/issues") @@ -793,7 +795,7 @@ class Errors: "to token boundaries.") E982 = ("The `Token.ent_iob` attribute should be an integer indexing " "into {values}, but found {value}.") - E983 = ("Invalid key for '{dict}': {key}. Available keys: " + E983 = ("Invalid key(s) for '{dict}': {key}. 
Available keys: " "{keys}") E984 = ("Invalid component config for '{name}': component block needs either " "a key `factory` specifying the registered function used to " diff --git a/spacy/lang/ca/lemmatizer.py b/spacy/lang/ca/lemmatizer.py index 2518eb720..2fd012912 100644 --- a/spacy/lang/ca/lemmatizer.py +++ b/spacy/lang/ca/lemmatizer.py @@ -76,6 +76,6 @@ class CatalanLemmatizer(Lemmatizer): forms.append(self.lookup_lemmatize(token)[0]) if not forms: forms.append(string) - forms = list(set(forms)) + forms = list(dict.fromkeys(forms)) self.cache[cache_key] = forms return forms diff --git a/spacy/lang/fr/lemmatizer.py b/spacy/lang/fr/lemmatizer.py index bb5a270ab..c6422cf96 100644 --- a/spacy/lang/fr/lemmatizer.py +++ b/spacy/lang/fr/lemmatizer.py @@ -75,6 +75,6 @@ class FrenchLemmatizer(Lemmatizer): forms.append(self.lookup_lemmatize(token)[0]) if not forms: forms.append(string) - forms = list(set(forms)) + forms = list(dict.fromkeys(forms)) self.cache[cache_key] = forms return forms diff --git a/spacy/lang/nl/lemmatizer.py b/spacy/lang/nl/lemmatizer.py index 6c025dcf6..4f6b2ef30 100644 --- a/spacy/lang/nl/lemmatizer.py +++ b/spacy/lang/nl/lemmatizer.py @@ -97,7 +97,7 @@ class DutchLemmatizer(Lemmatizer): return forms else: oov_forms.append(form) - forms = list(set(oov_forms)) + forms = list(dict.fromkeys(oov_forms)) # Back-off through remaining return value candidates. 
if forms: for form in forms: diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index 185e09718..a56938641 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -58,7 +58,7 @@ class RussianLemmatizer(Lemmatizer): if not len(filtered_analyses): return [string.lower()] if morphology is None or (len(morphology) == 1 and POS in morphology): - return list(set([analysis.normal_form for analysis in filtered_analyses])) + return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses])) if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"): features_to_compare = ["Case", "Number", "Gender"] elif univ_pos == "NUM": @@ -89,7 +89,7 @@ class RussianLemmatizer(Lemmatizer): filtered_analyses.append(analysis) if not len(filtered_analyses): return [string.lower()] - return list(set([analysis.normal_form for analysis in filtered_analyses])) + return list(dict.fromkeys([analysis.normal_form for analysis in filtered_analyses])) def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]: string = token.text diff --git a/spacy/language.py b/spacy/language.py index d87f86bd3..fd3773f82 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -707,8 +707,9 @@ class Language: source_config = source.config.interpolate() pipe_config = util.copy_config(source_config["components"][source_name]) self._pipe_configs[name] = pipe_config - for s in source.vocab.strings: - self.vocab.strings.add(s) + if self.vocab.strings != source.vocab.strings: + for s in source.vocab.strings: + self.vocab.strings.add(s) return pipe, pipe_config["factory"] def add_pipe( @@ -1379,6 +1380,9 @@ class Language: scorer = Scorer(**kwargs) # reset annotation in predicted docs and time tokenization start_time = timer() + # this is purely for timing + for eg in examples: + self.make_doc(eg.reference.text) # apply all pipeline components for name, pipe in self.pipeline: kwargs = component_cfg.get(name, {}) @@ -1708,6 +1712,7 @@ class Language: # them here so 
they're only loaded once source_nlps = {} source_nlp_vectors_hashes = {} + vocab_b = None for pipe_name in config["nlp"]["pipeline"]: if pipe_name not in pipeline: opts = ", ".join(pipeline.keys()) @@ -1730,14 +1735,22 @@ class Language: raw_config=raw_config, ) else: + # We need the sourced components to reference the same + # vocab without modifying the current vocab state **AND** + # we still want to load the source model vectors to perform + # the vectors check. Since the source vectors clobber the + # current ones, we save the original vocab state and + # restore after this loop. Existing strings are preserved + # during deserialization, so they do not need any + # additional handling. + if vocab_b is None: + vocab_b = nlp.vocab.to_bytes(exclude=["lookups", "strings"]) model = pipe_cfg["source"] if model not in source_nlps: - # We only need the components here and we intentionally - # do not load the model with the same vocab because - # this would cause the vectors to be copied into the - # current nlp object (all the strings will be added in - # create_pipe_from_source) - source_nlps[model] = util.load_model(model) + # Load with the same vocab, adding any strings + source_nlps[model] = util.load_model( + model, vocab=nlp.vocab, exclude=["lookups"] + ) source_name = pipe_cfg.get("component", pipe_name) listeners_replaced = False if "replace_listeners" in pipe_cfg: @@ -1764,6 +1777,9 @@ class Language: # Delete from cache if listeners were replaced if listeners_replaced: del source_nlps[model] + # Restore the original vocab after sourcing if necessary + if vocab_b is not None: + nlp.vocab.from_bytes(vocab_b) disabled_pipes = [*config["nlp"]["disabled"], *disable] nlp._disabled = set(p for p in disabled_pipes if p not in exclude) nlp.batch_size = config["nlp"]["batch_size"] diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index f204ce224..a602ba737 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ 
b/spacy/matcher/dependencymatcher.pyx @@ -177,13 +177,14 @@ cdef class DependencyMatcher: # Add 'RIGHT_ATTRS' to self._patterns[key] _patterns = [[[pat["RIGHT_ATTRS"]] for pat in pattern] for pattern in patterns] + pattern_offset = len(self._patterns[key]) self._patterns[key].extend(_patterns) # Add each node pattern of all the input patterns individually to the # matcher. This enables only a single instance of Matcher to be used. # Multiple adds are required to track each node pattern. tokens_to_key_list = [] - for i, current_patterns in enumerate(_patterns): + for i, current_patterns in enumerate(_patterns, start=pattern_offset): # Preallocate list space tokens_to_key = [None] * len(current_patterns) @@ -263,7 +264,9 @@ cdef class DependencyMatcher: self._raw_patterns.pop(key) self._tree.pop(key) self._root.pop(key) - self._tokens_to_key.pop(key) + for mklist in self._tokens_to_key.pop(key): + for mkey in mklist: + self._matcher.remove(mkey) def _get_keys_to_position_maps(self, doc): """ diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index 74f502d80..5adae10d2 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -208,7 +208,7 @@ class Lemmatizer(Pipe): univ_pos = token.pos_.lower() if univ_pos in ("", "eol", "space"): if univ_pos == "": - warnings.warn(Warnings.W108.format(text=string)) + warnings.warn(Warnings.W108) return [string.lower()] # See Issue #435 for example of where this logic is requied. 
if self.is_base_form(token): diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index a5dedcc87..10982bac1 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -4,6 +4,7 @@ from spacy.util import get_lang_class def pytest_addoption(parser): parser.addoption("--slow", action="store_true", help="include slow tests") + parser.addoption("--issue", action="store", help="test specific issues") def pytest_runtest_setup(item): @@ -16,10 +17,24 @@ def pytest_runtest_setup(item): # options weren't given. return item.config.getoption(f"--{opt}", False) + # Integration of boolean flags for opt in ["slow"]: if opt in item.keywords and not getopt(opt): pytest.skip(f"need --{opt} option to run") + # Special integration to mark tests with issue numbers + issues = getopt("issue") + if isinstance(issues, str): + if "issue" in item.keywords: + # Convert issues provided on the CLI to list of ints + issue_nos = [int(issue.strip()) for issue in issues.split(",")] + # Get all issues specified by decorators and check if they're provided + issue_refs = [mark.args[0] for mark in item.iter_markers(name="issue")] + if not any([ref in issue_nos for ref in issue_refs]): + pytest.skip(f"not referencing specified issues: {issue_nos}") + else: + pytest.skip("not referencing any issues") + # Fixtures for language tokenizers (languages sorted alphabetically) diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py index 0e1eae588..61ae43c52 100644 --- a/spacy/tests/matcher/test_dependency_matcher.py +++ b/spacy/tests/matcher/test_dependency_matcher.py @@ -368,3 +368,87 @@ def test_dependency_matcher_span_user_data(en_tokenizer): assert doc_match[0] == span_match[0] for doc_t_i, span_t_i in zip(doc_match[1], span_match[1]): assert doc_t_i == span_t_i + offset + + +def test_dependency_matcher_order_issue(en_tokenizer): + # issue from #9263 + doc = en_tokenizer("I like text") + doc[2].head = doc[1] + + # this matches on 
attrs but not rel op + pattern1 = [ + {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}}, + { + "LEFT_ID": "root", + "RIGHT_ID": "r", + "RIGHT_ATTRS": {"ORTH": "text"}, + "REL_OP": "<", + }, + ] + + # this matches on rel op but not attrs + pattern2 = [ + {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "like"}}, + { + "LEFT_ID": "root", + "RIGHT_ID": "r", + "RIGHT_ATTRS": {"ORTH": "fish"}, + "REL_OP": ">", + }, + ] + + matcher = DependencyMatcher(en_tokenizer.vocab) + + # This should behave the same as the next pattern + matcher.add("check", [pattern1, pattern2]) + matches = matcher(doc) + + assert matches == [] + + # use a new matcher + matcher = DependencyMatcher(en_tokenizer.vocab) + # adding one at a time under same label gets a match + matcher.add("check", [pattern1]) + matcher.add("check", [pattern2]) + matches = matcher(doc) + + assert matches == [] + + +def test_dependency_matcher_remove(en_tokenizer): + # issue from #9263 + doc = en_tokenizer("The red book") + doc[1].head = doc[2] + + # this matches + pattern1 = [ + {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "book"}}, + { + "LEFT_ID": "root", + "RIGHT_ID": "r", + "RIGHT_ATTRS": {"ORTH": "red"}, + "REL_OP": ">", + }, + ] + + # add and then remove it + matcher = DependencyMatcher(en_tokenizer.vocab) + matcher.add("check", [pattern1]) + matcher.remove("check") + + # this matches on rel op but not attrs + pattern2 = [ + {"RIGHT_ID": "root", "RIGHT_ATTRS": {"ORTH": "flag"}}, + { + "LEFT_ID": "root", + "RIGHT_ID": "r", + "RIGHT_ATTRS": {"ORTH": "blue"}, + "REL_OP": ">", + }, + ] + + # Adding this new pattern with the same label, which should not match + matcher.add("check", [pattern2]) + matches = matcher(doc) + + assert matches == [] diff --git a/spacy/tests/package/__init__.py b/spacy/tests/package/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py index 7b759f8f6..d4d0617d7 100644 --- 
a/spacy/tests/pipeline/test_spancat.py +++ b/spacy/tests/pipeline/test_spancat.py @@ -114,7 +114,7 @@ def test_make_spangroup(max_positive, nr_results): doc = nlp.make_doc("Greater London") ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2]) indices = ngram_suggester([doc])[0].dataXd - assert_array_equal(indices, numpy.asarray([[0, 1], [1, 2], [0, 2]])) + assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]])) labels = ["Thing", "City", "Person", "GreatCity"] scores = numpy.asarray( [[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f" diff --git a/spacy/tests/regression/test_issue5501-6000.py b/spacy/tests/regression/test_issue5501-6000.py index a35de92fa..355ffffeb 100644 --- a/spacy/tests/regression/test_issue5501-6000.py +++ b/spacy/tests/regression/test_issue5501-6000.py @@ -49,8 +49,8 @@ def test_issue5551(textcat_config): # All results should be the same because of the fixed seed assert len(results) == 3 ops = get_current_ops() - assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1])) - assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2])) + assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]), decimal=5) + assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5) def test_issue5838(): diff --git a/spacy/tests/regression/test_issue7716.py b/spacy/tests/regression/test_issue7716.py new file mode 100644 index 000000000..811952792 --- /dev/null +++ b/spacy/tests/regression/test_issue7716.py @@ -0,0 +1,54 @@ +import pytest +from thinc.api import Adam +from spacy.attrs import NORM +from spacy.vocab import Vocab +from spacy import registry +from spacy.training import Example +from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL +from spacy.tokens import Doc +from spacy.pipeline import DependencyParser + + +@pytest.fixture +def vocab(): + return Vocab(lex_attr_getters={NORM: lambda s: s}) + + +def 
_parser_example(parser): + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) + gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]} + return Example.from_dict(doc, gold) + + +@pytest.fixture +def parser(vocab): + vocab.strings.add("ROOT") + cfg = {"model": DEFAULT_PARSER_MODEL} + model = registry.resolve(cfg, validate=True)["model"] + parser = DependencyParser(vocab, model) + parser.cfg["token_vector_width"] = 4 + parser.cfg["hidden_width"] = 32 + # parser.add_label('right') + parser.add_label("left") + parser.initialize(lambda: [_parser_example(parser)]) + sgd = Adam(0.001) + + for i in range(10): + losses = {} + doc = Doc(vocab, words=["a", "b", "c", "d"]) + example = Example.from_dict( + doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]} + ) + parser.update([example], sgd=sgd, losses=losses) + return parser + + +@pytest.mark.xfail(reason="Not fixed yet") +def test_partial_annotation(parser): + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) + doc[2].is_sent_start = False + # Note that if the following line is used, then doc[2].is_sent_start == False + # doc[3].is_sent_start = False + + doc = parser(doc) + assert doc[2].is_sent_start == False diff --git a/spacy/tests/regression/test_issue8168.py b/spacy/tests/regression/test_issue8168.py index fbddf643c..e3f3b5cfa 100644 --- a/spacy/tests/regression/test_issue8168.py +++ b/spacy/tests/regression/test_issue8168.py @@ -1,6 +1,8 @@ +import pytest from spacy.lang.en import English +@pytest.mark.issue(8168) def test_issue8168(): nlp = English() ruler = nlp.add_pipe("entity_ruler") diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index 47540198a..2306cabb7 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -193,6 +193,7 @@ def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X): assert_array_almost_equal( model1.ops.to_numpy(get_all_params(model1)), model2.ops.to_numpy(get_all_params(model2)), + decimal=5, 
) diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index f53660818..1f262c011 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -82,15 +82,15 @@ def test_cat_readers(reader, additional_config): [nlp] lang = "en" - pipeline = ["tok2vec", "textcat"] + pipeline = ["tok2vec", "textcat_multilabel"] [components] [components.tok2vec] factory = "tok2vec" - [components.textcat] - factory = "textcat" + [components.textcat_multilabel] + factory = "textcat_multilabel" """ config = Config().from_str(nlp_config_string) config["corpora"]["@readers"] = reader diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 5e7d170f8..1d26b968c 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -8,7 +8,7 @@ from thinc.api import NumpyOps from .doc import Doc from ..vocab import Vocab from ..compat import copy_reg -from ..attrs import SPACY, ORTH, intify_attr +from ..attrs import SPACY, ORTH, intify_attr, IDS from ..errors import Errors from ..util import ensure_path, SimpleFrozenList @@ -64,7 +64,13 @@ class DocBin: DOCS: https://spacy.io/api/docbin#init """ - attrs = sorted([intify_attr(attr) for attr in attrs]) + int_attrs = [intify_attr(attr) for attr in attrs] + if None in int_attrs: + non_valid = [attr for attr in attrs if intify_attr(attr) is None] + raise KeyError( + Errors.E983.format(dict="attrs", key=non_valid, keys=IDS.keys()) + ) from None + attrs = sorted(int_attrs) self.version = "0.1" self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY] self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0] diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py index 22f1e64b1..a4feb01f4 100644 --- a/spacy/training/__init__.py +++ b/spacy/training/__init__.py @@ -1,4 +1,4 @@ -from .corpus import Corpus # noqa: F401 +from .corpus import Corpus, JsonlCorpus # noqa: F401 from .example import Example, validate_examples, 
validate_get_examples # noqa: F401 from .alignment import Alignment # noqa: F401 from .augment import dont_augment, orth_variants_augmenter # noqa: F401 diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index bd014f75f..4eb8ea276 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -144,7 +144,12 @@ def load_vectors_into_model( ) -> None: """Load word vectors from an installed model or path into a model instance.""" try: - vectors_nlp = load_model(name) + # Load with the same vocab, which automatically adds the vectors to + # the current nlp object. Exclude lookups so they are not modified. + exclude = ["lookups"] + if not add_strings: + exclude.append("strings") + vectors_nlp = load_model(name, vocab=nlp.vocab, exclude=exclude) except ConfigValidationError as e: title = f"Config validation error for vectors {name}" desc = ( @@ -158,15 +163,8 @@ def load_vectors_into_model( if len(vectors_nlp.vocab.vectors.keys()) == 0: logger.warning(Warnings.W112.format(name=name)) - nlp.vocab.vectors = vectors_nlp.vocab.vectors for lex in nlp.vocab: lex.rank = nlp.vocab.vectors.key2row.get(lex.orth, OOV_RANK) - if add_strings: - # I guess we should add the strings from the vectors_nlp model? - # E.g. if someone does a similarity query, they might expect the strings. - for key in nlp.vocab.vectors.key2row: - if key in vectors_nlp.vocab.strings: - nlp.vocab.strings.add(vectors_nlp.vocab.strings[key]) def init_tok2vec( diff --git a/spacy/util.py b/spacy/util.py index fc1c0e76d..b25be5361 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1475,7 +1475,7 @@ def get_arg_names(func: Callable) -> List[str]: RETURNS (List[str]): The argument names. 
""" argspec = inspect.getfullargspec(func) - return list(set([*argspec.args, *argspec.kwonlyargs])) + return list(dict.fromkeys([*argspec.args, *argspec.kwonlyargs])) def combine_score_weights( diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 402528f28..ef4435656 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -530,7 +530,6 @@ cdef class Vocab: setters = { "strings": lambda b: self.strings.from_bytes(b), - "lexemes": lambda b: self.lexemes_from_bytes(b), "vectors": lambda b: serialize_vectors(b), "lookups": lambda b: self.lookups.from_bytes(b), } diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index aadeebd77..470d11a3a 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -260,16 +260,18 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type] | Name | Description | | ------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- | -| `input_file` | Input file. ~~Path (positional)~~ | +| `input_path` | Input file or directory. ~~Path (positional)~~ | | `output_dir` | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(option)~~ | | `--converter`, `-c` 2 | Name of converter to use (see below). ~~str (option)~~ | | `--file-type`, `-t` 2.1 | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ | | `--n-sents`, `-n` | Number of sentences per document. Supported for: `conll`, `conllu`, `iob`, `ner` ~~int (option)~~ | | `--seg-sents`, `-s` 2.2 | Segment sentences. Supported for: `conll`, `ner` ~~bool (flag)~~ | -| `--base`, `-b` | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). 
~~Optional[str](option)~~ | +| `--base`, `-b`, `--model` | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str](option)~~ | | `--morphology`, `-m` | Enable appending morphology to tags. Supported for: `conllu` ~~bool (flag)~~ | +| `--merge-subtokens`, `-T` | Merge CoNLL-U subtokens ~~bool (flag)~~ | | `--ner-map`, `-nm` | NER tag mapping (as JSON-encoded dict of entity types). Supported for: `conllu` ~~Optional[Path](option)~~ | | `--lang`, `-l` 2.1 | Language code (if tokenizer required). ~~Optional[str] \(option)~~ | +| `--concatenate`, `-C` | Concatenate output to a single file ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | **CREATES** | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train). | diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 44c92d1ee..44a2ea9e8 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -474,8 +474,8 @@ The L2 norm of the token's vector representation. | `like_email` | Does the token resemble an email address? ~~bool~~ | | `is_oov` | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~ | | `is_stop` | Is the token part of a "stop list"? ~~bool~~ | -| `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~int~~ | -| `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~str~~ | +| `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~int~~ | +| `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/u/pos/). ~~str~~ | | `tag` | Fine-grained part-of-speech. ~~int~~ | | `tag_` | Fine-grained part-of-speech. ~~str~~ | | `morph` 3 | Morphological analysis. 
~~MorphAnalysis~~ | diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index 40a3c3b22..c37b27a0e 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -325,6 +325,5 @@ serialization by passing in the string names via the `exclude` argument. | Name | Description | | --------- | ----------------------------------------------------- | | `strings` | The strings in the [`StringStore`](/api/stringstore). | -| `lexemes` | The lexeme data. | | `vectors` | The word vectors, if available. | | `lookups` | The lookup tables, if available. | diff --git a/website/docs/usage/101/_pos-deps.md b/website/docs/usage/101/_pos-deps.md index a531b245e..93ad0961a 100644 --- a/website/docs/usage/101/_pos-deps.md +++ b/website/docs/usage/101/_pos-deps.md @@ -25,7 +25,7 @@ for token in doc: > - **Text:** The original word text. > - **Lemma:** The base form of the word. -> - **POS:** The simple [UPOS](https://universaldependencies.org/docs/u/pos/) +> - **POS:** The simple [UPOS](https://universaldependencies.org/u/pos/) > part-of-speech tag. > - **Tag:** The detailed part-of-speech tag. > - **Dep:** Syntactic dependency, i.e. the relation between tokens. diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md index 665d334f8..707dd3215 100644 --- a/website/docs/usage/index.md +++ b/website/docs/usage/index.md @@ -284,7 +284,9 @@ $ python -m pytest --pyargs %%SPACY_PKG_NAME --slow # basic and slow test ## Troubleshooting guide {#troubleshooting} This section collects some of the most common errors you may come across when -installing, loading and using spaCy, as well as their solutions. +installing, loading and using spaCy, as well as their solutions. Also see the +[Discussions FAQ Thread](https://github.com/explosion/spaCy/discussions/8226), +which is updated more frequently and covers more transitory issues. 
> #### Help us improve this guide > @@ -311,62 +313,6 @@ language's `Language` class instead, for example - - -``` -no such option: --no-cache-dir -``` - -The `download` command uses pip to install the pipeline packages and sets the -`--no-cache-dir` flag to prevent it from requiring too much memory. -[This setting](https://pip.pypa.io/en/stable/reference/pip_install/#caching) -requires pip v6.0 or newer. Run `pip install -U pip` to upgrade to the latest -version of pip. To see which version you have installed, run `pip --version`. - - - - - -``` -sre_constants.error: bad character range -``` - -In [v2.1](/usage/v2-1), spaCy changed its implementation of regular expressions -for tokenization to make it up to 2-3 times faster. But this also means that -it's very important now that you run spaCy with a wide unicode build of Python. -This means that the build has 1114111 unicode characters available, instead of -only 65535 in a narrow unicode build. You can check this by running the -following command: - -```bash -$ python -c "import sys; print(sys.maxunicode)" -``` - -If you're running a narrow unicode build, reinstall Python and use a wide -unicode build instead. You can also rebuild Python and set the -`--enable-unicode=ucs4` flag. - - - - - -``` -ValueError: unknown locale: UTF-8 -``` - -This error can sometimes occur on OSX and is likely related to a still -unresolved [Python bug](https://bugs.python.org/issue18378). However, it's easy -to fix: just add the following to your `~/.bash_profile` or `~/.zshrc` and then -run `source ~/.bash_profile` or `source ~/.zshrc`. Make sure to add **both -lines** for `LC_ALL` and `LANG`. 
- -```bash -$ export LC_ALL=en_US.UTF-8 -$ export LANG=en_US.UTF-8 -``` - - - ``` diff --git a/website/meta/universe.json b/website/meta/universe.json index 28fe058eb..7438a8932 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1363,20 +1363,19 @@ "url": "https://explosion.ai/demos/sense2vec", "code_example": [ "import spacy", - "from sense2vec import Sense2VecComponent", "", - "nlp = spacy.load('en')", - "s2v = Sense2VecComponent('/path/to/reddit_vectors-1.1.0')", - "nlp.add_pipe(s2v)", + "nlp = spacy.load(\"en_core_web_sm\")", + "s2v = nlp.add_pipe(\"sense2vec\")", + "s2v.from_disk(\"/path/to/s2v_reddit_2015_md\")", "", "doc = nlp(\"A sentence about natural language processing.\")", - "assert doc[3].text == 'natural language processing'", - "freq = doc[3]._.s2v_freq", - "vector = doc[3]._.s2v_vec", - "most_similar = doc[3]._.s2v_most_similar(3)", - "# [(('natural language processing', 'NOUN'), 1.0),", - "# (('machine learning', 'NOUN'), 0.8986966609954834),", - "# (('computer vision', 'NOUN'), 0.8636297583580017)]" + "assert doc[3:6].text == \"natural language processing\"", + "freq = doc[3:6]._.s2v_freq", + "vector = doc[3:6]._.s2v_vec", + "most_similar = doc[3:6]._.s2v_most_similar(3)", + "# [(('machine learning', 'NOUN'), 0.8986967),", + "# (('computer vision', 'NOUN'), 0.8636297),", + "# (('deep learning', 'NOUN'), 0.8573361)]" ], "category": ["pipeline", "standalone", "visualizers"], "tags": ["vectors"], @@ -2970,11 +2969,10 @@ "github": "thomasthiebaud/spacy-fastlang", "pip": "spacy_fastlang", "code_example": [ - "import spacy", - "from spacy_fastlang import LanguageDetector", + "import spacy_fastlang", "", - "nlp = spacy.load('en_core_web_sm')", - "nlp.add_pipe(LanguageDetector())", + "nlp = spacy.load(\"en_core_web_sm\")", + "nlp.add_pipe(\"language_detector\")", "doc = nlp('Life is like a box of chocolates. 
You never know what you are gonna get.')", "", "assert doc._.language == 'en'", @@ -3476,7 +3474,51 @@ "github": "bbieniek" }, "category": ["apis"] - } + }, + { + "id": "phruzz_matcher", + "title": "phruzz-matcher", + "slogan": "Phrase matcher using RapidFuzz", + "description": "Combination of the RapidFuzz library with spaCy PhraseMatcher. The goal of this component is to find matches when there were NO \"perfect matches\" due to typos or abbreviations between a spaCy doc and a list of phrases.", + "github": "mjvallone/phruzz-matcher", + "pip": "phruzz_matcher", + "code_example": [ + "import spacy", + "from spacy.language import Language", + "from phruzz_matcher.phrase_matcher import PhruzzMatcher", + "", + "famous_people = [", + " \"Brad Pitt\",", + " \"Demi Moore\",", + " \"Bruce Willis\",", + " \"Jim Carrey\",", + "]", + "", + "@Language.factory(\"phrase_matcher\")", + "def phrase_matcher(nlp: Language, name: str):", + " return PhruzzMatcher(nlp, famous_people, \"FAMOUS_PEOPLE\", 85)", + "", + "nlp = spacy.blank('es')", + "nlp.add_pipe(\"phrase_matcher\")", + "", + "doc = nlp(\"El otro día fui a un bar donde vi a brad pit y a Demi Moore, estaban tomando unas cervezas mientras charlaban de sus asuntos.\")", + "print(f\"doc.ents: {doc.ents}\")", + "", + "#OUTPUT", + "#doc.ents: (brad pit, Demi Moore)" + ], + "thumb": "https://avatars.githubusercontent.com/u/961296?v=4", + "image": "", + "code_language": "python", + "author": "Martin Vallone", + "author_links": { + "github": "mjvallone", + "twitter": "vallotin", + "website": "https://fiqus.coop/" + }, + "category": ["pipeline", "research", "standalone"], + "tags": ["spacy", "python", "nlp", "ner"] + } ], "categories": [ diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 21ade5e36..554823ebf 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -34,6 +34,7 @@ const MODEL_META = { core_sm: 'Vocabulary, syntax, entities', dep: 'Vocabulary, syntax', ent: 
'Named entities', + sent: 'Sentence boundaries', pytt: 'PyTorch Transformers', trf: 'Transformers', vectors: 'Word vectors', @@ -195,6 +196,7 @@ const Model = ({ const [isError, setIsError] = useState(true) const [meta, setMeta] = useState({}) const { type, genre, size } = getModelComponents(name) + const display_type = type === 'core' && size === 'sm' ? 'core_sm' : type const version = useMemo(() => getLatestVersion(name, compatibility, prereleases), [ name, compatibility, @@ -231,7 +233,7 @@ const Model = ({ const rows = [ { label: 'Language', tag: langId, content: langName }, - { label: 'Type', tag: type, content: MODEL_META[type] }, + { label: 'Type', tag: type, content: MODEL_META[display_type] }, { label: 'Genre', tag: genre, content: MODEL_META[genre] }, { label: 'Size', tag: size, content: meta.sizeFull }, { label: 'Components', content: components, help: MODEL_META.components },