diff --git a/.github/workflows/gputests.yml b/.github/workflows/gputests.yml
new file mode 100644
index 000000000..14c1552bf
--- /dev/null
+++ b/.github/workflows/gputests.yml
@@ -0,0 +1,21 @@
+name: Weekly GPU tests
+
+on:
+  schedule:
+    - cron: '0 1 * * MON'
+
+jobs:
+  weekly-gputests:
+    strategy:
+      fail-fast: false
+      matrix:
+        branch: [master, develop, v4]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Trigger buildkite build
+        uses: buildkite/trigger-pipeline-action@v1.2.0
+        env:
+          PIPELINE: explosion-ai/spacy-slow-gpu-tests
+          BRANCH: ${{ matrix.branch }}
+          MESSAGE: ":github: Weekly GPU + slow tests - triggered from a GitHub Action"
+          BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml
new file mode 100644
index 000000000..9490b53bd
--- /dev/null
+++ b/.github/workflows/slowtests.yml
@@ -0,0 +1,35 @@
+name: Daily slow tests
+
+on:
+  schedule:
+    - cron: '0 0 * * *'
+
+jobs:
+  daily-slowtests:
+    strategy:
+      fail-fast: false
+      matrix:
+        branch: [master, develop, v4]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v1
+      - name: Get commits from past 24 hours
+        id: check_commits
+        run: |
+          today=$(date '+%Y-%m-%d %H:%M:%S')
+          yesterday=$(date -d "yesterday" '+%Y-%m-%d %H:%M:%S')
+          if git log --after=$yesterday --before=$today | grep commit ; then
+            echo "::set-output name=run_tests::true"
+          else
+            echo "::set-output name=run_tests::false"
+          fi
+
+      - name: Trigger buildkite build
+        if: steps.check_commits.outputs.run_tests == 'true'
+        uses: buildkite/trigger-pipeline-action@v1.2.0
+        env:
+          PIPELINE: explosion-ai/spacy-slow-tests
+          BRANCH: ${{ matrix.branch }}
+          MESSAGE: ":github: Daily slow tests - triggered from a GitHub Action"
+          BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_SECRET }}
diff --git a/LICENSE b/LICENSE
index 86f501b92..d76864579 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 The MIT License (MIT)
 
-Copyright (C) 2016-2021 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
+Copyright (C) 2016-2022 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/MANIFEST.in b/MANIFEST.in
index c1524d460..b7826e456 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,11 +1,8 @@
-recursive-include include *.h
 recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml
 include LICENSE
 include README.md
 include pyproject.toml
 include spacy/py.typed
-recursive-exclude spacy/lang *.json
-recursive-include spacy/lang *.json.gz
-recursive-include spacy/cli *.json *.yml
+recursive-include spacy/cli *.yml
 recursive-include licenses *
 recursive-exclude spacy *.cpp
diff --git a/requirements.txt b/requirements.txt
index 7e200be51..ca4099be5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -31,7 +31,8 @@ pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.8.0,<3.10.0
 hypothesis>=3.27.0,<7.0.0
-mypy>=0.910
+mypy==0.910
 types-dataclasses>=0.1.3; python_version < "3.7"
 types-mock>=0.1.1
 types-requests
+black>=22.0,<23.0
diff --git a/setup.cfg b/setup.cfg
index 72f4b39da..586a044ff 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -77,37 +77,39 @@ transformers =
 ray =
     spacy_ray>=0.1.0,<1.0.0
 cuda =
-    cupy>=5.0.0b4,<10.0.0
+    cupy>=5.0.0b4,<11.0.0
 cuda80 =
-    cupy-cuda80>=5.0.0b4,<10.0.0
+    cupy-cuda80>=5.0.0b4,<11.0.0
 cuda90 =
-    cupy-cuda90>=5.0.0b4,<10.0.0
+    cupy-cuda90>=5.0.0b4,<11.0.0
 cuda91 =
-
cupy-cuda91>=5.0.0b4,<10.0.0 + cupy-cuda91>=5.0.0b4,<11.0.0 cuda92 = - cupy-cuda92>=5.0.0b4,<10.0.0 + cupy-cuda92>=5.0.0b4,<11.0.0 cuda100 = - cupy-cuda100>=5.0.0b4,<10.0.0 + cupy-cuda100>=5.0.0b4,<11.0.0 cuda101 = - cupy-cuda101>=5.0.0b4,<10.0.0 + cupy-cuda101>=5.0.0b4,<11.0.0 cuda102 = - cupy-cuda102>=5.0.0b4,<10.0.0 + cupy-cuda102>=5.0.0b4,<11.0.0 cuda110 = - cupy-cuda110>=5.0.0b4,<10.0.0 + cupy-cuda110>=5.0.0b4,<11.0.0 cuda111 = - cupy-cuda111>=5.0.0b4,<10.0.0 + cupy-cuda111>=5.0.0b4,<11.0.0 cuda112 = - cupy-cuda112>=5.0.0b4,<10.0.0 + cupy-cuda112>=5.0.0b4,<11.0.0 cuda113 = - cupy-cuda113>=5.0.0b4,<10.0.0 + cupy-cuda113>=5.0.0b4,<11.0.0 cuda114 = - cupy-cuda114>=5.0.0b4,<10.0.0 + cupy-cuda114>=5.0.0b4,<11.0.0 +cuda115 = + cupy-cuda115>=5.0.0b4,<11.0.0 apple = thinc-apple-ops>=0.0.4,<1.0.0 # Language tokenizers with external dependencies ja = - sudachipy>=0.4.9 - sudachidict_core>=20200330 + sudachipy>=0.5.2,!=0.6.1 + sudachidict_core>=20211220 ko = natto-py==0.9.0 th = diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 640fb2f3c..dc8eed7c3 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -1,3 +1,6 @@ +from .errors import Errors + +IOB_STRINGS = ("", "I", "O", "B") IDS = { "": NULL_ATTR, @@ -64,7 +67,6 @@ IDS = { "FLAG61": FLAG61, "FLAG62": FLAG62, "FLAG63": FLAG63, - "ID": ID, "ORTH": ORTH, "LOWER": LOWER, @@ -72,7 +74,6 @@ IDS = { "SHAPE": SHAPE, "PREFIX": PREFIX, "SUFFIX": SUFFIX, - "LENGTH": LENGTH, "LEMMA": LEMMA, "POS": POS, @@ -87,7 +88,7 @@ IDS = { "SPACY": SPACY, "LANG": LANG, "MORPH": MORPH, - "IDX": IDX + "IDX": IDX, } @@ -109,28 +110,66 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): """ inty_attrs = {} if _do_deprecated: - if 'F' in stringy_attrs: + if "F" in stringy_attrs: stringy_attrs["ORTH"] = stringy_attrs.pop("F") - if 'L' in stringy_attrs: + if "L" in stringy_attrs: stringy_attrs["LEMMA"] = stringy_attrs.pop("L") - if 'pos' in stringy_attrs: + if "pos" in stringy_attrs: stringy_attrs["TAG"] = stringy_attrs.pop("pos") - if 'morph' in stringy_attrs: - morphs = stringy_attrs.pop('morph') - if 'number' in stringy_attrs: - stringy_attrs.pop('number') - if 'tenspect' in stringy_attrs: - stringy_attrs.pop('tenspect') + if "morph" in stringy_attrs: + morphs = stringy_attrs.pop("morph") + if "number" in stringy_attrs: + stringy_attrs.pop("number") + if "tenspect" in stringy_attrs: + stringy_attrs.pop("tenspect") morph_keys = [ - 'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number', - 'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss', - 'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType', - 'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr', - 'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm', - 'NumValue', 'PartType', 'Polite', 'StyleVariant', - 'PronType', 'AdjType', 'Person', 'Variant', 'AdpType', - 'Reflex', 'Negative', 'Mood', 'Aspect', 'Case', - 'Polarity', 'PrepCase', 'Animacy' # U20 + "PunctType", + "PunctSide", + "Other", + "Degree", + "AdvType", + "Number", + "VerbForm", + "PronType", + "Aspect", + "Tense", + "PartType", + "Poss", + "Hyph", + "ConjType", + "NumType", + "Foreign", + "VerbType", + "NounType", + "Gender", + "Mood", + "Negative", + "Tense", + "Voice", + "Abbr", + "Derivation", + "Echo", + "Foreign", + "NameType", + "NounType", + "NumForm", + "NumValue", + "PartType", + "Polite", + "StyleVariant", + "PronType", + "AdjType", + "Person", + "Variant", + "AdpType", + "Reflex", + "Negative", + "Mood", + "Aspect", + "Case", + "Polarity", + "PrepCase", + "Animacy", # U20 
] for key in morph_keys: if key in stringy_attrs: @@ -142,8 +181,13 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): for name, value in stringy_attrs.items(): int_key = intify_attr(name) if int_key is not None: + if int_key == ENT_IOB: + if value in IOB_STRINGS: + value = IOB_STRINGS.index(value) + elif isinstance(value, str): + raise ValueError(Errors.E1025.format(value=value)) if strings_map is not None and isinstance(value, str): - if hasattr(strings_map, 'add'): + if hasattr(strings_map, "add"): value = strings_map.add(value) else: value = strings_map[value] diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 3143e2c62..a63795148 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -14,7 +14,7 @@ from ..training.initialize import get_sourced_components from ..schemas import ConfigSchemaTraining from ..pipeline._parser_internals import nonproj from ..pipeline._parser_internals.nonproj import DELIMITER -from ..pipeline import Morphologizer +from ..pipeline import Morphologizer, SpanCategorizer from ..morphology import Morphology from ..language import Language from ..util import registry, resolve_dot_names @@ -193,6 +193,70 @@ def debug_data( else: msg.info("No word vectors present in the package") + if "spancat" in factory_names: + model_labels_spancat = _get_labels_from_spancat(nlp) + has_low_data_warning = False + has_no_neg_warning = False + + msg.divider("Span Categorization") + msg.table(model_labels_spancat, header=["Spans Key", "Labels"], divider=True) + + msg.text("Label counts in train data: ", show=verbose) + for spans_key, data_labels in gold_train_data["spancat"].items(): + msg.text( + f"Key: {spans_key}, {_format_labels(data_labels.items(), counts=True)}", + show=verbose, + ) + # Data checks: only take the spans keys in the actual spancat components + data_labels_in_component = { + spans_key: gold_train_data["spancat"][spans_key] + for spans_key in model_labels_spancat.keys() + } + for spans_key, data_labels in data_labels_in_component.items(): + for label, count in data_labels.items(): + # Check for missing labels + spans_key_in_model = spans_key in model_labels_spancat.keys() + if (spans_key_in_model) and ( + label not in model_labels_spancat[spans_key] + ): + msg.warn( + f"Label '{label}' is not present in the model labels of key '{spans_key}'. " + "Performance may degrade after training." 
+                    )
+                # Check for low number of examples per label
+                if count <= NEW_LABEL_THRESHOLD:
+                    msg.warn(
+                        f"Low number of examples for label '{label}' in key '{spans_key}' ({count})"
+                    )
+                    has_low_data_warning = True
+                # Check for negative examples
+                with msg.loading("Analyzing label distribution..."):
+                    neg_docs = _get_examples_without_label(
+                        train_dataset, label, "spancat", spans_key
+                    )
+                if neg_docs == 0:
+                    msg.warn(f"No examples for texts WITHOUT new label '{label}'")
+                    has_no_neg_warning = True
+
+        if has_low_data_warning:
+            msg.text(
+                f"To train a new span type, your data should include at "
+                f"least {NEW_LABEL_THRESHOLD} instances of the new label",
+                show=verbose,
+            )
+        else:
+            msg.good("Good amount of examples for all labels")
+
+        if has_no_neg_warning:
+            msg.text(
+                "Training data should always include examples of spans "
+                "in context, as well as examples without a given span "
+                "type.",
+                show=verbose,
+            )
+        else:
+            msg.good("Examples without occurrences available for all labels")
+
     if "ner" in factory_names:
         # Get all unique NER labels present in the data
         labels = set(
@@ -203,6 +267,7 @@ def debug_data(
         has_low_data_warning = False
         has_no_neg_warning = False
         has_ws_ents_error = False
+        has_boundary_cross_ents_warning = False
 
         msg.divider("Named Entity Recognition")
         msg.info(f"{len(model_labels)} label(s)")
@@ -237,17 +302,25 @@ def debug_data(
                has_low_data_warning = True
 
                with msg.loading("Analyzing label distribution..."):
-                    neg_docs = _get_examples_without_label(train_dataset, label)
+                    neg_docs = _get_examples_without_label(train_dataset, label, "ner")
                if neg_docs == 0:
                    msg.warn(f"No examples for texts WITHOUT new label '{label}'")
                    has_no_neg_warning = True
 
+        if gold_train_data["boundary_cross_ents"]:
+            msg.warn(
+                f"{gold_train_data['boundary_cross_ents']} entity span(s) crossing sentence boundaries"
+            )
+            has_boundary_cross_ents_warning = True
+
         if not has_low_data_warning:
             msg.good("Good amount of examples for all labels")
         if not has_no_neg_warning:
             msg.good("Examples without occurrences available for all labels")
         if not has_ws_ents_error:
             msg.good("No entities consisting of or starting/ending with whitespace")
+        if not has_boundary_cross_ents_warning:
+            msg.good("No entities crossing sentence boundaries")
 
         if has_low_data_warning:
             msg.text(
@@ -564,7 +637,9 @@ def _compile_gold(
         "deps": Counter(),
         "words": Counter(),
         "roots": Counter(),
+        "spancat": dict(),
         "ws_ents": 0,
+        "boundary_cross_ents": 0,
         "n_words": 0,
         "n_misaligned_words": 0,
         "words_missing_vectors": Counter(),
@@ -593,6 +668,7 @@ def _compile_gold(
                if nlp.vocab.strings[word] not in nlp.vocab.vectors:
                    data["words_missing_vectors"].update([word])
         if "ner" in factory_names:
+            sent_starts = eg.get_aligned_sent_starts()
             for i, label in enumerate(eg.get_aligned_ner()):
                 if label is None:
                     continue
@@ -602,8 +678,19 @@ def _compile_gold(
                 if label.startswith(("B-", "U-")):
                     combined_label = label.split("-")[1]
                     data["ner"][combined_label] += 1
+                if sent_starts[i] == True and label.startswith(("I-", "L-")):
+                    data["boundary_cross_ents"] += 1
                 elif label == "-":
                     data["ner"]["-"] += 1
+        if "spancat" in factory_names:
+            for span_key in list(eg.reference.spans.keys()):
+                if span_key not in data["spancat"]:
+                    data["spancat"][span_key] = Counter()
+                for i, span in enumerate(eg.reference.spans[span_key]):
+                    if span.label_ is None:
+                        continue
+                    else:
+                        data["spancat"][span_key][span.label_] += 1
         if "textcat" in factory_names or "textcat_multilabel" in factory_names:
             data["cats"].update(gold.cats)
             if any(val not in (0, 1) for val in
gold.cats.values()): @@ -674,21 +761,57 @@ def _format_labels( return ", ".join([f"'{l}'" for l in cast(Iterable[str], labels)]) -def _get_examples_without_label(data: Sequence[Example], label: str) -> int: +def _get_examples_without_label( + data: Sequence[Example], + label: str, + component: Literal["ner", "spancat"] = "ner", + spans_key: Optional[str] = "sc", +) -> int: count = 0 for eg in data: - labels = [ - label.split("-")[1] - for label in eg.get_aligned_ner() - if label not in ("O", "-", None) - ] + if component == "ner": + labels = [ + label.split("-")[1] + for label in eg.get_aligned_ner() + if label not in ("O", "-", None) + ] + + if component == "spancat": + labels = ( + [span.label_ for span in eg.reference.spans[spans_key]] + if spans_key in eg.reference.spans + else [] + ) + if label not in labels: count += 1 return count -def _get_labels_from_model(nlp: Language, pipe_name: str) -> Set[str]: - if pipe_name not in nlp.pipe_names: - return set() - pipe = nlp.get_pipe(pipe_name) - return set(pipe.labels) +def _get_labels_from_model(nlp: Language, factory_name: str) -> Set[str]: + pipe_names = [ + pipe_name + for pipe_name in nlp.pipe_names + if nlp.get_pipe_meta(pipe_name).factory == factory_name + ] + labels: Set[str] = set() + for pipe_name in pipe_names: + pipe = nlp.get_pipe(pipe_name) + labels.update(pipe.labels) + return labels + + +def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]: + pipe_names = [ + pipe_name + for pipe_name in nlp.pipe_names + if nlp.get_pipe_meta(pipe_name).factory == "spancat" + ] + labels: Dict[str, Set[str]] = {} + for pipe_name in pipe_names: + pipe = nlp.get_pipe(pipe_name) + assert isinstance(pipe, SpanCategorizer) + if pipe.key not in labels: + labels[pipe.key] = set() + labels[pipe.key].update(pipe.labels) + return labels diff --git a/spacy/cli/package.py b/spacy/cli/package.py index f9d2a9af2..b8c8397b6 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -7,6 +7,7 @@ from collections import defaultdict from catalogue import RegistryError import srsly import sys +import re from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX from ..schemas import validate, ModelMetaSchema @@ -109,6 +110,24 @@ def package( ", ".join(meta["requirements"]), ) if name is not None: + if not name.isidentifier(): + msg.fail( + f"Model name ('{name}') is not a valid module name. " + "This is required so it can be imported as a module.", + "We recommend names that use ASCII A-Z, a-z, _ (underscore), " + "and 0-9. " + "For specific details see: https://docs.python.org/3/reference/lexical_analysis.html#identifiers", + exits=1, + ) + if not _is_permitted_package_name(name): + msg.fail( + f"Model name ('{name}') is not a permitted package name. " + "This is required to correctly load the model with spacy.load.", + "We recommend names that use ASCII A-Z, a-z, _ (underscore), " + "and 0-9. " + "For specific details see: https://www.python.org/dev/peps/pep-0426/#name", + exits=1, + ) meta["name"] = name if version is not None: meta["version"] = version @@ -162,7 +181,7 @@ def package( imports="\n".join(f"from . 
import {m}" for m in imports) ) create_file(package_path / "__init__.py", init_py) - msg.good(f"Successfully created package '{model_name_v}'", main_path) + msg.good(f"Successfully created package directory '{model_name_v}'", main_path) if create_sdist: with util.working_dir(main_path): util.run_command([sys.executable, "setup.py", "sdist"], capture=False) @@ -171,8 +190,14 @@ def package( if create_wheel: with util.working_dir(main_path): util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False) - wheel = main_path / "dist" / f"{model_name_v}{WHEEL_SUFFIX}" + wheel_name_squashed = re.sub("_+", "_", model_name_v) + wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}" msg.good(f"Successfully created binary wheel", wheel) + if "__" in model_name: + msg.warn( + f"Model name ('{model_name}') contains a run of underscores. " + "Runs of underscores are not significant in installed package names.", + ) def has_wheel() -> bool: @@ -422,6 +447,14 @@ def _format_label_scheme(data: Dict[str, Any]) -> str: return md.text +def _is_permitted_package_name(package_name: str) -> bool: + # regex from: https://www.python.org/dev/peps/pep-0426/#name + permitted_match = re.search( + r"^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$", package_name, re.IGNORECASE + ) + return permitted_match is not None + + TEMPLATE_SETUP = """ #!/usr/bin/env python import io diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py index b5057e401..5e0cdfdf2 100644 --- a/spacy/cli/project/assets.py +++ b/spacy/cli/project/assets.py @@ -1,6 +1,7 @@ from typing import Any, Dict, Optional from pathlib import Path from wasabi import msg +import os import re import shutil import requests @@ -129,10 +130,17 @@ def fetch_asset( the asset failed. """ dest_path = (project_path / dest).resolve() - if dest_path.exists() and checksum: + if dest_path.exists(): # If there's already a file, check for checksum - if checksum == get_checksum(dest_path): - msg.good(f"Skipping download with matching checksum: {dest}") + if checksum: + if checksum == get_checksum(dest_path): + msg.good(f"Skipping download with matching checksum: {dest}") + return + else: + # If there's not a checksum, make sure the file is a possibly valid size + if os.path.getsize(dest_path) == 0: + msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}") + os.remove(dest_path) # We might as well support the user here and create parent directories in # case the asset dir isn't listed as a dir to create in the project.yml if not dest_path.parent.exists(): diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index b78806fec..fb79a4f60 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -6,6 +6,11 @@ can help generate the best possible configuration, given a user's requirements. 
 [paths]
 train = null
 dev = null
+{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
+vectors = null
+{% else -%}
+vectors = "{{ word_vectors }}"
+{% endif -%}
 
 [system]
 {% if use_transformer -%}
@@ -421,8 +426,4 @@ compound = 1.001
 {% endif %}
 
 [initialize]
-{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
 vectors = ${paths.vectors}
-{% else -%}
-vectors = "{{ word_vectors }}"
-{% endif -%}
diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index ceb7357fc..86a72926e 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -68,12 +68,14 @@ seed = ${system.seed}
 gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
 accumulate_gradient = 1
-# Controls early-stopping. 0 disables early stopping.
+# Controls early-stopping, i.e., the number of steps to continue without
+# improvement before stopping. 0 disables early stopping.
 patience = 1600
 # Number of epochs. 0 means unlimited. If >= 0, train corpus is loaded once in
 # memory and shuffled within the training loop. -1 means stream train corpus
 # rather than loading in memory with no shuffling within the training loop.
 max_epochs = 0
+# Maximum number of update steps to train for. 0 means an unlimited number of steps.
 max_steps = 20000
 eval_frequency = 200
 # Control how scores are printed and checkpoints are evaluated.
diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py
index 14d741a3d..a032d843b 100644
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@@ -18,7 +18,7 @@ DEFAULT_LABEL_COLORS = {
     "LOC": "#ff9561",
     "PERSON": "#aa9cfc",
     "NORP": "#c887fb",
-    "FACILITY": "#9cc9cc",
+    "FAC": "#9cc9cc",
     "EVENT": "#ffeb80",
     "LAW": "#ff8197",
     "LANGUAGE": "#ff8197",
diff --git a/spacy/errors.py b/spacy/errors.py
index 673674222..b45c4f9db 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -483,7 +483,7 @@ class Errors(metaclass=ErrorsWithCodes):
             "components, since spans are only views of the Doc. Use Doc and "
             "Token attributes (or custom extension attributes) only and remove "
             "the following: {attrs}")
-    E181 = ("Received invalid attributes for unkown object {obj}: {attrs}. "
+    E181 = ("Received invalid attributes for unknown object {obj}: {attrs}. "
             "Only Doc and Token attributes are supported.")
     E182 = ("Received invalid attribute declaration: {attr}\nDid you forget "
             "to define the attribute? For example: `{attr}.???`")
@@ -888,11 +888,14 @@ class Errors(metaclass=ErrorsWithCodes):
     E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
              "Non-UD tags should use the `tag` property.")
     E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
-    E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't exist.")
-    E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler patterns.")
+    E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
+             "exist.")
+    E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler "
+             "patterns.")
+    E1025 = ("Cannot intify the value '{value}' as an IOB string.
The only " + "supported values are: 'I', 'O', 'B' and ''") - # Deprecated model shortcuts, only used in errors and warnings OLD_MODEL_SHORTCUTS = { "en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm", diff --git a/spacy/glossary.py b/spacy/glossary.py index e45704fc5..57254330f 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -310,7 +310,6 @@ GLOSSARY = { "re": "repeated element", "rs": "reported speech", "sb": "subject", - "sb": "subject", "sbp": "passivized subject (PP)", "sp": "subject or predicate", "svp": "separable verb prefix", diff --git a/spacy/lang/hi/lex_attrs.py b/spacy/lang/hi/lex_attrs.py index a18c2e513..ee845e8b1 100644 --- a/spacy/lang/hi/lex_attrs.py +++ b/spacy/lang/hi/lex_attrs.py @@ -90,7 +90,7 @@ _eleven_to_beyond = [ "अड़सठ", "उनहत्तर", "सत्तर", - "इकहत्तर" + "इकहत्तर", "बहत्तर", "तिहत्तर", "चौहत्तर", diff --git a/spacy/lang/xx/examples.py b/spacy/lang/xx/examples.py index 8d63c3c20..34570d747 100644 --- a/spacy/lang/xx/examples.py +++ b/spacy/lang/xx/examples.py @@ -59,7 +59,7 @@ sentences = [ "Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?", "Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”.", "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares.", - "Carros autônomos empurram a responsabilidade do seguro para os fabricantes.." + "Carros autônomos empurram a responsabilidade do seguro para os fabricantes..", "São Francisco considera banir os robôs de entrega que andam pelas calçadas.", "Londres é a maior cidade do Reino Unido.", # Translations from English: diff --git a/spacy/language.py b/spacy/language.py index 638616316..e8fd2720c 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -131,7 +131,7 @@ class Language: self, vocab: Union[Vocab, bool] = True, *, - max_length: int = 10 ** 6, + max_length: int = 10**6, meta: Dict[str, Any] = {}, create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None, batch_size: int = 1000, @@ -354,12 +354,15 @@ class Language: @property def pipe_labels(self) -> Dict[str, List[str]]: """Get the labels set by the pipeline components, if available (if - the component exposes a labels property). + the component exposes a labels property and the labels are not + hidden). RETURNS (Dict[str, List[str]]): Labels keyed by component name. """ labels = {} for name, pipe in self._components: + if hasattr(pipe, "hide_labels") and pipe.hide_labels is True: + continue if hasattr(pipe, "labels"): labels[name] = list(pipe.labels) return SimpleFrozenDict(labels) @@ -522,7 +525,7 @@ class Language: requires: Iterable[str] = SimpleFrozenList(), retokenizes: bool = False, func: Optional["Pipe"] = None, - ) -> Callable: + ) -> Callable[..., Any]: """Register a new pipeline component. Can be used for stateless function components that don't require a separate factory. 
Can be used as a decorator on a function or classmethod, or called as a function with the @@ -1285,9 +1288,9 @@ class Language: ) except IOError: raise IOError(Errors.E884.format(vectors=I["vectors"])) - if self.vocab.vectors.data.shape[1] >= 1: + if self.vocab.vectors.shape[1] >= 1: ops = get_current_ops() - self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) + self.vocab.vectors.to_ops(ops) if hasattr(self.tokenizer, "initialize"): tok_settings = validate_init_settings( self.tokenizer.initialize, # type: ignore[union-attr] @@ -1332,8 +1335,8 @@ class Language: DOCS: https://spacy.io/api/language#resume_training """ ops = get_current_ops() - if self.vocab.vectors.data.shape[1] >= 1: - self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) + if self.vocab.vectors.shape[1] >= 1: + self.vocab.vectors.to_ops(ops) for name, proc in self.pipeline: if hasattr(proc, "_rehearsal_model"): proc._rehearsal_model = deepcopy(proc.model) # type: ignore[attr-defined] diff --git a/spacy/lexeme.pyi b/spacy/lexeme.pyi index 4eae6be43..4fcaa82cf 100644 --- a/spacy/lexeme.pyi +++ b/spacy/lexeme.pyi @@ -19,7 +19,7 @@ class Lexeme: @property def vector_norm(self) -> float: ... vector: Floats1d - rank: str + rank: int sentiment: float @property def orth_(self) -> str: ... diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 792e405dd..6c66effde 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -130,8 +130,10 @@ cdef class Lexeme: return 0.0 vector = self.vector xp = get_array_module(vector) - return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)) - + result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) + # ensure we get a scalar back (numpy does this automatically but cupy doesn't) + return result.item() + @property def has_vector(self): """RETURNS (bool): Whether a word vector is associated with the object. diff --git a/spacy/matcher/dependencymatcher.pyi b/spacy/matcher/dependencymatcher.pyi new file mode 100644 index 000000000..c19d3a71c --- /dev/null +++ b/spacy/matcher/dependencymatcher.pyi @@ -0,0 +1,66 @@ +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from .matcher import Matcher +from ..vocab import Vocab +from ..tokens.doc import Doc +from ..tokens.span import Span + +class DependencyMatcher: + """Match dependency parse tree based on pattern rules.""" + + _patterns: Dict[str, List[Any]] + _raw_patterns: Dict[str, List[Any]] + _tokens_to_key: Dict[str, List[Any]] + _root: Dict[str, List[Any]] + _tree: Dict[str, List[Any]] + _callbacks: Dict[ + Any, Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any] + ] + _ops: Dict[str, Any] + vocab: Vocab + _matcher: Matcher + def __init__(self, vocab: Vocab, *, validate: bool = ...) -> None: ... + def __reduce__( + self, + ) -> Tuple[ + Callable[ + [Vocab, Dict[str, Any], Dict[str, Callable[..., Any]]], DependencyMatcher + ], + Tuple[ + Vocab, + Dict[str, List[Any]], + Dict[ + str, + Callable[ + [DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any + ], + ], + ], + None, + None, + ]: ... + def __len__(self) -> int: ... + def __contains__(self, key: Union[str, int]) -> bool: ... + def add( + self, + key: Union[str, int], + patterns: List[List[Dict[str, Any]]], + *, + on_match: Optional[ + Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any] + ] = ... + ) -> None: ... + def has_key(self, key: Union[str, int]) -> bool: ... + def get( + self, key: Union[str, int], default: Optional[Any] = ... 
+ ) -> Tuple[ + Optional[ + Callable[[DependencyMatcher, Doc, int, List[Tuple[int, List[int]]]], Any] + ], + List[List[Dict[str, Any]]], + ]: ... + def remove(self, key: Union[str, int]) -> None: ... + def __call__(self, doclike: Union[Doc, Span]) -> List[Tuple[int, List[int]]]: ... + +def unpickle_matcher( + vocab: Vocab, patterns: Dict[str, Any], callbacks: Dict[str, Callable[..., Any]] +) -> DependencyMatcher: ... diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi index ec4a88eaf..390629ff8 100644 --- a/spacy/matcher/matcher.pyi +++ b/spacy/matcher/matcher.pyi @@ -1,4 +1,6 @@ -from typing import Any, List, Dict, Tuple, Optional, Callable, Union, Iterator, Iterable +from typing import Any, List, Dict, Tuple, Optional, Callable, Union +from typing import Iterator, Iterable, overload +from ..compat import Literal from ..vocab import Vocab from ..tokens import Doc, Span @@ -31,12 +33,22 @@ class Matcher: ) -> Union[ Iterator[Tuple[Tuple[Doc, Any], Any]], Iterator[Tuple[Doc, Any]], Iterator[Doc] ]: ... + @overload def __call__( self, doclike: Union[Doc, Span], *, - as_spans: bool = ..., + as_spans: Literal[False] = ..., allow_missing: bool = ..., with_alignments: bool = ... - ) -> Union[List[Tuple[int, int, int]], List[Span]]: ... + ) -> List[Tuple[int, int, int]]: ... + @overload + def __call__( + self, + doclike: Union[Doc, Span], + *, + as_spans: Literal[True], + allow_missing: bool = ..., + with_alignments: bool = ... + ) -> List[Span]: ... def _normalize_key(self, key: Any) -> Any: ... diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 745d7cf43..6aa58f0e3 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -18,7 +18,7 @@ from ..tokens.doc cimport Doc, get_token_attr_for_matcher from ..tokens.span cimport Span from ..tokens.token cimport Token from ..tokens.morphanalysis cimport MorphAnalysis -from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH +from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB from ..schemas import validate_token_pattern from ..errors import Errors, MatchPatternError, Warnings @@ -798,7 +798,10 @@ def _get_attr_values(spec, string_store): attr = "SENT_START" attr = IDS.get(attr) if isinstance(value, str): - value = string_store.add(value) + if attr == ENT_IOB and value in Token.iob_strings(): + value = Token.iob_strings().index(value) + else: + value = string_store.add(value) elif isinstance(value, bool): value = int(value) elif isinstance(value, int): diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi index 741bf7bb6..68e3386e4 100644 --- a/spacy/matcher/phrasematcher.pyi +++ b/spacy/matcher/phrasematcher.pyi @@ -1,6 +1,6 @@ -from typing import List, Tuple, Union, Optional, Callable, Any, Dict - -from . import Matcher +from typing import List, Tuple, Union, Optional, Callable, Any, Dict, overload +from ..compat import Literal +from .matcher import Matcher from ..vocab import Vocab from ..tokens import Doc, Span @@ -14,16 +14,24 @@ class PhraseMatcher: def add( self, key: str, - docs: List[List[Dict[str, Any]]], + docs: List[Doc], *, on_match: Optional[ Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any] ] = ..., ) -> None: ... def remove(self, key: str) -> None: ... + @overload def __call__( self, doclike: Union[Doc, Span], *, - as_spans: bool = ..., - ) -> Union[List[Tuple[int, int, int]], List[Span]]: ... + as_spans: Literal[False] = ..., + ) -> List[Tuple[int, int, int]]: ... 
+ @overload + def __call__( + self, + doclike: Union[Doc, Span], + *, + as_spans: Literal[True], + ) -> List[Span]: ... diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index 37473b7f4..a7d67c6dd 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -23,7 +23,7 @@ def create_pretrain_vectors( maxout_pieces: int, hidden_size: int, loss: str ) -> Callable[["Vocab", Model], Model]: def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model: - if vocab.vectors.data.shape[1] == 0: + if vocab.vectors.shape[1] == 0: raise ValueError(Errors.E875) model = build_cloze_multi_task_model( vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces @@ -85,7 +85,7 @@ def get_characters_loss(ops, docs, prediction, nr_char): target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f") target = target.reshape((-1, 256 * nr_char)) diff = prediction - target - loss = (diff ** 2).sum() + loss = (diff**2).sum() d_target = diff / float(prediction.shape[0]) return loss, d_target @@ -116,7 +116,7 @@ def build_multi_task_model( def build_cloze_multi_task_model( vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int ) -> Model: - nO = vocab.vectors.data.shape[1] + nO = vocab.vectors.shape[1] output_layer = chain( cast(Model[List["Floats2d"], Floats2d], list2array()), Maxout( diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py index 8dd65833b..8d9b1af9b 100644 --- a/spacy/ml/staticvectors.py +++ b/spacy/ml/staticvectors.py @@ -94,7 +94,7 @@ def init( nM = model.get_dim("nM") if model.has_dim("nM") else None nO = model.get_dim("nO") if model.has_dim("nO") else None if X is not None and len(X): - nM = X[0].vocab.vectors.data.shape[1] + nM = X[0].vocab.vectors.shape[1] if Y is not None: nO = Y.data.shape[1] diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd index 161f3ca48..27623e7c6 100644 --- a/spacy/pipeline/_parser_internals/_state.pxd +++ b/spacy/pipeline/_parser_internals/_state.pxd @@ -1,3 +1,4 @@ +from cython.operator cimport dereference as deref, preincrement as incr from libc.string cimport memcpy, memset from libc.stdlib cimport calloc, free from libc.stdint cimport uint32_t, uint64_t @@ -184,16 +185,20 @@ cdef cppclass StateC: int L(int head, int idx) nogil const: if idx < 1 or this._left_arcs.size() == 0: return -1 - cdef vector[int] lefts - for i in range(this._left_arcs.size()): - arc = this._left_arcs.at(i) + + # Work backwards through left-arcs to find the arc at the + # requested index more quickly. 
+ cdef size_t child_index = 0 + it = this._left_arcs.const_rbegin() + while it != this._left_arcs.rend(): + arc = deref(it) if arc.head == head and arc.child != -1 and arc.child < head: - lefts.push_back(arc.child) - idx = (lefts.size()) - idx - if idx < 0: - return -1 - else: - return lefts.at(idx) + child_index += 1 + if child_index == idx: + return arc.child + incr(it) + + return -1 int R(int head, int idx) nogil const: if idx < 1 or this._right_arcs.size() == 0: diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index ddcc911c8..029e2e29e 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -604,7 +604,7 @@ cdef class ArcEager(TransitionSystem): actions[SHIFT][''] += 1 if min_freq is not None: for action, label_freqs in actions.items(): - for label, freq in list(label_freqs.items()): + for label, freq in label_freqs.copy().items(): if freq < min_freq: label_freqs.pop(label) # Ensure these actions are present diff --git a/spacy/pipeline/pipe.pyi b/spacy/pipeline/pipe.pyi index c7c0568f9..9dd6a9d50 100644 --- a/spacy/pipeline/pipe.pyi +++ b/spacy/pipeline/pipe.pyi @@ -26,6 +26,8 @@ class Pipe: @property def labels(self) -> Tuple[str, ...]: ... @property + def hide_labels(self) -> bool: ... + @property def label_data(self) -> Any: ... def _require_labels(self) -> None: ... def set_error_handler( diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 9eddc1e3f..d24e4d574 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -102,6 +102,10 @@ cdef class Pipe: def labels(self) -> Tuple[str, ...]: return tuple() + @property + def hide_labels(self) -> bool: + return False + @property def label_data(self): """Optional JSON-serializable data that would be sufficient to recreate diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 54ce021af..5d2688463 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -99,6 +99,10 @@ class SentenceRecognizer(Tagger): # are 0 return tuple(["I", "S"]) + @property + def hide_labels(self): + return True + @property def label_data(self): return None diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index 829def1eb..5d0d8f17e 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -377,7 +377,7 @@ class SpanCategorizer(TrainablePipe): # If the prediction is 0.9 and it's false, the gradient will be # 0.9 (0.9 - 0.0) d_scores = scores - target - loss = float((d_scores ** 2).sum()) + loss = float((d_scores**2).sum()) return loss, d_scores def initialize( @@ -412,7 +412,7 @@ class SpanCategorizer(TrainablePipe): self._require_labels() if subbatch: docs = [eg.x for eg in subbatch] - spans = self.suggester(docs) + spans = build_ngram_suggester(sizes=[1])(docs) Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels)) self.model.initialize(X=(docs, spans), Y=Y) else: diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 30a65ec52..7f5510933 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -281,7 +281,7 @@ class TextCategorizer(TrainablePipe): bp_scores(gradient) if sgd is not None: self.finish_update(sgd) - losses[self.name] += (gradient ** 2).sum() + losses[self.name] += (gradient**2).sum() return losses def _examples_to_truth( @@ -315,7 +315,7 @@ class TextCategorizer(TrainablePipe): not_missing = self.model.ops.asarray(not_missing) # type: ignore d_scores = (scores - truths) / scores.shape[0] 
d_scores *= not_missing - mean_square_error = (d_scores ** 2).sum(axis=1).mean() + mean_square_error = (d_scores**2).sum(axis=1).mean() return float(mean_square_error), d_scores def add_label(self, label: str) -> int: diff --git a/spacy/schemas.py b/spacy/schemas.py index cf58688ef..1dfd8ee85 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -1,5 +1,6 @@ from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple from typing import Iterable, TypeVar, TYPE_CHECKING +from .compat import Literal from enum import Enum from pydantic import BaseModel, Field, ValidationError, validator, create_model from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool @@ -209,6 +210,7 @@ NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat] UnderscoreValue = Union[ TokenPatternString, TokenPatternNumber, str, int, float, list, bool ] +IobValue = Literal["", "I", "O", "B", 0, 1, 2, 3] class TokenPattern(BaseModel): @@ -222,6 +224,7 @@ class TokenPattern(BaseModel): lemma: Optional[StringValue] = None shape: Optional[StringValue] = None ent_type: Optional[StringValue] = None + ent_iob: Optional[IobValue] = None ent_id: Optional[StringValue] = None ent_kb_id: Optional[StringValue] = None norm: Optional[StringValue] = None diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index c6195d7e2..858c7cbb6 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -567,6 +567,7 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): "Merging the docs is fun.", "", "They don't think alike. ", + "", "Another doc.", ] en_texts_without_empty = [t for t in en_texts if len(t)] @@ -574,9 +575,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer): en_docs = [en_tokenizer(text) for text in en_texts] en_docs[0].spans["group"] = [en_docs[0][1:4]] en_docs[2].spans["group"] = [en_docs[2][1:4]] - en_docs[3].spans["group"] = [en_docs[3][0:1]] + en_docs[4].spans["group"] = [en_docs[4][0:1]] span_group_texts = sorted( - [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text] + [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[4][0:1].text] ) de_doc = de_tokenizer(de_text) Token.set_extension("is_ambiguous", default=False) @@ -683,6 +684,7 @@ def test_has_annotation(en_vocab): attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE") for attr in attrs: assert not doc.has_annotation(attr) + assert not doc.has_annotation(attr, require_complete=True) doc[0].tag_ = "A" doc[0].pos_ = "X" @@ -708,6 +710,27 @@ def test_has_annotation(en_vocab): assert doc.has_annotation(attr, require_complete=True) +def test_has_annotation_sents(en_vocab): + doc = Doc(en_vocab, words=["Hello", "beautiful", "world"]) + attrs = ("SENT_START", "IS_SENT_START", "IS_SENT_END") + for attr in attrs: + assert not doc.has_annotation(attr) + assert not doc.has_annotation(attr, require_complete=True) + + # The first token (index 0) is always assumed to be a sentence start, + # and ignored by the check in doc.has_annotation + + doc[1].is_sent_start = False + for attr in attrs: + assert doc.has_annotation(attr) + assert not doc.has_annotation(attr, require_complete=True) + + doc[2].is_sent_start = False + for attr in attrs: + assert doc.has_annotation(attr) + assert doc.has_annotation(attr, require_complete=True) + + def test_is_flags_deprecated(en_tokenizer): doc = en_tokenizer("test") with pytest.deprecated_call(): diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py index 5350c1fe5..1c27c1744 100644 --- 
a/spacy/tests/lang/test_attrs.py +++ b/spacy/tests/lang/test_attrs.py @@ -1,4 +1,5 @@ import pytest +from spacy.attrs import intify_attrs, ENT_IOB from spacy.attrs import IS_ALPHA, LEMMA, NORM, ORTH, intify_attrs from spacy.lang.en.stop_words import STOP_WORDS @@ -33,6 +34,38 @@ def test_attrs_do_deprecated(text): assert int_attrs == {ORTH: 10, IS_ALPHA: True} +def test_attrs_ent_iob_intify(): + int_attrs = intify_attrs({"ENT_IOB": ""}) + assert int_attrs == {ENT_IOB: 0} + + int_attrs = intify_attrs({"ENT_IOB": "I"}) + assert int_attrs == {ENT_IOB: 1} + + int_attrs = intify_attrs({"ENT_IOB": "O"}) + assert int_attrs == {ENT_IOB: 2} + + int_attrs = intify_attrs({"ENT_IOB": "B"}) + assert int_attrs == {ENT_IOB: 3} + + int_attrs = intify_attrs({ENT_IOB: ""}) + assert int_attrs == {ENT_IOB: 0} + + int_attrs = intify_attrs({ENT_IOB: "I"}) + assert int_attrs == {ENT_IOB: 1} + + int_attrs = intify_attrs({ENT_IOB: "O"}) + assert int_attrs == {ENT_IOB: 2} + + int_attrs = intify_attrs({ENT_IOB: "B"}) + assert int_attrs == {ENT_IOB: 3} + + with pytest.raises(ValueError): + int_attrs = intify_attrs({"ENT_IOB": "XX"}) + + with pytest.raises(ValueError): + int_attrs = intify_attrs({ENT_IOB: "XX"}) + + @pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)]) def test_lex_attrs_is_punct(text, match): assert is_punct(text) == match diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index c02d65cdf..a27baf130 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -642,3 +642,30 @@ def test_matcher_no_zero_length(en_vocab): matcher = Matcher(en_vocab) matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]]) assert len(matcher(doc)) == 0 + + +def test_matcher_ent_iob_key(en_vocab): + """Test that patterns with ent_iob works correctly.""" + matcher = Matcher(en_vocab) + matcher.add("Rule", [[{"ENT_IOB": "I"}]]) + doc1 = Doc(en_vocab, words=["I", "visited", "New", "York", "and", "California"]) + doc1.ents = [Span(doc1, 2, 4, label="GPE"), Span(doc1, 5, 6, label="GPE")] + doc2 = Doc(en_vocab, words=["I", "visited", "my", "friend", "Alicia"]) + doc2.ents = [Span(doc2, 4, 5, label="PERSON")] + matches1 = [doc1[start:end].text for _, start, end in matcher(doc1)] + matches2 = [doc2[start:end].text for _, start, end in matcher(doc2)] + assert len(matches1) == 1 + assert matches1[0] == "York" + assert len(matches2) == 0 + + matcher = Matcher(en_vocab) # Test iob pattern with operators + matcher.add("Rule", [[{"ENT_IOB": "I", "OP": "+"}]]) + doc = Doc( + en_vocab, words=["I", "visited", "my", "friend", "Anna", "Maria", "Esperanza"] + ) + doc.ents = [Span(doc, 4, 7, label="PERSON")] + matches = [doc[start:end].text for _, start, end in matcher(doc)] + assert len(matches) == 3 + assert matches[0] == "Maria" + assert matches[1] == "Maria Esperanza" + assert matches[2] == "Esperanza" diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py index 74feb7c5d..8c265785c 100644 --- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -12,6 +12,7 @@ TEST_PATTERNS = [ ([{"IS_PUNCT": True, "OP": "$"}], 1, 1), ([{"_": "foo"}], 1, 1), ('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1), + ([{"ENT_IOB": "foo"}], 1, 1), ([1, 2, 3], 3, 1), # Bad patterns flagged outside of Matcher ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0) diff --git a/spacy/tests/package/test_requirements.py 
b/spacy/tests/package/test_requirements.py index 75908df59..e20227455 100644 --- a/spacy/tests/package/test_requirements.py +++ b/spacy/tests/package/test_requirements.py @@ -12,6 +12,7 @@ def test_build_dependencies(): "flake8", "hypothesis", "pre-commit", + "black", "mypy", "types-dataclasses", "types-mock", diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index 7a256f79b..047f59bef 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -97,3 +97,7 @@ def test_overfitting_IO(): ] assert_equal(batch_deps_1, batch_deps_2) assert_equal(batch_deps_1, no_batch_deps) + + # test internal pipe labels vs. Language.pipe_labels with hidden labels + assert nlp.get_pipe("senter").labels == ("I", "S") + assert "senter" not in nlp.pipe_labels diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py index 2f7e952d3..8060bc621 100644 --- a/spacy/tests/pipeline/test_spancat.py +++ b/spacy/tests/pipeline/test_spancat.py @@ -80,6 +80,8 @@ def test_explicit_labels(): assert spancat.labels == ("PERSON", "LOC") +# TODO figure out why this is flaky +@pytest.mark.skip(reason="Test is unreliable for unknown reason") def test_doc_gc(): # If the Doc object is garbage collected, the spans won't be functional afterwards nlp = Language() @@ -97,6 +99,7 @@ def test_doc_gc(): assert isinstance(spangroups, SpanGroups) for key, spangroup in spangroups.items(): assert isinstance(spangroup, SpanGroup) + # XXX This fails with length 0 sometimes assert len(spangroup) > 0 with pytest.raises(RuntimeError): span = spangroup[0] diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index b0862eab6..9d3f1ee71 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -12,14 +12,18 @@ from spacy.cli._util import is_subpath_of, load_project_config from spacy.cli._util import parse_config_overrides, string_to_list from spacy.cli._util import substitute_project_variables from spacy.cli._util import validate_project_commands +from spacy.cli.debug_data import _compile_gold, _get_labels_from_model +from spacy.cli.debug_data import _get_labels_from_spancat from spacy.cli.download import get_compatibility, get_version from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config from spacy.cli.package import get_third_party_dependencies +from spacy.cli.package import _is_permitted_package_name from spacy.cli.validate import get_model_pkgs from spacy.lang.en import English from spacy.lang.nl import Dutch from spacy.language import Language from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate +from spacy.tokens import Doc from spacy.training import Example, docs_to_json, offsets_to_biluo_tags from spacy.training.converters import conll_ner_to_docs, conllu_to_docs from spacy.training.converters import iob_to_docs @@ -665,3 +669,54 @@ def test_get_third_party_dependencies(): ) def test_is_subpath_of(parent, child, expected): assert is_subpath_of(parent, child) == expected + + +@pytest.mark.slow +@pytest.mark.parametrize( + "factory_name,pipe_name", + [ + ("ner", "ner"), + ("ner", "my_ner"), + ("spancat", "spancat"), + ("spancat", "my_spancat"), + ], +) +def test_get_labels_from_model(factory_name, pipe_name): + labels = ("A", "B") + + nlp = English() + pipe = nlp.add_pipe(factory_name, name=pipe_name) + for label in labels: + pipe.add_label(label) + nlp.initialize() + assert nlp.get_pipe(pipe_name).labels == labels + if factory_name == "spancat": + assert 
_get_labels_from_spancat(nlp)[pipe.key] == set(labels) + else: + assert _get_labels_from_model(nlp, factory_name) == set(labels) + + +def test_permitted_package_names(): + # https://www.python.org/dev/peps/pep-0426/#name + assert _is_permitted_package_name("Meine_Bäume") == False + assert _is_permitted_package_name("_package") == False + assert _is_permitted_package_name("package_") == False + assert _is_permitted_package_name(".package") == False + assert _is_permitted_package_name("package.") == False + assert _is_permitted_package_name("-package") == False + assert _is_permitted_package_name("package-") == False + + +def test_debug_data_compile_gold(): + nlp = English() + pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"]) + ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "O", "B-ENT"]) + eg = Example(pred, ref) + data = _compile_gold([eg], ["ner"], nlp, True) + assert data["boundary_cross_ents"] == 0 + + pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"]) + ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "B-ENT", "I-ENT"]) + eg = Example(pred, ref) + data = _compile_gold([eg], ["ner"], nlp, True) + assert data["boundary_cross_ents"] == 1 \ No newline at end of file diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index c2aeffcb5..a7270cb1e 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -9,6 +9,7 @@ from spacy.tokenizer import Tokenizer from spacy.tokens import Doc from spacy.training import Example from spacy.util import compile_prefix_regex, compile_suffix_regex, ensure_path +from spacy.util import compile_infix_regex from spacy.vocab import Vocab from spacy.symbols import ORTH @@ -503,3 +504,20 @@ def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab): assert tokens == ["a", "10", "."] explain_tokens = [t[1] for t in tokenizer.explain("a10.")] assert tokens == explain_tokens + + +def test_tokenizer_infix_prefix(en_vocab): + # the prefix and suffix matches overlap in the suffix lookbehind + infixes = ["±"] + suffixes = ["%"] + infix_re = compile_infix_regex(infixes) + suffix_re = compile_suffix_regex(suffixes) + tokenizer = Tokenizer( + en_vocab, + infix_finditer=infix_re.finditer, + suffix_search=suffix_re.search, + ) + tokens = [t.text for t in tokenizer("±10%")] + assert tokens == ["±10", "%"] + explain_tokens = [t[1] for t in tokenizer.explain("±10%")] + assert tokens == explain_tokens diff --git a/spacy/tests/vocab_vectors/test_similarity.py b/spacy/tests/vocab_vectors/test_similarity.py index 3b9308f4d..47cd1f060 100644 --- a/spacy/tests/vocab_vectors/test_similarity.py +++ b/spacy/tests/vocab_vectors/test_similarity.py @@ -35,6 +35,7 @@ def test_vectors_similarity_LL(vocab, vectors): assert lex1.vector_norm != 0 assert lex2.vector_norm != 0 assert lex1.vector[0] != lex2.vector[0] and lex1.vector[1] != lex2.vector[1] + assert isinstance(lex1.similarity(lex2), float) assert numpy.isclose(lex1.similarity(lex2), get_cosine(vec1, vec2)) assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1)) @@ -47,25 +48,46 @@ def test_vectors_similarity_TT(vocab, vectors): assert doc[0].vector_norm != 0 assert doc[1].vector_norm != 0 assert doc[0].vector[0] != doc[1].vector[0] and doc[0].vector[1] != doc[1].vector[1] + assert isinstance(doc[0].similarity(doc[1]), float) assert numpy.isclose(doc[0].similarity(doc[1]), get_cosine(vec1, vec2)) 
assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1])) +def test_vectors_similarity_SS(vocab, vectors): + [(word1, vec1), (word2, vec2)] = vectors + doc = Doc(vocab, words=[word1, word2]) + assert isinstance(doc[0:1].similarity(doc[0:2]), float) + assert doc[0:1].similarity(doc[0:2]) == doc[0:2].similarity(doc[0:1]) + + +def test_vectors_similarity_DD(vocab, vectors): + [(word1, vec1), (word2, vec2)] = vectors + doc1 = Doc(vocab, words=[word1, word2]) + doc2 = Doc(vocab, words=[word2, word1]) + assert isinstance(doc1.similarity(doc2), float) + assert doc1.similarity(doc2) == doc2.similarity(doc1) + + def test_vectors_similarity_TD(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = Doc(vocab, words=[word1, word2]) with pytest.warns(UserWarning): + assert isinstance(doc.similarity(doc[0]), float) + assert isinstance(doc[0].similarity(doc), float) assert doc.similarity(doc[0]) == doc[0].similarity(doc) -def test_vectors_similarity_DS(vocab, vectors): - [(word1, vec1), (word2, vec2)] = vectors - doc = Doc(vocab, words=[word1, word2]) - assert doc.similarity(doc[:2]) == doc[:2].similarity(doc) - - def test_vectors_similarity_TS(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = Doc(vocab, words=[word1, word2]) with pytest.warns(UserWarning): + assert isinstance(doc[:2].similarity(doc[0]), float) + assert isinstance(doc[0].similarity(doc[-2]), float) assert doc[:2].similarity(doc[0]) == doc[0].similarity(doc[:2]) + + +def test_vectors_similarity_DS(vocab, vectors): + [(word1, vec1), (word2, vec2)] = vectors + doc = Doc(vocab, words=[word1, word2]) + assert isinstance(doc.similarity(doc[:2]), float) + assert doc.similarity(doc[:2]) == doc[:2].similarity(doc) diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 9dc40b499..0650a7487 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -421,7 +421,7 @@ def test_vector_is_oov(): def test_init_vectors_unset(): v = Vectors(shape=(10, 10)) assert v.is_full is False - assert v.data.shape == (10, 10) + assert v.shape == (10, 10) with pytest.raises(ValueError): v = Vectors(shape=(10, 10), mode="floret") @@ -514,7 +514,7 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str): # rows: 2 rows per ngram rows = OPS.xp.asarray( [ - h % nlp.vocab.vectors.data.shape[0] + h % nlp.vocab.vectors.shape[0] for ngram in ngrams for h in nlp.vocab.vectors._get_ngram_hashes(ngram) ], @@ -544,17 +544,17 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str): # an empty key returns 0s assert_equal( OPS.to_numpy(nlp.vocab[""].vector), - numpy.zeros((nlp.vocab.vectors.data.shape[0],)), + numpy.zeros((nlp.vocab.vectors.shape[0],)), ) # an empty batch returns 0s assert_equal( OPS.to_numpy(nlp.vocab.vectors.get_batch([""])), - numpy.zeros((1, nlp.vocab.vectors.data.shape[0])), + numpy.zeros((1, nlp.vocab.vectors.shape[0])), ) # an empty key within a batch returns 0s assert_equal( OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]), - numpy.zeros((nlp.vocab.vectors.data.shape[0],)), + numpy.zeros((nlp.vocab.vectors.shape[0],)), ) # the loaded ngram vector table cannot be modified diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index f8df13610..91f228032 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -45,10 +45,12 @@ cdef class Tokenizer: `re.compile(string).search` to match suffixes. 
`infix_finditer` (callable): A function matching the signature of `re.compile(string).finditer` to find infixes. - token_match (callable): A boolean function matching strings to be + token_match (callable): A function matching the signature of + `re.compile(string).match`, for matching strings to be recognized as tokens. - url_match (callable): A boolean function matching strings to be - recognized as tokens after considering prefixes and suffixes. + url_match (callable): A function matching the signature of + `re.compile(string).match`, for matching strings to be + recognized as urls. EXAMPLE: >>> tokenizer = Tokenizer(nlp.vocab) @@ -681,6 +683,8 @@ cdef class Tokenizer: infixes = infix_finditer(substring) offset = 0 for match in infixes: + if offset == 0 and match.start() == 0: + continue if substring[offset : match.start()]: tokens.append(("TOKEN", substring[offset : match.start()])) if substring[match.start() : match.end()]: diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index f540002c9..7e9340d58 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -10,7 +10,7 @@ from ..lexeme import Lexeme from ..vocab import Vocab from .underscore import Underscore from pathlib import Path -import numpy +import numpy as np class DocMethod(Protocol): def __call__(self: Doc, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc] @@ -26,7 +26,7 @@ class Doc: user_hooks: Dict[str, Callable[..., Any]] user_token_hooks: Dict[str, Callable[..., Any]] user_span_hooks: Dict[str, Callable[..., Any]] - tensor: numpy.ndarray + tensor: np.ndarray[Any, np.dtype[np.float_]] user_data: Dict[str, Any] has_unknown_spaces: bool _context: Any @@ -144,7 +144,7 @@ class Doc: ) -> Doc: ... def to_array( self, py_attr_ids: Union[int, str, List[Union[int, str]]] - ) -> numpy.ndarray: ... + ) -> np.ndarray[Any, np.dtype[np.float_]]: ... 
@staticmethod def from_docs( docs: List[Doc], diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 362a17784..d33764ac9 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -420,6 +420,8 @@ cdef class Doc: cdef int range_start = 0 if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]: attr = SENT_START + elif attr == "IS_SENT_END" or attr == self.vocab.strings["IS_SENT_END"]: + attr = SENT_START attr = intify_attr(attr) # adjust attributes if attr == HEAD: @@ -616,7 +618,7 @@ cdef class Doc: """ if "has_vector" in self.user_hooks: return self.user_hooks["has_vector"](self) - elif self.vocab.vectors.data.size: + elif self.vocab.vectors.size: return True elif self.tensor.size: return True @@ -641,7 +643,7 @@ cdef class Doc: if not len(self): self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f") return self._vector - elif self.vocab.vectors.data.size > 0: + elif self.vocab.vectors.size > 0: self._vector = sum(t.vector for t in self) / len(self) return self._vector elif self.tensor.size > 0: @@ -1183,7 +1185,7 @@ cdef class Doc: token_offset = -1 for doc in docs[:-1]: token_offset += len(doc) - if not (len(doc) > 0 and doc[-1].is_space): + if len(doc) > 0 and not doc[-1].is_space: concat_spaces[token_offset] = True concat_array = numpy.concatenate(arrays) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 0c36c754b..970c09d60 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -364,8 +364,10 @@ cdef class Span: return 0.0 vector = self.vector xp = get_array_module(vector) - return xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) - + result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) + # ensure we get a scalar back (numpy does this automatically but cupy doesn't) + return result.item() + cpdef np.ndarray to_array(self, object py_attr_ids): """Given a list of M attribute IDs, export the tokens to a numpy `ndarray` of shape `(N, M)`, where `N` is the length of the document. @@ -497,7 +499,7 @@ cdef class Span: """ if "has_vector" in self.doc.user_span_hooks: return self.doc.user_span_hooks["has_vector"](self) - elif self.vocab.vectors.data.size > 0: + elif self.vocab.vectors.size > 0: return any(token.has_vector for token in self) elif self.doc.tensor.size > 0: return True diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index aa97e2b07..d14930348 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -20,6 +20,7 @@ from .doc cimport set_children_from_heads from .. import parts_of_speech from ..errors import Errors, Warnings +from ..attrs import IOB_STRINGS from .underscore import Underscore, get_ext_args @@ -209,8 +210,10 @@ cdef class Token: return 0.0 vector = self.vector xp = get_array_module(vector) - return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)) - + result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) + # ensure we get a scalar back (numpy does this automatically but cupy doesn't) + return result.item() + def has_morph(self): """Check whether the token has annotated morph information. Return False when the morph annotation is unset/missing. @@ -484,8 +487,6 @@ cdef class Token: RETURNS (bool / None): Whether the token starts a sentence. None if unknown. 
- - DOCS: https://spacy.io/api/token#is_sent_start """ def __get__(self): if self.c.sent_start == 0: @@ -743,7 +744,7 @@ cdef class Token: @classmethod def iob_strings(cls): - return ("", "I", "O", "B") + return IOB_STRINGS @property def ent_iob_(self): diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py index 7fa7bf095..e9a4e1862 100644 --- a/spacy/tokens/underscore.py +++ b/spacy/tokens/underscore.py @@ -1,17 +1,31 @@ -from typing import Dict, Any +from typing import Dict, Any, List, Optional, Tuple, Union, TYPE_CHECKING import functools import copy - from ..errors import Errors +if TYPE_CHECKING: + from .doc import Doc + from .span import Span + from .token import Token + class Underscore: mutable_types = (dict, list, set) doc_extensions: Dict[Any, Any] = {} span_extensions: Dict[Any, Any] = {} token_extensions: Dict[Any, Any] = {} + _extensions: Dict[str, Any] + _obj: Union["Doc", "Span", "Token"] + _start: Optional[int] + _end: Optional[int] - def __init__(self, extensions, obj, start=None, end=None): + def __init__( + self, + extensions: Dict[str, Any], + obj: Union["Doc", "Span", "Token"], + start: Optional[int] = None, + end: Optional[int] = None, + ): object.__setattr__(self, "_extensions", extensions) object.__setattr__(self, "_obj", obj) # Assumption is that for doc values, _start and _end will both be None @@ -23,12 +37,12 @@ class Underscore: object.__setattr__(self, "_start", start) object.__setattr__(self, "_end", end) - def __dir__(self): + def __dir__(self) -> List[str]: # Hack to enable autocomplete on custom extensions extensions = list(self._extensions.keys()) return ["set", "get", "has"] + extensions - def __getattr__(self, name): + def __getattr__(self, name: str) -> Any: if name not in self._extensions: raise AttributeError(Errors.E046.format(name=name)) default, method, getter, setter = self._extensions[name] @@ -56,7 +70,7 @@ class Underscore: return new_default return default - def __setattr__(self, name, value): + def __setattr__(self, name: str, value: Any): if name not in self._extensions: raise AttributeError(Errors.E047.format(name=name)) default, method, getter, setter = self._extensions[name] @@ -65,28 +79,30 @@ class Underscore: else: self._doc.user_data[self._get_key(name)] = value - def set(self, name, value): + def set(self, name: str, value: Any): return self.__setattr__(name, value) - def get(self, name): + def get(self, name: str) -> Any: return self.__getattr__(name) - def has(self, name): + def has(self, name: str) -> bool: return name in self._extensions - def _get_key(self, name): + def _get_key(self, name: str) -> Tuple[str, str, Optional[int], Optional[int]]: return ("._.", name, self._start, self._end) @classmethod - def get_state(cls): + def get_state(cls) -> Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]: return cls.token_extensions, cls.span_extensions, cls.doc_extensions @classmethod - def load_state(cls, state): + def load_state( + cls, state: Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]] + ) -> None: cls.token_extensions, cls.span_extensions, cls.doc_extensions = state -def get_ext_args(**kwargs): +def get_ext_args(**kwargs: Any): """Validate and convert arguments. 
Reused in Doc, Token and Span.""" default = kwargs.get("default") getter = kwargs.get("getter") diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 084204389..b59288e38 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -164,7 +164,7 @@ def load_vectors_into_model( len(vectors_nlp.vocab.vectors.keys()) == 0 and vectors_nlp.vocab.vectors.mode != VectorsMode.floret ) or ( - vectors_nlp.vocab.vectors.data.shape[0] == 0 + vectors_nlp.vocab.vectors.shape[0] == 0 and vectors_nlp.vocab.vectors.mode == VectorsMode.floret ): logger.warning(Warnings.W112.format(name=name)) diff --git a/spacy/util.py b/spacy/util.py index 14714143c..2a8b9f5cc 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -871,7 +871,6 @@ def get_package_path(name: str) -> Path: name (str): Package name. RETURNS (Path): Path to installed package. """ - name = name.lower() # use lowercase version to be safe # Here we're importing the module just to find it. This is worryingly # indirect, but it's otherwise very difficult to find the package. pkg = importlib.import_module(name) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 6d6783af4..bc4863703 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -1,5 +1,5 @@ cimport numpy as np -from libc.stdint cimport uint32_t +from libc.stdint cimport uint32_t, uint64_t from cython.operator cimport dereference as deref from libcpp.set cimport set as cppset from murmurhash.mrmr cimport hash128_x64 @@ -10,7 +10,7 @@ from typing import cast import warnings from enum import Enum import srsly -from thinc.api import get_array_module, get_current_ops +from thinc.api import Ops, get_array_module, get_current_ops from thinc.backends import get_array_ops from thinc.types import Floats2d @@ -146,7 +146,7 @@ cdef class Vectors: DOCS: https://spacy.io/api/vectors#size """ - return self.data.shape[0] * self.data.shape[1] + return self.data.size @property def is_full(self): @@ -274,7 +274,7 @@ cdef class Vectors: self.data = resized_array self._sync_unset() removed_items = [] - for key, row in list(self.key2row.items()): + for key, row in self.key2row.copy().items(): if row >= shape[0]: self.key2row.pop(key) removed_items.append((key, row)) @@ -353,12 +353,18 @@ cdef class Vectors: key (str): The string key. RETURNS: A list of the integer hashes. """ - cdef uint32_t[4] out + # MurmurHash3_x64_128 returns an array of 2 uint64_t values. + cdef uint64_t[2] out chars = s.encode("utf8") cdef char* utf8_string = chars hash128_x64(utf8_string, len(chars), self.hash_seed, &out) - rows = [out[i] for i in range(min(self.hash_count, 4))] - return rows + rows = [ + out[0] & 0xffffffffu, + out[0] >> 32, + out[1] & 0xffffffffu, + out[1] >> 32, + ] + return rows[:min(self.hash_count, 4)] def _get_ngrams(self, unicode key): """Get all padded ngram strings using the ngram settings. @@ -511,6 +517,9 @@ cdef class Vectors: for i in range(len(queries)) ], dtype="uint64") return (keys, best_rows, scores) + def to_ops(self, ops: Ops): + self.data = ops.asarray(self.data) + def _get_cfg(self): if self.mode == Mode.default: return { diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index e2e7ad1db..badd291ed 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -283,7 +283,7 @@ cdef class Vocab: @property def vectors_length(self): - return self.vectors.data.shape[1] + return self.vectors.shape[1] def reset_vectors(self, *, width=None, shape=None): """Drop the current vector table. 
Because all vectors must be the same @@ -294,7 +294,7 @@ cdef class Vocab: elif shape is not None: self.vectors = Vectors(strings=self.strings, shape=shape) else: - width = width if width is not None else self.vectors.data.shape[1] + width = width if width is not None else self.vectors.shape[1] self.vectors = Vectors(strings=self.strings, shape=(self.vectors.shape[0], width)) def prune_vectors(self, nr_row, batch_size=1024): diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index 986c6f458..35afc8fea 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -79,6 +79,7 @@ train/test skew. | `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ | | `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | | `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ | +| `shuffle` | Whether to shuffle the examples. Defaults to `False`. ~~bool~~ | ## Corpus.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 9836b8c21..c21328caf 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -304,7 +304,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor. ## Doc.has_annotation {#has_annotation tag="method"} -Check whether the doc contains annotation on a token attribute. +Check whether the doc contains annotation on a [`Token` attribute](/api/token#attributes). diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index 803105ba2..3e7f9dc04 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -44,6 +44,7 @@ rule-based matching are: | `SPACY` | Token has a trailing space. ~~bool~~ | |  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ | | `ENT_TYPE` | The token's entity label. ~~str~~ | +| `ENT_IOB` | The IOB part of the token's entity tag. ~~str~~ | | `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ | | `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ | | `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ | diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 44a2ea9e8..3c3d12d54 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -349,23 +349,6 @@ A sequence containing the token and all the token's syntactic descendants. | ---------- | ------------------------------------------------------------------------------------ | | **YIELDS** | A descendant token such that `self.is_ancestor(token)` or `token == self`. ~~Token~~ | -## Token.is_sent_start {#is_sent_start tag="property" new="2"} - -A boolean value indicating whether the token starts a sentence. `None` if -unknown. Defaults to `True` for the first token in the `Doc`. - -> #### Example -> -> ```python -> doc = nlp("Give it back! He pleaded.") -> assert doc[4].is_sent_start -> assert not doc[5].is_sent_start -> ``` - -| Name | Description | -| ----------- | ------------------------------------------------------- | -| **RETURNS** | Whether the token starts a sentence. 
~~Optional[bool]~~ | - ## Token.has_vector {#has_vector tag="property" model="vectors"} A boolean value indicating whether a word vector is associated with the token. @@ -465,6 +448,8 @@ The L2 norm of the token's vector representation. | `is_punct` | Is the token punctuation? ~~bool~~ | | `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~ | | `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ | +| `is_sent_start` | Does the token start a sentence? ~~bool~~ or `None` if unknown. Defaults to `True` for the first token in the `Doc`. | +| `is_sent_end` | Does the token end a sentence? ~~bool~~ or `None` if unknown. | | `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ | | `is_bracket` | Is the token a bracket? ~~bool~~ | | `is_quote` | Is the token a quotation mark? ~~bool~~ | diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index 84d2c00ad..b3bee822c 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -371,6 +371,23 @@ Get the vectors for the provided keys efficiently as a batch. | ------ | --------------------------------------- | | `keys` | The keys. ~~Iterable[Union[int, str]]~~ | +## Vectors.to_ops {#to_ops tag="method"} + +Change the embedding matrix to use different Thinc ops. + +> #### Example +> +> ```python +> from thinc.api import NumpyOps +> +> vectors.to_ops(NumpyOps()) +> +> ``` + +| Name | Description | +|-------|----------------------------------------------------------| +| `ops` | The Thinc ops to switch the embedding matrix to. ~~Ops~~ | + ## Vectors.to_disk {#to_disk tag="method"} Save the current state to a directory. diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index f748fa8d6..f8baf5588 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -831,6 +831,8 @@ def tokenizer_pseudo_code( infixes = infix_finditer(substring) offset = 0 for match in infixes: + if offset == 0 and match.start() == 0: + continue tokens.append(substring[offset : match.start()]) tokens.append(substring[match.start() : match.end()]) offset = match.end() diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index e0e787a1d..57d226913 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -213,6 +213,12 @@ format, train a pipeline, evaluate it and export metrics, package it and spin up a quick web demo. It looks pretty similar to a config file used to define CI pipelines. +> #### Tip: Multi-line YAML syntax for long values +> +> YAML has [multi-line syntax](https://yaml-multiline.info/) that can be +> helpful for readability with longer values such as project descriptions or +> commands that take several arguments. 
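The `if offset == 0 and match.start() == 0: continue` guard that appears above in both `spacy/tokenizer.pyx` and the `linguistic-features.md` pseudocode can be sketched with plain `re`; the hyphen pattern and the input string below are illustrative assumptions, not spaCy's actual punctuation rules:

```python
import re

# A minimal sketch mirroring the simplified pseudocode in
# linguistic-features.md, with the new offset-0 guard included.
infix_finditer = re.compile(r"-").finditer
substring = "-hello-world"

tokens = []
offset = 0
for match in infix_finditer(substring):
    # New in this diff: ignore an infix match at the very start of the
    # substring, so the leading "-" stays attached to the following text.
    if offset == 0 and match.start() == 0:
        continue
    if substring[offset : match.start()]:
        tokens.append(substring[offset : match.start()])
    tokens.append(substring[match.start() : match.end()])
    offset = match.end()
if substring[offset:]:
    tokens.append(substring[offset:])

print(tokens)  # ['-hello', '-', 'world']
```

In this simplified sketch, removing the guard would yield `['-', 'hello', '-', 'world']`: a leading infix match would be split off as its own token, whereas the guard keeps it attached to the text that follows.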
+ ```yaml %%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml ``` diff --git a/website/meta/universe.json b/website/meta/universe.json index 8ac28c326..4ded8880f 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -141,7 +141,8 @@ "website": "https://www.nr.no/~plison" }, "category": ["pipeline", "standalone", "research", "training"], - "tags": [] + "tags": [], + "spacy_version": 3 }, { "id": "numerizer", @@ -952,6 +953,37 @@ "category": ["pipeline"], "tags": ["lemmatizer", "danish"] }, + { + "id": "augmenty", + "title": "Augmenty", + "slogan": "The cherry on top of your NLP pipeline", + "description": "Augmenty is an augmentation library based on spaCy for augmenting texts. Augmenty differs from other augmentation libraries in that it corrects (as far as possible) the token, sentence and document labels under the augmentation.", + "github": "kennethenevoldsen/augmenty", + "pip": "augmenty", + "code_example": [ + "import spacy", + "import augmenty", + "", + "nlp = spacy.load('en_core_web_md')", + "", + "docs = nlp.pipe(['Augmenty is a great tool for text augmentation'])", + "", + "ent_dict = {'ORG': [['spaCy'], ['spaCy', 'Universe']]}", + "entity_augmenter = augmenty.load('ents_replace.v1',", + " ent_dict = ent_dict, level=1)", + "", + "for doc in augmenty.docs(docs, augmenter=entity_augmenter, nlp=nlp):", + " print(doc)" + ], + "thumb": "https://github.com/KennethEnevoldsen/augmenty/blob/master/img/icon.png?raw=true", + "author": "Kenneth Enevoldsen", + "author_links": { + "github": "kennethenevoldsen", + "website": "https://www.kennethenevoldsen.com" + }, + "category": ["training", "research"], + "tags": ["training", "research", "augmentation"] + }, { "id": "dacy", "title": "DaCy", diff --git a/website/src/templates/universe.js b/website/src/templates/universe.js index cfc8fdd0e..10f2520d9 100644 --- a/website/src/templates/universe.js +++ b/website/src/templates/universe.js @@ -8,10 +8,11 @@ import Title from '../components/title' import Grid from '../components/grid' import Button from '../components/button' import Icon from '../components/icon' +import Tag from '../components/tag' import CodeBlock, { InlineCode } from '../components/code' import Aside from '../components/aside' import Sidebar from '../components/sidebar' -import Section from '../components/section' +import Section, { Hr } from '../components/section' import Main from '../components/main' import Footer from '../components/footer' import { H3, H5, Label, InlineList } from '../components/typography' @@ -121,6 +122,18 @@ const UniverseContent = ({ content = [], categories, theme, pageContext, mdxComp )} +
+                    <H3>
+                        Found a mistake or something isn't working?
+                    </H3>
+                    <p>
+                        If you've come across a universe project that isn't working or is
+                        incompatible with the reported spaCy version, let us know by{' '}
+                        <Link to="https://github.com/explosion/spaCy/discussions/new">
+                            opening a discussion thread
+                        </Link>
+                        .
+                    </p>

Submit your project

@@ -168,25 +181,41 @@ UniverseContent.propTypes = { mdxComponents: PropTypes.object, } +const SpaCyVersion = ({ version }) => { + const versions = !Array.isArray(version) ? [version] : version + return versions.map((v, i) => ( + <> + spaCy v{v}{' '} + + )) +} + const Project = ({ data, components }) => ( <> - {data.github && ( + {(data.github || data.spacy_version) && ( <p> - <Link to={`https://github.com/${data.github}`} hidden> - {[ - `release/${data.github}/all.svg?style=flat-square`, - `license/${data.github}.svg?style=flat-square`, - `stars/${data.github}.svg?style=social&label=Stars`, - ].map((url, i) => ( - <img - style={{ borderRadius: '1em', marginRight: '0.5rem' }} - key={i} - src={`https://img.shields.io/github/${url}`} - alt="" - /> - ))} - </Link> + {data.spacy_version && <SpaCyVersion version={data.spacy_version} />} + {data.github && ( + <Link to={`https://github.com/${data.github}`} hidden> + {[ + `release/${data.github}/all.svg?style=flat-square`, + `license/${data.github}.svg?style=flat-square`, + `stars/${data.github}.svg?style=social&label=Stars`, + ].map((url, i) => ( + <img + style={{ + borderRadius: '1em', + marginRight: '0.5rem', + verticalAlign: 'middle', + }} + key={i} + src={`https://img.shields.io/github/${url}`} + alt="" + /> + ))} + </Link> + )} </p> )} @@ -335,6 +364,7 @@ const query = graphql` url github description + spacy_version pip cran category
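Several hunks above swap `vocab.vectors.data.shape` and `vectors.data.size` for the `Vectors.shape` and `Vectors.size` properties, and `spacy/vectors.pyx` gains the `to_ops` method documented in `vectors.md`. A minimal sketch of the resulting API, using a made-up 3x4 table rather than anything from the diff:

```python
import numpy
from thinc.api import NumpyOps
from spacy.vectors import Vectors

# Hypothetical toy table: 3 rows, width 4.
vectors = Vectors(data=numpy.zeros((3, 4), dtype="f"))

# Dimensions are read from the Vectors object itself rather than from
# vectors.data:
assert vectors.shape == (3, 4)
assert vectors.size == 12  # rows * width, i.e. data.size

# New in this diff: move the underlying table to a different Thinc ops
# implementation, e.g. back to NumPy after running on the GPU.
vectors.to_ops(NumpyOps())
```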
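The `matcher.md` table above adds `ENT_IOB` as a pattern key, and `attrs.pyx` gains the shared `IOB_STRINGS` tuple that `Token.iob_strings` now reuses. A sketch of a pattern using the string form of the attribute; the blank pipeline, the hand-annotated entity and the rule name are illustrative assumptions:

```python
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Apple is looking at buying U.K. startup")
# Annotate one entity by hand so no trained pipeline is needed.
doc.ents = [Span(doc, 0, 1, label="ORG")]

# ENT_IOB matches on the IOB part of the token's entity tag;
# "B" marks the first token of an entity.
matcher = Matcher(nlp.vocab)
matcher.add("ENT_START", [[{"ENT_IOB": "B"}]])

for match_id, start, end in matcher(doc):
    print(doc[start:end].text)  # Apple
```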