diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index d0db75f9a..d1154756c 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -63,6 +63,16 @@ steps: # python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" # displayName: 'Test no warnings on load (#11713)' # condition: eq(variables['python_version'], '3.8') +# +# - script: | +# python -m spacy download ca_core_news_sm 2>&1 | grep -q skipping +# displayName: 'Test skip re-download (#12188)' +# condition: eq(variables['python_version'], '3.8') + +# - script: | +# python -W error -m spacy info ca_core_news_sm | grep -q download_url +# displayName: 'Test download_url in info CLI' +# condition: eq(variables['python_version'] '3.8') - script: | python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml index 70882c3cc..555322782 100644 --- a/.github/workflows/autoblack.yml +++ b/.github/workflows/autoblack.yml @@ -16,7 +16,7 @@ jobs: with: ref: ${{ github.head_ref }} - uses: actions/setup-python@v4 - - run: pip install black + - run: pip install black -c requirements.txt - name: Auto-format code if needed run: black spacy # We can't run black --check here because that returns a non-zero excit diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e2c5e98fd..8efe733f9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ repos: rev: 22.3.0 hooks: - id: black - language_version: python3.7 + language_version: python3.8 additional_dependencies: ['click==8.0.4'] - repo: https://github.com/pycqa/flake8 rev: 5.0.4 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1f396bd71..3c0b27c1d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -173,6 +173,11 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its Python modules. If you've built spaCy from source, you'll already have both tools installed. +As a general rule of thumb, we use f-strings for any formatting of strings. +One exception are calls to Python's `logging` functionality. +To avoid unnecessary string conversions in these cases, we use string formatting +templates with `%s` and `%d` etc. + **⚠️ Note that formatting and linting is currently only possible for Python modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.** @@ -271,7 +276,7 @@ except: # noqa: E722 ### Python conventions -All Python code must be written **compatible with Python 3.6+**. More detailed +All Python code must be written **compatible with Python 3.8+**. More detailed code conventions can be found in the [developer docs](https://github.com/explosion/spaCy/blob/master/extra/DEVELOPER_DOCS/Code%20Conventions.md). #### I/O and handling paths diff --git a/Makefile b/Makefile index 4de628663..24a9bcee4 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ override SPACY_EXTRAS = spacy-lookups-data==1.0.2 jieba spacy-pkuseg==0.0.28 sud endif ifndef PYVER -override PYVER = 3.6 +override PYVER = 3.8 endif VENV := ./env$(PYVER) diff --git a/README.md b/README.md index 195424551..bf8083e0e 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy model packaging, deployment and workflow management. spaCy is commercial open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE). 
-💫 **Version 3.4 out now!** +💫 **Version 3.5 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases) [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8) @@ -105,7 +105,7 @@ For detailed installation instructions, see the - **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual Studio) -- **Python version**: Python 3.6+ (only 64 bit) +- **Python version**: Python 3.8+ (only 64 bit) - **Package managers**: [pip] · [conda] (via `conda-forge`) [pip]: https://pypi.org/project/spacy/ diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 0f7ea91f9..9b7ebbe01 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -11,25 +11,39 @@ trigger: exclude: - "website/*" - "*.md" + - "*.mdx" - ".github/workflows/*" pr: paths: exclude: - "*.md" + - "*.mdx" - "website/docs/*" - "website/src/*" + - "website/meta/*.tsx" + - "website/meta/*.mjs" + - "website/meta/languages.json" + - "website/meta/site.json" + - "website/meta/sidebars.json" + - "website/meta/type-annotations.json" + - "website/pages/*" - ".github/workflows/*" jobs: - # Perform basic checks for most important errors (syntax etc.) Uses the config - # defined in .flake8 and overwrites the selected codes. + # Check formatting and linting. Perform basic checks for most important errors + # (syntax etc.) Uses the config defined in setup.cfg and overwrites the + # selected codes. - job: "Validate" pool: vmImage: "ubuntu-latest" steps: - task: UsePythonVersion@0 inputs: - versionSpec: "3.7" + versionSpec: "3.8" + - script: | + pip install black -c requirements.txt + python -m black spacy --check + displayName: "black" - script: | pip install flake8==5.0.4 python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics @@ -40,24 +54,6 @@ jobs: strategy: matrix: # We're only running one platform per Python version to speed up builds - Python36Linux: - imageName: "ubuntu-20.04" - python.version: "3.6" - # Python36Windows: - # imageName: "windows-latest" - # python.version: "3.6" - # Python36Mac: - # imageName: "macos-latest" - # python.version: "3.6" - # Python37Linux: - # imageName: "ubuntu-20.04" - # python.version: "3.7" - Python37Windows: - imageName: "windows-latest" - python.version: "3.7" - # Python37Mac: - # imageName: "macos-latest" - # python.version: "3.7" # Python38Linux: # imageName: "ubuntu-latest" # python.version: "3.8" diff --git a/pyproject.toml b/pyproject.toml index 7abd7a96f..837cf1fd8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.1.0,<8.2.0", + "thinc>=9.0.0.dev2,<9.1.0", "numpy>=1.15.0", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 1bd4518af..6f4b61918 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ # Our libraries -spacy-legacy>=3.0.11,<3.1.0 +spacy-legacy>=4.0.0.dev0,<4.1.0 spacy-loggers>=1.0.0,<2.0.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.1.0,<8.2.0 +thinc>=9.0.0.dev2,<9.1.0 ml_datasets>=0.2.0,<0.3.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.9.1,<1.2.0 @@ -22,7 +22,6 @@ langcodes>=3.2.0,<4.0.0 # Official Python utilities setuptools packaging>=20.0 -typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8" # Development dependencies pre-commit>=2.13.0 cython>=0.25,<3.0 @@ 
-31,10 +30,10 @@ pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 flake8>=3.8.0,<6.0.0 hypothesis>=3.27.0,<7.0.0 -mypy>=0.990,<0.1000; platform_machine != "aarch64" and python_version >= "3.7" +mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7" types-dataclasses>=0.1.3; python_version < "3.7" types-mock>=0.1.1 types-setuptools>=57.0.0 types-requests types-setuptools>=57.0.0 -black>=22.0,<23.0 +black==22.3.0 diff --git a/setup.cfg b/setup.cfg index cddc5148c..975ec03ce 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,8 +17,6 @@ classifiers = Operating System :: Microsoft :: Windows Programming Language :: Cython Programming Language :: Python :: 3 - Programming Language :: Python :: 3.6 - Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 @@ -31,23 +29,15 @@ project_urls = [options] zip_safe = false include_package_data = true -python_requires = >=3.6 -setup_requires = - cython>=0.25,<3.0 - numpy>=1.15.0 - # We also need our Cython packages here to compile against - cymem>=2.0.2,<2.1.0 - preshed>=3.0.2,<3.1.0 - murmurhash>=0.28.0,<1.1.0 - thinc>=8.1.0,<8.2.0 +python_requires = >=3.8 install_requires = # Our libraries - spacy-legacy>=3.0.11,<3.1.0 + spacy-legacy>=4.0.0.dev0,<4.1.0 spacy-loggers>=1.0.0,<2.0.0 murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.1.0,<8.2.0 + thinc>=9.0.0.dev2,<9.1.0 wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 @@ -63,7 +53,6 @@ install_requires = # Official Python utilities setuptools packaging>=20.0 - typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8" langcodes>=3.2.0,<4.0.0 [options.entry_points] @@ -120,7 +109,7 @@ ja = sudachipy>=0.5.2,!=0.6.1 sudachidict_core>=20211220 ko = - natto-py>=0.9.0 + mecab-ko>=1.0.0 th = pythainlp>=2.0 diff --git a/setup.py b/setup.py index 243554c7a..d5b82ec68 100755 --- a/setup.py +++ b/setup.py @@ -33,13 +33,10 @@ MOD_NAMES = [ "spacy.kb.candidate", "spacy.kb.kb", "spacy.kb.kb_in_memory", - "spacy.ml.parser_model", + "spacy.ml.tb_framework", "spacy.morphology", - "spacy.pipeline.dep_parser", "spacy.pipeline._edit_tree_internals.edit_trees", "spacy.pipeline.morphologizer", - "spacy.pipeline.multitask", - "spacy.pipeline.ner", "spacy.pipeline.pipe", "spacy.pipeline.trainable_pipe", "spacy.pipeline.sentencizer", @@ -47,12 +44,15 @@ MOD_NAMES = [ "spacy.pipeline.tagger", "spacy.pipeline.transition_parser", "spacy.pipeline._parser_internals.arc_eager", + "spacy.pipeline._parser_internals.batch", "spacy.pipeline._parser_internals.ner", "spacy.pipeline._parser_internals.nonproj", + "spacy.pipeline._parser_internals.search", "spacy.pipeline._parser_internals._state", "spacy.pipeline._parser_internals.stateclass", "spacy.pipeline._parser_internals.transition_system", "spacy.pipeline._parser_internals._beam_utils", + "spacy.pipeline._parser_internals._parser_utils", "spacy.tokenizer", "spacy.training.align", "spacy.training.gold_io", @@ -62,12 +62,13 @@ MOD_NAMES = [ "spacy.tokens.span_group", "spacy.tokens.graph", "spacy.tokens.morphanalysis", - "spacy.tokens._retokenize", + "spacy.tokens.retokenizer", "spacy.matcher.matcher", "spacy.matcher.phrasematcher", "spacy.matcher.dependencymatcher", "spacy.symbols", "spacy.vectors", + "spacy.tests.parser._search", ] COMPILE_OPTIONS = { "msvc": ["/Ox", "/EHsc"], diff --git a/spacy/about.py b/spacy/about.py index 640e9e93b..eddbeea09 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = 
"spacy" -__version__ = "3.5.0" +__version__ = "4.0.0.dev0" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index 33d5372de..b8a7a1f08 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -1,98 +1,49 @@ -# Reserve 64 values for flag features from . cimport symbols cdef enum attr_id_t: - NULL_ATTR - IS_ALPHA - IS_ASCII - IS_DIGIT - IS_LOWER - IS_PUNCT - IS_SPACE - IS_TITLE - IS_UPPER - LIKE_URL - LIKE_NUM - LIKE_EMAIL - IS_STOP - IS_OOV_DEPRECATED - IS_BRACKET - IS_QUOTE - IS_LEFT_PUNCT - IS_RIGHT_PUNCT - IS_CURRENCY + NULL_ATTR = 0 + IS_ALPHA = symbols.IS_ALPHA + IS_ASCII = symbols.IS_ASCII + IS_DIGIT = symbols.IS_DIGIT + IS_LOWER = symbols.IS_LOWER + IS_PUNCT = symbols.IS_PUNCT + IS_SPACE = symbols.IS_SPACE + IS_TITLE = symbols.IS_TITLE + IS_UPPER = symbols.IS_UPPER + LIKE_URL = symbols.LIKE_URL + LIKE_NUM = symbols.LIKE_NUM + LIKE_EMAIL = symbols.LIKE_EMAIL + IS_STOP = symbols.IS_STOP + IS_BRACKET = symbols.IS_BRACKET + IS_QUOTE = symbols.IS_QUOTE + IS_LEFT_PUNCT = symbols.IS_LEFT_PUNCT + IS_RIGHT_PUNCT = symbols.IS_RIGHT_PUNCT + IS_CURRENCY = symbols.IS_CURRENCY - FLAG19 = 19 - FLAG20 - FLAG21 - FLAG22 - FLAG23 - FLAG24 - FLAG25 - FLAG26 - FLAG27 - FLAG28 - FLAG29 - FLAG30 - FLAG31 - FLAG32 - FLAG33 - FLAG34 - FLAG35 - FLAG36 - FLAG37 - FLAG38 - FLAG39 - FLAG40 - FLAG41 - FLAG42 - FLAG43 - FLAG44 - FLAG45 - FLAG46 - FLAG47 - FLAG48 - FLAG49 - FLAG50 - FLAG51 - FLAG52 - FLAG53 - FLAG54 - FLAG55 - FLAG56 - FLAG57 - FLAG58 - FLAG59 - FLAG60 - FLAG61 - FLAG62 - FLAG63 + ID = symbols.ID + ORTH = symbols.ORTH + LOWER = symbols.LOWER + NORM = symbols.NORM + SHAPE = symbols.SHAPE + PREFIX = symbols.PREFIX + SUFFIX = symbols.SUFFIX - ID - ORTH - LOWER - NORM - SHAPE - PREFIX - SUFFIX + LENGTH = symbols.LENGTH + CLUSTER = symbols.CLUSTER + LEMMA = symbols.LEMMA + POS = symbols.POS + TAG = symbols.TAG + DEP = symbols.DEP + ENT_IOB = symbols.ENT_IOB + ENT_TYPE = symbols.ENT_TYPE + HEAD = symbols.HEAD + SENT_START = symbols.SENT_START + SPACY = symbols.SPACY + PROB = symbols.PROB - LENGTH - CLUSTER - LEMMA - POS - TAG - DEP - ENT_IOB - ENT_TYPE - HEAD - SENT_START - SPACY - PROB - - LANG + LANG = symbols.LANG ENT_KB_ID = symbols.ENT_KB_ID - MORPH + MORPH = symbols.MORPH ENT_ID = symbols.ENT_ID - IDX - SENT_END \ No newline at end of file + IDX = symbols.IDX diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index dc8eed7c3..9b0ae3400 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -16,57 +16,11 @@ IDS = { "LIKE_NUM": LIKE_NUM, "LIKE_EMAIL": LIKE_EMAIL, "IS_STOP": IS_STOP, - "IS_OOV_DEPRECATED": IS_OOV_DEPRECATED, "IS_BRACKET": IS_BRACKET, "IS_QUOTE": IS_QUOTE, "IS_LEFT_PUNCT": IS_LEFT_PUNCT, "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT, "IS_CURRENCY": IS_CURRENCY, - "FLAG19": FLAG19, - "FLAG20": FLAG20, - "FLAG21": FLAG21, - "FLAG22": FLAG22, - "FLAG23": FLAG23, - "FLAG24": FLAG24, - "FLAG25": FLAG25, - "FLAG26": FLAG26, - "FLAG27": FLAG27, - "FLAG28": FLAG28, - "FLAG29": FLAG29, - "FLAG30": FLAG30, - "FLAG31": FLAG31, - "FLAG32": FLAG32, - "FLAG33": FLAG33, - "FLAG34": FLAG34, - "FLAG35": FLAG35, - "FLAG36": FLAG36, - "FLAG37": FLAG37, - "FLAG38": FLAG38, - "FLAG39": FLAG39, - "FLAG40": FLAG40, - "FLAG41": FLAG41, - "FLAG42": FLAG42, - "FLAG43": FLAG43, - "FLAG44": FLAG44, - "FLAG45": FLAG45, - "FLAG46": FLAG46, - "FLAG47": FLAG47, - "FLAG48": FLAG48, - "FLAG49": FLAG49, - 
"FLAG50": FLAG50, - "FLAG51": FLAG51, - "FLAG52": FLAG52, - "FLAG53": FLAG53, - "FLAG54": FLAG54, - "FLAG55": FLAG55, - "FLAG56": FLAG56, - "FLAG57": FLAG57, - "FLAG58": FLAG58, - "FLAG59": FLAG59, - "FLAG60": FLAG60, - "FLAG61": FLAG61, - "FLAG62": FLAG62, - "FLAG63": FLAG63, "ID": ID, "ORTH": ORTH, "LOWER": LOWER, @@ -92,12 +46,11 @@ IDS = { } -# ATTR IDs, in order of the symbol -NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] +NAMES = {v: k for k, v in IDS.items()} locals().update(IDS) -def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): +def intify_attrs(stringy_attrs, strings_map=None): """ Normalize a dictionary of attributes, converting them to ints. @@ -109,75 +62,6 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): converted to ints. """ inty_attrs = {} - if _do_deprecated: - if "F" in stringy_attrs: - stringy_attrs["ORTH"] = stringy_attrs.pop("F") - if "L" in stringy_attrs: - stringy_attrs["LEMMA"] = stringy_attrs.pop("L") - if "pos" in stringy_attrs: - stringy_attrs["TAG"] = stringy_attrs.pop("pos") - if "morph" in stringy_attrs: - morphs = stringy_attrs.pop("morph") - if "number" in stringy_attrs: - stringy_attrs.pop("number") - if "tenspect" in stringy_attrs: - stringy_attrs.pop("tenspect") - morph_keys = [ - "PunctType", - "PunctSide", - "Other", - "Degree", - "AdvType", - "Number", - "VerbForm", - "PronType", - "Aspect", - "Tense", - "PartType", - "Poss", - "Hyph", - "ConjType", - "NumType", - "Foreign", - "VerbType", - "NounType", - "Gender", - "Mood", - "Negative", - "Tense", - "Voice", - "Abbr", - "Derivation", - "Echo", - "Foreign", - "NameType", - "NounType", - "NumForm", - "NumValue", - "PartType", - "Polite", - "StyleVariant", - "PronType", - "AdjType", - "Person", - "Variant", - "AdpType", - "Reflex", - "Negative", - "Mood", - "Aspect", - "Case", - "Polarity", - "PrepCase", - "Animacy", # U20 - ] - for key in morph_keys: - if key in stringy_attrs: - stringy_attrs.pop(key) - elif key.lower() in stringy_attrs: - stringy_attrs.pop(key.lower()) - elif key.upper() in stringy_attrs: - stringy_attrs.pop(key.upper()) for name, value in stringy_attrs.items(): int_key = intify_attr(name) if int_key is not None: diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index ba3892b1d..d763fba1f 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -1,4 +1,4 @@ -from typing import Dict, Any, Union, List, Optional, Tuple, Iterable +from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, Literal from typing import TYPE_CHECKING, overload import sys import shutil @@ -16,10 +16,10 @@ from thinc.util import gpu_is_available from configparser import InterpolationError import os -from ..compat import Literal from ..schemas import ProjectConfigSchema, validate from ..util import import_file, run_command, make_tempdir, registry, logger from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS +from ..errors import RENAMED_LANGUAGE_CODES from .. 
import about if TYPE_CHECKING: @@ -90,9 +90,9 @@ def parse_config_overrides( cli_overrides = _parse_overrides(args, is_cli=True) if cli_overrides: keys = [k for k in cli_overrides if k not in env_overrides] - logger.debug(f"Config overrides from CLI: {keys}") + logger.debug("Config overrides from CLI: %s", keys) if env_overrides: - logger.debug(f"Config overrides from env variables: {list(env_overrides)}") + logger.debug("Config overrides from env variables: %s", list(env_overrides)) return {**cli_overrides, **env_overrides} @@ -135,6 +135,16 @@ def _parse_override(value: Any) -> Any: return str(value) +def _handle_renamed_language_codes(lang: Optional[str]) -> None: + # Throw error for renamed language codes in v4 + if lang in RENAMED_LANGUAGE_CODES: + msg.fail( + title="Renamed language code", + text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in spaCy v4. Update the language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.", + exits=1, + ) + + def load_project_config( path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict() ) -> Dict[str, Any]: diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 68d454b3e..66f9461a9 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -7,7 +7,7 @@ import re import sys import itertools -from ._util import app, Arg, Opt, walk_directory +from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory from ..training import docs_to_json from ..tokens import Doc, DocBin from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs @@ -112,6 +112,10 @@ def convert( input_path = Path(input_path) if not msg: msg = Printer(no_print=silent) + + # Throw error for renamed language codes in v4 + _handle_renamed_language_codes(lang) + ner_map = srsly.read_json(ner_map) if ner_map is not None else None doc_files = [] for input_loc in walk_directory(input_path, converter): diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index a85324e87..1c242cec8 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -1,5 +1,5 @@ from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union -from typing import cast, overload +from typing import Literal, cast, overload from pathlib import Path from collections import Counter import sys @@ -17,10 +17,10 @@ from ..pipeline import TrainablePipe from ..pipeline._parser_internals import nonproj from ..pipeline._parser_internals.nonproj import DELIMITER from ..pipeline import Morphologizer, SpanCategorizer +from ..pipeline._edit_tree_internals.edit_trees import EditTrees from ..morphology import Morphology from ..language import Language from ..util import registry, resolve_dot_names -from ..compat import Literal from ..vectors import Mode as VectorsMode from .. import util @@ -671,6 +671,59 @@ def debug_data( f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles" ) + if "trainable_lemmatizer" in factory_names: + msg.divider("Trainable Lemmatizer") + trees_train: Set[str] = gold_train_data["lemmatizer_trees"] + trees_dev: Set[str] = gold_dev_data["lemmatizer_trees"] + # This is necessary context when someone is attempting to interpret whether the + # number of trees exclusively in the dev set is meaningful. 
+ msg.info(f"{len(trees_train)} lemmatizer trees generated from training data") + msg.info(f"{len(trees_dev)} lemmatizer trees generated from dev data") + dev_not_train = trees_dev - trees_train + + if len(dev_not_train) != 0: + pct = len(dev_not_train) / len(trees_dev) + msg.info( + f"{len(dev_not_train)} lemmatizer trees ({pct*100:.1f}% of dev trees)" + " were found exclusively in the dev data." + ) + else: + # Would we ever expect this case? It seems like it would be pretty rare, + # and we might actually want a warning? + msg.info("All trees in dev data present in training data.") + + if gold_train_data["n_low_cardinality_lemmas"] > 0: + n = gold_train_data["n_low_cardinality_lemmas"] + msg.warn(f"{n} training docs with 0 or 1 unique lemmas.") + + if gold_dev_data["n_low_cardinality_lemmas"] > 0: + n = gold_dev_data["n_low_cardinality_lemmas"] + msg.warn(f"{n} dev docs with 0 or 1 unique lemmas.") + + if gold_train_data["no_lemma_annotations"] > 0: + n = gold_train_data["no_lemma_annotations"] + msg.warn(f"{n} training docs with no lemma annotations.") + else: + msg.good("All training docs have lemma annotations.") + + if gold_dev_data["no_lemma_annotations"] > 0: + n = gold_dev_data["no_lemma_annotations"] + msg.warn(f"{n} dev docs with no lemma annotations.") + else: + msg.good("All dev docs have lemma annotations.") + + if gold_train_data["partial_lemma_annotations"] > 0: + n = gold_train_data["partial_lemma_annotations"] + msg.info(f"{n} training docs with partial lemma annotations.") + else: + msg.good("All training docs have complete lemma annotations.") + + if gold_dev_data["partial_lemma_annotations"] > 0: + n = gold_dev_data["partial_lemma_annotations"] + msg.info(f"{n} dev docs with partial lemma annotations.") + else: + msg.good("All dev docs have complete lemma annotations.") + msg.divider("Summary") good_counts = msg.counts[MESSAGES.GOOD] warn_counts = msg.counts[MESSAGES.WARN] @@ -732,7 +785,13 @@ def _compile_gold( "n_cats_multilabel": 0, "n_cats_bad_values": 0, "texts": set(), + "lemmatizer_trees": set(), + "no_lemma_annotations": 0, + "partial_lemma_annotations": 0, + "n_low_cardinality_lemmas": 0, } + if "trainable_lemmatizer" in factory_names: + trees = EditTrees(nlp.vocab.strings) for eg in examples: gold = eg.reference doc = eg.predicted @@ -862,6 +921,25 @@ def _compile_gold( data["n_nonproj"] += 1 if nonproj.contains_cycle(aligned_heads): data["n_cycles"] += 1 + if "trainable_lemmatizer" in factory_names: + # from EditTreeLemmatizer._labels_from_data + if all(token.lemma == 0 for token in gold): + data["no_lemma_annotations"] += 1 + continue + if any(token.lemma == 0 for token in gold): + data["partial_lemma_annotations"] += 1 + lemma_set = set() + for token in gold: + if token.lemma != 0: + lemma_set.add(token.lemma) + tree_id = trees.add(token.text, token.lemma_) + tree_str = trees.tree_to_str(tree_id) + data["lemmatizer_trees"].add(tree_str) + # We want to identify cases where lemmas aren't assigned + # or are all assigned the same value, as this would indicate + # an issue since we're expecting a large set of lemmas + if len(lemma_set) < 2 and len(gold) > 1: + data["n_low_cardinality_lemmas"] += 1 return data diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 0c9a32b93..90471c55e 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -7,8 +7,8 @@ import typer from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX from .. 
import about from ..util import is_package, get_minor_version, run_command -from ..util import is_prerelease_version -from ..errors import OLD_MODEL_SHORTCUTS +from ..util import is_prerelease_version, get_installed_models +from ..util import get_package_version @app.command( @@ -61,15 +61,17 @@ def download( version = components[-1] else: model_name = model - if model in OLD_MODEL_SHORTCUTS: - msg.warn( - f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please " - f"use the full pipeline package name '{OLD_MODEL_SHORTCUTS[model]}' instead." - ) - model_name = OLD_MODEL_SHORTCUTS[model] compatibility = get_compatibility() version = get_version(model_name, compatibility) + # If we already have this version installed, skip downloading + installed = get_installed_models() + if model_name in installed: + installed_version = get_package_version(model_name) + if installed_version == version: + msg.warn(f"{model_name} v{version} already installed, skipping") + return + filename = get_model_filename(model_name, version, sdist) download_model(filename, pip_args) diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 974bc0f4e..23b69a81d 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -1,10 +1,10 @@ from typing import Optional, Dict, Any, Union, List import platform -import pkg_resources import json from pathlib import Path from wasabi import Printer, MarkdownRenderer import srsly +import importlib.metadata from ._util import app, Arg, Opt, string_to_list from .download import get_model_filename, get_latest_version @@ -137,15 +137,14 @@ def info_installed_model_url(model: str) -> Optional[str]: dist-info available. """ try: - dist = pkg_resources.get_distribution(model) - data = json.loads(dist.get_metadata("direct_url.json")) - return data["url"] - except pkg_resources.DistributionNotFound: - # no such package - return None + dist = importlib.metadata.distribution(model) + text = dist.read_text("direct_url.json") + if isinstance(text, str): + data = json.loads(text) + return data["url"] except Exception: - # something else, like no file or invalid JSON - return None + pass + return None def info_model_url(model: str) -> Dict[str, Any]: diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index b634caa4c..40e598e5f 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -8,11 +8,11 @@ import re from jinja2 import Template from .. 
import util -from ..language import DEFAULT_CONFIG_PRETRAIN_PATH +from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH from ..schemas import RecommendationSchema from ..util import SimpleFrozenList from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND -from ._util import string_to_list, import_code +from ._util import string_to_list, import_code, _handle_renamed_language_codes ROOT = Path(__file__).parent / "templates" @@ -43,7 +43,7 @@ class InitValues: def init_config_cli( # fmt: off output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True), - lang: str = Opt(InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use"), + lang: str = Opt(InitValues.lang, "--lang", "-l", help="Code of the language to use"), pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"), optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."), gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."), @@ -83,6 +83,7 @@ def init_fill_config_cli( # fmt: off base_path: Path = Arg(..., help="Path to base config to fill", exists=True, dir_okay=False), output_file: Path = Arg("-", help="Path to output .cfg file (or - for stdout)", allow_dash=True), + distillation: bool = Opt(False, "--distillation", "-dt", help="Include config for distillation (with 'spacy distill')"), pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"), diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"), code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), @@ -98,13 +99,20 @@ def init_fill_config_cli( DOCS: https://spacy.io/api/cli#init-fill-config """ import_code(code_path) - fill_config(output_file, base_path, pretraining=pretraining, diff=diff) + fill_config( + output_file, + base_path, + distillation=distillation, + pretraining=pretraining, + diff=diff, + ) def fill_config( output_file: Path, base_path: Path, *, + distillation: bool = False, pretraining: bool = False, diff: bool = False, silent: bool = False, @@ -123,6 +131,9 @@ def fill_config( # replaced with their actual config after loading, so we have to re-add them sourced = util.get_sourced_components(config) filled["components"].update(sourced) + if distillation: + distillation_config = util.load_config(DEFAULT_CONFIG_DISTILL_PATH) + filled = distillation_config.merge(filled) if pretraining: validate_config_for_pretrain(filled, msg) pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH) @@ -158,6 +169,10 @@ def init_config( msg = Printer(no_print=silent) with TEMPLATE_PATH.open("r") as f: template = Template(f.read()) + + # Throw error for renamed language codes in v4 + _handle_renamed_language_codes(lang) + # Filter out duplicates since tok2vec and transformer are added by template pipeline = [pipe for 
pipe in pipeline if pipe not in ("tok2vec", "transformer")] defaults = RECOMMENDATIONS["__default__"] diff --git a/spacy/cli/init_pipeline.py b/spacy/cli/init_pipeline.py index d53a61b8e..5d5c14957 100644 --- a/spacy/cli/init_pipeline.py +++ b/spacy/cli/init_pipeline.py @@ -9,7 +9,7 @@ from .. import util from ..training.initialize import init_nlp, convert_vectors from ..language import Language from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error -from ._util import import_code, setup_gpu +from ._util import import_code, setup_gpu, _handle_renamed_language_codes @init_cli.command("vectors") @@ -21,7 +21,6 @@ def init_vectors_cli( prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"), truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"), - name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True), # fmt: on @@ -31,6 +30,10 @@ def init_vectors_cli( a model with vectors. """ util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) + + # Throw error for renamed language codes in v4 + _handle_renamed_language_codes(lang) + msg.info(f"Creating blank nlp object for language '{lang}'") nlp = util.get_lang_class(lang)() if jsonl_loc is not None: @@ -40,7 +43,6 @@ def init_vectors_cli( vectors_loc, truncate=truncate, prune=prune, - name=name, mode=mode, ) msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors") diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 324c5d1bb..6351f28eb 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -252,7 +252,7 @@ def get_third_party_dependencies( raise regerr from None module_name = func_info.get("module") # type: ignore[attr-defined] if module_name: # the code is part of a module, not a --code file - modules.add(func_info["module"].split(".")[0]) # type: ignore[index] + modules.add(func_info["module"].split(".")[0]) # type: ignore[union-attr] dependencies = [] for module_name in modules: if module_name in distributions: diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py index 6e3cde88c..8894baa50 100644 --- a/spacy/cli/project/pull.py +++ b/spacy/cli/project/pull.py @@ -39,14 +39,17 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False): # in the list. while commands: for i, cmd in enumerate(list(commands)): - logger.debug(f"CMD: {cmd['name']}.") + logger.debug("CMD: %s.", cmd["name"]) deps = [project_dir / dep for dep in cmd.get("deps", [])] if all(dep.exists() for dep in deps): cmd_hash = get_command_hash("", "", deps, cmd["script"]) for output_path in cmd.get("outputs", []): url = storage.pull(output_path, command_hash=cmd_hash) logger.debug( - f"URL: {url} for {output_path} with command hash {cmd_hash}" + "URL: %s for %s with command hash %s", + url, + output_path, + cmd_hash, ) yield url, output_path @@ -58,7 +61,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False): commands.pop(i) break else: - logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.") + logger.debug("Dependency missing. 
Skipping %s outputs.", cmd["name"]) else: # If we didn't break the for loop, break the while loop. break diff --git a/spacy/cli/project/push.py b/spacy/cli/project/push.py index bc779e9cd..a8178de21 100644 --- a/spacy/cli/project/push.py +++ b/spacy/cli/project/push.py @@ -37,15 +37,15 @@ def project_push(project_dir: Path, remote: str): remote = config["remotes"][remote] storage = RemoteStorage(project_dir, remote) for cmd in config.get("commands", []): - logger.debug(f"CMD: cmd['name']") + logger.debug("CMD: %s", cmd["name"]) deps = [project_dir / dep for dep in cmd.get("deps", [])] if any(not dep.exists() for dep in deps): - logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs") + logger.debug("Dependency missing. Skipping %s outputs", cmd["name"]) continue cmd_hash = get_command_hash( "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"] ) - logger.debug(f"CMD_HASH: {cmd_hash}") + logger.debug("CMD_HASH: %s", cmd_hash) for output_path in cmd.get("outputs", []): output_loc = project_dir / output_path if output_loc.exists() and _is_not_empty_dir(output_loc): @@ -55,7 +55,7 @@ def project_push(project_dir: Path, remote: str): content_hash=get_content_hash(output_loc), ) logger.debug( - f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}" + "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash ) yield output_path, url diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index 6dd174902..0f4858a99 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple import os.path from pathlib import Path -import pkg_resources from wasabi import msg from wasabi.util import locale_escape import sys @@ -331,6 +330,7 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]: RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts exist. 
""" + import pkg_resources failed_pkgs_msgs: List[str] = [] conflicting_pkgs_msgs: List[str] = [] diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index b961ac892..eb48d1de5 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -87,12 +87,11 @@ grad_factor = 1.0 factory = "parser" [components.parser.model] -@architectures = "spacy.TransitionBasedParser.v2" +@architectures = "spacy.TransitionBasedParser.v3" state_type = "parser" extra_state_tokens = false hidden_width = 128 maxout_pieces = 3 -use_upper = false nO = null [components.parser.model.tok2vec] @@ -108,12 +107,11 @@ grad_factor = 1.0 factory = "ner" [components.ner.model] -@architectures = "spacy.TransitionBasedParser.v2" +@architectures = "spacy.TransitionBasedParser.v3" state_type = "ner" extra_state_tokens = false hidden_width = 64 maxout_pieces = 2 -use_upper = false nO = null [components.ner.model.tok2vec] @@ -314,12 +312,11 @@ width = ${components.tok2vec.model.encode.width} factory = "parser" [components.parser.model] -@architectures = "spacy.TransitionBasedParser.v2" +@architectures = "spacy.TransitionBasedParser.v3" state_type = "parser" extra_state_tokens = false hidden_width = 128 maxout_pieces = 3 -use_upper = true nO = null [components.parser.model.tok2vec] @@ -332,12 +329,11 @@ width = ${components.tok2vec.model.encode.width} factory = "ner" [components.ner.model] -@architectures = "spacy.TransitionBasedParser.v2" +@architectures = "spacy.TransitionBasedParser.v3" state_type = "ner" extra_state_tokens = false hidden_width = 64 maxout_pieces = 2 -use_upper = true nO = null [components.ner.model.tok2vec] diff --git a/spacy/compat.py b/spacy/compat.py index 89132735d..5344b7cd4 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -22,19 +22,6 @@ try: except ImportError: cupy = None -if sys.version_info[:2] >= (3, 8): # Python 3.8+ - from typing import Literal, Protocol, runtime_checkable -else: - from typing_extensions import Literal, Protocol, runtime_checkable # noqa: F401 - -# Important note: The importlib_metadata "backport" includes functionality -# that's not part of the built-in importlib.metadata. We should treat this -# import like the built-in and only use what's available there. 
-try: # Python 3.8+ - import importlib.metadata as importlib_metadata -except ImportError: - from catalogue import _importlib_metadata as importlib_metadata # type: ignore[no-redef] # noqa: F401 - from thinc.api import Optimizer # noqa: F401 pickle = pickle diff --git a/spacy/default_config_distillation.cfg b/spacy/default_config_distillation.cfg new file mode 100644 index 000000000..1926fafa9 --- /dev/null +++ b/spacy/default_config_distillation.cfg @@ -0,0 +1,34 @@ +[paths] +raw_text = null + +[distillation] +corpus = "corpora.distillation" +dropout = 0.1 +max_epochs = 1 +max_steps = 0 +student_to_teacher = {} + +[distillation.batcher] +@batchers = "spacy.batch_by_words.v1" +size = 3000 +discard_oversize = false +tolerance = 0.2 + +[distillation.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 1e-8 +learn_rate = 1e-4 + +[corpora] + +[corpora.distillation] +@readers = "spacy.PlainTextCorpus.v1" +path = ${paths.raw_text} +min_length = 0 +max_length = 0 diff --git a/spacy/errors.py b/spacy/errors.py index d143e341c..390de126e 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -1,5 +1,5 @@ +from typing import Literal import warnings -from .compat import Literal class ErrorsWithCodes(type): @@ -131,13 +131,6 @@ class Warnings(metaclass=ErrorsWithCodes): "and make it independent. For example, `replace_listeners = " "[\"model.tok2vec\"]` See the documentation for details: " "https://spacy.io/usage/training#config-components-listeners") - W088 = ("The pipeline component {name} implements a `begin_training` " - "method, which won't be called by spaCy. As of v3.0, `begin_training` " - "has been renamed to `initialize`, so you likely want to rename the " - "component method. See the documentation for details: " - "https://spacy.io/api/language#initialize") - W089 = ("As of spaCy v3.0, the `nlp.begin_training` method has been renamed " - "to `nlp.initialize`.") W090 = ("Could not locate any {format} files in path '{path}'.") W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.") W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.") @@ -216,6 +209,8 @@ class Warnings(metaclass=ErrorsWithCodes): "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.") W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.") + W400 = ("`use_upper=False` is ignored, the upper layer is always enabled") + class Errors(metaclass=ErrorsWithCodes): E001 = ("No component '{name}' found in pipeline. Available names: {opts}") @@ -251,9 +246,7 @@ class Errors(metaclass=ErrorsWithCodes): "https://spacy.io/usage/models") E011 = ("Unknown operator: '{op}'. Options: {opts}") E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}") - E016 = ("MultitaskObjective target should be function or one of: dep, " - "tag, ent, dep_tag_offset, ent_tag.") - E017 = ("Can only add unicode or bytes. Got type: {value_type}") + E017 = ("Can only add 'str' inputs to StringStore. Got type: {value_type}") E018 = ("Can't retrieve string for hash '{hash_value}'. This usually " "refers to an issue with the `Vocab` or `StringStore`.") E019 = ("Can't create transition with unknown action ID: {action}. 
Action " @@ -444,8 +437,7 @@ class Errors(metaclass=ErrorsWithCodes): E133 = ("The sum of prior probabilities for alias '{alias}' should not " "exceed 1, but found {sum}.") E134 = ("Entity '{entity}' is not defined in the Knowledge Base.") - E139 = ("Knowledge base for component '{name}' is empty. Use the methods " - "`kb.add_entity` and `kb.add_alias` to add entries.") + E139 = ("Knowledge base for component '{name}' is empty.") E140 = ("The list of entities, prior probabilities and entity vectors " "should be of equal length.") E141 = ("Entity vectors should be of length {required} instead of the " @@ -466,13 +458,13 @@ class Errors(metaclass=ErrorsWithCodes): "same, but found '{nlp}' and '{vocab}' respectively.") E152 = ("The attribute {attr} is not supported for token patterns. " "Please use the option `validate=True` with the Matcher, PhraseMatcher, " - "EntityRuler or AttributeRuler for more details.") + "SpanRuler or AttributeRuler for more details.") E153 = ("The value type {vtype} is not supported for token patterns. " "Please use the option validate=True with Matcher, PhraseMatcher, " - "EntityRuler or AttributeRuler for more details.") + "SpanRuler or AttributeRuler for more details.") E154 = ("One of the attributes or values is not supported for token " "patterns. Please use the option `validate=True` with the Matcher, " - "PhraseMatcher, or EntityRuler for more details.") + "PhraseMatcher, or SpanRuler for more details.") E155 = ("The pipeline needs to include a {pipe} in order to use " "Matcher or PhraseMatcher with the attribute {attr}. " "Try using `nlp()` instead of `nlp.make_doc()` or `list(nlp.pipe())` " @@ -496,7 +488,7 @@ class Errors(metaclass=ErrorsWithCodes): "Current DocBin: {current}\nOther DocBin: {other}") E169 = ("Can't find module: {module}") E170 = ("Cannot apply transition {name}: invalid for the current state.") - E171 = ("Matcher.add received invalid 'on_match' callback argument: expected " + E171 = ("{name}.add received invalid 'on_match' callback argument: expected " "callable or None, but got: {arg_type}") E175 = ("Can't remove rule for unknown match pattern ID: {key}") E176 = ("Alias '{alias}' is not defined in the Knowledge Base.") @@ -733,13 +725,6 @@ class Errors(metaclass=ErrorsWithCodes): "method in component '{name}'. If you want to use this " "method, make sure it's overwritten on the subclass.") E940 = ("Found NaN values in scores.") - E941 = ("Can't find model '{name}'. It looks like you're trying to load a " - "model from a shortcut, which is obsolete as of spaCy v3.0. To " - "load the model, use its full name instead:\n\n" - "nlp = spacy.load(\"{full}\")\n\nFor more details on the available " - "models, see the models directory: https://spacy.io/models. If you " - "want to create a blank model, use spacy.blank: " - "nlp = spacy.blank(\"{name}\")") E942 = ("Executing `after_{name}` callback failed. Expected the function to " "return an initialized nlp object but got: {value}. Maybe " "you forgot to return the modified object in your function?") @@ -753,7 +738,7 @@ class Errors(metaclass=ErrorsWithCodes): "loaded nlp object, but got: {source}") E947 = ("`Matcher.add` received invalid `greedy` argument: expected " "a string value from {expected} but got: '{arg}'") - E948 = ("`Matcher.add` received invalid 'patterns' argument: expected " + E948 = ("`{name}.add` received invalid 'patterns' argument: expected " "a list, but got: {arg_type}") E949 = ("Unable to align tokens for the predicted and reference docs. 
It " "is only possible to align the docs when both texts are the same " @@ -927,8 +912,6 @@ class Errors(metaclass=ErrorsWithCodes): E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. " "Non-UD tags should use the `tag` property.") E1022 = ("Words must be of type str or int, but input is of type '{wtype}'") - E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't " - "exist.") E1024 = ("A pattern with {attr_type} '{label}' is not present in " "'{component}' patterns.") E1025 = ("Cannot intify the value '{value}' as an IOB string. The only " @@ -967,17 +950,19 @@ class Errors(metaclass=ErrorsWithCodes): E1049 = ("No available port found for displaCy on host {host}. Please specify an available port " "with `displacy.serve(doc, port=port)`") E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` " - "or use `auto_switch_port=True` to pick an available port automatically.") + "or use `auto_select_port=True` to pick an available port automatically.") + # v4 error strings + E4000 = ("Expected a Doc as input, but got: '{type}'") + E4001 = ("Expected input to be one of the following types: ({expected_types}), " + "but got '{received_type}'") + E4002 = ("Pipe '{name}' requires a teacher pipe for distillation.") + E4003 = ("Training examples for distillation must have the exact same tokens in the " + "reference and predicted docs.") + E4004 = ("Backprop is not supported when is_train is not set.") + E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.") -# Deprecated model shortcuts, only used in errors and warnings -OLD_MODEL_SHORTCUTS = { - "en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm", - "pt": "pt_core_news_sm", "fr": "fr_core_news_sm", "it": "it_core_news_sm", - "nl": "nl_core_news_sm", "el": "el_core_news_sm", "nb": "nb_core_news_sm", - "lt": "lt_core_news_sm", "xx": "xx_ent_wiki_sm" -} - +RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"} # fmt: on diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index 485e52c2f..2a74d047b 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -25,7 +25,7 @@ cdef class InMemoryLookupKB(KnowledgeBase): """An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases, to support entity linking of named entities to real-world concepts. 
- DOCS: https://spacy.io/api/kb_in_memory + DOCS: https://spacy.io/api/inmemorylookupkb """ def __init__(self, Vocab vocab, entity_vector_length): @@ -46,6 +46,9 @@ cdef class InMemoryLookupKB(KnowledgeBase): self._alias_index = PreshMap(nr_aliases + 1) self._aliases_table = alias_vec(nr_aliases + 1) + def is_empty(self): + return len(self) == 0 + def __len__(self): return self.get_size_entities() diff --git a/spacy/lang/is/__init__.py b/spacy/lang/isl/__init__.py similarity index 93% rename from spacy/lang/is/__init__.py rename to spacy/lang/isl/__init__.py index 318363beb..16d1f7957 100644 --- a/spacy/lang/is/__init__.py +++ b/spacy/lang/isl/__init__.py @@ -7,7 +7,7 @@ class IcelandicDefaults(BaseDefaults): class Icelandic(Language): - lang = "is" + lang = "isl" Defaults = IcelandicDefaults diff --git a/spacy/lang/is/stop_words.py b/spacy/lang/isl/stop_words.py similarity index 100% rename from spacy/lang/is/stop_words.py rename to spacy/lang/isl/stop_words.py diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index 0e02e4a2d..1220aa141 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -18,34 +18,23 @@ DEFAULT_CONFIG = """ [nlp.tokenizer] @tokenizers = "spacy.ko.KoreanTokenizer" +mecab_args = "" """ @registry.tokenizers("spacy.ko.KoreanTokenizer") -def create_tokenizer(): +def create_tokenizer(mecab_args: str): def korean_tokenizer_factory(nlp): - return KoreanTokenizer(nlp.vocab) + return KoreanTokenizer(nlp.vocab, mecab_args=mecab_args) return korean_tokenizer_factory class KoreanTokenizer(DummyTokenizer): - def __init__(self, vocab: Vocab): + def __init__(self, vocab: Vocab, *, mecab_args: str = ""): self.vocab = vocab - self._mecab = try_mecab_import() # type: ignore[func-returns-value] - self._mecab_tokenizer = None - - @property - def mecab_tokenizer(self): - # This is a property so that initializing a pipeline with blank:ko is - # possible without actually requiring mecab-ko, e.g. to run - # `spacy init vectors ko` for a pipeline that will have a different - # tokenizer in the end. The languages need to match for the vectors - # to be imported and there's no way to pass a custom config to - # `init vectors`. 
- if self._mecab_tokenizer is None: - self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]") - return self._mecab_tokenizer + mecab = try_mecab_import() + self.mecab_tokenizer = mecab.Tagger(mecab_args) def __reduce__(self): return KoreanTokenizer, (self.vocab,) @@ -68,13 +57,15 @@ class KoreanTokenizer(DummyTokenizer): def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]: # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3], # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], * - for node in self.mecab_tokenizer.parse(text, as_nodes=True): - if node.is_eos(): + for line in self.mecab_tokenizer.parse(text).split("\n"): + if line == "EOS": break - surface = node.surface - feature = node.feature - tag, _, expr = feature.partition(",") - lemma, _, remainder = expr.partition("/") + surface, _, expr = line.partition("\t") + features = expr.split("/")[0].split(",") + tag = features[0] + lemma = "*" + if len(features) >= 8: + lemma = features[7] if lemma == "*": lemma = surface yield {"surface": surface, "lemma": lemma, "tag": tag} @@ -97,20 +88,94 @@ class Korean(Language): Defaults = KoreanDefaults -def try_mecab_import() -> None: +def try_mecab_import(): try: - from natto import MeCab + import mecab_ko as MeCab return MeCab except ImportError: raise ImportError( 'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires ' - "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), " - "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), " - "and [natto-py](https://github.com/buruzaemon/natto-py)" + "the python package `mecab-ko`: pip install mecab-ko" ) from None +@registry.tokenizers("spacy.KoreanNattoTokenizer.v1") +def create_natto_tokenizer(): + def korean_natto_tokenizer_factory(nlp): + return KoreanNattoTokenizer(nlp.vocab) + + return korean_natto_tokenizer_factory + + +class KoreanNattoTokenizer(DummyTokenizer): + def __init__(self, vocab: Vocab): + self.vocab = vocab + self._mecab = self._try_mecab_import() # type: ignore[func-returns-value] + self._mecab_tokenizer = None + + @property + def mecab_tokenizer(self): + # This is a property so that initializing a pipeline with blank:ko is + # possible without actually requiring mecab-ko, e.g. to run + # `spacy init vectors ko` for a pipeline that will have a different + # tokenizer in the end. The languages need to match for the vectors + # to be imported and there's no way to pass a custom config to + # `init vectors`. 
+ if self._mecab_tokenizer is None: + self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]") + return self._mecab_tokenizer + + def __reduce__(self): + return KoreanNattoTokenizer, (self.vocab,) + + def __call__(self, text: str) -> Doc: + dtokens = list(self.detailed_tokens(text)) + surfaces = [dt["surface"] for dt in dtokens] + doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces))) + for token, dtoken in zip(doc, dtokens): + first_tag, sep, eomi_tags = dtoken["tag"].partition("+") + token.tag_ = first_tag # stem(어간) or pre-final(선어말 어미) + if token.tag_ in TAG_MAP: + token.pos = TAG_MAP[token.tag_][POS] + else: + token.pos = X + token.lemma_ = dtoken["lemma"] + doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens] + return doc + + def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]: + # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3], + # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], * + for node in self.mecab_tokenizer.parse(text, as_nodes=True): + if node.is_eos(): + break + surface = node.surface + feature = node.feature + tag, _, expr = feature.partition(",") + lemma, _, remainder = expr.partition("/") + if lemma == "*" or lemma == "": + lemma = surface + yield {"surface": surface, "lemma": lemma, "tag": tag} + + def score(self, examples): + validate_examples(examples, "KoreanTokenizer.score") + return Scorer.score_tokenization(examples) + + def _try_mecab_import(self): + try: + from natto import MeCab + + return MeCab + except ImportError: + raise ImportError( + 'The Korean Natto tokenizer ("spacy.ko.KoreanNattoTokenizer") requires ' + "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), " + "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), " + "and [natto-py](https://github.com/buruzaemon/natto-py)" + ) from None + + def check_spaces(text, tokens): prev_end = -1 start = 0 diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/mul/__init__.py similarity index 67% rename from spacy/lang/xx/__init__.py rename to spacy/lang/mul/__init__.py index aff8403ff..5170f1e86 100644 --- a/spacy/lang/xx/__init__.py +++ b/spacy/lang/mul/__init__.py @@ -3,10 +3,10 @@ from ...language import Language class MultiLanguage(Language): """Language class to be used for models that support multiple languages. - This module allows models to specify their language ID as 'xx'. + This module allows models to specify their language ID as 'mul'. 
""" - lang = "xx" + lang = "mul" __all__ = ["MultiLanguage"] diff --git a/spacy/lang/xx/examples.py b/spacy/lang/mul/examples.py similarity index 100% rename from spacy/lang/xx/examples.py rename to spacy/lang/mul/examples.py diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 6963e8b79..28e5085a8 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -6,10 +6,7 @@ from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language, BaseDefaults from ...pipeline import Lemmatizer - - -# Punctuation stolen from Danish -from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES +from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES class SwedishDefaults(BaseDefaults): diff --git a/spacy/lang/sv/punctuation.py b/spacy/lang/sv/punctuation.py new file mode 100644 index 000000000..67f1bcdc4 --- /dev/null +++ b/spacy/lang/sv/punctuation.py @@ -0,0 +1,33 @@ +from ..char_classes import LIST_ELLIPSES, LIST_ICONS +from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER +from ..punctuation import TOKENIZER_SUFFIXES + + +_quotes = CONCAT_QUOTES.replace("'", "") + +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), + r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])[<>=](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}]):(?=[{a}])".format(a=ALPHA_UPPER), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes), + r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9]):(?=[{a}])".format(a=ALPHA_UPPER), + ] +) + +_suffixes = [ + suffix + for suffix in TOKENIZER_SUFFIXES + if suffix not in ["'s", "'S", "’s", "’S", r"\'"] +] +_suffixes += [r"(?<=[^sSxXzZ])\'"] + + +TOKENIZER_INFIXES = _infixes +TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index d76fe4262..a5e388ca8 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -17,10 +17,6 @@ URL_PATTERN = ( r"(?:\S+(?::\S*)?@)?" r"(?:" # IP address exclusion - # private & local networks - r"(?!(?:10|127)(?:\.\d{1,3}){3})" - r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})" - r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})" # IP address dotted notation octets # excludes loopback network 0.0.0.0 # excludes reserved space >= 224.0.0.0 diff --git a/spacy/language.py b/spacy/language.py index 170041bbe..bb8993902 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,4 +1,4 @@ -from typing import Iterator, Optional, Any, Dict, Callable, Iterable +from typing import Iterator, Optional, Any, Dict, Callable, Iterable, Literal from typing import Union, Tuple, List, Set, Pattern, Sequence from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload @@ -22,7 +22,7 @@ from . import ty from .tokens.underscore import Underscore from .vocab import Vocab, create_vocab from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis -from .training import Example, validate_examples +from .training import Example, validate_examples, validate_distillation_examples from .training.initialize import init_vocab, init_tok2vec from .scorer import Scorer from .util import registry, SimpleFrozenList, _pipe, raise_error, _DEFAULT_EMPTY_PIPES @@ -40,7 +40,6 @@ from .git_info import GIT_VERSION from . import util from . 
import about from .lookups import load_lookups -from .compat import Literal PipeCallable = Callable[[Doc], Doc] @@ -49,6 +48,9 @@ PipeCallable = Callable[[Doc], Doc] # This is the base config will all settings (training etc.) DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg" DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH) +# This is the base config for the [distillation] block and currently not included +# in the main config and only added via the 'init fill-config' command +DEFAULT_CONFIG_DISTILL_PATH = Path(__file__).parent / "default_config_distillation.cfg" # This is the base config for the [pretraining] block and currently not included # in the main config and only added via the 'init fill-config' command DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg" @@ -104,7 +106,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]: @registry.misc("spacy.LookupsDataLoader.v1") def load_lookups_data(lang, tables): - util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}") + util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables) lookups = load_lookups(lang=lang, tables=tables) return lookups @@ -172,8 +174,7 @@ class Language: if not isinstance(vocab, Vocab) and vocab is not True: raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab))) if vocab is True: - vectors_name = meta.get("vectors", {}).get("name") - vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name) + vocab = create_vocab(self.lang, self.Defaults) else: if (self.lang and vocab.lang) and (self.lang != vocab.lang): raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang)) @@ -227,7 +228,6 @@ class Language: "width": self.vocab.vectors_length, "vectors": len(self.vocab.vectors), "keys": self.vocab.vectors.n_keys, - "name": self.vocab.vectors.name, "mode": self.vocab.vectors.mode, } self._meta["labels"] = dict(self.pipe_labels) @@ -1018,6 +1018,102 @@ class Language: raise ValueError(Errors.E005.format(name=name, returned_type=type(doc))) return doc + def distill( + self, + teacher: "Language", + examples: Iterable[Example], + *, + drop: float = 0.0, + sgd: Optional[Optimizer] = None, + losses: Optional[Dict[str, float]] = None, + component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, + exclude: Iterable[str] = SimpleFrozenList(), + annotates: Iterable[str] = SimpleFrozenList(), + student_to_teacher: Optional[Dict[str, str]] = None, + ): + """Distill the models in a student pipeline from a teacher pipeline. + teacher (Language): Teacher to distill from. + examples (Iterable[Example]): Distillation examples. The reference + (teacher) and predicted (student) docs must have the same number of + tokens and the same orthography. + drop (float): The dropout rate. + sgd (Optional[Optimizer]): An optimizer. + losses (Optional(Dict[str, float])): Dictionary to update with the loss, + keyed by component. + component_cfg (Optional[Dict[str, Dict[str, Any]]]): Config parameters + for specific pipeline components, keyed by component name. + exclude (Iterable[str]): Names of components that shouldn't be updated. + annotates (Iterable[str]): Names of components that should set + annotations on the predicted examples after updating. + student_to_teacher (Optional[Dict[str, str]]): Map student pipe name to + teacher pipe name, only needed for pipes where the student pipe + name does not match the teacher pipe name. 
+ RETURNS (Dict[str, float]): The updated losses dictionary + + DOCS: https://spacy.io/api/language#distill + """ + if student_to_teacher is None: + student_to_teacher = {} + if losses is None: + losses = {} + if isinstance(examples, list) and len(examples) == 0: + return losses + + validate_distillation_examples(examples, "Language.distill") + examples = _copy_examples(examples, copy_x=True, copy_y=True) + + if sgd is None: + if self._optimizer is None: + self._optimizer = self.create_optimizer() + sgd = self._optimizer + + if component_cfg is None: + component_cfg = {} + pipe_kwargs = {} + for student_name, student_proc in self.pipeline: + component_cfg.setdefault(student_name, {}) + pipe_kwargs[student_name] = deepcopy(component_cfg[student_name]) + component_cfg[student_name].setdefault("drop", drop) + pipe_kwargs[student_name].setdefault("batch_size", self.batch_size) + + teacher_pipes = dict(teacher.pipeline) + for student_name, student_proc in self.pipeline: + if student_name in annotates: + for doc, eg in zip( + _pipe( + (eg.predicted for eg in examples), + proc=student_proc, + name=student_name, + default_error_handler=self.default_error_handler, + kwargs=pipe_kwargs[student_name], + ), + examples, + ): + eg.predicted = doc + + if ( + student_name not in exclude + and isinstance(student_proc, ty.DistillableComponent) + and student_proc.is_distillable + ): + # A missing teacher pipe is not an error, some student pipes + # do not need a teacher, such as tok2vec layer losses. + teacher_name = ( + student_to_teacher[student_name] + if student_name in student_to_teacher + else student_name + ) + teacher_pipe = teacher_pipes.get(teacher_name, None) + student_proc.distill( + teacher_pipe, + examples, + sgd=sgd, + losses=losses, + **component_cfg[student_name], + ) + + return losses + def disable_pipes(self, *names) -> "DisabledPipes": """Disable one or more pipeline components. If used as a context manager, the pipeline will be restored to the initial state at the end @@ -1150,17 +1246,12 @@ class Language: component_cfg[name].setdefault("drop", drop) pipe_kwargs[name].setdefault("batch_size", self.batch_size) for name, proc in self.pipeline: - # ignore statements are used here because mypy ignores hasattr - if name not in exclude and hasattr(proc, "update"): - proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) # type: ignore - if sgd not in (None, False): - if ( - name not in exclude - and isinstance(proc, ty.TrainableComponent) - and proc.is_trainable - and proc.model not in (True, False, None) - ): - proc.finish_update(sgd) + if ( + name not in exclude + and isinstance(proc, ty.TrainableComponent) + and proc.is_trainable + ): + proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) if name in annotates: for doc, eg in zip( _pipe( @@ -1173,6 +1264,17 @@ class Language: examples, ): eg.predicted = doc + # Only finish the update after all component updates are done. Some + # components may share weights (such as tok2vec) and we only want + # to apply weight updates after all gradients are accumulated. 
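To make the new distillation entry point concrete, here is a minimal usage sketch; the teacher package and student pipeline names are hypothetical, and the student is assumed to already contain initialized, distillable components whose names match the teacher's.

import spacy
from spacy.training import Example

teacher = spacy.load("en_core_web_lg")        # any trained teacher pipeline
student = spacy.load("my_student_pipeline")   # hypothetical student pipeline

# Distillation only needs raw text: the reference side is annotated by the teacher,
# the predicted side is the student's own tokenization of the same text.
texts = ["This is a sentence.", "Another short example."]
examples = [Example(student.make_doc(t), teacher(t)) for t in texts]

losses = student.distill(teacher, examples, drop=0.1)
print(losses)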
+ for name, proc in self.pipeline: + if ( + name not in exclude + and isinstance(proc, ty.TrainableComponent) + and proc.is_trainable + ): + proc.finish_update(sgd) + return losses def rehearse( @@ -1230,25 +1332,20 @@ class Language: return losses - def begin_training( - self, - get_examples: Optional[Callable[[], Iterable[Example]]] = None, - *, - sgd: Optional[Optimizer] = None, - ) -> Optimizer: - warnings.warn(Warnings.W089, DeprecationWarning) - return self.initialize(get_examples, sgd=sgd) - def initialize( self, get_examples: Optional[Callable[[], Iterable[Example]]] = None, *, + labels: Optional[Dict[str, Any]] = None, sgd: Optional[Optimizer] = None, ) -> Optimizer: """Initialize the pipe for training, using data examples if available. get_examples (Callable[[], Iterable[Example]]): Optional function that returns gold-standard Example objects. + labels (Optional[Dict[str, Any]]): Labels to pass to pipe initialization, + using the names of the pipes as keys. Overrides labels that are in + the model configuration. sgd (Optional[Optimizer]): An optimizer to use for updates. If not provided, will be created using the .create_optimizer() method. RETURNS (thinc.api.Optimizer): The optimizer. @@ -1293,6 +1390,8 @@ class Language: for name, proc in self.pipeline: if isinstance(proc, ty.InitializableComponent): p_settings = I["components"].get(name, {}) + if labels is not None and name in labels: + p_settings["labels"] = labels[name] p_settings = validate_init_settings( proc.initialize, p_settings, section="components", name=name ) @@ -1726,6 +1825,7 @@ class Language: # using the nlp.config with all defaults. config = util.copy_config(config) orig_pipeline = config.pop("components", {}) + orig_distill = config.pop("distill", None) orig_pretraining = config.pop("pretraining", None) config["components"] = {} if auto_fill: @@ -1734,6 +1834,9 @@ class Language: filled = config filled["components"] = orig_pipeline config["components"] = orig_pipeline + if orig_distill is not None: + filled["distill"] = orig_distill + config["distill"] = orig_distill if orig_pretraining is not None: filled["pretraining"] = orig_pretraining config["pretraining"] = orig_pretraining @@ -1960,7 +2063,7 @@ class Language: pipe = self.get_pipe(pipe_name) pipe_cfg = self._pipe_configs[pipe_name] if listeners: - util.logger.debug(f"Replacing listeners of component '{pipe_name}'") + util.logger.debug("Replacing listeners of component '%s'", pipe_name) if len(list(listeners)) != len(pipe_listeners): # The number of listeners defined in the component model doesn't # match the listeners to replace, so we won't be able to update @@ -2083,9 +2186,6 @@ class Language: if path.exists(): data = srsly.read_json(path) self.meta.update(data) - # self.meta always overrides meta["vectors"] with the metadata - # from self.vocab.vectors, so set the name directly - self.vocab.vectors.name = data.get("vectors", {}).get("name") def deserialize_vocab(path: Path) -> None: if path.exists(): @@ -2154,9 +2254,6 @@ class Language: def deserialize_meta(b): data = srsly.json_loads(b) self.meta.update(data) - # self.meta always overrides meta["vectors"] with the metadata - # from self.vocab.vectors, so set the name directly - self.vocab.vectors.name = data.get("vectors", {}).get("name") deserializers: Dict[str, Callable[[bytes], Any]] = {} deserializers["config.cfg"] = lambda b: self.config.from_bytes( @@ -2223,13 +2320,18 @@ class DisabledPipes(list): self[:] = [] -def _copy_examples(examples: Iterable[Example]) -> List[Example]: +def 
_copy_examples( + examples: Iterable[Example], *, copy_x: bool = True, copy_y: bool = False +) -> List[Example]: """Make a copy of a batch of examples, copying the predicted Doc as well. This is used in contexts where we need to take ownership of the examples so that they can be mutated, for instance during Language.evaluate and Language.update. """ - return [Example(eg.x.copy(), eg.y) for eg in examples] + return [ + Example(eg.x.copy() if copy_x else eg.x, eg.y.copy() if copy_y else eg.y) + for eg in examples + ] def _apply_pipes( diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 8dea0d6a2..2d14edcd6 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -5,7 +5,6 @@ from .attrs cimport attr_id_t from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG from .structs cimport LexemeC -from .strings cimport StringStore from .vocab cimport Vocab diff --git a/spacy/lexeme.pyi b/spacy/lexeme.pyi index 4fcaa82cf..4942b18aa 100644 --- a/spacy/lexeme.pyi +++ b/spacy/lexeme.pyi @@ -20,7 +20,6 @@ class Lexeme: def vector_norm(self) -> float: ... vector: Floats1d rank: int - sentiment: float @property def orth_(self) -> str: ... @property diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 6c66effde..e57098f17 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -41,7 +41,7 @@ cdef class Lexeme: """ self.vocab = vocab self.orth = orth - self.c = vocab.get_by_orth(vocab.mem, orth) + self.c = vocab.get_by_orth(orth) if self.c.orth != orth: raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth)) @@ -173,19 +173,6 @@ cdef class Lexeme: def __set__(self, value): self.c.id = value - property sentiment: - """RETURNS (float): A scalar value indicating the positivity or - negativity of the lexeme.""" - def __get__(self): - sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {}) - return sentiment_table.get(self.c.orth, 0.0) - - def __set__(self, float x): - if "lexeme_sentiment" not in self.vocab.lookups: - self.vocab.lookups.add_table("lexeme_sentiment") - sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment") - sentiment_table[self.c.orth] = x - @property def orth_(self): """RETURNS (str): The original verbatim text of the lexeme diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 74c2d002f..e2a1b8a3b 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -82,8 +82,12 @@ cdef class DependencyMatcher: "$-": self._imm_left_sib, "$++": self._right_sib, "$--": self._left_sib, + ">+": self._imm_right_child, + ">-": self._imm_left_child, ">++": self._right_child, ">--": self._left_child, + "<+": self._imm_right_parent, + "<-": self._imm_left_parent, "<++": self._right_parent, "<--": self._left_parent, } @@ -165,9 +169,9 @@ cdef class DependencyMatcher: on_match (callable): Optional callback executed on match. 
""" if on_match is not None and not hasattr(on_match, "__call__"): - raise ValueError(Errors.E171.format(arg_type=type(on_match))) - if patterns is None or not isinstance(patterns, List): # old API - raise ValueError(Errors.E948.format(arg_type=type(patterns))) + raise ValueError(Errors.E171.format(name="DependencyMatcher", arg_type=type(on_match))) + if patterns is None or not isinstance(patterns, List): + raise ValueError(Errors.E948.format(name="DependencyMatcher", arg_type=type(patterns))) for pattern in patterns: if len(pattern) == 0: raise ValueError(Errors.E012.format(key=key)) @@ -427,12 +431,34 @@ cdef class DependencyMatcher: def _left_sib(self, doc, node): return [doc[child.i] for child in doc[node].head.children if child.i < node] + def _imm_right_child(self, doc, node): + for child in doc[node].children: + if child.i == node + 1: + return [doc[child.i]] + return [] + + def _imm_left_child(self, doc, node): + for child in doc[node].children: + if child.i == node - 1: + return [doc[child.i]] + return [] + def _right_child(self, doc, node): return [doc[child.i] for child in doc[node].children if child.i > node] def _left_child(self, doc, node): return [doc[child.i] for child in doc[node].children if child.i < node] + def _imm_right_parent(self, doc, node): + if doc[node].head.i == node + 1: + return [doc[node].head] + return [] + + def _imm_left_parent(self, doc, node): + if doc[node].head.i == node - 1: + return [doc[node].head] + return [] + def _right_parent(self, doc, node): if doc[node].head.i > node: return [doc[node].head] diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi index 48922865b..9797463aa 100644 --- a/spacy/matcher/matcher.pyi +++ b/spacy/matcher/matcher.pyi @@ -1,6 +1,5 @@ -from typing import Any, List, Dict, Tuple, Optional, Callable, Union +from typing import Any, List, Dict, Tuple, Optional, Callable, Union, Literal from typing import Iterator, Iterable, overload -from ..compat import Literal from ..vocab import Vocab from ..tokens import Doc, Span diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index ea1b4b66b..17bdfd394 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -23,7 +23,7 @@ from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH from .levenshtein import levenshtein_compare from ..schemas import validate_token_pattern from ..errors import Errors, MatchPatternError, Warnings -from ..strings import get_string_id +from ..strings cimport get_string_id from ..attrs import IDS from ..util import registry @@ -115,9 +115,9 @@ cdef class Matcher: """ errors = {} if on_match is not None and not hasattr(on_match, "__call__"): - raise ValueError(Errors.E171.format(arg_type=type(on_match))) - if patterns is None or not isinstance(patterns, List): # old API - raise ValueError(Errors.E948.format(arg_type=type(patterns))) + raise ValueError(Errors.E171.format(name="Matcher", arg_type=type(on_match))) + if patterns is None or not isinstance(patterns, List): + raise ValueError(Errors.E948.format(name="Matcher", arg_type=type(patterns))) if greedy is not None and greedy not in ["FIRST", "LONGEST"]: raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=greedy)) for i, pattern in enumerate(patterns): @@ -265,6 +265,10 @@ cdef class Matcher: # non-overlapping ones this `match` can be either (start, end) or # (start, end, alignments) depending on `with_alignments=` option. 
for key, *match in matches: + # Adjust span matches to doc offsets + if isinstance(doclike, Span): + match[0] += doclike.start + match[1] += doclike.start span_filter = self._filter.get(key) if span_filter is not None: pairs = pairs_by_id.get(key, []) @@ -295,9 +299,6 @@ cdef class Matcher: if as_spans: final_results = [] for key, start, end, *_ in final_matches: - if isinstance(doclike, Span): - start += doclike.start - end += doclike.start final_results.append(Span(doc, start, end, label=key)) elif with_alignments: # convert alignments List[Dict[str, int]] --> List[int] @@ -828,6 +829,11 @@ def _get_attr_values(spec, string_store): return attr_values +def _predicate_cache_key(attr, predicate, value, *, regex=False, fuzzy=None): + # tuple order affects performance + return (attr, regex, fuzzy, predicate, srsly.json_dumps(value, sort_keys=True)) + + # These predicate helper classes are used to match the REGEX, IN, >= etc # extensions to the matcher introduced in #3173. @@ -847,7 +853,7 @@ class _FuzzyPredicate: fuzz = self.predicate[len("FUZZY"):] # number after prefix self.fuzzy = int(fuzz) if fuzz else -1 self.fuzzy_compare = fuzzy_compare - self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True)) + self.key = _predicate_cache_key(self.attr, self.predicate, value, fuzzy=self.fuzzy) def __call__(self, Token token): if self.is_extension: @@ -869,7 +875,7 @@ class _RegexPredicate: self.value = re.compile(value) self.predicate = predicate self.is_extension = is_extension - self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) + self.key = _predicate_cache_key(self.attr, self.predicate, value) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) @@ -905,7 +911,7 @@ class _SetPredicate: self.value = set(get_string_id(v) for v in value) self.predicate = predicate self.is_extension = is_extension - self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True)) + self.key = _predicate_cache_key(self.attr, self.predicate, value, regex=self.regex, fuzzy=self.fuzzy) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) @@ -977,7 +983,7 @@ class _ComparisonPredicate: self.value = value self.predicate = predicate self.is_extension = is_extension - self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) + self.key = _predicate_cache_key(self.attr, self.predicate, value) if self.predicate not in self.operators: raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate)) @@ -1092,7 +1098,7 @@ def _get_extension_extra_predicates(spec, extra_predicates, predicate_types, if isinstance(value, dict): for type_, cls in predicate_types.items(): if type_ in value: - key = (attr, type_, srsly.json_dumps(value[type_], sort_keys=True)) + key = _predicate_cache_key(attr, type_, value[type_]) if key in seen_predicates: output.append(seen_predicates[key]) else: diff --git a/spacy/matcher/phrasematcher.pyi b/spacy/matcher/phrasematcher.pyi index 68e3386e4..af3a2d23b 100644 --- a/spacy/matcher/phrasematcher.pyi +++ b/spacy/matcher/phrasematcher.pyi @@ -1,5 +1,5 @@ -from typing import List, Tuple, Union, Optional, Callable, Any, Dict, overload -from ..compat import Literal +from typing import List, Tuple, Union, Optional, Callable, Any, Dict, Literal +from typing import overload from .matcher import Matcher from ..vocab import Vocab from ..tokens 
import Doc, Span @@ -20,6 +20,15 @@ class PhraseMatcher: Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any] ] = ..., ) -> None: ... + def _add_from_arrays( + self, + key: str, + specs: List[List[int]], + *, + on_match: Optional[ + Callable[[Matcher, Doc, int, List[Tuple[Any, ...]]], Any] + ] = ..., + ) -> None: ... def remove(self, key: str) -> None: ... @overload def __call__( diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 382029872..ebe1213c7 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -1,4 +1,6 @@ # cython: infer_types=True, profile=True +from typing import List +from collections import defaultdict from libc.stdint cimport uintptr_t from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter @@ -39,7 +41,7 @@ cdef class PhraseMatcher: """ self.vocab = vocab self._callbacks = {} - self._docs = {} + self._docs = defaultdict(set) self._validate = validate self.mem = Pool() @@ -155,66 +157,24 @@ cdef class PhraseMatcher: del self._callbacks[key] del self._docs[key] - def add(self, key, docs, *_docs, on_match=None): - """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID - key, an on_match callback, and one or more patterns. - Since spaCy v2.2.2, PhraseMatcher.add takes a list of patterns as the - second argument, with the on_match callback as an optional keyword - argument. + def _add_from_arrays(self, key, specs, *, on_match=None): + """Add a preprocessed list of specs, with an optional callback. key (str): The match ID. - docs (list): List of `Doc` objects representing match patterns. + specs (List[List[int]]): A list of lists of hashes to match. on_match (callable): Callback executed on match. - *_docs (Doc): For backwards compatibility: list of patterns to add - as variable arguments. Will be ignored if a list of patterns is - provided as the second argument. 
- - DOCS: https://spacy.io/api/phrasematcher#add """ - if docs is None or hasattr(docs, "__call__"): # old API - on_match = docs - docs = _docs - - _ = self.vocab[key] - self._callbacks[key] = on_match - self._docs.setdefault(key, set()) - cdef MapStruct* current_node cdef MapStruct* internal_node cdef void* result - if isinstance(docs, Doc): - raise ValueError(Errors.E179.format(key=key)) - for doc in docs: - if len(doc) == 0: - continue - if isinstance(doc, Doc): - attrs = (TAG, POS, MORPH, LEMMA, DEP) - has_annotation = {attr: doc.has_annotation(attr) for attr in attrs} - for attr in attrs: - if self.attr == attr and not has_annotation[attr]: - if attr == TAG: - pipe = "tagger" - elif attr in (POS, MORPH): - pipe = "morphologizer or tagger+attribute_ruler" - elif attr == LEMMA: - pipe = "lemmatizer" - elif attr == DEP: - pipe = "parser" - error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr)) - raise ValueError(error_msg) - if self._validate and any(has_annotation.values()) \ - and self.attr not in attrs: - string_attr = self.vocab.strings[self.attr] - warnings.warn(Warnings.W012.format(key=key, attr=string_attr)) - keyword = self._convert_to_array(doc) - else: - keyword = doc - self._docs[key].add(tuple(keyword)) + self._callbacks[key] = on_match + for spec in specs: + self._docs[key].add(tuple(spec)) current_node = self.c_map - for token in keyword: + for token in spec: if token == self._terminal_hash: warnings.warn(Warnings.W021) break @@ -233,6 +193,57 @@ cdef class PhraseMatcher: result = internal_node map_set(self.mem, result, self.vocab.strings[key], NULL) + + def add(self, key, docs, *, on_match=None): + """Add a match-rule to the phrase-matcher. A match-rule consists of: an ID + key, a list of one or more patterns, and (optionally) an on_match callback. + + key (str): The match ID. + docs (list): List of `Doc` objects representing match patterns. + on_match (callable): Callback executed on match. + + If any of the input Docs are invalid, no internal state will be updated. 
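A small sketch of the stricter add() signature documented above, with on_match now keyword-only and the patterns passed as a list of Doc objects; nlp is assumed to be any loaded pipeline and the phrases are illustrative.

from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)
patterns = [nlp.make_doc(text) for text in ["Barack Obama", "Angela Merkel"]]
# the callback must now be passed as a keyword; the old positional form is gone
matcher.add("PERSON", patterns, on_match=None)
matches = matcher(nlp("Barack Obama visited Berlin."))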
+ + DOCS: https://spacy.io/api/phrasematcher#add + """ + if isinstance(docs, Doc): + raise ValueError(Errors.E179.format(key=key)) + if docs is None or not isinstance(docs, List): + raise ValueError(Errors.E948.format(name="PhraseMatcher", arg_type=type(docs))) + if on_match is not None and not hasattr(on_match, "__call__"): + raise ValueError(Errors.E171.format(name="PhraseMatcher", arg_type=type(on_match))) + + _ = self.vocab[key] + specs = [] + + for doc in docs: + if len(doc) == 0: + continue + if not isinstance(doc, Doc): + raise ValueError(Errors.E4000.format(type=type(doc))) + + attrs = (TAG, POS, MORPH, LEMMA, DEP) + has_annotation = {attr: doc.has_annotation(attr) for attr in attrs} + for attr in attrs: + if self.attr == attr and not has_annotation[attr]: + if attr == TAG: + pipe = "tagger" + elif attr in (POS, MORPH): + pipe = "morphologizer or tagger+attribute_ruler" + elif attr == LEMMA: + pipe = "lemmatizer" + elif attr == DEP: + pipe = "parser" + error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr)) + raise ValueError(error_msg) + if self._validate and any(has_annotation.values()) \ + and self.attr not in attrs: + string_attr = self.vocab.strings[self.attr] + warnings.warn(Warnings.W012.format(key=key, attr=string_attr)) + specs.append(self._convert_to_array(doc)) + + self._add_from_arrays(key, specs, on_match=on_match) + def __call__(self, object doclike, *, as_spans=False): """Find all sequences matching the supplied patterns on the `Doc`. @@ -345,7 +356,7 @@ def unpickle_matcher(vocab, docs, callbacks, attr): matcher = PhraseMatcher(vocab, attr=attr) for key, specs in docs.items(): callback = callbacks.get(key, None) - matcher.add(key, specs, on_match=callback) + matcher._add_from_arrays(key, specs, on_match=callback) return matcher diff --git a/spacy/ml/_precomputable_affine.py b/spacy/ml/_precomputable_affine.py deleted file mode 100644 index 1c20c622b..000000000 --- a/spacy/ml/_precomputable_affine.py +++ /dev/null @@ -1,164 +0,0 @@ -from thinc.api import Model, normal_init - -from ..util import registry - - -@registry.layers("spacy.PrecomputableAffine.v1") -def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1): - model = Model( - "precomputable_affine", - forward, - init=init, - dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP}, - params={"W": None, "b": None, "pad": None}, - attrs={"dropout_rate": dropout}, - ) - return model - - -def forward(model, X, is_train): - nF = model.get_dim("nF") - nO = model.get_dim("nO") - nP = model.get_dim("nP") - nI = model.get_dim("nI") - W = model.get_param("W") - # Preallocate array for layer output, including padding. - Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False) - model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:]) - Yf = Yf.reshape((Yf.shape[0], nF, nO, nP)) - - # Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot - # change its shape to (nF, nO, nP) without breaking existing models. So - # we'll squeeze the first dimension here. - Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0) - - def backward(dY_ids): - # This backprop is particularly tricky, because we get back a different - # thing from what we put out. 
We put out an array of shape: - # (nB, nF, nO, nP), and get back: - # (nB, nO, nP) and ids (nB, nF) - # The ids tell us the values of nF, so we would have: - # - # dYf = zeros((nB, nF, nO, nP)) - # for b in range(nB): - # for f in range(nF): - # dYf[b, ids[b, f]] += dY[b] - # - # However, we avoid building that array for efficiency -- and just pass - # in the indices. - dY, ids = dY_ids - assert dY.ndim == 3 - assert dY.shape[1] == nO, dY.shape - assert dY.shape[2] == nP, dY.shape - # nB = dY.shape[0] - model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids)) - Xf = X[ids] - Xf = Xf.reshape((Xf.shape[0], nF * nI)) - - model.inc_grad("b", dY.sum(axis=0)) - dY = dY.reshape((dY.shape[0], nO * nP)) - - Wopfi = W.transpose((1, 2, 0, 3)) - Wopfi = Wopfi.reshape((nO * nP, nF * nI)) - dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi) - - dWopfi = model.ops.gemm(dY, Xf, trans1=True) - dWopfi = dWopfi.reshape((nO, nP, nF, nI)) - # (o, p, f, i) --> (f, o, p, i) - dWopfi = dWopfi.transpose((2, 0, 1, 3)) - model.inc_grad("W", dWopfi) - return dXf.reshape((dXf.shape[0], nF, nI)) - - return Yf, backward - - -def _backprop_precomputable_affine_padding(model, dY, ids): - nB = dY.shape[0] - nF = model.get_dim("nF") - nP = model.get_dim("nP") - nO = model.get_dim("nO") - # Backprop the "padding", used as a filler for missing values. - # Values that are missing are set to -1, and each state vector could - # have multiple missing values. The padding has different values for - # different missing features. The gradient of the padding vector is: - # - # for b in range(nB): - # for f in range(nF): - # if ids[b, f] < 0: - # d_pad[f] += dY[b] - # - # Which can be rewritten as: - # - # (ids < 0).T @ dY - mask = model.ops.asarray(ids < 0, dtype="f") - d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True) - return d_pad.reshape((1, nF, nO, nP)) - - -def init(model, X=None, Y=None): - """This is like the 'layer sequential unit variance', but instead - of taking the actual inputs, we randomly generate whitened data. - - Why's this all so complicated? We have a huge number of inputs, - and the maxout unit makes guessing the dynamics tricky. Instead - we set the maxout weights to values that empirically result in - whitened outputs given whitened inputs. - """ - if model.has_param("W") and model.get_param("W").any(): - return - - nF = model.get_dim("nF") - nO = model.get_dim("nO") - nP = model.get_dim("nP") - nI = model.get_dim("nI") - W = model.ops.alloc4f(nF, nO, nP, nI) - b = model.ops.alloc2f(nO, nP) - pad = model.ops.alloc4f(1, nF, nO, nP) - - ops = model.ops - W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI))) - pad = normal_init(ops, pad.shape, mean=1.0) - model.set_param("W", W) - model.set_param("b", b) - model.set_param("pad", pad) - - ids = ops.alloc((5000, nF), dtype="f") - ids += ops.xp.random.uniform(0, 1000, ids.shape) - ids = ops.asarray(ids, dtype="i") - tokvecs = ops.alloc((5000, nI), dtype="f") - tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape( - tokvecs.shape - ) - - def predict(ids, tokvecs): - # nS ids. nW tokvecs. Exclude the padding array. 
- hiddens = model.predict(tokvecs[:-1]) # (nW, f, o, p) - vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f") - # need nS vectors - hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP)) - model.ops.scatter_add(vectors, ids.flatten(), hiddens) - vectors = vectors.reshape((vectors.shape[0], nO, nP)) - vectors += b - vectors = model.ops.asarray(vectors) - if nP >= 2: - return model.ops.maxout(vectors)[0] - else: - return vectors * (vectors >= 0) - - tol_var = 0.01 - tol_mean = 0.01 - t_max = 10 - W = model.get_param("W").copy() - b = model.get_param("b").copy() - for t_i in range(t_max): - acts1 = predict(ids, tokvecs) - var = model.ops.xp.var(acts1) - mean = model.ops.xp.mean(acts1) - if abs(var - 1.0) >= tol_var: - W /= model.ops.xp.sqrt(var) - model.set_param("W", W) - elif abs(mean) >= tol_mean: - b -= mean - model.set_param("b", b) - else: - break diff --git a/spacy/ml/callbacks.py b/spacy/ml/callbacks.py index 3b60ec2ab..393f208a6 100644 --- a/spacy/ml/callbacks.py +++ b/spacy/ml/callbacks.py @@ -23,6 +23,7 @@ DEFAULT_NVTX_ANNOTATABLE_PIPE_METHODS = [ "update", "rehearse", "get_loss", + "get_teacher_student_loss", "initialize", "begin_update", "finish_update", diff --git a/spacy/ml/_character_embed.py b/spacy/ml/character_embed.py similarity index 100% rename from spacy/ml/_character_embed.py rename to spacy/ml/character_embed.py diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 299b6bb52..7332ca199 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -89,6 +89,14 @@ def load_kb( return kb_from_file +@registry.misc("spacy.EmptyKB.v2") +def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]: + def empty_kb_factory(vocab: Vocab, entity_vector_length: int): + return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length) + + return empty_kb_factory + + @registry.misc("spacy.EmptyKB.v1") def empty_kb( entity_vector_length: int, diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index a70d84dea..01312983d 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -1,17 +1,19 @@ -from typing import Optional, List, cast -from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops +from typing import Optional, List, Tuple, Any, Literal from thinc.types import Floats2d +from thinc.api import Model +import warnings -from ...errors import Errors -from ...compat import Literal +from ...errors import Errors, Warnings from ...util import registry -from .._precomputable_affine import PrecomputableAffine from ..tb_framework import TransitionModel -from ...tokens import Doc +from ...tokens.doc import Doc + +TransitionSystem = Any # TODO +State = Any # TODO -@registry.architectures("spacy.TransitionBasedParser.v2") -def build_tb_parser_model( +@registry.architectures.register("spacy.TransitionBasedParser.v2") +def transition_parser_v2( tok2vec: Model[List[Doc], List[Floats2d]], state_type: Literal["parser", "ner"], extra_state_tokens: bool, @@ -19,6 +21,46 @@ def build_tb_parser_model( maxout_pieces: int, use_upper: bool, nO: Optional[int] = None, +) -> Model: + if not use_upper: + warnings.warn(Warnings.W400) + + return build_tb_parser_model( + tok2vec, + state_type, + extra_state_tokens, + hidden_width, + maxout_pieces, + nO=nO, + ) + + +@registry.architectures.register("spacy.TransitionBasedParser.v3") +def transition_parser_v3( + tok2vec: Model[List[Doc], List[Floats2d]], + state_type: Literal["parser", "ner"], + extra_state_tokens: bool, + 
hidden_width: int, + maxout_pieces: int, + nO: Optional[int] = None, +) -> Model: + return build_tb_parser_model( + tok2vec, + state_type, + extra_state_tokens, + hidden_width, + maxout_pieces, + nO=nO, + ) + + +def build_tb_parser_model( + tok2vec: Model[List[Doc], List[Floats2d]], + state_type: Literal["parser", "ner"], + extra_state_tokens: bool, + hidden_width: int, + maxout_pieces: int, + nO: Optional[int] = None, ) -> Model: """ Build a transition-based parser model. Can apply to NER or dependency-parsing. @@ -51,14 +93,7 @@ def build_tb_parser_model( feature sets (for the NER) or 13 (for the parser). hidden_width (int): The width of the hidden layer. maxout_pieces (int): How many pieces to use in the state prediction layer. - Recommended values are 1, 2 or 3. If 1, the maxout non-linearity - is replaced with a ReLu non-linearity if use_upper=True, and no - non-linearity if use_upper=False. - use_upper (bool): Whether to use an additional hidden layer after the state - vector in order to predict the action scores. It is recommended to set - this to False for large pretrained models such as transformers, and True - for smaller networks. The upper layer is computed on CPU, which becomes - a bottleneck on larger GPU-based models, where it's also less necessary. + Recommended values are 1, 2 or 3. nO (int or None): The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. @@ -69,106 +104,11 @@ def build_tb_parser_model( nr_feature_tokens = 6 if extra_state_tokens else 3 else: raise ValueError(Errors.E917.format(value=state_type)) - t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None - tok2vec = chain( - tok2vec, - list2array(), - Linear(hidden_width, t2v_width), + return TransitionModel( + tok2vec=tok2vec, + state_tokens=nr_feature_tokens, + hidden_width=hidden_width, + maxout_pieces=maxout_pieces, + nO=nO, + unseen_classes=set(), ) - tok2vec.set_dim("nO", hidden_width) - lower = _define_lower( - nO=hidden_width if use_upper else nO, - nF=nr_feature_tokens, - nI=tok2vec.get_dim("nO"), - nP=maxout_pieces, - ) - upper = None - if use_upper: - with use_ops("cpu"): - # Initialize weights at zero, as it's a classification layer. - upper = _define_upper(nO=nO, nI=None) - return TransitionModel(tok2vec, lower, upper, resize_output) - - -def _define_upper(nO, nI): - return Linear(nO=nO, nI=nI, init_W=zero_init) - - -def _define_lower(nO, nF, nI, nP): - return PrecomputableAffine(nO=nO, nF=nF, nI=nI, nP=nP) - - -def resize_output(model, new_nO): - if model.attrs["has_upper"]: - return _resize_upper(model, new_nO) - return _resize_lower(model, new_nO) - - -def _resize_upper(model, new_nO): - upper = model.get_ref("upper") - if upper.has_dim("nO") is None: - upper.set_dim("nO", new_nO) - return model - elif new_nO == upper.get_dim("nO"): - return model - - smaller = upper - nI = smaller.maybe_get_dim("nI") - with use_ops("cpu"): - larger = _define_upper(nO=new_nO, nI=nI) - # it could be that the model is not initialized yet, then skip this bit - if smaller.has_param("W"): - larger_W = larger.ops.alloc2f(new_nO, nI) - larger_b = larger.ops.alloc1f(new_nO) - smaller_W = smaller.get_param("W") - smaller_b = smaller.get_param("b") - # Weights are stored in (nr_out, nr_in) format, so we're basically - # just adding rows here. 
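For context, a hedged sketch of selecting the spacy.TransitionBasedParser.v3 architecture registered earlier in this file by resolving a config block directly; the tok2vec settings simply reuse the stock HashEmbedCNN values and are illustrative, not recommendations from this patch.

from thinc.api import Config
from spacy.util import registry

cfg_str = """
[model]
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "parser"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
"""
# resolve the block into a constructed (uninitialized) parser model
model = registry.resolve(Config().from_str(cfg_str))["model"]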
- if smaller.has_dim("nO"): - old_nO = smaller.get_dim("nO") - larger_W[:old_nO] = smaller_W - larger_b[:old_nO] = smaller_b - for i in range(old_nO, new_nO): - model.attrs["unseen_classes"].add(i) - - larger.set_param("W", larger_W) - larger.set_param("b", larger_b) - model._layers[-1] = larger - model.set_ref("upper", larger) - return model - - -def _resize_lower(model, new_nO): - lower = model.get_ref("lower") - if lower.has_dim("nO") is None: - lower.set_dim("nO", new_nO) - return model - - smaller = lower - nI = smaller.maybe_get_dim("nI") - nF = smaller.maybe_get_dim("nF") - nP = smaller.maybe_get_dim("nP") - larger = _define_lower(nO=new_nO, nI=nI, nF=nF, nP=nP) - # it could be that the model is not initialized yet, then skip this bit - if smaller.has_param("W"): - larger_W = larger.ops.alloc4f(nF, new_nO, nP, nI) - larger_b = larger.ops.alloc2f(new_nO, nP) - larger_pad = larger.ops.alloc4f(1, nF, new_nO, nP) - smaller_W = smaller.get_param("W") - smaller_b = smaller.get_param("b") - smaller_pad = smaller.get_param("pad") - # Copy the old weights and padding into the new layer - if smaller.has_dim("nO"): - old_nO = smaller.get_dim("nO") - larger_W[:, 0:old_nO, :, :] = smaller_W - larger_pad[:, :, 0:old_nO, :] = smaller_pad - larger_b[0:old_nO, :] = smaller_b - for i in range(old_nO, new_nO): - model.attrs["unseen_classes"].add(i) - - larger.set_param("W", larger_W) - larger.set_param("b", larger_b) - larger.set_param("pad", larger_pad) - model._layers[1] = larger - model.set_ref("lower", larger) - return model diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 30c7360ff..79772ad80 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -7,7 +7,7 @@ from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM from ...tokens import Doc from ...util import registry from ...errors import Errors -from ...ml import _character_embed +from ...ml import character_embed from ..staticvectors import StaticVectors from ..featureextractor import FeatureExtractor from ...pipeline.tok2vec import Tok2VecListener @@ -226,7 +226,7 @@ def CharacterEmbed( if feature is None: raise ValueError(Errors.E911.format(feat=feature)) char_embed = chain( - _character_embed.CharacterEmbed(nM=nM, nC=nC), + character_embed.CharacterEmbed(nM=nM, nC=nC), cast(Model[List[Floats2d], Ragged], list2ragged()), ) feature_extractor: Model[List[Doc], Ragged] = chain( diff --git a/spacy/ml/parser_model.pxd b/spacy/ml/parser_model.pxd deleted file mode 100644 index 8def6cea5..000000000 --- a/spacy/ml/parser_model.pxd +++ /dev/null @@ -1,49 +0,0 @@ -from libc.string cimport memset, memcpy -from thinc.backends.cblas cimport CBlas -from ..typedefs cimport weight_t, hash_t -from ..pipeline._parser_internals._state cimport StateC - - -cdef struct SizesC: - int states - int classes - int hiddens - int pieces - int feats - int embed_width - - -cdef struct WeightsC: - const float* feat_weights - const float* feat_bias - const float* hidden_bias - const float* hidden_weights - const float* seen_classes - - -cdef struct ActivationsC: - int* token_ids - float* unmaxed - float* scores - float* hiddens - int* is_valid - int _curr_size - int _max_size - - -cdef WeightsC get_c_weights(model) except * - -cdef SizesC get_c_sizes(model, int batch_size) except * - -cdef ActivationsC alloc_activations(SizesC n) nogil - -cdef void free_activations(const ActivationsC* A) nogil - -cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states, - const WeightsC* W, SizesC n) nogil - -cdef 
int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil - -cdef void cpu_log_loss(float* d_scores, - const float* costs, const int* is_valid, const float* scores, int O) nogil - diff --git a/spacy/ml/parser_model.pyx b/spacy/ml/parser_model.pyx deleted file mode 100644 index 961bf4d70..000000000 --- a/spacy/ml/parser_model.pyx +++ /dev/null @@ -1,492 +0,0 @@ -# cython: infer_types=True, cdivision=True, boundscheck=False -cimport numpy as np -from libc.math cimport exp -from libc.string cimport memset, memcpy -from libc.stdlib cimport calloc, free, realloc -from thinc.backends.linalg cimport Vec, VecVec -from thinc.backends.cblas cimport saxpy, sgemm - -import numpy -import numpy.random -from thinc.api import Model, CupyOps, NumpyOps, get_ops - -from .. import util -from ..errors import Errors -from ..typedefs cimport weight_t, class_t, hash_t -from ..pipeline._parser_internals.stateclass cimport StateClass - - -cdef WeightsC get_c_weights(model) except *: - cdef WeightsC output - cdef precompute_hiddens state2vec = model.state2vec - output.feat_weights = state2vec.get_feat_weights() - output.feat_bias = state2vec.bias.data - cdef np.ndarray vec2scores_W - cdef np.ndarray vec2scores_b - if model.vec2scores is None: - output.hidden_weights = NULL - output.hidden_bias = NULL - else: - vec2scores_W = model.vec2scores.get_param("W") - vec2scores_b = model.vec2scores.get_param("b") - output.hidden_weights = vec2scores_W.data - output.hidden_bias = vec2scores_b.data - cdef np.ndarray class_mask = model._class_mask - output.seen_classes = class_mask.data - return output - - -cdef SizesC get_c_sizes(model, int batch_size) except *: - cdef SizesC output - output.states = batch_size - if model.vec2scores is None: - output.classes = model.state2vec.get_dim("nO") - else: - output.classes = model.vec2scores.get_dim("nO") - output.hiddens = model.state2vec.get_dim("nO") - output.pieces = model.state2vec.get_dim("nP") - output.feats = model.state2vec.get_dim("nF") - output.embed_width = model.tokvecs.shape[1] - return output - - -cdef ActivationsC alloc_activations(SizesC n) nogil: - cdef ActivationsC A - memset(&A, 0, sizeof(A)) - resize_activations(&A, n) - return A - - -cdef void free_activations(const ActivationsC* A) nogil: - free(A.token_ids) - free(A.scores) - free(A.unmaxed) - free(A.hiddens) - free(A.is_valid) - - -cdef void resize_activations(ActivationsC* A, SizesC n) nogil: - if n.states <= A._max_size: - A._curr_size = n.states - return - if A._max_size == 0: - A.token_ids = calloc(n.states * n.feats, sizeof(A.token_ids[0])) - A.scores = calloc(n.states * n.classes, sizeof(A.scores[0])) - A.unmaxed = calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0])) - A.hiddens = calloc(n.states * n.hiddens, sizeof(A.hiddens[0])) - A.is_valid = calloc(n.states * n.classes, sizeof(A.is_valid[0])) - A._max_size = n.states - else: - A.token_ids = realloc(A.token_ids, - n.states * n.feats * sizeof(A.token_ids[0])) - A.scores = realloc(A.scores, - n.states * n.classes * sizeof(A.scores[0])) - A.unmaxed = realloc(A.unmaxed, - n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0])) - A.hiddens = realloc(A.hiddens, - n.states * n.hiddens * sizeof(A.hiddens[0])) - A.is_valid = realloc(A.is_valid, - n.states * n.classes * sizeof(A.is_valid[0])) - A._max_size = n.states - A._curr_size = n.states - - -cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states, - const WeightsC* W, SizesC n) nogil: - cdef double one = 1.0 - resize_activations(A, n) - for i in range(n.states): 
- states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats) - memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float)) - memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float)) - sum_state_features(cblas, A.unmaxed, - W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces) - for i in range(n.states): - VecVec.add_i(&A.unmaxed[i*n.hiddens*n.pieces], - W.feat_bias, 1., n.hiddens * n.pieces) - for j in range(n.hiddens): - index = i * n.hiddens * n.pieces + j * n.pieces - which = Vec.arg_max(&A.unmaxed[index], n.pieces) - A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which] - memset(A.scores, 0, n.states * n.classes * sizeof(float)) - if W.hidden_weights == NULL: - memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float)) - else: - # Compute hidden-to-output - sgemm(cblas)(False, True, n.states, n.classes, n.hiddens, - 1.0, A.hiddens, n.hiddens, - W.hidden_weights, n.hiddens, - 0.0, A.scores, n.classes) - # Add bias - for i in range(n.states): - VecVec.add_i(&A.scores[i*n.classes], - W.hidden_bias, 1., n.classes) - # Set unseen classes to minimum value - i = 0 - min_ = A.scores[0] - for i in range(1, n.states * n.classes): - if A.scores[i] < min_: - min_ = A.scores[i] - for i in range(n.states): - for j in range(n.classes): - if not W.seen_classes[j]: - A.scores[i*n.classes+j] = min_ - - -cdef void sum_state_features(CBlas cblas, float* output, - const float* cached, const int* token_ids, int B, int F, int O) nogil: - cdef int idx, b, f, i - cdef const float* feature - padding = cached - cached += F * O - cdef int id_stride = F*O - cdef float one = 1. - for b in range(B): - for f in range(F): - if token_ids[f] < 0: - feature = &padding[f*O] - else: - idx = token_ids[f] * id_stride + f*O - feature = &cached[idx] - saxpy(cblas)(O, one, feature, 1, &output[b*O], 1) - token_ids += F - - -cdef void cpu_log_loss(float* d_scores, - const float* costs, const int* is_valid, const float* scores, - int O) nogil: - """Do multi-label log loss""" - cdef double max_, gmax, Z, gZ - best = arg_max_if_gold(scores, costs, is_valid, O) - guess = Vec.arg_max(scores, O) - if best == -1 or guess == -1: - # These shouldn't happen, but if they do, we want to make sure we don't - # cause an OOB access. 
- return - Z = 1e-10 - gZ = 1e-10 - max_ = scores[guess] - gmax = scores[best] - for i in range(O): - Z += exp(scores[i] - max_) - if costs[i] <= costs[best]: - gZ += exp(scores[i] - gmax) - for i in range(O): - if costs[i] <= costs[best]: - d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ) - else: - d_scores[i] = exp(scores[i]-max_) / Z - - -cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs, - const int* is_valid, int n) nogil: - # Find minimum cost - cdef float cost = 1 - for i in range(n): - if is_valid[i] and costs[i] < cost: - cost = costs[i] - # Now find best-scoring with that cost - cdef int best = -1 - for i in range(n): - if costs[i] <= cost and is_valid[i]: - if best == -1 or scores[i] > scores[best]: - best = i - return best - - -cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil: - cdef int best = -1 - for i in range(n): - if is_valid[i] >= 1: - if best == -1 or scores[i] > scores[best]: - best = i - return best - - - -class ParserStepModel(Model): - def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True, - dropout=0.1): - Model.__init__(self, name="parser_step_model", forward=step_forward) - self.attrs["has_upper"] = has_upper - self.attrs["dropout_rate"] = dropout - self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train) - if layers[1].get_dim("nP") >= 2: - activation = "maxout" - elif has_upper: - activation = None - else: - activation = "relu" - self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1], - activation=activation, train=train) - if has_upper: - self.vec2scores = layers[-1] - else: - self.vec2scores = None - self.cuda_stream = util.get_cuda_stream(non_blocking=True) - self.backprops = [] - self._class_mask = numpy.zeros((self.nO,), dtype='f') - self._class_mask.fill(1) - if unseen_classes is not None: - for class_ in unseen_classes: - self._class_mask[class_] = 0. - - def clear_memory(self): - del self.tokvecs - del self.bp_tokvecs - del self.state2vec - del self.backprops - del self._class_mask - - @property - def nO(self): - if self.attrs["has_upper"]: - return self.vec2scores.get_dim("nO") - else: - return self.state2vec.get_dim("nO") - - def class_is_unseen(self, class_): - return self._class_mask[class_] - - def mark_class_unseen(self, class_): - self._class_mask[class_] = 0 - - def mark_class_seen(self, class_): - self._class_mask[class_] = 1 - - def get_token_ids(self, states): - cdef StateClass state - states = [state for state in states if not state.is_final()] - cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF), - dtype='i', order='C') - ids.fill(-1) - c_ids = ids.data - for state in states: - state.c.set_context_tokens(c_ids, ids.shape[1]) - c_ids += ids.shape[1] - return ids - - def backprop_step(self, token_ids, d_vector, get_d_tokvecs): - if isinstance(self.state2vec.ops, CupyOps) \ - and not isinstance(token_ids, self.state2vec.ops.xp.ndarray): - # Move token_ids and d_vector to GPU, asynchronously - self.backprops.append(( - util.get_async(self.cuda_stream, token_ids), - util.get_async(self.cuda_stream, d_vector), - get_d_tokvecs - )) - else: - self.backprops.append((token_ids, d_vector, get_d_tokvecs)) - - - def finish_steps(self, golds): - # Add a padding vector to the d_tokvecs gradient, so that missing - # values don't affect the real gradient. - d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1])) - # Tells CUDA to block, so our async copies complete. 
- if self.cuda_stream is not None: - self.cuda_stream.synchronize() - for ids, d_vector, bp_vector in self.backprops: - d_state_features = bp_vector((d_vector, ids)) - ids = ids.flatten() - d_state_features = d_state_features.reshape( - (ids.size, d_state_features.shape[2])) - self.ops.scatter_add(d_tokvecs, ids, - d_state_features) - # Padded -- see update() - self.bp_tokvecs(d_tokvecs[:-1]) - return d_tokvecs - -NUMPY_OPS = NumpyOps() - -def step_forward(model: ParserStepModel, states, is_train): - token_ids = model.get_token_ids(states) - vector, get_d_tokvecs = model.state2vec(token_ids, is_train) - mask = None - if model.attrs["has_upper"]: - dropout_rate = model.attrs["dropout_rate"] - if is_train and dropout_rate > 0: - mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1) - vector *= mask - scores, get_d_vector = model.vec2scores(vector, is_train) - else: - scores = NumpyOps().asarray(vector) - get_d_vector = lambda d_scores: d_scores - # If the class is unseen, make sure its score is minimum - scores[:, model._class_mask == 0] = numpy.nanmin(scores) - - def backprop_parser_step(d_scores): - # Zero vectors for unseen classes - d_scores *= model._class_mask - d_vector = get_d_vector(d_scores) - if mask is not None: - d_vector *= mask - model.backprop_step(token_ids, d_vector, get_d_tokvecs) - return None - return scores, backprop_parser_step - - -cdef class precompute_hiddens: - """Allow a model to be "primed" by pre-computing input features in bulk. - - This is used for the parser, where we want to take a batch of documents, - and compute vectors for each (token, position) pair. These vectors can then - be reused, especially for beam-search. - - Let's say we're using 12 features for each state, e.g. word at start of - buffer, three words on stack, their children, etc. In the normal arc-eager - system, a document of length N is processed in 2*N states. This means we'll - create 2*N*12 feature vectors --- but if we pre-compute, we only need - N*12 vector computations. The saving for beam-search is much better: - if we have a beam of k, we'll normally make 2*N*12*K computations -- - so we can save the factor k. This also gives a nice CPU/GPU division: - we can do all our hard maths up front, packed into large multiplications, - and do the hard-to-program parsing on the CPU. - """ - cdef readonly int nF, nO, nP - cdef bint _is_synchronized - cdef public object ops - cdef public object numpy_ops - cdef np.ndarray _features - cdef np.ndarray _cached - cdef np.ndarray bias - cdef object _cuda_stream - cdef object _bp_hiddens - cdef object activation - - def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None, - activation="maxout", train=False): - gpu_cached, bp_features = lower_model(tokvecs, train) - cdef np.ndarray cached - if not isinstance(gpu_cached, numpy.ndarray): - # Note the passing of cuda_stream here: it lets - # cupy make the copy asynchronously. - # We then have to block before first use. 
- cached = gpu_cached.get(stream=cuda_stream) - else: - cached = gpu_cached - if not isinstance(lower_model.get_param("b"), numpy.ndarray): - self.bias = lower_model.get_param("b").get(stream=cuda_stream) - else: - self.bias = lower_model.get_param("b") - self.nF = cached.shape[1] - if lower_model.has_dim("nP"): - self.nP = lower_model.get_dim("nP") - else: - self.nP = 1 - self.nO = cached.shape[2] - self.ops = lower_model.ops - self.numpy_ops = NumpyOps() - assert activation in (None, "relu", "maxout") - self.activation = activation - self._is_synchronized = False - self._cuda_stream = cuda_stream - self._cached = cached - self._bp_hiddens = bp_features - - cdef const float* get_feat_weights(self) except NULL: - if not self._is_synchronized and self._cuda_stream is not None: - self._cuda_stream.synchronize() - self._is_synchronized = True - return self._cached.data - - def has_dim(self, name): - if name == "nF": - return self.nF if self.nF is not None else True - elif name == "nP": - return self.nP if self.nP is not None else True - elif name == "nO": - return self.nO if self.nO is not None else True - else: - return False - - def get_dim(self, name): - if name == "nF": - return self.nF - elif name == "nP": - return self.nP - elif name == "nO": - return self.nO - else: - raise ValueError(Errors.E1033.format(name=name)) - - def set_dim(self, name, value): - if name == "nF": - self.nF = value - elif name == "nP": - self.nP = value - elif name == "nO": - self.nO = value - else: - raise ValueError(Errors.E1033.format(name=name)) - - def __call__(self, X, bint is_train): - if is_train: - return self.begin_update(X) - else: - return self.predict(X), lambda X: X - - def predict(self, X): - return self.begin_update(X)[0] - - def begin_update(self, token_ids): - cdef np.ndarray state_vector = numpy.zeros( - (token_ids.shape[0], self.nO, self.nP), dtype='f') - # This is tricky, but (assuming GPU available); - # - Input to forward on CPU - # - Output from forward on CPU - # - Input to backward on GPU! - # - Output from backward on GPU - bp_hiddens = self._bp_hiddens - - cdef CBlas cblas - if isinstance(self.ops, CupyOps): - cblas = NUMPY_OPS.cblas() - else: - cblas = self.ops.cblas() - - feat_weights = self.get_feat_weights() - cdef int[:, ::1] ids = token_ids - sum_state_features(cblas, state_vector.data, - feat_weights, &ids[0,0], - token_ids.shape[0], self.nF, self.nO*self.nP) - state_vector += self.bias - state_vector, bp_nonlinearity = self._nonlinearity(state_vector) - - def backward(d_state_vector_ids): - d_state_vector, token_ids = d_state_vector_ids - d_state_vector = bp_nonlinearity(d_state_vector) - d_tokens = bp_hiddens((d_state_vector, token_ids)) - return d_tokens - return state_vector, backward - - def _nonlinearity(self, state_vector): - if self.activation == "maxout": - return self._maxout_nonlinearity(state_vector) - else: - return self._relu_nonlinearity(state_vector) - - def _maxout_nonlinearity(self, state_vector): - state_vector, mask = self.numpy_ops.maxout(state_vector) - # We're outputting to CPU, but we need this variable on GPU for the - # backward pass. - mask = self.ops.asarray(mask) - - def backprop_maxout(d_best): - return self.ops.backprop_maxout(d_best, mask, self.nP) - - return state_vector, backprop_maxout - - def _relu_nonlinearity(self, state_vector): - state_vector = state_vector.reshape((state_vector.shape[0], -1)) - mask = state_vector >= 0. - state_vector *= mask - # We're outputting to CPU, but we need this variable on GPU for the - # backward pass. 
- mask = self.ops.asarray(mask) - - def backprop_relu(d_best): - d_best *= mask - return d_best.reshape((d_best.shape + (1,))) - - return state_vector, backprop_relu diff --git a/spacy/ml/tb_framework.pxd b/spacy/ml/tb_framework.pxd new file mode 100644 index 000000000..965508519 --- /dev/null +++ b/spacy/ml/tb_framework.pxd @@ -0,0 +1,28 @@ +from libc.stdint cimport int8_t + + +cdef struct SizesC: + int states + int classes + int hiddens + int pieces + int feats + int embed_width + int tokens + + +cdef struct WeightsC: + const float* feat_weights + const float* feat_bias + const float* hidden_bias + const float* hidden_weights + const int8_t* seen_mask + + +cdef struct ActivationsC: + int* token_ids + float* unmaxed + float* hiddens + int* is_valid + int _curr_size + int _max_size diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py deleted file mode 100644 index ab4a969e2..000000000 --- a/spacy/ml/tb_framework.py +++ /dev/null @@ -1,50 +0,0 @@ -from thinc.api import Model, noop -from .parser_model import ParserStepModel -from ..util import registry - - -@registry.layers("spacy.TransitionModel.v1") -def TransitionModel( - tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set() -): - """Set up a stepwise transition-based model""" - if upper is None: - has_upper = False - upper = noop() - else: - has_upper = True - # don't define nO for this object, because we can't dynamically change it - return Model( - name="parser_model", - forward=forward, - dims={"nI": tok2vec.maybe_get_dim("nI")}, - layers=[tok2vec, lower, upper], - refs={"tok2vec": tok2vec, "lower": lower, "upper": upper}, - init=init, - attrs={ - "has_upper": has_upper, - "unseen_classes": set(unseen_classes), - "resize_output": resize_output, - }, - ) - - -def forward(model, X, is_train): - step_model = ParserStepModel( - X, - model.layers, - unseen_classes=model.attrs["unseen_classes"], - train=is_train, - has_upper=model.attrs["has_upper"], - ) - - return step_model, step_model.finish_steps - - -def init(model, X=None, Y=None): - model.get_ref("tok2vec").initialize(X=X) - lower = model.get_ref("lower") - lower.initialize() - if model.attrs["has_upper"]: - statevecs = model.ops.alloc2f(2, lower.get_dim("nO")) - model.get_ref("upper").initialize(X=statevecs) diff --git a/spacy/ml/tb_framework.pyx b/spacy/ml/tb_framework.pyx new file mode 100644 index 000000000..9b2114900 --- /dev/null +++ b/spacy/ml/tb_framework.pyx @@ -0,0 +1,623 @@ +# cython: infer_types=True, cdivision=True, boundscheck=False +from typing import List, Tuple, Any, Optional, TypeVar, cast +from libc.string cimport memset, memcpy +from libc.stdlib cimport calloc, free, realloc +from libcpp.vector cimport vector +import numpy +cimport numpy as np +from thinc.api import Model, normal_init, chain, list2array, Linear +from thinc.api import uniform_init, glorot_uniform_init, zero_init +from thinc.api import NumpyOps +from thinc.backends.cblas cimport CBlas, saxpy, sgemm +from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d +from thinc.types import Ints1d, Ints2d + +from ..errors import Errors +from ..pipeline._parser_internals import _beam_utils +from ..pipeline._parser_internals.batch import GreedyBatch +from ..pipeline._parser_internals._parser_utils cimport arg_max +from ..pipeline._parser_internals.transition_system cimport c_transition_batch, c_apply_actions +from ..pipeline._parser_internals.transition_system cimport TransitionSystem +from ..pipeline._parser_internals.stateclass cimport StateC, StateClass +from ..tokens.doc 
import Doc +from ..util import registry + + +State = Any # TODO + + +@registry.layers("spacy.TransitionModel.v2") +def TransitionModel( + *, + tok2vec: Model[List[Doc], List[Floats2d]], + beam_width: int = 1, + beam_density: float = 0.0, + state_tokens: int, + hidden_width: int, + maxout_pieces: int, + nO: Optional[int] = None, + unseen_classes=set(), +) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]: + """Set up a transition-based parsing model, using a maxout hidden + layer and a linear output layer. + """ + t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None + tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width)) # type: ignore + tok2vec_projected.set_dim("nO", hidden_width) + + # FIXME: we use `output` as a container for the output layer's + # weights and biases. Thinc optimizers cannot handle resizing + # of parameters. So, when the parser model is resized, we + # construct a new `output` layer, which has a different key in + # the optimizer. Once the optimizer supports parameter resizing, + # we can replace the `output` layer by `output_W` and `output_b` + # parameters in this model. + output = Linear(nO=None, nI=hidden_width, init_W=zero_init) + + return Model( + name="parser_model", + forward=forward, + init=init, + layers=[tok2vec_projected, output], + refs={ + "tok2vec": tok2vec_projected, + "output": output, + }, + params={ + "hidden_W": None, # Floats2d W for the hidden layer + "hidden_b": None, # Floats1d bias for the hidden layer + "hidden_pad": None, # Floats1d padding for the hidden layer + }, + dims={ + "nO": None, # Output size + "nP": maxout_pieces, + "nH": hidden_width, + "nI": tok2vec_projected.maybe_get_dim("nO"), + "nF": state_tokens, + }, + attrs={ + "beam_width": beam_width, + "beam_density": beam_density, + "unseen_classes": set(unseen_classes), + "resize_output": resize_output, + }, + ) + + +def resize_output(model: Model, new_nO: int) -> Model: + old_nO = model.maybe_get_dim("nO") + output = model.get_ref("output") + if old_nO is None: + model.set_dim("nO", new_nO) + output.set_dim("nO", new_nO) + output.initialize() + return model + elif new_nO <= old_nO: + return model + elif output.has_param("W"): + nH = model.get_dim("nH") + new_output = Linear(nO=new_nO, nI=nH, init_W=zero_init) + new_output.initialize() + new_W = new_output.get_param("W") + new_b = new_output.get_param("b") + old_W = output.get_param("W") + old_b = output.get_param("b") + new_W[:old_nO] = old_W # type: ignore + new_b[:old_nO] = old_b # type: ignore + for i in range(old_nO, new_nO): + model.attrs["unseen_classes"].add(i) + model.layers[-1] = new_output + model.set_ref("output", new_output) + # TODO: Avoid this private intrusion + model._dims["nO"] = new_nO + return model + + +def init( + model, + X: Optional[Tuple[List[Doc], TransitionSystem]] = None, + Y: Optional[Tuple[List[State], List[Floats2d]]] = None, +): + if X is not None: + docs, moves = X + model.get_ref("tok2vec").initialize(X=docs) + else: + model.get_ref("tok2vec").initialize() + inferred_nO = _infer_nO(Y) + if inferred_nO is not None: + current_nO = model.maybe_get_dim("nO") + if current_nO is None or current_nO != inferred_nO: + model.attrs["resize_output"](model, inferred_nO) + nO = model.get_dim("nO") + nP = model.get_dim("nP") + nH = model.get_dim("nH") + nI = model.get_dim("nI") + nF = model.get_dim("nF") + ops = model.ops + + Wl = ops.alloc2f(nH * nP, nF * nI) + bl = ops.alloc1f(nH * nP) + padl = ops.alloc1f(nI) + # Wl = zero_init(ops, 
Wl.shape) + Wl = glorot_uniform_init(ops, Wl.shape) + padl = uniform_init(ops, padl.shape) # type: ignore + # TODO: Experiment with whether better to initialize output_W + model.set_param("hidden_W", Wl) + model.set_param("hidden_b", bl) + model.set_param("hidden_pad", padl) + # model = _lsuv_init(model) + return model + + +class TransitionModelInputs: + """ + Input to transition model. + """ + + # dataclass annotation is not yet supported in Cython 0.29.x, + # so, we'll do something close to it. + + actions: Optional[List[Ints1d]] + docs: List[Doc] + max_moves: int + moves: TransitionSystem + states: Optional[List[State]] + + __slots__ = [ + "actions", + "docs", + "max_moves", + "moves", + "states", + ] + + def __init__( + self, + docs: List[Doc], + moves: TransitionSystem, + actions: Optional[List[Ints1d]]=None, + max_moves: int=0, + states: Optional[List[State]]=None): + """ + actions (Optional[List[Ints1d]]): actions to apply for each Doc. + docs (List[Doc]): Docs to predict transition sequences for. + max_moves: (int): the maximum number of moves to apply, values less + than 1 will apply moves to states until they are final states. + moves (TransitionSystem): the transition system to use when predicting + the transition sequences. + states (Optional[List[States]]): the initial states to predict the + transition sequences for. When absent, the initial states are + initialized from the provided Docs. + """ + self.actions = actions + self.docs = docs + self.moves = moves + self.max_moves = max_moves + self.states = states + + +def forward(model, inputs: TransitionModelInputs, is_train: bool): + docs = inputs.docs + moves = inputs.moves + actions = inputs.actions + + beam_width = model.attrs["beam_width"] + hidden_pad = model.get_param("hidden_pad") + tok2vec = model.get_ref("tok2vec") + + states = moves.init_batch(docs) if inputs.states is None else inputs.states + tokvecs, backprop_tok2vec = tok2vec(docs, is_train) + tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad)) + feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train) + seen_mask = _get_seen_mask(model) + + if not is_train and beam_width == 1 and isinstance(model.ops, NumpyOps): + # Note: max_moves is only used during training, so we don't need to + # pass it to the greedy inference path. + return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions) + else: + return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec, + feats, backprop_feats, seen_mask, is_train, actions=actions, + max_moves=inputs.max_moves) + + +def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats, + np.ndarray[np.npy_bool, ndim=1] seen_mask, actions: Optional[List[Ints1d]]=None): + cdef vector[StateC*] c_states + cdef StateClass state + for state in states: + if not state.is_final(): + c_states.push_back(state.c) + weights = _get_c_weights(model, feats.data, seen_mask) + # Precomputed features have rows for each token, plus one for padding. 
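+ # Note: feats has shape (n_tokens + 1, nF, nH * nP); the trailing row holds the padding vector, so the real token count is feats.shape[0] - 1.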
+ cdef int n_tokens = feats.shape[0] - 1 + sizes = _get_c_sizes(model, c_states.size(), n_tokens) + cdef CBlas cblas = model.ops.cblas() + scores = _parse_batch(cblas, moves, &c_states[0], weights, sizes, actions=actions) + + def backprop(dY): + raise ValueError(Errors.E4004) + + return (states, scores), backprop + +cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states, + WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None): + cdef int i, j + cdef vector[StateC *] unfinished + cdef ActivationsC activations = _alloc_activations(sizes) + cdef np.ndarray step_scores + cdef np.ndarray step_actions + + scores = [] + while sizes.states >= 1 and (actions is None or len(actions) > 0): + step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f") + step_actions = actions[0] if actions is not None else None + assert step_actions is None or step_actions.size == sizes.states, \ + f"number of step actions ({step_actions.size}) must equal number of states ({sizes.states})" + with nogil: + _predict_states(cblas, &activations, step_scores.data, states, &weights, sizes) + if actions is None: + # Validate actions, argmax, take action. + c_transition_batch(moves, states, step_scores.data, sizes.classes, + sizes.states) + else: + c_apply_actions(moves, states, step_actions.data, sizes.states) + for i in range(sizes.states): + if not states[i].is_final(): + unfinished.push_back(states[i]) + for i in range(unfinished.size()): + states[i] = unfinished[i] + sizes.states = unfinished.size() + scores.append(step_scores) + unfinished.clear() + actions = actions[1:] if actions is not None else None + _free_activations(&activations) + + return scores + + +def _forward_fallback( + model: Model, + moves: TransitionSystem, + states: List[StateClass], + tokvecs, backprop_tok2vec, + feats, + backprop_feats, + seen_mask, + is_train: bool, + actions: Optional[List[Ints1d]]=None, + max_moves: int=0): + nF = model.get_dim("nF") + output = model.get_ref("output") + hidden_b = model.get_param("hidden_b") + nH = model.get_dim("nH") + nP = model.get_dim("nP") + + beam_width = model.attrs["beam_width"] + beam_density = model.attrs["beam_density"] + + ops = model.ops + + all_ids = [] + all_which = [] + all_statevecs = [] + all_scores = [] + if beam_width == 1: + batch = GreedyBatch(moves, states, None) + else: + batch = _beam_utils.BeamBatch( + moves, states, None, width=beam_width, density=beam_density + ) + arange = ops.xp.arange(nF) + n_moves = 0 + while not batch.is_done: + ids = numpy.zeros((len(batch.get_unfinished_states()), nF), dtype="i") + for i, state in enumerate(batch.get_unfinished_states()): + state.set_context_tokens(ids, i, nF) + # Sum the state features, add the bias and apply the activation (maxout) + # to create the state vectors. + preacts2f = feats[ids, arange].sum(axis=1) # type: ignore + preacts2f += hidden_b + preacts = ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP) + assert preacts.shape[0] == len(batch.get_unfinished_states()), preacts.shape + statevecs, which = ops.maxout(preacts) + # We don't use output's backprop, since we want to backprop for + # all states at once, rather than a single state. + scores = output.predict(statevecs) + scores[:, seen_mask] = ops.xp.nanmin(scores) + # Transition the states, filtering out any that are finished. 
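+ # The scores are copied to NumPy because the transition system advances states on the CPU; the original scores array (possibly on GPU) is kept in all_scores for the backward pass.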
+ cpu_scores = ops.to_numpy(scores) + if actions is None: + batch.advance(cpu_scores) + else: + batch.advance_with_actions(actions[0]) + actions = actions[1:] + all_scores.append(scores) + if is_train: + # Remember intermediate results for the backprop. + all_ids.append(ids) + all_statevecs.append(statevecs) + all_which.append(which) + if n_moves >= max_moves >= 1: + break + n_moves += 1 + + def backprop_parser(d_states_d_scores): + ids = ops.xp.vstack(all_ids) + which = ops.xp.vstack(all_which) + statevecs = ops.xp.vstack(all_statevecs) + _, d_scores = d_states_d_scores + if model.attrs.get("unseen_classes"): + # If we have a negative gradient (i.e. the probability should + # increase) on any classes we filtered out as unseen, mark + # them as seen. + for clas in set(model.attrs["unseen_classes"]): + if (d_scores[:, clas] < 0).any(): + model.attrs["unseen_classes"].remove(clas) + d_scores *= seen_mask == False + # Calculate the gradients for the parameters of the output layer. + # The weight gemm is (nS, nO) @ (nS, nH).T + output.inc_grad("b", d_scores.sum(axis=0)) + output.inc_grad("W", ops.gemm(d_scores, statevecs, trans1=True)) + # Now calculate d_statevecs, by backproping through the output linear layer. + # This gemm is (nS, nO) @ (nO, nH) + output_W = output.get_param("W") + d_statevecs = ops.gemm(d_scores, output_W) + # Backprop through the maxout activation + d_preacts = ops.backprop_maxout(d_statevecs, which, nP) + d_preacts2f = ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP) + model.inc_grad("hidden_b", d_preacts2f.sum(axis=0)) + # We don't need to backprop the summation, because we pass back the IDs instead + d_state_features = backprop_feats((d_preacts2f, ids)) + d_tokvecs = ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1]) + ops.scatter_add(d_tokvecs, ids, d_state_features) + model.inc_grad("hidden_pad", d_tokvecs[-1]) + return (backprop_tok2vec(d_tokvecs[:-1]), None) + + return (list(batch), all_scores), backprop_parser + + +def _get_seen_mask(model: Model) -> numpy.array[bool, 1]: + mask = model.ops.xp.zeros(model.get_dim("nO"), dtype="bool") + for class_ in model.attrs.get("unseen_classes", set()): + mask[class_] = True + return mask + + +def _forward_precomputable_affine(model, X: Floats2d, is_train: bool): + W: Floats2d = model.get_param("hidden_W") + nF = model.get_dim("nF") + nH = model.get_dim("nH") + nP = model.get_dim("nP") + nI = model.get_dim("nI") + # The weights start out (nH * nP, nF * nI). Transpose and reshape to (nF * nH *nP, nI) + W3f = model.ops.reshape3f(W, nH * nP, nF, nI) + W3f = W3f.transpose((1, 0, 2)) + W2f = model.ops.reshape2f(W3f, nF * nH * nP, nI) + assert X.shape == (X.shape[0], nI), X.shape + Yf_ = model.ops.gemm(X, W2f, trans2=True) + Yf = model.ops.reshape3f(Yf_, Yf_.shape[0], nF, nH * nP) + + def backward(dY_ids: Tuple[Floats3d, Ints2d]): + # This backprop is particularly tricky, because we get back a different + # thing from what we put out. We put out an array of shape: + # (nB, nF, nH, nP), and get back: + # (nB, nH, nP) and ids (nB, nF) + # The ids tell us the values of nF, so we would have: + # + # dYf = zeros((nB, nF, nH, nP)) + # for b in range(nB): + # for f in range(nF): + # dYf[b, ids[b, f]] += dY[b] + # + # However, we avoid building that array for efficiency -- and just pass + # in the indices. 
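+ # Shapes: dY is (nB, nH * nP) and W is (nH * nP, nF * nI), so dXf is (nB, nF * nI) and is reshaped to (nB, nF, nI); Xf gathers the input rows for the ids to form the (nH * nP, nF * nI) gradient of hidden_W.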
+ dY, ids = dY_ids + dXf = model.ops.gemm(dY, W) + Xf = X[ids].reshape((ids.shape[0], -1)) + dW = model.ops.gemm(dY, Xf, trans1=True) + model.inc_grad("hidden_W", dW) + return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI) + + return Yf, backward + + +def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]: + if Y is None: + return None + _, scores = Y + if len(scores) == 0: + return None + assert scores[0].shape[0] >= 1 + assert len(scores[0].shape) == 2 + return scores[0].shape[1] + + +def _lsuv_init(model: Model): + """This is like the 'layer sequential unit variance', but instead + of taking the actual inputs, we randomly generate whitened data. + + Why's this all so complicated? We have a huge number of inputs, + and the maxout unit makes guessing the dynamics tricky. Instead + we set the maxout weights to values that empirically result in + whitened outputs given whitened inputs. + """ + W = model.maybe_get_param("hidden_W") + if W is not None and W.any(): + return + + nF = model.get_dim("nF") + nH = model.get_dim("nH") + nP = model.get_dim("nP") + nI = model.get_dim("nI") + W = model.ops.alloc4f(nF, nH, nP, nI) + b = model.ops.alloc2f(nH, nP) + pad = model.ops.alloc4f(1, nF, nH, nP) + + ops = model.ops + W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI))) + pad = normal_init(ops, pad.shape, mean=1.0) + model.set_param("W", W) + model.set_param("b", b) + model.set_param("pad", pad) + + ids = ops.alloc_f((5000, nF), dtype="f") + ids += ops.xp.random.uniform(0, 1000, ids.shape) + ids = ops.asarray(ids, dtype="i") + tokvecs = ops.alloc_f((5000, nI), dtype="f") + tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape( + tokvecs.shape + ) + + def predict(ids, tokvecs): + # nS ids. nW tokvecs. Exclude the padding array. 
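+ # Intended to emulate the real forward pass on random data: compute the precomputed features, accumulate them into state vectors with scatter_add, add the bias and take the maxout.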
+ hiddens, _ = _forward_precomputable_affine(model, tokvecs[:-1], False) + vectors = model.ops.alloc2f(ids.shape[0], nH * nP) + # need nS vectors + hiddens = hiddens.reshape((hiddens.shape[0] * nF, nH * nP)) + model.ops.scatter_add(vectors, ids.flatten(), hiddens) + vectors3f = model.ops.reshape3f(vectors, vectors.shape[0], nH, nP) + vectors3f += b + return model.ops.maxout(vectors3f)[0] + + tol_var = 0.01 + tol_mean = 0.01 + t_max = 10 + W = cast(Floats4d, model.get_param("hidden_W").copy()) + b = cast(Floats2d, model.get_param("hidden_b").copy()) + for t_i in range(t_max): + acts1 = predict(ids, tokvecs) + var = model.ops.xp.var(acts1) + mean = model.ops.xp.mean(acts1) + if abs(var - 1.0) >= tol_var: + W /= model.ops.xp.sqrt(var) + model.set_param("hidden_W", W) + elif abs(mean) >= tol_mean: + b -= mean + model.set_param("hidden_b", b) + else: + break + return model + + +cdef WeightsC _get_c_weights(model, const float* feats, np.ndarray[np.npy_bool, ndim=1] seen_mask) except *: + output = model.get_ref("output") + cdef np.ndarray hidden_b = model.get_param("hidden_b") + cdef np.ndarray output_W = output.get_param("W") + cdef np.ndarray output_b = output.get_param("b") + + cdef WeightsC weights + weights.feat_weights = feats + weights.feat_bias = hidden_b.data + weights.hidden_weights = output_W.data + weights.hidden_bias = output_b.data + weights.seen_mask = seen_mask.data + + return weights + + +cdef SizesC _get_c_sizes(model, int batch_size, int tokens) except *: + cdef SizesC sizes + sizes.states = batch_size + sizes.classes = model.get_dim("nO") + sizes.hiddens = model.get_dim("nH") + sizes.pieces = model.get_dim("nP") + sizes.feats = model.get_dim("nF") + sizes.embed_width = model.get_dim("nI") + sizes.tokens = tokens + return sizes + + +cdef ActivationsC _alloc_activations(SizesC n) nogil: + cdef ActivationsC A + memset(&A, 0, sizeof(A)) + _resize_activations(&A, n) + return A + + +cdef void _free_activations(const ActivationsC* A) nogil: + free(A.token_ids) + free(A.unmaxed) + free(A.hiddens) + free(A.is_valid) + + +cdef void _resize_activations(ActivationsC* A, SizesC n) nogil: + if n.states <= A._max_size: + A._curr_size = n.states + return + if A._max_size == 0: + A.token_ids = calloc(n.states * n.feats, sizeof(A.token_ids[0])) + A.unmaxed = calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0])) + A.hiddens = calloc(n.states * n.hiddens, sizeof(A.hiddens[0])) + A.is_valid = calloc(n.states * n.classes, sizeof(A.is_valid[0])) + A._max_size = n.states + else: + A.token_ids = realloc(A.token_ids, + n.states * n.feats * sizeof(A.token_ids[0])) + A.unmaxed = realloc(A.unmaxed, + n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0])) + A.hiddens = realloc(A.hiddens, + n.states * n.hiddens * sizeof(A.hiddens[0])) + A.is_valid = realloc(A.is_valid, + n.states * n.classes * sizeof(A.is_valid[0])) + A._max_size = n.states + A._curr_size = n.states + + +cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** states, const WeightsC* W, SizesC n) nogil: + _resize_activations(A, n) + for i in range(n.states): + states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats) + memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float)) + _sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n) + for i in range(n.states): + saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1) + for j in range(n.hiddens): + index = i * n.hiddens * n.pieces + j * n.pieces + which = arg_max(&A.unmaxed[index], n.pieces) + 
A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which] + if W.hidden_weights == NULL: + memcpy(scores, A.hiddens, n.states * n.classes * sizeof(float)) + else: + # Compute hidden-to-output + sgemm(cblas)(False, True, n.states, n.classes, n.hiddens, + 1.0, A.hiddens, n.hiddens, + W.hidden_weights, n.hiddens, + 0.0, scores, n.classes) + # Add bias + for i in range(n.states): + saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1) + # Set unseen classes to minimum value + i = 0 + min_ = scores[0] + for i in range(1, n.states * n.classes): + if scores[i] < min_: + min_ = scores[i] + for i in range(n.states): + for j in range(n.classes): + if W.seen_mask[j]: + scores[i*n.classes+j] = min_ + + +cdef void _sum_state_features(CBlas cblas, float* output, + const float* cached, const int* token_ids, SizesC n) nogil: + cdef int idx, b, f, i + cdef const float* feature + cdef int B = n.states + cdef int O = n.hiddens * n.pieces + cdef int F = n.feats + cdef int T = n.tokens + padding = cached + (T * F * O) + cdef int id_stride = F*O + cdef float one = 1. + for b in range(B): + for f in range(F): + if token_ids[f] < 0: + feature = &padding[f*O] + else: + idx = token_ids[f] * id_stride + f*O + feature = &cached[idx] + saxpy(cblas)(O, one, feature, 1, &output[b*O], 1) + token_ids += F + diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 8d449d065..63faab5be 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -1,23 +1,41 @@ -from cymem.cymem cimport Pool -from preshed.maps cimport PreshMap cimport numpy as np -from libc.stdint cimport uint64_t +from libc.stdint cimport uint32_t, uint64_t +from libcpp.unordered_map cimport unordered_map +from libcpp.vector cimport vector +from libcpp.memory cimport shared_ptr -from .structs cimport MorphAnalysisC from .strings cimport StringStore from .typedefs cimport attr_t, hash_t +cdef cppclass Feature: + hash_t field + hash_t value + + __init__(): + this.field = 0 + this.value = 0 + + +cdef cppclass MorphAnalysisC: + hash_t key + vector[Feature] features + + __init__(): + this.key = 0 + cdef class Morphology: - cdef readonly Pool mem cdef readonly StringStore strings - cdef PreshMap tags # Keyed by hash, value is pointer to tag + cdef unordered_map[hash_t, shared_ptr[MorphAnalysisC]] tags - cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except * - cdef int insert(self, MorphAnalysisC tag) except -1 + cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash) + cdef void _intern_morph_tag(self, hash_t tag_key, feats) + cdef hash_t _add(self, features) + cdef str _normalize_features(self, features) + cdef str get_morph_str(self, hash_t morph_key) + cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key) - -cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil -cdef list list_features(const MorphAnalysisC* morph) -cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field) -cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil +cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil +cdef list list_features(const shared_ptr[MorphAnalysisC] morph) +cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field) +cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index c3ffc46a1..2c3be7b46 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -1,10 +1,10 @@ # cython: 
infer_types import numpy import warnings +from typing import Union, Tuple, List, Dict, Optional +from cython.operator cimport dereference as deref +from libcpp.memory cimport shared_ptr -from .attrs cimport POS - -from .parts_of_speech import IDS as POS_IDS from .errors import Warnings from . import symbols @@ -24,134 +24,187 @@ cdef class Morphology: EMPTY_MORPH = symbols.NAMES[symbols._] def __init__(self, StringStore strings): - self.mem = Pool() self.strings = strings - self.tags = PreshMap() def __reduce__(self): tags = set([self.get(self.strings[s]) for s in self.strings]) tags -= set([""]) return (unpickle_morphology, (self.strings, sorted(tags)), None, None) - def add(self, features): + cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash): + match = self.tags.find(tag_hash) + if match != self.tags.const_end(): + return deref(match).second + else: + return shared_ptr[MorphAnalysisC]() + + def _normalize_attr(self, attr_key : Union[int, str], attr_value : Union[int, str]) -> Optional[Tuple[str, Union[str, List[str]]]]: + if isinstance(attr_key, (int, str)) and isinstance(attr_value, (int, str)): + attr_key = self.strings.as_string(attr_key) + attr_value = self.strings.as_string(attr_value) + + # Preserve multiple values as a list + if self.VALUE_SEP in attr_value: + values = attr_value.split(self.VALUE_SEP) + values.sort() + attr_value = values + else: + warnings.warn(Warnings.W100.format(feature={attr_key: attr_value})) + return None + + return attr_key, attr_value + + def _str_to_normalized_feat_dict(self, feats: str) -> Dict[str, str]: + if not feats or feats == self.EMPTY_MORPH: + return {} + + out = [] + for feat in feats.split(self.FEATURE_SEP): + field, values = feat.split(self.FIELD_SEP, 1) + normalized_attr = self._normalize_attr(field, values) + if normalized_attr is None: + continue + out.append((normalized_attr[0], normalized_attr[1])) + out.sort(key=lambda x: x[0]) + return dict(out) + + def _dict_to_normalized_feat_dict(self, feats: Dict[Union[int, str], Union[int, str]]) -> Dict[str, str]: + out = [] + for field, values in feats.items(): + normalized_attr = self._normalize_attr(field, values) + if normalized_attr is None: + continue + out.append((normalized_attr[0], normalized_attr[1])) + out.sort(key=lambda x: x[0]) + return dict(out) + + + def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str: + norm_feats_string = self.FEATURE_SEP.join([ + self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values]) + for field, values in feats.items() + ]) + return norm_feats_string or self.EMPTY_MORPH + + + cdef hash_t _add(self, features): """Insert a morphological analysis in the morphology table, if not already present. The morphological analysis may be provided in the UD FEATS format as a string or in the tag map dict format. Returns the hash of the new analysis. 
""" - cdef MorphAnalysisC* tag_ptr + cdef hash_t tag_hash = 0 + cdef shared_ptr[MorphAnalysisC] tag if isinstance(features, str): if features == "": features = self.EMPTY_MORPH - tag_ptr = self.tags.get(self.strings[features]) - if tag_ptr != NULL: - return tag_ptr.key - features = self.feats_to_dict(features) - if not isinstance(features, dict): + + tag_hash = self.strings[features] + tag = self._lookup_tag(tag_hash) + if tag: + return deref(tag).key + + features = self._str_to_normalized_feat_dict(features) + elif isinstance(features, dict): + features = self._dict_to_normalized_feat_dict(features) + else: warnings.warn(Warnings.W100.format(feature=features)) features = {} - string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()} - # intified ("Field", "Field=Value") pairs - field_feature_pairs = [] - for field in sorted(string_features): - values = string_features[field] - for value in values.split(self.VALUE_SEP): - field_feature_pairs.append(( - self.strings.add(field), - self.strings.add(field + self.FIELD_SEP + value), - )) - cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs) + # the hash key for the tag is either the hash of the normalized UFEATS # string or the hash of an empty placeholder - norm_feats_string = self.normalize_features(features) - tag.key = self.strings.add(norm_feats_string) - self.insert(tag) - return tag.key + norm_feats_string = self._normalized_feat_dict_to_str(features) + tag_hash = self.strings.add(norm_feats_string) + tag = self._lookup_tag(tag_hash) + if tag: + return deref(tag).key - def normalize_features(self, features): + self._intern_morph_tag(tag_hash, features) + return tag_hash + + cdef void _intern_morph_tag(self, hash_t tag_key, feats): + # intified ("Field", "Field=Value") pairs where fields with multiple values have + # been split into individual tuples, e.g.: + # [("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"), + # ("Field2", "Field2=Value3")] + field_feature_pairs = [] + + # Feat dict is normalized at this point. + for field, values in feats.items(): + field_key = self.strings.add(field) + if isinstance(values, list): + for value in values: + value_key = self.strings.add(field + self.FIELD_SEP + value) + field_feature_pairs.append((field_key, value_key)) + else: + # We could box scalar values into a list and use a common + # code path to generate features but that incurs a small + # but measurable allocation/iteration overhead (as this + # branch is taken often enough). + value_key = self.strings.add(field + self.FIELD_SEP + values) + field_feature_pairs.append((field_key, value_key)) + + num_features = len(field_feature_pairs) + cdef shared_ptr[MorphAnalysisC] tag = shared_ptr[MorphAnalysisC](new MorphAnalysisC()) + deref(tag).key = tag_key + deref(tag).features.resize(num_features) + + for i in range(num_features): + deref(tag).features[i].field = field_feature_pairs[i][0] + deref(tag).features[i].value = field_feature_pairs[i][1] + + self.tags[tag_key] = tag + + cdef str get_morph_str(self, hash_t morph_key): + cdef shared_ptr[MorphAnalysisC] tag = self._lookup_tag(morph_key) + if not tag: + return "" + else: + return self.strings[deref(tag).key] + + cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key): + return self._lookup_tag(morph_key) + + cdef str _normalize_features(self, features): """Create a normalized FEATS string from a features string or dict. features (Union[dict, str]): Features as dict or UFEATS string. 
RETURNS (str): Features as normalized UFEATS string. """ if isinstance(features, str): - features = self.feats_to_dict(features) - if not isinstance(features, dict): + features = self._str_to_normalized_feat_dict(features) + elif isinstance(features, dict): + features = self._dict_to_normalized_feat_dict(features) + else: warnings.warn(Warnings.W100.format(feature=features)) features = {} - features = self.normalize_attrs(features) - string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()} - # normalized UFEATS string with sorted fields and values - norm_feats_string = self.FEATURE_SEP.join(sorted([ - self.FIELD_SEP.join([field, values]) - for field, values in string_features.items() - ])) - return norm_feats_string or self.EMPTY_MORPH - def normalize_attrs(self, attrs): - """Convert attrs dict so that POS is always by ID, other features are - by string. Values separated by VALUE_SEP are sorted. - """ - out = {} - attrs = dict(attrs) - for key, value in attrs.items(): - # convert POS value to ID - if key == POS or (isinstance(key, str) and key.upper() == "POS"): - if isinstance(value, str) and value.upper() in POS_IDS: - value = POS_IDS[value.upper()] - elif isinstance(value, int) and value not in POS_IDS.values(): - warnings.warn(Warnings.W100.format(feature={key: value})) - continue - out[POS] = value - # accept any string or ID fields and values and convert to strings - elif isinstance(key, (int, str)) and isinstance(value, (int, str)): - key = self.strings.as_string(key) - value = self.strings.as_string(value) - # sort values - if self.VALUE_SEP in value: - value = self.VALUE_SEP.join(sorted(value.split(self.VALUE_SEP))) - out[key] = value - else: - warnings.warn(Warnings.W100.format(feature={key: value})) - return out + return self._normalized_feat_dict_to_str(features) - cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *: - """Creates a MorphAnalysisC from a list of intified - ("Field", "Field=Value") tuples where fields with multiple values have - been split into individual tuples, e.g.: - [("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"), - ("Field2", "Field2=Value3")] - """ - cdef MorphAnalysisC tag - tag.length = len(field_feature_pairs) - if tag.length > 0: - tag.fields = self.mem.alloc(tag.length, sizeof(attr_t)) - tag.features = self.mem.alloc(tag.length, sizeof(attr_t)) - for i, (field, feature) in enumerate(field_feature_pairs): - tag.fields[i] = field - tag.features[i] = feature - return tag + def add(self, features): + return self._add(features) - cdef int insert(self, MorphAnalysisC tag) except -1: - cdef hash_t key = tag.key - if self.tags.get(key) == NULL: - tag_ptr = self.mem.alloc(1, sizeof(MorphAnalysisC)) - tag_ptr[0] = tag - self.tags.set(key, tag_ptr) + def get(self, morph_key): + return self.get_morph_str(morph_key) - def get(self, hash_t morph): - tag = self.tags.get(morph) - if tag == NULL: - return "" - else: - return self.strings[tag.key] + def normalize_features(self, features): + return self._normalize_features(features) @staticmethod - def feats_to_dict(feats): + def feats_to_dict(feats, *, sort_values=True): if not feats or feats == Morphology.EMPTY_MORPH: return {} - return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in - [feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]} + + out = {} + for feat in feats.split(Morphology.FEATURE_SEP): + field, values = 
feat.split(Morphology.FIELD_SEP, 1) + if sort_values: + values = values.split(Morphology.VALUE_SEP) + values.sort() + values = Morphology.VALUE_SEP.join(values) + + out[field] = values + return out @staticmethod def dict_to_feats(feats_dict): @@ -160,34 +213,34 @@ cdef class Morphology: return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()])) -cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil: +cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil: cdef int i - for i in range(morph.length): - if morph.features[i] == feature: + for i in range(deref(morph).features.size()): + if deref(morph).features[i].value == feature: return True return False -cdef list list_features(const MorphAnalysisC* morph): +cdef list list_features(const shared_ptr[MorphAnalysisC] morph): cdef int i features = [] - for i in range(morph.length): - features.append(morph.features[i]) + for i in range(deref(morph).features.size()): + features.append(deref(morph).features[i].value) return features -cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field): - cdef np.ndarray results = numpy.zeros((morph.length,), dtype="uint64") +cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field): + cdef np.ndarray results = numpy.zeros((deref(morph).features.size(),), dtype="uint64") n = get_n_by_field(results.data, morph, field) return results[:n] -cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil: +cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil: cdef int n_results = 0 cdef int i - for i in range(morph.length): - if morph.fields[i] == field: - results[n_results] = morph.features[i] + for i in range(deref(morph).features.size()): + if deref(morph).features[i].field == field: + results[n_results] = deref(morph).features[i].value n_results += 1 return n_results diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd index 0bf5b4789..67390ad63 100644 --- a/spacy/parts_of_speech.pxd +++ b/spacy/parts_of_speech.pxd @@ -3,22 +3,22 @@ from . 
cimport symbols cpdef enum univ_pos_t: NO_TAG = 0 ADJ = symbols.ADJ - ADP - ADV - AUX - CONJ - CCONJ # U20 - DET - INTJ - NOUN - NUM - PART - PRON - PROPN - PUNCT - SCONJ - SYM - VERB - X - EOL - SPACE + ADP = symbols.ADP + ADV = symbols.ADV + AUX = symbols.AUX + CONJ = symbols.CONJ + CCONJ = symbols.CCONJ # U20 + DET = symbols.DET + INTJ = symbols.INTJ + NOUN = symbols.NOUN + NUM = symbols.NUM + PART = symbols.PART + PRON = symbols.PRON + PROPN = symbols.PROPN + PUNCT = symbols.PUNCT + SCONJ = symbols.SCONJ + SYM = symbols.SYM + VERB = symbols.VERB + X = symbols.X + EOL = symbols.EOL + SPACE = symbols.SPACE diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index 26931606b..14dfed949 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -1,9 +1,8 @@ -from .attributeruler import AttributeRuler +from .attribute_ruler import AttributeRuler from .dep_parser import DependencyParser from .edit_tree_lemmatizer import EditTreeLemmatizer from .entity_linker import EntityLinker from .ner import EntityRecognizer -from .entityruler import EntityRuler from .lemmatizer import Lemmatizer from .morphologizer import Morphologizer from .pipe import Pipe @@ -23,7 +22,6 @@ __all__ = [ "DependencyParser", "EntityLinker", "EntityRecognizer", - "EntityRuler", "Morphologizer", "Lemmatizer", "MultiLabel_TextCategorizer", diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pxd b/spacy/pipeline/_parser_internals/_beam_utils.pxd index de3573fbc..571f246b1 100644 --- a/spacy/pipeline/_parser_internals/_beam_utils.pxd +++ b/spacy/pipeline/_parser_internals/_beam_utils.pxd @@ -1,6 +1,6 @@ from ...typedefs cimport class_t, hash_t -# These are passed as callbacks to thinc.search.Beam +# These are passed as callbacks to .search.Beam cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1 cdef int check_final_state(void* _state, void* extra_args) except -1 diff --git a/spacy/pipeline/_parser_internals/_beam_utils.pyx b/spacy/pipeline/_parser_internals/_beam_utils.pyx index fa7df2056..d07c13aeb 100644 --- a/spacy/pipeline/_parser_internals/_beam_utils.pyx +++ b/spacy/pipeline/_parser_internals/_beam_utils.pyx @@ -3,17 +3,17 @@ cimport numpy as np import numpy from cpython.ref cimport PyObject, Py_XDECREF -from thinc.extra.search cimport Beam -from thinc.extra.search import MaxViolation -from thinc.extra.search cimport MaxViolation from ...typedefs cimport hash_t, class_t from .transition_system cimport TransitionSystem, Transition from ...errors import Errors +from .batch cimport Batch +from .search cimport Beam, MaxViolation +from .search import MaxViolation from .stateclass cimport StateC, StateClass -# These are passed as callbacks to thinc.search.Beam +# These are passed as callbacks to .search.Beam cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: dest = _dest src = _src @@ -27,7 +27,7 @@ cdef int check_final_state(void* _state, void* extra_args) except -1: return state.is_final() -cdef class BeamBatch(object): +cdef class BeamBatch(Batch): cdef public TransitionSystem moves cdef public object states cdef public object docs diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pxd b/spacy/pipeline/_parser_internals/_parser_utils.pxd new file mode 100644 index 000000000..7fee05bad --- /dev/null +++ b/spacy/pipeline/_parser_internals/_parser_utils.pxd @@ -0,0 +1,2 @@ +cdef int arg_max(const float* scores, const int n_classes) nogil +cdef int arg_max_if_valid(const float* scores, const int* 
is_valid, int n) nogil diff --git a/spacy/pipeline/_parser_internals/_parser_utils.pyx b/spacy/pipeline/_parser_internals/_parser_utils.pyx new file mode 100644 index 000000000..582756bf5 --- /dev/null +++ b/spacy/pipeline/_parser_internals/_parser_utils.pyx @@ -0,0 +1,22 @@ +# cython: infer_types=True + +cdef inline int arg_max(const float* scores, const int n_classes) nogil: + if n_classes == 2: + return 0 if scores[0] > scores[1] else 1 + cdef int i + cdef int best = 0 + cdef float mode = scores[0] + for i in range(1, n_classes): + if scores[i] > mode: + mode = scores[i] + best = i + return best + + +cdef inline int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil: + cdef int best = -1 + for i in range(n): + if is_valid[i] >= 1: + if best == -1 or scores[i] > scores[best]: + best = i + return best diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd index a1262bb61..bd5d5208c 100644 --- a/spacy/pipeline/_parser_internals/_state.pxd +++ b/spacy/pipeline/_parser_internals/_state.pxd @@ -6,7 +6,6 @@ cimport libcpp from libcpp.unordered_map cimport unordered_map from libcpp.vector cimport vector from libcpp.set cimport set -from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno from murmurhash.mrmr cimport hash64 from ...vocab cimport EMPTY_LEXEME @@ -26,7 +25,7 @@ cdef struct ArcC: cdef cppclass StateC: - int* _heads + vector[int] _heads const TokenC* _sent vector[int] _stack vector[int] _rebuffer @@ -34,31 +33,34 @@ cdef cppclass StateC: unordered_map[int, vector[ArcC]] _left_arcs unordered_map[int, vector[ArcC]] _right_arcs vector[libcpp.bool] _unshiftable + vector[int] history set[int] _sent_starts TokenC _empty_token int length int offset int _b_i - __init__(const TokenC* sent, int length) nogil: + __init__(const TokenC* sent, int length) nogil except +: + this._heads.resize(length, -1) + this._unshiftable.resize(length, False) + + # Reserve memory ahead of time to minimize allocations during parsing. + # The initial capacity set here ideally reflects the expected average-case/majority usage. 
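+ # reserve() only pre-allocates capacity; the containers stay logically empty until elements are pushed during parsing.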
+ cdef int init_capacity = 32 + this._stack.reserve(init_capacity) + this._rebuffer.reserve(init_capacity) + this._ents.reserve(init_capacity) + this._left_arcs.reserve(init_capacity) + this._right_arcs.reserve(init_capacity) + this.history.reserve(init_capacity) + this._sent = sent - this._heads = calloc(length, sizeof(int)) - if not (this._sent and this._heads): - with gil: - PyErr_SetFromErrno(MemoryError) - PyErr_CheckSignals() this.offset = 0 this.length = length this._b_i = 0 - for i in range(length): - this._heads[i] = -1 - this._unshiftable.push_back(0) memset(&this._empty_token, 0, sizeof(TokenC)) this._empty_token.lex = &EMPTY_LEXEME - __dealloc__(): - free(this._heads) - void set_context_tokens(int* ids, int n) nogil: cdef int i, j if n == 1: @@ -131,19 +133,20 @@ cdef cppclass StateC: ids[i] = -1 int S(int i) nogil const: - if i >= this._stack.size(): + cdef int stack_size = this._stack.size() + if i >= stack_size or i < 0: return -1 - elif i < 0: - return -1 - return this._stack.at(this._stack.size() - (i+1)) + else: + return this._stack[stack_size - (i+1)] int B(int i) nogil const: + cdef int buf_size = this._rebuffer.size() if i < 0: return -1 - elif i < this._rebuffer.size(): - return this._rebuffer.at(this._rebuffer.size() - (i+1)) + elif i < buf_size: + return this._rebuffer[buf_size - (i+1)] else: - b_i = this._b_i + (i - this._rebuffer.size()) + b_i = this._b_i + (i - buf_size) if b_i >= this.length: return -1 else: @@ -242,7 +245,7 @@ cdef cppclass StateC: return 0 elif this._sent[word].sent_start == 1: return 1 - elif this._sent_starts.count(word) >= 1: + elif this._sent_starts.const_find(word) != this._sent_starts.const_end(): return 1 else: return 0 @@ -327,7 +330,7 @@ cdef cppclass StateC: if item >= this._unshiftable.size(): return 0 else: - return this._unshiftable.at(item) + return this._unshiftable[item] void set_reshiftable(int item) nogil: if item < this._unshiftable.size(): @@ -347,6 +350,9 @@ cdef cppclass StateC: this._heads[child] = head void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil: + cdef vector[ArcC]* arcs + cdef ArcC* arc + arcs_it = heads_arcs.find(h_i) if arcs_it == heads_arcs.end(): return @@ -355,12 +361,12 @@ cdef cppclass StateC: if arcs.size() == 0: return - arc = arcs.back() + arc = &arcs.back() if arc.head == h_i and arc.child == c_i: arcs.pop_back() else: for i in range(arcs.size()-1): - arc = arcs.at(i) + arc = &deref(arcs)[i] if arc.head == h_i and arc.child == c_i: arc.head = -1 arc.child = -1 @@ -400,10 +406,11 @@ cdef cppclass StateC: this._rebuffer = src._rebuffer this._sent_starts = src._sent_starts this._unshiftable = src._unshiftable - memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0])) + this._heads = src._heads this._ents = src._ents this._left_arcs = src._left_arcs this._right_arcs = src._right_arcs this._b_i = src._b_i this.offset = src.offset this._empty_token = src._empty_token + this.history = src.history diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index 257b5ef8a..9c358475a 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -15,7 +15,7 @@ from ...training.example cimport Example from .stateclass cimport StateClass from ._state cimport StateC, ArcC from ...errors import Errors -from thinc.extra.search cimport Beam +from .search cimport Beam cdef weight_t MIN_SCORE = -90000 cdef attr_t SUBTOK_LABEL = hash_string('subtok') @@ -773,6 +773,8 @@ 
cdef class ArcEager(TransitionSystem): return list(arcs) def has_gold(self, Example eg, start=0, end=None): + if end is not None and end < 0: + end = None for word in eg.y[start:end]: if word.dep != 0: return True @@ -858,6 +860,7 @@ cdef class ArcEager(TransitionSystem): state.print_state() ))) action.do(state.c, action.label) + state.c.history.push_back(i) break else: failed = False diff --git a/spacy/pipeline/_parser_internals/batch.pxd b/spacy/pipeline/_parser_internals/batch.pxd new file mode 100644 index 000000000..60734e549 --- /dev/null +++ b/spacy/pipeline/_parser_internals/batch.pxd @@ -0,0 +1,2 @@ +cdef class Batch: + pass diff --git a/spacy/pipeline/_parser_internals/batch.pyx b/spacy/pipeline/_parser_internals/batch.pyx new file mode 100644 index 000000000..91073b52e --- /dev/null +++ b/spacy/pipeline/_parser_internals/batch.pyx @@ -0,0 +1,52 @@ +from typing import Any + +TransitionSystem = Any # TODO + +cdef class Batch: + def advance(self, scores): + raise NotImplementedError + + def get_states(self): + raise NotImplementedError + + @property + def is_done(self): + raise NotImplementedError + + def get_unfinished_states(self): + raise NotImplementedError + + def __getitem__(self, i): + raise NotImplementedError + + def __len__(self): + raise NotImplementedError + + +class GreedyBatch(Batch): + def __init__(self, moves: TransitionSystem, states, golds): + self._moves = moves + self._states = states + self._next_states = [s for s in states if not s.is_final()] + + def advance(self, scores): + self._next_states = self._moves.transition_states(self._next_states, scores) + + def advance_with_actions(self, actions): + self._next_states = self._moves.apply_actions(self._next_states, actions) + + def get_states(self): + return self._states + + @property + def is_done(self): + return all(s.is_final() for s in self._states) + + def get_unfinished_states(self): + return [st for st in self._states if not st.is_final()] + + def __getitem__(self, i): + return self._states[i] + + def __len__(self): + return len(self._states) diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx index fab872f00..d4d564dc7 100644 --- a/spacy/pipeline/_parser_internals/ner.pyx +++ b/spacy/pipeline/_parser_internals/ner.pyx @@ -1,10 +1,11 @@ import os import random from libc.stdint cimport int32_t +from libcpp.memory cimport shared_ptr +from libcpp.vector cimport vector from cymem.cymem cimport Pool from collections import Counter -from thinc.extra.search cimport Beam from ...tokens.doc cimport Doc from ...tokens.span import Span @@ -15,6 +16,7 @@ from ...attrs cimport IS_SPACE from ...structs cimport TokenC, SpanC from ...training import split_bilu_label from ...training.example cimport Example +from .search cimport Beam from .stateclass cimport StateClass from ._state cimport StateC from .transition_system cimport Transition, do_func_t @@ -43,9 +45,7 @@ MOVE_NAMES[OUT] = 'O' cdef struct GoldNERStateC: Transition* ner - SpanC* negs - int32_t length - int32_t nr_neg + vector[shared_ptr[SpanC]] negs cdef class BiluoGold: @@ -78,8 +78,6 @@ cdef GoldNERStateC create_gold_state( negs = [] assert example.x.length > 0 gs.ner = mem.alloc(example.x.length, sizeof(Transition)) - gs.negs = mem.alloc(len(negs), sizeof(SpanC)) - gs.nr_neg = len(negs) ner_ents, ner_tags = example.get_aligned_ents_and_ner() for i, ner_tag in enumerate(ner_tags): gs.ner[i] = moves.lookup_transition(ner_tag) @@ -93,8 +91,8 @@ cdef GoldNERStateC create_gold_state( # In order to handle negative 
samples, we need to maintain the full # (start, end, label) triple. If we break it down to the 'isnt B-LOC' # thing, we'll get blocked if there's an incorrect prefix. - for i, neg in enumerate(negs): - gs.negs[i] = neg.c + for neg in negs: + gs.negs.push_back(neg.c) return gs @@ -158,7 +156,7 @@ cdef class BiluoPushDown(TransitionSystem): if token.ent_type: labels.add(token.ent_type_) return labels - + def move_name(self, int move, attr_t label): if move == OUT: return 'O' @@ -308,6 +306,8 @@ cdef class BiluoPushDown(TransitionSystem): for span in eg.y.spans.get(neg_key, []): if span.start >= start and span.end <= end: return True + if end is not None and end < 0: + end = None for word in eg.y[start:end]: if word.ent_iob != 0: return True @@ -411,6 +411,8 @@ cdef class Begin: cdef int g_act = gold.ner[b0].move cdef attr_t g_tag = gold.ner[b0].label + cdef shared_ptr[SpanC] span + if g_act == MISSING: pass elif g_act == BEGIN: @@ -428,8 +430,8 @@ cdef class Begin: # be correct or not. However, we can at least tell whether we're # going to be opening an entity where there's only one possible # L. - for span in gold.negs[:gold.nr_neg]: - if span.label == label and span.start == b0: + for span in gold.negs: + if span.get().label == label and span.get().start == b0: cost += 1 break return cost @@ -574,8 +576,9 @@ cdef class Last: # If we have negative-example entities, integrate them into the objective, # by marking actions that close an entity that we know is incorrect # as costly. - for span in gold.negs[:gold.nr_neg]: - if span.label == label and (span.end-1) == b0 and span.start == ent_start: + cdef shared_ptr[SpanC] span + for span in gold.negs: + if span.get().label == label and (span.get().end-1) == b0 and span.get().start == ent_start: cost += 1 break return cost @@ -639,12 +642,13 @@ cdef class Unit: # This is fairly straight-forward for U- entities, as we have a single # action cdef int b0 = s.B(0) - for span in gold.negs[:gold.nr_neg]: - if span.label == label and span.start == b0 and span.end == (b0+1): + cdef shared_ptr[SpanC] span + for span in gold.negs: + if span.get().label == label and span.get().start == b0 and span.get().end == (b0+1): cost += 1 break return cost - + cdef class Out: diff --git a/spacy/pipeline/_parser_internals/search.pxd b/spacy/pipeline/_parser_internals/search.pxd new file mode 100644 index 000000000..dfe30e1c1 --- /dev/null +++ b/spacy/pipeline/_parser_internals/search.pxd @@ -0,0 +1,89 @@ +from cymem.cymem cimport Pool + +from libc.stdint cimport uint32_t +from libc.stdint cimport uint64_t +from libcpp.pair cimport pair +from libcpp.queue cimport priority_queue +from libcpp.vector cimport vector + +from ...typedefs cimport class_t, weight_t, hash_t + +ctypedef pair[weight_t, size_t] Entry +ctypedef priority_queue[Entry] Queue + + +ctypedef int (*trans_func_t)(void* dest, void* src, class_t clas, void* x) except -1 + +ctypedef void* (*init_func_t)(Pool mem, int n, void* extra_args) except NULL + +ctypedef int (*del_func_t)(Pool mem, void* state, void* extra_args) except -1 + +ctypedef int (*finish_func_t)(void* state, void* extra_args) except -1 + +ctypedef hash_t (*hash_func_t)(void* state, void* x) except 0 + + +cdef struct _State: + void* content + class_t* hist + weight_t score + weight_t loss + int i + int t + bint is_done + + +cdef class Beam: + cdef Pool mem + cdef class_t nr_class + cdef class_t width + cdef class_t size + cdef public weight_t min_density + cdef int t + cdef readonly bint is_done + cdef list histories + cdef list 
_parent_histories + cdef weight_t** scores + cdef int** is_valid + cdef weight_t** costs + cdef _State* _parents + cdef _State* _states + cdef del_func_t del_func + + cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1 + + cdef inline void* at(self, int i) nogil: + return self._states[i].content + + cdef int initialize(self, init_func_t init_func, del_func_t del_func, int n, void* extra_args) except -1 + cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func, + void* extra_args) except -1 + cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1 + + + cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil: + self.scores[i][j] = score + self.is_valid[i][j] = is_valid + self.costs[i][j] = cost + + cdef int set_row(self, int i, const weight_t* scores, const int* is_valid, + const weight_t* costs) except -1 + cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1 + + +cdef class MaxViolation: + cdef Pool mem + cdef weight_t cost + cdef weight_t delta + cdef readonly weight_t p_score + cdef readonly weight_t g_score + cdef readonly double Z + cdef readonly double gZ + cdef class_t n + cdef readonly list p_hist + cdef readonly list g_hist + cdef readonly list p_probs + cdef readonly list g_probs + + cpdef int check(self, Beam pred, Beam gold) except -1 + cpdef int check_crf(self, Beam pred, Beam gold) except -1 diff --git a/spacy/pipeline/_parser_internals/search.pyx b/spacy/pipeline/_parser_internals/search.pyx new file mode 100644 index 000000000..1d9b6dd7a --- /dev/null +++ b/spacy/pipeline/_parser_internals/search.pyx @@ -0,0 +1,306 @@ +# cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True +cimport cython +from libc.string cimport memset, memcpy +from libc.math cimport log, exp +import math + +from cymem.cymem cimport Pool +from preshed.maps cimport PreshMap + + +cdef class Beam: + def __init__(self, class_t nr_class, class_t width, weight_t min_density=0.0): + assert nr_class != 0 + assert width != 0 + self.nr_class = nr_class + self.width = width + self.min_density = min_density + self.size = 1 + self.t = 0 + self.mem = Pool() + self.del_func = NULL + self._parents = <_State*>self.mem.alloc(self.width, sizeof(_State)) + self._states = <_State*>self.mem.alloc(self.width, sizeof(_State)) + cdef int i + self.histories = [[] for i in range(self.width)] + self._parent_histories = [[] for i in range(self.width)] + + self.scores = self.mem.alloc(self.width, sizeof(weight_t*)) + self.is_valid = self.mem.alloc(self.width, sizeof(weight_t*)) + self.costs = self.mem.alloc(self.width, sizeof(weight_t*)) + for i in range(self.width): + self.scores[i] = self.mem.alloc(self.nr_class, sizeof(weight_t)) + self.is_valid[i] = self.mem.alloc(self.nr_class, sizeof(int)) + self.costs[i] = self.mem.alloc(self.nr_class, sizeof(weight_t)) + + def __len__(self): + return self.size + + property score: + def __get__(self): + return self._states[0].score + + property min_score: + def __get__(self): + return self._states[self.size-1].score + + property loss: + def __get__(self): + return self._states[0].loss + + property probs: + def __get__(self): + return _softmax([self._states[i].score for i in range(self.size)]) + + property scores: + def __get__(self): + return [self._states[i].score for i in range(self.size)] + + property histories: + def __get__(self): + return self.histories + + cdef int set_row(self, int i, const weight_t* scores, 
const int* is_valid, + const weight_t* costs) except -1: + cdef int j + for j in range(self.nr_class): + self.scores[i][j] = scores[j] + self.is_valid[i][j] = is_valid[j] + self.costs[i][j] = costs[j] + + cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1: + cdef int i, j + for i in range(self.width): + memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class) + memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class) + memcpy(self.costs[i], costs[i], sizeof(int) * self.nr_class) + + cdef int initialize(self, init_func_t init_func, del_func_t del_func, int n, void* extra_args) except -1: + for i in range(self.width): + self._states[i].content = init_func(self.mem, n, extra_args) + self._parents[i].content = init_func(self.mem, n, extra_args) + self.del_func = del_func + + def __dealloc__(self): + if self.del_func == NULL: + return + + for i in range(self.width): + self.del_func(self.mem, self._states[i].content, NULL) + self.del_func(self.mem, self._parents[i].content, NULL) + + @cython.cdivision(True) + cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func, + void* extra_args) except -1: + cdef weight_t** scores = self.scores + cdef int** is_valid = self.is_valid + cdef weight_t** costs = self.costs + + cdef Queue* q = new Queue() + self._fill(q, scores, is_valid) + # For a beam of width k, we only ever need 2k state objects. How? + # Each transition takes a parent and a class and produces a new state. + # So, we don't need the whole history --- just the parent. So at + # each step, we take a parent, and apply one or more extensions to + # it. + self._parents, self._states = self._states, self._parents + self._parent_histories, self.histories = self.histories, self._parent_histories + cdef weight_t score + cdef int p_i + cdef int i = 0 + cdef class_t clas + cdef _State* parent + cdef _State* state + cdef hash_t key + cdef PreshMap seen_states = PreshMap(self.width) + cdef uint64_t is_seen + cdef uint64_t one = 1 + while i < self.width and not q.empty(): + data = q.top() + p_i = data.second / self.nr_class + clas = data.second % self.nr_class + score = data.first + q.pop() + parent = &self._parents[p_i] + # Indicates terminal state reached; i.e. state is done + if parent.is_done: + # Now parent will not be changed, so we don't have to copy. + # Once finished, should also be unbranching. 
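+ # The finished state is swapped (not copied) into slot i of the live beam; its score is refreshed and its history is carried over from the parent without appending a new action.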
+ self._states[i], parent[0] = parent[0], self._states[i] + parent.i = self._states[i].i + parent.t = self._states[i].t + parent.is_done = self._states[i].t + self._states[i].score = score + self.histories[i] = list(self._parent_histories[p_i]) + i += 1 + else: + state = &self._states[i] + # The supplied transition function should adjust the destination + # state to be the result of applying the class to the source state + transition_func(state.content, parent.content, clas, extra_args) + key = hash_func(state.content, extra_args) if hash_func is not NULL else 0 + is_seen = seen_states.get(key) + if key == 0 or key == 1 or not is_seen: + if key != 0 and key != 1: + seen_states.set(key, one) + state.score = score + state.loss = parent.loss + costs[p_i][clas] + self.histories[i] = list(self._parent_histories[p_i]) + self.histories[i].append(clas) + i += 1 + del q + self.size = i + assert self.size >= 1 + for i in range(self.width): + memset(self.scores[i], 0, sizeof(weight_t) * self.nr_class) + memset(self.costs[i], 0, sizeof(weight_t) * self.nr_class) + memset(self.is_valid[i], 0, sizeof(int) * self.nr_class) + self.t += 1 + + cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1: + cdef int i + for i in range(self.size): + if not self._states[i].is_done: + self._states[i].is_done = finish_func(self._states[i].content, extra_args) + for i in range(self.size): + if not self._states[i].is_done: + self.is_done = False + break + else: + self.is_done = True + + @cython.cdivision(True) + cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1: + """Populate the queue from a k * n matrix of scores, where k is the + beam-width, and n is the number of classes. + """ + cdef Entry entry + cdef weight_t score + cdef _State* s + cdef int i, j, move_id + assert self.size >= 1 + cdef vector[Entry] entries + for i in range(self.size): + s = &self._states[i] + move_id = i * self.nr_class + if s.is_done: + # Update score by path average, following TACL '13 paper. + if self.histories[i]: + entry.first = s.score + (s.score / self.t) + else: + entry.first = s.score + entry.second = move_id + entries.push_back(entry) + else: + for j in range(self.nr_class): + if is_valid[i][j]: + entry.first = s.score + scores[i][j] + entry.second = move_id + j + entries.push_back(entry) + cdef double max_, Z, cutoff + if self.min_density == 0.0: + for i in range(entries.size()): + q.push(entries[i]) + elif not entries.empty(): + max_ = entries[0].first + Z = 0. + cutoff = 0. + # Softmax into probabilities, so we can prune + for i in range(entries.size()): + if entries[i].first > max_: + max_ = entries[i].first + for i in range(entries.size()): + Z += exp(entries[i].first-max_) + cutoff = (1. 
/ Z) * self.min_density + for i in range(entries.size()): + prob = exp(entries[i].first-max_) / Z + if prob >= cutoff: + q.push(entries[i]) + + +cdef class MaxViolation: + def __init__(self): + self.p_score = 0.0 + self.g_score = 0.0 + self.Z = 0.0 + self.gZ = 0.0 + self.delta = -1 + self.cost = 0 + self.p_hist = [] + self.g_hist = [] + self.p_probs = [] + self.g_probs = [] + + cpdef int check(self, Beam pred, Beam gold) except -1: + cdef _State* p = &pred._states[0] + cdef _State* g = &gold._states[0] + cdef weight_t d = p.score - g.score + if p.loss >= 1 and (self.cost == 0 or d > self.delta): + self.cost = p.loss + self.delta = d + self.p_hist = list(pred.histories[0]) + self.g_hist = list(gold.histories[0]) + self.p_score = p.score + self.g_score = g.score + self.Z = 1e-10 + self.gZ = 1e-10 + for i in range(pred.size): + if pred._states[i].loss > 0: + self.Z += exp(pred._states[i].score) + for i in range(gold.size): + if gold._states[i].loss == 0: + prob = exp(gold._states[i].score) + self.Z += prob + self.gZ += prob + + cpdef int check_crf(self, Beam pred, Beam gold) except -1: + d = pred.score - gold.score + seen_golds = set([tuple(gold.histories[i]) for i in range(gold.size)]) + if pred.loss > 0 and (self.cost == 0 or d > self.delta): + p_hist = [] + p_scores = [] + g_hist = [] + g_scores = [] + for i in range(pred.size): + if pred._states[i].loss > 0: + p_scores.append(pred._states[i].score) + p_hist.append(list(pred.histories[i])) + # This can happen from non-monotonic actions + # If we find a better gold analysis this way, be sure to keep it. + elif pred._states[i].loss <= 0 \ + and tuple(pred.histories[i]) not in seen_golds: + g_scores.append(pred._states[i].score) + g_hist.append(list(pred.histories[i])) + for i in range(gold.size): + if gold._states[i].loss == 0: + g_scores.append(gold._states[i].score) + g_hist.append(list(gold.histories[i])) + + all_probs = _softmax(p_scores + g_scores) + p_probs = all_probs[:len(p_scores)] + g_probs_all = all_probs[len(p_scores):] + g_probs = _softmax(g_scores) + + self.cost = pred.loss + self.delta = d + self.p_hist = p_hist + self.g_hist = g_hist + # TODO: These variables are misnamed! These are the gradients of the loss. + self.p_probs = p_probs + # Intuition here: + # The gradient of the loss is: + # P(model) - P(truth) + # Normally, P(truth) is 1 for the gold + # But, if we want to do the "partial credit" scheme, we want + # to create a distribution over the gold, proportional to the scores + # awarded. 
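# Editor's aside: a small numeric sketch of the "partial credit" gradient that the next
# patch line computes (self.g_probs = [x - y for x, y in zip(...)]). This is not spaCy
# code, just the arithmetic: the gradient of the loss is P(model) - P(truth), where
# P(model) is a softmax over all retained beam states (wrong and gold alike), and
# P(truth) spreads the gold's probability mass over the gold-compatible states in
# proportion to their scores rather than putting it all on one analysis.
from math import exp


def softmax(xs):
    if not xs:
        return []
    m = max(xs)
    zs = [exp(x - m) for x in xs]
    total = sum(zs)
    return [z / total for z in zs]


p_scores = [2.0, 1.5]      # scores of beam states that contain an error
g_scores = [1.0, 0.5]      # scores of gold-compatible beam states

model_probs = softmax(p_scores + g_scores)
p_probs = model_probs[: len(p_scores)]        # gradient for the "wrong" states
g_model_probs = model_probs[len(p_scores):]   # P(model) for the gold states
g_truth_probs = softmax(g_scores)             # P(truth), proportional to score
g_probs = [m - t for m, t in zip(g_model_probs, g_truth_probs)]

print("p_probs:", [round(x, 3) for x in p_probs])  # positive: push these scores down
print("g_probs:", [round(x, 3) for x in g_probs])  # negative: push these scores up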
+ self.g_probs = [x-y for x, y in zip(g_probs_all, g_probs)] + + +def _softmax(nums): + if not nums: + return [] + max_ = max(nums) + nums = [(exp(n-max_) if n is not None else None) for n in nums] + Z = sum(n for n in nums if n is not None) + return [(n/Z if n is not None else None) for n in nums] diff --git a/spacy/pipeline/_parser_internals/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx index 4eaddd997..dbd22117e 100644 --- a/spacy/pipeline/_parser_internals/stateclass.pyx +++ b/spacy/pipeline/_parser_internals/stateclass.pyx @@ -20,6 +20,10 @@ cdef class StateClass: if self._borrowed != 1: del self.c + @property + def history(self): + return list(self.c.history) + @property def stack(self): return [self.S(i) for i in range(self.c.stack_depth())] @@ -176,3 +180,6 @@ cdef class StateClass: def clone(self, StateClass src): self.c.clone(src.c) + + def set_context_tokens(self, int[:, :] output, int row, int n_feats): + self.c.set_context_tokens(&output[row, 0], n_feats) diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd index 52ebd2b8e..c8ebd8b27 100644 --- a/spacy/pipeline/_parser_internals/transition_system.pxd +++ b/spacy/pipeline/_parser_internals/transition_system.pxd @@ -53,3 +53,10 @@ cdef class TransitionSystem: cdef int set_costs(self, int* is_valid, weight_t* costs, const StateC* state, gold) except -1 + + +cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions, + int batch_size) nogil + +cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores, + int nr_class, int batch_size) nogil diff --git a/spacy/pipeline/_parser_internals/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx index 18eb745a9..89f9e8ae8 100644 --- a/spacy/pipeline/_parser_internals/transition_system.pyx +++ b/spacy/pipeline/_parser_internals/transition_system.pyx @@ -1,6 +1,8 @@ # cython: infer_types=True from __future__ import print_function from cymem.cymem cimport Pool +from libc.stdlib cimport calloc, free +from libcpp.vector cimport vector from collections import Counter import srsly @@ -10,6 +12,7 @@ from ...typedefs cimport weight_t, attr_t from ...tokens.doc cimport Doc from ...structs cimport TokenC from .stateclass cimport StateClass +from ._parser_utils cimport arg_max_if_valid from ...errors import Errors from ... 
import util @@ -73,7 +76,18 @@ cdef class TransitionSystem: offset += len(doc) return states + def follow_history(self, doc, history): + cdef int clas + cdef StateClass state = StateClass(doc) + for clas in history: + action = self.c[clas] + action.do(state.c, action.label) + state.c.history.push_back(clas) + return state + def get_oracle_sequence(self, Example example, _debug=False): + if not self.has_gold(example): + return [] states, golds, _ = self.init_gold_batch([example]) if not states: return [] @@ -85,6 +99,8 @@ cdef class TransitionSystem: return self.get_oracle_sequence_from_state(state, gold) def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None): + if state.is_final(): + return [] cdef Pool mem = Pool() # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc assert self.n_moves > 0 @@ -110,6 +126,7 @@ cdef class TransitionSystem: "S0 head?", str(state.has_head(state.S(0))), ))) action.do(state.c, action.label) + state.c.history.push_back(i) break else: if _debug: @@ -137,6 +154,28 @@ cdef class TransitionSystem: raise ValueError(Errors.E170.format(name=name)) action = self.lookup_transition(name) action.do(state.c, action.label) + state.c.history.push_back(action.clas) + + def apply_actions(self, states, const int[::1] actions): + assert len(states) == actions.shape[0] + cdef StateClass state + cdef vector[StateC*] c_states + c_states.resize(len(states)) + cdef int i + for (i, state) in enumerate(states): + c_states[i] = state.c + c_apply_actions(self, &c_states[0], &actions[0], actions.shape[0]) + return [state for state in states if not state.c.is_final()] + + def transition_states(self, states, float[:, ::1] scores): + assert len(states) == scores.shape[0] + cdef StateClass state + cdef float* c_scores = &scores[0, 0] + cdef vector[StateC*] c_states + for state in states: + c_states.push_back(state.c) + c_transition_batch(self, &c_states[0], c_scores, scores.shape[1], scores.shape[0]) + return [state for state in states if not state.c.is_final()] cdef Transition lookup_transition(self, object name) except *: raise NotImplementedError @@ -250,3 +289,35 @@ cdef class TransitionSystem: self.cfg.update(msg['cfg']) self.initialize_actions(labels) return self + + +cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions, + int batch_size) nogil: + cdef int i + cdef Transition action + cdef StateC* state + for i in range(batch_size): + state = states[i] + action = moves.c[actions[i]] + action.do(state, action.label) + state.history.push_back(action.clas) + + +cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores, + int nr_class, int batch_size) nogil: + is_valid = calloc(moves.n_moves, sizeof(int)) + cdef int i, guess + cdef Transition action + for i in range(batch_size): + moves.set_valid(is_valid, states[i]) + guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class) + if guess == -1: + # This shouldn't happen, but it's hard to raise an error here, + # and we don't want to infinite loop. So, force to end state. 
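# Editor's aside: a rough pure-Python/numpy sketch of the control flow in
# c_transition_batch above, under simplified assumptions (no real TransitionSystem or
# StateC; a "state" here is just a dict with a history list and an is_final flag).
# The point is the fallback the comment describes: mask out invalid classes, take the
# arg-max of what remains, and if nothing at all is valid, force the state to a final
# state instead of looping forever.
import numpy as np


def transition_batch(states, scores, get_valid, apply_action):
    # states: list of dicts; scores: (batch, n_class) array of model scores.
    for i, state in enumerate(states):
        if state["is_final"]:
            continue
        mask = np.asarray(get_valid(state), dtype=bool)
        if not mask.any():
            state["is_final"] = True             # analogue of force_final()
            continue
        masked = np.where(mask, scores[i], -np.inf)
        guess = int(masked.argmax())             # analogue of arg_max_if_valid
        apply_action(state, guess)               # analogue of action.do(...)
        state["history"].append(guess)
    return [s for s in states if not s["is_final"]]


# Toy usage: two states, three actions; action 2 always finishes a state.
def get_valid(state):
    return [len(state["history"]) < 2, True, True]


def apply_action(state, action):
    if action == 2:
        state["is_final"] = True


states = [{"history": [], "is_final": False}, {"history": [1], "is_final": False}]
scores = np.array([[0.1, 0.8, 0.3], [0.2, 0.1, 0.9]])
print(transition_batch(states, scores, get_valid, apply_action))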
+ states[i].force_final() + else: + action = moves.c[guess] + action.do(states[i], action.label) + states[i].history.push_back(guess) + free(is_valid) + diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attribute_ruler.py similarity index 99% rename from spacy/pipeline/attributeruler.py rename to spacy/pipeline/attribute_ruler.py index 0d9494865..ac998a61d 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attribute_ruler.py @@ -11,7 +11,7 @@ from ..matcher import Matcher from ..scorer import Scorer from ..symbols import IDS from ..tokens import Doc, Span -from ..tokens._retokenize import normalize_token_attrs, set_token_attrs +from ..tokens.retokenizer import normalize_token_attrs, set_token_attrs from ..vocab import Vocab from ..util import SimpleFrozenList, registry from .. import util diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.py similarity index 97% rename from spacy/pipeline/dep_parser.pyx rename to spacy/pipeline/dep_parser.py index e5f686158..f6689e017 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.py @@ -4,8 +4,8 @@ from typing import Optional, Iterable, Callable from thinc.api import Model, Config from ._parser_internals.transition_system import TransitionSystem -from .transition_parser cimport Parser -from ._parser_internals.arc_eager cimport ArcEager +from .transition_parser import Parser +from ._parser_internals.arc_eager import ArcEager from .functions import merge_subtokens from ..language import Language @@ -18,12 +18,11 @@ from ..util import registry default_model_config = """ [model] -@architectures = "spacy.TransitionBasedParser.v2" +@architectures = "spacy.TransitionBasedParser.v3" state_type = "parser" extra_state_tokens = false hidden_width = 64 maxout_pieces = 2 -use_upper = true [model.tok2vec] @architectures = "spacy.HashEmbedCNN.v2" @@ -123,6 +122,7 @@ def make_parser( scorer=scorer, ) + @Language.factory( "beam_parser", assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"], @@ -228,6 +228,7 @@ def parser_score(examples, **kwargs): DOCS: https://spacy.io/api/dependencyparser#score """ + def has_sents(doc): return doc.has_annotation("SENT_START") @@ -235,8 +236,11 @@ def parser_score(examples, **kwargs): dep = getattr(token, attr) dep = token.vocab.strings.as_string(dep).lower() return dep + results = {} - results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)) + results.update( + Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs) + ) kwargs.setdefault("getter", dep_getter) kwargs.setdefault("ignore_labels", ("p", "punct")) results.update(Scorer.score_deps(examples, "dep", **kwargs)) @@ -249,11 +253,12 @@ def make_parser_scorer(): return parser_score -cdef class DependencyParser(Parser): +class DependencyParser(Parser): """Pipeline component for dependency parsing. DOCS: https://spacy.io/api/dependencyparser """ + TransitionSystem = ArcEager def __init__( @@ -273,8 +278,7 @@ cdef class DependencyParser(Parser): incorrect_spans_key=None, scorer=parser_score, ): - """Create a DependencyParser. 
- """ + """Create a DependencyParser.""" super().__init__( vocab, model, diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py index e83fe63ba..3198b7509 100644 --- a/spacy/pipeline/edit_tree_lemmatizer.py +++ b/spacy/pipeline/edit_tree_lemmatizer.py @@ -1,12 +1,13 @@ -from typing import cast, Any, Callable, Dict, Iterable, List, Optional +from typing import cast, Any, Callable, Dict, Iterable, List, Optional, Union from typing import Tuple from collections import Counter from itertools import islice import numpy as np import srsly -from thinc.api import Config, Model, SequenceCategoricalCrossentropy -from thinc.types import Floats2d, Ints1d, Ints2d +from thinc.api import Config, Model, SequenceCategoricalCrossentropy, NumpyOps +from thinc.types import ArrayXd, Floats2d, Ints1d +from thinc.legacy import LegacySequenceCategoricalCrossentropy from ._edit_tree_internals.edit_trees import EditTrees from ._edit_tree_internals.schemas import validate_edit_tree @@ -20,6 +21,11 @@ from ..vocab import Vocab from .. import util +ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]] +# The cutoff value of *top_k* above which an alternative method is used to process guesses. +TOP_K_GUARDRAIL = 20 + + default_model_config = """ [model] @architectures = "spacy.Tagger.v2" @@ -48,6 +54,7 @@ DEFAULT_EDIT_TREE_LEMMATIZER_MODEL = Config().from_str(default_model_config)["mo "overwrite": False, "top_k": 1, "scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"}, + "save_activations": False, }, default_score_weights={"lemma_acc": 1.0}, ) @@ -60,6 +67,7 @@ def make_edit_tree_lemmatizer( overwrite: bool, top_k: int, scorer: Optional[Callable], + save_activations: bool, ): """Construct an EditTreeLemmatizer component.""" return EditTreeLemmatizer( @@ -71,6 +79,7 @@ def make_edit_tree_lemmatizer( overwrite=overwrite, top_k=top_k, scorer=scorer, + save_activations=save_activations, ) @@ -90,6 +99,7 @@ class EditTreeLemmatizer(TrainablePipe): overwrite: bool = False, top_k: int = 1, scorer: Optional[Callable] = lemmatizer_score, + save_activations: bool = False, ): """ Construct an edit tree lemmatizer. @@ -101,6 +111,7 @@ class EditTreeLemmatizer(TrainablePipe): frequency in the training data. overwrite (bool): overwrite existing lemma annotations. top_k (int): try to apply at most the k most probable edit trees. + save_activations (bool): save model activations in Doc when annotating. """ self.vocab = vocab self.model = model @@ -115,12 +126,16 @@ class EditTreeLemmatizer(TrainablePipe): self.cfg: Dict[str, Any] = {"labels": []} self.scorer = scorer + self.save_activations = save_activations + self.numpy_ops = NumpyOps() def get_loss( self, examples: Iterable[Example], scores: List[Floats2d] ) -> Tuple[float, List[Floats2d]]: validate_examples(examples, "EditTreeLemmatizer.get_loss") - loss_func = SequenceCategoricalCrossentropy(normalize=False, missing_value=-1) + loss_func = LegacySequenceCategoricalCrossentropy( + normalize=False, missing_value=-1 + ) truths = [] for eg in examples: @@ -143,30 +158,98 @@ class EditTreeLemmatizer(TrainablePipe): return float(loss), d_scores - def predict(self, docs: Iterable[Doc]) -> List[Ints2d]: + def get_teacher_student_loss( + self, teacher_scores: List[Floats2d], student_scores: List[Floats2d] + ) -> Tuple[float, List[Floats2d]]: + """Calculate the loss and its gradient for a batch of student + scores, relative to teacher scores. + + teacher_scores: Scores representing the teacher model's predictions. 
+ student_scores: Scores representing the student model's predictions. + + RETURNS (Tuple[float, float]): The loss and the gradient. + + DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss + """ + loss_func = LegacySequenceCategoricalCrossentropy(normalize=False) + d_scores, loss = loss_func(student_scores, teacher_scores) + if self.model.ops.xp.isnan(loss): + raise ValueError(Errors.E910.format(name=self.name)) + return float(loss), d_scores + + def predict(self, docs: Iterable[Doc]) -> ActivationsT: + if self.top_k == 1: + scores2guesses = self._scores2guesses_top_k_equals_1 + elif self.top_k <= TOP_K_GUARDRAIL: + scores2guesses = self._scores2guesses_top_k_greater_1 + else: + scores2guesses = self._scores2guesses_top_k_guardrail + # The behaviour of *_scores2guesses_top_k_greater_1()* is efficient for values + # of *top_k>1* that are likely to be useful when the edit tree lemmatizer is used + # for its principal purpose of lemmatizing tokens. However, the code could also + # be used for other purposes, and with very large values of *top_k* the method + # becomes inefficient. In such cases, *_scores2guesses_top_k_guardrail()* is used + # instead. n_docs = len(list(docs)) if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. n_labels = len(self.cfg["labels"]) - guesses: List[Ints2d] = [self.model.ops.alloc2i(0, n_labels) for _ in docs] + guesses: List[Ints1d] = [ + self.model.ops.alloc((0,), dtype="i") for doc in docs + ] + scores: List[Floats2d] = [ + self.model.ops.alloc((0, n_labels), dtype="i") for doc in docs + ] assert len(guesses) == n_docs - return guesses + return {"probabilities": scores, "tree_ids": guesses} scores = self.model.predict(docs) assert len(scores) == n_docs - guesses = self._scores2guesses(docs, scores) + guesses = scores2guesses(docs, scores) assert len(guesses) == n_docs - return guesses + return {"probabilities": scores, "tree_ids": guesses} - def _scores2guesses(self, docs, scores): + def _scores2guesses_top_k_equals_1(self, docs, scores): guesses = [] for doc, doc_scores in zip(docs, scores): - if self.top_k == 1: - doc_guesses = doc_scores.argmax(axis=1).reshape(-1, 1) - else: - doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1] + doc_guesses = doc_scores.argmax(axis=1) + doc_guesses = self.numpy_ops.asarray(doc_guesses) - if not isinstance(doc_guesses, np.ndarray): - doc_guesses = doc_guesses.get() + doc_compat_guesses = [] + for i, token in enumerate(doc): + tree_id = self.cfg["labels"][doc_guesses[i]] + if self.trees.apply(tree_id, token.text) is not None: + doc_compat_guesses.append(tree_id) + else: + doc_compat_guesses.append(-1) + guesses.append(np.array(doc_compat_guesses)) + + return guesses + + def _scores2guesses_top_k_greater_1(self, docs, scores): + guesses = [] + top_k = min(self.top_k, len(self.labels)) + for doc, doc_scores in zip(docs, scores): + doc_scores = self.numpy_ops.asarray(doc_scores) + doc_compat_guesses = [] + for i, token in enumerate(doc): + for _ in range(top_k): + candidate = int(doc_scores[i].argmax()) + candidate_tree_id = self.cfg["labels"][candidate] + if self.trees.apply(candidate_tree_id, token.text) is not None: + doc_compat_guesses.append(candidate_tree_id) + break + doc_scores[i, candidate] = np.finfo(np.float32).min + else: + doc_compat_guesses.append(-1) + guesses.append(np.array(doc_compat_guesses)) + + return guesses + + def _scores2guesses_top_k_guardrail(self, docs, scores): + guesses = [] + for doc, doc_scores in zip(docs, scores): + 
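# Editor's aside: a standalone illustration of the two top-k strategies described in the
# comment above, not the lemmatizer's actual methods. For small k it is cheap to take the
# arg-max k times and knock each winner out by setting its score to the float32 minimum;
# for large k a single argsort of the row is the better tool, which is what the
# *_guardrail variant continuing below falls back to once top_k exceeds the cutoff.
import numpy as np

TOP_K_GUARDRAIL = 20  # same cutoff name as in the patch


def top_k_candidates(row_scores: np.ndarray, top_k: int) -> list:
    scores = row_scores.astype(np.float32).copy()
    if top_k <= TOP_K_GUARDRAIL:
        picks = []
        for _ in range(min(top_k, scores.shape[0])):
            best = int(scores.argmax())
            picks.append(best)
            scores[best] = np.finfo(np.float32).min  # same knock-out trick as the patch
        return picks
    # Large k: one argsort, then take the k highest-scoring indices.
    return list(np.argsort(row_scores)[::-1][:top_k])


row = np.array([0.1, 0.9, 0.4, 0.7], dtype=np.float32)
print(top_k_candidates(row, top_k=2))   # -> [1, 3]
print(top_k_candidates(row, top_k=40))  # -> all four indices, best first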
doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1] + doc_guesses = self.numpy_ops.asarray(doc_guesses) doc_compat_guesses = [] for token, candidates in zip(doc, doc_guesses): @@ -183,8 +266,13 @@ class EditTreeLemmatizer(TrainablePipe): return guesses - def set_annotations(self, docs: Iterable[Doc], batch_tree_ids): + def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): + batch_tree_ids = activations["tree_ids"] for i, doc in enumerate(docs): + if self.save_activations: + doc.activations[self.name] = {} + for act_name, acts in activations.items(): + doc.activations[self.name][act_name] = acts[i] doc_tree_ids = batch_tree_ids[i] if hasattr(doc_tree_ids, "get"): doc_tree_ids = doc_tree_ids.get() diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 62845287b..6a187b6c3 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -1,5 +1,7 @@ -from typing import Optional, Iterable, Callable, Dict, Union, List, Any -from thinc.types import Floats2d +from typing import Optional, Iterable, Callable, Dict, Sequence, Union, List, Any +from typing import cast +from numpy import dtype +from thinc.types import Floats1d, Floats2d, Ints1d, Ragged from pathlib import Path from itertools import islice import srsly @@ -11,7 +13,6 @@ from ..kb import KnowledgeBase, Candidate from ..ml import empty_kb from ..tokens import Doc, Span from .pipe import deserialize_config -from .legacy.entity_linker import EntityLinker_v1 from .trainable_pipe import TrainablePipe from ..language import Language from ..vocab import Vocab @@ -21,8 +22,10 @@ from ..util import SimpleFrozenList, registry from .. import util from ..scorer import Scorer -# See #9050 -BACKWARD_OVERWRITE = True + +ActivationsT = Dict[str, Union[List[Ragged], List[str]]] + +KNOWLEDGE_BASE_IDS = "kb_ids" default_model_config = """ [model] @@ -54,11 +57,13 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] "entity_vector_length": 64, "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"}, - "overwrite": True, + "overwrite": False, + "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"}, "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, "use_gold_ents": True, "candidates_batch_size": 1, "threshold": None, + "save_activations": False, }, default_score_weights={ "nel_micro_f": 1.0, @@ -80,11 +85,13 @@ def make_entity_linker( get_candidates_batch: Callable[ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] ], + generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], overwrite: bool, scorer: Optional[Callable], use_gold_ents: bool, candidates_batch_size: int, threshold: Optional[float] = None, + save_activations: bool, ): """Construct an EntityLinker component. @@ -101,29 +108,18 @@ def make_entity_linker( get_candidates_batch ( Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. + generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. scorer (Optional[Callable]): The scoring method. use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another component must provide entity annotations. candidates_batch_size (int): Size of batches for entity candidate generation. threshold (Optional[float]): Confidence threshold for entity predictions. 
If confidence is below the threshold, prediction is discarded. If None, predictions are not filtered by any threshold. + save_activations (bool): save model activations in Doc when annotating. """ - if not model.attrs.get("include_span_maker", False): - # The only difference in arguments here is that use_gold_ents and threshold aren't available. - return EntityLinker_v1( - nlp.vocab, - model, - name, - labels_discard=labels_discard, - n_sents=n_sents, - incl_prior=incl_prior, - incl_context=incl_context, - entity_vector_length=entity_vector_length, - get_candidates=get_candidates, - overwrite=overwrite, - scorer=scorer, - ) + raise ValueError(Errors.E4005) + return EntityLinker( nlp.vocab, model, @@ -135,11 +131,13 @@ def make_entity_linker( entity_vector_length=entity_vector_length, get_candidates=get_candidates, get_candidates_batch=get_candidates_batch, + generate_empty_kb=generate_empty_kb, overwrite=overwrite, scorer=scorer, use_gold_ents=use_gold_ents, candidates_batch_size=candidates_batch_size, threshold=threshold, + save_activations=save_activations, ) @@ -175,11 +173,13 @@ class EntityLinker(TrainablePipe): get_candidates_batch: Callable[ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] ], - overwrite: bool = BACKWARD_OVERWRITE, + generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], + overwrite: bool = False, scorer: Optional[Callable] = entity_linker_score, use_gold_ents: bool, candidates_batch_size: int, threshold: Optional[float] = None, + save_activations: bool = False, ) -> None: """Initialize an entity linker. @@ -198,12 +198,15 @@ class EntityLinker(TrainablePipe): Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. + generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. + overwrite (bool): Whether to overwrite existing non-empty annotations. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links. use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another component must provide entity annotations. candidates_batch_size (int): Size of batches for entity candidate generation. threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold, prediction is discarded. If None, predictions are not filtered by any threshold. + save_activations (bool): save model activations in Doc when annotating. 
DOCS: https://spacy.io/api/entitylinker#init """ @@ -220,6 +223,7 @@ class EntityLinker(TrainablePipe): self.model = model self.name = name self.labels_discard = list(labels_discard) + # how many neighbour sentences to take into account self.n_sents = n_sents self.incl_prior = incl_prior self.incl_context = incl_context @@ -227,13 +231,12 @@ class EntityLinker(TrainablePipe): self.get_candidates_batch = get_candidates_batch self.cfg: Dict[str, Any] = {"overwrite": overwrite} self.distance = CosineDistance(normalize=False) - # how many neighbour sentences to take into account - # create an empty KB by default - self.kb = empty_kb(entity_vector_length)(self.vocab) + self.kb = generate_empty_kb(self.vocab, entity_vector_length) self.scorer = scorer self.use_gold_ents = use_gold_ents self.candidates_batch_size = candidates_batch_size self.threshold = threshold + self.save_activations = save_activations if candidates_batch_size < 1: raise ValueError(Errors.E1044) @@ -250,7 +253,7 @@ class EntityLinker(TrainablePipe): # Raise an error if the knowledge base is not initialized. if self.kb is None: raise ValueError(Errors.E1018.format(name=self.name)) - if len(self.kb) == 0: + if hasattr(self.kb, "is_empty") and self.kb.is_empty(): raise ValueError(Errors.E139.format(name=self.name)) def initialize( @@ -422,7 +425,7 @@ class EntityLinker(TrainablePipe): loss = loss / len(entity_encodings) return float(loss), out - def predict(self, docs: Iterable[Doc]) -> List[str]: + def predict(self, docs: Iterable[Doc]) -> ActivationsT: """Apply the pipeline's model to a batch of docs, without modifying them. Returns the KB IDs for each entity in each doc, including NIL if there is no prediction. @@ -435,13 +438,24 @@ class EntityLinker(TrainablePipe): self.validate_kb() entity_count = 0 final_kb_ids: List[str] = [] - xp = self.model.ops.xp + ops = self.model.ops + xp = ops.xp + docs_ents: List[Ragged] = [] + docs_scores: List[Ragged] = [] if not docs: - return final_kb_ids + return { + KNOWLEDGE_BASE_IDS: final_kb_ids, + "ents": docs_ents, + "scores": docs_scores, + } if isinstance(docs, Doc): docs = [docs] - for i, doc in enumerate(docs): + for doc in docs: + doc_ents: List[Ints1d] = [] + doc_scores: List[Floats1d] = [] if len(doc) == 0: + docs_scores.append(Ragged(ops.alloc1f(0), ops.alloc1i(0))) + docs_ents.append(Ragged(xp.zeros(0, dtype="uint64"), ops.alloc1i(0))) continue sentences = [s for s in doc.sents] @@ -489,14 +503,32 @@ class EntityLinker(TrainablePipe): if ent.label_ in self.labels_discard: # ignoring this entity - setting to NIL final_kb_ids.append(self.NIL) + self._add_activations( + doc_scores=doc_scores, + doc_ents=doc_ents, + scores=[0.0], + ents=[0], + ) else: candidates = list(batch_candidates[j]) if not candidates: # no prediction possible for this entity - setting to NIL final_kb_ids.append(self.NIL) + self._add_activations( + doc_scores=doc_scores, + doc_ents=doc_ents, + scores=[0.0], + ents=[0], + ) elif len(candidates) == 1 and self.threshold is None: # shortcut for efficiency reasons: take the 1 candidate final_kb_ids.append(candidates[0].entity_) + self._add_activations( + doc_scores=doc_scores, + doc_ents=doc_ents, + scores=[1.0], + ents=[candidates[0].entity_], + ) else: random.shuffle(candidates) # set all prior probabilities to 0 if incl_prior=False @@ -530,28 +562,52 @@ class EntityLinker(TrainablePipe): or scores.max() >= self.threshold else EntityLinker.NIL ) - + self._add_activations( + doc_scores=doc_scores, + doc_ents=doc_ents, + scores=scores, + ents=[c.entity for c in 
candidates], + ) + self._add_doc_activations( + docs_scores=docs_scores, + docs_ents=docs_ents, + doc_scores=doc_scores, + doc_ents=doc_ents, + ) if not (len(final_kb_ids) == entity_count): err = Errors.E147.format( method="predict", msg="result variables not of equal length" ) raise RuntimeError(err) - return final_kb_ids + return { + KNOWLEDGE_BASE_IDS: final_kb_ids, + "ents": docs_ents, + "scores": docs_scores, + } - def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None: + def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None: """Modify a batch of documents, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. - kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict. + activations (ActivationsT): The activations used for setting annotations, produced + by EntityLinker.predict. DOCS: https://spacy.io/api/entitylinker#set_annotations """ + kb_ids = cast(List[str], activations[KNOWLEDGE_BASE_IDS]) count_ents = len([ent for doc in docs for ent in doc.ents]) if count_ents != len(kb_ids): raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids))) i = 0 overwrite = self.cfg["overwrite"] - for doc in docs: + for j, doc in enumerate(docs): + if self.save_activations: + doc.activations[self.name] = {} + for act_name, acts in activations.items(): + if act_name != KNOWLEDGE_BASE_IDS: + # We only copy activations that are Ragged. + doc.activations[self.name][act_name] = cast(Ragged, acts[j]) + for ent in doc.ents: kb_id = kb_ids[i] i += 1 @@ -650,3 +706,32 @@ class EntityLinker(TrainablePipe): def add_label(self, label): raise NotImplementedError + + def _add_doc_activations( + self, + *, + docs_scores: List[Ragged], + docs_ents: List[Ragged], + doc_scores: List[Floats1d], + doc_ents: List[Ints1d], + ): + if not self.save_activations: + return + ops = self.model.ops + lengths = ops.asarray1i([s.shape[0] for s in doc_scores]) + docs_scores.append(Ragged(ops.flatten(doc_scores), lengths)) + docs_ents.append(Ragged(ops.flatten(doc_ents), lengths)) + + def _add_activations( + self, + *, + doc_scores: List[Floats1d], + doc_ents: List[Ints1d], + scores: Sequence[float], + ents: Sequence[int], + ): + if not self.save_activations: + return + ops = self.model.ops + doc_scores.append(ops.asarray1f(scores)) + doc_ents.append(ops.asarray1i(ents, dtype="uint64")) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 6a3755533..e69de29bb 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -1,541 +0,0 @@ -from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence -import warnings -from collections import defaultdict -from pathlib import Path -import srsly - -from .pipe import Pipe -from ..training import Example -from ..language import Language -from ..errors import Errors, Warnings -from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList, registry -from ..tokens import Doc, Span -from ..matcher import Matcher, PhraseMatcher -from ..matcher.levenshtein import levenshtein_compare -from ..scorer import get_ner_prf - - -DEFAULT_ENT_ID_SEP = "||" -PatternType = Dict[str, Union[str, List[Dict[str, Any]]]] - - -@Language.factory( - "entity_ruler", - assigns=["doc.ents", "token.ent_type", "token.ent_iob"], - default_config={ - "phrase_matcher_attr": None, - "matcher_fuzzy_compare": {"@misc": "spacy.levenshtein_compare.v1"}, - "validate": False, - "overwrite_ents": False, - "ent_id_sep": DEFAULT_ENT_ID_SEP, - "scorer": 
{"@scorers": "spacy.entity_ruler_scorer.v1"}, - }, - default_score_weights={ - "ents_f": 1.0, - "ents_p": 0.0, - "ents_r": 0.0, - "ents_per_type": None, - }, -) -def make_entity_ruler( - nlp: Language, - name: str, - phrase_matcher_attr: Optional[Union[int, str]], - matcher_fuzzy_compare: Callable, - validate: bool, - overwrite_ents: bool, - ent_id_sep: str, - scorer: Optional[Callable], -): - return EntityRuler( - nlp, - name, - phrase_matcher_attr=phrase_matcher_attr, - matcher_fuzzy_compare=matcher_fuzzy_compare, - validate=validate, - overwrite_ents=overwrite_ents, - ent_id_sep=ent_id_sep, - scorer=scorer, - ) - - -def entity_ruler_score(examples, **kwargs): - return get_ner_prf(examples) - - -@registry.scorers("spacy.entity_ruler_scorer.v1") -def make_entity_ruler_scorer(): - return entity_ruler_score - - -class EntityRuler(Pipe): - """The EntityRuler lets you add spans to the `Doc.ents` using token-based - rules or exact phrase matches. It can be combined with the statistical - `EntityRecognizer` to boost accuracy, or used on its own to implement a - purely rule-based entity recognition system. After initialization, the - component is typically added to the pipeline using `nlp.add_pipe`. - - DOCS: https://spacy.io/api/entityruler - USAGE: https://spacy.io/usage/rule-based-matching#entityruler - """ - - def __init__( - self, - nlp: Language, - name: str = "entity_ruler", - *, - phrase_matcher_attr: Optional[Union[int, str]] = None, - matcher_fuzzy_compare: Callable = levenshtein_compare, - validate: bool = False, - overwrite_ents: bool = False, - ent_id_sep: str = DEFAULT_ENT_ID_SEP, - patterns: Optional[List[PatternType]] = None, - scorer: Optional[Callable] = entity_ruler_score, - ) -> None: - """Initialize the entity ruler. If patterns are supplied here, they - need to be a list of dictionaries with a `"label"` and `"pattern"` - key. A pattern can either be a token pattern (list) or a phrase pattern - (string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`. - - nlp (Language): The shared nlp object to pass the vocab to the matchers - and process phrase patterns. - name (str): Instance name of the current pipeline component. Typically - passed in automatically from the factory when the component is - added. Used to disable the current entity ruler while creating - phrase patterns with the nlp object. - phrase_matcher_attr (int / str): Token attribute to match on, passed - to the internal PhraseMatcher as `attr`. - matcher_fuzzy_compare (Callable): The fuzzy comparison method for the - internal Matcher. Defaults to - spacy.matcher.levenshtein.levenshtein_compare. - validate (bool): Whether patterns should be validated, passed to - Matcher and PhraseMatcher as `validate` - patterns (iterable): Optional patterns to load in. - overwrite_ents (bool): If existing entities are present, e.g. entities - added by the model, overwrite them by matches if necessary. - ent_id_sep (str): Separator used internally for entity IDs. - scorer (Optional[Callable]): The scoring method. Defaults to - spacy.scorer.get_ner_prf. 
- - DOCS: https://spacy.io/api/entityruler#init - """ - self.nlp = nlp - self.name = name - self.overwrite = overwrite_ents - self.token_patterns = defaultdict(list) # type: ignore - self.phrase_patterns = defaultdict(list) # type: ignore - self._validate = validate - self.matcher_fuzzy_compare = matcher_fuzzy_compare - self.matcher = Matcher( - nlp.vocab, validate=validate, fuzzy_compare=self.matcher_fuzzy_compare - ) - self.phrase_matcher_attr = phrase_matcher_attr - self.phrase_matcher = PhraseMatcher( - nlp.vocab, attr=self.phrase_matcher_attr, validate=validate - ) - self.ent_id_sep = ent_id_sep - self._ent_ids = defaultdict(tuple) # type: ignore - if patterns is not None: - self.add_patterns(patterns) - self.scorer = scorer - - def __len__(self) -> int: - """The number of all patterns added to the entity ruler.""" - n_token_patterns = sum(len(p) for p in self.token_patterns.values()) - n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values()) - return n_token_patterns + n_phrase_patterns - - def __contains__(self, label: str) -> bool: - """Whether a label is present in the patterns.""" - return label in self.token_patterns or label in self.phrase_patterns - - def __call__(self, doc: Doc) -> Doc: - """Find matches in document and add them as entities. - - doc (Doc): The Doc object in the pipeline. - RETURNS (Doc): The Doc with added entities, if available. - - DOCS: https://spacy.io/api/entityruler#call - """ - error_handler = self.get_error_handler() - try: - matches = self.match(doc) - self.set_annotations(doc, matches) - return doc - except Exception as e: - return error_handler(self.name, self, [doc], e) - - def match(self, doc: Doc): - self._require_patterns() - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", message="\\[W036") - matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc)) - - final_matches = set( - [(m_id, start, end) for m_id, start, end in matches if start != end] - ) - get_sort_key = lambda m: (m[2] - m[1], -m[1]) - final_matches = sorted(final_matches, key=get_sort_key, reverse=True) - return final_matches - - def set_annotations(self, doc, matches): - """Modify the document in place""" - entities = list(doc.ents) - new_entities = [] - seen_tokens = set() - for match_id, start, end in matches: - if any(t.ent_type for t in doc[start:end]) and not self.overwrite: - continue - # check for end - 1 here because boundaries are inclusive - if start not in seen_tokens and end - 1 not in seen_tokens: - if match_id in self._ent_ids: - label, ent_id = self._ent_ids[match_id] - span = Span(doc, start, end, label=label, span_id=ent_id) - else: - span = Span(doc, start, end, label=match_id) - new_entities.append(span) - entities = [ - e for e in entities if not (e.start < end and e.end > start) - ] - seen_tokens.update(range(start, end)) - doc.ents = entities + new_entities - - @property - def labels(self) -> Tuple[str, ...]: - """All labels present in the match patterns. - - RETURNS (set): The string labels. 
- - DOCS: https://spacy.io/api/entityruler#labels - """ - keys = set(self.token_patterns.keys()) - keys.update(self.phrase_patterns.keys()) - all_labels = set() - - for l in keys: - if self.ent_id_sep in l: - label, _ = self._split_label(l) - all_labels.add(label) - else: - all_labels.add(l) - return tuple(sorted(all_labels)) - - def initialize( - self, - get_examples: Callable[[], Iterable[Example]], - *, - nlp: Optional[Language] = None, - patterns: Optional[Sequence[PatternType]] = None, - ): - """Initialize the pipe for training. - - get_examples (Callable[[], Iterable[Example]]): Function that - returns a representative sample of gold-standard Example objects. - nlp (Language): The current nlp object the component is part of. - patterns Optional[Iterable[PatternType]]: The list of patterns. - - DOCS: https://spacy.io/api/entityruler#initialize - """ - self.clear() - if patterns: - self.add_patterns(patterns) # type: ignore[arg-type] - - @property - def ent_ids(self) -> Tuple[Optional[str], ...]: - """All entity ids present in the match patterns `id` properties - - RETURNS (set): The string entity ids. - - DOCS: https://spacy.io/api/entityruler#ent_ids - """ - keys = set(self.token_patterns.keys()) - keys.update(self.phrase_patterns.keys()) - all_ent_ids = set() - - for l in keys: - if self.ent_id_sep in l: - _, ent_id = self._split_label(l) - all_ent_ids.add(ent_id) - return tuple(all_ent_ids) - - @property - def patterns(self) -> List[PatternType]: - """Get all patterns that were added to the entity ruler. - - RETURNS (list): The original patterns, one dictionary per pattern. - - DOCS: https://spacy.io/api/entityruler#patterns - """ - all_patterns = [] - for label, patterns in self.token_patterns.items(): - for pattern in patterns: - ent_label, ent_id = self._split_label(label) - p = {"label": ent_label, "pattern": pattern} - if ent_id: - p["id"] = ent_id - all_patterns.append(p) - for label, patterns in self.phrase_patterns.items(): - for pattern in patterns: - ent_label, ent_id = self._split_label(label) - p = {"label": ent_label, "pattern": pattern.text} - if ent_id: - p["id"] = ent_id - all_patterns.append(p) - return all_patterns - - def add_patterns(self, patterns: List[PatternType]) -> None: - """Add patterns to the entity ruler. A pattern can either be a token - pattern (list of dicts) or a phrase pattern (string). For example: - {'label': 'ORG', 'pattern': 'Apple'} - {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]} - - patterns (list): The patterns to add. 
- - DOCS: https://spacy.io/api/entityruler#add_patterns - """ - - # disable the nlp components after this one in case they hadn't been initialized / deserialised yet - try: - current_index = -1 - for i, (name, pipe) in enumerate(self.nlp.pipeline): - if self == pipe: - current_index = i - break - subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index:]] - except ValueError: - subsequent_pipes = [] - with self.nlp.select_pipes(disable=subsequent_pipes): - token_patterns = [] - phrase_pattern_labels = [] - phrase_pattern_texts = [] - phrase_pattern_ids = [] - for entry in patterns: - if isinstance(entry["pattern"], str): - phrase_pattern_labels.append(entry["label"]) - phrase_pattern_texts.append(entry["pattern"]) - phrase_pattern_ids.append(entry.get("id")) - elif isinstance(entry["pattern"], list): - token_patterns.append(entry) - phrase_patterns = [] - for label, pattern, ent_id in zip( - phrase_pattern_labels, - self.nlp.pipe(phrase_pattern_texts), - phrase_pattern_ids, - ): - phrase_pattern = {"label": label, "pattern": pattern} - if ent_id: - phrase_pattern["id"] = ent_id - phrase_patterns.append(phrase_pattern) - for entry in token_patterns + phrase_patterns: # type: ignore[operator] - label = entry["label"] # type: ignore - if "id" in entry: - ent_label = label - label = self._create_label(label, entry["id"]) - key = self.matcher._normalize_key(label) - self._ent_ids[key] = (ent_label, entry["id"]) - pattern = entry["pattern"] # type: ignore - if isinstance(pattern, Doc): - self.phrase_patterns[label].append(pattern) - self.phrase_matcher.add(label, [pattern]) # type: ignore - elif isinstance(pattern, list): - self.token_patterns[label].append(pattern) - self.matcher.add(label, [pattern]) - else: - raise ValueError(Errors.E097.format(pattern=pattern)) - - def clear(self) -> None: - """Reset all patterns.""" - self.token_patterns = defaultdict(list) - self.phrase_patterns = defaultdict(list) - self._ent_ids = defaultdict(tuple) - self.matcher = Matcher( - self.nlp.vocab, - validate=self._validate, - fuzzy_compare=self.matcher_fuzzy_compare, - ) - self.phrase_matcher = PhraseMatcher( - self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate - ) - - def remove(self, ent_id: str) -> None: - """Remove a pattern by its ent_id if a pattern with this ent_id was added before - - ent_id (str): id of the pattern to be removed - RETURNS: None - DOCS: https://spacy.io/api/entityruler#remove - """ - label_id_pairs = [ - (label, eid) for (label, eid) in self._ent_ids.values() if eid == ent_id - ] - if not label_id_pairs: - raise ValueError( - Errors.E1024.format(attr_type="ID", label=ent_id, component=self.name) - ) - created_labels = [ - self._create_label(label, eid) for (label, eid) in label_id_pairs - ] - # remove the patterns from self.phrase_patterns - self.phrase_patterns = defaultdict( - list, - { - label: val - for (label, val) in self.phrase_patterns.items() - if label not in created_labels - }, - ) - # remove the patterns from self.token_pattern - self.token_patterns = defaultdict( - list, - { - label: val - for (label, val) in self.token_patterns.items() - if label not in created_labels - }, - ) - # remove the patterns from self.token_pattern - for label in created_labels: - if label in self.phrase_matcher: - self.phrase_matcher.remove(label) - else: - self.matcher.remove(label) - - def _require_patterns(self) -> None: - """Raise a warning if this component has no patterns defined.""" - if len(self) == 0: - 
warnings.warn(Warnings.W036.format(name=self.name)) - - def _split_label(self, label: str) -> Tuple[str, Optional[str]]: - """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep - - label (str): The value of label in a pattern entry - RETURNS (tuple): ent_label, ent_id - """ - if self.ent_id_sep in label: - ent_label, ent_id = label.rsplit(self.ent_id_sep, 1) - else: - ent_label = label - ent_id = None # type: ignore - return ent_label, ent_id - - def _create_label(self, label: Any, ent_id: Any) -> str: - """Join Entity label with ent_id if the pattern has an `id` attribute - If ent_id is not a string, the label is returned as is. - - label (str): The label to set for ent.label_ - ent_id (str): The label - RETURNS (str): The ent_label joined with configured `ent_id_sep` - """ - if isinstance(ent_id, str): - label = f"{label}{self.ent_id_sep}{ent_id}" - return label - - def from_bytes( - self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList() - ) -> "EntityRuler": - """Load the entity ruler from a bytestring. - - patterns_bytes (bytes): The bytestring to load. - RETURNS (EntityRuler): The loaded entity ruler. - - DOCS: https://spacy.io/api/entityruler#from_bytes - """ - cfg = srsly.msgpack_loads(patterns_bytes) - self.clear() - if isinstance(cfg, dict): - self.add_patterns(cfg.get("patterns", cfg)) - self.overwrite = cfg.get("overwrite", False) - self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None) - self.phrase_matcher = PhraseMatcher( - self.nlp.vocab, - attr=self.phrase_matcher_attr, - ) - self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP) - else: - self.add_patterns(cfg) - return self - - def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes: - """Serialize the entity ruler patterns to a bytestring. - - RETURNS (bytes): The serialized patterns. - - DOCS: https://spacy.io/api/entityruler#to_bytes - """ - serial = { - "overwrite": self.overwrite, - "ent_id_sep": self.ent_id_sep, - "phrase_matcher_attr": self.phrase_matcher_attr, - "patterns": self.patterns, - } - return srsly.msgpack_dumps(serial) - - def from_disk( - self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() - ) -> "EntityRuler": - """Load the entity ruler from a file. Expects a file containing - newline-delimited JSON (JSONL) with one entry per line. - - path (str / Path): The JSONL file to load. - RETURNS (EntityRuler): The loaded entity ruler. 
- - DOCS: https://spacy.io/api/entityruler#from_disk - """ - path = ensure_path(path) - self.clear() - depr_patterns_path = path.with_suffix(".jsonl") - if path.suffix == ".jsonl": # user provides a jsonl - if path.is_file: - patterns = srsly.read_jsonl(path) - self.add_patterns(patterns) - else: - raise ValueError(Errors.E1023.format(path=path)) - elif depr_patterns_path.is_file(): - patterns = srsly.read_jsonl(depr_patterns_path) - self.add_patterns(patterns) - elif path.is_dir(): # path is a valid directory - cfg = {} - deserializers_patterns = { - "patterns": lambda p: self.add_patterns( - srsly.read_jsonl(p.with_suffix(".jsonl")) - ) - } - deserializers_cfg = {"cfg": lambda p: cfg.update(srsly.read_json(p))} - from_disk(path, deserializers_cfg, {}) - self.overwrite = cfg.get("overwrite", False) - self.phrase_matcher_attr = cfg.get("phrase_matcher_attr") - self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP) - - self.phrase_matcher = PhraseMatcher( - self.nlp.vocab, attr=self.phrase_matcher_attr - ) - from_disk(path, deserializers_patterns, {}) - else: # path is not a valid directory or file - raise ValueError(Errors.E146.format(path=path)) - return self - - def to_disk( - self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() - ) -> None: - """Save the entity ruler patterns to a directory. The patterns will be - saved as newline-delimited JSON (JSONL). - - path (str / Path): The JSONL file to save. - - DOCS: https://spacy.io/api/entityruler#to_disk - """ - path = ensure_path(path) - cfg = { - "overwrite": self.overwrite, - "phrase_matcher_attr": self.phrase_matcher_attr, - "ent_id_sep": self.ent_id_sep, - } - serializers = { - "patterns": lambda p: srsly.write_jsonl( - p.with_suffix(".jsonl"), self.patterns - ), - "cfg": lambda p: srsly.write_json(p, cfg), - } - if path.suffix == ".jsonl": # user wants to save only JSONL - srsly.write_jsonl(path, self.patterns) - else: - to_disk(path, serializers, {}) diff --git a/spacy/pipeline/legacy/__init__.py b/spacy/pipeline/legacy/__init__.py deleted file mode 100644 index f216840dc..000000000 --- a/spacy/pipeline/legacy/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .entity_linker import EntityLinker_v1 - -__all__ = ["EntityLinker_v1"] diff --git a/spacy/pipeline/legacy/entity_linker.py b/spacy/pipeline/legacy/entity_linker.py deleted file mode 100644 index c14dfa1db..000000000 --- a/spacy/pipeline/legacy/entity_linker.py +++ /dev/null @@ -1,422 +0,0 @@ -# This file is present to provide a prior version of the EntityLinker component -# for backwards compatability. For details see #9669. - -from typing import Optional, Iterable, Callable, Dict, Union, List, Any -from thinc.types import Floats2d -from pathlib import Path -from itertools import islice -import srsly -import random -from thinc.api import CosineDistance, Model, Optimizer -from thinc.api import set_dropout_rate -import warnings - -from ...kb import KnowledgeBase, Candidate -from ...ml import empty_kb -from ...tokens import Doc, Span -from ..pipe import deserialize_config -from ..trainable_pipe import TrainablePipe -from ...language import Language -from ...vocab import Vocab -from ...training import Example, validate_examples, validate_get_examples -from ...errors import Errors, Warnings -from ...util import SimpleFrozenList -from ... 
import util -from ...scorer import Scorer - -# See #9050 -BACKWARD_OVERWRITE = True - - -def entity_linker_score(examples, **kwargs): - return Scorer.score_links(examples, negative_labels=[EntityLinker_v1.NIL], **kwargs) - - -class EntityLinker_v1(TrainablePipe): - """Pipeline component for named entity linking. - - DOCS: https://spacy.io/api/entitylinker - """ - - NIL = "NIL" # string used to refer to a non-existing link - - def __init__( - self, - vocab: Vocab, - model: Model, - name: str = "entity_linker", - *, - labels_discard: Iterable[str], - n_sents: int, - incl_prior: bool, - incl_context: bool, - entity_vector_length: int, - get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], - overwrite: bool = BACKWARD_OVERWRITE, - scorer: Optional[Callable] = entity_linker_score, - ) -> None: - """Initialize an entity linker. - - vocab (Vocab): The shared vocabulary. - model (thinc.api.Model): The Thinc Model powering the pipeline component. - name (str): The component instance name, used to add entries to the - losses during training. - labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction. - n_sents (int): The number of neighbouring sentences to take into account. - incl_prior (bool): Whether or not to include prior probabilities from the KB in the model. - incl_context (bool): Whether or not to include the local context in the model. - entity_vector_length (int): Size of encoding vectors in the KB. - get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that - produces a list of candidates, given a certain knowledge base and a textual mention. - scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links. - DOCS: https://spacy.io/api/entitylinker#init - """ - self.vocab = vocab - self.model = model - self.name = name - self.labels_discard = list(labels_discard) - self.n_sents = n_sents - self.incl_prior = incl_prior - self.incl_context = incl_context - self.get_candidates = get_candidates - self.cfg: Dict[str, Any] = {"overwrite": overwrite} - self.distance = CosineDistance(normalize=False) - # how many neighbour sentences to take into account - # create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'. - self.kb = empty_kb(entity_vector_length)(self.vocab) - self.scorer = scorer - - def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): - """Define the KB of this pipe by providing a function that will - create it using this object's vocab.""" - if not callable(kb_loader): - raise ValueError(Errors.E885.format(arg_type=type(kb_loader))) - - self.kb = kb_loader(self.vocab) - - def validate_kb(self) -> None: - # Raise an error if the knowledge base is not initialized. - if self.kb is None: - raise ValueError(Errors.E1018.format(name=self.name)) - if len(self.kb) == 0: - raise ValueError(Errors.E139.format(name=self.name)) - - def initialize( - self, - get_examples: Callable[[], Iterable[Example]], - *, - nlp: Optional[Language] = None, - kb_loader: Optional[Callable[[Vocab], KnowledgeBase]] = None, - ): - """Initialize the pipe for training, using a representative set - of data examples. - - get_examples (Callable[[], Iterable[Example]]): Function that - returns a representative sample of gold-standard Example objects. - nlp (Language): The current nlp object the component is part of. - kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates an InMemoryLookupKB from a Vocab instance. 
- Note that providing this argument, will overwrite all data accumulated in the current KB. - Use this only when loading a KB as-such from file. - - DOCS: https://spacy.io/api/entitylinker#initialize - """ - validate_get_examples(get_examples, "EntityLinker_v1.initialize") - if kb_loader is not None: - self.set_kb(kb_loader) - self.validate_kb() - nO = self.kb.entity_vector_length - doc_sample = [] - vector_sample = [] - for example in islice(get_examples(), 10): - doc_sample.append(example.x) - vector_sample.append(self.model.ops.alloc1f(nO)) - assert len(doc_sample) > 0, Errors.E923.format(name=self.name) - assert len(vector_sample) > 0, Errors.E923.format(name=self.name) - self.model.initialize( - X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32") - ) - - def update( - self, - examples: Iterable[Example], - *, - drop: float = 0.0, - sgd: Optional[Optimizer] = None, - losses: Optional[Dict[str, float]] = None, - ) -> Dict[str, float]: - """Learn from a batch of documents and gold-standard information, - updating the pipe's model. Delegates to predict and get_loss. - - examples (Iterable[Example]): A batch of Example objects. - drop (float): The dropout rate. - sgd (thinc.api.Optimizer): The optimizer. - losses (Dict[str, float]): Optional record of the loss during training. - Updated using the component name as the key. - RETURNS (Dict[str, float]): The updated losses dictionary. - - DOCS: https://spacy.io/api/entitylinker#update - """ - self.validate_kb() - if losses is None: - losses = {} - losses.setdefault(self.name, 0.0) - if not examples: - return losses - validate_examples(examples, "EntityLinker_v1.update") - sentence_docs = [] - for eg in examples: - sentences = [s for s in eg.reference.sents] - kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) - for ent in eg.reference.ents: - # KB ID of the first token is the same as the whole span - kb_id = kb_ids[ent.start] - if kb_id: - try: - # find the sentence in the list of sentences. 
- sent_index = sentences.index(ent.sent) - except AttributeError: - # Catch the exception when ent.sent is None and provide a user-friendly warning - raise RuntimeError(Errors.E030) from None - # get n previous sentences, if there are any - start_sentence = max(0, sent_index - self.n_sents) - # get n posterior sentences, or as many < n as there are - end_sentence = min(len(sentences) - 1, sent_index + self.n_sents) - # get token positions - start_token = sentences[start_sentence].start - end_token = sentences[end_sentence].end - # append that span as a doc to training - sent_doc = eg.predicted[start_token:end_token].as_doc() - sentence_docs.append(sent_doc) - set_dropout_rate(self.model, drop) - if not sentence_docs: - warnings.warn(Warnings.W093.format(name="Entity Linker")) - return losses - sentence_encodings, bp_context = self.model.begin_update(sentence_docs) - loss, d_scores = self.get_loss( - sentence_encodings=sentence_encodings, examples=examples - ) - bp_context(d_scores) - if sgd is not None: - self.finish_update(sgd) - losses[self.name] += loss - return losses - - def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d): - validate_examples(examples, "EntityLinker_v1.get_loss") - entity_encodings = [] - for eg in examples: - kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) - for ent in eg.reference.ents: - kb_id = kb_ids[ent.start] - if kb_id: - entity_encoding = self.kb.get_vector(kb_id) - entity_encodings.append(entity_encoding) - entity_encodings = self.model.ops.asarray2f(entity_encodings) - if sentence_encodings.shape != entity_encodings.shape: - err = Errors.E147.format( - method="get_loss", msg="gold entities do not match up" - ) - raise RuntimeError(err) - gradients = self.distance.get_grad(sentence_encodings, entity_encodings) - loss = self.distance.get_loss(sentence_encodings, entity_encodings) - loss = loss / len(entity_encodings) - return float(loss), gradients - - def predict(self, docs: Iterable[Doc]) -> List[str]: - """Apply the pipeline's model to a batch of docs, without modifying them. - Returns the KB IDs for each entity in each doc, including NIL if there is - no prediction. - - docs (Iterable[Doc]): The documents to predict. - RETURNS (List[str]): The models prediction for each document. 
- - DOCS: https://spacy.io/api/entitylinker#predict - """ - self.validate_kb() - entity_count = 0 - final_kb_ids: List[str] = [] - if not docs: - return final_kb_ids - if isinstance(docs, Doc): - docs = [docs] - for i, doc in enumerate(docs): - sentences = [s for s in doc.sents] - if len(doc) > 0: - # Looping through each entity (TODO: rewrite) - for ent in doc.ents: - sent = ent.sent - sent_index = sentences.index(sent) - assert sent_index >= 0 - # get n_neighbour sentences, clipped to the length of the document - start_sentence = max(0, sent_index - self.n_sents) - end_sentence = min(len(sentences) - 1, sent_index + self.n_sents) - start_token = sentences[start_sentence].start - end_token = sentences[end_sentence].end - sent_doc = doc[start_token:end_token].as_doc() - # currently, the context is the same for each entity in a sentence (should be refined) - xp = self.model.ops.xp - if self.incl_context: - sentence_encoding = self.model.predict([sent_doc])[0] - sentence_encoding_t = sentence_encoding.T - sentence_norm = xp.linalg.norm(sentence_encoding_t) - entity_count += 1 - if ent.label_ in self.labels_discard: - # ignoring this entity - setting to NIL - final_kb_ids.append(self.NIL) - else: - candidates = list(self.get_candidates(self.kb, ent)) - if not candidates: - # no prediction possible for this entity - setting to NIL - final_kb_ids.append(self.NIL) - elif len(candidates) == 1: - # shortcut for efficiency reasons: take the 1 candidate - final_kb_ids.append(candidates[0].entity_) - else: - random.shuffle(candidates) - # set all prior probabilities to 0 if incl_prior=False - prior_probs = xp.asarray([c.prior_prob for c in candidates]) - if not self.incl_prior: - prior_probs = xp.asarray([0.0 for _ in candidates]) - scores = prior_probs - # add in similarity from the context - if self.incl_context: - entity_encodings = xp.asarray( - [c.entity_vector for c in candidates] - ) - entity_norm = xp.linalg.norm(entity_encodings, axis=1) - if len(entity_encodings) != len(prior_probs): - raise RuntimeError( - Errors.E147.format( - method="predict", - msg="vectors not of equal length", - ) - ) - # cosine similarity - sims = xp.dot(entity_encodings, sentence_encoding_t) / ( - sentence_norm * entity_norm - ) - if sims.shape != prior_probs.shape: - raise ValueError(Errors.E161) - scores = prior_probs + sims - (prior_probs * sims) - best_index = scores.argmax().item() - best_candidate = candidates[best_index] - final_kb_ids.append(best_candidate.entity_) - if not (len(final_kb_ids) == entity_count): - err = Errors.E147.format( - method="predict", msg="result variables not of equal length" - ) - raise RuntimeError(err) - return final_kb_ids - - def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None: - """Modify a batch of documents, using pre-computed scores. - - docs (Iterable[Doc]): The documents to modify. - kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict. - - DOCS: https://spacy.io/api/entitylinker#set_annotations - """ - count_ents = len([ent for doc in docs for ent in doc.ents]) - if count_ents != len(kb_ids): - raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids))) - i = 0 - overwrite = self.cfg["overwrite"] - for doc in docs: - for ent in doc.ents: - kb_id = kb_ids[i] - i += 1 - for token in ent: - if token.ent_kb_id == 0 or overwrite: - token.ent_kb_id_ = kb_id - - def to_bytes(self, *, exclude=tuple()): - """Serialize the pipe to a bytestring. - - exclude (Iterable[str]): String names of serialization fields to exclude. 
- RETURNS (bytes): The serialized object. - - DOCS: https://spacy.io/api/entitylinker#to_bytes - """ - self._validate_serialization_attrs() - serialize = {} - if hasattr(self, "cfg") and self.cfg is not None: - serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude) - serialize["kb"] = self.kb.to_bytes - serialize["model"] = self.model.to_bytes - return util.to_bytes(serialize, exclude) - - def from_bytes(self, bytes_data, *, exclude=tuple()): - """Load the pipe from a bytestring. - - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (TrainablePipe): The loaded object. - - DOCS: https://spacy.io/api/entitylinker#from_bytes - """ - self._validate_serialization_attrs() - - def load_model(b): - try: - self.model.from_bytes(b) - except AttributeError: - raise ValueError(Errors.E149) from None - - deserialize = {} - if hasattr(self, "cfg") and self.cfg is not None: - deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) - deserialize["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude) - deserialize["kb"] = lambda b: self.kb.from_bytes(b) - deserialize["model"] = load_model - util.from_bytes(bytes_data, deserialize, exclude) - return self - - def to_disk( - self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() - ) -> None: - """Serialize the pipe to disk. - - path (str / Path): Path to a directory. - exclude (Iterable[str]): String names of serialization fields to exclude. - - DOCS: https://spacy.io/api/entitylinker#to_disk - """ - serialize = {} - serialize["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude) - serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) - serialize["kb"] = lambda p: self.kb.to_disk(p) - serialize["model"] = lambda p: self.model.to_disk(p) - util.to_disk(path, serialize, exclude) - - def from_disk( - self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() - ) -> "EntityLinker_v1": - """Load the pipe from disk. Modifies the object in place and returns it. - - path (str / Path): Path to a directory. - exclude (Iterable[str]): String names of serialization fields to exclude. - RETURNS (EntityLinker): The modified EntityLinker object. 
- - DOCS: https://spacy.io/api/entitylinker#from_disk - """ - - def load_model(p): - try: - with p.open("rb") as infile: - self.model.from_bytes(infile.read()) - except AttributeError: - raise ValueError(Errors.E149) from None - - deserialize: Dict[str, Callable[[Any], Any]] = {} - deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p)) - deserialize["vocab"] = lambda p: self.vocab.from_disk(p, exclude=exclude) - deserialize["kb"] = lambda p: self.kb.from_disk(p) - deserialize["model"] = load_model - util.from_disk(path, deserialize, exclude) - return self - - def rehearse(self, examples, *, sgd=None, losses=None, **config): - raise NotImplementedError - - def add_label(self, label): - raise NotImplementedError diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 24f98508f..fabc51fee 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -1,7 +1,9 @@ # cython: infer_types=True, profile=True, binding=True -from typing import Optional, Union, Dict, Callable +from typing import Callable, Dict, Iterable, List, Optional, Union import srsly -from thinc.api import SequenceCategoricalCrossentropy, Model, Config +from thinc.api import Model, Config +from thinc.legacy import LegacySequenceCategoricalCrossentropy +from thinc.types import Floats2d, Ints1d from itertools import islice from ..tokens.doc cimport Doc @@ -13,16 +15,12 @@ from ..symbols import POS from ..language import Language from ..errors import Errors from .pipe import deserialize_config -from .tagger import Tagger +from .tagger import ActivationsT, Tagger from .. import util from ..scorer import Scorer from ..training import validate_examples, validate_get_examples from ..util import registry -# See #9050 -BACKWARD_OVERWRITE = True -BACKWARD_EXTEND = False - default_model_config = """ [model] @architectures = "spacy.Tagger.v2" @@ -52,7 +50,13 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "morphologizer", assigns=["token.morph", "token.pos"], - default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}}, + default_config={ + "model": DEFAULT_MORPH_MODEL, + "overwrite": True, + "extend": False, + "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, + "save_activations": False, + }, default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None}, ) def make_morphologizer( @@ -62,8 +66,10 @@ def make_morphologizer( overwrite: bool, extend: bool, scorer: Optional[Callable], + save_activations: bool, ): - return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer) + return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer, + save_activations=save_activations) def morphologizer_score(examples, **kwargs): @@ -92,9 +98,10 @@ class Morphologizer(Tagger): model: Model, name: str = "morphologizer", *, - overwrite: bool = BACKWARD_OVERWRITE, - extend: bool = BACKWARD_EXTEND, + overwrite: bool = False, + extend: bool = False, scorer: Optional[Callable] = morphologizer_score, + save_activations: bool = False, ): """Initialize a morphologizer. @@ -102,9 +109,12 @@ class Morphologizer(Tagger): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. + overwrite (bool): Whether to overwrite existing annotations. 
+ extend (bool): Whether to extend existing annotations. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_token_attr for the attributes "pos" and "morph" and Scorer.score_token_attr_per_feat for the attribute "morph". + save_activations (bool): save model activations in Doc when annotating. DOCS: https://spacy.io/api/morphologizer#init """ @@ -124,11 +134,12 @@ class Morphologizer(Tagger): } self.cfg = dict(sorted(cfg.items())) self.scorer = scorer + self.save_activations = save_activations @property def labels(self): - """RETURNS (Tuple[str]): The labels currently added to the component.""" - return tuple(self.cfg["labels_morph"].keys()) + """RETURNS (Iterable[str]): The labels currently added to the component.""" + return self.cfg["labels_morph"].keys() @property def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]: @@ -151,7 +162,7 @@ class Morphologizer(Tagger): # normalize label norm_label = self.vocab.morphology.normalize_features(label) # extract separate POS and morph tags - label_dict = Morphology.feats_to_dict(label) + label_dict = Morphology.feats_to_dict(label, sort_values=False) pos = label_dict.get(self.POS_FEAT, "") if self.POS_FEAT in label_dict: label_dict.pop(self.POS_FEAT) @@ -189,7 +200,7 @@ class Morphologizer(Tagger): continue morph = str(token.morph) # create and add the combined morph+POS label - morph_dict = Morphology.feats_to_dict(morph) + morph_dict = Morphology.feats_to_dict(morph, sort_values=False) if pos: morph_dict[self.POS_FEAT] = pos norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)] @@ -206,7 +217,7 @@ class Morphologizer(Tagger): for i, token in enumerate(example.reference): pos = token.pos_ morph = str(token.morph) - morph_dict = Morphology.feats_to_dict(morph) + morph_dict = Morphology.feats_to_dict(morph, sort_values=False) if pos: morph_dict[self.POS_FEAT] = pos norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)] @@ -217,40 +228,48 @@ class Morphologizer(Tagger): assert len(label_sample) > 0, Errors.E923.format(name=self.name) self.model.initialize(X=doc_sample, Y=label_sample) - def set_annotations(self, docs, batch_tag_ids): + def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): """Modify a batch of documents, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. - batch_tag_ids: The IDs to set, produced by Morphologizer.predict. + activations (ActivationsT): The activations used for setting annotations, produced by Morphologizer.predict. DOCS: https://spacy.io/api/morphologizer#set_annotations """ + batch_tag_ids = activations["label_ids"] if isinstance(docs, Doc): docs = [docs] cdef Doc doc cdef Vocab vocab = self.vocab cdef bint overwrite = self.cfg["overwrite"] cdef bint extend = self.cfg["extend"] - labels = self.labels + + # We require random access for the upcoming ops, so we need + # to allocate a compatible container out of the iterable. 
+ labels = tuple(self.labels) for i, doc in enumerate(docs): + if self.save_activations: + doc.activations[self.name] = {} + for act_name, acts in activations.items(): + doc.activations[self.name][act_name] = acts[i] doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): - morph = labels[tag_id] + morph = labels[int(tag_id)] # set morph if doc.c[j].morph == 0 or overwrite or extend: if overwrite and extend: # morphologizer morph overwrites any existing features # while extending - extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph]) - extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0))) + extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph], sort_values=False) + extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0), sort_values=False)) doc.c[j].morph = self.vocab.morphology.add(extended_morph) elif extend: # existing features are preserved and any new features # are added - extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0)) - extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph])) + extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0), sort_values=False) + extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph], sort_values=False)) doc.c[j].morph = self.vocab.morphology.add(extended_morph) else: # clobber @@ -270,7 +289,7 @@ class Morphologizer(Tagger): DOCS: https://spacy.io/api/morphologizer#get_loss """ validate_examples(examples, "Morphologizer.get_loss") - loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False) + loss_func = LegacySequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False) truths = [] for eg in examples: eg_truths = [] @@ -291,7 +310,7 @@ class Morphologizer(Tagger): label = None # Otherwise, generate the combined label else: - label_dict = Morphology.feats_to_dict(morph) + label_dict = Morphology.feats_to_dict(morph, sort_values=False) if pos: label_dict[self.POS_FEAT] = pos label = self.vocab.strings[self.vocab.morphology.add(label_dict)] diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx deleted file mode 100644 index 8c44061e2..000000000 --- a/spacy/pipeline/multitask.pyx +++ /dev/null @@ -1,221 +0,0 @@ -# cython: infer_types=True, profile=True, binding=True -from typing import Optional -import numpy -from thinc.api import CosineDistance, to_categorical, Model, Config -from thinc.api import set_dropout_rate - -from ..tokens.doc cimport Doc - -from .trainable_pipe import TrainablePipe -from .tagger import Tagger -from ..training import validate_examples -from ..language import Language -from ._parser_internals import nonproj -from ..attrs import POS, ID -from ..errors import Errors - - -default_model_config = """ -[model] -@architectures = "spacy.MultiTask.v1" -maxout_pieces = 3 -token_vector_width = 96 - -[model.tok2vec] -@architectures = "spacy.HashEmbedCNN.v2" -pretrained_vectors = null -width = 96 -depth = 4 -embed_size = 2000 -window_size = 1 -maxout_pieces = 2 -subword_features = true -""" -DEFAULT_MT_MODEL = Config().from_str(default_model_config)["model"] - - -@Language.factory( - "nn_labeller", - default_config={"labels": None, "target": "dep_tag_offset", "model": DEFAULT_MT_MODEL} -) -def make_nn_labeller(nlp: Language, name: str, model: Model, labels: Optional[dict], target: str): - return 
MultitaskObjective(nlp.vocab, model, name) - - -class MultitaskObjective(Tagger): - """Experimental: Assist training of a parser or tagger, by training a - side-objective. - """ - - def __init__(self, vocab, model, name="nn_labeller", *, target): - self.vocab = vocab - self.model = model - self.name = name - if target == "dep": - self.make_label = self.make_dep - elif target == "tag": - self.make_label = self.make_tag - elif target == "ent": - self.make_label = self.make_ent - elif target == "dep_tag_offset": - self.make_label = self.make_dep_tag_offset - elif target == "ent_tag": - self.make_label = self.make_ent_tag - elif target == "sent_start": - self.make_label = self.make_sent_start - elif hasattr(target, "__call__"): - self.make_label = target - else: - raise ValueError(Errors.E016) - cfg = {"labels": {}, "target": target} - self.cfg = dict(cfg) - - @property - def labels(self): - return self.cfg.setdefault("labels", {}) - - @labels.setter - def labels(self, value): - self.cfg["labels"] = value - - def set_annotations(self, docs, dep_ids): - pass - - def initialize(self, get_examples, nlp=None, labels=None): - if not hasattr(get_examples, "__call__"): - err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples)) - raise ValueError(err) - if labels is not None: - self.labels = labels - else: - for example in get_examples(): - for token in example.y: - label = self.make_label(token) - if label is not None and label not in self.labels: - self.labels[label] = len(self.labels) - self.model.initialize() # TODO: fix initialization by defining X and Y - - def predict(self, docs): - tokvecs = self.model.get_ref("tok2vec")(docs) - scores = self.model.get_ref("softmax")(tokvecs) - return tokvecs, scores - - def get_loss(self, examples, scores): - cdef int idx = 0 - correct = numpy.zeros((scores.shape[0],), dtype="i") - guesses = scores.argmax(axis=1) - docs = [eg.predicted for eg in examples] - for i, eg in enumerate(examples): - # Handles alignment for tokenization differences - doc_annots = eg.get_aligned() # TODO - for j in range(len(eg.predicted)): - tok_annots = {key: values[j] for key, values in tok_annots.items()} - label = self.make_label(j, tok_annots) - if label is None or label not in self.labels: - correct[idx] = guesses[idx] - else: - correct[idx] = self.labels[label] - idx += 1 - correct = self.model.ops.xp.array(correct, dtype="i") - d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) - loss = (d_scores**2).sum() - return float(loss), d_scores - - @staticmethod - def make_dep(token): - return token.dep_ - - @staticmethod - def make_tag(token): - return token.tag_ - - @staticmethod - def make_ent(token): - if token.ent_iob_ == "O": - return "O" - else: - return token.ent_iob_ + "-" + token.ent_type_ - - @staticmethod - def make_dep_tag_offset(token): - dep = token.dep_ - tag = token.tag_ - offset = token.head.i - token.i - offset = min(offset, 2) - offset = max(offset, -2) - return f"{dep}-{tag}:{offset}" - - @staticmethod - def make_ent_tag(token): - if token.ent_iob_ == "O": - ent = "O" - else: - ent = token.ent_iob_ + "-" + token.ent_type_ - tag = token.tag_ - return f"{tag}-{ent}" - - @staticmethod - def make_sent_start(token): - """A multi-task objective for representing sentence boundaries, - using BILU scheme. 
(O is impossible) - """ - if token.is_sent_start and token.is_sent_end: - return "U-SENT" - elif token.is_sent_start: - return "B-SENT" - else: - return "I-SENT" - - -class ClozeMultitask(TrainablePipe): - def __init__(self, vocab, model, **cfg): - self.vocab = vocab - self.model = model - self.cfg = cfg - self.distance = CosineDistance(ignore_zeros=True, normalize=False) # TODO: in config - - def set_annotations(self, docs, dep_ids): - pass - - def initialize(self, get_examples, nlp=None): - self.model.initialize() # TODO: fix initialization by defining X and Y - X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) - self.model.output_layer.initialize(X) - - def predict(self, docs): - tokvecs = self.model.get_ref("tok2vec")(docs) - vectors = self.model.get_ref("output_layer")(tokvecs) - return tokvecs, vectors - - def get_loss(self, examples, vectors, prediction): - validate_examples(examples, "ClozeMultitask.get_loss") - # The simplest way to implement this would be to vstack the - # token.vector values, but that's a bit inefficient, especially on GPU. - # Instead we fetch the index into the vectors table for each of our tokens, - # and look them up all at once. This prevents data copying. - ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples]) - target = vectors[ids] - gradient = self.distance.get_grad(prediction, target) - loss = self.distance.get_loss(prediction, target) - return float(loss), gradient - - def update(self, examples, *, drop=0., sgd=None, losses=None): - pass - - def rehearse(self, examples, drop=0., sgd=None, losses=None): - if losses is not None and self.name not in losses: - losses[self.name] = 0. - set_dropout_rate(self.model, drop) - validate_examples(examples, "ClozeMultitask.rehearse") - docs = [eg.predicted for eg in examples] - predictions, bp_predictions = self.model.begin_update() - loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions) - bp_predictions(d_predictions) - if sgd is not None: - self.finish_update(sgd) - if losses is not None: - losses[self.name] += loss - return losses - - def add_label(self, label): - raise NotImplementedError diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.py similarity index 92% rename from spacy/pipeline/ner.pyx rename to spacy/pipeline/ner.py index 25f48c9f8..7e44b2835 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.py @@ -4,22 +4,22 @@ from typing import Optional, Iterable, Callable from thinc.api import Model, Config from ._parser_internals.transition_system import TransitionSystem -from .transition_parser cimport Parser -from ._parser_internals.ner cimport BiluoPushDown +from .transition_parser import Parser +from ._parser_internals.ner import BiluoPushDown from ..language import Language from ..scorer import get_ner_prf, PRFScore +from ..training import validate_examples from ..util import registry from ..training import remove_bilu_prefix default_model_config = """ [model] -@architectures = "spacy.TransitionBasedParser.v2" +@architectures = "spacy.TransitionBasedParser.v3" state_type = "ner" extra_state_tokens = false hidden_width = 64 maxout_pieces = 2 -use_upper = true [model.tok2vec] @architectures = "spacy.HashEmbedCNN.v2" @@ -44,8 +44,12 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"] "incorrect_spans_key": None, "scorer": {"@scorers": "spacy.ner_scorer.v1"}, }, - default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, - + default_score_weights={ + 
"ents_f": 1.0, + "ents_p": 0.0, + "ents_r": 0.0, + "ents_per_type": None, + }, ) def make_ner( nlp: Language, @@ -98,6 +102,7 @@ def make_ner( scorer=scorer, ) + @Language.factory( "beam_ner", assigns=["doc.ents", "token.ent_iob", "token.ent_type"], @@ -111,7 +116,12 @@ def make_ner( "incorrect_spans_key": None, "scorer": None, }, - default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, + default_score_weights={ + "ents_f": 1.0, + "ents_p": 0.0, + "ents_r": 0.0, + "ents_per_type": None, + }, ) def make_beam_ner( nlp: Language, @@ -185,11 +195,12 @@ def make_ner_scorer(): return ner_score -cdef class EntityRecognizer(Parser): +class EntityRecognizer(Parser): """Pipeline component for named entity recognition. DOCS: https://spacy.io/api/entityrecognizer """ + TransitionSystem = BiluoPushDown def __init__( @@ -207,15 +218,14 @@ cdef class EntityRecognizer(Parser): incorrect_spans_key=None, scorer=ner_score, ): - """Create an EntityRecognizer. - """ + """Create an EntityRecognizer.""" super().__init__( vocab, model, name, moves, update_with_oracle_cut_size=update_with_oracle_cut_size, - min_action_freq=1, # not relevant for NER + min_action_freq=1, # not relevant for NER learn_tokens=False, # not relevant for NER beam_width=beam_width, beam_density=beam_density, @@ -242,8 +252,11 @@ cdef class EntityRecognizer(Parser): def labels(self): # Get the labels from the model by looking at the available moves, e.g. # B-PERSON, I-PERSON, L-PERSON, U-PERSON - labels = set(remove_bilu_prefix(move) for move in self.move_names - if move[0] in ("B", "I", "L", "U")) + labels = set( + remove_bilu_prefix(move) + for move in self.move_names + if move[0] in ("B", "I", "L", "U") + ) return tuple(sorted(labels)) def scored_ents(self, beams): diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 8407acc45..8b8fdc361 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -19,13 +19,6 @@ cdef class Pipe: DOCS: https://spacy.io/api/pipe """ - @classmethod - def __init_subclass__(cls, **kwargs): - """Raise a warning if an inheriting class implements 'begin_training' - (from v2) instead of the new 'initialize' method (from v3)""" - if hasattr(cls, "begin_training"): - warnings.warn(Warnings.W088.format(name=cls.__name__)) - def __call__(self, Doc doc) -> Doc: """Apply the pipe to one document. The document is modified in place, and returned. This usually happens under the hood when the nlp object @@ -94,6 +87,10 @@ cdef class Pipe: return self.scorer(examples, **scorer_kwargs) return {} + @property + def is_distillable(self) -> bool: + return False + @property def is_trainable(self) -> bool: return False diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 77f4e8adb..6c2565170 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -10,9 +10,6 @@ from ..language import Language from ..scorer import Scorer from .. import util -# see #9050 -BACKWARD_OVERWRITE = False - @Language.factory( "sentencizer", assigns=["token.is_sent_start", "doc.sents"], @@ -52,13 +49,14 @@ class Sentencizer(Pipe): name="sentencizer", *, punct_chars=None, - overwrite=BACKWARD_OVERWRITE, + overwrite=False, scorer=senter_score, ): """Initialize the sentencizer. punct_chars (list): Punctuation characters to split on. Will be serialized with the nlp object. + overwrite (bool): Whether to overwrite existing annotations. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_spans for the attribute "sents". 
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 6808fe70e..a7d263e94 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -1,13 +1,16 @@ # cython: infer_types=True, profile=True, binding=True -from typing import Optional, Callable +from typing import Dict, Iterable, Optional, Callable, List, Union from itertools import islice import srsly -from thinc.api import Model, SequenceCategoricalCrossentropy, Config +from thinc.api import Model, Config +from thinc.legacy import LegacySequenceCategoricalCrossentropy + +from thinc.types import Floats2d, Ints1d from ..tokens.doc cimport Doc -from .tagger import Tagger +from .tagger import ActivationsT, Tagger from ..language import Language from ..errors import Errors from ..scorer import Scorer @@ -15,8 +18,6 @@ from ..training import validate_examples, validate_get_examples from ..util import registry from .. import util -# See #9050 -BACKWARD_OVERWRITE = False default_model_config = """ [model] @@ -38,11 +39,21 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "senter", assigns=["token.is_sent_start"], - default_config={"model": DEFAULT_SENTER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.senter_scorer.v1"}}, + default_config={ + "model": DEFAULT_SENTER_MODEL, + "overwrite": False, + "scorer": {"@scorers": "spacy.senter_scorer.v1"}, + "save_activations": False, + }, default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0}, ) -def make_senter(nlp: Language, name: str, model: Model, overwrite: bool, scorer: Optional[Callable]): - return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer) +def make_senter(nlp: Language, + name: str, + model: Model, + overwrite: bool, + scorer: Optional[Callable], + save_activations: bool): + return SentenceRecognizer(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, save_activations=save_activations) def senter_score(examples, **kwargs): @@ -70,8 +81,9 @@ class SentenceRecognizer(Tagger): model, name="senter", *, - overwrite=BACKWARD_OVERWRITE, + overwrite=False, scorer=senter_score, + save_activations: bool = False, ): """Initialize a sentence recognizer. @@ -79,8 +91,10 @@ class SentenceRecognizer(Tagger): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. + overwrite (bool): Whether to overwrite existing annotations. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_spans for the attribute "sents". + save_activations (bool): save model activations in Doc when annotating. DOCS: https://spacy.io/api/sentencerecognizer#init """ @@ -90,6 +104,7 @@ class SentenceRecognizer(Tagger): self._rehearsal_model = None self.cfg = {"overwrite": overwrite} self.scorer = scorer + self.save_activations = save_activations @property def labels(self): @@ -107,19 +122,24 @@ class SentenceRecognizer(Tagger): def label_data(self): return None - def set_annotations(self, docs, batch_tag_ids): + def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): """Modify a batch of documents, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. - batch_tag_ids: The IDs to set, produced by SentenceRecognizer.predict. + activations (ActivationsT): The activations used for setting annotations, produced by SentenceRecognizer.predict. 
DOCS: https://spacy.io/api/sentencerecognizer#set_annotations """ + batch_tag_ids = activations["label_ids"] if isinstance(docs, Doc): docs = [docs] cdef Doc doc cdef bint overwrite = self.cfg["overwrite"] for i, doc in enumerate(docs): + if self.save_activations: + doc.activations[self.name] = {} + for act_name, acts in activations.items(): + doc.activations[self.name][act_name] = acts[i] doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() @@ -142,7 +162,7 @@ class SentenceRecognizer(Tagger): """ validate_examples(examples, "SentenceRecognizer.get_loss") labels = self.labels - loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False) + loss_func = LegacySequenceCategoricalCrossentropy(names=labels, normalize=False) truths = [] for eg in examples: eg_truth = [] diff --git a/spacy/pipeline/span_ruler.py b/spacy/pipeline/span_ruler.py index b0669c0ef..ca3bd572b 100644 --- a/spacy/pipeline/span_ruler.py +++ b/spacy/pipeline/span_ruler.py @@ -11,7 +11,7 @@ from ..language import Language from ..errors import Errors, Warnings from ..util import ensure_path, SimpleFrozenList, registry from ..tokens import Doc, Span -from ..scorer import Scorer +from ..scorer import Scorer, get_ner_prf from ..matcher import Matcher, PhraseMatcher from ..matcher.levenshtein import levenshtein_compare from .. import util @@ -21,7 +21,7 @@ DEFAULT_SPANS_KEY = "ruler" @Language.factory( - "future_entity_ruler", + "entity_ruler", assigns=["doc.ents"], default_config={ "phrase_matcher_attr": None, @@ -67,6 +67,15 @@ def make_entity_ruler( ) +def entity_ruler_score(examples, **kwargs): + return get_ner_prf(examples) + + +@registry.scorers("spacy.entity_ruler_scorer.v1") +def make_entity_ruler_scorer(): + return entity_ruler_score + + @Language.factory( "span_ruler", assigns=["doc.spans"], @@ -124,7 +133,7 @@ def prioritize_new_ents_filter( ) -> List[Span]: """Merge entities and spans into one list without overlaps by allowing spans to overwrite any entities that they overlap with. Intended to - replicate the overwrite_ents=True behavior from the EntityRuler. + replicate the overwrite_ents=True behavior from the v3 EntityRuler. entities (Iterable[Span]): The entities, already filtered for overlaps. spans (Iterable[Span]): The spans to merge, may contain overlaps. @@ -155,7 +164,7 @@ def prioritize_existing_ents_filter( ) -> List[Span]: """Merge entities and spans into one list without overlaps by prioritizing existing entities. Intended to replicate the overwrite_ents=False behavior - from the EntityRuler. + from the v3 EntityRuler. entities (Iterable[Span]): The entities, already filtered for overlaps. spans (Iterable[Span]): The spans to merge, may contain overlaps. 
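With the factory above renamed from `future_entity_ruler` to `entity_ruler`, adding the component by that name now yields the `SpanRuler`-backed implementation. A minimal sketch of the (unchanged) usage pattern, assuming the factory keeps annotating `doc.ents` as the previous `future_entity_ruler` configuration did; the patterns below are illustrative only:

```python
# Sketch: the "entity_ruler" factory now comes from the span_ruler module.
# Pattern format is unchanged from the earlier EntityRuler.
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([
    {"label": "ORG", "pattern": "Explosion"},
    {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]},
])

doc = nlp("Explosion is based in San Francisco.")
print([(ent.text, ent.label_) for ent in doc.ents])
```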
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index a3388e81a..33e1c87dc 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -1,11 +1,11 @@ -from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any +from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast +from typing import Union, Protocol, runtime_checkable from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops from thinc.api import Optimizer from thinc.types import Ragged, Ints2d, Floats2d import numpy -from ..compat import Protocol, runtime_checkable from ..scorer import Scorer from ..language import Language from .trainable_pipe import TrainablePipe @@ -16,6 +16,9 @@ from ..errors import Errors from ..util import registry +ActivationsT = Dict[str, Union[Floats2d, Ragged]] + + spancat_default_config = """ [model] @architectures = "spacy.SpanCategorizer.v1" @@ -106,6 +109,7 @@ def build_ngram_range_suggester(min_size: int, max_size: int) -> Suggester: "model": DEFAULT_SPANCAT_MODEL, "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, "scorer": {"@scorers": "spacy.spancat_scorer.v1"}, + "save_activations": False, }, default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, ) @@ -118,6 +122,7 @@ def make_spancat( scorer: Optional[Callable], threshold: float, max_positive: Optional[int], + save_activations: bool, ) -> "SpanCategorizer": """Create a SpanCategorizer component. The span categorizer consists of two parts: a suggester function that proposes candidate spans, and a labeller @@ -141,6 +146,7 @@ def make_spancat( 0.5. max_positive (Optional[int]): Maximum number of labels to consider positive per span. Defaults to None, indicating no limit. + save_activations (bool): save model activations in Doc when annotating. """ return SpanCategorizer( nlp.vocab, @@ -151,6 +157,7 @@ def make_spancat( max_positive=max_positive, name=name, scorer=scorer, + save_activations=save_activations, ) @@ -189,6 +196,7 @@ class SpanCategorizer(TrainablePipe): threshold: float = 0.5, max_positive: Optional[int] = None, scorer: Optional[Callable] = spancat_score, + save_activations: bool = False, ) -> None: """Initialize the span categorizer. vocab (Vocab): The shared vocabulary. @@ -221,6 +229,7 @@ class SpanCategorizer(TrainablePipe): self.model = model self.name = name self.scorer = scorer + self.save_activations = save_activations @property def key(self) -> str: @@ -263,7 +272,7 @@ class SpanCategorizer(TrainablePipe): """ return list(self.labels) - def predict(self, docs: Iterable[Doc]): + def predict(self, docs: Iterable[Doc]) -> ActivationsT: """Apply the pipeline's model to a batch of docs, without modifying them. docs (Iterable[Doc]): The documents to predict. @@ -276,7 +285,7 @@ class SpanCategorizer(TrainablePipe): scores = self.model.ops.alloc2f(0, 0) else: scores = self.model.predict((docs, indices)) # type: ignore - return indices, scores + return {"indices": indices, "scores": scores} def set_candidates( self, docs: Iterable[Doc], *, candidates_key: str = "candidates" @@ -296,19 +305,29 @@ class SpanCategorizer(TrainablePipe): for index in candidates.dataXd: doc.spans[candidates_key].append(doc[index[0] : index[1]]) - def set_annotations(self, docs: Iterable[Doc], indices_scores) -> None: + def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None: """Modify a batch of Doc objects, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. 
- scores: The scores to set, produced by SpanCategorizer.predict. + activations: ActivationsT: The activations, produced by SpanCategorizer.predict. DOCS: https://spacy.io/api/spancategorizer#set_annotations """ labels = self.labels - indices, scores = indices_scores + + indices = activations["indices"] + assert isinstance(indices, Ragged) + scores = cast(Floats2d, activations["scores"]) + offset = 0 for i, doc in enumerate(docs): indices_i = indices[i].dataXd + if self.save_activations: + doc.activations[self.name] = {} + doc.activations[self.name]["indices"] = indices_i + doc.activations[self.name]["scores"] = scores[ + offset : offset + indices.lengths[i] + ] doc.spans[self.key] = self._make_span_group( doc, indices_i, scores[offset : offset + indices.lengths[i]], labels # type: ignore[arg-type] ) diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index f236bc20e..cad18f08c 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -1,9 +1,11 @@ # cython: infer_types=True, profile=True, binding=True -from typing import Callable, Optional +from typing import Callable, Dict, Iterable, List, Optional, Union +from typing import Tuple import numpy import srsly -from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config -from thinc.types import Floats2d +from thinc.api import Model, set_dropout_rate, Config +from thinc.legacy import LegacySequenceCategoricalCrossentropy +from thinc.types import Floats2d, Ints1d import warnings from itertools import islice @@ -22,8 +24,8 @@ from ..training import validate_examples, validate_get_examples from ..util import registry from .. import util -# See #9050 -BACKWARD_OVERWRITE = False + +ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]] default_model_config = """ [model] @@ -45,7 +47,13 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "tagger", assigns=["token.tag"], - default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!"}, + default_config={ + "model": DEFAULT_TAGGER_MODEL, + "overwrite": False, + "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, + "neg_prefix": "!", + "save_activations": False, + }, default_score_weights={"tag_acc": 1.0}, ) def make_tagger( @@ -55,6 +63,7 @@ def make_tagger( overwrite: bool, scorer: Optional[Callable], neg_prefix: str, + save_activations: bool, ): """Construct a part-of-speech tagger component. @@ -63,7 +72,8 @@ def make_tagger( in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to 1). """ - return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix) + return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix, + save_activations=save_activations) def tagger_score(examples, **kwargs): @@ -86,9 +96,10 @@ class Tagger(TrainablePipe): model, name="tagger", *, - overwrite=BACKWARD_OVERWRITE, + overwrite=False, scorer=tagger_score, neg_prefix="!", + save_activations: bool = False, ): """Initialize a part-of-speech tagger. @@ -96,8 +107,10 @@ class Tagger(TrainablePipe): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. + overwrite (bool): Whether to overwrite existing annotations. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_token_attr for the attribute "tag". 
+ save_activations (bool): save model activations in Doc when annotating. DOCS: https://spacy.io/api/tagger#init """ @@ -108,6 +121,7 @@ class Tagger(TrainablePipe): cfg = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix} self.cfg = dict(sorted(cfg.items())) self.scorer = scorer + self.save_activations = save_activations @property def labels(self): @@ -126,7 +140,7 @@ class Tagger(TrainablePipe): """Data about the labels currently added to the component.""" return tuple(self.cfg["labels"]) - def predict(self, docs): + def predict(self, docs) -> ActivationsT: """Apply the pipeline's model to a batch of docs, without modifying them. docs (Iterable[Doc]): The documents to predict. @@ -139,12 +153,12 @@ class Tagger(TrainablePipe): n_labels = len(self.labels) guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs] assert len(guesses) == len(docs) - return guesses + return {"probabilities": guesses, "label_ids": guesses} scores = self.model.predict(docs) assert len(scores) == len(docs), (len(scores), len(docs)) guesses = self._scores2guesses(scores) assert len(guesses) == len(docs) - return guesses + return {"probabilities": scores, "label_ids": guesses} def _scores2guesses(self, scores): guesses = [] @@ -155,14 +169,15 @@ class Tagger(TrainablePipe): guesses.append(doc_guesses) return guesses - def set_annotations(self, docs, batch_tag_ids): + def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT): """Modify a batch of documents, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. - batch_tag_ids: The IDs to set, produced by Tagger.predict. + activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict. DOCS: https://spacy.io/api/tagger#set_annotations """ + batch_tag_ids = activations["label_ids"] if isinstance(docs, Doc): docs = [docs] cdef Doc doc @@ -170,6 +185,10 @@ class Tagger(TrainablePipe): cdef bint overwrite = self.cfg["overwrite"] labels = self.labels for i, doc in enumerate(docs): + if self.save_activations: + doc.activations[self.name] = {} + for act_name, acts in activations.items(): + doc.activations[self.name][act_name] = acts[i] doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): doc_tag_ids = doc_tag_ids.get() @@ -225,7 +244,6 @@ class Tagger(TrainablePipe): DOCS: https://spacy.io/api/tagger#rehearse """ - loss_func = SequenceCategoricalCrossentropy() if losses is None: losses = {} losses.setdefault(self.name+"_rehearse", 0.0) @@ -239,13 +257,32 @@ class Tagger(TrainablePipe): set_dropout_rate(self.model, drop) tag_scores, bp_tag_scores = self.model.begin_update(docs) tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs) - grads, loss = loss_func(tag_scores, tutor_tag_scores) + loss, grads = self.get_teacher_student_loss(tutor_tag_scores, tag_scores) bp_tag_scores(grads) if sgd is not None: self.finish_update(sgd) losses[self.name+"_rehearse"] += loss return losses + def get_teacher_student_loss( + self, teacher_scores: List[Floats2d], student_scores: List[Floats2d] + ) -> Tuple[float, List[Floats2d]]: + """Calculate the loss and its gradient for a batch of student + scores, relative to teacher scores. + + teacher_scores: Scores representing the teacher model's predictions. + student_scores: Scores representing the student model's predictions. + + RETURNS (Tuple[float, float]): The loss and the gradient. 
+ + DOCS: https://spacy.io/api/tagger#get_teacher_student_loss + """ + loss_func = LegacySequenceCategoricalCrossentropy(normalize=False) + d_scores, loss = loss_func(student_scores, teacher_scores) + if self.model.ops.xp.isnan(loss): + raise ValueError(Errors.E910.format(name=self.name)) + return float(loss), d_scores + def get_loss(self, examples, scores): """Find the loss and gradient of loss for the batch of documents and their predicted scores. @@ -257,7 +294,7 @@ class Tagger(TrainablePipe): DOCS: https://spacy.io/api/tagger#get_loss """ validate_examples(examples, "Tagger.get_loss") - loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"]) + loss_func = LegacySequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"]) # Convert empty tag "" to missing value None so that both misaligned # tokens and tokens with missing annotation have the default missing # value None. diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 8b9762c4a..ceac76b85 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -1,4 +1,4 @@ -from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any +from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any, Union from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config from thinc.types import Floats2d import numpy @@ -14,6 +14,9 @@ from ..util import registry from ..vocab import Vocab +ActivationsT = Dict[str, Floats2d] + + single_label_default_config = """ [model] @architectures = "spacy.TextCatEnsemble.v2" @@ -75,6 +78,7 @@ subword_features = true "threshold": 0.0, "model": DEFAULT_SINGLE_TEXTCAT_MODEL, "scorer": {"@scorers": "spacy.textcat_scorer.v2"}, + "save_activations": False, }, default_score_weights={ "cats_score": 1.0, @@ -95,6 +99,7 @@ def make_textcat( model: Model[List[Doc], List[Floats2d]], threshold: float, scorer: Optional[Callable], + save_activations: bool, ) -> "TextCategorizer": """Create a TextCategorizer component. The text categorizer predicts categories over a whole document. It can learn one or more labels, and the labels are considered @@ -104,8 +109,16 @@ def make_textcat( scores for each category. threshold (float): Cutoff to consider a prediction "positive". scorer (Optional[Callable]): The scoring method. + save_activations (bool): save model activations in Doc when annotating. """ - return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer) + return TextCategorizer( + nlp.vocab, + model, + name, + threshold=threshold, + scorer=scorer, + save_activations=save_activations, + ) def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: @@ -136,6 +149,7 @@ class TextCategorizer(TrainablePipe): *, threshold: float, scorer: Optional[Callable] = textcat_score, + save_activations: bool = False, ) -> None: """Initialize a text categorizer for single-label classification. @@ -161,6 +175,7 @@ class TextCategorizer(TrainablePipe): } self.cfg = dict(cfg) self.scorer = scorer + self.save_activations = save_activations @property def support_missing_values(self): @@ -185,7 +200,7 @@ class TextCategorizer(TrainablePipe): """ return self.labels # type: ignore[return-value] - def predict(self, docs: Iterable[Doc]): + def predict(self, docs: Iterable[Doc]) -> ActivationsT: """Apply the pipeline's model to a batch of docs, without modifying them. docs (Iterable[Doc]): The documents to predict. 
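The `get_teacher_student_loss` hook added to the tagger above, together with the `distill` entry point added to `TrainablePipe` further down in this diff, enables pipe-level teacher-student distillation. A rough sketch of a single distillation step, assuming `teacher_nlp` is a trained pipeline and `student_nlp` is an already-initialized student that shares the teacher's labels and tokenization; both pipeline names and the texts are placeholders:

```python
# Sketch of one distillation step via the pipe-level distill() API.
# Assumes teacher_nlp (trained) and student_nlp (initialized with the same
# labels) are already available.
from spacy.training import Example

texts = ["A short example sentence.", "Another sentence for distillation."]

teacher_tagger = teacher_nlp.get_pipe("tagger")
student_tagger = student_nlp.get_pipe("tagger")
assert student_tagger.is_distillable

# Example(predicted, reference): predicted docs come from the student,
# reference docs from the teacher; both must share tokens and orthography.
examples = [Example(student_nlp.make_doc(t), teacher_nlp(t)) for t in texts]

optimizer = student_tagger.create_optimizer()
losses = {}
student_tagger.distill(teacher_tagger, examples, sgd=optimizer, losses=losses)
print(losses["tagger"])
```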
@@ -198,12 +213,12 @@ class TextCategorizer(TrainablePipe): tensors = [doc.tensor for doc in docs] xp = self.model.ops.xp scores = xp.zeros((len(list(docs)), len(self.labels))) - return scores + return {"probabilities": scores} scores = self.model.predict(docs) scores = self.model.ops.asarray(scores) - return scores + return {"probabilities": scores} - def set_annotations(self, docs: Iterable[Doc], scores) -> None: + def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None: """Modify a batch of Doc objects, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. @@ -211,9 +226,13 @@ class TextCategorizer(TrainablePipe): DOCS: https://spacy.io/api/textcategorizer#set_annotations """ + probs = activations["probabilities"] for i, doc in enumerate(docs): + if self.save_activations: + doc.activations[self.name] = {} + doc.activations[self.name]["probabilities"] = probs[i] for j, label in enumerate(self.labels): - doc.cats[label] = float(scores[i, j]) + doc.cats[label] = float(probs[i, j]) def update( self, diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py index 41c0e2f63..6af238b16 100644 --- a/spacy/pipeline/textcat_multilabel.py +++ b/spacy/pipeline/textcat_multilabel.py @@ -1,4 +1,4 @@ -from typing import Iterable, Optional, Dict, List, Callable, Any +from typing import Iterable, Optional, Dict, List, Callable, Any, Union from thinc.types import Floats2d from thinc.api import Model, Config @@ -75,6 +75,7 @@ subword_features = true "threshold": 0.5, "model": DEFAULT_MULTI_TEXTCAT_MODEL, "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"}, + "save_activations": False, }, default_score_weights={ "cats_score": 1.0, @@ -95,8 +96,9 @@ def make_multilabel_textcat( model: Model[List[Doc], List[Floats2d]], threshold: float, scorer: Optional[Callable], + save_activations: bool, ) -> "MultiLabel_TextCategorizer": - """Create a MultiLabel_TextCategorizer component. The text categorizer predicts categories + """Create a TextCategorizer component. The text categorizer predicts categories over a whole document. It can learn one or more labels, and the labels are considered to be non-mutually exclusive, which means that there can be zero or more labels per doc). @@ -107,7 +109,12 @@ def make_multilabel_textcat( scorer (Optional[Callable]): The scoring method. """ return MultiLabel_TextCategorizer( - nlp.vocab, model, name, threshold=threshold, scorer=scorer + nlp.vocab, + model, + name, + threshold=threshold, + scorer=scorer, + save_activations=save_activations, ) @@ -139,6 +146,7 @@ class MultiLabel_TextCategorizer(TextCategorizer): *, threshold: float, scorer: Optional[Callable] = textcat_multilabel_score, + save_activations: bool = False, ) -> None: """Initialize a text categorizer for multi-label classification. @@ -148,6 +156,7 @@ class MultiLabel_TextCategorizer(TextCategorizer): losses during training. threshold (float): Cutoff to consider a prediction "positive". scorer (Optional[Callable]): The scoring method. + save_activations (bool): save model activations in Doc when annotating. 
DOCS: https://spacy.io/api/textcategorizer#init """ @@ -158,6 +167,7 @@ class MultiLabel_TextCategorizer(TextCategorizer): cfg = {"labels": [], "threshold": threshold} self.cfg = dict(cfg) self.scorer = scorer + self.save_activations = save_activations @property def support_missing_values(self): diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index c742aaeaa..d9639f8d5 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -1,5 +1,6 @@ -from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any +from typing import Sequence, Iterable, Optional, Dict, Callable, List, Any, Tuple from thinc.api import Model, set_dropout_rate, Optimizer, Config +from thinc.types import Floats2d from itertools import islice from .trainable_pipe import TrainablePipe @@ -157,39 +158,9 @@ class Tok2Vec(TrainablePipe): DOCS: https://spacy.io/api/tok2vec#update """ - if losses is None: - losses = {} validate_examples(examples, "Tok2Vec.update") docs = [eg.predicted for eg in examples] - set_dropout_rate(self.model, drop) - tokvecs, bp_tokvecs = self.model.begin_update(docs) - d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] - losses.setdefault(self.name, 0.0) - - def accumulate_gradient(one_d_tokvecs): - """Accumulate tok2vec loss and gradient. This is passed as a callback - to all but the last listener. Only the last one does the backprop. - """ - nonlocal d_tokvecs - for i in range(len(one_d_tokvecs)): - d_tokvecs[i] += one_d_tokvecs[i] - losses[self.name] += float((one_d_tokvecs[i] ** 2).sum()) - return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] - - def backprop(one_d_tokvecs): - """Callback to actually do the backprop. Passed to last listener.""" - accumulate_gradient(one_d_tokvecs) - d_docs = bp_tokvecs(d_tokvecs) - if sgd is not None: - self.finish_update(sgd) - return d_docs - - batch_id = Tok2VecListener.get_batch_id(docs) - for listener in self.listeners[:-1]: - listener.receive(batch_id, tokvecs, accumulate_gradient) - if self.listeners: - self.listeners[-1].receive(batch_id, tokvecs, backprop) - return losses + return self._update_with_docs(docs, drop=drop, sgd=sgd, losses=losses) def get_loss(self, examples, scores) -> None: pass @@ -219,6 +190,96 @@ class Tok2Vec(TrainablePipe): def add_label(self, label): raise NotImplementedError + def distill( + self, + teacher_pipe: Optional["TrainablePipe"], + examples: Iterable["Example"], + *, + drop: float = 0.0, + sgd: Optional[Optimizer] = None, + losses: Optional[Dict[str, float]] = None, + ) -> Dict[str, float]: + """Performs an update of the student pipe's model using the + student's distillation examples and sets the annotations + of the teacher's distillation examples using the teacher pipe. + + teacher_pipe (Optional[TrainablePipe]): The teacher pipe to use + for prediction. + examples (Iterable[Example]): Distillation examples. The reference (teacher) + and predicted (student) docs must have the same number of tokens and the + same orthography. + drop (float): dropout rate. + sgd (Optional[Optimizer]): An optimizer. Will be created via + create_optimizer if not set. + losses (Optional[Dict[str, float]]): Optional record of loss during + distillation. + RETURNS: The updated losses dictionary. + + DOCS: https://spacy.io/api/tok2vec#distill + """ + # By default we require a teacher pipe, but there are downstream + # implementations that don't require a pipe. 
+ if teacher_pipe is None: + raise ValueError(Errors.E4002.format(name=self.name)) + teacher_docs = [eg.reference for eg in examples] + student_docs = [eg.predicted for eg in examples] + teacher_preds = teacher_pipe.predict(teacher_docs) + teacher_pipe.set_annotations(teacher_docs, teacher_preds) + return self._update_with_docs(student_docs, drop=drop, sgd=sgd, losses=losses) + + def _update_with_docs( + self, + docs: Iterable[Doc], + *, + drop: float = 0.0, + sgd: Optional[Optimizer] = None, + losses: Optional[Dict[str, float]] = None, + ): + if losses is None: + losses = {} + losses.setdefault(self.name, 0.0) + set_dropout_rate(self.model, drop) + + tokvecs, accumulate_gradient, backprop = self._create_backprops( + docs, losses, sgd=sgd + ) + batch_id = Tok2VecListener.get_batch_id(docs) + for listener in self.listeners[:-1]: + listener.receive(batch_id, tokvecs, accumulate_gradient) + if self.listeners: + self.listeners[-1].receive(batch_id, tokvecs, backprop) + return losses + + def _create_backprops( + self, + docs: Iterable[Doc], + losses: Dict[str, float], + *, + sgd: Optional[Optimizer] = None, + ) -> Tuple[Floats2d, Callable, Callable]: + tokvecs, bp_tokvecs = self.model.begin_update(docs) + d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] + + def accumulate_gradient(one_d_tokvecs): + """Accumulate tok2vec loss and gradient. This is passed as a callback + to all but the last listener. Only the last one does the backprop. + """ + nonlocal d_tokvecs + for i in range(len(one_d_tokvecs)): + d_tokvecs[i] += one_d_tokvecs[i] + losses[self.name] += float((one_d_tokvecs[i] ** 2).sum()) + return [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs] + + def backprop(one_d_tokvecs): + """Callback to actually do the backprop. Passed to last listener.""" + accumulate_gradient(one_d_tokvecs) + d_docs = bp_tokvecs(d_tokvecs) + if sgd is not None: + self.finish_update(sgd) + return d_docs + + return tokvecs, accumulate_gradient, backprop + class Tok2VecListener(Model): """A layer that gets fed its answers from an upstream connection, diff --git a/spacy/pipeline/trainable_pipe.pxd b/spacy/pipeline/trainable_pipe.pxd index 65daa8b22..180f86f45 100644 --- a/spacy/pipeline/trainable_pipe.pxd +++ b/spacy/pipeline/trainable_pipe.pxd @@ -6,3 +6,4 @@ cdef class TrainablePipe(Pipe): cdef public object model cdef public object cfg cdef public object scorer + cdef bint _save_activations diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx index 3f0507d4b..fcffd11ee 100644 --- a/spacy/pipeline/trainable_pipe.pyx +++ b/spacy/pipeline/trainable_pipe.pyx @@ -2,11 +2,12 @@ from typing import Iterable, Iterator, Optional, Dict, Tuple, Callable import srsly from thinc.api import set_dropout_rate, Model, Optimizer +import warnings from ..tokens.doc cimport Doc -from ..training import validate_examples -from ..errors import Errors +from ..training import validate_examples, validate_distillation_examples +from ..errors import Errors, Warnings from .pipe import Pipe, deserialize_config from .. import util from ..vocab import Vocab @@ -55,6 +56,53 @@ cdef class TrainablePipe(Pipe): except Exception as e: error_handler(self.name, self, [doc], e) + + def distill(self, + teacher_pipe: Optional["TrainablePipe"], + examples: Iterable["Example"], + *, + drop: float=0.0, + sgd: Optional[Optimizer]=None, + losses: Optional[Dict[str, float]]=None) -> Dict[str, float]: + """Train a pipe (the student) on the predictions of another pipe + (the teacher). 
The student is typically trained on the probability + distribution of the teacher, but details may differ per pipe. + + teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn + from. + examples (Iterable[Example]): Distillation examples. The reference + (teacher) and predicted (student) docs must have the same number of + tokens and the same orthography. + drop (float): dropout rate. + sgd (Optional[Optimizer]): An optimizer. Will be created via + create_optimizer if not set. + losses (Optional[Dict[str, float]]): Optional record of loss during + distillation. + RETURNS: The updated losses dictionary. + + DOCS: https://spacy.io/api/pipe#distill + """ + # By default we require a teacher pipe, but there are downstream + # implementations that don't require a pipe. + if teacher_pipe is None: + raise ValueError(Errors.E4002.format(name=self.name)) + if losses is None: + losses = {} + losses.setdefault(self.name, 0.0) + validate_distillation_examples(examples, "TrainablePipe.distill") + set_dropout_rate(self.model, drop) + for node in teacher_pipe.model.walk(): + if node.name == "softmax": + node.attrs["softmax_normalize"] = True + teacher_scores = teacher_pipe.model.predict([eg.reference for eg in examples]) + student_scores, bp_student_scores = self.model.begin_update([eg.predicted for eg in examples]) + loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores) + bp_student_scores(d_scores) + if sgd is not None: + self.finish_update(sgd) + losses[self.name] += loss + return losses + def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]: """Apply the pipe to a stream of documents. This usually happens under the hood when the nlp object is called on a text and all components are @@ -168,6 +216,19 @@ cdef class TrainablePipe(Pipe): """ raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_loss", name=self.name)) + def get_teacher_student_loss(self, teacher_scores, student_scores): + """Calculate the loss and its gradient for a batch of student + scores, relative to teacher scores. + + teacher_scores: Scores representing the teacher model's predictions. + student_scores: Scores representing the student model's predictions. + + RETURNS (Tuple[float, float]): The loss and the gradient. + + DOCS: https://spacy.io/api/pipe#get_teacher_student_loss + """ + raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_teacher_student_loss", name=self.name)) + def create_optimizer(self) -> Optimizer: """Create an optimizer for the pipeline component. @@ -204,6 +265,14 @@ cdef class TrainablePipe(Pipe): """ raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name)) + @property + def is_distillable(self) -> bool: + # Normally a pipe overrides `get_teacher_student_loss` to implement + # distillation. In more exceptional cases, a pipe can provide its + # own `distill` implementation. If neither of these methods is + # overridden, the pipe does not implement distillation. 
+ return not (self.__class__.distill is TrainablePipe.distill and self.__class__.get_teacher_student_loss is TrainablePipe.get_teacher_student_loss) + @property def is_trainable(self) -> bool: return True @@ -342,3 +411,11 @@ cdef class TrainablePipe(Pipe): deserialize["model"] = load_model util.from_disk(path, deserialize, exclude) return self + + @property + def save_activations(self): + return self._save_activations + + @save_activations.setter + def save_activations(self, save_activations: bool): + self._save_activations = save_activations diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd deleted file mode 100644 index 1521fde60..000000000 --- a/spacy/pipeline/transition_parser.pxd +++ /dev/null @@ -1,20 +0,0 @@ -from cymem.cymem cimport Pool -from thinc.backends.cblas cimport CBlas - -from ..vocab cimport Vocab -from .trainable_pipe cimport TrainablePipe -from ._parser_internals.transition_system cimport Transition, TransitionSystem -from ._parser_internals._state cimport StateC -from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC - - -cdef class Parser(TrainablePipe): - cdef public object _rehearsal_model - cdef readonly TransitionSystem moves - cdef public object _multitasks - - cdef void _parseC(self, CBlas cblas, StateC** states, - WeightsC weights, SizesC sizes) nogil - - cdef void c_transition_batch(self, StateC** states, const float* scores, - int nr_class, int batch_size) nogil diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index ed58b41a5..086938dc6 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -1,5 +1,6 @@ # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True from __future__ import print_function +from typing import Dict, Iterable, List, Optional, Tuple from cymem.cymem cimport Pool cimport numpy as np from itertools import islice @@ -7,33 +8,43 @@ from libcpp.vector cimport vector from libc.string cimport memset, memcpy from libc.stdlib cimport calloc, free import random +import contextlib import srsly -from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps -from thinc.extra.search cimport Beam +from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer +from thinc.api import chain, softmax_activation, use_ops, get_array_module +from thinc.legacy import LegacySequenceCategoricalCrossentropy +from thinc.types import Floats2d, Ints1d import numpy.random import numpy import warnings -from ._parser_internals.stateclass cimport StateClass -from ..ml.parser_model cimport alloc_activations, free_activations -from ..ml.parser_model cimport predict_states, arg_max_if_valid -from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss -from ..ml.parser_model cimport get_c_weights, get_c_sizes +from ..ml.tb_framework import TransitionModelInputs +from ._parser_internals.stateclass cimport StateC, StateClass +from ._parser_internals.search cimport Beam from ..tokens.doc cimport Doc -from .trainable_pipe import TrainablePipe +from .trainable_pipe cimport TrainablePipe from ._parser_internals cimport _beam_utils from ._parser_internals import _beam_utils +from ..vocab cimport Vocab +from ._parser_internals.transition_system cimport Transition, TransitionSystem +from ..typedefs cimport weight_t from ..training import validate_examples, validate_get_examples +from ..training import validate_distillation_examples from ..errors import Errors, Warnings from .. 
import util

+# TODO: Remove when we switch to Cython 3.
+cdef extern from "<algorithm>" namespace "std" nogil:
+    bint equal[InputIt1, InputIt2](InputIt1 first1, InputIt1 last1, InputIt2 first2) except +
+
+
 NUMPY_OPS = NumpyOps()
 
-cdef class Parser(TrainablePipe):
+class Parser(TrainablePipe):
     """
     Base class of the DependencyParser and EntityRecognizer.
     """
@@ -123,6 +134,7 @@ cdef class Parser(TrainablePipe):
 
         self._rehearsal_model = None
         self.scorer = scorer
+        self._cpu_ops = get_ops("cpu") if isinstance(self.model.ops, CupyOps) else self.model.ops
 
     def __getnewargs_ex__(self):
         """This allows pickling the Parser and its keyword-only init arguments"""
@@ -132,8 +144,9 @@ cdef class Parser(TrainablePipe):
     @property
     def move_names(self):
         names = []
+        cdef TransitionSystem moves = self.moves
         for i in range(self.moves.n_moves):
-            name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
+            name = self.moves.move_name(moves.c[i].move, moves.c[i].label)
             # Explicitly removing the internal "U-" token used for blocking entities
             if name != "U-":
                 names.append(name)
@@ -202,6 +215,118 @@ cdef class Parser(TrainablePipe):
         # Defined in subclasses, to avoid circular import
         raise NotImplementedError
 
+    def distill(self,
+            teacher_pipe: Optional[TrainablePipe],
+            examples: Iterable["Example"],
+            *,
+            drop: float=0.0,
+            sgd: Optional[Optimizer]=None,
+            losses: Optional[Dict[str, float]]=None):
+        """Train a pipe (the student) on the predictions of another pipe
+        (the teacher). The student is trained on the transition probabilities
+        of the teacher.
+
+        teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
+            from.
+        examples (Iterable[Example]): Distillation examples. The reference
+            (teacher) and predicted (student) docs must have the same number of
+            tokens and the same orthography.
+        drop (float): dropout rate.
+        sgd (Optional[Optimizer]): An optimizer. Will be created via
+            create_optimizer if not set.
+        losses (Optional[Dict[str, float]]): Optional record of loss during
+            distillation.
+        RETURNS: The updated losses dictionary.
+
+        DOCS: https://spacy.io/api/dependencyparser#distill
+        """
+        if teacher_pipe is None:
+            raise ValueError(Errors.E4002.format(name=self.name))
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)
+
+        validate_distillation_examples(examples, "TransitionParser.distill")
+
+        set_dropout_rate(self.model, drop)
+
+        student_docs = [eg.predicted for eg in examples]
+
+        max_moves = self.cfg["update_with_oracle_cut_size"]
+        if max_moves >= 1:
+            # Chop sequences into lengths of this many words, to make the
+            # batch uniform length. Since we do not have a gold standard
+            # sequence, we use the teacher's predictions as the gold
+            # standard.
+            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
+            states = self._init_batch_from_teacher(teacher_pipe, student_docs, max_moves)
+        else:
+            states = self.moves.init_batch(student_docs)
+
+        # We distill as follows: (1) we first let the student predict transition
+        # sequences (and the corresponding transition probabilities); (2) we
+        # let the teacher follow the student's predicted transition sequences
+        # to obtain the teacher's transition probabilities; (3) we compute the
+        # gradients of the student's transition distributions relative to the
+        # teacher's distributions.
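To make the distillation procedure sketched in the comment above concrete, here is a minimal usage sketch. It is not part of this diff; the use of en_core_web_sm as the teacher and the exact initialization steps are assumptions, and in practice the student must share the teacher's label inventory so the score shapes line up.

    import spacy
    from spacy.training import Example

    teacher = spacy.load("en_core_web_sm")      # assumed trained teacher pipeline
    student = spacy.blank("en")
    student_parser = student.add_pipe("parser")

    # The student must predict over the same transition labels as the teacher.
    for label in teacher.get_pipe("parser").labels:
        student_parser.add_label(label)

    texts = ["This is a short example sentence."]
    examples = []
    for text in texts:
        # predicted = student's doc, reference = teacher's doc; both must have
        # the same tokens, which holds here since both use the English tokenizer.
        examples.append(Example(student.make_doc(text), teacher(text)))

    optimizer = student.initialize(lambda: examples)
    losses = {}
    student_parser.distill(teacher.get_pipe("parser"), examples,
                           sgd=optimizer, losses=losses)
    print(losses)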
+ + student_inputs = TransitionModelInputs(docs=student_docs, + states=[state.copy() for state in states], moves=self.moves, max_moves=max_moves) + (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs) + actions = _states_diff_to_actions(states, student_states) + teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples], + states=states, moves=teacher_pipe.moves, actions=actions) + (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs) + + loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores) + backprop_scores((student_states, d_scores)) + + if sgd is not None: + self.finish_update(sgd) + + losses[self.name] += loss + + return losses + + + def get_teacher_student_loss( + self, teacher_scores: List[Floats2d], student_scores: List[Floats2d], + normalize: bool=False, + ) -> Tuple[float, List[Floats2d]]: + """Calculate the loss and its gradient for a batch of student + scores, relative to teacher scores. + + teacher_scores: Scores representing the teacher model's predictions. + student_scores: Scores representing the student model's predictions. + + RETURNS (Tuple[float, float]): The loss and the gradient. + + DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss + """ + + # We can't easily hook up a softmax layer in the parsing model, since + # the get_loss does additional masking. So, we could apply softmax + # manually here and use Thinc's cross-entropy loss. But it's a bit + # suboptimal, since we can have a lot of states that would result in + # many kernel launches. Futhermore the parsing model's backprop expects + # a XP array, so we'd have to concat the softmaxes anyway. So, like + # the get_loss implementation, we'll compute the loss and gradients + # ourselves. + + teacher_scores = self.model.ops.softmax(self.model.ops.xp.vstack(teacher_scores), + axis=-1, inplace=True) + student_scores = self.model.ops.softmax(self.model.ops.xp.vstack(student_scores), + axis=-1, inplace=True) + + assert teacher_scores.shape == student_scores.shape + + d_scores = student_scores - teacher_scores + if normalize: + d_scores /= d_scores.shape[0] + loss = (d_scores**2).sum() / d_scores.size + + return float(loss), d_scores + def init_multitask_objectives(self, get_examples, pipeline, **cfg): """Setup models for secondary objectives, to benefit from multi-task learning. This method is intended to be overridden by subclasses. @@ -222,9 +347,6 @@ cdef class Parser(TrainablePipe): stream: The sequence of documents to process. batch_size (int): Number of documents to accumulate into a working set. - error_handler (Callable[[str, List[Doc], Exception], Any]): Function that - deals with a failing batch of documents. The default function just reraises - the exception. YIELDS (Doc): Documents, in order. 
""" @@ -246,83 +368,29 @@ cdef class Parser(TrainablePipe): def predict(self, docs): if isinstance(docs, Doc): docs = [docs] + self._ensure_labels_are_added(docs) if not any(len(doc) for doc in docs): result = self.moves.init_batch(docs) return result - if self.cfg["beam_width"] == 1: - return self.greedy_parse(docs, drop=0.0) - else: - return self.beam_parse( - docs, - drop=0.0, - beam_width=self.cfg["beam_width"], - beam_density=self.cfg["beam_density"] - ) + with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]): + inputs = TransitionModelInputs(docs=docs, moves=self.moves) + states_or_beams, _ = self.model.predict(inputs) + return states_or_beams def greedy_parse(self, docs, drop=0.): - cdef vector[StateC*] states - cdef StateClass state - ops = self.model.ops - cdef CBlas cblas - if isinstance(ops, CupyOps): - cblas = NUMPY_OPS.cblas() - else: - cblas = ops.cblas() + self._resize() self._ensure_labels_are_added(docs) - set_dropout_rate(self.model, drop) - batch = self.moves.init_batch(docs) - model = self.model.predict(docs) - weights = get_c_weights(model) - for state in batch: - if not state.is_final(): - states.push_back(state.c) - sizes = get_c_sizes(model, states.size()) - with nogil: - self._parseC(cblas, &states[0], weights, sizes) - model.clear_memory() - del model - return batch + with _change_attrs(self.model, beam_width=1): + inputs = TransitionModelInputs(docs=docs, moves=self.moves) + states, _ = self.model.predict(inputs) + return states def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.): - cdef Beam beam - cdef Doc doc self._ensure_labels_are_added(docs) - batch = _beam_utils.BeamBatch( - self.moves, - self.moves.init_batch(docs), - None, - beam_width, - density=beam_density - ) - model = self.model.predict(docs) - while not batch.is_done: - states = batch.get_unfinished_states() - if not states: - break - scores = model.predict(states) - batch.advance(scores) - model.clear_memory() - del model - return list(batch) - - cdef void _parseC(self, CBlas cblas, StateC** states, - WeightsC weights, SizesC sizes) nogil: - cdef int i, j - cdef vector[StateC*] unfinished - cdef ActivationsC activations = alloc_activations(sizes) - while sizes.states >= 1: - predict_states(cblas, &activations, states, &weights, sizes) - # Validate actions, argmax, take action. 
- self.c_transition_batch(states, - activations.scores, sizes.classes, sizes.states) - for i in range(sizes.states): - if not states[i].is_final(): - unfinished.push_back(states[i]) - for i in range(unfinished.size()): - states[i] = unfinished[i] - sizes.states = unfinished.size() - unfinished.clear() - free_activations(&activations) + with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]): + inputs = TransitionModelInputs(docs=docs, moves=self.moves) + beams, _ = self.model.predict(inputs) + return beams def set_annotations(self, docs, states_or_beams): cdef StateClass state @@ -334,35 +402,6 @@ cdef class Parser(TrainablePipe): for hook in self.postprocesses: hook(doc) - def transition_states(self, states, float[:, ::1] scores): - cdef StateClass state - cdef float* c_scores = &scores[0, 0] - cdef vector[StateC*] c_states - for state in states: - c_states.push_back(state.c) - self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0]) - return [state for state in states if not state.c.is_final()] - - cdef void c_transition_batch(self, StateC** states, const float* scores, - int nr_class, int batch_size) nogil: - # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc - with gil: - assert self.moves.n_moves > 0, Errors.E924.format(name=self.name) - is_valid = calloc(self.moves.n_moves, sizeof(int)) - cdef int i, guess - cdef Transition action - for i in range(batch_size): - self.moves.set_valid(is_valid, states[i]) - guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class) - if guess == -1: - # This shouldn't happen, but it's hard to raise an error here, - # and we don't want to infinite loop. So, force to end state. - states[i].force_final() - else: - action = self.moves.c[guess] - action.do(states[i], action.label) - free(is_valid) - def update(self, examples, *, drop=0., sgd=None, losses=None): cdef StateClass state if losses is None: @@ -374,67 +413,99 @@ cdef class Parser(TrainablePipe): ) for multitask in self._multitasks: multitask.update(examples, drop=drop, sgd=sgd) + # We need to take care to act on the whole batch, because we might be + # getting vectors via a listener. n_examples = len([eg for eg in examples if self.moves.has_gold(eg)]) if n_examples == 0: return losses set_dropout_rate(self.model, drop) - # The probability we use beam update, instead of falling back to - # a greedy update - beam_update_prob = self.cfg["beam_update_prob"] - if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob: - return self.update_beam( - examples, - beam_width=self.cfg["beam_width"], - sgd=sgd, - losses=losses, - beam_density=self.cfg["beam_density"] - ) + docs = [eg.x for eg in examples if len(eg.x)] + max_moves = self.cfg["update_with_oracle_cut_size"] if max_moves >= 1: # Chop sequences into lengths of this many words, to make the # batch uniform length. 
- max_moves = int(random.uniform(max_moves // 2, max_moves * 2)) - states, golds, _ = self._init_gold_batch( + max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2)) + init_states, gold_states, _ = self._init_gold_batch( examples, max_length=max_moves ) else: - states, golds, _ = self.moves.init_gold_batch(examples) - if not states: - return losses - model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples]) - - all_states = list(states) - states_golds = list(zip(states, golds)) - n_moves = 0 - while states_golds: - states, golds = zip(*states_golds) - scores, backprop = model.begin_update(states) - d_scores = self.get_batch_loss(states, golds, scores, losses) - # Note that the gradient isn't normalized by the batch size - # here, because our "samples" are really the states...But we - # can't normalize by the number of states either, as then we'd - # be getting smaller gradients for states in long sequences. - backprop(d_scores) - # Follow the predicted action - self.transition_states(states, scores) - states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()] - if max_moves >= 1 and n_moves >= max_moves: - break - n_moves += 1 + init_states, gold_states, _ = self.moves.init_gold_batch(examples) - backprop_tok2vec(golds) + inputs = TransitionModelInputs(docs=docs, moves=self.moves, + max_moves=max_moves, states=[state.copy() for state in init_states]) + (pred_states, scores), backprop_scores = self.model.begin_update(inputs) + if sum(s.shape[0] for s in scores) == 0: + return losses + d_scores = self.get_loss((gold_states, init_states, pred_states, scores), + examples, max_moves) + backprop_scores((pred_states, d_scores)) if sgd not in (None, False): self.finish_update(sgd) + losses[self.name] += float((d_scores**2).sum()) # Ugh, this is annoying. If we're working on GPU, we want to free the # memory ASAP. It seems that Python doesn't necessarily get around to # removing these in time if we don't explicitly delete? It's confusing. 
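As a toy illustration of the sequence-chopping strategy used in update and _init_gold_batch above: the cut size is jittered and a long oracle sequence is split into windows of at most that many transitions, so all states in a batch advance a similar number of steps. The numbers below are made up.

    import random

    oracle_actions = list(range(23))   # pretend transition sequence for one long doc
    max_moves = 8
    # Jitter the cap, as in the parser's update/distill code.
    max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
    windows = [oracle_actions[i:i + max_moves]
               for i in range(0, len(oracle_actions), max_moves)]
    print(max_moves, [len(w) for w in windows])   # each window has at most max_moves actions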
- del backprop - del backprop_tok2vec - model.clear_memory() - del model + del backprop_scores return losses + def get_loss(self, states_scores, examples, max_moves): + gold_states, init_states, pred_states, scores = states_scores + scores = self.model.ops.xp.vstack(scores) + costs = self._get_costs_from_histories( + examples, + gold_states, + init_states, + [list(state.history) for state in pred_states], + max_moves + ) + xp = get_array_module(scores) + best_costs = costs.min(axis=1, keepdims=True) + gscores = scores.copy() + min_score = scores.min() - 1000 + assert costs.shape == scores.shape, (costs.shape, scores.shape) + gscores[costs > best_costs] = min_score + max_ = scores.max(axis=1, keepdims=True) + gmax = gscores.max(axis=1, keepdims=True) + exp_scores = xp.exp(scores - max_) + exp_gscores = xp.exp(gscores - gmax) + Z = exp_scores.sum(axis=1, keepdims=True) + gZ = exp_gscores.sum(axis=1, keepdims=True) + d_scores = exp_scores / Z + d_scores -= (costs <= best_costs) * (exp_gscores / gZ) + return d_scores + + def _get_costs_from_histories(self, examples, gold_states, init_states, histories, max_moves): + cdef TransitionSystem moves = self.moves + cdef StateClass state + cdef int clas + cdef int nF = self.model.get_dim("nF") + cdef int nO = moves.n_moves + cdef int nS = sum([len(history) for history in histories]) + cdef Pool mem = Pool() + cdef np.ndarray costs_i + is_valid = mem.alloc(nO, sizeof(int)) + batch = list(zip(init_states, histories, gold_states)) + n_moves = 0 + output = [] + while batch: + costs = numpy.zeros((len(batch), nO), dtype="f") + for i, (state, history, gold) in enumerate(batch): + costs_i = costs[i] + clas = history.pop(0) + moves.set_costs(is_valid, costs_i.data, state.c, gold) + action = moves.c[clas] + action.do(state.c, action.label) + state.c.history.push_back(clas) + output.append(costs) + batch = [(s, h, g) for s, h, g in batch if len(h) != 0] + if n_moves >= max_moves >= 1: + break + n_moves += 1 + + return self.model.ops.xp.vstack(output) + def rehearse(self, examples, sgd=None, losses=None, **cfg): """Perform a "rehearsal" update, to prevent catastrophic forgetting.""" if losses is None: @@ -447,7 +518,6 @@ cdef class Parser(TrainablePipe): losses.setdefault(self.name+"_rehearse", 0.) validate_examples(examples, "Parser.rehearse") docs = [eg.predicted for eg in examples] - states = self.moves.init_batch(docs) # This is pretty dirty, but the NER can resize itself in init_batch, # if labels are missing. We therefore have to check whether we need to # expand our model output. @@ -455,85 +525,33 @@ cdef class Parser(TrainablePipe): # Prepare the stepwise model, and get the callback for finishing the batch set_dropout_rate(self._rehearsal_model, 0.0) set_dropout_rate(self.model, 0.0) - tutor, _ = self._rehearsal_model.begin_update(docs) - model, backprop_tok2vec = self.model.begin_update(docs) - n_scores = 0. - loss = 0. - while states: - targets, _ = tutor.begin_update(states) - guesses, backprop = model.begin_update(states) - d_scores = (guesses - targets) / targets.shape[0] - # If all weights for an output are 0 in the original model, don't - # supervise that output. This allows us to add classes. 
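The new get_loss above computes, for every state, the gradient of a cross-entropy between the predicted action distribution and a softmax restricted to the minimal-cost ("gold") actions. A small NumPy sketch of the same computation on made-up scores and costs:

    import numpy as np

    scores = np.array([[2.0, 1.0, 0.5, -1.0]], dtype="f")   # one state, four actions
    costs = np.array([[0.0, 0.0, 2.0, 3.0]], dtype="f")     # two actions are optimal

    best_costs = costs.min(axis=1, keepdims=True)
    gscores = scores.copy()
    gscores[costs > best_costs] = scores.min() - 1000       # mask out non-optimal actions

    exp_scores = np.exp(scores - scores.max(axis=1, keepdims=True))
    exp_gscores = np.exp(gscores - gscores.max(axis=1, keepdims=True))
    d_scores = exp_scores / exp_scores.sum(axis=1, keepdims=True)
    d_scores -= (costs <= best_costs) * (exp_gscores / exp_gscores.sum(axis=1, keepdims=True))
    print(d_scores)   # rows sum to ~0; probability mass is pushed toward the optimal actions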
- loss += (d_scores**2).sum() - backprop(d_scores) - # Follow the predicted action - self.transition_states(states, guesses) - states = [state for state in states if not state.is_final()] - n_scores += d_scores.size - # Do the backprop - backprop_tok2vec(docs) + student_inputs = TransitionModelInputs(docs=docs, moves=self.moves) + (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs) + actions = _states_to_actions(student_states) + teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions) + _, teacher_scores = self._rehearsal_model.predict(teacher_inputs) + + loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores, normalize=True) + + teacher_scores = self.model.ops.xp.vstack(teacher_scores) + student_scores = self.model.ops.xp.vstack(student_scores) + assert teacher_scores.shape == student_scores.shape + + d_scores = (student_scores - teacher_scores) / teacher_scores.shape[0] + # If all weights for an output are 0 in the original model, don't + # supervise that output. This allows us to add classes. + loss = (d_scores**2).sum() / d_scores.size + backprop_scores((student_states, d_scores)) + if sgd is not None: self.finish_update(sgd) - losses[self.name+"_rehearse"] += loss / n_scores - del backprop - del backprop_tok2vec - model.clear_memory() - tutor.clear_memory() - del model - del tutor + losses[self.name+"_rehearse"] += loss + return losses def update_beam(self, examples, *, beam_width, drop=0., sgd=None, losses=None, beam_density=0.0): - states, golds, _ = self.moves.init_gold_batch(examples) - if not states: - return losses - # Prepare the stepwise model, and get the callback for finishing the batch - model, backprop_tok2vec = self.model.begin_update( - [eg.predicted for eg in examples]) - loss = _beam_utils.update_beam( - self.moves, - states, - golds, - model, - beam_width, - beam_density=beam_density, - ) - losses[self.name] += loss - backprop_tok2vec(golds) - if sgd is not None: - self.finish_update(sgd) - - def get_batch_loss(self, states, golds, float[:, ::1] scores, losses): - cdef StateClass state - cdef Pool mem = Pool() - cdef int i - - # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc - assert self.moves.n_moves > 0, Errors.E924.format(name=self.name) - - is_valid = mem.alloc(self.moves.n_moves, sizeof(int)) - costs = mem.alloc(self.moves.n_moves, sizeof(float)) - cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves), - dtype='f', order='C') - c_d_scores = d_scores.data - unseen_classes = self.model.attrs["unseen_classes"] - for i, (state, gold) in enumerate(zip(states, golds)): - memset(is_valid, 0, self.moves.n_moves * sizeof(int)) - memset(costs, 0, self.moves.n_moves * sizeof(float)) - self.moves.set_costs(is_valid, costs, state.c, gold) - for j in range(self.moves.n_moves): - if costs[j] <= 0.0 and j in unseen_classes: - unseen_classes.remove(j) - cpu_log_loss(c_d_scores, - costs, is_valid, &scores[i, 0], d_scores.shape[1]) - c_d_scores += d_scores.shape[1] - # Note that we don't normalize this. See comment in update() for why. - if losses is not None: - losses.setdefault(self.name, 0.) 
- losses[self.name] += (d_scores**2).sum() - return d_scores + raise NotImplementedError def set_output(self, nO): self.model.attrs["resize_output"](self.model, nO) @@ -572,7 +590,7 @@ cdef class Parser(TrainablePipe): for example in islice(get_examples(), 10): doc_sample.append(example.predicted) assert len(doc_sample) > 0, Errors.E923.format(name=self.name) - self.model.initialize(doc_sample) + self.model.initialize((doc_sample, self.moves)) if nlp is not None: self.init_multitask_objectives(get_examples, nlp.pipeline) @@ -629,28 +647,75 @@ cdef class Parser(TrainablePipe): raise ValueError(Errors.E149) from None return self - def _init_gold_batch(self, examples, max_length): - """Make a square batch, of length equal to the shortest transition + def _init_batch_from_teacher(self, teacher_pipe, docs, max_length): + """Make a square batch of length equal to the shortest transition sequence or a cap. A long doc will get multiple states. Let's say we have a doc of length 2*N, where N is the shortest doc. We'll make two states, one representing - long_doc[:N], and another representing long_doc[N:].""" + long_doc[:N], and another representing long_doc[N:]. In contrast to + _init_gold_batch, this version uses a teacher model to generate the + cut sequences.""" + cdef: + StateClass state + TransitionSystem moves = teacher_pipe.moves + + # Start with the same heuristic as in supervised training: exclude + # docs that are within the maximum length. + all_states = moves.init_batch(docs) + states = [] + to_cut = [] + for state, doc in zip(all_states, docs): + if not state.is_final(): + if len(doc) < max_length: + states.append(state) + else: + to_cut.append(state) + + if not to_cut: + return states + + # Parse the states that are too long with the teacher's parsing model. + teacher_inputs = TransitionModelInputs(docs=docs, moves=moves, + states=[state.copy() for state in to_cut]) + (teacher_states, _ ) = teacher_pipe.model.predict(teacher_inputs) + + # Step through the teacher's actions and store every state after + # each multiple of max_length. + teacher_actions = _states_to_actions(teacher_states) + while to_cut: + states.extend(state.copy() for state in to_cut) + for step_actions in teacher_actions[:max_length]: + to_cut = moves.apply_actions(to_cut, step_actions) + teacher_actions = teacher_actions[max_length:] + + if len(teacher_actions) < max_length: + break + + return states + + def _init_gold_batch(self, examples, max_length): + """Make a square batch, of length equal to the shortest transition + sequence or a cap. A long doc will get multiple states. Let's say we + have a doc of length 2*N, where N is the shortest doc. 
We'll make + two states, one representing long_doc[:N], and another representing + long_doc[N:].""" cdef: StateClass start_state StateClass state Transition action - all_states = self.moves.init_batch([eg.predicted for eg in examples]) + TransitionSystem moves = self.moves + all_states = moves.init_batch([eg.predicted for eg in examples]) states = [] golds = [] to_cut = [] for state, eg in zip(all_states, examples): - if self.moves.has_gold(eg) and not state.is_final(): - gold = self.moves.init_gold(state, eg) + if moves.has_gold(eg) and not state.is_final(): + gold = moves.init_gold(state, eg) if len(eg.x) < max_length: states.append(state) golds.append(gold) else: - oracle_actions = self.moves.get_oracle_sequence_from_state( + oracle_actions = moves.get_oracle_sequence_from_state( state.copy(), gold) to_cut.append((eg, state, gold, oracle_actions)) if not to_cut: @@ -660,13 +725,94 @@ cdef class Parser(TrainablePipe): for i in range(0, len(oracle_actions), max_length): start_state = state.copy() for clas in oracle_actions[i:i+max_length]: - action = self.moves.c[clas] + action = moves.c[clas] action.do(state.c, action.label) if state.is_final(): break - if self.moves.has_gold(eg, start_state.B(0), state.B(0)): + if moves.has_gold(eg, start_state.B(0), state.B(0)): states.append(start_state) golds.append(gold) if state.is_final(): break return states, golds, max_length + + +@contextlib.contextmanager +def _change_attrs(model, **kwargs): + """Temporarily modify a thinc model's attributes.""" + unset = object() + old_attrs = {} + for key, value in kwargs.items(): + old_attrs[key] = model.attrs.get(key, unset) + model.attrs[key] = value + yield model + for key, value in old_attrs.items(): + if value is unset: + model.attrs.pop(key) + else: + model.attrs[key] = value + + +def _states_to_actions(states: List[StateClass]) -> List[Ints1d]: + cdef int step + cdef StateClass state + cdef StateC* c_state + actions = [] + while True: + step = len(actions) + + step_actions = [] + for state in states: + c_state = state.c + if step < c_state.history.size(): + step_actions.append(c_state.history[step]) + + # We are done if we have exhausted all histories. + if len(step_actions) == 0: + break + + actions.append(numpy.array(step_actions, dtype="i")) + + return actions + +def _states_diff_to_actions( + before_states: List[StateClass], + after_states: List[StateClass] +) -> List[Ints1d]: + """ + Return for two sets of states the actions to go from the first set of + states to the second set of states. The histories of the first set of + states must be a prefix of the second set of states. + """ + cdef StateClass before_state, after_state + cdef StateC* c_state_before + cdef StateC* c_state_after + + assert len(before_states) == len(after_states) + + # Check invariant: before states histories must be prefixes of after states. + for before_state, after_state in zip(before_states, after_states): + c_state_before = before_state.c + c_state_after = after_state.c + + assert equal(c_state_before.history.begin(), c_state_before.history.end(), + c_state_after.history.begin()) + + actions = [] + while True: + step = len(actions) + + step_actions = [] + for before_state, after_state in zip(before_states, after_states): + c_state_before = before_state.c + c_state_after = after_state.c + if step < c_state_after.history.size() - c_state_before.history.size(): + step_actions.append(c_state_after.history[c_state_before.history.size() + step]) + + # We are done if we have exhausted all histories. 
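The _states_to_actions helper here transposes per-state transition histories into per-step batches of actions; a pure-Python sketch of the same transposition with toy histories:

    import numpy as np

    histories = [[1, 4, 2], [1, 3], [0, 4, 2, 2]]   # one action history per state
    actions = []
    step = 0
    while True:
        step_actions = [h[step] for h in histories if step < len(h)]
        if not step_actions:   # all histories exhausted
            break
        actions.append(np.array(step_actions, dtype="i"))
        step += 1
    # actions == [array([1, 1, 0]), array([4, 3, 4]), array([2, 2]), array([2])]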
+ if len(step_actions) == 0: + break + + actions.append(numpy.array(step_actions, dtype="i")) + + return actions diff --git a/spacy/schemas.py b/spacy/schemas.py index f707c7dcf..3e45c8fa9 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -1,6 +1,5 @@ from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple -from typing import Iterable, TypeVar, TYPE_CHECKING -from .compat import Literal +from typing import Iterable, TypeVar, Literal, TYPE_CHECKING from enum import Enum from pydantic import BaseModel, Field, ValidationError, validator, create_model from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, ConstrainedStr @@ -144,7 +143,7 @@ def validate_init_settings( def validate_token_pattern(obj: list) -> List[str]: # Try to convert non-string keys (e.g. {ORTH: "foo"} -> {"ORTH": "foo"}) - get_key = lambda k: NAMES[k] if isinstance(k, int) and k < len(NAMES) else k + get_key = lambda k: NAMES[k] if isinstance(k, int) and k in NAMES else k if isinstance(obj, list): converted = [] for pattern in obj: @@ -424,6 +423,27 @@ class ConfigSchemaInit(BaseModel): arbitrary_types_allowed = True +class ConfigSchemaDistillEmpty(BaseModel): + class Config: + extra = "forbid" + + +class ConfigSchemaDistill(BaseModel): + # fmt: off + batcher: Batcher = Field(..., title="Batcher for the training data") + corpus: StrictStr = Field(..., title="Path in the config to the distillation data") + dropout: StrictFloat = Field(..., title="Dropout rate") + max_epochs: StrictInt = Field(..., title="Maximum number of epochs to distill for") + max_steps: StrictInt = Field(..., title="Maximum number of steps to distill for") + optimizer: Optimizer = Field(..., title="The optimizer to use") + student_to_teacher: Dict[str, str] = Field(..., title="Mapping from student to teacher pipe") + # fmt: on + + class Config: + extra = "forbid" + arbitrary_types_allowed = True + + class ConfigSchema(BaseModel): training: ConfigSchemaTraining nlp: ConfigSchemaNlp @@ -431,6 +451,7 @@ class ConfigSchema(BaseModel): components: Dict[str, Dict[str, Any]] corpora: Dict[str, Reader] initialize: ConfigSchemaInit + distillation: Union[ConfigSchemaDistill, ConfigSchemaDistillEmpty] = {} # type: ignore[assignment] class Config: extra = "allow" @@ -442,6 +463,7 @@ CONFIG_SCHEMAS = { "training": ConfigSchemaTraining, "pretraining": ConfigSchemaPretrain, "initialize": ConfigSchemaInit, + "distill": ConfigSchemaDistill, } diff --git a/spacy/scorer.py b/spacy/scorer.py index de4f52be6..095effdcf 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -104,7 +104,7 @@ class Scorer: def __init__( self, nlp: Optional["Language"] = None, - default_lang: str = "xx", + default_lang: str = "mul", default_pipeline: Iterable[str] = DEFAULT_PIPELINE, **cfg, ) -> None: diff --git a/spacy/strings.pxd b/spacy/strings.pxd index 5f03a9a28..0c1a30fe3 100644 --- a/spacy/strings.pxd +++ b/spacy/strings.pxd @@ -1,4 +1,4 @@ -from libc.stdint cimport int64_t +from libc.stdint cimport int64_t, uint32_t from libcpp.vector cimport vector from libcpp.set cimport set from cymem.cymem cimport Pool @@ -7,13 +7,6 @@ from murmurhash.mrmr cimport hash64 from .typedefs cimport attr_t, hash_t - -cpdef hash_t hash_string(str string) except 0 -cdef hash_t hash_utf8(char* utf8_string, int length) nogil - -cdef str decode_Utf8Str(const Utf8Str* string) - - ctypedef union Utf8Str: unsigned char[8] s unsigned char* p @@ -21,9 +14,13 @@ ctypedef union Utf8Str: cdef class StringStore: cdef Pool mem + cdef vector[hash_t] _keys + cdef PreshMap _map - cdef 
vector[hash_t] keys - cdef public PreshMap _map + cdef hash_t _intern_str(self, str string) + cdef Utf8Str* _allocate_str_repr(self, const unsigned char* chars, uint32_t length) except * + cdef str _decode_str_repr(self, const Utf8Str* string) - cdef const Utf8Str* intern_unicode(self, str py_string) - cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash) + +cpdef hash_t hash_string(object string) except -1 +cpdef hash_t get_string_id(object string_or_hash) except -1 diff --git a/spacy/strings.pyi b/spacy/strings.pyi index b29389b9a..d9509ff57 100644 --- a/spacy/strings.pyi +++ b/spacy/strings.pyi @@ -1,21 +1,20 @@ -from typing import Optional, Iterable, Iterator, Union, Any, overload +from typing import List, Optional, Iterable, Iterator, Union, Any, Tuple, overload from pathlib import Path -def get_string_id(key: Union[str, int]) -> int: ... - class StringStore: - def __init__( - self, strings: Optional[Iterable[str]] = ..., freeze: bool = ... - ) -> None: ... + def __init__(self, strings: Optional[Iterable[str]]) -> None: ... @overload - def __getitem__(self, string_or_id: Union[bytes, str]) -> int: ... + def __getitem__(self, string_or_hash: str) -> int: ... @overload - def __getitem__(self, string_or_id: int) -> str: ... - def as_int(self, key: Union[bytes, str, int]) -> int: ... - def as_string(self, key: Union[bytes, str, int]) -> str: ... + def __getitem__(self, string_or_hash: int) -> str: ... + def as_int(self, string_or_hash: Union[str, int]) -> int: ... + def as_string(self, string_or_hash: Union[str, int]) -> str: ... def add(self, string: str) -> int: ... + def items(self) -> List[Tuple[str, int]]: ... + def keys(self) -> List[str]: ... + def values(self) -> List[int]: ... def __len__(self) -> int: ... - def __contains__(self, string: str) -> bool: ... + def __contains__(self, string_or_hash: Union[str, int]) -> bool: ... def __iter__(self) -> Iterator[str]: ... def __reduce__(self) -> Any: ... def to_disk(self, path: Union[str, Path]) -> None: ... @@ -23,3 +22,5 @@ class StringStore: def to_bytes(self, **kwargs: Any) -> bytes: ... def from_bytes(self, bytes_data: bytes, **kwargs: Any) -> StringStore: ... def _reset_and_load(self, strings: Iterable[str]) -> None: ... + +def get_string_id(string_or_hash: Union[str, int]) -> int: ... diff --git a/spacy/strings.pyx b/spacy/strings.pyx index c5f218342..5a037eb9a 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -1,9 +1,10 @@ # cython: infer_types=True +from typing import Optional, Union, Iterable, Tuple, Callable, Any, List, Iterator cimport cython from libc.string cimport memcpy from libcpp.set cimport set from libc.stdint cimport uint32_t -from murmurhash.mrmr cimport hash64, hash32 +from murmurhash.mrmr cimport hash64 import srsly @@ -14,105 +15,13 @@ from .symbols import NAMES as SYMBOLS_BY_INT from .errors import Errors from . import util -# Not particularly elegant, but this is faster than `isinstance(key, numbers.Integral)` -cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash): - try: - out_hash[0] = key - return True - except: - return False - -def get_string_id(key): - """Get a string ID, handling the reserved symbols correctly. If the key is - already an ID, return it. - - This function optimises for convenience over performance, so shouldn't be - used in tight loops. 
- """ - cdef hash_t str_hash - if isinstance(key, str): - if len(key) == 0: - return 0 - - symbol = SYMBOLS_BY_STR.get(key, None) - if symbol is not None: - return symbol - else: - chars = key.encode("utf8") - return hash_utf8(chars, len(chars)) - elif _try_coerce_to_hash(key, &str_hash): - # Coerce the integral key to the expected primitive hash type. - # This ensures that custom/overloaded "primitive" data types - # such as those implemented by numpy are not inadvertently used - # downsteam (as these are internally implemented as custom PyObjects - # whose comparison operators can incur a significant overhead). - return str_hash - else: - # TODO: Raise an error instead - return key - - -cpdef hash_t hash_string(str string) except 0: - chars = string.encode("utf8") - return hash_utf8(chars, len(chars)) - - -cdef hash_t hash_utf8(char* utf8_string, int length) nogil: - return hash64(utf8_string, length, 1) - - -cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil: - return hash32(utf8_string, length, 1) - - -cdef str decode_Utf8Str(const Utf8Str* string): - cdef int i, length - if string.s[0] < sizeof(string.s) and string.s[0] != 0: - return string.s[1:string.s[0]+1].decode("utf8") - elif string.p[0] < 255: - return string.p[1:string.p[0]+1].decode("utf8") - else: - i = 0 - length = 0 - while string.p[i] == 255: - i += 1 - length += 255 - length += string.p[i] - i += 1 - return string.p[i:length + i].decode("utf8") - - -cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *: - cdef int n_length_bytes - cdef int i - cdef Utf8Str* string = mem.alloc(1, sizeof(Utf8Str)) - cdef uint32_t ulength = length - if length < sizeof(string.s): - string.s[0] = length - memcpy(&string.s[1], chars, length) - return string - elif length < 255: - string.p = mem.alloc(length + 1, sizeof(unsigned char)) - string.p[0] = length - memcpy(&string.p[1], chars, length) - return string - else: - i = 0 - n_length_bytes = (length // 255) + 1 - string.p = mem.alloc(length + n_length_bytes, sizeof(unsigned char)) - for i in range(n_length_bytes-1): - string.p[i] = 255 - string.p[n_length_bytes-1] = length % 255 - memcpy(&string.p[n_length_bytes], chars, length) - return string - cdef class StringStore: - """Look up strings by 64-bit hashes. + """Look up strings by 64-bit hashes. Implicitly handles reserved symbols. DOCS: https://spacy.io/api/stringstore """ - def __init__(self, strings=None, freeze=False): + def __init__(self, strings: Optional[Iterable[str]] = None): """Create the StringStore. strings (iterable): A sequence of unicode strings to add to the store. @@ -123,127 +32,126 @@ cdef class StringStore: for string in strings: self.add(string) - def __getitem__(self, object string_or_id): - """Retrieve a string from a given hash, or vice versa. + def __getitem__(self, string_or_hash: Union[str, int]) -> Union[str, int]: + """Retrieve a string from a given hash. If a string + is passed as the input, add it to the store and return + its hash. - string_or_id (bytes, str or uint64): The value to encode. - Returns (str / uint64): The value to be retrieved. + string_or_hash (int / str): The hash value to lookup or the string to store. + RETURNS (str / int): The stored string or the hash of the newly added string. """ - cdef hash_t str_hash - cdef Utf8Str* utf8str = NULL - - if isinstance(string_or_id, str): - if len(string_or_id) == 0: - return 0 - - # Return early if the string is found in the symbols LUT. 
- symbol = SYMBOLS_BY_STR.get(string_or_id, None) - if symbol is not None: - return symbol - else: - return hash_string(string_or_id) - elif isinstance(string_or_id, bytes): - return hash_utf8(string_or_id, len(string_or_id)) - elif _try_coerce_to_hash(string_or_id, &str_hash): - if str_hash == 0: - return "" - elif str_hash < len(SYMBOLS_BY_INT): - return SYMBOLS_BY_INT[str_hash] - else: - utf8str = self._map.get(str_hash) + if isinstance(string_or_hash, str): + return self.add(string_or_hash) else: - # TODO: Raise an error instead - utf8str = self._map.get(string_or_id) + return self._get_interned_str(string_or_hash) - if utf8str is NULL: - raise KeyError(Errors.E018.format(hash_value=string_or_id)) + def __contains__(self, string_or_hash: Union[str, int]) -> bool: + """Check whether a string or a hash is in the store. + + string (str / int): The string/hash to check. + RETURNS (bool): Whether the store contains the string. + """ + cdef hash_t str_hash = get_string_id(string_or_hash) + if str_hash in SYMBOLS_BY_INT: + return True else: - return decode_Utf8Str(utf8str) + return self._map.get(str_hash) is not NULL - def as_int(self, key): - """If key is an int, return it; otherwise, get the int value.""" - if not isinstance(key, str): - return key - else: - return self[key] + def __iter__(self) -> Iterator[str]: + """Iterate over the strings in the store in insertion order. - def as_string(self, key): - """If key is a string, return it; otherwise, get the string value.""" - if isinstance(key, str): - return key - else: - return self[key] + RETURNS: An iterable collection of strings. + """ + return iter(self.keys()) - def add(self, string): + def __reduce__(self): + strings = list(self) + return (StringStore, (strings,), None, None, None) + + def __len__(self) -> int: + """The number of strings in the store. + + RETURNS (int): The number of strings in the store. + """ + return self._keys.size() + + def add(self, string: str) -> int: """Add a string to the StringStore. string (str): The string to add. RETURNS (uint64): The string's hash value. """ - cdef hash_t str_hash - if isinstance(string, str): - if string in SYMBOLS_BY_STR: - return SYMBOLS_BY_STR[string] - - string = string.encode("utf8") - str_hash = hash_utf8(string, len(string)) - self._intern_utf8(string, len(string), &str_hash) - elif isinstance(string, bytes): - if string in SYMBOLS_BY_STR: - return SYMBOLS_BY_STR[string] - str_hash = hash_utf8(string, len(string)) - self._intern_utf8(string, len(string), &str_hash) - else: + if not isinstance(string, str): raise TypeError(Errors.E017.format(value_type=type(string))) - return str_hash - def __len__(self): - """The number of strings in the store. - - RETURNS (int): The number of strings in the store. - """ - return self.keys.size() - - def __contains__(self, string_or_id not None): - """Check whether a string or ID is in the store. - - string_or_id (str or int): The string to check. - RETURNS (bool): Whether the store contains the string. 
- """ - cdef hash_t str_hash - if isinstance(string_or_id, str): - if len(string_or_id) == 0: - return True - elif string_or_id in SYMBOLS_BY_STR: - return True - str_hash = hash_string(string_or_id) - elif _try_coerce_to_hash(string_or_id, &str_hash): - pass + if string in SYMBOLS_BY_STR: + return SYMBOLS_BY_STR[string] else: - # TODO: Raise an error instead - return self._map.get(string_or_id) is not NULL + return self._intern_str(string) - if str_hash < len(SYMBOLS_BY_INT): - return True + def as_int(self, string_or_hash: Union[str, int]) -> str: + """If a hash value is passed as the input, return it as-is. If the input + is a string, return its corresponding hash. + + string_or_hash (str / int): The string to hash or a hash value. + RETURNS (int): The hash of the string or the input hash value. + """ + if isinstance(string_or_hash, int): + return string_or_hash else: - return self._map.get(str_hash) is not NULL + return get_string_id(string_or_hash) - def __iter__(self): - """Iterate over the strings in the store, in order. + def as_string(self, string_or_hash: Union[str, int]) -> str: + """If a string is passed as the input, return it as-is. If the input + is a hash value, return its corresponding string. - YIELDS (str): A string in the store. + string_or_hash (str / int): The hash value to lookup or a string. + RETURNS (str): The stored string or the input string. + """ + if isinstance(string_or_hash, str): + return string_or_hash + else: + return self._get_interned_str(string_or_hash) + + def items(self) -> List[Tuple[str, int]]: + """Iterate over the stored strings and their hashes in insertion order. + + RETURNS: A list of string-hash pairs. + """ + # Even though we internally store the hashes as keys and the strings as + # values, we invert the order in the public API to keep it consistent with + # the implementation of the `__iter__` method (where we wish to iterate over + # the strings in the store). + cdef int i + pairs = [None] * self._keys.size() + for i in range(self._keys.size()): + str_hash = self._keys[i] + utf8str = self._map.get(str_hash) + pairs[i] = (self._decode_str_repr(utf8str), str_hash) + return pairs + + def keys(self) -> List[str]: + """Iterate over the stored strings in insertion order. + + RETURNS: A list of strings. """ cdef int i - cdef hash_t key - for i in range(self.keys.size()): - key = self.keys[i] - utf8str = self._map.get(key) - yield decode_Utf8Str(utf8str) - # TODO: Iterate OOV here? + strings = [None] * self._keys.size() + for i in range(self._keys.size()): + utf8str = self._map.get(self._keys[i]) + strings[i] = self._decode_str_repr(utf8str) + return strings - def __reduce__(self): - strings = list(self) - return (StringStore, (strings,), None, None, None) + def values(self) -> List[int]: + """Iterate over the stored strings hashes in insertion order. + + RETURNS: A list of string hashs. + """ + cdef int i + hashes = [None] * self._keys.size() + for i in range(self._keys.size()): + hashes[i] = self._keys[i] + return hashes def to_disk(self, path): """Save the current state to a directory. @@ -294,24 +202,122 @@ cdef class StringStore: def _reset_and_load(self, strings): self.mem = Pool() self._map = PreshMap() - self.keys.clear() + self._keys.clear() for string in strings: self.add(string) - cdef const Utf8Str* intern_unicode(self, str py_string): - # 0 means missing, but we don't bother offsetting the index. 
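For reference, a minimal usage sketch of the reworked StringStore and get_string_id API shown in this file; the behaviour follows the new docstrings, so treat the exact round-trips as assumptions rather than documentation:

    from spacy.strings import StringStore, get_string_id

    stringstore = StringStore(["apple", "orange"])
    apple_hash = stringstore["apple"]           # a str input interns it and returns its hash
    assert stringstore[apple_hash] == "apple"   # an int input returns the stored string
    assert "apple" in stringstore and apple_hash in stringstore
    assert stringstore.as_int("apple") == apple_hash == get_string_id("apple")
    assert stringstore.as_string(apple_hash) == "apple"
    assert ("apple", apple_hash) in stringstore.items()   # (string, hash) pairs
    assert "orange" in stringstore.keys() and apple_hash in stringstore.values()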
- cdef bytes byte_string = py_string.encode("utf8") - return self._intern_utf8(byte_string, len(byte_string), NULL) + def _get_interned_str(self, hash_value: int) -> str: + cdef hash_t str_hash + if not _try_coerce_to_hash(hash_value, &str_hash): + raise TypeError(Errors.E4001.format(expected_types="'int'", received_type=type(hash_value))) - @cython.final - cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash): + # Handle reserved symbols and empty strings correctly. + if str_hash == 0: + return "" + + symbol = SYMBOLS_BY_INT.get(str_hash) + if symbol is not None: + return symbol + + utf8str = self._map.get(str_hash) + if utf8str is NULL: + raise KeyError(Errors.E018.format(hash_value=str_hash)) + else: + return self._decode_str_repr(utf8str) + + cdef hash_t _intern_str(self, str string): # TODO: This function's API/behaviour is an unholy mess... # 0 means missing, but we don't bother offsetting the index. - cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length) + chars = string.encode('utf-8') + cdef hash_t key = hash64(chars, len(chars), 1) cdef Utf8Str* value = self._map.get(key) if value is not NULL: - return value - value = _allocate(self.mem, utf8_string, length) + return key + + value = self._allocate_str_repr(chars, len(chars)) self._map.set(key, value) - self.keys.push_back(key) - return value + self._keys.push_back(key) + return key + + cdef Utf8Str* _allocate_str_repr(self, const unsigned char* chars, uint32_t length) except *: + cdef int n_length_bytes + cdef int i + cdef Utf8Str* string = self.mem.alloc(1, sizeof(Utf8Str)) + cdef uint32_t ulength = length + if length < sizeof(string.s): + string.s[0] = length + memcpy(&string.s[1], chars, length) + return string + elif length < 255: + string.p = self.mem.alloc(length + 1, sizeof(unsigned char)) + string.p[0] = length + memcpy(&string.p[1], chars, length) + return string + else: + i = 0 + n_length_bytes = (length // 255) + 1 + string.p = self.mem.alloc(length + n_length_bytes, sizeof(unsigned char)) + for i in range(n_length_bytes-1): + string.p[i] = 255 + string.p[n_length_bytes-1] = length % 255 + memcpy(&string.p[n_length_bytes], chars, length) + return string + + cdef str _decode_str_repr(self, const Utf8Str* string): + cdef int i, length + if string.s[0] < sizeof(string.s) and string.s[0] != 0: + return string.s[1:string.s[0]+1].decode('utf-8') + elif string.p[0] < 255: + return string.p[1:string.p[0]+1].decode('utf-8') + else: + i = 0 + length = 0 + while string.p[i] == 255: + i += 1 + length += 255 + length += string.p[i] + i += 1 + return string.p[i:length + i].decode('utf-8') + + +cpdef hash_t hash_string(object string) except -1: + if not isinstance(string, str): + raise TypeError(Errors.E4001.format(expected_types="'str'", received_type=type(string))) + + # Handle reserved symbols and empty strings correctly. + if len(string) == 0: + return 0 + + symbol = SYMBOLS_BY_STR.get(string) + if symbol is not None: + return symbol + + chars = string.encode('utf-8') + return hash64(chars, len(chars), 1) + + +cpdef hash_t get_string_id(object string_or_hash) except -1: + cdef hash_t str_hash + + try: + return hash_string(string_or_hash) + except: + if _try_coerce_to_hash(string_or_hash, &str_hash): + # Coerce the integral key to the expected primitive hash type. 
+ # This ensures that custom/overloaded "primitive" data types + # such as those implemented by numpy are not inadvertently used + # downsteam (as these are internally implemented as custom PyObjects + # whose comparison operators can incur a significant overhead). + return str_hash + else: + raise TypeError(Errors.E4001.format(expected_types="'str','int'", received_type=type(string_or_hash))) + + +# Not particularly elegant, but this is faster than `isinstance(key, numbers.Integral)` +cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash): + try: + out_hash[0] = key + return True + except: + return False + diff --git a/spacy/structs.pxd b/spacy/structs.pxd index 86d5b67ed..b9b6f6ba8 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -58,14 +58,6 @@ cdef struct TokenC: hash_t ent_id -cdef struct MorphAnalysisC: - hash_t key - int length - - attr_t* fields - attr_t* features - - # Internal struct, for storage and disambiguation of entities. cdef struct KBEntryC: diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index bc15d9b80..f5d7784dc 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -1,5 +1,6 @@ +# DO NOT EDIT! The symbols are frozen as of spaCy v3.0.0. cdef enum symbol_t: - NIL + NIL = 0 IS_ALPHA IS_ASCII IS_DIGIT @@ -65,7 +66,7 @@ cdef enum symbol_t: FLAG62 FLAG63 - ID + ID = 64 ORTH LOWER NORM @@ -385,7 +386,7 @@ cdef enum symbol_t: DEPRECATED275 DEPRECATED276 - PERSON + PERSON = 380 NORP FACILITY ORG @@ -405,7 +406,7 @@ cdef enum symbol_t: ORDINAL CARDINAL - acomp + acomp = 398 advcl advmod agent @@ -458,12 +459,12 @@ cdef enum symbol_t: rcmod root xcomp - acl - ENT_KB_ID + ENT_KB_ID = 452 MORPH ENT_ID IDX - _ + _ = 456 + # DO NOT ADD ANY NEW SYMBOLS! diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index b0345c710..fbfc6f10d 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -469,11 +469,7 @@ IDS = { } -def sort_nums(x): - return x[1] - - -NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)] +NAMES = {v: k for k, v in IDS.items()} # Unfortunate hack here, to work around problem with long cpdef enum # (which is generating an enormous amount of C++ in Cython 0.24+) # We keep the enum cdef, and just make sure the names are available to Python diff --git a/spacy/tests/README.md b/spacy/tests/README.md index 82fabcc77..9ac1e6d2e 100644 --- a/spacy/tests/README.md +++ b/spacy/tests/README.md @@ -40,7 +40,7 @@ py.test spacy/tests/tokenizer/test_exceptions.py::test_tokenizer_handles_emoji # To keep the behavior of the tests consistent and predictable, we try to follow a few basic conventions: -- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email` or `test_spans_override_sentiment`. +- **Test names** should follow a pattern of `test_[module]_[tested behaviour]`. For example: `test_tokenizer_keeps_email`. - If you're testing for a bug reported in a specific issue, always create a **regression test**. Regression tests should be named `test_issue[ISSUE NUMBER]` and live in the [`regression`](regression) directory. - Only use `@pytest.mark.xfail` for tests that **should pass, but currently fail**. To test for desired negative behavior, use `assert not` in your test. - Very **extensive tests** that take a long time to run should be marked with `@pytest.mark.slow`. If your slow test is testing important behavior, consider adding an additional simpler version. 
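A small sketch of why the key lookup in schemas.py (earlier in this diff) now uses an `in` check: after the symbols.pyx change above, NAMES is a dict keyed by the (sparse) symbol IDs rather than a dense list. The calls below are assumptions based on that change:

    from spacy.symbols import IDS, NAMES, ORTH

    assert IDS["ORTH"] == ORTH
    assert NAMES[ORTH] == "ORTH"      # dict lookup by symbol ID
    # Convert an int key to its name only if it is a known symbol ID:
    key = NAMES[ORTH] if isinstance(ORTH, int) and ORTH in NAMES else ORTH
    assert key == "ORTH"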
@@ -86,7 +86,7 @@ These are the main fixtures that are currently available: | Fixture | Description | | ----------------------------------- | ---------------------------------------------------------------------------- | -| `tokenizer` | Basic, language-independent tokenizer. Identical to the `xx` language class. | +| `tokenizer` | Basic, language-independent tokenizer. Identical to the `mul` language class. | | `en_tokenizer`, `de_tokenizer`, ... | Creates an English, German etc. tokenizer. | | `en_vocab` | Creates an instance of the English `Vocab`. | diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 3a5c8e451..cc0450cab 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -1,6 +1,10 @@ import pytest from spacy.util import get_lang_class +import functools from hypothesis import settings +import inspect +import importlib +import sys # Functionally disable deadline settings for tests # to prevent spurious test failures in CI builds. @@ -47,12 +51,39 @@ def pytest_runtest_setup(item): pytest.skip("not referencing any issues") +# Decorator for Cython-built tests +# https://shwina.github.io/cython-testing/ +def cytest(func): + """ + Wraps `func` in a plain Python function. + """ + + @functools.wraps(func) + def wrapped(*args, **kwargs): + bound = inspect.signature(func).bind(*args, **kwargs) + return func(*bound.args, **bound.kwargs) + + return wrapped + + +def register_cython_tests(cython_mod_name: str, test_mod_name: str): + """ + Registers all callables with name `test_*` in Cython module `cython_mod_name` + as attributes in module `test_mod_name`, making them discoverable by pytest. + """ + cython_mod = importlib.import_module(cython_mod_name) + for name in dir(cython_mod): + item = getattr(cython_mod, name) + if callable(item) and name.startswith("test_"): + setattr(sys.modules[test_mod_name], name, item) + + # Fixtures for language tokenizers (languages sorted alphabetically) @pytest.fixture(scope="module") def tokenizer(): - return get_lang_class("xx")().tokenizer + return get_lang_class("mul")().tokenizer @pytest.fixture(scope="session") @@ -212,8 +243,8 @@ def id_tokenizer(): @pytest.fixture(scope="session") -def is_tokenizer(): - return get_lang_class("is")().tokenizer +def isl_tokenizer(): + return get_lang_class("isl")().tokenizer @pytest.fixture(scope="session") @@ -239,7 +270,7 @@ def hsb_tokenizer(): @pytest.fixture(scope="session") def ko_tokenizer(): - pytest.importorskip("natto") + pytest.importorskip("mecab_ko") return get_lang_class("ko")().tokenizer @@ -261,6 +292,20 @@ def la_tokenizer(): return get_lang_class("la")().tokenizer +@pytest.fixture(scope="session") +def ko_tokenizer_natto(): + pytest.importorskip("natto") + config = { + "nlp": { + "tokenizer": { + "@tokenizers": "spacy.KoreanNattoTokenizer.v1", + } + } + } + nlp = get_lang_class("ko").from_config(config) + return nlp.tokenizer + + @pytest.fixture(scope="session") def lb_tokenizer(): return get_lang_class("lb")().tokenizer @@ -451,8 +496,8 @@ def vi_tokenizer(): @pytest.fixture(scope="session") -def xx_tokenizer(): - return get_lang_class("xx")().tokenizer +def mul_tokenizer(): + return get_lang_class("mul")().tokenizer @pytest.fixture(scope="session") diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index 231b7c2a8..30d66115f 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -45,6 +45,33 @@ def test_ents_reset(en_vocab): assert [t.ent_iob_ for t in doc] == orig_iobs +def 
test_ents_clear(en_vocab): + """Ensure that removing entities clears token attributes""" + text = ["Louisiana", "Office", "of", "Conservation"] + doc = Doc(en_vocab, words=text) + entity = Span(doc, 0, 4, label=391, span_id="TEST") + doc.ents = [entity] + doc.ents = [] + for token in doc: + assert token.ent_iob == 2 + assert token.ent_type == 0 + assert token.ent_id == 0 + assert token.ent_kb_id == 0 + doc.ents = [entity] + doc.set_ents([], default="missing") + for token in doc: + assert token.ent_iob == 0 + assert token.ent_type == 0 + assert token.ent_id == 0 + assert token.ent_kb_id == 0 + doc.set_ents([], default="blocked") + for token in doc: + assert token.ent_iob == 3 + assert token.ent_type == 0 + assert token.ent_id == 0 + assert token.ent_kb_id == 0 + + def test_add_overlapping_entities(en_vocab): text = ["Louisiana", "Office", "of", "Conservation"] doc = Doc(en_vocab, words=text) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 38003dea9..2009a29d6 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -9,7 +9,7 @@ from thinc.api import NumpyOps, get_current_ops from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS from spacy.attrs import SENT_START, TAG from spacy.lang.en import English -from spacy.lang.xx import MultiLanguage +from spacy.lang.mul import MultiLanguage from spacy.language import Language from spacy.lexeme import Lexeme from spacy.tokens import Doc, Span, SpanGroup, Token @@ -380,9 +380,7 @@ def test_doc_api_serialize(en_tokenizer, text): assert [t.text for t in tokens] == [t.text for t in new_tokens] assert [t.orth for t in tokens] == [t.orth for t in new_tokens] - new_tokens = Doc(tokens.vocab).from_bytes( - tokens.to_bytes(exclude=["sentiment"]), exclude=["sentiment"] - ) + new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes()) assert tokens.text == new_tokens.text assert [t.text for t in tokens] == [t.text for t in new_tokens] assert [t.orth for t in tokens] == [t.orth for t in new_tokens] @@ -990,3 +988,12 @@ def test_doc_spans_setdefault(en_tokenizer): assert len(doc.spans["key2"]) == 1 doc.spans.setdefault("key3", default=SpanGroup(doc, spans=[doc[0:1], doc[1:2]])) assert len(doc.spans["key3"]) == 2 + + +def test_doc_sentiment_from_bytes_v3_to_v4(): + """Test if a doc with sentiment attribute created in v3.x works with '.from_bytes' in v4.x without throwing errors. 
The sentiment attribute was removed in v4""" + doc_bytes = b"\x89\xa4text\xa5happy\xaaarray_head\x9fGQACKOLMN\xcd\x01\xc4\xcd\x01\xc6I\xcd\x01\xc5JP\xaaarray_body\x85\xc4\x02nd\xc3\xc4\x04type\xa3+", 0), + ("over", "fox", ">+", 0), + ("over", "the", ">+", 0), + ("jumped", "over", ">+", 1), ("jumped", "over", ">++", 1), ("fox", "lazy", ">++", 0), ("over", "the", ">++", 0), + ("jumped", "over", ">-", 0), + ("fox", "quick", ">-", 0), + ("brown", "quick", ">-", 0), + ("fox", "brown", ">-", 1), ("brown", "fox", ">--", 0), ("fox", "brown", ">--", 1), ("jumped", "fox", ">--", 1), diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 09ab6c7dc..b17d23382 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -50,8 +50,6 @@ def test_matcher_from_usage_docs(en_vocab): def label_sentiment(matcher, doc, i, matches): match_id, start, end = matches[i] - if doc.vocab.strings[match_id] == "HAPPY": - doc.sentiment += 0.1 span = doc[start:end] with doc.retokenize() as retokenizer: retokenizer.merge(span) @@ -61,7 +59,6 @@ def test_matcher_from_usage_docs(en_vocab): matcher = Matcher(en_vocab) matcher.add("HAPPY", pos_patterns, on_match=label_sentiment) matcher(doc) - assert doc.sentiment != 0 assert doc[1].norm_ == "happy emoji" @@ -793,9 +790,16 @@ def test_matcher_span(matcher): doc = Doc(matcher.vocab, words=text.split()) span_js = doc[:3] span_java = doc[4:] - assert len(matcher(doc)) == 2 - assert len(matcher(span_js)) == 1 - assert len(matcher(span_java)) == 1 + doc_matches = matcher(doc) + span_js_matches = matcher(span_js) + span_java_matches = matcher(span_java) + assert len(doc_matches) == 2 + assert len(span_js_matches) == 1 + assert len(span_java_matches) == 1 + + # match offsets always refer to the doc + assert doc_matches[0] == span_js_matches[0] + assert doc_matches[1] == span_java_matches[0] def test_matcher_as_spans(matcher): diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 8a8d9eb84..20d0febb8 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -87,14 +87,15 @@ def test_issue4373(): @pytest.mark.issue(4651) def test_issue4651_with_phrase_matcher_attr(): - """Test that the EntityRuler PhraseMatcher is deserialized correctly using - the method from_disk when the EntityRuler argument phrase_matcher_attr is + """Test that the entity_ruler PhraseMatcher is deserialized correctly using + the method from_disk when the entity_ruler argument phrase_matcher_attr is specified. 
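# A short sketch of the behaviour the span-matching test above pins down,
# assuming a blank English pipeline and a single-token pattern: calling a
# Matcher on a Span yields (match_id, start, end) offsets relative to the
# parent Doc, not to the Span.
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("LANG", [[{"LOWER": "java"}]])
doc = nlp.make_doc("JavaScript is good but Java is better")

doc_matches = matcher(doc)        # [(match_id, 4, 5)]
span_matches = matcher(doc[3:])   # same tuple: offsets count tokens of the doc
assert doc_matches == span_matches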
""" text = "Spacy is a python library for nlp" nlp = English() patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] - ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"}) + config = {"phrase_matcher_attr": "LOWER"} + ruler = nlp.add_pipe("entity_ruler", config=config) ruler.add_patterns(patterns) doc = nlp(text) res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] @@ -102,7 +103,7 @@ def test_issue4651_with_phrase_matcher_attr(): with make_tempdir() as d: file_path = d / "entityruler" ruler.to_disk(file_path) - nlp_reloaded.add_pipe("entity_ruler").from_disk(file_path) + nlp_reloaded.add_pipe("entity_ruler", config=config).from_disk(file_path) doc_reloaded = nlp_reloaded(text) res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] assert res == res_reloaded @@ -198,28 +199,6 @@ def test_phrase_matcher_contains(en_vocab): assert "TEST2" not in matcher -def test_phrase_matcher_add_new_api(en_vocab): - doc = Doc(en_vocab, words=["a", "b"]) - patterns = [Doc(en_vocab, words=["a"]), Doc(en_vocab, words=["a", "b"])] - matcher = PhraseMatcher(en_vocab) - matcher.add("OLD_API", None, *patterns) - assert len(matcher(doc)) == 2 - matcher = PhraseMatcher(en_vocab) - on_match = Mock() - matcher.add("OLD_API_CALLBACK", on_match, *patterns) - assert len(matcher(doc)) == 2 - assert on_match.call_count == 2 - # New API: add(key: str, patterns: List[List[dict]], on_match: Callable) - matcher = PhraseMatcher(en_vocab) - matcher.add("NEW_API", patterns) - assert len(matcher(doc)) == 2 - matcher = PhraseMatcher(en_vocab) - on_match = Mock() - matcher.add("NEW_API_CALLBACK", patterns, on_match=on_match) - assert len(matcher(doc)) == 2 - assert on_match.call_count == 2 - - def test_phrase_matcher_repeated_add(en_vocab): matcher = PhraseMatcher(en_vocab) # match ID only gets added once @@ -468,6 +447,13 @@ def test_phrase_matcher_deprecated(en_vocab): assert "spaCy v3.0" in str(record.list[0].message) +def test_phrase_matcher_non_doc(en_vocab): + matcher = PhraseMatcher(en_vocab) + doc = Doc(en_vocab, words=["hello", "world"]) + with pytest.raises(ValueError): + matcher.add("TEST", [doc, "junk"]) + + @pytest.mark.parametrize("attr", ["SENT_START", "IS_SENT_START"]) def test_phrase_matcher_sent_start(en_vocab, attr): _ = PhraseMatcher(en_vocab, attr=attr) # noqa: F841 diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py index b403f274f..94dffd7ce 100644 --- a/spacy/tests/package/test_requirements.py +++ b/spacy/tests/package/test_requirements.py @@ -4,8 +4,8 @@ from pathlib import Path def test_build_dependencies(): # Check that library requirements are pinned exactly the same across different setup files. 
- # TODO: correct checks for numpy rather than ignoring libs_ignore_requirements = [ + "cython", "pytest", "pytest-timeout", "mock", @@ -22,7 +22,7 @@ def test_build_dependencies(): # ignore language-specific packages that shouldn't be installed by all libs_ignore_setup = [ "fugashi", - "natto-py", + "mecab-ko", "pythainlp", "sudachipy", "sudachidict_core", diff --git a/spacy/tests/parser/_search.pyx b/spacy/tests/parser/_search.pyx new file mode 100644 index 000000000..23fc81644 --- /dev/null +++ b/spacy/tests/parser/_search.pyx @@ -0,0 +1,119 @@ +# cython: infer_types=True, binding=True +from spacy.pipeline._parser_internals.search cimport Beam, MaxViolation +from spacy.typedefs cimport class_t, weight_t +from cymem.cymem cimport Pool + +from ..conftest import cytest +import pytest + +cdef struct TestState: + int length + int x + Py_UNICODE* string + + +cdef int transition(void* dest, void* src, class_t clas, void* extra_args) except -1: + dest_state = dest + src_state = src + dest_state.length = src_state.length + dest_state.x = src_state.x + dest_state.x += clas + if extra_args != NULL: + dest_state.string = extra_args + else: + dest_state.string = src_state.string + + +cdef void* initialize(Pool mem, int n, void* extra_args) except NULL: + state = mem.alloc(1, sizeof(TestState)) + state.length = n + state.x = 1 + if extra_args == NULL: + state.string = u'default' + else: + state.string = extra_args + return state + + +cdef int destroy(Pool mem, void* state, void* extra_args) except -1: + state = state + mem.free(state) + +@cytest +@pytest.mark.parametrize("nr_class,beam_width", + [ + (2, 3), + (3, 6), + (4, 20), + ] +) +def test_init(nr_class, beam_width): + b = Beam(nr_class, beam_width) + assert b.size == 1 + assert b.width == beam_width + assert b.nr_class == nr_class + +@cytest +def test_init_violn(): + MaxViolation() + +@cytest +@pytest.mark.parametrize("nr_class,beam_width,length", + [ + (2, 3, 3), + (3, 6, 15), + (4, 20, 32), + ] +) +def test_initialize(nr_class, beam_width, length): + b = Beam(nr_class, beam_width) + b.initialize(initialize, destroy, length, NULL) + for i in range(b.width): + s = b.at(i) + assert s.length == length, s.length + assert s.string == 'default' + + +@cytest +@pytest.mark.parametrize("nr_class,beam_width,length,extra", + [ + (2, 3, 4, None), + (3, 6, 15, u"test beam 1"), + ] +) +def test_initialize_extra(nr_class, beam_width, length, extra): + b = Beam(nr_class, beam_width) + if extra is None: + b.initialize(initialize, destroy, length, NULL) + else: + b.initialize(initialize, destroy, length, extra) + for i in range(b.width): + s = b.at(i) + assert s.length == length + + +@cytest +@pytest.mark.parametrize("nr_class,beam_width,length", + [ + (3, 6, 15), + (4, 20, 32), + ] +) +def test_transition(nr_class, beam_width, length): + b = Beam(nr_class, beam_width) + b.initialize(initialize, destroy, length, NULL) + b.set_cell(0, 2, 30, True, 0) + b.set_cell(0, 1, 42, False, 0) + b.advance(transition, NULL, NULL) + assert b.size == 1, b.size + assert b.score == 30, b.score + s = b.at(0) + assert s.x == 3 + assert b._states[0].score == 30, b._states[0].score + b.set_cell(0, 1, 10, True, 0) + b.set_cell(0, 2, 20, True, 0) + b.advance(transition, NULL, NULL) + assert b._states[0].score == 50, b._states[0].score + assert b._states[1].score == 40 + s = b.at(0) + assert s.x == 5 diff --git a/spacy/tests/parser/test_model.py b/spacy/tests/parser/test_model.py new file mode 100644 index 000000000..8c1cf7a93 --- /dev/null +++ b/spacy/tests/parser/test_model.py @@ 
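# A brief walkthrough of the scores asserted in test_transition above, read
# directly off the set_cell/advance calls: in the first round only the cell
# marked valid (clas 2, score 30) survives advance(), so the single remaining
# state has score 30 and x == 1 + 2 == 3. The second round appears cumulative:
# scores 10 (clas 1) and 20 (clas 2) are added on top of the surviving state,
# giving candidates 40 and 50, which advance() keeps sorted best-first, hence
# b._states[0].score == 50, b._states[1].score == 40, and the top state's
# x == 3 + 2 == 5.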
-0,0 +1,61 @@ +import numpy +import pytest + +from spacy.lang.en import English +from spacy.ml.tb_framework import TransitionModelInputs +from spacy.training import Example + +TRAIN_DATA = [ + ( + "They trade mortgage-backed securities.", + { + "heads": [1, 1, 4, 4, 5, 1, 1], + "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"], + }, + ), + ( + "I like London and Berlin.", + { + "heads": [1, 1, 1, 2, 2, 1], + "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"], + }, + ), +] + + +@pytest.fixture +def nlp_parser(): + nlp = English() + parser = nlp.add_pipe("parser") + + train_examples = [] + for text, annotations in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + for dep in annotations["deps"]: + parser.add_label(dep) + nlp.initialize() + + return nlp, parser + + +def test_incorrect_number_of_actions(nlp_parser): + nlp, parser = nlp_parser + doc = nlp.make_doc("test") + + # Too many actions for the number of docs + with pytest.raises(AssertionError): + parser.model.predict( + TransitionModelInputs( + docs=[doc], moves=parser.moves, actions=[numpy.array([0, 0], dtype="i")] + ) + ) + + # Too few actions for the number of docs + with pytest.raises(AssertionError): + parser.model.predict( + TransitionModelInputs( + docs=[doc, doc], + moves=parser.moves, + actions=[numpy.array([0], dtype="i")], + ) + ) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 00889efdc..62b8f9704 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -13,6 +13,7 @@ from spacy.pipeline._parser_internals.ner import BiluoPushDown from spacy.training import Example, iob_to_biluo, split_bilu_label from spacy.tokens import Doc, Span from spacy.vocab import Vocab +from thinc.api import fix_random_seed import logging from ..util import make_tempdir @@ -412,7 +413,7 @@ def test_train_empty(): train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) ner = nlp.add_pipe("ner", last=True) ner.add_label("PERSON") - nlp.initialize() + nlp.initialize(get_examples=lambda: train_examples) for itn in range(2): losses = {} batches = util.minibatch(train_examples, size=8) @@ -539,11 +540,11 @@ def test_block_ner(): assert [token.ent_type_ for token in doc] == expected_types -@pytest.mark.parametrize("use_upper", [True, False]) -def test_overfitting_IO(use_upper): +def test_overfitting_IO(): + fix_random_seed(1) # Simple test to try and quickly overfit the NER component nlp = English() - ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}}) + ner = nlp.add_pipe("ner", config={"model": {}}) train_examples = [] for text, annotations in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) @@ -575,7 +576,6 @@ def test_overfitting_IO(use_upper): assert ents2[0].label_ == "LOC" # Ensure that the predictions are still the same, even after adding a new label ner2 = nlp2.get_pipe("ner") - assert ner2.model.attrs["has_upper"] == use_upper ner2.add_label("RANDOM_NEW_LABEL") doc3 = nlp2(test_text) ents3 = doc3.ents @@ -617,6 +617,55 @@ def test_overfitting_IO(use_upper): assert ents[1].kb_id == 0 +def test_is_distillable(): + nlp = English() + ner = nlp.add_pipe("ner") + assert ner.is_distillable + + +@pytest.mark.slow +@pytest.mark.parametrize("max_moves", [0, 1, 5, 100]) +def test_distill(max_moves): + teacher = English() + teacher_ner = teacher.add_pipe("ner") + train_examples = [] + for text, annotations in TRAIN_DATA: + 
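# The test_train_empty fix above reflects the usual initialization pattern:
# trainable components infer their labels and model shapes from example data,
# so a callable returning Examples is passed to initialize(). A minimal,
# self-contained sketch with made-up training data:
from spacy.lang.en import English
from spacy.training import Example

TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
]

nlp = English()
nlp.add_pipe("ner")
train_examples = [
    Example.from_dict(nlp.make_doc(text), annots) for text, annots in TRAIN_DATA
]
optimizer = nlp.initialize(get_examples=lambda: train_examples)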
train_examples.append(Example.from_dict(teacher.make_doc(text), annotations)) + for ent in annotations.get("entities"): + teacher_ner.add_label(ent[2]) + + optimizer = teacher.initialize(get_examples=lambda: train_examples) + + for i in range(50): + losses = {} + teacher.update(train_examples, sgd=optimizer, losses=losses) + assert losses["ner"] < 0.00001 + + student = English() + student_ner = student.add_pipe("ner") + student_ner.cfg["update_with_oracle_cut_size"] = max_moves + student_ner.initialize( + get_examples=lambda: train_examples, labels=teacher_ner.label_data + ) + + distill_examples = [ + Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA + ] + + for i in range(100): + losses = {} + student_ner.distill(teacher_ner, distill_examples, sgd=optimizer, losses=losses) + assert losses["ner"] < 0.0001 + + # test the trained model + test_text = "I like London." + doc = student(test_text) + ents = doc.ents + assert len(ents) == 1 + assert ents[0].text == "London" + assert ents[0].label_ == "LOC" + + def test_beam_ner_scores(): # Test that we can get confidence values out of the beam_ner pipe beam_width = 16 diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index aaf31ed56..2f2fa397e 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -1,13 +1,17 @@ +import itertools import pytest +import numpy from numpy.testing import assert_equal from thinc.api import Adam from spacy import registry, util from spacy.attrs import DEP, NORM from spacy.lang.en import English -from spacy.tokens import Doc from spacy.training import Example +from spacy.tokens import Doc from spacy.vocab import Vocab +from spacy import util, registry +from thinc.api import fix_random_seed from ...pipeline import DependencyParser from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL @@ -59,6 +63,8 @@ PARTIAL_DATA = [ ), ] +PARSERS = ["parser"] # TODO: Test beam_parser when ready + eps = 0.1 @@ -171,6 +177,57 @@ def test_parser_parse_one_word_sentence(en_vocab, en_parser, words): assert doc[0].dep != 0 +def test_parser_apply_actions(en_vocab, en_parser): + words = ["I", "ate", "pizza"] + words2 = ["Eat", "more", "pizza", "!"] + doc1 = Doc(en_vocab, words=words) + doc2 = Doc(en_vocab, words=words2) + docs = [doc1, doc2] + + moves = en_parser.moves + moves.add_action(0, "") + moves.add_action(1, "") + moves.add_action(2, "nsubj") + moves.add_action(3, "obj") + moves.add_action(2, "amod") + + actions = [ + numpy.array([0, 0], dtype="i"), + numpy.array([2, 0], dtype="i"), + numpy.array([0, 4], dtype="i"), + numpy.array([3, 3], dtype="i"), + numpy.array([1, 1], dtype="i"), + numpy.array([1, 1], dtype="i"), + numpy.array([0], dtype="i"), + numpy.array([1], dtype="i"), + ] + + states = moves.init_batch(docs) + active_states = states + + for step_actions in actions: + active_states = moves.apply_actions(active_states, step_actions) + + assert len(active_states) == 0 + + for (state, doc) in zip(states, docs): + moves.set_annotations(state, doc) + + assert docs[0][0].head.i == 1 + assert docs[0][0].dep_ == "nsubj" + assert docs[0][1].head.i == 1 + assert docs[0][1].dep_ == "ROOT" + assert docs[0][2].head.i == 1 + assert docs[0][2].dep_ == "obj" + + assert docs[1][0].head.i == 0 + assert docs[1][0].dep_ == "ROOT" + assert docs[1][1].head.i == 2 + assert docs[1][1].dep_ == "amod" + assert docs[1][2].head.i == 0 + assert docs[1][2].dep_ == "obj" + + @pytest.mark.skip( reason="The step_through API was removed (but should be brought back)" ) @@ -319,7 +376,7 
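# Condensed from the distillation test above: the student pipe is initialized
# with the teacher's label set, then repeatedly calls distill() on Examples
# whose reference side carries no annotations, so the teacher's predictions
# serve as the training signal. Assuming `teacher` is a trained pipeline with
# an "ner" pipe, `train_examples` the annotated Examples built earlier,
# `optimizer` a thinc optimizer, and `raw_texts` a list of strings:
#
#     student = English()
#     student_ner = student.add_pipe("ner")
#     student_ner.initialize(
#         get_examples=lambda: train_examples,
#         labels=teacher.get_pipe("ner").label_data,
#     )
#     distill_examples = [
#         Example.from_dict(teacher.make_doc(text), {}) for text in raw_texts
#     ]
#     losses = {}
#     student_ner.distill(
#         teacher.get_pipe("ner"), distill_examples, sgd=optimizer, losses=losses
#     )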
@@ def test_parser_constructor(en_vocab): DependencyParser(en_vocab, model) -@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"]) +@pytest.mark.parametrize("pipe_name", PARSERS) def test_incomplete_data(pipe_name): # Test that the parser works with incomplete information nlp = English() @@ -345,11 +402,15 @@ def test_incomplete_data(pipe_name): assert doc[2].head.i == 1 -@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"]) -def test_overfitting_IO(pipe_name): +@pytest.mark.parametrize( + "pipe_name,max_moves", itertools.product(PARSERS, [0, 1, 5, 100]) +) +def test_overfitting_IO(pipe_name, max_moves): + fix_random_seed(0) # Simple test to try and quickly overfit the dependency parser (normal or beam) nlp = English() parser = nlp.add_pipe(pipe_name) + parser.cfg["update_with_oracle_cut_size"] = max_moves train_examples = [] for text, annotations in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) @@ -396,16 +457,70 @@ def test_overfitting_IO(pipe_name): assert_equal(batch_deps_1, no_batch_deps) +def test_is_distillable(): + nlp = English() + parser = nlp.add_pipe("parser") + assert parser.is_distillable + + +@pytest.mark.slow +@pytest.mark.parametrize("max_moves", [0, 1, 5, 100]) +def test_distill(max_moves): + teacher = English() + teacher_parser = teacher.add_pipe("parser") + train_examples = [] + for text, annotations in TRAIN_DATA: + train_examples.append(Example.from_dict(teacher.make_doc(text), annotations)) + for dep in annotations.get("deps", []): + teacher_parser.add_label(dep) + + optimizer = teacher.initialize(get_examples=lambda: train_examples) + + for i in range(200): + losses = {} + teacher.update(train_examples, sgd=optimizer, losses=losses) + assert losses["parser"] < 0.0001 + + student = English() + student_parser = student.add_pipe("parser") + student_parser.cfg["update_with_oracle_cut_size"] = max_moves + student_parser.initialize( + get_examples=lambda: train_examples, labels=teacher_parser.label_data + ) + + distill_examples = [ + Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA + ] + + for i in range(200): + losses = {} + student_parser.distill( + teacher_parser, distill_examples, sgd=optimizer, losses=losses + ) + assert losses["parser"] < 0.0001 + + test_text = "I like securities." + doc = student(test_text) + assert doc[0].dep_ == "nsubj" + assert doc[2].dep_ == "dobj" + assert doc[3].dep_ == "punct" + assert doc[0].head.i == 1 + assert doc[2].head.i == 1 + assert doc[3].head.i == 1 + + # fmt: off @pytest.mark.slow @pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"]) @pytest.mark.parametrize( "parser_config", [ - # TransitionBasedParser V1 - ({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}), - # TransitionBasedParser V2 + # TODO: re-enable after we have a spacy-legacy release for v4. 
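# The max_moves parametrization above works by overriding the component's
# update_with_oracle_cut_size, which roughly controls how long the oracle
# transition sequences used during update()/distill() may get before being
# cut into shorter segments. A sketch of setting it the same way the tests
# do, assuming a blank English pipeline:
from spacy.lang.en import English

nlp = English()
parser = nlp.add_pipe("parser")
parser.cfg["update_with_oracle_cut_size"] = 5  # the tests exercise 0, 1, 5 and 100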
See + # https://github.com/explosion/spacy-legacy/pull/36 + #({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}), ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}), + ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": False}), + ({"@architectures": "spacy.TransitionBasedParser.v3", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2}), ], ) # fmt: on diff --git a/spacy/tests/parser/test_search.py b/spacy/tests/parser/test_search.py new file mode 100644 index 000000000..136c3a11b --- /dev/null +++ b/spacy/tests/parser/test_search.py @@ -0,0 +1,3 @@ +from ..conftest import register_cython_tests + +register_cython_tests("spacy.tests.parser._search", __name__) diff --git a/spacy/tests/pipeline/test_annotates_on_update.py b/spacy/tests/pipeline/test_annotates_on_update.py index 869b8b874..10fb22c97 100644 --- a/spacy/tests/pipeline/test_annotates_on_update.py +++ b/spacy/tests/pipeline/test_annotates_on_update.py @@ -54,9 +54,11 @@ def test_annotates_on_update(): return AssertSents(name) class AssertSents: + model = None + is_trainable = True + def __init__(self, name, **cfg): self.name = name - pass def __call__(self, doc): if not doc.has_annotation("SENT_START"): @@ -64,10 +66,16 @@ def test_annotates_on_update(): return doc def update(self, examples, *, drop=0.0, sgd=None, losses=None): + losses.setdefault(self.name, 0.0) + for example in examples: if not example.predicted.has_annotation("SENT_START"): raise ValueError("No sents") - return {} + + return losses + + def finish_update(self, sgd=None): + pass nlp = English() nlp.add_pipe("sentencizer") diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py index c4f9b09f3..c5c50c77f 100644 --- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py +++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py @@ -1,3 +1,4 @@ +from typing import cast import pickle import pytest from hypothesis import given @@ -6,6 +7,7 @@ from spacy import util from spacy.lang.en import English from spacy.language import Language from spacy.pipeline._edit_tree_internals.edit_trees import EditTrees +from spacy.pipeline.trainable_pipe import TrainablePipe from spacy.training import Example from spacy.strings import StringStore from spacy.util import make_tempdir @@ -101,14 +103,15 @@ def test_initialize_from_labels(): } -def test_no_data(): +@pytest.mark.parametrize("top_k", (1, 5, 30)) +def test_no_data(top_k): # Test that the lemmatizer provides a nice error when there's no tagging data / labels TEXTCAT_DATA = [ ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}), ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}), ] nlp = English() - nlp.add_pipe("trainable_lemmatizer") + nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k}) nlp.add_pipe("textcat") train_examples = [] @@ -119,10 +122,11 @@ def test_no_data(): nlp.initialize(get_examples=lambda: train_examples) -def test_incomplete_data(): +@pytest.mark.parametrize("top_k", (1, 5, 30)) +def test_incomplete_data(top_k): # Test 
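# The parametrization above shows the main difference between the v2 and v3
# parser architectures: v3 drops the use_upper switch. A sketch of the v3
# model config as a dict, mirroring the entry in the test:
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL

parser_config_v3 = {
    "@architectures": "spacy.TransitionBasedParser.v3",
    "tok2vec": DEFAULT_TOK2VEC_MODEL,
    "state_type": "parser",
    "extra_state_tokens": False,
    "hidden_width": 64,
    "maxout_pieces": 2,
}
# e.g. nlp.add_pipe("parser", config={"model": parser_config_v3})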
that the lemmatizer works with incomplete information nlp = English() - lemmatizer = nlp.add_pipe("trainable_lemmatizer") + lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k}) lemmatizer.min_tree_freq = 1 train_examples = [] for t in PARTIAL_DATA: @@ -154,9 +158,10 @@ def test_incomplete_data(): assert xp.count_nonzero(dX[1][1]) == 0 -def test_overfitting_IO(): +@pytest.mark.parametrize("top_k", (1, 5, 30)) +def test_overfitting_IO(top_k): nlp = English() - lemmatizer = nlp.add_pipe("trainable_lemmatizer") + lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k}) lemmatizer.min_tree_freq = 1 train_examples = [] for t in TRAIN_DATA: @@ -189,7 +194,7 @@ def test_overfitting_IO(): # Check model after a {to,from}_bytes roundtrip nlp_bytes = nlp.to_bytes() nlp3 = English() - nlp3.add_pipe("trainable_lemmatizer") + nlp3.add_pipe("trainable_lemmatizer", config={"top_k": top_k}) nlp3.from_bytes(nlp_bytes) doc3 = nlp3(test_text) assert doc3[0].lemma_ == "she" @@ -207,6 +212,53 @@ def test_overfitting_IO(): assert doc4[3].lemma_ == "egg" +def test_is_distillable(): + nlp = English() + lemmatizer = nlp.add_pipe("trainable_lemmatizer") + assert lemmatizer.is_distillable + + +def test_distill(): + teacher = English() + teacher_lemmatizer = teacher.add_pipe("trainable_lemmatizer") + teacher_lemmatizer.min_tree_freq = 1 + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1])) + + optimizer = teacher.initialize(get_examples=lambda: train_examples) + + for i in range(50): + losses = {} + teacher.update(train_examples, sgd=optimizer, losses=losses) + assert losses["trainable_lemmatizer"] < 0.00001 + + student = English() + student_lemmatizer = student.add_pipe("trainable_lemmatizer") + student_lemmatizer.min_tree_freq = 1 + student_lemmatizer.initialize( + get_examples=lambda: train_examples, labels=teacher_lemmatizer.label_data + ) + + distill_examples = [ + Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA + ] + + for i in range(50): + losses = {} + student_lemmatizer.distill( + teacher_lemmatizer, distill_examples, sgd=optimizer, losses=losses + ) + assert losses["trainable_lemmatizer"] < 0.00001 + + test_text = "She likes blue eggs" + doc = student(test_text) + assert doc[0].lemma_ == "she" + assert doc[1].lemma_ == "like" + assert doc[2].lemma_ == "blue" + assert doc[3].lemma_ == "egg" + + def test_lemmatizer_requires_labels(): nlp = English() nlp.add_pipe("trainable_lemmatizer") @@ -327,3 +379,26 @@ def test_empty_strings(): no_change = trees.add("xyz", "xyz") empty = trees.add("", "") assert no_change == empty + + +def test_save_activations(): + nlp = English() + lemmatizer = cast(TrainablePipe, nlp.add_pipe("trainable_lemmatizer")) + lemmatizer.min_tree_freq = 1 + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + nlp.initialize(get_examples=lambda: train_examples) + nO = lemmatizer.model.get_dim("nO") + + doc = nlp("This is a test.") + assert "trainable_lemmatizer" not in doc.activations + + lemmatizer.save_activations = True + doc = nlp("This is a test.") + assert list(doc.activations["trainable_lemmatizer"].keys()) == [ + "probabilities", + "tree_ids", + ] + assert doc.activations["trainable_lemmatizer"]["probabilities"].shape == (5, nO) + assert doc.activations["trainable_lemmatizer"]["tree_ids"].shape == (5,) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 
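# Condensed from the save_activations test above: the flag is off by default,
# and enabling it on an initialized pipe makes the pipe store its raw model
# outputs on the Doc. Assuming `nlp` is a pipeline whose trainable_lemmatizer
# has already been initialized on training examples:
#
#     lemmatizer = nlp.get_pipe("trainable_lemmatizer")
#     doc = nlp("She likes blue eggs")
#     assert "trainable_lemmatizer" not in doc.activations
#
#     lemmatizer.save_activations = True
#     doc = nlp("She likes blue eggs")
#     probs = doc.activations["trainable_lemmatizer"]["probabilities"]
#     tree_ids = doc.activations["trainable_lemmatizer"]["tree_ids"]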
99f164f15..87cacfc9d 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1,7 +1,8 @@ -from typing import Callable, Iterable, Dict, Any +from typing import Callable, Iterable, Dict, Any, cast import pytest from numpy.testing import assert_equal +from thinc.types import Ragged from spacy import registry, util from spacy.attrs import ENT_KB_ID @@ -10,8 +11,7 @@ from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase from spacy.lang.en import English from spacy.ml import load_kb from spacy.ml.models.entity_linker import build_span_maker -from spacy.pipeline import EntityLinker -from spacy.pipeline.legacy import EntityLinker_v1 +from spacy.pipeline import EntityLinker, TrainablePipe from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer from spacy.tests.util import make_tempdir @@ -353,6 +353,9 @@ def test_kb_default(nlp): """Test that the default (empty) KB is loaded upon construction""" entity_linker = nlp.add_pipe("entity_linker", config={}) assert len(entity_linker.kb) == 0 + with pytest.raises(ValueError, match="E139"): + # this raises an error because the KB is empty + entity_linker.validate_kb() assert entity_linker.kb.get_size_entities() == 0 assert entity_linker.kb.get_size_aliases() == 0 # 64 is the default value from pipeline.entity_linker @@ -990,12 +993,12 @@ def test_scorer_links(): @pytest.mark.parametrize( "name,config", [ - ("entity_linker", {"@architectures": "spacy.EntityLinker.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL}), ("entity_linker", {"@architectures": "spacy.EntityLinker.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL}), ], ) # fmt: on def test_legacy_architectures(name, config): + # Ensure that the legacy architectures still work vector_length = 3 nlp = English() @@ -1017,10 +1020,7 @@ def test_legacy_architectures(name, config): return mykb entity_linker = nlp.add_pipe(name, config={"model": config}) - if config["@architectures"] == "spacy.EntityLinker.v1": - assert isinstance(entity_linker, EntityLinker_v1) - else: - assert isinstance(entity_linker, EntityLinker) + assert isinstance(entity_linker, EntityLinker) entity_linker.set_kb(create_kb) optimizer = nlp.initialize(get_examples=lambda: train_examples) @@ -1203,6 +1203,69 @@ def test_threshold(meet_threshold: bool, config: Dict[str, Any]): assert doc.ents[0].kb_id_ == entity_id if meet_threshold else EntityLinker.NIL +def test_save_activations(): + nlp = English() + vector_length = 3 + assert "Q2146908" not in nlp.vocab.strings + + # Convert the texts to docs to make sure we have doc.ents set for the training examples + train_examples = [] + for text, annotation in TRAIN_DATA: + doc = nlp(text) + train_examples.append(Example.from_dict(doc, annotation)) + + def create_kb(vocab): + # create artificial KB - assign same prior weight to the two russ cochran's + # Q2146908 (Russ Cochran): American golfer + # Q7381115 (Russ Cochran): publisher + mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) + mykb.add_alias( + alias="Russ Cochran", + entities=["Q2146908", "Q7381115"], + probabilities=[0.5, 0.5], + ) + return mykb + + # Create the Entity Linker component and add it to the pipeline + entity_linker = cast(TrainablePipe, nlp.add_pipe("entity_linker", last=True)) + assert isinstance(entity_linker, EntityLinker) + entity_linker.set_kb(create_kb) + assert 
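# The create_kb helpers in the entity linker tests all follow the same shape;
# a minimal standalone sketch, assuming a blank English vocab and
# 3-dimensional entity vectors:
import spacy
from spacy.kb import InMemoryLookupKB

nlp = spacy.blank("en")
kb = InMemoryLookupKB(nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
kb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
kb.add_alias(
    alias="Russ Cochran",
    entities=["Q2146908", "Q7381115"],
    probabilities=[0.5, 0.5],  # equal prior weight for both candidates
)
assert kb.get_size_entities() == 2
assert kb.get_size_aliases() == 1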
"Q2146908" in entity_linker.vocab.strings + assert "Q2146908" in entity_linker.kb.vocab.strings + + # initialize the NEL pipe + nlp.initialize(get_examples=lambda: train_examples) + + nO = entity_linker.model.get_dim("nO") + + nlp.add_pipe("sentencizer", first=True) + patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}, + {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]}, + ] + ruler = nlp.add_pipe("entity_ruler", before="entity_linker") + ruler.add_patterns(patterns) + + doc = nlp("Russ Cochran was a publisher") + assert "entity_linker" not in doc.activations + + entity_linker.save_activations = True + doc = nlp("Russ Cochran was a publisher") + assert set(doc.activations["entity_linker"].keys()) == {"ents", "scores"} + ents = doc.activations["entity_linker"]["ents"] + assert isinstance(ents, Ragged) + assert ents.data.shape == (2, 1) + assert ents.data.dtype == "uint64" + assert ents.lengths.shape == (1,) + scores = doc.activations["entity_linker"]["scores"] + assert isinstance(scores, Ragged) + assert scores.data.shape == (2, 1) + assert scores.data.dtype == "float32" + assert scores.lengths.shape == (1,) + + def test_span_maker_forward_with_empty(): """The forward pass of the span maker may have a doc with no entities.""" nlp = English() diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index 417f930cb..db502e13f 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -4,7 +4,7 @@ from spacy import registry from spacy.tokens import Doc, Span from spacy.language import Language from spacy.lang.en import English -from spacy.pipeline import EntityRuler, EntityRecognizer, merge_entities +from spacy.pipeline import EntityRecognizer, merge_entities from spacy.pipeline import SpanRuler from spacy.pipeline.ner import DEFAULT_NER_MODEL from spacy.errors import MatchPatternError @@ -12,8 +12,6 @@ from spacy.tests.util import make_tempdir from thinc.api import NumpyOps, get_current_ops -ENTITY_RULERS = ["entity_ruler", "future_entity_ruler"] - @pytest.fixture def nlp(): @@ -40,13 +38,12 @@ def add_ent_component(doc): @pytest.mark.issue(3345) -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_issue3345(entity_ruler_factory): +def test_issue3345(): """Test case where preset entity crosses sentence boundary.""" nlp = English() doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) doc[4].is_sent_start = True - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") + ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns([{"label": "GPE", "pattern": "New York"}]) cfg = {"model": DEFAULT_NER_MODEL} model = registry.resolve(cfg, validate=True)["model"] @@ -65,15 +62,14 @@ def test_issue3345(entity_ruler_factory): @pytest.mark.issue(4849) -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_issue4849(entity_ruler_factory): +def test_issue4849(): nlp = English() patterns = [ {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}, {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"}, ] ruler = nlp.add_pipe( - entity_ruler_factory, + "entity_ruler", name="entity_ruler", config={"phrase_matcher_attr": "LOWER"}, ) @@ -96,11 +92,10 @@ def test_issue4849(entity_ruler_factory): @pytest.mark.issue(5918) -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_issue5918(entity_ruler_factory): +def test_issue5918(): # Test edge case when merging entities. 
nlp = English() - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") + ruler = nlp.add_pipe("entity_ruler") patterns = [ {"label": "ORG", "pattern": "Digicon Inc"}, {"label": "ORG", "pattern": "Rotan Mosle Inc's"}, @@ -125,10 +120,9 @@ def test_issue5918(entity_ruler_factory): @pytest.mark.issue(8168) -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_issue8168(entity_ruler_factory): +def test_issue8168(): nlp = English() - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") + ruler = nlp.add_pipe("entity_ruler") patterns = [ {"label": "ORG", "pattern": "Apple"}, { @@ -148,12 +142,9 @@ def test_issue8168(entity_ruler_factory): @pytest.mark.issue(8216) -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_fix8216(nlp, patterns, entity_ruler_factory): +def test_entity_ruler_fix8216(nlp, patterns): """Test that patterns don't get added excessively.""" - ruler = nlp.add_pipe( - entity_ruler_factory, name="entity_ruler", config={"validate": True} - ) + ruler = nlp.add_pipe("entity_ruler", config={"validate": True}) ruler.add_patterns(patterns) pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) assert pattern_count > 0 @@ -162,16 +153,15 @@ def test_entity_ruler_fix8216(nlp, patterns, entity_ruler_factory): assert after_count == pattern_count -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_init(nlp, patterns, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_init(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) assert len(ruler) == len(patterns) assert len(ruler.labels) == 4 assert "HELLO" in ruler assert "BYE" in ruler nlp.remove_pipe("entity_ruler") - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") + ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) doc = nlp("hello world bye bye") assert len(doc.ents) == 2 @@ -179,23 +169,21 @@ def test_entity_ruler_init(nlp, patterns, entity_ruler_factory): assert doc.ents[1].label_ == "BYE" -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_no_patterns_warns(nlp, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_no_patterns_warns(nlp): + ruler = nlp.add_pipe("entity_ruler") assert len(ruler) == 0 assert len(ruler.labels) == 0 nlp.remove_pipe("entity_ruler") - nlp.add_pipe(entity_ruler_factory, name="entity_ruler") + nlp.add_pipe("entity_ruler") assert nlp.pipe_names == ["entity_ruler"] with pytest.warns(UserWarning): doc = nlp("hello world bye bye") assert len(doc.ents) == 0 -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory): +def test_entity_ruler_init_patterns(nlp, patterns): # initialize with patterns - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") + ruler = nlp.add_pipe("entity_ruler") assert len(ruler.labels) == 0 ruler.initialize(lambda: [], patterns=patterns) assert len(ruler.labels) == 4 @@ -207,7 +195,7 @@ def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory): nlp.config["initialize"]["components"]["entity_ruler"] = { "patterns": {"@misc": "entity_ruler_patterns"} } - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") + ruler = nlp.add_pipe("entity_ruler") assert len(ruler.labels) == 0 nlp.initialize() assert len(ruler.labels) == 4 @@ -216,20 
+204,18 @@ def test_entity_ruler_init_patterns(nlp, patterns, entity_ruler_factory): assert doc.ents[1].label_ == "BYE" -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_init_clear(nlp, patterns, entity_ruler_factory): +def test_entity_ruler_init_clear(nlp, patterns): """Test that initialization clears patterns.""" - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") + ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) assert len(ruler.labels) == 4 ruler.initialize(lambda: []) assert len(ruler.labels) == 0 -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_clear(nlp, patterns, entity_ruler_factory): +def test_entity_ruler_clear(nlp, patterns): """Test that initialization clears patterns.""" - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") + ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) assert len(ruler.labels) == 4 doc = nlp("hello world") @@ -241,9 +227,8 @@ def test_entity_ruler_clear(nlp, patterns, entity_ruler_factory): assert len(doc.ents) == 0 -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_existing(nlp, patterns, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_existing(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) nlp.add_pipe("add_ent", before="entity_ruler") doc = nlp("OH HELLO WORLD bye bye") @@ -252,11 +237,8 @@ def test_entity_ruler_existing(nlp, patterns, entity_ruler_factory): assert doc.ents[1].label_ == "BYE" -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_existing_overwrite(nlp, patterns, entity_ruler_factory): - ruler = nlp.add_pipe( - entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True} - ) +def test_entity_ruler_existing_overwrite(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True}) ruler.add_patterns(patterns) nlp.add_pipe("add_ent", before="entity_ruler") doc = nlp("OH HELLO WORLD bye bye") @@ -266,11 +248,8 @@ def test_entity_ruler_existing_overwrite(nlp, patterns, entity_ruler_factory): assert doc.ents[1].label_ == "BYE" -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_existing_complex(nlp, patterns, entity_ruler_factory): - ruler = nlp.add_pipe( - entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True} - ) +def test_entity_ruler_existing_complex(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True}) ruler.add_patterns(patterns) nlp.add_pipe("add_ent", before="entity_ruler") doc = nlp("foo foo bye bye") @@ -281,11 +260,8 @@ def test_entity_ruler_existing_complex(nlp, patterns, entity_ruler_factory): assert len(doc.ents[1]) == 2 -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_entity_id(nlp, patterns, entity_ruler_factory): - ruler = nlp.add_pipe( - entity_ruler_factory, name="entity_ruler", config={"overwrite_ents": True} - ) +def test_entity_ruler_entity_id(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True}) ruler.add_patterns(patterns) doc = nlp("Apple is a technology company") assert len(doc.ents) == 1 @@ -293,26 +269,23 @@ def test_entity_ruler_entity_id(nlp, patterns, entity_ruler_factory): assert doc.ents[0].ent_id_ == "a1" -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def 
test_entity_ruler_cfg_ent_id_sep(nlp, patterns, entity_ruler_factory): +def test_entity_ruler_cfg_ent_id_sep(nlp, patterns): config = {"overwrite_ents": True, "ent_id_sep": "**"} - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler", config=config) + ruler = nlp.add_pipe("entity_ruler", config=config) ruler.add_patterns(patterns) doc = nlp("Apple is a technology company") - if isinstance(ruler, EntityRuler): - assert "TECH_ORG**a1" in ruler.phrase_patterns assert len(doc.ents) == 1 assert doc.ents[0].label_ == "TECH_ORG" assert doc.ents[0].ent_id_ == "a1" -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_serialize_bytes(nlp, patterns, entity_ruler_factory): - ruler = EntityRuler(nlp, patterns=patterns) +def test_entity_ruler_serialize_bytes(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler") + ruler.add_patterns(patterns) assert len(ruler) == len(patterns) assert len(ruler.labels) == 4 ruler_bytes = ruler.to_bytes() - new_ruler = EntityRuler(nlp) + new_ruler = nlp.add_pipe("entity_ruler", name="new_ruler") assert len(new_ruler) == 0 assert len(new_ruler.labels) == 0 new_ruler = new_ruler.from_bytes(ruler_bytes) @@ -324,28 +297,27 @@ def test_entity_ruler_serialize_bytes(nlp, patterns, entity_ruler_factory): assert sorted(new_ruler.labels) == sorted(ruler.labels) -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_serialize_phrase_matcher_attr_bytes( - nlp, patterns, entity_ruler_factory -): - ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER", patterns=patterns) +def test_entity_ruler_serialize_phrase_matcher_attr_bytes(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"}) + ruler.add_patterns(patterns) assert len(ruler) == len(patterns) assert len(ruler.labels) == 4 ruler_bytes = ruler.to_bytes() - new_ruler = EntityRuler(nlp) + new_ruler = nlp.add_pipe( + "entity_ruler", name="new_ruler", config={"phrase_matcher_attr": "LOWER"} + ) assert len(new_ruler) == 0 assert len(new_ruler.labels) == 0 - assert new_ruler.phrase_matcher_attr is None new_ruler = new_ruler.from_bytes(ruler_bytes) assert len(new_ruler) == len(patterns) assert len(new_ruler.labels) == 4 - assert new_ruler.phrase_matcher_attr == "LOWER" -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_validate(nlp, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") - validated_ruler = EntityRuler(nlp, validate=True) +def test_entity_ruler_validate(nlp): + ruler = nlp.add_pipe("entity_ruler") + validated_ruler = nlp.add_pipe( + "entity_ruler", name="validated_ruler", config={"validate": True} + ) valid_pattern = {"label": "HELLO", "pattern": [{"LOWER": "HELLO"}]} invalid_pattern = {"label": "HELLO", "pattern": [{"ASDF": "HELLO"}]} @@ -362,16 +334,15 @@ def test_entity_ruler_validate(nlp, entity_ruler_factory): validated_ruler.add_patterns([invalid_pattern]) -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_properties(nlp, patterns, entity_ruler_factory): - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) +def test_entity_ruler_properties(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True}) + ruler.add_patterns(patterns) assert sorted(ruler.labels) == sorted(["HELLO", "BYE", "COMPLEX", "TECH_ORG"]) - assert sorted(ruler.ent_ids) == ["a1", "a2"] + assert sorted(ruler.ids) == ["a1", "a2"] -@pytest.mark.parametrize("entity_ruler_factory", 
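# With the old EntityRuler class gone, the rewritten tests above all go
# through nlp.add_pipe; a minimal sketch of that API, assuming a blank
# English pipeline, including the ids property and remove_by_id used below:
import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})
ruler.add_patterns(
    [
        {"label": "PERSON", "pattern": "Dina", "id": "dina"},
        {"label": "ORG", "pattern": "ACME", "id": "acme"},
    ]
)
doc = nlp("Dina works at ACME")
assert [(e.label_, e.ent_id_) for e in doc.ents] == [("PERSON", "dina"), ("ORG", "acme")]
assert sorted(ruler.ids) == ["acme", "dina"]
ruler.remove_by_id("acme")
assert len(ruler.patterns) == 1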
ENTITY_RULERS) -def test_entity_ruler_overlapping_spans(nlp, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_overlapping_spans(nlp): + ruler = nlp.add_pipe("entity_ruler") patterns = [ {"label": "FOOBAR", "pattern": "foo bar"}, {"label": "BARBAZ", "pattern": "bar baz"}, @@ -382,9 +353,8 @@ def test_entity_ruler_overlapping_spans(nlp, entity_ruler_factory): assert doc.ents[0].label_ == "FOOBAR" -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_fuzzy_pipe(nlp, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_fuzzy_pipe(nlp): + ruler = nlp.add_pipe("entity_ruler") patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}] ruler.add_patterns(patterns) doc = nlp("helloo") @@ -392,9 +362,8 @@ def test_entity_ruler_fuzzy_pipe(nlp, entity_ruler_factory): assert doc.ents[0].label_ == "HELLO" -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_fuzzy(nlp, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_fuzzy(nlp): + ruler = nlp.add_pipe("entity_ruler") patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}] ruler.add_patterns(patterns) doc = nlp("helloo") @@ -402,15 +371,13 @@ def test_entity_ruler_fuzzy(nlp, entity_ruler_factory): assert doc.ents[0].label_ == "HELLO" -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_fuzzy_disabled(nlp, entity_ruler_factory): +def test_entity_ruler_fuzzy_disabled(nlp): @registry.misc("test_fuzzy_compare_disabled") def make_test_fuzzy_compare_disabled(): return lambda x, y, z: False ruler = nlp.add_pipe( - entity_ruler_factory, - name="entity_ruler", + "entity_ruler", config={"matcher_fuzzy_compare": {"@misc": "test_fuzzy_compare_disabled"}}, ) patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}] @@ -420,14 +387,13 @@ def test_entity_ruler_fuzzy_disabled(nlp, entity_ruler_factory): @pytest.mark.parametrize("n_process", [1, 2]) -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_multiprocessing(nlp, n_process, entity_ruler_factory): +def test_entity_ruler_multiprocessing(nlp, n_process): if isinstance(get_current_ops, NumpyOps) or n_process < 2: texts = ["I enjoy eating Pizza Hut pizza."] patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut", "id": "1234"}] - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") + ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) for doc in nlp.pipe(texts, n_process=2): @@ -435,9 +401,8 @@ def test_entity_ruler_multiprocessing(nlp, n_process, entity_ruler_factory): assert ent.ent_id_ == "1234" -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_serialize_jsonl(nlp, patterns, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_serialize_jsonl(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) with make_tempdir() as d: ruler.to_disk(d / "test_ruler.jsonl") @@ -446,9 +411,8 @@ def test_entity_ruler_serialize_jsonl(nlp, patterns, entity_ruler_factory): ruler.from_disk(d / "non_existing.jsonl") # read from a bad jsonl file -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_serialize_dir(nlp, patterns, entity_ruler_factory): - ruler = 
nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_serialize_dir(nlp, patterns): + ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) with make_tempdir() as d: ruler.to_disk(d / "test_ruler") @@ -457,9 +421,8 @@ def test_entity_ruler_serialize_dir(nlp, patterns, entity_ruler_factory): ruler.from_disk(d / "non_existing_dir") # read from a bad directory -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_remove_basic(nlp, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_remove_basic(nlp): + ruler = nlp.add_pipe("entity_ruler") patterns = [ {"label": "PERSON", "pattern": "Dina", "id": "dina"}, {"label": "ORG", "pattern": "ACME", "id": "acme"}, @@ -469,24 +432,16 @@ def test_entity_ruler_remove_basic(nlp, entity_ruler_factory): doc = nlp("Dina went to school") assert len(ruler.patterns) == 3 assert len(doc.ents) == 1 - if isinstance(ruler, EntityRuler): - assert "PERSON||dina" in ruler.phrase_matcher assert doc.ents[0].label_ == "PERSON" assert doc.ents[0].text == "Dina" - if isinstance(ruler, EntityRuler): - ruler.remove("dina") - else: - ruler.remove_by_id("dina") + ruler.remove_by_id("dina") doc = nlp("Dina went to school") assert len(doc.ents) == 0 - if isinstance(ruler, EntityRuler): - assert "PERSON||dina" not in ruler.phrase_matcher assert len(ruler.patterns) == 2 -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_remove_same_id_multiple_patterns(nlp, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_remove_same_id_multiple_patterns(nlp): + ruler = nlp.add_pipe("entity_ruler") patterns = [ {"label": "PERSON", "pattern": "Dina", "id": "dina"}, {"label": "ORG", "pattern": "DinaCorp", "id": "dina"}, @@ -495,25 +450,15 @@ def test_entity_ruler_remove_same_id_multiple_patterns(nlp, entity_ruler_factory ruler.add_patterns(patterns) doc = nlp("Dina founded DinaCorp and ACME.") assert len(ruler.patterns) == 3 - if isinstance(ruler, EntityRuler): - assert "PERSON||dina" in ruler.phrase_matcher - assert "ORG||dina" in ruler.phrase_matcher assert len(doc.ents) == 3 - if isinstance(ruler, EntityRuler): - ruler.remove("dina") - else: - ruler.remove_by_id("dina") + ruler.remove_by_id("dina") doc = nlp("Dina founded DinaCorp and ACME.") assert len(ruler.patterns) == 1 - if isinstance(ruler, EntityRuler): - assert "PERSON||dina" not in ruler.phrase_matcher - assert "ORG||dina" not in ruler.phrase_matcher assert len(doc.ents) == 1 -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_remove_nonexisting_pattern(nlp, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_remove_nonexisting_pattern(nlp): + ruler = nlp.add_pipe("entity_ruler") patterns = [ {"label": "PERSON", "pattern": "Dina", "id": "dina"}, {"label": "ORG", "pattern": "ACME", "id": "acme"}, @@ -528,9 +473,8 @@ def test_entity_ruler_remove_nonexisting_pattern(nlp, entity_ruler_factory): ruler.remove_by_id("nepattern") -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_remove_several_patterns(nlp, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_remove_several_patterns(nlp): + ruler = nlp.add_pipe("entity_ruler") patterns = [ {"label": "PERSON", "pattern": "Dina", "id": "dina"}, {"label": "ORG", "pattern": "ACME", 
"id": "acme"}, @@ -544,27 +488,20 @@ def test_entity_ruler_remove_several_patterns(nlp, entity_ruler_factory): assert doc.ents[0].text == "Dina" assert doc.ents[1].label_ == "ORG" assert doc.ents[1].text == "ACME" - if isinstance(ruler, EntityRuler): - ruler.remove("dina") - else: - ruler.remove_by_id("dina") + ruler.remove_by_id("dina") doc = nlp("Dina founded her company ACME") assert len(ruler.patterns) == 2 assert len(doc.ents) == 1 assert doc.ents[0].label_ == "ORG" assert doc.ents[0].text == "ACME" - if isinstance(ruler, EntityRuler): - ruler.remove("acme") - else: - ruler.remove_by_id("acme") + ruler.remove_by_id("acme") doc = nlp("Dina founded her company ACME") assert len(ruler.patterns) == 1 assert len(doc.ents) == 0 -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_remove_patterns_in_a_row(nlp, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_remove_patterns_in_a_row(nlp): + ruler = nlp.add_pipe("entity_ruler") patterns = [ {"label": "PERSON", "pattern": "Dina", "id": "dina"}, {"label": "ORG", "pattern": "ACME", "id": "acme"}, @@ -580,21 +517,15 @@ def test_entity_ruler_remove_patterns_in_a_row(nlp, entity_ruler_factory): assert doc.ents[1].text == "ACME" assert doc.ents[2].label_ == "DATE" assert doc.ents[2].text == "her birthday" - if isinstance(ruler, EntityRuler): - ruler.remove("dina") - ruler.remove("acme") - ruler.remove("bday") - else: - ruler.remove_by_id("dina") - ruler.remove_by_id("acme") - ruler.remove_by_id("bday") + ruler.remove_by_id("dina") + ruler.remove_by_id("acme") + ruler.remove_by_id("bday") doc = nlp("Dina went to school") assert len(doc.ents) == 0 -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_remove_all_patterns(nlp, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_remove_all_patterns(nlp): + ruler = nlp.add_pipe("entity_ruler") patterns = [ {"label": "PERSON", "pattern": "Dina", "id": "dina"}, {"label": "ORG", "pattern": "ACME", "id": "acme"}, @@ -602,29 +533,19 @@ def test_entity_ruler_remove_all_patterns(nlp, entity_ruler_factory): ] ruler.add_patterns(patterns) assert len(ruler.patterns) == 3 - if isinstance(ruler, EntityRuler): - ruler.remove("dina") - else: - ruler.remove_by_id("dina") + ruler.remove_by_id("dina") assert len(ruler.patterns) == 2 - if isinstance(ruler, EntityRuler): - ruler.remove("acme") - else: - ruler.remove_by_id("acme") + ruler.remove_by_id("acme") assert len(ruler.patterns) == 1 - if isinstance(ruler, EntityRuler): - ruler.remove("bday") - else: - ruler.remove_by_id("bday") + ruler.remove_by_id("bday") assert len(ruler.patterns) == 0 with pytest.warns(UserWarning): doc = nlp("Dina founded her company ACME on her birthday") assert len(doc.ents) == 0 -@pytest.mark.parametrize("entity_ruler_factory", ENTITY_RULERS) -def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory): - ruler = nlp.add_pipe(entity_ruler_factory, name="entity_ruler") +def test_entity_ruler_remove_and_add(nlp): + ruler = nlp.add_pipe("entity_ruler") patterns = [{"label": "DATE", "pattern": "last time"}] ruler.add_patterns(patterns) doc = ruler( @@ -645,10 +566,7 @@ def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory): assert doc.ents[0].text == "last time" assert doc.ents[1].label_ == "DATE" assert doc.ents[1].text == "this time" - if isinstance(ruler, EntityRuler): - ruler.remove("ttime") - else: - ruler.remove_by_id("ttime") + 
ruler.remove_by_id("ttime") doc = ruler( nlp.make_doc("I saw him last time we met, this time he brought some flowers") ) @@ -671,10 +589,7 @@ def test_entity_ruler_remove_and_add(nlp, entity_ruler_factory): ) assert len(ruler.patterns) == 3 assert len(doc.ents) == 3 - if isinstance(ruler, EntityRuler): - ruler.remove("ttime") - else: - ruler.remove_by_id("ttime") + ruler.remove_by_id("ttime") doc = ruler( nlp.make_doc( "I saw him last time we met, this time he brought some flowers, another time some chocolate." diff --git a/spacy/tests/pipeline/test_models.py b/spacy/tests/pipeline/test_models.py index e3fd28d0f..50ad94422 100644 --- a/spacy/tests/pipeline/test_models.py +++ b/spacy/tests/pipeline/test_models.py @@ -9,7 +9,7 @@ from thinc.types import Array2d, Ragged from spacy.lang.en import English from spacy.ml import FeatureExtractor, StaticVectors -from spacy.ml._character_embed import CharacterEmbed +from spacy.ml.character_embed import CharacterEmbed from spacy.tokens import Doc diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index 33696bfd8..5b9b17c01 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -1,3 +1,4 @@ +from typing import cast import pytest from numpy.testing import assert_equal @@ -7,6 +8,7 @@ from spacy.lang.en import English from spacy.language import Language from spacy.tests.util import make_tempdir from spacy.morphology import Morphology +from spacy.pipeline import TrainablePipe from spacy.attrs import MORPH from spacy.tokens import Doc @@ -48,6 +50,12 @@ def test_implicit_label(): nlp.initialize(get_examples=lambda: train_examples) +def test_is_distillable(): + nlp = English() + morphologizer = nlp.add_pipe("morphologizer") + assert morphologizer.is_distillable + + def test_no_resize(): nlp = Language() morphologizer = nlp.add_pipe("morphologizer") @@ -197,3 +205,25 @@ def test_overfitting_IO(): gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"] assert [str(t.morph) for t in doc] == gold_morphs assert [t.pos_ for t in doc] == gold_pos_tags + + +def test_save_activations(): + nlp = English() + morphologizer = cast(TrainablePipe, nlp.add_pipe("morphologizer")) + train_examples = [] + for inst in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1])) + nlp.initialize(get_examples=lambda: train_examples) + + doc = nlp("This is a test.") + assert "morphologizer" not in doc.activations + + morphologizer.save_activations = True + doc = nlp("This is a test.") + assert "morphologizer" in doc.activations + assert set(doc.activations["morphologizer"].keys()) == { + "label_ids", + "probabilities", + } + assert doc.activations["morphologizer"]["probabilities"].shape == (5, 6) + assert doc.activations["morphologizer"]["label_ids"].shape == (5,) diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 4dd7bae16..9b9786f04 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -529,17 +529,6 @@ def test_pipe_label_data_no_labels(pipe): assert "labels" not in get_arg_names(initialize) -def test_warning_pipe_begin_training(): - with pytest.warns(UserWarning, match="begin_training"): - - class IncompatPipe(TrainablePipe): - def __init__(self): - ... - - def begin_training(*args, **kwargs): - ... 
- - def test_pipe_methods_initialize(): """Test that the [initialize] config reflects the components correctly.""" nlp = Language() diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index 047f59bef..a771d62fa 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -1,3 +1,4 @@ +from typing import cast import pytest from numpy.testing import assert_equal from spacy.attrs import SENT_START @@ -6,9 +7,16 @@ from spacy import util from spacy.training import Example from spacy.lang.en import English from spacy.language import Language +from spacy.pipeline import TrainablePipe from spacy.tests.util import make_tempdir +def test_is_distillable(): + nlp = English() + senter = nlp.add_pipe("senter") + assert senter.is_distillable + + def test_label_types(): nlp = Language() senter = nlp.add_pipe("senter") @@ -101,3 +109,26 @@ def test_overfitting_IO(): # test internal pipe labels vs. Language.pipe_labels with hidden labels assert nlp.get_pipe("senter").labels == ("I", "S") assert "senter" not in nlp.pipe_labels + + +def test_save_activations(): + # Test if activations are correctly added to Doc when requested. + nlp = English() + senter = cast(TrainablePipe, nlp.add_pipe("senter")) + + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + + nlp.initialize(get_examples=lambda: train_examples) + nO = senter.model.get_dim("nO") + + doc = nlp("This is a test.") + assert "senter" not in doc.activations + + senter.save_activations = True + doc = nlp("This is a test.") + assert "senter" in doc.activations + assert set(doc.activations["senter"].keys()) == {"label_ids", "probabilities"} + assert doc.activations["senter"]["probabilities"].shape == (5, nO) + assert doc.activations["senter"]["label_ids"].shape == (5,) diff --git a/spacy/tests/pipeline/test_span_ruler.py b/spacy/tests/pipeline/test_span_ruler.py index 794815359..fe3bdd1bf 100644 --- a/spacy/tests/pipeline/test_span_ruler.py +++ b/spacy/tests/pipeline/test_span_ruler.py @@ -47,7 +47,7 @@ def person_org_date_patterns(person_org_patterns): def test_span_ruler_add_empty(patterns): """Test that patterns don't get added excessively.""" - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler", config={"validate": True}) ruler.add_patterns(patterns) pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) @@ -58,7 +58,7 @@ def test_span_ruler_add_empty(patterns): def test_span_ruler_init(patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(patterns) assert len(ruler) == len(patterns) @@ -74,7 +74,7 @@ def test_span_ruler_init(patterns): def test_span_ruler_no_patterns_warns(): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") assert len(ruler) == 0 assert len(ruler.labels) == 0 @@ -86,7 +86,7 @@ def test_span_ruler_no_patterns_warns(): def test_span_ruler_init_patterns(patterns): # initialize with patterns - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") assert len(ruler.labels) == 0 ruler.initialize(lambda: [], patterns=patterns) @@ -110,7 +110,7 @@ def test_span_ruler_init_patterns(patterns): def test_span_ruler_init_clear(patterns): """Test that initialization clears patterns.""" - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(patterns) assert len(ruler.labels) == 4 @@ -119,7 
+119,7 @@ def test_span_ruler_init_clear(patterns): def test_span_ruler_clear(patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(patterns) assert len(ruler.labels) == 4 @@ -133,7 +133,7 @@ def test_span_ruler_clear(patterns): def test_span_ruler_existing(patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler", config={"overwrite": False}) ruler.add_patterns(patterns) doc = nlp.make_doc("OH HELLO WORLD bye bye") @@ -148,7 +148,7 @@ def test_span_ruler_existing(patterns): def test_span_ruler_existing_overwrite(patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler", config={"overwrite": True}) ruler.add_patterns(patterns) doc = nlp.make_doc("OH HELLO WORLD bye bye") @@ -161,13 +161,13 @@ def test_span_ruler_existing_overwrite(patterns): def test_span_ruler_serialize_bytes(patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(patterns) assert len(ruler) == len(patterns) assert len(ruler.labels) == 4 ruler_bytes = ruler.to_bytes() - new_nlp = spacy.blank("xx") + new_nlp = spacy.blank("mul") new_ruler = new_nlp.add_pipe("span_ruler") assert len(new_ruler) == 0 assert len(new_ruler.labels) == 0 @@ -181,7 +181,7 @@ def test_span_ruler_serialize_bytes(patterns): def test_span_ruler_validate(): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") validated_ruler = nlp.add_pipe( "span_ruler", name="validated_span_ruler", config={"validate": True} @@ -203,14 +203,14 @@ def test_span_ruler_validate(): def test_span_ruler_properties(patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler", config={"overwrite": True}) ruler.add_patterns(patterns) assert sorted(ruler.labels) == sorted(set([p["label"] for p in patterns])) def test_span_ruler_overlapping_spans(overlapping_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(overlapping_patterns) doc = ruler(nlp.make_doc("foo bar baz")) @@ -220,7 +220,7 @@ def test_span_ruler_overlapping_spans(overlapping_patterns): def test_span_ruler_scorer(overlapping_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(overlapping_patterns) text = "foo bar baz" @@ -243,7 +243,7 @@ def test_span_ruler_multiprocessing(n_process): patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut"}] - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(patterns) @@ -253,7 +253,7 @@ def test_span_ruler_multiprocessing(n_process): def test_span_ruler_serialize_dir(patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(patterns) with make_tempdir() as d: @@ -264,7 +264,7 @@ def test_span_ruler_serialize_dir(patterns): def test_span_ruler_remove_basic(person_org_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(person_org_patterns) doc = ruler(nlp.make_doc("Dina went to school")) @@ -279,7 +279,7 @@ def test_span_ruler_remove_basic(person_org_patterns): def test_span_ruler_remove_nonexisting_pattern(person_org_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(person_org_patterns) assert len(ruler.patterns) == 3 @@ -290,7 +290,7 
@@ def test_span_ruler_remove_nonexisting_pattern(person_org_patterns): def test_span_ruler_remove_several_patterns(person_org_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(person_org_patterns) doc = ruler(nlp.make_doc("Dina founded the company ACME.")) @@ -314,7 +314,7 @@ def test_span_ruler_remove_several_patterns(person_org_patterns): def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(person_org_date_patterns) doc = ruler(nlp.make_doc("Dina founded the company ACME on June 14th")) @@ -332,7 +332,7 @@ def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns): def test_span_ruler_remove_all_patterns(person_org_date_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") ruler.add_patterns(person_org_date_patterns) assert len(ruler.patterns) == 4 @@ -348,7 +348,7 @@ def test_span_ruler_remove_all_patterns(person_org_date_patterns): def test_span_ruler_remove_and_add(): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler") patterns1 = [{"label": "DATE1", "pattern": "last time"}] ruler.add_patterns(patterns1) @@ -404,7 +404,7 @@ def test_span_ruler_remove_and_add(): def test_span_ruler_spans_filter(overlapping_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe( "span_ruler", config={"spans_filter": {"@misc": "spacy.first_longest_spans_filter.v1"}}, @@ -416,7 +416,7 @@ def test_span_ruler_spans_filter(overlapping_patterns): def test_span_ruler_ents_default_filter(overlapping_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe("span_ruler", config={"annotate_ents": True}) ruler.add_patterns(overlapping_patterns) doc = ruler(nlp.make_doc("foo bar baz")) @@ -425,7 +425,7 @@ def test_span_ruler_ents_default_filter(overlapping_patterns): def test_span_ruler_ents_overwrite_filter(overlapping_patterns): - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe( "span_ruler", config={ @@ -452,7 +452,7 @@ def test_span_ruler_ents_bad_filter(overlapping_patterns): return pass_through_filter - nlp = spacy.blank("xx") + nlp = spacy.blank("mul") ruler = nlp.add_pipe( "span_ruler", config={ diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py index e9db983d3..da9bffbc8 100644 --- a/spacy/tests/pipeline/test_spancat.py +++ b/spacy/tests/pipeline/test_spancat.py @@ -1,15 +1,15 @@ import pytest import numpy from numpy.testing import assert_array_equal, assert_almost_equal -from thinc.api import get_current_ops, Ragged +from thinc.api import get_current_ops, Ragged, fix_random_seed from spacy import util from spacy.lang.en import English from spacy.language import Language from spacy.tokens import SpanGroup -from spacy.tokens._dict_proxies import SpanGroups +from spacy.tokens.span_groups import SpanGroups from spacy.training import Example -from spacy.util import fix_random_seed, registry, make_tempdir +from spacy.util import registry, make_tempdir OPS = get_current_ops() @@ -444,3 +444,23 @@ def test_set_candidates(): assert len(docs[0].spans["candidates"]) == 9 assert docs[0].spans["candidates"][0].text == "Just" assert docs[0].spans["candidates"][4].text == "Just a" + + +def test_save_activations(): + # Test if activations are correctly added to Doc when requested. 
+ nlp = English() + spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + train_examples = make_examples(nlp) + nlp.initialize(get_examples=lambda: train_examples) + nO = spancat.model.get_dim("nO") + assert nO == 2 + assert set(spancat.labels) == {"LOC", "PERSON"} + + doc = nlp("This is a test.") + assert "spancat" not in doc.activations + + spancat.save_activations = True + doc = nlp("This is a test.") + assert set(doc.activations["spancat"].keys()) == {"indices", "scores"} + assert doc.activations["spancat"]["indices"].shape == (12, 2) + assert doc.activations["spancat"]["scores"].shape == (12, nO) diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index 96e75851e..505b41f8c 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -1,3 +1,4 @@ +from typing import cast import pytest from numpy.testing import assert_equal from spacy.attrs import TAG @@ -6,6 +7,7 @@ from spacy import util from spacy.training import Example from spacy.lang.en import English from spacy.language import Language +from spacy.pipeline import TrainablePipe from thinc.api import compounding from ..util import make_tempdir @@ -22,7 +24,9 @@ def test_issue4348(): optimizer = nlp.initialize() for i in range(5): losses = {} - batches = util.minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) + batches = util.minibatch( + TRAIN_DATA, size=compounding(4.0, 32.0, 1.001).to_generator() + ) for batch in batches: nlp.update(batch, sgd=optimizer, losses=losses) @@ -211,6 +215,72 @@ def test_overfitting_IO(): assert doc3[0].tag_ != "N" +def test_is_distillable(): + nlp = English() + tagger = nlp.add_pipe("tagger") + assert tagger.is_distillable + + +def test_distill(): + teacher = English() + teacher_tagger = teacher.add_pipe("tagger") + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1])) + + optimizer = teacher.initialize(get_examples=lambda: train_examples) + + for i in range(50): + losses = {} + teacher.update(train_examples, sgd=optimizer, losses=losses) + assert losses["tagger"] < 0.00001 + + student = English() + student_tagger = student.add_pipe("tagger") + student_tagger.min_tree_freq = 1 + student_tagger.initialize( + get_examples=lambda: train_examples, labels=teacher_tagger.label_data + ) + + distill_examples = [ + Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA + ] + + for i in range(50): + losses = {} + student_tagger.distill( + teacher_tagger, distill_examples, sgd=optimizer, losses=losses + ) + assert losses["tagger"] < 0.00001 + + test_text = "I like blue eggs" + doc = student(test_text) + assert doc[0].tag_ == "N" + assert doc[1].tag_ == "V" + assert doc[2].tag_ == "J" + assert doc[3].tag_ == "N" + + +def test_save_activations(): + # Test if activations are correctly added to Doc when requested. 
+ nlp = English() + tagger = cast(TrainablePipe, nlp.add_pipe("tagger")) + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + nlp.initialize(get_examples=lambda: train_examples) + + doc = nlp("This is a test.") + assert "tagger" not in doc.activations + + tagger.save_activations = True + doc = nlp("This is a test.") + assert "tagger" in doc.activations + assert set(doc.activations["tagger"].keys()) == {"label_ids", "probabilities"} + assert doc.activations["tagger"]["probabilities"].shape == (5, len(TAGS)) + assert doc.activations["tagger"]["label_ids"].shape == (5,) + + def test_tagger_requires_labels(): nlp = English() nlp.add_pipe("tagger") diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index d042f3445..506897a45 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -1,3 +1,4 @@ +from typing import cast import random import numpy.random @@ -11,7 +12,7 @@ from spacy import util from spacy.cli.evaluate import print_prf_per_type, print_textcats_auc_per_cat from spacy.lang.en import English from spacy.language import Language -from spacy.pipeline import TextCategorizer +from spacy.pipeline import TextCategorizer, TrainablePipe from spacy.pipeline.textcat import single_label_bow_config from spacy.pipeline.textcat import single_label_cnn_config from spacy.pipeline.textcat import single_label_default_config @@ -90,7 +91,9 @@ def test_issue3611(): optimizer = nlp.initialize() for i in range(3): losses = {} - batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + batches = util.minibatch( + train_data, size=compounding(4.0, 32.0, 1.001).to_generator() + ) for batch in batches: nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses) @@ -127,7 +130,9 @@ def test_issue4030(): optimizer = nlp.initialize() for i in range(3): losses = {} - batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + batches = util.minibatch( + train_data, size=compounding(4.0, 32.0, 1.001).to_generator() + ) for batch in batches: nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses) @@ -285,7 +290,7 @@ def test_issue9904(): nlp.initialize(get_examples) examples = get_examples() - scores = textcat.predict([eg.predicted for eg in examples]) + scores = textcat.predict([eg.predicted for eg in examples])["probabilities"] loss = textcat.get_loss(examples, scores)[0] loss_double_bs = textcat.get_loss(examples * 2, scores.repeat(2, axis=0))[0] @@ -564,6 +569,12 @@ def test_initialize_examples(name, get_examples, train_data): nlp.initialize(get_examples=get_examples()) +def test_is_distillable(): + nlp = English() + textcat = nlp.add_pipe("textcat") + assert not textcat.is_distillable + + def test_overfitting_IO(): # Simple test to try and quickly overfit the single-label textcat component - ensuring the ML models work correctly fix_random_seed(0) @@ -897,6 +908,44 @@ def test_textcat_multi_threshold(): assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0 +def test_save_activations(): + nlp = English() + textcat = cast(TrainablePipe, nlp.add_pipe("textcat")) + + train_examples = [] + for text, annotations in TRAIN_DATA_SINGLE_LABEL: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + nlp.initialize(get_examples=lambda: train_examples) + nO = textcat.model.get_dim("nO") + + doc = nlp("This is a test.") + assert "textcat" not in doc.activations + + textcat.save_activations = True + doc = nlp("This 
is a test.") + assert list(doc.activations["textcat"].keys()) == ["probabilities"] + assert doc.activations["textcat"]["probabilities"].shape == (nO,) + + +def test_save_activations_multi(): + nlp = English() + textcat = cast(TrainablePipe, nlp.add_pipe("textcat_multilabel")) + + train_examples = [] + for text, annotations in TRAIN_DATA_MULTI_LABEL: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + nlp.initialize(get_examples=lambda: train_examples) + nO = textcat.model.get_dim("nO") + + doc = nlp("This is a test.") + assert "textcat_multilabel" not in doc.activations + + textcat.save_activations = True + doc = nlp("This is a test.") + assert list(doc.activations["textcat_multilabel"].keys()) == ["probabilities"] + assert doc.activations["textcat_multilabel"]["probabilities"].shape == (nO,) + + @pytest.mark.parametrize( "component_name,scorer", [ diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index e423d9a19..6929b76fa 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -382,7 +382,7 @@ cfg_string_multi = """ factory = "ner" [components.ner.model] - @architectures = "spacy.TransitionBasedParser.v2" + @architectures = "spacy.TransitionBasedParser.v3" [components.ner.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" @@ -540,3 +540,86 @@ def test_tok2vec_listeners_textcat(): assert cats1["imperative"] < 0.9 assert [t.tag_ for t in docs[0]] == ["V", "J", "N"] assert [t.tag_ for t in docs[1]] == ["N", "V", "J", "N"] + + +cfg_string_distillation = """ + [nlp] + lang = "en" + pipeline = ["tok2vec","tagger"] + + [components] + + [components.tagger] + factory = "tagger" + + [components.tagger.model] + @architectures = "spacy.Tagger.v2" + nO = null + + [components.tagger.model.tok2vec] + @architectures = "spacy.Tok2VecListener.v1" + width = ${components.tok2vec.model.encode.width} + + [components.tok2vec] + factory = "tok2vec" + + [components.tok2vec.model] + @architectures = "spacy.Tok2Vec.v2" + + [components.tok2vec.model.embed] + @architectures = "spacy.MultiHashEmbed.v2" + width = ${components.tok2vec.model.encode.width} + rows = [2000, 1000, 1000, 1000] + attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] + include_static_vectors = false + + [components.tok2vec.model.encode] + @architectures = "spacy.MaxoutWindowEncoder.v2" + width = 96 + depth = 4 + window_size = 1 + maxout_pieces = 3 + """ + + +def test_tok2vec_distillation_teacher_annotations(): + orig_config = Config().from_str(cfg_string_distillation) + teacher_nlp = util.load_model_from_config( + orig_config, auto_fill=True, validate=True + ) + student_nlp = util.load_model_from_config( + orig_config, auto_fill=True, validate=True + ) + + train_examples_teacher = [] + train_examples_student = [] + for t in TRAIN_DATA: + train_examples_teacher.append( + Example.from_dict(teacher_nlp.make_doc(t[0]), t[1]) + ) + train_examples_student.append( + Example.from_dict(student_nlp.make_doc(t[0]), t[1]) + ) + + optimizer = teacher_nlp.initialize(lambda: train_examples_teacher) + student_nlp.initialize(lambda: train_examples_student) + + # Since Language.distill creates a copy of the examples to use as + # its internal teacher/student docs, we'll need to monkey-patch the + # tok2vec pipe's distill method. 
+ student_tok2vec = student_nlp.get_pipe("tok2vec") + student_tok2vec._old_distill = student_tok2vec.distill + + def tok2vec_distill_wrapper( + self, + teacher_pipe, + examples, + **kwargs, + ): + assert all(not eg.reference.tensor.any() for eg in examples) + out = self._old_distill(teacher_pipe, examples, **kwargs) + assert all(eg.reference.tensor.any() for eg in examples) + return out + + student_tok2vec.distill = tok2vec_distill_wrapper.__get__(student_tok2vec, Tok2Vec) + student_nlp.distill(teacher_nlp, train_examples_student, sgd=optimizer, losses={}) diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 85e6f8b2c..6eb95001a 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -6,10 +6,11 @@ import spacy from spacy.lang.de import German from spacy.lang.en import English from spacy.language import DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH +from spacy.language import DEFAULT_CONFIG_DISTILL_PATH from spacy.language import Language from spacy.ml.models import MaxoutWindowEncoder, MultiHashEmbed from spacy.ml.models import build_tb_parser_model, build_Tok2Vec_model -from spacy.schemas import ConfigSchema, ConfigSchemaPretrain +from spacy.schemas import ConfigSchema, ConfigSchemaDistill, ConfigSchemaPretrain from spacy.util import load_config, load_config_from_str from spacy.util import load_model_from_config, registry @@ -66,6 +67,60 @@ factory = "tagger" width = ${components.tok2vec.model.width} """ +distill_config_string = """ +[paths] +train = null +dev = null + +[corpora] + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} + +[training] + +[training.batcher] +@batchers = "spacy.batch_by_words.v1" +size = 666 + +[nlp] +lang = "en" +pipeline = ["tok2vec", "tagger"] + +[components] + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 342 +depth = 4 +window_size = 1 +embed_size = 2000 +maxout_pieces = 3 +subword_features = true + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v2" + +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.width} + +[distill] +""" + + pretrain_config_string = """ [paths] train = null @@ -122,33 +177,11 @@ width = ${components.tok2vec.model.width} parser_config_string_upper = """ [model] -@architectures = "spacy.TransitionBasedParser.v2" +@architectures = "spacy.TransitionBasedParser.v3" state_type = "parser" extra_state_tokens = false hidden_width = 66 maxout_pieces = 2 -use_upper = true - -[model.tok2vec] -@architectures = "spacy.HashEmbedCNN.v1" -pretrained_vectors = null -width = 333 -depth = 4 -embed_size = 5555 -window_size = 1 -maxout_pieces = 7 -subword_features = false -""" - - -parser_config_string_no_upper = """ -[model] -@architectures = "spacy.TransitionBasedParser.v2" -state_type = "parser" -extra_state_tokens = false -hidden_width = 66 -maxout_pieces = 2 -use_upper = false [model.tok2vec] @architectures = "spacy.HashEmbedCNN.v1" @@ -179,7 +212,6 @@ def my_parser(): extra_state_tokens=True, hidden_width=65, maxout_pieces=5, - use_upper=True, ) return parser @@ -224,6 +256,14 @@ def test_create_nlp_from_config(): load_model_from_config(Config(bad_cfg), auto_fill=True) +def test_nlp_from_distillation_config(): + """Test that 
the default distillation config validates properly""" + config = Config().from_str(distill_config_string) + distill_config = load_config(DEFAULT_CONFIG_DISTILL_PATH) + filled = config.merge(distill_config) + registry.resolve(filled["distillation"], schema=ConfigSchemaDistill) + + def test_create_nlp_from_pretraining_config(): """Test that the default pretraining config validates properly""" config = Config().from_str(pretrain_config_string) @@ -285,15 +325,16 @@ def test_serialize_custom_nlp(): nlp.to_disk(d) nlp2 = spacy.load(d) model = nlp2.get_pipe("parser").model - model.get_ref("tok2vec") - # check that we have the correct settings, not the default ones - assert model.get_ref("upper").get_dim("nI") == 65 - assert model.get_ref("lower").get_dim("nI") == 65 + assert model.get_ref("tok2vec") is not None + assert model.has_param("hidden_W") + assert model.has_param("hidden_b") + output = model.get_ref("output") + assert output is not None + assert output.has_param("W") + assert output.has_param("b") -@pytest.mark.parametrize( - "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper] -) +@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper]) def test_serialize_parser(parser_config_string): """Create a non-default parser config to check nlp serializes it correctly""" nlp = English() @@ -306,11 +347,13 @@ def test_serialize_parser(parser_config_string): nlp.to_disk(d) nlp2 = spacy.load(d) model = nlp2.get_pipe("parser").model - model.get_ref("tok2vec") - # check that we have the correct settings, not the default ones - if model.attrs["has_upper"]: - assert model.get_ref("upper").get_dim("nI") == 66 - assert model.get_ref("lower").get_dim("nI") == 66 + assert model.get_ref("tok2vec") is not None + assert model.has_param("hidden_W") + assert model.has_param("hidden_b") + output = model.get_ref("output") + assert output is not None + assert output.has_param("b") + assert output.has_param("W") def test_config_nlp_roundtrip(): @@ -457,9 +500,7 @@ def test_config_auto_fill_extra_fields(): load_model_from_config(nlp.config) -@pytest.mark.parametrize( - "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper] -) +@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper]) def test_config_validate_literal(parser_config_string): nlp = English() config = Config().from_str(parser_config_string) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 8d3653ab1..f9d2e226b 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -1,7 +1,10 @@ -from typing import Callable +from pathlib import Path +from typing import Callable, Iterable, Any, Dict -from spacy import util -from spacy.util import ensure_path, registry, load_model_from_config +import srsly + +from spacy import util, Errors +from spacy.util import ensure_path, registry, load_model_from_config, SimpleFrozenList from spacy.kb.kb_in_memory import InMemoryLookupKB from spacy.vocab import Vocab from thinc.api import Config @@ -91,7 +94,10 @@ def test_serialize_subclassed_kb(): [components.entity_linker] factory = "entity_linker" - + + [components.entity_linker.generate_empty_kb] + @misc = "kb_test.CustomEmptyKB.v1" + [initialize] [initialize.components] @@ -99,7 +105,7 @@ def test_serialize_subclassed_kb(): [initialize.components.entity_linker] [initialize.components.entity_linker.kb_loader] - @misc = "spacy.CustomKB.v1" + @misc = "kb_test.CustomKB.v1" 
entity_vector_length = 342 custom_field = 666 """ @@ -109,10 +115,57 @@ def test_serialize_subclassed_kb(): super().__init__(vocab, entity_vector_length) self.custom_field = custom_field - @registry.misc("spacy.CustomKB.v1") + def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()): + """We overwrite InMemoryLookupKB.to_disk() to ensure that self.custom_field is stored as well.""" + path = ensure_path(path) + if not path.exists(): + path.mkdir(parents=True) + if not path.is_dir(): + raise ValueError(Errors.E928.format(loc=path)) + + def serialize_custom_fields(file_path: Path) -> None: + srsly.write_json(file_path, {"custom_field": self.custom_field}) + + serialize = { + "contents": lambda p: self.write_contents(p), + "strings.json": lambda p: self.vocab.strings.to_disk(p), + "custom_fields": lambda p: serialize_custom_fields(p), + } + util.to_disk(path, serialize, exclude) + + def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()): + """We overwrite InMemoryLookupKB.from_disk() to ensure that self.custom_field is loaded as well.""" + path = ensure_path(path) + if not path.exists(): + raise ValueError(Errors.E929.format(loc=path)) + if not path.is_dir(): + raise ValueError(Errors.E928.format(loc=path)) + + def deserialize_custom_fields(file_path: Path) -> None: + self.custom_field = srsly.read_json(file_path)["custom_field"] + + deserialize: Dict[str, Callable[[Any], Any]] = { + "contents": lambda p: self.read_contents(p), + "strings.json": lambda p: self.vocab.strings.from_disk(p), + "custom_fields": lambda p: deserialize_custom_fields(p), + } + util.from_disk(path, deserialize, exclude) + + @registry.misc("kb_test.CustomEmptyKB.v1") + def empty_custom_kb() -> Callable[[Vocab, int], SubInMemoryLookupKB]: + def empty_kb_factory(vocab: Vocab, entity_vector_length: int): + return SubInMemoryLookupKB( + vocab=vocab, + entity_vector_length=entity_vector_length, + custom_field=0, + ) + + return empty_kb_factory + + @registry.misc("kb_test.CustomKB.v1") def custom_kb( entity_vector_length: int, custom_field: int - ) -> Callable[[Vocab], InMemoryLookupKB]: + ) -> Callable[[Vocab], SubInMemoryLookupKB]: def custom_kb_factory(vocab): kb = SubInMemoryLookupKB( vocab=vocab, @@ -139,6 +192,6 @@ def test_serialize_subclassed_kb(): nlp2 = util.load_model_from_path(tmp_dir) entity_linker2 = nlp2.get_pipe("entity_linker") # After IO, the KB is the standard one - assert type(entity_linker2.kb) == InMemoryLookupKB + assert type(entity_linker2.kb) == SubInMemoryLookupKB assert entity_linker2.kb.entity_vector_length == 342 - assert not hasattr(entity_linker2.kb, "custom_field") + assert entity_linker2.kb.custom_field == 666 diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 9fcf18e2d..4720bc4da 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -8,7 +8,7 @@ import spacy from spacy import Vocab, load, registry from spacy.lang.en import English from spacy.language import Language -from spacy.pipeline import DependencyParser, EntityRecognizer, EntityRuler +from spacy.pipeline import DependencyParser, EntityRecognizer from spacy.pipeline import SentenceRecognizer, Tagger, TextCategorizer from spacy.pipeline import TrainablePipe from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL @@ -85,58 +85,17 @@ def test_issue_3526_1(en_vocab): {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, ] nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, 
patterns=patterns, overwrite_ents=True) + ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True}) + ruler.add_patterns(patterns) ruler_bytes = ruler.to_bytes() assert len(ruler) == len(patterns) assert len(ruler.labels) == 4 - assert ruler.overwrite - new_ruler = EntityRuler(nlp) + new_ruler = nlp.add_pipe( + "entity_ruler", name="new_ruler", config={"overwrite_ents": True} + ) new_ruler = new_ruler.from_bytes(ruler_bytes) assert len(new_ruler) == len(ruler) assert len(new_ruler.labels) == 4 - assert new_ruler.overwrite == ruler.overwrite - assert new_ruler.ent_id_sep == ruler.ent_id_sep - - -@pytest.mark.issue(3526) -def test_issue_3526_2(en_vocab): - patterns = [ - {"label": "HELLO", "pattern": "hello world"}, - {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, - {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, - {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, - {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, - ] - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - bytes_old_style = srsly.msgpack_dumps(ruler.patterns) - new_ruler = EntityRuler(nlp) - new_ruler = new_ruler.from_bytes(bytes_old_style) - assert len(new_ruler) == len(ruler) - for pattern in ruler.patterns: - assert pattern in new_ruler.patterns - assert new_ruler.overwrite is not ruler.overwrite - - -@pytest.mark.issue(3526) -def test_issue_3526_3(en_vocab): - patterns = [ - {"label": "HELLO", "pattern": "hello world"}, - {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, - {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, - {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, - {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, - ] - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - with make_tempdir() as tmpdir: - out_file = tmpdir / "entity_ruler" - srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns) - new_ruler = EntityRuler(nlp).from_disk(out_file) - for pattern in ruler.patterns: - assert pattern in new_ruler.patterns - assert len(new_ruler) == len(ruler) - assert new_ruler.overwrite is not ruler.overwrite @pytest.mark.issue(3526) @@ -150,16 +109,14 @@ def test_issue_3526_4(en_vocab): nlp.to_disk(tmpdir) ruler = nlp.get_pipe("entity_ruler") assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] - assert ruler.overwrite is True nlp2 = load(tmpdir) new_ruler = nlp2.get_pipe("entity_ruler") assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] - assert new_ruler.overwrite is True @pytest.mark.issue(4042) def test_issue4042(): - """Test that serialization of an EntityRuler before NER works fine.""" + """Test that serialization of an entity_ruler before NER works fine.""" nlp = English() # add ner pipe ner = nlp.add_pipe("ner") @@ -224,7 +181,7 @@ def test_issue4042_bug2(): @pytest.mark.issue(4725) def test_issue4725_1(): """Ensure the pickling of the NER goes well""" - vocab = Vocab(vectors_name="test_vocab_add_vector") + vocab = Vocab() nlp = English(vocab=vocab) config = { "update_with_oracle_cut_size": 111, diff --git a/spacy/tests/serialize/test_serialize_span_groups.py b/spacy/tests/serialize/test_serialize_span_groups.py index 85313fcdc..c1c910fa1 100644 --- a/spacy/tests/serialize/test_serialize_span_groups.py +++ b/spacy/tests/serialize/test_serialize_span_groups.py @@ -1,7 +1,7 @@ import pytest from spacy.tokens import Span, SpanGroup -from spacy.tokens._dict_proxies import SpanGroups +from 
spacy.tokens.span_groups import SpanGroups @pytest.mark.issue(10685) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index c88e20de2..752750d33 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -2,7 +2,6 @@ import os import math from collections import Counter from typing import Tuple, List, Dict, Any -import pkg_resources import time from pathlib import Path @@ -618,7 +617,6 @@ def test_string_to_list_intify(value): assert string_to_list(value, intify=True) == [1, 2, 3] -@pytest.mark.skip(reason="Temporarily skip for dev version") def test_download_compatibility(): spec = SpecifierSet("==" + about.__version__) spec.prereleases = False @@ -629,7 +627,6 @@ def test_download_compatibility(): assert get_minor_version(about.__version__) == get_minor_version(version) -@pytest.mark.skip(reason="Temporarily skip for dev version") def test_validate_compatibility_table(): spec = SpecifierSet("==" + about.__version__) spec.prereleases = False @@ -1019,8 +1016,6 @@ def test_local_remote_storage_pull_missing(): def test_cli_find_threshold(capsys): - thresholds = numpy.linspace(0, 1, 10) - def make_examples(nlp: Language) -> List[Example]: docs: List[Example] = [] @@ -1076,7 +1071,7 @@ def test_cli_find_threshold(capsys): ) with make_tempdir() as nlp_dir: nlp.to_disk(nlp_dir) - res = find_threshold( + best_threshold, best_score, res = find_threshold( model=nlp_dir, data_path=docs_dir / "docs.spacy", pipe_name="tc_multi", @@ -1084,16 +1079,14 @@ def test_cli_find_threshold(capsys): scores_key="cats_macro_f", silent=True, ) - assert res[0] != thresholds[0] - assert thresholds[0] < res[0] < thresholds[9] - assert res[1] == 1.0 - assert res[2][1.0] == 0.0 + assert best_score == max(res.values()) + assert res[1.0] == 0.0 # Test with spancat. nlp, _ = init_nlp((("spancat", {}),)) with make_tempdir() as nlp_dir: nlp.to_disk(nlp_dir) - res = find_threshold( + best_threshold, best_score, res = find_threshold( model=nlp_dir, data_path=docs_dir / "docs.spacy", pipe_name="spancat", @@ -1101,10 +1094,8 @@ def test_cli_find_threshold(capsys): scores_key="spans_sc_f", silent=True, ) - assert res[0] != thresholds[0] - assert thresholds[0] < res[0] < thresholds[8] - assert res[1] >= 0.6 - assert res[2][1.0] == 0.0 + assert best_score == max(res.values()) + assert res[1.0] == 0.0 # Having multiple textcat_multilabel components should work, since the name has to be specified. 
nlp, _ = init_nlp((("textcat_multilabel", {}),)) @@ -1134,6 +1125,7 @@ def test_cli_find_threshold(capsys): ) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") @pytest.mark.parametrize( "reqs,output", [ @@ -1166,6 +1158,8 @@ def test_cli_find_threshold(capsys): ], ) def test_project_check_requirements(reqs, output): + import pkg_resources + # excessive guard against unlikely package name try: pkg_resources.require("spacyunknowndoesnotexist12345") @@ -1209,3 +1203,69 @@ def test_walk_directory(): assert (len(walk_directory(d, suffix="iob"))) == 2 assert (len(walk_directory(d, suffix="conll"))) == 3 assert (len(walk_directory(d, suffix="pdf"))) == 0 + + +def test_debug_data_trainable_lemmatizer_basic(): + examples = [ + ("She likes green eggs", {"lemmas": ["she", "like", "green", "egg"]}), + ("Eat blue ham", {"lemmas": ["eat", "blue", "ham"]}), + ] + nlp = Language() + train_examples = [] + for t in examples: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + + data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True) + # ref test_edit_tree_lemmatizer::test_initialize_from_labels + # this results in 4 trees + assert len(data["lemmatizer_trees"]) == 4 + + +def test_debug_data_trainable_lemmatizer_partial(): + partial_examples = [ + # partial annotation + ("She likes green eggs", {"lemmas": ["", "like", "green", ""]}), + # misaligned partial annotation + ( + "He hates green eggs", + { + "words": ["He", "hat", "es", "green", "eggs"], + "lemmas": ["", "hat", "e", "green", ""], + }, + ), + ] + nlp = Language() + train_examples = [] + for t in partial_examples: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + + data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True) + assert data["partial_lemma_annotations"] == 2 + + +def test_debug_data_trainable_lemmatizer_low_cardinality(): + low_cardinality_examples = [ + ("She likes green eggs", {"lemmas": ["no", "no", "no", "no"]}), + ("Eat blue ham", {"lemmas": ["no", "no", "no"]}), + ] + nlp = Language() + train_examples = [] + for t in low_cardinality_examples: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + + data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True) + assert data["n_low_cardinality_lemmas"] == 2 + + +def test_debug_data_trainable_lemmatizer_not_annotated(): + unannotated_examples = [ + ("She likes green eggs", {}), + ("Eat blue ham", {}), + ] + nlp = Language() + train_examples = [] + for t in unannotated_examples: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + + data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True) + assert data["no_lemma_annotations"] == 2 diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py index 84b2b8d4d..8aaadf686 100644 --- a/spacy/tests/test_cli_app.py +++ b/spacy/tests/test_cli_app.py @@ -1,9 +1,12 @@ import os from pathlib import Path +import pytest +import srsly from typer.testing import CliRunner +from spacy.tokens import DocBin, Doc from spacy.cli._util import app -from .util import make_tempdir +from .util import make_tempdir, normalize_whitespace def test_convert_auto(): @@ -37,6 +40,189 @@ def test_benchmark_accuracy_alias(): # Verify that the `evaluate` alias works correctly. 
result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"]) result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"]) - assert result_benchmark.stdout == result_evaluate.stdout.replace( - "spacy evaluate", "spacy benchmark accuracy" + assert normalize_whitespace(result_benchmark.stdout) == normalize_whitespace( + result_evaluate.stdout.replace("spacy evaluate", "spacy benchmark accuracy") ) + + +def test_debug_data_trainable_lemmatizer_cli(en_vocab): + train_docs = [ + Doc(en_vocab, words=["I", "like", "cats"], lemmas=["I", "like", "cat"]), + Doc( + en_vocab, + words=["Dogs", "are", "great", "too"], + lemmas=["dog", "be", "great", "too"], + ), + ] + dev_docs = [ + Doc(en_vocab, words=["Cats", "are", "cute"], lemmas=["cat", "be", "cute"]), + Doc(en_vocab, words=["Pets", "are", "great"], lemmas=["pet", "be", "great"]), + ] + with make_tempdir() as d_in: + train_bin = DocBin(docs=train_docs) + train_bin.to_disk(d_in / "train.spacy") + dev_bin = DocBin(docs=dev_docs) + dev_bin.to_disk(d_in / "dev.spacy") + # `debug data` requires an input pipeline config + CliRunner().invoke( + app, + [ + "init", + "config", + f"{d_in}/config.cfg", + "--lang", + "en", + "--pipeline", + "trainable_lemmatizer", + ], + ) + result_debug_data = CliRunner().invoke( + app, + [ + "debug", + "data", + f"{d_in}/config.cfg", + "--paths.train", + f"{d_in}/train.spacy", + "--paths.dev", + f"{d_in}/dev.spacy", + ], + ) + # Instead of checking specific wording of the output, which may change, + # we'll check that this section of the debug output is present. + assert "= Trainable Lemmatizer =" in result_debug_data.stdout + + +# project tests + +SAMPLE_PROJECT = { + "title": "Sample project", + "description": "This is a project for testing", + "assets": [ + { + "dest": "assets/spacy-readme.md", + "url": "https://github.com/explosion/spaCy/raw/dec81508d28b47f09a06203c472b37f00db6c869/README.md", + "checksum": "411b2c89ccf34288fae8ed126bf652f7", + }, + { + "dest": "assets/citation.cff", + "url": "https://github.com/explosion/spaCy/raw/master/CITATION.cff", + "checksum": "c996bfd80202d480eb2e592369714e5e", + "extra": True, + }, + ], + "commands": [ + { + "name": "ok", + "help": "print ok", + "script": ["python -c \"print('okokok')\""], + }, + { + "name": "create", + "help": "make a file", + "script": ["touch abc.txt"], + "outputs": ["abc.txt"], + }, + { + "name": "clean", + "help": "remove test file", + "script": ["rm abc.txt"], + }, + ], +} + +SAMPLE_PROJECT_TEXT = srsly.yaml_dumps(SAMPLE_PROJECT) + + +@pytest.fixture +def project_dir(): + with make_tempdir() as pdir: + (pdir / "project.yml").write_text(SAMPLE_PROJECT_TEXT) + yield pdir + + +def test_project_document(project_dir): + readme_path = project_dir / "README.md" + assert not readme_path.exists(), "README already exists" + result = CliRunner().invoke( + app, ["project", "document", str(project_dir), "-o", str(readme_path)] + ) + assert result.exit_code == 0 + assert readme_path.is_file() + text = readme_path.read_text("utf-8") + assert SAMPLE_PROJECT["description"] in text + + +def test_project_assets(project_dir): + asset_dir = project_dir / "assets" + assert not asset_dir.exists(), "Assets dir is already present" + result = CliRunner().invoke(app, ["project", "assets", str(project_dir)]) + assert result.exit_code == 0 + assert (asset_dir / "spacy-readme.md").is_file(), "Assets not downloaded" + # check that extras work + result = CliRunner().invoke(app, ["project", "assets", "--extra", str(project_dir)]) + assert result.exit_code == 0 + 
assert (asset_dir / "citation.cff").is_file(), "Extras not downloaded" + + +def test_project_run(project_dir): + # make sure dry run works + test_file = project_dir / "abc.txt" + result = CliRunner().invoke( + app, ["project", "run", "--dry", "create", str(project_dir)] + ) + assert result.exit_code == 0 + assert not test_file.is_file() + result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)]) + assert result.exit_code == 0 + assert test_file.is_file() + result = CliRunner().invoke(app, ["project", "run", "ok", str(project_dir)]) + assert result.exit_code == 0 + assert "okokok" in result.stdout + + +@pytest.mark.parametrize( + "options", + [ + "", + # "--sparse", + "--branch v3", + "--repo https://github.com/explosion/projects --branch v3", + ], +) +def test_project_clone(options): + with make_tempdir() as workspace: + out = workspace / "project" + target = "benchmarks/ner_conll03" + if not options: + options = [] + else: + options = options.split() + result = CliRunner().invoke( + app, ["project", "clone", target, *options, str(out)] + ) + assert result.exit_code == 0 + assert (out / "README.md").is_file() + + +def test_project_push_pull(project_dir): + proj = dict(SAMPLE_PROJECT) + remote = "xyz" + + with make_tempdir() as remote_dir: + proj["remotes"] = {remote: str(remote_dir)} + proj_text = srsly.yaml_dumps(proj) + (project_dir / "project.yml").write_text(proj_text) + + test_file = project_dir / "abc.txt" + result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)]) + assert result.exit_code == 0 + assert test_file.is_file() + result = CliRunner().invoke(app, ["project", "push", remote, str(project_dir)]) + assert result.exit_code == 0 + result = CliRunner().invoke(app, ["project", "run", "clean", str(project_dir)]) + assert result.exit_code == 0 + assert not test_file.exists() + result = CliRunner().invoke(app, ["project", "pull", remote, str(project_dir)]) + assert result.exit_code == 0 + assert test_file.is_file() diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 03790eb86..9b8c7b9c7 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -10,8 +10,9 @@ from spacy.training import Example from spacy.lang.en import English from spacy.lang.de import German from spacy.util import registry, ignore_error, raise_error, find_matching_language +from spacy.util import load_model_from_config import spacy -from thinc.api import CupyOps, NumpyOps, get_current_ops +from thinc.api import Config, CupyOps, NumpyOps, get_array_module, get_current_ops from .util import add_vecs_to_vocab, assert_docs_equal @@ -25,6 +26,57 @@ try: except ImportError: pass +TAGGER_CFG_STRING = """ + [nlp] + lang = "en" + pipeline = ["tok2vec","tagger"] + + [components] + + [components.tagger] + factory = "tagger" + + [components.tagger.model] + @architectures = "spacy.Tagger.v2" + nO = null + + [components.tagger.model.tok2vec] + @architectures = "spacy.Tok2VecListener.v1" + width = ${components.tok2vec.model.encode.width} + + [components.tok2vec] + factory = "tok2vec" + + [components.tok2vec.model] + @architectures = "spacy.Tok2Vec.v2" + + [components.tok2vec.model.embed] + @architectures = "spacy.MultiHashEmbed.v1" + width = ${components.tok2vec.model.encode.width} + rows = [2000, 1000, 1000, 1000] + attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] + include_static_vectors = false + + [components.tok2vec.model.encode] + @architectures = "spacy.MaxoutWindowEncoder.v2" + width = 96 + depth = 4 + window_size = 1 + 
maxout_pieces = 3 + """ + + +TAGGER_TRAIN_DATA = [ + ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), + ("Eat blue ham", {"tags": ["V", "J", "N"]}), +] + + +TAGGER_TRAIN_DATA = [ + ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), + ("Eat blue ham", {"tags": ["V", "J", "N"]}), +] + def evil_component(doc): if "2" in doc.text: @@ -46,7 +98,7 @@ def assert_sents_error(doc): def warn_error(proc_name, proc, docs, e): logger = logging.getLogger("spacy") - logger.warning(f"Trouble with component {proc_name}.") + logger.warning("Trouble with component %s.", proc_name) @pytest.fixture @@ -85,6 +137,26 @@ def test_language_update(nlp): example = Example.from_dict(doc, wrongkeyannots) +def test_language_update_updates(): + config = Config().from_str(TAGGER_CFG_STRING) + nlp = load_model_from_config(config, auto_fill=True, validate=True) + + train_examples = [] + for t in TAGGER_TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + + optimizer = nlp.initialize(get_examples=lambda: train_examples) + + docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples])) + nlp.update(train_examples, sgd=optimizer) + docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples])) + + xp = get_array_module(docs_after_update[0].tensor) + assert xp.any( + xp.not_equal(docs_before_update[0].tensor, docs_after_update[0].tensor) + ) + + def test_language_evaluate(nlp): text = "hello world" annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}} @@ -658,11 +730,12 @@ def test_spacy_blank(): ("fra", "fr"), ("fre", "fr"), ("iw", "he"), + ("is", "isl"), ("mo", "ro"), - ("mul", "xx"), + ("mul", "mul"), ("no", "nb"), ("pt-BR", "pt"), - ("xx", "xx"), + ("xx", "mul"), ("zh-Hans", "zh"), ("zh-Hant", None), ("zxx", None), @@ -683,11 +756,11 @@ def test_language_matching(lang, target): ("fra", "fr"), ("fre", "fr"), ("iw", "he"), + ("is", "isl"), ("mo", "ro"), - ("mul", "xx"), + ("xx", "mul"), ("no", "nb"), ("pt-BR", "pt"), - ("xx", "xx"), ("zh-Hans", "zh"), ], ) @@ -799,3 +872,66 @@ def test_component_return(): nlp.add_pipe("test_component_bad_pipe") with pytest.raises(ValueError, match="instead of a Doc"): nlp("text") + + +@pytest.mark.slow +@pytest.mark.parametrize("teacher_tagger_name", ["tagger", "teacher_tagger"]) +def test_distill(teacher_tagger_name): + teacher = English() + teacher_tagger = teacher.add_pipe("tagger", name=teacher_tagger_name) + train_examples = [] + for t in TAGGER_TRAIN_DATA: + train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1])) + + optimizer = teacher.initialize(get_examples=lambda: train_examples) + + for i in range(50): + losses = {} + teacher.update(train_examples, sgd=optimizer, losses=losses) + assert losses[teacher_tagger_name] < 0.00001 + + student = English() + student_tagger = student.add_pipe("tagger") + student_tagger.min_tree_freq = 1 + student_tagger.initialize( + get_examples=lambda: train_examples, labels=teacher_tagger.label_data + ) + + distill_examples = [ + Example.from_dict(teacher.make_doc(t[0]), {}) for t in TAGGER_TRAIN_DATA + ] + + student_to_teacher = ( + None + if teacher_tagger.name == student_tagger.name + else {student_tagger.name: teacher_tagger.name} + ) + + for i in range(50): + losses = {} + student.distill( + teacher, + distill_examples, + sgd=optimizer, + losses=losses, + student_to_teacher=student_to_teacher, + ) + assert losses["tagger"] < 0.00001 + + test_text = "I like blue eggs" + doc = student(test_text) + assert doc[0].tag_ == "N" + assert 
doc[1].tag_ == "V"
+    assert doc[2].tag_ == "J"
+    assert doc[3].tag_ == "N"
+
+    # Do an extra update to check if annotates works, though we can't really
+    # validate the results, since the annotations are ephemeral.
+    student.distill(
+        teacher,
+        distill_examples,
+        sgd=optimizer,
+        losses=losses,
+        student_to_teacher=student_to_teacher,
+        annotates=["tagger"],
+    )
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 618f17334..e4e0f9d83 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -5,10 +5,8 @@ from pathlib import Path
 from spacy.about import __version__ as spacy_version
 from spacy import util
 from spacy import prefer_gpu, require_gpu, require_cpu
-from spacy.ml._precomputable_affine import PrecomputableAffine
-from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
-from spacy.util import dot_to_object, SimpleFrozenList, import_file
-from spacy.util import to_ternary_int, find_available_port
+from spacy.util import dot_to_object, SimpleFrozenList, import_file, to_ternary_int
+from spacy.util import find_available_port
 from thinc.api import Config, Optimizer, ConfigValidationError
 from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
@@ -81,34 +79,6 @@ def test_util_get_package_path(package):
     assert isinstance(path, Path)
 
 
-def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
-    model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize()
-    assert model.get_param("W").shape == (nF, nO, nP, nI)
-    tensor = model.ops.alloc((10, nI))
-    Y, get_dX = model.begin_update(tensor)
-    assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
-    dY = model.ops.alloc((15, nO, nP))
-    ids = model.ops.alloc((15, nF))
-    ids[1, 2] = -1
-    dY[1] = 1
-    assert not model.has_grad("pad")
-    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
-    assert d_pad[0, 2, 0, 0] == 1.0
-    ids.fill(0.0)
-    dY.fill(0.0)
-    dY[0] = 0
-    ids[1, 2] = 0
-    ids[1, 1] = -1
-    ids[1, 0] = -1
-    dY[1] = 1
-    ids[2, 0] = -1
-    dY[2] = 5
-    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
-    assert d_pad[0, 0, 0, 0] == 6
-    assert d_pad[0, 1, 0, 0] == 1
-    assert d_pad[0, 2, 0, 0] == 0
-
-
 def test_prefer_gpu():
     current_ops = get_current_ops()
     if has_cupy_gpu:
diff --git a/spacy/tests/test_symbols.py b/spacy/tests/test_symbols.py
new file mode 100644
index 000000000..fb034acca
--- /dev/null
+++ b/spacy/tests/test_symbols.py
@@ -0,0 +1,467 @@
+import pytest
+from spacy.symbols import IDS, NAMES
+
+V3_SYMBOLS = {
+    "": 0,
+    "IS_ALPHA": 1,
+    "IS_ASCII": 2,
+    "IS_DIGIT": 3,
+    "IS_LOWER": 4,
+    "IS_PUNCT": 5,
+    "IS_SPACE": 6,
+    "IS_TITLE": 7,
+    "IS_UPPER": 8,
+    "LIKE_URL": 9,
+    "LIKE_NUM": 10,
+    "LIKE_EMAIL": 11,
+    "IS_STOP": 12,
+    "IS_OOV_DEPRECATED": 13,
+    "IS_BRACKET": 14,
+    "IS_QUOTE": 15,
+    "IS_LEFT_PUNCT": 16,
+    "IS_RIGHT_PUNCT": 17,
+    "IS_CURRENCY": 18,
+    "FLAG19": 19,
+    "FLAG20": 20,
+    "FLAG21": 21,
+    "FLAG22": 22,
+    "FLAG23": 23,
+    "FLAG24": 24,
+    "FLAG25": 25,
+    "FLAG26": 26,
+    "FLAG27": 27,
+    "FLAG28": 28,
+    "FLAG29": 29,
+    "FLAG30": 30,
+    "FLAG31": 31,
+    "FLAG32": 32,
+    "FLAG33": 33,
+    "FLAG34": 34,
+    "FLAG35": 35,
+    "FLAG36": 36,
+    "FLAG37": 37,
+    "FLAG38": 38,
+    "FLAG39": 39,
+    "FLAG40": 40,
+    "FLAG41": 41,
+    "FLAG42": 42,
+    "FLAG43": 43,
+    "FLAG44": 44,
+    "FLAG45": 45,
+    "FLAG46": 46,
+    "FLAG47": 47,
+    "FLAG48": 48,
+    "FLAG49": 49,
+    "FLAG50": 50,
+    "FLAG51": 51,
+    "FLAG52": 52,
+    "FLAG53": 53,
+    "FLAG54": 54,
+    "FLAG55": 55,
+    "FLAG56": 56,
+    "FLAG57": 57,
+    
"FLAG58": 58, + "FLAG59": 59, + "FLAG60": 60, + "FLAG61": 61, + "FLAG62": 62, + "FLAG63": 63, + "ID": 64, + "ORTH": 65, + "LOWER": 66, + "NORM": 67, + "SHAPE": 68, + "PREFIX": 69, + "SUFFIX": 70, + "LENGTH": 71, + "CLUSTER": 72, + "LEMMA": 73, + "POS": 74, + "TAG": 75, + "DEP": 76, + "ENT_IOB": 77, + "ENT_TYPE": 78, + "ENT_ID": 454, + "ENT_KB_ID": 452, + "HEAD": 79, + "SENT_START": 80, + "SPACY": 81, + "PROB": 82, + "LANG": 83, + "IDX": 455, + "ADJ": 84, + "ADP": 85, + "ADV": 86, + "AUX": 87, + "CONJ": 88, + "CCONJ": 89, + "DET": 90, + "INTJ": 91, + "NOUN": 92, + "NUM": 93, + "PART": 94, + "PRON": 95, + "PROPN": 96, + "PUNCT": 97, + "SCONJ": 98, + "SYM": 99, + "VERB": 100, + "X": 101, + "EOL": 102, + "SPACE": 103, + "DEPRECATED001": 104, + "DEPRECATED002": 105, + "DEPRECATED003": 106, + "DEPRECATED004": 107, + "DEPRECATED005": 108, + "DEPRECATED006": 109, + "DEPRECATED007": 110, + "DEPRECATED008": 111, + "DEPRECATED009": 112, + "DEPRECATED010": 113, + "DEPRECATED011": 114, + "DEPRECATED012": 115, + "DEPRECATED013": 116, + "DEPRECATED014": 117, + "DEPRECATED015": 118, + "DEPRECATED016": 119, + "DEPRECATED017": 120, + "DEPRECATED018": 121, + "DEPRECATED019": 122, + "DEPRECATED020": 123, + "DEPRECATED021": 124, + "DEPRECATED022": 125, + "DEPRECATED023": 126, + "DEPRECATED024": 127, + "DEPRECATED025": 128, + "DEPRECATED026": 129, + "DEPRECATED027": 130, + "DEPRECATED028": 131, + "DEPRECATED029": 132, + "DEPRECATED030": 133, + "DEPRECATED031": 134, + "DEPRECATED032": 135, + "DEPRECATED033": 136, + "DEPRECATED034": 137, + "DEPRECATED035": 138, + "DEPRECATED036": 139, + "DEPRECATED037": 140, + "DEPRECATED038": 141, + "DEPRECATED039": 142, + "DEPRECATED040": 143, + "DEPRECATED041": 144, + "DEPRECATED042": 145, + "DEPRECATED043": 146, + "DEPRECATED044": 147, + "DEPRECATED045": 148, + "DEPRECATED046": 149, + "DEPRECATED047": 150, + "DEPRECATED048": 151, + "DEPRECATED049": 152, + "DEPRECATED050": 153, + "DEPRECATED051": 154, + "DEPRECATED052": 155, + "DEPRECATED053": 156, + "DEPRECATED054": 157, + "DEPRECATED055": 158, + "DEPRECATED056": 159, + "DEPRECATED057": 160, + "DEPRECATED058": 161, + "DEPRECATED059": 162, + "DEPRECATED060": 163, + "DEPRECATED061": 164, + "DEPRECATED062": 165, + "DEPRECATED063": 166, + "DEPRECATED064": 167, + "DEPRECATED065": 168, + "DEPRECATED066": 169, + "DEPRECATED067": 170, + "DEPRECATED068": 171, + "DEPRECATED069": 172, + "DEPRECATED070": 173, + "DEPRECATED071": 174, + "DEPRECATED072": 175, + "DEPRECATED073": 176, + "DEPRECATED074": 177, + "DEPRECATED075": 178, + "DEPRECATED076": 179, + "DEPRECATED077": 180, + "DEPRECATED078": 181, + "DEPRECATED079": 182, + "DEPRECATED080": 183, + "DEPRECATED081": 184, + "DEPRECATED082": 185, + "DEPRECATED083": 186, + "DEPRECATED084": 187, + "DEPRECATED085": 188, + "DEPRECATED086": 189, + "DEPRECATED087": 190, + "DEPRECATED088": 191, + "DEPRECATED089": 192, + "DEPRECATED090": 193, + "DEPRECATED091": 194, + "DEPRECATED092": 195, + "DEPRECATED093": 196, + "DEPRECATED094": 197, + "DEPRECATED095": 198, + "DEPRECATED096": 199, + "DEPRECATED097": 200, + "DEPRECATED098": 201, + "DEPRECATED099": 202, + "DEPRECATED100": 203, + "DEPRECATED101": 204, + "DEPRECATED102": 205, + "DEPRECATED103": 206, + "DEPRECATED104": 207, + "DEPRECATED105": 208, + "DEPRECATED106": 209, + "DEPRECATED107": 210, + "DEPRECATED108": 211, + "DEPRECATED109": 212, + "DEPRECATED110": 213, + "DEPRECATED111": 214, + "DEPRECATED112": 215, + "DEPRECATED113": 216, + "DEPRECATED114": 217, + "DEPRECATED115": 218, + "DEPRECATED116": 219, + "DEPRECATED117": 220, + "DEPRECATED118": 
221, + "DEPRECATED119": 222, + "DEPRECATED120": 223, + "DEPRECATED121": 224, + "DEPRECATED122": 225, + "DEPRECATED123": 226, + "DEPRECATED124": 227, + "DEPRECATED125": 228, + "DEPRECATED126": 229, + "DEPRECATED127": 230, + "DEPRECATED128": 231, + "DEPRECATED129": 232, + "DEPRECATED130": 233, + "DEPRECATED131": 234, + "DEPRECATED132": 235, + "DEPRECATED133": 236, + "DEPRECATED134": 237, + "DEPRECATED135": 238, + "DEPRECATED136": 239, + "DEPRECATED137": 240, + "DEPRECATED138": 241, + "DEPRECATED139": 242, + "DEPRECATED140": 243, + "DEPRECATED141": 244, + "DEPRECATED142": 245, + "DEPRECATED143": 246, + "DEPRECATED144": 247, + "DEPRECATED145": 248, + "DEPRECATED146": 249, + "DEPRECATED147": 250, + "DEPRECATED148": 251, + "DEPRECATED149": 252, + "DEPRECATED150": 253, + "DEPRECATED151": 254, + "DEPRECATED152": 255, + "DEPRECATED153": 256, + "DEPRECATED154": 257, + "DEPRECATED155": 258, + "DEPRECATED156": 259, + "DEPRECATED157": 260, + "DEPRECATED158": 261, + "DEPRECATED159": 262, + "DEPRECATED160": 263, + "DEPRECATED161": 264, + "DEPRECATED162": 265, + "DEPRECATED163": 266, + "DEPRECATED164": 267, + "DEPRECATED165": 268, + "DEPRECATED166": 269, + "DEPRECATED167": 270, + "DEPRECATED168": 271, + "DEPRECATED169": 272, + "DEPRECATED170": 273, + "DEPRECATED171": 274, + "DEPRECATED172": 275, + "DEPRECATED173": 276, + "DEPRECATED174": 277, + "DEPRECATED175": 278, + "DEPRECATED176": 279, + "DEPRECATED177": 280, + "DEPRECATED178": 281, + "DEPRECATED179": 282, + "DEPRECATED180": 283, + "DEPRECATED181": 284, + "DEPRECATED182": 285, + "DEPRECATED183": 286, + "DEPRECATED184": 287, + "DEPRECATED185": 288, + "DEPRECATED186": 289, + "DEPRECATED187": 290, + "DEPRECATED188": 291, + "DEPRECATED189": 292, + "DEPRECATED190": 293, + "DEPRECATED191": 294, + "DEPRECATED192": 295, + "DEPRECATED193": 296, + "DEPRECATED194": 297, + "DEPRECATED195": 298, + "DEPRECATED196": 299, + "DEPRECATED197": 300, + "DEPRECATED198": 301, + "DEPRECATED199": 302, + "DEPRECATED200": 303, + "DEPRECATED201": 304, + "DEPRECATED202": 305, + "DEPRECATED203": 306, + "DEPRECATED204": 307, + "DEPRECATED205": 308, + "DEPRECATED206": 309, + "DEPRECATED207": 310, + "DEPRECATED208": 311, + "DEPRECATED209": 312, + "DEPRECATED210": 313, + "DEPRECATED211": 314, + "DEPRECATED212": 315, + "DEPRECATED213": 316, + "DEPRECATED214": 317, + "DEPRECATED215": 318, + "DEPRECATED216": 319, + "DEPRECATED217": 320, + "DEPRECATED218": 321, + "DEPRECATED219": 322, + "DEPRECATED220": 323, + "DEPRECATED221": 324, + "DEPRECATED222": 325, + "DEPRECATED223": 326, + "DEPRECATED224": 327, + "DEPRECATED225": 328, + "DEPRECATED226": 329, + "DEPRECATED227": 330, + "DEPRECATED228": 331, + "DEPRECATED229": 332, + "DEPRECATED230": 333, + "DEPRECATED231": 334, + "DEPRECATED232": 335, + "DEPRECATED233": 336, + "DEPRECATED234": 337, + "DEPRECATED235": 338, + "DEPRECATED236": 339, + "DEPRECATED237": 340, + "DEPRECATED238": 341, + "DEPRECATED239": 342, + "DEPRECATED240": 343, + "DEPRECATED241": 344, + "DEPRECATED242": 345, + "DEPRECATED243": 346, + "DEPRECATED244": 347, + "DEPRECATED245": 348, + "DEPRECATED246": 349, + "DEPRECATED247": 350, + "DEPRECATED248": 351, + "DEPRECATED249": 352, + "DEPRECATED250": 353, + "DEPRECATED251": 354, + "DEPRECATED252": 355, + "DEPRECATED253": 356, + "DEPRECATED254": 357, + "DEPRECATED255": 358, + "DEPRECATED256": 359, + "DEPRECATED257": 360, + "DEPRECATED258": 361, + "DEPRECATED259": 362, + "DEPRECATED260": 363, + "DEPRECATED261": 364, + "DEPRECATED262": 365, + "DEPRECATED263": 366, + "DEPRECATED264": 367, + "DEPRECATED265": 368, + "DEPRECATED266": 
369, + "DEPRECATED267": 370, + "DEPRECATED268": 371, + "DEPRECATED269": 372, + "DEPRECATED270": 373, + "DEPRECATED271": 374, + "DEPRECATED272": 375, + "DEPRECATED273": 376, + "DEPRECATED274": 377, + "DEPRECATED275": 378, + "DEPRECATED276": 379, + "PERSON": 380, + "NORP": 381, + "FACILITY": 382, + "ORG": 383, + "GPE": 384, + "LOC": 385, + "PRODUCT": 386, + "EVENT": 387, + "WORK_OF_ART": 388, + "LANGUAGE": 389, + "DATE": 391, + "TIME": 392, + "PERCENT": 393, + "MONEY": 394, + "QUANTITY": 395, + "ORDINAL": 396, + "CARDINAL": 397, + "acomp": 398, + "advcl": 399, + "advmod": 400, + "agent": 401, + "amod": 402, + "appos": 403, + "attr": 404, + "aux": 405, + "auxpass": 406, + "cc": 407, + "ccomp": 408, + "complm": 409, + "conj": 410, + "cop": 411, + "csubj": 412, + "csubjpass": 413, + "dep": 414, + "det": 415, + "dobj": 416, + "expl": 417, + "hmod": 418, + "hyph": 419, + "infmod": 420, + "intj": 421, + "iobj": 422, + "mark": 423, + "meta": 424, + "neg": 425, + "nmod": 426, + "nn": 427, + "npadvmod": 428, + "nsubj": 429, + "nsubjpass": 430, + "num": 431, + "number": 432, + "oprd": 433, + "obj": 434, + "obl": 435, + "parataxis": 436, + "partmod": 437, + "pcomp": 438, + "pobj": 439, + "poss": 440, + "possessive": 441, + "preconj": 442, + "prep": 443, + "prt": 444, + "punct": 445, + "quantmod": 446, + "rcmod": 448, + "relcl": 447, + "root": 449, + "xcomp": 450, + "acl": 451, + "LAW": 390, + "MORPH": 453, + "_": 456, +} + + +def test_frozen_symbols(): + assert IDS == V3_SYMBOLS + assert NAMES == {v: k for k, v in IDS.items()} diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py index 5b4eeca16..4268392dd 100644 --- a/spacy/tests/tokenizer/test_explain.py +++ b/spacy/tests/tokenizer/test_explain.py @@ -36,6 +36,7 @@ LANGUAGES = [ "hu", pytest.param("id", marks=pytest.mark.slow()), pytest.param("it", marks=pytest.mark.slow()), + pytest.param("isl", marks=pytest.mark.slow()), pytest.param("kn", marks=pytest.mark.slow()), pytest.param("lb", marks=pytest.mark.slow()), pytest.param("lt", marks=pytest.mark.slow()), diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 57e970f87..3d8c7b085 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -33,6 +33,9 @@ URLS_SHOULD_MATCH = [ "http://userid:password@example.com/", "http://142.42.1.1/", "http://142.42.1.1:8080/", + "http://10.140.12.13/foo", + "http://10.140.12.13/foo/bar?arg1=baz&arg2=taz", + "http://10.1.1.1", "http://foo.com/blah_(wikipedia)#cite-1", "http://foo.com/blah_(wikipedia)_blah#cite-1", "http://foo.com/unicode_(✪)_in_parens", @@ -94,6 +97,7 @@ URLS_SHOULD_NOT_MATCH = [ "http://foo.bar/foo(bar)baz quux", "http://-error-.invalid/", "http://a.b-.co", + # Loopback and broadcast addresses should be excluded "http://0.0.0.0", "http://10.1.1.0", "http://10.1.1.255", @@ -102,7 +106,6 @@ URLS_SHOULD_NOT_MATCH = [ "http://3628126748", "http://.www.foo.bar/", "http://.www.foo.bar./", - "http://10.1.1.1", "NASDAQ:GOOG", "http://-a.b.co", pytest.param("foo.com", marks=pytest.mark.xfail()), diff --git a/spacy/tests/training/test_corpus.py b/spacy/tests/training/test_corpus.py new file mode 100644 index 000000000..b4f9cc13a --- /dev/null +++ b/spacy/tests/training/test_corpus.py @@ -0,0 +1,78 @@ +from typing import IO, Generator, Iterable, List, TextIO, Tuple +from contextlib import contextmanager +from pathlib import Path +import pytest +import tempfile + +from spacy.lang.en import English +from spacy.training import Example, PlainTextCorpus +from 
spacy.util import make_tempdir + +# Intentional newlines to check that they are skipped. +PLAIN_TEXT_DOC = """ + +This is a doc. It contains two sentences. +This is another doc. + +A third doc. + +""" + +PLAIN_TEXT_DOC_TOKENIZED = [ + [ + "This", + "is", + "a", + "doc", + ".", + "It", + "contains", + "two", + "sentences", + ".", + ], + ["This", "is", "another", "doc", "."], + ["A", "third", "doc", "."], +] + + +@pytest.mark.parametrize("min_length", [0, 5]) +@pytest.mark.parametrize("max_length", [0, 5]) +def test_plain_text_reader(min_length, max_length): + nlp = English() + with _string_to_tmp_file(PLAIN_TEXT_DOC) as file_path: + corpus = PlainTextCorpus( + file_path, min_length=min_length, max_length=max_length + ) + + check = [ + doc + for doc in PLAIN_TEXT_DOC_TOKENIZED + if len(doc) >= min_length and (max_length == 0 or len(doc) <= max_length) + ] + reference, predicted = _examples_to_tokens(corpus(nlp)) + + assert reference == check + assert predicted == check + + +@contextmanager +def _string_to_tmp_file(s: str) -> Generator[Path, None, None]: + with make_tempdir() as d: + file_path = Path(d) / "string.txt" + with open(file_path, "w", encoding="utf-8") as f: + f.write(s) + yield file_path + + +def _examples_to_tokens( + examples: Iterable[Example], +) -> Tuple[List[List[str]], List[List[str]]]: + reference = [] + predicted = [] + + for eg in examples: + reference.append([t.text for t in eg.reference]) + predicted.append([t.text for t in eg.predicted]) + + return reference, predicted diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 65b03c30a..224f7857f 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -8,7 +8,7 @@ from spacy.lang.en import English from spacy.tokens import Doc, DocBin from spacy.training import Alignment, Corpus, Example, biluo_tags_to_offsets from spacy.training import biluo_tags_to_spans, docs_to_json, iob_to_biluo -from spacy.training import offsets_to_biluo_tags +from spacy.training import offsets_to_biluo_tags, validate_distillation_examples from spacy.training.alignment_array import AlignmentArray from spacy.training.align import get_alignments from spacy.training.converters import json_to_docs @@ -365,6 +365,19 @@ def test_example_from_dict_some_ner(en_vocab): assert ner_tags == ["U-LOC", None, None, None] +def test_validate_distillation_examples(en_vocab): + words = ["a", "b", "c", "d"] + spaces = [True, True, False, True] + predicted = Doc(en_vocab, words=words, spaces=spaces) + + example = Example.from_dict(predicted, {}) + validate_distillation_examples([example], "test_validate_distillation_examples") + + example = Example.from_dict(predicted, {"words": words + ["e"]}) + with pytest.raises(ValueError, match=r"distillation"): + validate_distillation_examples([example], "test_validate_distillation_examples") + + @pytest.mark.filterwarnings("ignore::UserWarning") def test_json_to_docs_no_ner(en_vocab): data = [ @@ -905,7 +918,9 @@ def _train_tuples(train_data): optimizer = nlp.initialize() for i in range(5): losses = {} - batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)) + batches = minibatch( + train_examples, size=compounding(4.0, 32.0, 1.001).to_generator() + ) for batch in batches: nlp.update(batch, sgd=optimizer, losses=losses) diff --git a/spacy/tests/util.py b/spacy/tests/util.py index d5f3c39ff..c2647558d 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -1,6 +1,7 @@ import numpy import tempfile import contextlib +import 
re import srsly from spacy.tokens import Doc from spacy.vocab import Vocab @@ -95,3 +96,7 @@ def assert_packed_msg_equal(b1, b2): for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())): assert k1 == k2 assert v1 == v2 + + +def normalize_whitespace(s): + return re.sub(r"\s+", " ", s) diff --git a/spacy/tests/vocab_vectors/test_stringstore.py b/spacy/tests/vocab_vectors/test_stringstore.py index a0f8016af..f86c0f10d 100644 --- a/spacy/tests/vocab_vectors/test_stringstore.py +++ b/spacy/tests/vocab_vectors/test_stringstore.py @@ -24,6 +24,14 @@ def test_stringstore_from_api_docs(stringstore): stringstore.add("orange") all_strings = [s for s in stringstore] assert all_strings == ["apple", "orange"] + assert all_strings == list(stringstore.keys()) + all_strings_and_hashes = list(stringstore.items()) + assert all_strings_and_hashes == [ + ("apple", 8566208034543834098), + ("orange", 2208928596161743350), + ] + all_hashes = list(stringstore.values()) + assert all_hashes == [8566208034543834098, 2208928596161743350] banana_hash = stringstore.add("banana") assert len(stringstore) == 3 assert banana_hash == 2525716904149915114 @@ -31,12 +39,25 @@ def test_stringstore_from_api_docs(stringstore): assert stringstore["banana"] == banana_hash -@pytest.mark.parametrize("text1,text2,text3", [(b"Hello", b"goodbye", b"hello")]) -def test_stringstore_save_bytes(stringstore, text1, text2, text3): - key = stringstore.add(text1) - assert stringstore[text1] == key - assert stringstore[text2] != key - assert stringstore[text3] != key +@pytest.mark.parametrize( + "val_bytes,val_float,val_list,val_text,val_hash", + [(b"Hello", 1.1, ["abc"], "apple", 8566208034543834098)], +) +def test_stringstore_type_checking( + stringstore, val_bytes, val_float, val_list, val_text, val_hash +): + with pytest.raises(TypeError): + assert stringstore[val_bytes] + + with pytest.raises(TypeError): + stringstore.add(val_float) + + with pytest.raises(TypeError): + assert val_list not in stringstore + + key = stringstore.add(val_text) + assert val_hash == key + assert stringstore[val_hash] == val_text @pytest.mark.parametrize("text1,text2,text3", [("Hello", "goodbye", "hello")]) @@ -47,19 +68,19 @@ def test_stringstore_save_unicode(stringstore, text1, text2, text3): assert stringstore[text3] != key -@pytest.mark.parametrize("text", [b"A"]) +@pytest.mark.parametrize("text", ["A"]) def test_stringstore_retrieve_id(stringstore, text): key = stringstore.add(text) assert len(stringstore) == 1 - assert stringstore[key] == text.decode("utf8") + assert stringstore[key] == text with pytest.raises(KeyError): stringstore[20000] -@pytest.mark.parametrize("text1,text2", [(b"0123456789", b"A")]) +@pytest.mark.parametrize("text1,text2", [("0123456789", "A")]) def test_stringstore_med_string(stringstore, text1, text2): store = stringstore.add(text1) - assert stringstore[store] == text1.decode("utf8") + assert stringstore[store] == text1 stringstore.add(text2) assert stringstore[text1] == store diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 70835816d..ed1322908 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -84,7 +84,7 @@ def test_issue1539(): @pytest.mark.issue(1807) def test_issue1807(): """Test vocab.set_vector also adds the word to the vocab.""" - vocab = Vocab(vectors_name="test_issue1807") + vocab = Vocab() assert "hello" not in vocab vocab.set_vector("hello", numpy.ones((50,), dtype="f")) assert "hello" in vocab @@ 
-94,13 +94,12 @@ def test_issue1807(): def test_issue2871(): """Test that vectors recover the correct key for spaCy reserved words.""" words = ["dog", "cat", "SUFFIX"] - vocab = Vocab(vectors_name="test_issue2871") + vocab = Vocab() vocab.vectors.resize(shape=(3, 10)) vector_data = numpy.zeros((3, 10), dtype="f") for word in words: _ = vocab[word] # noqa: F841 vocab.set_vector(word, vector_data[0]) - vocab.vectors.name = "dummy_vectors" assert vocab["dog"].rank == 0 assert vocab["cat"].rank == 1 assert vocab["SUFFIX"].rank == 2 @@ -125,7 +124,7 @@ def test_issue4725_2(): # ensures that this runs correctly and doesn't hang or crash because of the global vectors # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows), # or because of issues with pickling the NER (cf test_issue4725_1) - vocab = Vocab(vectors_name="test_vocab_add_vector") + vocab = Vocab() data = numpy.ndarray((5, 3), dtype="f") data[0] = 1.0 data[1] = 2.0 @@ -340,7 +339,7 @@ def test_vectors_doc_doc_similarity(vocab, text1, text2): def test_vocab_add_vector(): - vocab = Vocab(vectors_name="test_vocab_add_vector") + vocab = Vocab() data = OPS.xp.ndarray((5, 3), dtype="f") data[0] = 1.0 data[1] = 2.0 @@ -356,7 +355,7 @@ def test_vocab_add_vector(): def test_vocab_prune_vectors(): - vocab = Vocab(vectors_name="test_vocab_prune_vectors") + vocab = Vocab() _ = vocab["cat"] # noqa: F841 _ = vocab["dog"] # noqa: F841 _ = vocab["kitten"] # noqa: F841 @@ -405,7 +404,7 @@ def test_vectors_serialize(): def test_vector_is_oov(): - vocab = Vocab(vectors_name="test_vocab_is_oov") + vocab = Vocab() data = OPS.xp.ndarray((5, 3), dtype="f") data[0] = 1.0 data[1] = 2.0 diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index e6a072053..ba268eaeb 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -4,7 +4,6 @@ from cymem.cymem cimport Pool from .typedefs cimport hash_t from .structs cimport LexemeC, SpanC, TokenC -from .strings cimport StringStore from .tokens.doc cimport Doc from .vocab cimport Vocab, LexemesOrTokens, _Cached from .matcher.phrasematcher cimport PhraseMatcher @@ -23,11 +22,7 @@ cdef class Tokenizer: cdef object _infix_finditer cdef object _rules cdef PhraseMatcher _special_matcher - # TODO convert to bool in v4 - cdef int _faster_heuristics - # TODO next one is unused and should be removed in v4 - # https://github.com/explosion/spaCy/pull/9150 - cdef int _unused_int2 + cdef bint _faster_heuristics cdef Doc _tokenize_affixes(self, str string, bint with_special_cases) cdef int _apply_special_cases(self, Doc doc) except -1 @@ -42,7 +37,7 @@ cdef class Tokenizer: bint with_special_cases) except -1 cdef int _tokenize(self, Doc tokens, str span, hash_t key, int* has_special, bint with_special_cases) except -1 - cdef str _split_affixes(self, Pool mem, str string, + cdef str _split_affixes(self, str string, vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes, int* has_special, bint with_special_cases) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 0e75b5f7a..0466b041a 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -8,7 +8,6 @@ from preshed.maps cimport PreshMap cimport cython import re -import warnings from .tokens.doc cimport Doc from .strings cimport hash_string @@ -16,9 +15,9 @@ from .lexeme cimport EMPTY_LEXEME from .attrs import intify_attrs from .symbols import ORTH, NORM -from .errors import Errors, Warnings +from .errors import Errors from . 
import util -from .util import registry, get_words_and_spaces +from .util import get_words_and_spaces from .attrs import intify_attrs from .symbols import ORTH from .scorer import Scorer @@ -128,10 +127,10 @@ cdef class Tokenizer: property faster_heuristics: def __get__(self): - return bool(self._faster_heuristics) + return self._faster_heuristics def __set__(self, faster_heuristics): - self._faster_heuristics = bool(faster_heuristics) + self._faster_heuristics = faster_heuristics self._reload_special_cases() def __reduce__(self): @@ -390,14 +389,14 @@ cdef class Tokenizer: cdef vector[LexemeC*] suffixes cdef int orig_size orig_size = tokens.length - span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes, + span = self._split_affixes(span, &prefixes, &suffixes, has_special, with_special_cases) self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special, with_special_cases) self._save_cached(&tokens.c[orig_size], orig_key, has_special, tokens.length - orig_size) - cdef str _split_affixes(self, Pool mem, str string, + cdef str _split_affixes(self, str string, vector[const LexemeC*] *prefixes, vector[const LexemeC*] *suffixes, int* has_special, @@ -420,7 +419,7 @@ cdef class Tokenizer: minus_pre = string[pre_len:] if minus_pre and with_special_cases and self._specials.get(hash_string(minus_pre)) != NULL: string = minus_pre - prefixes.push_back(self.vocab.get(mem, prefix)) + prefixes.push_back(self.vocab.get(prefix)) break suf_len = self.find_suffix(string[pre_len:]) if suf_len != 0: @@ -428,18 +427,18 @@ cdef class Tokenizer: minus_suf = string[:-suf_len] if minus_suf and with_special_cases and self._specials.get(hash_string(minus_suf)) != NULL: string = minus_suf - suffixes.push_back(self.vocab.get(mem, suffix)) + suffixes.push_back(self.vocab.get(suffix)) break if pre_len and suf_len and (pre_len + suf_len) <= len(string): string = string[pre_len:-suf_len] - prefixes.push_back(self.vocab.get(mem, prefix)) - suffixes.push_back(self.vocab.get(mem, suffix)) + prefixes.push_back(self.vocab.get(prefix)) + suffixes.push_back(self.vocab.get(suffix)) elif pre_len: string = minus_pre - prefixes.push_back(self.vocab.get(mem, prefix)) + prefixes.push_back(self.vocab.get(prefix)) elif suf_len: string = minus_suf - suffixes.push_back(self.vocab.get(mem, suffix)) + suffixes.push_back(self.vocab.get(suffix)) return string cdef int _attach_tokens(self, Doc tokens, str string, @@ -466,11 +465,11 @@ cdef class Tokenizer: # We're always saying 'no' to spaces here -- the caller will # fix up the outermost one, with reference to the original. 
# See Issue #859 - tokens.push_back(self.vocab.get(tokens.mem, string), False) + tokens.push_back(self.vocab.get(string), False) else: matches = self.find_infix(string) if not matches: - tokens.push_back(self.vocab.get(tokens.mem, string), False) + tokens.push_back(self.vocab.get(string), False) else: # Let's say we have dyn-o-mite-dave - the regex finds the # start and end positions of the hyphens @@ -485,7 +484,7 @@ cdef class Tokenizer: if infix_start != start: span = string[start:infix_start] - tokens.push_back(self.vocab.get(tokens.mem, span), False) + tokens.push_back(self.vocab.get(span), False) if infix_start != infix_end: # If infix_start != infix_end, it means the infix @@ -493,11 +492,11 @@ cdef class Tokenizer: # for tokenization in some languages (see # https://github.com/explosion/spaCy/issues/768) infix_span = string[infix_start:infix_end] - tokens.push_back(self.vocab.get(tokens.mem, infix_span), False) + tokens.push_back(self.vocab.get(infix_span), False) start = infix_end span = string[start:] if span: - tokens.push_back(self.vocab.get(tokens.mem, span), False) + tokens.push_back(self.vocab.get(span), False) cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin() while it != suffixes.rend(): lexeme = deref(it) @@ -582,7 +581,7 @@ cdef class Tokenizer: substrings (iterable): A sequence of dicts, where each dict describes a token and its attributes. """ - attrs = [intify_attrs(spec, _do_deprecated=True) for spec in substrings] + attrs = [intify_attrs(spec) for spec in substrings] orth = "".join([spec[ORTH] for spec in attrs]) if chunk != orth: raise ValueError(Errors.E997.format(chunk=chunk, orth=orth, token_attrs=substrings)) @@ -615,7 +614,7 @@ cdef class Tokenizer: self._rules[string] = substrings self._flush_cache() if not self.faster_heuristics or self.find_prefix(string) or self.find_infix(string) or self.find_suffix(string) or " " in string: - self._special_matcher.add(string, None, self._tokenize_affixes(string, False)) + self._special_matcher.add(string, [self._tokenize_affixes(string, False)]) def _reload_special_cases(self): self._flush_cache() @@ -650,7 +649,7 @@ cdef class Tokenizer: url_match = re.compile("a^").match special_cases = {} for orth, special_tokens in self.rules.items(): - special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens] + special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings) for special_token in special_tokens] tokens = [] for substring in text.split(): suffixes = [] diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py index 64090925d..cb0911283 100644 --- a/spacy/tokens/__init__.py +++ b/spacy/tokens/__init__.py @@ -2,7 +2,7 @@ from .doc import Doc from .token import Token from .span import Span from .span_group import SpanGroup -from ._serialize import DocBin +from .doc_bin import DocBin from .morphanalysis import MorphAnalysis __all__ = ["Doc", "Token", "Span", "SpanGroup", "DocBin", "MorphAnalysis"] diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 57d087958..b53c75a2f 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -48,7 +48,7 @@ cdef class Doc: cdef TokenC* c - cdef public float sentiment + cdef public dict activations cdef public dict user_hooks cdef public dict user_token_hooks diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index f0cdaee87..48bc21c27 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -1,11 +1,11 @@ from typing import Callable, 
Protocol, Iterable, Iterator, Optional from typing import Union, Tuple, List, Dict, Any, overload from cymem.cymem import Pool -from thinc.types import Floats1d, Floats2d, Ints2d +from thinc.types import ArrayXd, Floats1d, Floats2d, Ints2d, Ragged from .span import Span from .token import Token -from ._dict_proxies import SpanGroups -from ._retokenize import Retokenizer +from .span_groups import SpanGroups +from .retokenizer import Retokenizer from ..lexeme import Lexeme from ..vocab import Vocab from .underscore import Underscore @@ -21,7 +21,7 @@ class Doc: spans: SpanGroups max_length: int length: int - sentiment: float + activations: Dict[str, Dict[str, Union[ArrayXd, Ragged]]] cats: Dict[str, float] user_hooks: Dict[str, Callable[..., Any]] user_token_hooks: Dict[str, Callable[..., Any]] @@ -105,9 +105,11 @@ class Doc: start_idx: int, end_idx: int, label: Union[int, str] = ..., + *, kb_id: Union[int, str] = ..., vector: Optional[Floats1d] = ..., alignment_mode: str = ..., + span_id: Union[int, str] = ..., ) -> Span: ... def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ... @property @@ -126,12 +128,12 @@ class Doc: blocked: Optional[List[Span]] = ..., missing: Optional[List[Span]] = ..., outside: Optional[List[Span]] = ..., - default: str = ... + default: str = ..., ) -> None: ... @property - def noun_chunks(self) -> Iterator[Span]: ... + def noun_chunks(self) -> Tuple[Span]: ... @property - def sents(self) -> Iterator[Span]: ... + def sents(self) -> Tuple[Span]: ... @property def lang(self) -> int: ... @property diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 075bc4d15..0ea2c39ab 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -19,7 +19,7 @@ import warnings from .span cimport Span from .token cimport MISSING_DEP -from ._dict_proxies import SpanGroups +from .span_groups import SpanGroups from .token cimport Token from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..typedefs cimport attr_t, flags_t @@ -35,8 +35,8 @@ from .. import util from .. import parts_of_speech from .. import schemas from .underscore import Underscore, get_ext_args -from ._retokenize import Retokenizer -from ._serialize import ALL_ATTRS as DOCBIN_ALL_ATTRS +from .retokenizer import Retokenizer +from .doc_bin import ALL_ATTRS as DOCBIN_ALL_ATTRS from ..util import get_words_and_spaces DEF PADDING = 5 @@ -243,8 +243,8 @@ cdef class Doc: self.c = data_start + PADDING self.max_length = size self.length = 0 - self.sentiment = 0.0 self.cats = {} + self.activations = {} self.user_hooks = {} self.user_token_hooks = {} self.user_span_hooks = {} @@ -266,12 +266,12 @@ cdef class Doc: cdef const LexemeC* lexeme for word, has_space in zip(words, spaces): if isinstance(word, str): - lexeme = self.vocab.get(self.mem, word) + lexeme = self.vocab.get(word) elif isinstance(word, bytes): raise ValueError(Errors.E028.format(value=word)) else: try: - lexeme = self.vocab.get_by_orth(self.mem, word) + lexeme = self.vocab.get_by_orth(word) except TypeError: raise TypeError(Errors.E1022.format(wtype=type(word))) self.push_back(lexeme, has_space) @@ -520,7 +520,7 @@ cdef class Doc: def doc(self): return self - def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", span_id=0): + def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0): """Create a `Span` object from the slice `doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be created. 
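Reviewer note (illustrative, not part of the patch): with the `*` added to the `Doc.char_span` signature above, `kb_id`, `vector`, `alignment_mode` and `span_id` become keyword-only. A minimal sketch, assuming this branch is installed; the label and span id values are made up for the example.

import spacy

nlp = spacy.blank("en")
doc = nlp("spaCy is written in Python.")

# Explicit keywords keep working, and span_id is stored on the new span.
span = doc.char_span(0, 5, label="PRODUCT", span_id="mention-1")
assert span.text == "spaCy"
assert span.id_ == "mention-1"

# Passing kb_id positionally (the old calling style) should now raise a TypeError.
try:
    doc.char_span(0, 5, "PRODUCT", "Q123")  # "Q123" is a made-up KB id
except TypeError:
    pass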
@@ -528,9 +528,9 @@ cdef class Doc: doc (Doc): The parent document. start_idx (int): The index of the first character of the span. end_idx (int): The index of the first character after the span. - label (uint64 or string): A label to attach to the Span, e.g. for + label (Union[int, str]): A label to attach to the Span, e.g. for named entities. - kb_id (uint64 or string): An ID from a KB to capture the meaning of a + kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. @@ -539,6 +539,7 @@ cdef class Doc: with token boundaries), "contract" (span of all tokens completely within the character span), "expand" (span of all tokens at least partially covered by the character span). Defaults to "strict". + span_id (Union[int, str]): An identifier to associate with the span. RETURNS (Span): The newly constructed object. DOCS: https://spacy.io/api/doc#char_span @@ -656,9 +657,6 @@ cdef class Doc: elif self.vocab.vectors.size > 0: self._vector = sum(t.vector for t in self) / len(self) return self._vector - elif self.tensor.size > 0: - self._vector = self.tensor.mean(axis=0) - return self._vector else: return xp.zeros((self.vocab.vectors_length,), dtype="float32") @@ -705,10 +703,10 @@ cdef class Doc: return self.text property ents: - """The named entities in the document. Returns a tuple of named entity + """The named entities in the document. Returns a list of named entity `Span` objects, if the entity recognizer has been applied. - RETURNS (tuple): Entities in the document, one `Span` per entity. + RETURNS (Tuple[Span]): Entities in the document, one `Span` per entity. DOCS: https://spacy.io/api/doc#ents """ @@ -809,27 +807,33 @@ cdef class Doc: self.c[i].ent_iob = 1 self.c[i].ent_type = span.label self.c[i].ent_kb_id = span.kb_id - # for backwards compatibility in v3, only set ent_id from - # span.id if it's set, otherwise don't override - self.c[i].ent_id = span.id if span.id else self.c[i].ent_id + self.c[i].ent_id = span.id for span in blocked: for i in range(span.start, span.end): self.c[i].ent_iob = 3 self.c[i].ent_type = 0 + self.c[i].ent_kb_id = 0 + self.c[i].ent_id = 0 for span in missing: for i in range(span.start, span.end): self.c[i].ent_iob = 0 self.c[i].ent_type = 0 + self.c[i].ent_kb_id = 0 + self.c[i].ent_id = 0 for span in outside: for i in range(span.start, span.end): self.c[i].ent_iob = 2 self.c[i].ent_type = 0 + self.c[i].ent_kb_id = 0 + self.c[i].ent_id = 0 # Set tokens outside of all provided spans if default != SetEntsDefault.unmodified: for i in range(self.length): if i not in seen_tokens: self.c[i].ent_type = 0 + self.c[i].ent_kb_id = 0 + self.c[i].ent_id = 0 if default == SetEntsDefault.outside: self.c[i].ent_iob = 2 elif default == SetEntsDefault.missing: @@ -860,7 +864,7 @@ cdef class Doc: NP-level coordination, no prepositional phrases, and no relative clauses. - YIELDS (Span): Noun chunks in the document. + RETURNS (Tuple[Span]): Noun chunks in the document. DOCS: https://spacy.io/api/doc#noun_chunks """ @@ -869,36 +873,35 @@ cdef class Doc: # Accumulate the result before beginning to iterate over it. This # prevents the tokenization from being changed out from under us - # during the iteration. The tricky thing here is that Span accepts - # its tokenization changing, so it's okay once we have the Span - # objects. See Issue #375. + # during the iteration. 
spans = [] for start, end, label in self.noun_chunks_iterator(self): spans.append(Span(self, start, end, label=label)) - for span in spans: - yield span + return tuple(spans) @property def sents(self): """Iterate over the sentences in the document. Yields sentence `Span` objects. Sentence spans have no label. - YIELDS (Span): Sentences in the document. + RETURNS (Tuple[Span]): Sentences in the document. DOCS: https://spacy.io/api/doc#sents """ if not self.has_annotation("SENT_START"): raise ValueError(Errors.E030) if "sents" in self.user_hooks: - yield from self.user_hooks["sents"](self) + return tuple(self.user_hooks["sents"](self)) else: start = 0 + spans = [] for i in range(1, self.length): if self.c[i].sent_start == 1: - yield Span(self, start, i) + spans.append(Span(self, start, i)) start = i if start != self.length: - yield Span(self, start, self.length) + spans.append(Span(self, start, self.length)) + return tuple(spans) @property def lang(self): @@ -969,22 +972,26 @@ cdef class Doc: py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) for id_ in py_attr_ids] except KeyError as msg: - keys = [k for k in IDS.keys() if not k.startswith("FLAG")] + keys = list(IDS.keys()) raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) from None # Make an array from the attributes --- otherwise our inner loop is # Python dict iteration. - cdef np.ndarray attr_ids = numpy.asarray(py_attr_ids, dtype="i") - output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64) + cdef Pool mem = Pool() + cdef int n_attrs = len(py_attr_ids) + cdef attr_id_t* c_attr_ids + if n_attrs > 0: + c_attr_ids = mem.alloc(n_attrs, sizeof(attr_id_t)) + for i, attr_id in enumerate(py_attr_ids): + c_attr_ids[i] = attr_id + output = numpy.ndarray(shape=(self.length, n_attrs), dtype=numpy.uint64) c_output = output.data - c_attr_ids = attr_ids.data cdef TokenC* token - cdef int nr_attr = attr_ids.shape[0] for i in range(self.length): token = &self.c[i] - for j in range(nr_attr): - c_output[i*nr_attr + j] = get_token_attr(token, c_attr_ids[j]) + for j in range(n_attrs): + c_output[i*n_attrs + j] = get_token_attr(token, c_attr_ids[j]) # Handle 1d case - return output if len(attr_ids) >= 2 else output.reshape((self.length,)) + return output if n_attrs >= 2 else output.reshape((self.length,)) def count_by(self, attr_id_t attr_id, exclude=None, object counts=None): """Count the frequencies of a given attribute. 
Produces a dict of @@ -1168,13 +1175,22 @@ cdef class Doc: if "user_data" not in exclude: for key, value in doc.user_data.items(): - if isinstance(key, tuple) and len(key) == 4 and key[0] == "._.": - data_type, name, start, end = key + if isinstance(key, tuple) and len(key) >= 4 and key[0] == "._.": + data_type = key[0] + name = key[1] + start = key[2] + end = key[3] if start is not None or end is not None: start += char_offset if end is not None: end += char_offset - concat_user_data[(data_type, name, start, end)] = copy.copy(value) + _label = key[4] + _kb_id = key[5] + _span_id = key[6] + concat_user_data[(data_type, name, start, end, _label, _kb_id, _span_id)] = copy.copy(value) + else: + concat_user_data[(data_type, name, start, end)] = copy.copy(value) + else: warnings.warn(Warnings.W101.format(name=name)) else: @@ -1260,7 +1276,6 @@ cdef class Doc: other.tensor = copy.deepcopy(self.tensor) other.cats = copy.deepcopy(self.cats) other.user_data = copy.deepcopy(self.user_data) - other.sentiment = self.sentiment other.has_unknown_spaces = self.has_unknown_spaces other.user_hooks = dict(self.user_hooks) other.user_token_hooks = dict(self.user_token_hooks) @@ -1357,7 +1372,6 @@ cdef class Doc: "text": lambda: self.text, "array_head": lambda: array_head, "array_body": lambda: self.to_array(array_head), - "sentiment": lambda: self.sentiment, "tensor": lambda: self.tensor, "cats": lambda: self.cats, "spans": lambda: self.spans.to_bytes(), @@ -1395,8 +1409,6 @@ cdef class Doc: for key, value in zip(user_data_keys, user_data_values): self.user_data[key] = value cdef int i, start, end, has_space - if "sentiment" not in exclude and "sentiment" in msg: - self.sentiment = msg["sentiment"] if "tensor" not in exclude and "tensor" in msg: self.tensor = msg["tensor"] if "cats" not in exclude and "cats" in msg: @@ -1415,7 +1427,7 @@ cdef class Doc: end = start + attrs[i, 0] has_space = attrs[i, 1] orth_ = text[start:end] - lex = self.vocab.get(self.mem, orth_) + lex = self.vocab.get(orth_) self.push_back(lex, has_space) start = end + has_space self.from_array(msg["array_head"][2:], attrs[:, 2:]) @@ -1521,7 +1533,7 @@ cdef class Doc: assert words == reconstructed_words for word, has_space in zip(words, spaces): - lex = self.vocab.get(self.mem, word) + lex = self.vocab.get(word) self.push_back(lex, has_space) # Set remaining token-level attributes via Doc.from_array(). 
@@ -1589,7 +1601,7 @@ cdef class Doc: for span_group in doc_json.get("spans", {}): spans = [] for span in doc_json["spans"][span_group]: - char_span = self.char_span(span["start"], span["end"], span["label"], span["kb_id"]) + char_span = self.char_span(span["start"], span["end"], span["label"], kb_id=span["kb_id"]) if char_span is None: raise ValueError(Errors.E1039.format(obj="span", start=span["start"], end=span["end"])) spans.append(char_span) @@ -1623,7 +1635,11 @@ cdef class Doc: Span.set_extension(span_attr) for span_data in doc_json["underscore_span"][span_attr]: value = span_data["value"] - self.char_span(span_data["start"], span_data["end"])._.set(span_attr, value) + span = self.char_span(span_data["start"], span_data["end"]) + span.label = span_data["label"] + span.kb_id = span_data["kb_id"] + span.id = span_data["id"] + span._.set(span_attr, value) return self def to_json(self, underscore=None): @@ -1701,13 +1717,16 @@ cdef class Doc: if attr not in data["underscore_token"]: data["underscore_token"][attr] = [] data["underscore_token"][attr].append({"start": start, "value": value}) - # Span attribute - elif start is not None and end is not None: + # Else span attribute + elif end is not None: + _label = data_key[4] + _kb_id = data_key[5] + _span_id = data_key[6] if "underscore_span" not in data: data["underscore_span"] = {} if attr not in data["underscore_span"]: data["underscore_span"][attr] = [] - data["underscore_span"][attr].append({"start": start, "end": end, "value": value}) + data["underscore_span"][attr].append({"start": start, "end": end, "value": value, "label": _label, "kb_id": _kb_id, "id":_span_id}) for attr in underscore: if attr not in user_keys: diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/doc_bin.py similarity index 99% rename from spacy/tokens/_serialize.py rename to spacy/tokens/doc_bin.py index c4e8f26f4..c107aa25d 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/doc_bin.py @@ -12,7 +12,7 @@ from ..compat import copy_reg from ..attrs import SPACY, ORTH, intify_attr, IDS from ..errors import Errors from ..util import ensure_path, SimpleFrozenList -from ._dict_proxies import SpanGroups +from .span_groups import SpanGroups # fmt: off ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START") diff --git a/spacy/tokens/graph.pyx b/spacy/tokens/graph.pyx index adc4d23c8..0ae0d94c7 100644 --- a/spacy/tokens/graph.pyx +++ b/spacy/tokens/graph.pyx @@ -12,7 +12,7 @@ from murmurhash.mrmr cimport hash64 from .. 
import Errors from ..typedefs cimport hash_t -from ..strings import get_string_id +from ..strings cimport get_string_id from ..structs cimport EdgeC, GraphC from .token import Token diff --git a/spacy/tokens/morphanalysis.pxd b/spacy/tokens/morphanalysis.pxd index 9510875c9..f866488ec 100644 --- a/spacy/tokens/morphanalysis.pxd +++ b/spacy/tokens/morphanalysis.pxd @@ -1,9 +1,12 @@ from ..vocab cimport Vocab from ..typedefs cimport hash_t -from ..structs cimport MorphAnalysisC +from ..morphology cimport MorphAnalysisC +from libcpp.memory cimport shared_ptr cdef class MorphAnalysis: cdef readonly Vocab vocab cdef readonly hash_t key - cdef MorphAnalysisC c + cdef shared_ptr[MorphAnalysisC] c + + cdef void _init_c(self, hash_t key) diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index a7d1f2e44..af0067f4e 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -5,7 +5,12 @@ from ..errors import Errors from ..morphology import Morphology from ..vocab cimport Vocab from ..typedefs cimport hash_t, attr_t -from ..morphology cimport list_features, check_feature, get_by_field +from ..morphology cimport list_features, check_feature, get_by_field, MorphAnalysisC +from libcpp.memory cimport shared_ptr +from cython.operator cimport dereference as deref + + +cdef shared_ptr[MorphAnalysisC] EMPTY_MORPH_TAG = shared_ptr[MorphAnalysisC](new MorphAnalysisC()) cdef class MorphAnalysis: @@ -13,39 +18,38 @@ cdef class MorphAnalysis: def __init__(self, Vocab vocab, features=dict()): self.vocab = vocab self.key = self.vocab.morphology.add(features) - analysis = self.vocab.morphology.tags.get(self.key) - if analysis is not NULL: - self.c = analysis[0] + self._init_c(self.key) + + cdef void _init_c(self, hash_t key): + cdef shared_ptr[MorphAnalysisC] analysis = self.vocab.morphology.get_morph_c(key) + if analysis: + self.c = analysis else: - memset(&self.c, 0, sizeof(self.c)) + self.c = EMPTY_MORPH_TAG @classmethod def from_id(cls, Vocab vocab, hash_t key): """Create a morphological analysis from a given ID.""" - cdef MorphAnalysis morph = MorphAnalysis.__new__(MorphAnalysis, vocab) + cdef MorphAnalysis morph = MorphAnalysis(vocab) morph.vocab = vocab morph.key = key - analysis = vocab.morphology.tags.get(key) - if analysis is not NULL: - morph.c = analysis[0] - else: - memset(&morph.c, 0, sizeof(morph.c)) + morph._init_c(key) return morph def __contains__(self, feature): """Test whether the morphological analysis contains some feature.""" cdef attr_t feat_id = self.vocab.strings.as_int(feature) - return check_feature(&self.c, feat_id) + return check_feature(self.c, feat_id) def __iter__(self): """Iterate over the features in the analysis.""" cdef attr_t feature - for feature in list_features(&self.c): + for feature in list_features(self.c): yield self.vocab.strings[feature] def __len__(self): """The number of features in the analysis.""" - return self.c.length + return deref(self.c).features.size() def __hash__(self): return self.key @@ -61,7 +65,7 @@ cdef class MorphAnalysis: def get(self, field): """Retrieve feature values by field.""" cdef attr_t field_id = self.vocab.strings.as_int(field) - cdef np.ndarray results = get_by_field(&self.c, field_id) + cdef np.ndarray results = get_by_field(self.c, field_id) features = [self.vocab.strings[result] for result in results] return [f.split(Morphology.FIELD_SEP)[1] for f in features] @@ -69,7 +73,7 @@ cdef class MorphAnalysis: """Produce a json serializable representation as a UD FEATS-style string. 
""" - morph_string = self.vocab.strings[self.c.key] + morph_string = self.vocab.strings[deref(self.c).key] if morph_string == self.vocab.morphology.EMPTY_MORPH: return "" return morph_string diff --git a/spacy/tokens/_retokenize.pyi b/spacy/tokens/retokenizer.pyi similarity index 100% rename from spacy/tokens/_retokenize.pyi rename to spacy/tokens/retokenizer.pyi diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/retokenizer.pyx similarity index 99% rename from spacy/tokens/_retokenize.pyx rename to spacy/tokens/retokenizer.pyx index 43e6d4aa7..8aef1d74f 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/retokenizer.pyx @@ -18,7 +18,7 @@ from .underscore import is_writable_attr from ..attrs import intify_attrs from ..util import SimpleFrozenDict from ..errors import Errors -from ..strings import get_string_id +from ..strings cimport get_string_id cdef class Retokenizer: @@ -223,7 +223,7 @@ def _merge(Doc doc, merges): if doc.vocab.vectors_length > 0: doc.vocab.set_vector(new_orth, span.vector) token = tokens[token_index] - lex = doc.vocab.get(doc.mem, new_orth) + lex = doc.vocab.get(new_orth) token.lex = lex # We set trailing space here too token.spacy = doc.c[spans[token_index].end-1].spacy @@ -359,7 +359,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs): cdef int idx_offset = 0 for i, orth in enumerate(orths): token = &doc.c[token_index + i] - lex = doc.vocab.get(doc.mem, orth) + lex = doc.vocab.get(orth) token.lex = lex # If lemma is currently set, set default lemma to orth if token.lemma != 0: diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd index 78bee0a8c..85553068e 100644 --- a/spacy/tokens/span.pxd +++ b/spacy/tokens/span.pxd @@ -1,3 +1,4 @@ +from libcpp.memory cimport shared_ptr cimport numpy as np from .doc cimport Doc @@ -7,19 +8,21 @@ from ..structs cimport SpanC cdef class Span: cdef readonly Doc doc - cdef SpanC c + cdef shared_ptr[SpanC] c cdef public _vector cdef public _vector_norm @staticmethod - cdef inline Span cinit(Doc doc, SpanC span): + cdef inline Span cinit(Doc doc, const shared_ptr[SpanC] &span): cdef Span self = Span.__new__( Span, doc, - start=span.start, - end=span.end + start=span.get().start, + end=span.get().end ) self.c = span return self cpdef np.ndarray to_array(self, object features) + + cdef SpanC* span_c(self) diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi index 9986a90e6..e5031fea9 100644 --- a/spacy/tokens/span.pyi +++ b/spacy/tokens/span.pyi @@ -74,6 +74,8 @@ class Span: @property def ents(self) -> Tuple[Span]: ... @property + def sents(self) -> Tuple[Span]: ... + @property def has_vector(self) -> bool: ... @property def vector(self) -> Floats1d: ... @@ -82,13 +84,11 @@ class Span: @property def tensor(self) -> FloatsXd: ... @property - def sentiment(self) -> float: ... - @property def text(self) -> str: ... @property def text_with_ws(self) -> str: ... @property - def noun_chunks(self) -> Iterator[Span]: ... + def noun_chunks(self) -> Tuple[Span]: ... @property def root(self) -> Token: ... def char_span( @@ -96,8 +96,11 @@ class Span: start_idx: int, end_idx: int, label: Union[int, str] = ..., + *, kb_id: Union[int, str] = ..., vector: Optional[Floats1d] = ..., + alignment_mode: str = ..., + span_id: Union[int, str] = ..., ) -> Span: ... @property def conjuncts(self) -> Tuple[Token]: ... @@ -115,15 +118,23 @@ class Span: end: int start_char: int end_char: int - label: int - kb_id: int - id: int - ent_id: int - ent_id_: str + @property + def label(self) -> int: ... 
+ @property + def kb_id(self) -> int: ... + @property + def id(self) -> int: ... + @property + def ent_id(self) -> int: ... @property def orth_(self) -> str: ... @property def lemma_(self) -> str: ... - label_: str - kb_id_: str - id_: str + @property + def label_(self) -> str: ... + @property + def kb_id_(self) -> str: ... + @property + def id_(self) -> str: ... + @property + def ent_id_(self) -> str: ... diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 99a5f43bd..75f7db7ca 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -1,5 +1,6 @@ cimport numpy as np from libc.math cimport sqrt +from libcpp.memory cimport make_shared import numpy from thinc.api import get_array_module @@ -114,7 +115,7 @@ cdef class Span: end_char = start_char else: end_char = doc[end - 1].idx + len(doc[end - 1]) - self.c = SpanC( + self.c = make_shared[SpanC](SpanC( label=label, kb_id=kb_id, id=span_id, @@ -122,7 +123,7 @@ cdef class Span: end=end, start_char=start_char, end_char=end_char, - ) + )) self._vector = vector self._vector_norm = vector_norm @@ -132,8 +133,9 @@ cdef class Span: return False else: return True - self_tuple = (self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.id, self.doc) - other_tuple = (other.c.start_char, other.c.end_char, other.c.label, other.c.kb_id, other.id, other.doc) + + self_tuple = self._cmp_tuple() + other_tuple = other._cmp_tuple() # < if op == 0: return self_tuple < other_tuple @@ -154,7 +156,20 @@ cdef class Span: return self_tuple >= other_tuple def __hash__(self): - return hash((self.doc, self.c.start_char, self.c.end_char, self.c.label, self.c.kb_id, self.c.id)) + return hash(self._cmp_tuple()) + + def _cmp_tuple(self): + cdef SpanC* span_c = self.span_c() + return ( + span_c.start_char, + span_c.end_char, + span_c.start, + span_c.end, + span_c.label, + span_c.kb_id, + span_c.id, + self.doc, + ) def __len__(self): """Get the number of tokens in the span. @@ -163,9 +178,10 @@ cdef class Span: DOCS: https://spacy.io/api/span#len """ - if self.c.end < self.c.start: + cdef SpanC* span_c = self.span_c() + if span_c.end < span_c.start: return 0 - return self.c.end - self.c.start + return span_c.end - span_c.start def __repr__(self): return self.text @@ -179,15 +195,16 @@ cdef class Span: DOCS: https://spacy.io/api/span#getitem """ + cdef SpanC* span_c = self.span_c() if isinstance(i, slice): start, end = normalize_slice(len(self), i.start, i.stop, i.step) return Span(self.doc, start + self.start, end + self.start) else: if i < 0: - token_i = self.c.end + i + token_i = span_c.end + i else: - token_i = self.c.start + i - if self.c.start <= token_i < self.c.end: + token_i = span_c.start + i + if span_c.start <= token_i < span_c.end: return self.doc[token_i] else: raise IndexError(Errors.E1002) @@ -199,7 +216,8 @@ cdef class Span: DOCS: https://spacy.io/api/span#iter """ - for i in range(self.c.start, self.c.end): + cdef SpanC* span_c = self.span_c() + for i in range(span_c.start, span_c.end): yield self.doc[i] def __reduce__(self): @@ -207,13 +225,13 @@ cdef class Span: @property def _(self): + cdef SpanC* span_c = self.span_c() """Custom extension attributes registered via `set_extension`.""" return Underscore(Underscore.span_extensions, self, - start=self.c.start_char, end=self.c.end_char) + start=span_c.start_char, end=span_c.end_char, label=self.label, kb_id=self.kb_id, span_id=self.id) def as_doc(self, *, bint copy_user_data=False, array_head=None, array=None): """Create a `Doc` object with a copy of the `Span`'s data. 
- copy_user_data (bool): Whether or not to copy the original doc's user data. array_head (tuple): `Doc` array attrs, can be passed in to speed up computation. array (ndarray): `Doc` as array, can be passed in to speed up computation. @@ -266,12 +284,22 @@ cdef class Span: char_offset = self.start_char for key, value in self.doc.user_data.items(): if isinstance(key, tuple) and len(key) == 4 and key[0] == "._.": - data_type, name, start, end = key + data_type = key[0] + name = key[1] + start = key[2] + end = key[3] if start is not None or end is not None: start -= char_offset + # Check if Span object if end is not None: end -= char_offset - user_data[(data_type, name, start, end)] = copy.copy(value) + _label = key[4] + _kb_id = key[5] + _span_id = key[6] + user_data[(data_type, name, start, end, _label, _kb_id, _span_id)] = copy.copy(value) + # Else Token object + else: + user_data[(data_type, name, start, end)] = copy.copy(value) else: user_data[key] = copy.copy(value) doc.user_data = user_data @@ -283,13 +311,14 @@ cdef class Span: cdef int length = len(array) cdef attr_t value cdef int i, head_col, ancestor_i + cdef SpanC* span_c = self.span_c() old_to_new_root = dict() if HEAD in attrs: head_col = attrs.index(HEAD) for i in range(length): # if the HEAD refers to a token outside this span, find a more appropriate ancestor token = self[i] - ancestor_i = token.head.i - self.c.start # span offset + ancestor_i = token.head.i - span_c.start # span offset if ancestor_i not in range(length): if DEP in attrs: array[i, attrs.index(DEP)] = dep @@ -297,7 +326,7 @@ cdef class Span: # try finding an ancestor within this span ancestors = token.ancestors for ancestor in ancestors: - ancestor_i = ancestor.i - self.c.start + ancestor_i = ancestor.i - span_c.start if ancestor_i in range(length): array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64) @@ -326,7 +355,8 @@ cdef class Span: DOCS: https://spacy.io/api/span#get_lca_matrix """ - return numpy.asarray(_get_lca_matrix(self.doc, self.c.start, self.c.end)) + cdef SpanC* span_c = self.span_c() + return numpy.asarray(_get_lca_matrix(self.doc, span_c.start, span_c.end)) def similarity(self, other): """Make a semantic similarity estimate. The default estimate is cosine @@ -362,7 +392,7 @@ cdef class Span: result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) # ensure we get a scalar back (numpy does this automatically but cupy doesn't) return result.item() - + cpdef np.ndarray to_array(self, object py_attr_ids): """Given a list of M attribute IDs, export the tokens to a numpy `ndarray` of shape `(N, M)`, where `N` is the length of the document. @@ -423,25 +453,29 @@ cdef class Span: else: raise ValueError(Errors.E030) + cdef SpanC* span_c(self): + return self.c.get() + @property def sents(self): """Obtain the sentences that contain this span. If the given span crosses sentence boundaries, return all sentences it is a part of. - RETURNS (Iterable[Span]): All sentences that the span is a part of. + RETURNS (Tuple[Span]): All sentences that the span is a part of. 
- DOCS: https://spacy.io/api/span#sents + DOCS: https://spacy.io/api/span#sents """ cdef int start cdef int i if "sents" in self.doc.user_span_hooks: - yield from self.doc.user_span_hooks["sents"](self) - elif "sents" in self.doc.user_hooks: + return tuple(self.doc.user_span_hooks["sents"](self)) + spans = [] + if "sents" in self.doc.user_hooks: for sentence in self.doc.user_hooks["sents"](self.doc): if sentence.end > self.start: if sentence.start < self.end or sentence.start == self.start == self.end: - yield sentence + spans.append(sentence) else: break else: @@ -456,12 +490,13 @@ cdef class Span: # Now, find all the sentences in the span for i in range(start + 1, self.doc.length): if self.doc.c[i].sent_start == 1: - yield Span(self.doc, start, i) + spans.append(Span(self.doc, start, i)) start = i if start >= self.end: break if start < self.end: - yield Span(self.doc, start, self.end) + spans.append(Span(self.doc, start, self.end)) + return tuple(spans) @property @@ -469,19 +504,22 @@ cdef class Span: """The named entities that fall completely within the span. Returns a tuple of `Span` objects. - RETURNS (tuple): Entities in the span, one `Span` per entity. + RETURNS (Tuple[Span]): Entities in the span, one `Span` per entity. DOCS: https://spacy.io/api/span#ents """ cdef Span ent + cdef SpanC* span_c = self.span_c() + cdef SpanC* ent_span_c ents = [] for ent in self.doc.ents: - if ent.c.start >= self.c.start: - if ent.c.end <= self.c.end: + ent_span_c = ent.span_c() + if ent_span_c.start >= span_c.start: + if ent_span_c.end <= span_c.end: ents.append(ent) else: break - return ents + return tuple(ents) @property def has_vector(self): @@ -496,8 +534,6 @@ cdef class Span: return self.doc.user_span_hooks["has_vector"](self) elif self.vocab.vectors.size > 0: return any(token.has_vector for token in self) - elif self.doc.tensor.size > 0: - return True else: return False @@ -549,16 +585,6 @@ cdef class Span: return None return self.doc.tensor[self.start : self.end] - @property - def sentiment(self): - """RETURNS (float): A scalar value indicating the positivity or - negativity of the span. - """ - if "sentiment" in self.doc.user_span_hooks: - return self.doc.user_span_hooks["sentiment"](self) - else: - return sum([token.sentiment for token in self]) / len(self) - @property def text(self): """RETURNS (str): The original verbatim text of the span.""" @@ -589,13 +615,15 @@ cdef class Span: NP-level coordination, no prepositional phrases, and no relative clauses. - YIELDS (Span): Noun chunks in the span. + RETURNS (Tuple[Span]): Noun chunks in the span. DOCS: https://spacy.io/api/span#noun_chunks """ + spans = [] for span in self.doc.noun_chunks: if span.start >= self.start and span.end <= self.end: - yield span + spans.append(span) + return tuple(spans) @property def root(self): @@ -612,11 +640,12 @@ cdef class Span: # This should probably be called 'head', and the other one called # 'gov'. But we went with 'head' elsewhere, and now we're stuck =/ cdef int i + cdef SpanC* span_c = self.span_c() # First, we scan through the Span, and check whether there's a word # with head==0, i.e. a sentence root. If so, we can return it. The # longer the span, the more likely it contains a sentence root, and # in this case we return in linear time. - for i in range(self.c.start, self.c.end): + for i in range(span_c.start, span_c.end): if self.doc.c[i].head == 0: return self.doc[i] # If we don't have a sentence root, we do something that's not so @@ -627,33 +656,40 @@ cdef class Span: # think this should be okay. 
cdef int current_best = self.doc.length cdef int root = -1 - for i in range(self.c.start, self.c.end): - if self.c.start <= (i+self.doc.c[i].head) < self.c.end: + for i in range(span_c.start, span_c.end): + if span_c.start <= (i+self.doc.c[i].head) < span_c.end: continue words_to_root = _count_words_to_root(&self.doc.c[i], self.doc.length) if words_to_root < current_best: current_best = words_to_root root = i if root == -1: - return self.doc[self.c.start] + return self.doc[span_c.start] else: return self.doc[root] - def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0): + def char_span(self, int start_idx, int end_idx, label=0, *, kb_id=0, vector=None, alignment_mode="strict", span_id=0): """Create a `Span` object from the slice `span.text[start : end]`. - start (int): The index of the first character of the span. - end (int): The index of the first character after the span. - label (uint64 or string): A label to attach to the Span, e.g. for + start_idx (int): The index of the first character of the span. + end_idx (int): The index of the first character after the span. + label (Union[int, str]): A label to attach to the Span, e.g. for named entities. - kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity. + kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. + alignment_mode (str): How character indices are aligned to token + boundaries. Options: "strict" (character indices must be aligned + with token boundaries), "contract" (span of all tokens completely + within the character span), "expand" (span of all tokens at least + partially covered by the character span). Defaults to "strict". + span_id (Union[int, str]): An identifier to associate with the span. RETURNS (Span): The newly constructed object. 
""" - start_idx += self.c.start_char - end_idx += self.c.start_char - return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector) + cdef SpanC* span_c = self.span_c() + start_idx += span_c.start_char + end_idx += span_c.start_char + return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode, span_id=span_id) @property def conjuncts(self): @@ -733,76 +769,83 @@ cdef class Span: property start: def __get__(self): - return self.c.start + return self.span_c().start def __set__(self, int start): if start < 0: raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start)) - self.c.start = start + self.span_c().start = start property end: def __get__(self): - return self.c.end + return self.span_c().end def __set__(self, int end): if end < 0: raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end)) - self.c.end = end + self.span_c().end = end property start_char: def __get__(self): - return self.c.start_char + return self.span_c().start_char def __set__(self, int start_char): if start_char < 0: raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char)) - self.c.start_char = start_char + self.span_c().start_char = start_char property end_char: def __get__(self): - return self.c.end_char + return self.span_c().end_char def __set__(self, int end_char): if end_char < 0: raise IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char)) - self.c.end_char = end_char + self.span_c().end_char = end_char property label: def __get__(self): - return self.c.label + return self.span_c().label def __set__(self, attr_t label): - self.c.label = label + if label != self.span_c().label : + old_label = self.span_c().label + self.span_c().label = label + new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id) + old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=old_label, kb_id=self.kb_id, span_id=self.id) + Underscore._replace_keys(old, new) property kb_id: def __get__(self): - return self.c.kb_id + return self.span_c().kb_id def __set__(self, attr_t kb_id): - self.c.kb_id = kb_id + if kb_id != self.span_c().kb_id : + old_kb_id = self.span_c().kb_id + self.span_c().kb_id = kb_id + new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id) + old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=old_kb_id, span_id=self.id) + Underscore._replace_keys(old, new) property id: def __get__(self): - return self.c.id + return self.span_c().id def __set__(self, attr_t id): - self.c.id = id + if id != self.span_c().id : + old_id = self.span_c().id + self.span_c().id = id + new = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=self.id) + old = Underscore(Underscore.span_extensions, self, start=self.span_c().start_char, end=self.span_c().end_char, label=self.label, kb_id=self.kb_id, span_id=old_id) + Underscore._replace_keys(old, new) property ent_id: - """RETURNS (uint64): The entity ID.""" + """Alias for the span's ID.""" def __get__(self): - return self.root.ent_id + return self.id - def __set__(self, 
hash_t key): - raise NotImplementedError(Errors.E200.format(attr="ent_id")) - - property ent_id_: - """RETURNS (str): The (string) entity ID.""" - def __get__(self): - return self.root.ent_id_ - - def __set__(self, str key): - raise NotImplementedError(Errors.E200.format(attr="ent_id_")) + def __set__(self, attr_t ent_id): + self.id = ent_id @property def orth_(self): @@ -818,7 +861,7 @@ cdef class Span: return "".join([t.lemma_ + t.whitespace_ for t in self]).strip() property label_: - """RETURNS (str): The span's label.""" + """The span's label.""" def __get__(self): return self.doc.vocab.strings[self.label] @@ -826,7 +869,7 @@ cdef class Span: self.label = self.doc.vocab.strings.add(label_) property kb_id_: - """RETURNS (str): The span's KB ID.""" + """The span's KB ID.""" def __get__(self): return self.doc.vocab.strings[self.kb_id] @@ -834,13 +877,22 @@ cdef class Span: self.kb_id = self.doc.vocab.strings.add(kb_id_) property id_: - """RETURNS (str): The span's ID.""" + """The span's ID.""" def __get__(self): return self.doc.vocab.strings[self.id] def __set__(self, str id_): self.id = self.doc.vocab.strings.add(id_) + property ent_id_: + """Alias for the span's ID.""" + def __get__(self): + return self.id_ + + def __set__(self, str ent_id_): + self.id_ = ent_id_ + + cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1: # Don't allow spaces to be the root, if there are diff --git a/spacy/tokens/span_group.pxd b/spacy/tokens/span_group.pxd index 5074aa275..6b817578a 100644 --- a/spacy/tokens/span_group.pxd +++ b/spacy/tokens/span_group.pxd @@ -1,3 +1,4 @@ +from libcpp.memory cimport shared_ptr from libcpp.vector cimport vector from ..structs cimport SpanC @@ -5,6 +6,6 @@ cdef class SpanGroup: cdef public object _doc_ref cdef public str name cdef public dict attrs - cdef vector[SpanC] c + cdef vector[shared_ptr[SpanC]] c - cdef void push_back(self, SpanC span) nogil + cdef void push_back(self, const shared_ptr[SpanC] &span) diff --git a/spacy/tokens/span_group.pyx b/spacy/tokens/span_group.pyx index 608dda283..7325c1fa7 100644 --- a/spacy/tokens/span_group.pyx +++ b/spacy/tokens/span_group.pyx @@ -6,6 +6,7 @@ import srsly from spacy.errors import Errors from .span cimport Span +from libcpp.memory cimport make_shared cdef class SpanGroup: @@ -197,10 +198,12 @@ cdef class SpanGroup: DOCS: https://spacy.io/api/spangroup#to_bytes """ + cdef SpanC* span_c output = {"name": self.name, "attrs": self.attrs, "spans": []} cdef int i for i in range(self.c.size()): span = self.c[i] + span_c = span.get() # The struct.pack here is probably overkill, but it might help if # you're saving tonnes of spans, and it doesn't really add any # complexity. 
We do take care to specify little-endian byte order @@ -212,13 +215,13 @@ cdef class SpanGroup: # l: int32_t output["spans"].append(struct.pack( ">QQQllll", - span.id, - span.kb_id, - span.label, - span.start, - span.end, - span.start_char, - span.end_char + span_c.id, + span_c.kb_id, + span_c.label, + span_c.start, + span_c.end, + span_c.start_char, + span_c.end_char )) return srsly.msgpack_dumps(output) @@ -245,10 +248,10 @@ cdef class SpanGroup: span.end = items[4] span.start_char = items[5] span.end_char = items[6] - self.c.push_back(span) + self.c.push_back(make_shared[SpanC](span)) return self - cdef void push_back(self, SpanC span) nogil: + cdef void push_back(self, const shared_ptr[SpanC] &span): self.c.push_back(span) def copy(self, doc: Optional["Doc"] = None) -> SpanGroup: diff --git a/spacy/tokens/_dict_proxies.py b/spacy/tokens/span_groups.py similarity index 100% rename from spacy/tokens/_dict_proxies.py rename to spacy/tokens/span_groups.py diff --git a/spacy/tokens/token.pyi b/spacy/tokens/token.pyi index bd585d034..6de7e984a 100644 --- a/spacy/tokens/token.pyi +++ b/spacy/tokens/token.pyi @@ -79,8 +79,6 @@ class Token: @property def prob(self) -> float: ... @property - def sentiment(self) -> float: ... - @property def lang(self) -> int: ... @property def idx(self) -> int: ... diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 7fff6b162..74f812af7 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -22,6 +22,7 @@ from .. import parts_of_speech from ..errors import Errors, Warnings from ..attrs import IOB_STRINGS from .underscore import Underscore, get_ext_args +from cython.operator cimport dereference as deref cdef class Token: @@ -231,7 +232,7 @@ cdef class Token: # Check that the morph has the same vocab if self.vocab != morph.vocab: raise ValueError(Errors.E1013) - self.c.morph = morph.c.key + self.c.morph = deref(morph.c).key def set_morph(self, features): cdef hash_t key @@ -282,14 +283,6 @@ cdef class Token: """RETURNS (float): Smoothed log probability estimate of token type.""" return self.vocab[self.c.lex.orth].prob - @property - def sentiment(self): - """RETURNS (float): A scalar value indicating the positivity or - negativity of the token.""" - if "sentiment" in self.doc.user_token_hooks: - return self.doc.user_token_hooks["sentiment"](self) - return self.vocab[self.c.lex.orth].sentiment - @property def lang(self): """RETURNS (uint64): ID of the language of the parent document's @@ -396,8 +389,6 @@ cdef class Token: """ if "has_vector" in self.doc.user_token_hooks: return self.doc.user_token_hooks["has_vector"](self) - if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0: - return True return self.vocab.has_vector(self.c.lex.orth) @property @@ -411,8 +402,6 @@ cdef class Token: """ if "vector" in self.doc.user_token_hooks: return self.doc.user_token_hooks["vector"](self) - if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0: - return self.doc.tensor[self.i] else: return self.vocab.get_vector(self.c.lex.orth) diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py index e9a4e1862..f2f357441 100644 --- a/spacy/tokens/underscore.py +++ b/spacy/tokens/underscore.py @@ -2,10 +2,10 @@ from typing import Dict, Any, List, Optional, Tuple, Union, TYPE_CHECKING import functools import copy from ..errors import Errors +from .span import Span if TYPE_CHECKING: from .doc import Doc - from .span import Span from .token import Token @@ -25,6 +25,9 @@ class Underscore: obj: Union["Doc", "Span", "Token"], start: 
Optional[int] = None, end: Optional[int] = None, + label: int = 0, + kb_id: int = 0, + span_id: int = 0, ): object.__setattr__(self, "_extensions", extensions) object.__setattr__(self, "_obj", obj) @@ -36,6 +39,10 @@ class Underscore: object.__setattr__(self, "_doc", obj.doc) object.__setattr__(self, "_start", start) object.__setattr__(self, "_end", end) + if type(obj) == Span: + object.__setattr__(self, "_label", label) + object.__setattr__(self, "_kb_id", kb_id) + object.__setattr__(self, "_span_id", span_id) def __dir__(self) -> List[str]: # Hack to enable autocomplete on custom extensions @@ -88,8 +95,39 @@ class Underscore: def has(self, name: str) -> bool: return name in self._extensions - def _get_key(self, name: str) -> Tuple[str, str, Optional[int], Optional[int]]: - return ("._.", name, self._start, self._end) + def _get_key( + self, name: str + ) -> Union[ + Tuple[str, str, Optional[int], Optional[int]], + Tuple[str, str, Optional[int], Optional[int], int, int, int], + ]: + if hasattr(self, "_label"): + return ( + "._.", + name, + self._start, + self._end, + self._label, + self._kb_id, + self._span_id, + ) + else: + return "._.", name, self._start, self._end + + @staticmethod + def _replace_keys(old_underscore: "Underscore", new_underscore: "Underscore"): + """ + This function is called by Span when its kb_id or label are re-assigned. + It checks if any user_data is stored for this span and replaces the keys + """ + for name in old_underscore._extensions: + old_key = old_underscore._get_key(name) + old_doc = old_underscore._doc + new_key = new_underscore._get_key(name) + if old_key != new_key and old_key in old_doc.user_data: + old_underscore._doc.user_data[ + new_key + ] = old_underscore._doc.user_data.pop(old_key) @classmethod def get_state(cls) -> Tuple[Dict[Any, Any], Dict[Any, Any], Dict[Any, Any]]: diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py index 71d1fa775..f8e69b1c8 100644 --- a/spacy/training/__init__.py +++ b/spacy/training/__init__.py @@ -1,5 +1,6 @@ -from .corpus import Corpus, JsonlCorpus # noqa: F401 +from .corpus import Corpus, JsonlCorpus, PlainTextCorpus # noqa: F401 from .example import Example, validate_examples, validate_get_examples # noqa: F401 +from .example import validate_distillation_examples # noqa: F401 from .alignment import Alignment # noqa: F401 from .augment import dont_augment, orth_variants_augmenter # noqa: F401 from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401 diff --git a/spacy/training/batchers.py b/spacy/training/batchers.py index f0b6c3123..d9aa04e32 100644 --- a/spacy/training/batchers.py +++ b/spacy/training/batchers.py @@ -2,11 +2,13 @@ from typing import Union, Iterable, Sequence, TypeVar, List, Callable, Iterator from typing import Optional, Any from functools import partial import itertools +from thinc.schedules import Schedule from ..util import registry, minibatch -Sizing = Union[Sequence[int], int] +SizingSchedule = Union[Iterable[int], int, Schedule] +Sizing = Union[Iterable[int], int] ItemT = TypeVar("ItemT") BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]] @@ -14,7 +16,7 @@ BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]] @registry.batchers("spacy.batch_by_padded.v1") def configure_minibatch_by_padded_size( *, - size: Sizing, + size: SizingSchedule, buffer: int, discard_oversize: bool, get_length: Optional[Callable[[ItemT], int]] = None @@ -24,8 +26,8 @@ def configure_minibatch_by_padded_size( The padded size is defined as the maximum length of sequences within the 
batch multiplied by the number of sequences in the batch. - size (int or Sequence[int]): The largest padded size to batch sequences into. - Can be a single integer, or a sequence, allowing for variable batch sizes. + size (int, Iterable[int] or Schedule): The largest padded size to batch sequences + into. Can be a single integer, or a sequence, allowing for variable batch sizes. buffer (int): The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is very large, the iteration order will be less random, which can result @@ -39,7 +41,7 @@ def configure_minibatch_by_padded_size( optionals = {"get_length": get_length} if get_length is not None else {} return partial( minibatch_by_padded_size, - size=size, + size=_schedule_to_sizing(size), buffer=buffer, discard_oversize=discard_oversize, **optionals @@ -49,14 +51,14 @@ def configure_minibatch_by_padded_size( @registry.batchers("spacy.batch_by_words.v1") def configure_minibatch_by_words( *, - size: Sizing, + size: SizingSchedule, tolerance: float, discard_oversize: bool, get_length: Optional[Callable[[ItemT], int]] = None ) -> BatcherT: """Create a batcher that uses the "minibatch by words" strategy. - size (int or Sequence[int]): The target number of words per batch. + size (int, Iterable[int] or Schedule): The target number of words per batch. Can be a single integer, or a sequence, allowing for variable batch sizes. tolerance (float): What percentage of the size to allow batches to exceed. discard_oversize (bool): Whether to discard sequences that by themselves @@ -67,7 +69,7 @@ def configure_minibatch_by_words( optionals = {"get_length": get_length} if get_length is not None else {} return partial( minibatch_by_words, - size=size, + size=_schedule_to_sizing(size), tolerance=tolerance, discard_oversize=discard_oversize, **optionals @@ -76,15 +78,15 @@ def configure_minibatch_by_words( @registry.batchers("spacy.batch_by_sequence.v1") def configure_minibatch( - size: Sizing, get_length: Optional[Callable[[ItemT], int]] = None + size: SizingSchedule, get_length: Optional[Callable[[ItemT], int]] = None ) -> BatcherT: """Create a batcher that creates batches of the specified size. - size (int or Sequence[int]): The target number of items per batch. + size (int, Iterable[int] or Schedule): The target number of items per batch. Can be a single integer, or a sequence, allowing for variable batch sizes. """ optionals = {"get_length": get_length} if get_length is not None else {} - return partial(minibatch, size=size, **optionals) + return partial(minibatch, size=_schedule_to_sizing(size), **optionals) def minibatch_by_padded_size( @@ -100,7 +102,7 @@ def minibatch_by_padded_size( The padded size is defined as the maximum length of sequences within the batch multiplied by the number of sequences in the batch. - size (int or Sequence[int]): The largest padded size to batch sequences into. + size (int or Iterable[int]): The largest padded size to batch sequences into. buffer (int): The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is very large, the iteration order will be less random, which can result @@ -111,7 +113,7 @@ def minibatch_by_padded_size( The `len` function is used by default. 
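Because the registered batchers now accept a Thinc `Schedule` for `size` (converted via `_schedule_to_sizing` further below), a schedule such as `compounding` can be passed directly — a rough sketch, not part of the patch:

```python
# Rough sketch, not part of the patch: a Thinc Schedule used as the batch size.
from thinc.api import compounding
from spacy.training.batchers import configure_minibatch_by_words

batcher = configure_minibatch_by_words(
    size=compounding(100.0, 1000.0, 1.001),  # a Schedule, not a generator
    tolerance=0.2,
    discard_oversize=False,
)
seqs = ["a tiny example".split(), "another slightly longer example text".split()]
assert len(list(batcher(seqs))) == 1  # both sequences fit under the first target size
```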
""" if isinstance(size, int): - size_ = itertools.repeat(size) # type: Iterator[int] + size_: Iterator[int] = itertools.repeat(size) else: size_ = iter(size) for outer_batch in minibatch(seqs, size=buffer): @@ -138,7 +140,7 @@ def minibatch_by_words( themselves, or be discarded if discard_oversize=True. seqs (Iterable[Sequence]): The sequences to minibatch. - size (int or Sequence[int]): The target number of words per batch. + size (int or Iterable[int]): The target number of words per batch. Can be a single integer, or a sequence, allowing for variable batch sizes. tolerance (float): What percentage of the size to allow batches to exceed. discard_oversize (bool): Whether to discard sequences that by themselves @@ -147,7 +149,7 @@ def minibatch_by_words( item. The `len` function is used by default. """ if isinstance(size, int): - size_ = itertools.repeat(size) # type: Iterator[int] + size_: Iterator[int] = itertools.repeat(size) else: size_ = iter(size) target_size = next(size_) @@ -230,3 +232,9 @@ def _batch_by_length( batches = [list(sorted(batch)) for batch in batches] batches.reverse() return batches + + +def _schedule_to_sizing(size: SizingSchedule) -> Sizing: + if isinstance(size, Schedule): + return size.to_generator() + return size diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py index 426fddf90..7e2494f5b 100644 --- a/spacy/training/callbacks.py +++ b/spacy/training/callbacks.py @@ -11,7 +11,7 @@ def create_copy_from_base_model( ) -> Callable[[Language], Language]: def copy_from_base_model(nlp): if tokenizer: - logger.info(f"Copying tokenizer from: {tokenizer}") + logger.info("Copying tokenizer from: %s", tokenizer) base_nlp = load_model(tokenizer) if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]: nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"])) @@ -23,7 +23,7 @@ def create_copy_from_base_model( ) ) if vocab: - logger.info(f"Copying vocab from: {vocab}") + logger.info("Copying vocab from: %s", vocab) # only reload if the vocab is from a different model if tokenizer != vocab: base_nlp = load_model(vocab) diff --git a/spacy/training/converters/conll_ner_to_docs.py b/spacy/training/converters/conll_ner_to_docs.py index 28b21c5f0..259f5fa8c 100644 --- a/spacy/training/converters/conll_ner_to_docs.py +++ b/spacy/training/converters/conll_ner_to_docs.py @@ -86,7 +86,7 @@ def conll_ner_to_docs( if model: nlp = load_model(model) else: - nlp = get_lang_class("xx")() + nlp = get_lang_class("mul")() for conll_doc in input_data.strip().split(doc_delimiter): conll_doc = conll_doc.strip() if not conll_doc: @@ -133,7 +133,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None): "Segmenting sentences with sentencizer. 
(Use `-b model` for " "improved parser-based sentence segmentation.)" ) - nlp = get_lang_class("xx")() + nlp = get_lang_class("mul")() sentencizer = nlp.create_pipe("sentencizer") lines = doc.strip().split("\n") words = [line.strip().split()[0] for line in lines] diff --git a/spacy/training/converters/json_to_docs.py b/spacy/training/converters/json_to_docs.py index 4123839f2..1ff7a64e0 100644 --- a/spacy/training/converters/json_to_docs.py +++ b/spacy/training/converters/json_to_docs.py @@ -3,7 +3,7 @@ from ..gold_io import json_iterate, json_to_annotations from ..example import annotations_to_doc from ..example import _fix_legacy_dict_data, _parse_example_dict_data from ...util import load_model -from ...lang.xx import MultiLanguage +from ...lang.mul import MultiLanguage def json_to_docs(input_data, model=None, **kwargs): diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index b9f929fcd..086ad831c 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -29,7 +29,7 @@ def create_docbin_reader( ) -> Callable[["Language"], Iterable[Example]]: if path is None: raise ValueError(Errors.E913) - util.logger.debug(f"Loading corpus from path: {path}") + util.logger.debug("Loading corpus from path: %s", path) return Corpus( path, gold_preproc=gold_preproc, @@ -58,6 +58,28 @@ def read_labels(path: Path, *, require: bool = False): return srsly.read_json(path) +@util.registry.readers("spacy.PlainTextCorpus.v1") +def create_plain_text_reader( + path: Optional[Path], + min_length: int = 0, + max_length: int = 0, +) -> Callable[["Language"], Iterable[Doc]]: + """Iterate Example objects from a file or directory of plain text + UTF-8 files with one line per doc. + + path (Path): The directory or filename to read from. + min_length (int): Minimum document length (in tokens). Shorter documents + will be skipped. Defaults to 0, which indicates no limit. + max_length (int): Maximum document length (in tokens). Longer documents will + be skipped. Defaults to 0, which indicates no limit. + + DOCS: https://spacy.io/api/corpus#plaintextcorpus + """ + if path is None: + raise ValueError(Errors.E913) + return PlainTextCorpus(path, min_length=min_length, max_length=max_length) + + def walk_corpus(path: Union[str, Path], file_type) -> List[Path]: path = util.ensure_path(path) if not path.is_dir() and path.parts[-1].endswith(file_type): @@ -257,3 +279,52 @@ class JsonlCorpus: # We don't *need* an example here, but it seems nice to # make it match the Corpus signature. yield Example(doc, Doc(nlp.vocab, words=words, spaces=spaces)) + + +class PlainTextCorpus: + """Iterate Example objects from a file or directory of plain text + UTF-8 files with one line per doc. + + path (Path): The directory or filename to read from. + min_length (int): Minimum document length (in tokens). Shorter documents + will be skipped. Defaults to 0, which indicates no limit. + max_length (int): Maximum document length (in tokens). Longer documents will + be skipped. Defaults to 0, which indicates no limit. + + DOCS: https://spacy.io/api/corpus#plaintextcorpus + """ + + file_type = "txt" + + def __init__( + self, + path: Optional[Union[str, Path]], + *, + min_length: int = 0, + max_length: int = 0, + ) -> None: + self.path = util.ensure_path(path) + self.min_length = min_length + self.max_length = max_length + + def __call__(self, nlp: "Language") -> Iterator[Example]: + """Yield examples from the data. + + nlp (Language): The current nlp object. + YIELDS (Example): The example objects. 
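For context, a rough usage sketch of the new reader (not part of the patch; the file path is a placeholder): each non-empty line of a UTF-8 file becomes one `Example` whose reference is a copy of the tokenized doc, with the `min_length`/`max_length` filters applied.

```python
# Rough sketch, not part of the patch; "./corpus/raw_text.txt" is a placeholder path.
import spacy
from spacy.training import PlainTextCorpus

nlp = spacy.blank("en")
corpus = PlainTextCorpus("./corpus/raw_text.txt", min_length=5, max_length=500)
for example in corpus(nlp):
    # predicted and reference are identical copies of the tokenized line
    print(len(example.predicted), example.reference.text[:40])
```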
+ + DOCS: https://spacy.io/api/corpus#plaintextcorpus-call + """ + for loc in walk_corpus(self.path, ".txt"): + with open(loc, encoding="utf-8") as f: + for text in f: + text = text.rstrip("\r\n") + if len(text): + doc = nlp.make_doc(text) + if self.min_length >= 1 and len(doc) < self.min_length: + continue + elif self.max_length >= 1 and len(doc) > self.max_length: + continue + # We don't *need* an example here, but it seems nice to + # make it match the Corpus signature. + yield Example(doc, doc.copy()) diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 95b0f0de9..a36fa0d73 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -1,5 +1,4 @@ from collections.abc import Iterable as IterableInstance -import warnings import numpy from murmurhash.mrmr cimport hash64 @@ -47,6 +46,13 @@ def validate_examples(examples, method): raise TypeError(err) +def validate_distillation_examples(examples, method): + validate_examples(examples, method) + for eg in examples: + if [token.text for token in eg.reference] != [token.text for token in eg.predicted]: + raise ValueError(Errors.E4003) + + def validate_get_examples(get_examples, method): """Check that a generator of a batch of examples received during processing is valid: the callable produces a non-empty list of Example objects. diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 9453959bf..8364e13f5 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -62,10 +62,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": frozen_components = T["frozen_components"] # Sourced components that require resume_training resume_components = [p for p in sourced if p not in frozen_components] - logger.info(f"Pipeline: {nlp.pipe_names}") + logger.info("Pipeline: %s", nlp.pipe_names) if resume_components: with nlp.select_pipes(enable=resume_components): - logger.info(f"Resuming training for: {resume_components}") + logger.info("Resuming training for: %s", resume_components) nlp.resume_training(sgd=optimizer) # Components that should be updated during training rehearsal_components = T["rehearsal_components"] @@ -77,16 +77,17 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": if T["max_epochs"] == -1: sample_size = 100 logger.debug( - f"Due to streamed train corpus, using only first {sample_size} " - f"examples for initialization. If necessary, provide all labels " - f"in [initialize]. More info: https://spacy.io/api/cli#init_labels" + "Due to streamed train corpus, using only first %s examples for initialization. " + "If necessary, provide all labels in [initialize]. 
" + "More info: https://spacy.io/api/cli#init_labels", + sample_size, ) nlp.initialize( lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer ) else: nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) - logger.info(f"Initialized pipeline components: {nlp.pipe_names}") + logger.info("Initialized pipeline components: %s", nlp.pipe_names) # Detect components with listeners that are not frozen consistently for name, proc in nlp.pipeline: for listener in getattr( @@ -113,7 +114,7 @@ def init_vocab( ) -> None: if lookups: nlp.vocab.lookups = lookups - logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}") + logger.info("Added vocab lookups: %s", ", ".join(lookups.tables)) data_path = ensure_path(data) if data_path is not None: lex_attrs = srsly.read_jsonl(data_path) @@ -129,11 +130,11 @@ def init_vocab( else: oov_prob = DEFAULT_OOV_PROB nlp.vocab.cfg.update({"oov_prob": oov_prob}) - logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab") + logger.info("Added %d lexical entries to the vocab", len(nlp.vocab)) logger.info("Created vocabulary") if vectors is not None: load_vectors_into_model(nlp, vectors) - logger.info(f"Added vectors: {vectors}") + logger.info("Added vectors: %s", vectors) # warn if source model vectors are not identical sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {}) vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"])) @@ -195,7 +196,7 @@ def init_tok2vec( if weights_data is not None: layer = get_tok2vec_ref(nlp, P) layer.from_bytes(weights_data) - logger.info(f"Loaded pretrained weights from {init_tok2vec}") + logger.info("Loaded pretrained weights from %s", init_tok2vec) return True return False @@ -206,7 +207,6 @@ def convert_vectors( *, truncate: int, prune: int, - name: Optional[str] = None, mode: str = VectorsMode.default, ) -> None: vectors_loc = ensure_path(vectors_loc) @@ -220,13 +220,13 @@ def convert_vectors( nlp.vocab.deduplicate_vectors() else: if vectors_loc: - logger.info(f"Reading vectors from {vectors_loc}") + logger.info("Reading vectors from %s", vectors_loc) vectors_data, vector_keys, floret_settings = read_vectors( vectors_loc, truncate, mode=mode, ) - logger.info(f"Loaded vectors from {vectors_loc}") + logger.info("Loaded vectors from %s", vectors_loc) else: vectors_data, vector_keys = (None, None) if vector_keys is not None and mode != VectorsMode.floret: @@ -245,12 +245,6 @@ def convert_vectors( strings=nlp.vocab.strings, data=vectors_data, keys=vector_keys ) nlp.vocab.deduplicate_vectors() - if name is None: - # TODO: Is this correct? Does this matter? 
- nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors" - else: - nlp.vocab.vectors.name = name - nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name if prune >= 1 and mode != VectorsMode.floret: nlp.vocab.prune_vectors(prune) diff --git a/spacy/training/loop.py b/spacy/training/loop.py index 233b5e642..3b9d13a4f 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -103,7 +103,7 @@ def train( stdout.write( msg.info(f"Set annotations on update for: {annotating_components}") + "\n" ) - stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n") + stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate(step=0)}") + "\n") with nlp.select_pipes(disable=frozen_components): log_step, finalize_logger = train_logger(nlp, stdout, stderr) try: @@ -208,7 +208,7 @@ def train_while_improving( if before_update: before_update_args = {"step": step, "epoch": epoch} before_update(nlp, before_update_args) - dropout = next(dropouts) # type: ignore + dropout = dropouts(optimizer.step) # type: ignore for subbatch in subdivide_batch(batch, accumulate_gradient): nlp.update( subbatch, @@ -241,6 +241,7 @@ def train_while_improving( score, other_scores = evaluate() else: score, other_scores = evaluate() + optimizer.last_score = score results.append((score, step)) is_best_checkpoint = score == max(results)[0] else: @@ -381,6 +382,6 @@ def clean_output_dir(path: Optional[Path]) -> None: if subdir.exists(): try: shutil.rmtree(str(subdir)) - logger.debug(f"Removed existing output directory: {subdir}") + logger.debug("Removed existing output directory: %s", subdir) except Exception as e: raise IOError(Errors.E901.format(path=path)) from e diff --git a/spacy/ty.py b/spacy/ty.py index 8f2903d78..f6dece840 100644 --- a/spacy/ty.py +++ b/spacy/ty.py @@ -1,6 +1,5 @@ -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Protocol, runtime_checkable from typing import Optional, Any, Iterable, Dict, Callable, Sequence, List -from .compat import Protocol, runtime_checkable from thinc.api import Optimizer, Model @@ -27,6 +26,25 @@ class TrainableComponent(Protocol): ... +@runtime_checkable +class DistillableComponent(Protocol): + is_distillable: bool + + def distill( + self, + teacher_pipe: Optional[TrainableComponent], + examples: Iterable["Example"], + *, + drop: float = 0.0, + sgd: Optional[Optimizer] = None, + losses: Optional[Dict[str, float]] = None + ) -> Dict[str, float]: + ... + + def finish_update(self, sgd: Optimizer) -> None: + ... + + @runtime_checkable class InitializableComponent(Protocol): def initialize( diff --git a/spacy/util.py b/spacy/util.py index 8bf8fb1b0..1ce869152 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -4,12 +4,13 @@ from typing import Iterator, Pattern, Generator, TYPE_CHECKING from types import ModuleType import os import importlib +import importlib.metadata import importlib.util import re from pathlib import Path import thinc from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer -from thinc.api import ConfigValidationError, Model +from thinc.api import ConfigValidationError, Model, constant as constant_schedule import functools import itertools import numpy @@ -32,22 +33,17 @@ import inspect import pkgutil import logging import socket +import stat try: import cupy.random except ImportError: cupy = None -# These are functions that were previously (v2.x) available from spacy.util -# and have since moved to Thinc. 
We're importing them here so people's code -# doesn't break, but they should always be imported from Thinc from now on, -# not from spacy.util. -from thinc.api import fix_random_seed, compounding, decaying # noqa: F401 - from .symbols import ORTH -from .compat import cupy, CudaStream, is_windows, importlib_metadata -from .errors import Errors, Warnings, OLD_MODEL_SHORTCUTS +from .compat import cupy, CudaStream, is_windows +from .errors import Errors, Warnings from . import about if TYPE_CHECKING: @@ -60,7 +56,7 @@ if TYPE_CHECKING: # fmt: off OOV_RANK = numpy.iinfo(numpy.uint64).max DEFAULT_OOV_PROB = -20 -LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"] +LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "grc", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"] # Default order of sections in the config file. Not all sections needs to exist, # and additional sections are added at the end, in alphabetical order. @@ -144,8 +140,17 @@ class registry(thinc.registry): return func @classmethod - def find(cls, registry_name: str, func_name: str) -> Callable: - """Get info about a registered function from the registry.""" + def find( + cls, registry_name: str, func_name: str + ) -> Dict[str, Optional[Union[str, int]]]: + """Find information about a registered function, including the + module and path to the file it's defined in, the line number and the + docstring, if available. + + registry_name (str): Name of the catalogue registry. + func_name (str): Name of the registered function. + RETURNS (Dict[str, Optional[Union[str, int]]]): The function info. + """ # We're overwriting this classmethod so we're able to provide more # specific error messages and implement a fallback to spacy-legacy. if not hasattr(cls, registry_name): @@ -288,7 +293,7 @@ def find_matching_language(lang: str) -> Optional[str]: import spacy.lang # noqa: F401 if lang == "xx": - return "xx" + return "mul" # Find out which language modules we have possible_languages = [] @@ -306,11 +311,7 @@ def find_matching_language(lang: str) -> Optional[str]: # is labeled that way is probably trying to be distinct from 'zh' and # shouldn't automatically match. match = langcodes.closest_supported_match(lang, possible_languages, max_distance=9) - if match == "mul": - # Convert 'mul' back to spaCy's 'xx' - return "xx" - else: - return match + return match def get_lang_class(lang: str) -> Type["Language"]: @@ -434,8 +435,6 @@ def load_model( return load_model_from_path(Path(name), **kwargs) # type: ignore[arg-type] elif hasattr(name, "exists"): # Path or Path-like to model data return load_model_from_path(name, **kwargs) # type: ignore[arg-type] - if name in OLD_MODEL_SHORTCUTS: - raise IOError(Errors.E941.format(name=name, full=OLD_MODEL_SHORTCUTS[name])) # type: ignore[index] raise IOError(Errors.E050.format(name=name)) @@ -714,8 +713,8 @@ def get_package_version(name: str) -> Optional[str]: RETURNS (str / None): The version or None if package not installed. """ try: - return importlib_metadata.version(name) # type: ignore[attr-defined] - except importlib_metadata.PackageNotFoundError: # type: ignore[attr-defined] + return importlib.metadata.version(name) # type: ignore[attr-defined] + except importlib.metadata.PackageNotFoundError: # type: ignore[attr-defined] return None @@ -903,7 +902,7 @@ def is_package(name: str) -> bool: RETURNS (bool): True if installed package, False if not. 
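The `xx` → `mul` rename above means the multi-language pipeline is now looked up under its IETF code; a quick sketch, not part of the patch:

```python
# Quick sketch, not part of the patch: "xx" still resolves, but "mul" is canonical.
from spacy.util import find_matching_language, get_lang_class

assert find_matching_language("xx") == "mul"
nlp = get_lang_class("mul")()  # MultiLanguage, as used by the CoNLL converters above
print(nlp.lang)
```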
""" try: - importlib_metadata.distribution(name) # type: ignore[attr-defined] + importlib.metadata.distribution(name) # type: ignore[attr-defined] return True except: # noqa: E722 return False @@ -1041,8 +1040,15 @@ def make_tempdir() -> Generator[Path, None, None]: """ d = Path(tempfile.mkdtemp()) yield d + + # On Windows, git clones use read-only files, which cause permission errors + # when being deleted. This forcibly fixes permissions. + def force_remove(rmfunc, path, ex): + os.chmod(path, stat.S_IWRITE) + rmfunc(path) + try: - shutil.rmtree(str(d)) + shutil.rmtree(str(d), onerror=force_remove) except PermissionError as e: warnings.warn(Warnings.W091.format(dir=d, msg=e)) @@ -1593,7 +1599,7 @@ def minibatch(items, size): if isinstance(size, int): size_ = itertools.repeat(size) else: - size_ = size + size_ = iter(size) items = iter(items) while True: batch_size = next(size_) @@ -1726,7 +1732,7 @@ def packages_distributions() -> Dict[str, List[str]]: it's not available in the builtin importlib.metadata. """ pkg_to_dist = defaultdict(list) - for dist in importlib_metadata.distributions(): + for dist in importlib.metadata.distributions(): for pkg in (dist.read_text("top_level.txt") or "").split(): pkg_to_dist[pkg].append(dist.metadata["Name"]) return dict(pkg_to_dist) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index be0f6db09..bec3ac276 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -52,7 +52,6 @@ cdef class Vectors: DOCS: https://spacy.io/api/vectors """ cdef public object strings - cdef public object name cdef readonly object mode cdef public object data cdef public object key2row @@ -64,14 +63,13 @@ cdef class Vectors: cdef readonly unicode bow cdef readonly unicode eow - def __init__(self, *, strings=None, shape=None, data=None, keys=None, name=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"): + def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"): """Create a new vector store. strings (StringStore): The string store. shape (tuple): Size of the table, as (# entries, # columns) data (numpy.ndarray or cupy.ndarray): The vector data. keys (iterable): A sequence of keys, aligned with the data. - name (str): A name to identify the vectors table. mode (str): Vectors mode: "default" or "floret" (default: "default"). minn (int): The floret char ngram minn (default: 0). maxn (int): The floret char ngram maxn (default: 0). 
@@ -85,7 +83,6 @@ cdef class Vectors: self.strings = strings if self.strings is None: self.strings = StringStore() - self.name = name if mode not in Mode.values(): raise ValueError( Errors.E202.format( diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 9c951b2b7..2db709b71 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -32,16 +32,14 @@ cdef class Vocab: cdef public object writing_system cdef public object get_noun_chunks cdef readonly int length - cdef public object _unused_object # TODO remove in v4, see #9150 cdef public object lex_attr_getters cdef public object cfg - cdef const LexemeC* get(self, Pool mem, str string) except NULL - cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL + cdef const LexemeC* get(self, str string) except NULL + cdef const LexemeC* get_by_orth(self, attr_t orth) except NULL cdef const TokenC* make_fused_token(self, substrings) except NULL - cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL + cdef const LexemeC* _new_lexeme(self, str string) except NULL cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 - cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL cdef PreshMap _by_orth diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi index 4cc359c47..871044fff 100644 --- a/spacy/vocab.pyi +++ b/spacy/vocab.pyi @@ -11,7 +11,8 @@ from .vectors import Vectors from pathlib import Path def create_vocab( - lang: Optional[str], defaults: Any, vectors_name: Optional[str] = ... + lang: Optional[str], + defaults: Any, ) -> Vocab: ... class Vocab: @@ -28,7 +29,6 @@ class Vocab: strings: Optional[Union[List[str], StringStore]] = ..., lookups: Optional[Lookups] = ..., oov_prob: float = ..., - vectors_name: Optional[str] = ..., writing_system: Dict[str, Any] = ..., get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]] = ..., ) -> None: ... @@ -72,7 +72,6 @@ def unpickle_vocab( sstore: StringStore, vectors: Any, morphology: Any, - _unused_object: Any, lex_attr_getters: Any, lookups: Any, get_noun_chunks: Any, diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 27f8e5f98..f3c3595ef 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -23,7 +23,7 @@ from .lang.norm_exceptions import BASE_NORMS from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang -def create_vocab(lang, defaults, vectors_name=None): +def create_vocab(lang, defaults): # If the spacy-lookups-data package is installed, we pre-populate the lookups # with lexeme data, if available lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters} @@ -39,7 +39,6 @@ def create_vocab(lang, defaults, vectors_name=None): lex_attr_getters=lex_attrs, writing_system=defaults.writing_system, get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"), - vectors_name=vectors_name, ) @@ -51,8 +50,8 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab """ def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None, - oov_prob=-20., vectors_name=None, writing_system={}, - get_noun_chunks=None, **deprecated_kwargs): + oov_prob=-20., writing_system={}, get_noun_chunks=None, + **deprecated_kwargs): """Create the vocabulary. lex_attr_getters (dict): A dictionary mapping attribute IDs to @@ -61,7 +60,6 @@ cdef class Vocab: vice versa. lookups (Lookups): Container for large lookup tables and dictionaries. oov_prob (float): Default OOV probability. - vectors_name (str): Optional name to identify the vectors table. 
get_noun_chunks (Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]): A function that yields base noun phrases used for Doc.noun_chunks. """ @@ -78,7 +76,7 @@ cdef class Vocab: _ = self[string] self.lex_attr_getters = lex_attr_getters self.morphology = Morphology(self.strings) - self.vectors = Vectors(strings=self.strings, name=vectors_name) + self.vectors = Vectors(strings=self.strings) self.lookups = lookups self.writing_system = writing_system self.get_noun_chunks = get_noun_chunks @@ -139,7 +137,7 @@ cdef class Vocab: self.lex_attr_getters[flag_id] = flag_getter return flag_id - cdef const LexemeC* get(self, Pool mem, str string) except NULL: + cdef const LexemeC* get(self, str string) except NULL: """Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme` if necessary using memory acquired from the given pool. If the pool is the lexicon's own memory, the lexeme is saved in the lexicon. @@ -157,9 +155,9 @@ cdef class Vocab: orth=key, orth_id=string)) return lex else: - return self._new_lexeme(mem, string) + return self._new_lexeme(string) - cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL: + cdef const LexemeC* get_by_orth(self, attr_t orth) except NULL: """Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme` if necessary using memory acquired from the given pool. If the pool is the lexicon's own memory, the lexeme is saved in the lexicon. @@ -171,21 +169,10 @@ cdef class Vocab: if lex != NULL: return lex else: - return self._new_lexeme(mem, self.strings[orth]) + return self._new_lexeme(self.strings[orth]) - cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL: - # I think this heuristic is bad, and the Vocab should always - # own the lexemes. It avoids weird bugs this way, as it's how the thing - # was originally supposed to work. The best solution to the growing - # memory use is to periodically reset the vocab, which is an action - # that should be up to the user to do (so we don't need to keep track - # of the doc ownership). - # TODO: Change the C API so that the mem isn't passed in here. 
- mem = self.mem - #if len(string) < 3 or self.length < 10000: - # mem = self.mem - cdef bint is_oov = mem is not self.mem - lex = mem.alloc(1, sizeof(LexemeC)) + cdef const LexemeC* _new_lexeme(self, str string) except NULL: + lex = self.mem.alloc(1, sizeof(LexemeC)) lex.orth = self.strings.add(string) lex.length = len(string) if self.vectors is not None: @@ -199,8 +186,7 @@ cdef class Vocab: value = self.strings.add(value) if value is not None: Lexeme.set_struct_attr(lex, attr, value) - if not is_oov: - self._add_lex_to_vocab(lex.orth, lex) + self._add_lex_to_vocab(lex.orth, lex) if lex == NULL: raise ValueError(Errors.E085.format(string=string)) return lex @@ -268,11 +254,10 @@ cdef class Vocab: cdef int i tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) for i, props in enumerate(substrings): - props = intify_attrs(props, strings_map=self.strings, - _do_deprecated=True) + props = intify_attrs(props, strings_map=self.strings) token = &tokens[i] # Set the special tokens up to have arbitrary attributes - lex = self.get_by_orth(self.mem, props[ORTH]) + lex = self.get_by_orth(props[ORTH]) token.lex = lex for attr_id, value in props.items(): Token.set_struct_attr(token, attr_id, value) @@ -321,7 +306,7 @@ cdef class Vocab: for key, row in self.vectors.key2row.items() } # replace vectors with deduplicated version - self.vectors = Vectors(strings=self.strings, data=data, name=self.vectors.name) + self.vectors = Vectors(strings=self.strings, data=data) for key, row in key2row.items(): self.vectors.add(key, row=row) @@ -371,7 +356,7 @@ cdef class Vocab: keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64") keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]]) toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]]) - self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row], name=self.vectors.name) + self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row]) syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size) syn_keys = ops.to_numpy(syn_keys) remap = {} @@ -559,21 +544,18 @@ def pickle_vocab(vocab): sstore = vocab.strings vectors = vocab.vectors morph = vocab.morphology - _unused_object = vocab._unused_object lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters) lookups = vocab.lookups get_noun_chunks = vocab.get_noun_chunks return (unpickle_vocab, - (sstore, vectors, morph, _unused_object, lex_attr_getters, lookups, get_noun_chunks)) + (sstore, vectors, morph, lex_attr_getters, lookups, get_noun_chunks)) -def unpickle_vocab(sstore, vectors, morphology, _unused_object, - lex_attr_getters, lookups, get_noun_chunks): +def unpickle_vocab(sstore, vectors, morphology, lex_attr_getters, lookups, get_noun_chunks): cdef Vocab vocab = Vocab() vocab.vectors = vectors vocab.strings = sstore vocab.morphology = morphology - vocab._unused_object = _unused_object vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters) vocab.lookups = lookups vocab.get_noun_chunks = get_noun_chunks diff --git a/website/UNIVERSE.md b/website/UNIVERSE.md index ac4e2e684..a9008086c 100644 --- a/website/UNIVERSE.md +++ b/website/UNIVERSE.md @@ -61,7 +61,7 @@ use a linter to verify that your markup is correct. 
"import spacy", "import package_name", "", - "nlp = spacy.load('en')", + "nlp = spacy.load('en_core_web_sm')", "nlp.add_pipe(package_name)" ], "code_language": "python", diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx index 2a1bc4380..ee41144f6 100644 --- a/website/docs/api/architectures.mdx +++ b/website/docs/api/architectures.mdx @@ -553,18 +553,17 @@ for a Tok2Vec layer. ## Parser & NER architectures {id="parser"} -### spacy.TransitionBasedParser.v2 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"} +### spacy.TransitionBasedParser.v3 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"} > #### Example Config > > ```ini > [model] -> @architectures = "spacy.TransitionBasedParser.v2" +> @architectures = "spacy.TransitionBasedParser.v3" > state_type = "ner" > extra_state_tokens = false > hidden_width = 64 > maxout_pieces = 2 -> use_upper = true > > [model.tok2vec] > @architectures = "spacy.HashEmbedCNN.v2" @@ -594,23 +593,22 @@ consists of either two or three subnetworks: state representation. If not present, the output from the lower model is used as action scores directly. -| Name | Description | -| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | -| `state_type` | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~ | -| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ | -| `hidden_width` | The width of the hidden layer. ~~int~~ | -| `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ | -| `use_upper` | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ | -| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ | -| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ | +| Name | Description | +| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | +| `state_type` | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~ | +| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. 
Defaults to `False`. ~~bool~~ | +| `hidden_width` | The width of the hidden layer. ~~int~~ | +| `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. ~~int~~ | +| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ | [TransitionBasedParser.v1](/api/legacy#TransitionBasedParser_v1) had the exact same signature, but the `use_upper` argument was `True` by default. - + ## Tagging architectures {id="tagger",source="spacy/ml/models/tagger.py"} @@ -899,15 +897,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a | `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | -### spacy.EmptyKB.v1 {id="EmptyKB"} +### spacy.EmptyKB.v1 {id="EmptyKB.v1"} A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab) -instance. This is the default when a new entity linker component is created. +instance. | Name | Description | | ---------------------- | ----------------------------------------------------------------------------------- | | `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ | +### spacy.EmptyKB.v2 {id="EmptyKB"} + +A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab) +instance. This is the default when a new entity linker component is created. It +returns a `Callable[[Vocab, int], InMemoryLookupKB]`. + ### spacy.KBFromFile.v1 {id="KBFromFile"} A function that reads an existing `KnowledgeBase` from file. @@ -924,6 +928,15 @@ plausible [`Candidate`](/api/kb/#candidate) objects. The default `CandidateGenerator` uses the text of a mention to find its potential aliases in the `KnowledgeBase`. Note that this function is case-dependent. +### spacy.CandidateBatchGenerator.v1 {id="CandidateBatchGenerator"} + +A function that takes as input a [`KnowledgeBase`](/api/kb) and an `Iterable` of +[`Span`](/api/span) objects denoting named entities, and returns a list of +plausible [`Candidate`](/api/kb/#candidate) objects per specified +[`Span`](/api/span). The default `CandidateBatchGenerator` uses the text of a +mention to find its potential aliases in the `KnowledgeBase`. Note that this +function is case-dependent. + ## Coreference {id="coref-architectures",tag="experimental"} A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to diff --git a/website/docs/api/attributeruler.mdx b/website/docs/api/attributeruler.mdx index c18319187..ce4faeead 100644 --- a/website/docs/api/attributeruler.mdx +++ b/website/docs/api/attributeruler.mdx @@ -1,7 +1,7 @@ --- title: AttributeRuler tag: class -source: spacy/pipeline/attributeruler.py +source: spacy/pipeline/attribute_ruler.py version: 3 teaser: 'Pipeline component for rule-based token attribute assignment' api_string_name: attribute_ruler @@ -34,7 +34,7 @@ how the component should be configured. You can override its settings via the | `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. 
~~bool~~ | ```python -%%GITHUB_SPACY/spacy/pipeline/attributeruler.py +%%GITHUB_SPACY/spacy/pipeline/attribute_ruler.py ``` ## AttributeRuler.\_\_init\_\_ {id="init",tag="method"} diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index ca4023101..1a3f15e48 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -201,7 +201,7 @@ This functionality was previously available as part of the command `init-model`. ```bash -$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose] +$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--verbose] ``` | Name | Description | @@ -212,7 +212,6 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ | | `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ | | `--mode`, `-m` | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~Optional[str] \(option)~~ | -| `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ | | `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | **CREATES** | A spaCy pipeline directory containing the vocab and vectors. | @@ -270,10 +269,10 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type] | `--file-type`, `-t` | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ | | `--n-sents`, `-n` | Number of sentences per document. Supported for: `conll`, `conllu`, `iob`, `ner` ~~int (option)~~ | | `--seg-sents`, `-s` | Segment sentences. Supported for: `conll`, `ner` ~~bool (flag)~~ | -| `--base`, `-b`, `--model` | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str](option)~~ | +| `--base`, `-b`, `--model` | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str] (option)~~ | | `--morphology`, `-m` | Enable appending morphology to tags. Supported for: `conllu` ~~bool (flag)~~ | | `--merge-subtokens`, `-T` | Merge CoNLL-U subtokens ~~bool (flag)~~ | -| `--ner-map`, `-nm` | NER tag mapping (as JSON-encoded dict of entity types). Supported for: `conllu` ~~Optional[Path](option)~~ | +| `--ner-map`, `-nm` | NER tag mapping (as JSON-encoded dict of entity types). Supported for: `conllu` ~~Optional[Path] (option)~~ | | `--lang`, `-l` | Language code (if tokenizer required). ~~Optional[str] \(option)~~ | | `--concatenate`, `-C` | Concatenate output to a single file ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. 
~~bool (flag)~~ | @@ -362,7 +361,7 @@ Module spacy.language File /path/to/spacy/language.py (line 64) ℹ [components.ner.model] Registry @architectures -Name spacy.TransitionBasedParser.v1 +Name spacy.TransitionBasedParser.v3 Module spacy.ml.models.parser File /path/to/spacy/ml/models/parser.py (line 11) ℹ [components.ner.model.tok2vec] @@ -372,7 +371,7 @@ Module spacy.ml.models.tok2vec File /path/to/spacy/ml/models/tok2vec.py (line 16) ℹ [components.parser.model] Registry @architectures -Name spacy.TransitionBasedParser.v1 +Name spacy.TransitionBasedParser.v3 Module spacy.ml.models.parser File /path/to/spacy/ml/models/parser.py (line 11) ℹ [components.parser.model.tok2vec] @@ -697,7 +696,7 @@ scorer = {"@scorers":"spacy.ner_scorer.v1"} update_with_oracle_cut_size = 100 [components.ner.model] -@architectures = "spacy.TransitionBasedParser.v2" +@architectures = "spacy.TransitionBasedParser.v3" state_type = "ner" extra_state_tokens = false - hidden_width = 64 @@ -720,7 +719,7 @@ scorer = {"@scorers":"spacy.parser_scorer.v1"} update_with_oracle_cut_size = 100 [components.parser.model] -@architectures = "spacy.TransitionBasedParser.v2" +@architectures = "spacy.TransitionBasedParser.v3" state_type = "parser" extra_state_tokens = false hidden_width = 128 @@ -1410,12 +1409,13 @@ $ python -m spacy project assets [project_dir] > $ python -m spacy project assets [--sparse] > ``` -| Name | Description | -| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | -| `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v22.2+. ~~bool (flag)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | Downloaded or copied assets defined in the `project.yml`. | +| Name | Description | +| ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | +| `--extra`, `-e` 3.3.1 | Download assets marked as "extra". Default false. ~~bool (flag)~~ | +| `--sparse`, `-S` | Enable [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) to only check out and download what's needed. Requires Git v22.2+. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | Downloaded or copied assets defined in the `project.yml`. | ### project run {id="project-run",tag="command"} @@ -1491,7 +1491,7 @@ $ python -m spacy project push [remote] [project_dir] ### project pull {id="project-pull",tag="command"} Download all files or directories listed as `outputs` for commands, unless they -are not already present locally. When searching for files in the remote, `pull` +are already present locally. When searching for files in the remote, `pull` won't just look at the output path, but will also consider the **command string** and the **hashes of the dependencies**. 
For instance, let's say you've previously pushed a checkpoint to the remote, but now you've changed some diff --git a/website/docs/api/corpus.mdx b/website/docs/api/corpus.mdx index c58723e82..75e8f5c0f 100644 --- a/website/docs/api/corpus.mdx +++ b/website/docs/api/corpus.mdx @@ -175,3 +175,68 @@ Yield examples from the data. | ---------- | -------------------------------------- | | `nlp` | The current `nlp` object. ~~Language~~ | | **YIELDS** | The examples. ~~Example~~ | + +## PlainTextCorpus {id="plaintextcorpus",tag="class",version="3.5.1"} + +Iterate over documents from a plain text file. Can be used to read the raw text +corpus for language model +[pretraining](/usage/embeddings-transformers#pretraining). The expected file +format is: + +- UTF-8 encoding +- One document per line +- Blank lines are ignored. + +```text {title="Example"} +Can I ask where you work now and what you do, and if you enjoy it? +They may just pull out of the Seattle market completely, at least until they have autonomous vehicles. +My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in. +``` + +### PlainTextCorpus.\_\_init\_\_ {id="plaintextcorpus-init",tag="method"} + +Initialize the reader. + +> #### Example +> +> ```python +> from spacy.training import PlainTextCorpus +> +> corpus = PlainTextCorpus("./data/docs.txt") +> ``` +> +> ```ini +> ### Example config +> [corpora.pretrain] +> @readers = "spacy.PlainTextCorpus.v1" +> path = "corpus/raw_text.txt" +> min_length = 0 +> max_length = 0 +> ``` + +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------- | +| `path` | The directory or filename to read from. Expects newline-delimited documents in UTF8 format. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `min_length` | Minimum document length (in tokens). Shorter documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ | +| `max_length` | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ | + +### PlainTextCorpus.\_\_call\_\_ {id="plaintextcorpus-call",tag="method"} + +Yield examples from the data. + +> #### Example +> +> ```python +> from spacy.training import PlainTextCorpus +> import spacy +> +> corpus = PlainTextCorpus("./docs.txt") +> nlp = spacy.blank("en") +> data = corpus(nlp) +> ``` + +| Name | Description | +| ---------- | -------------------------------------- | +| `nlp` | The current `nlp` object. ~~Language~~ | +| **YIELDS** | The examples. ~~Example~~ | diff --git a/website/docs/api/cython-classes.mdx b/website/docs/api/cython-classes.mdx index ce7c03940..88bd92c72 100644 --- a/website/docs/api/cython-classes.mdx +++ b/website/docs/api/cython-classes.mdx @@ -163,14 +163,13 @@ vocabulary. > #### Example > > ```python -> lexeme = vocab.get(vocab.mem, "hello") +> lexeme = vocab.get("hello") > ``` -| Name | Description | -| ----------- | ---------------------------------------------------------------------------------------------------------- | -| `mem` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ | -| `string` | The string of the word to look up. ~~str~~ | -| **RETURNS** | The lexeme in the vocabulary. 
~~const LexemeC\*~~ | +| Name | Description | +| ----------- | ------------------------------------------------- | +| `string` | The string of the word to look up. ~~str~~ | +| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ | ### Vocab.get_by_orth {id="vocab_get_by_orth",tag="method"} @@ -183,11 +182,10 @@ vocabulary. > lexeme = vocab.get_by_orth(doc[0].lex.norm) > ``` -| Name | Description | -| ----------- | ---------------------------------------------------------------------------------------------------------- | -| `mem` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ | -| `orth` | ID of the verbatim text content. ~~attr_t (uint64_t)~~ | -| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ | +| Name | Description | +| ----------- | ------------------------------------------------------ | +| `orth` | ID of the verbatim text content. ~~attr_t (uint64_t)~~ | +| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ | ## StringStore {id="stringstore",tag="cdef class",source="spacy/strings.pxd"} diff --git a/website/docs/api/dependencymatcher.mdx b/website/docs/api/dependencymatcher.mdx index 390034a6c..14e0916d1 100644 --- a/website/docs/api/dependencymatcher.mdx +++ b/website/docs/api/dependencymatcher.mdx @@ -68,24 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which come directly from [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): -| Symbol | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------- | -| `A < B` | `A` is the immediate dependent of `B`. | -| `A > B` | `A` is the immediate head of `B`. | -| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | -| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | -| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | -| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | -| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | -| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | -| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | -| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | -| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | -| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | -| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. 
| +| Symbol | Description | +| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | +| `A < B` | `A` is the immediate dependent of `B`. | +| `A > B` | `A` is the immediate head of `B`. | +| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | +| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | +| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | +| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | +| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | +| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | +| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | +| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | ## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"} diff --git a/website/docs/api/dependencyparser.mdx b/website/docs/api/dependencyparser.mdx index a6bc48cdf..296d6d87d 100644 --- a/website/docs/api/dependencyparser.mdx +++ b/website/docs/api/dependencyparser.mdx @@ -131,6 +131,39 @@ and all pipeline components are applied to the `Doc` in order. Both | `doc` | The document to process. ~~Doc~~ | | **RETURNS** | The processed document. ~~Doc~~ | +## DependencyParser.distill {id="distill", tag="method,experimental", version="4"} + +Train a pipe (the student) on the predictions of another pipe (the teacher). The +student is typically trained on the probability distribution of the teacher, but +details may differ per pipe. The goal of distillation is to transfer knowledge +from the teacher to the student. + +The distillation is performed on ~~Example~~ objects. The `Example.reference` +and `Example.predicted` ~~Doc~~s must have the same number of tokens and the +same orthography. Even though the reference does not need have to have gold +annotations, the teacher could adds its own annotations when necessary. + +This feature is experimental. 
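+
+As a rough sketch of how distillation examples can be put together from raw
+text only (the pipeline name, texts and setup below are illustrative
+assumptions, not part of the API, and student initialization is glossed over):
+
+```python
+import spacy
+from spacy.training import Example
+
+teacher = spacy.load("en_core_web_sm")   # assumed trained pipeline with a parser
+student = spacy.blank("en")
+student_pipe = student.add_pipe("parser")
+# Assumption: in a real setup the student component is initialized with the
+# same labels as the teacher before distillation.
+
+# Reference (teacher) and predicted (student) docs must share the tokenization.
+texts = ["Distillation only needs raw text.", "The teacher predicts the labels."]
+examples = [Example(student.make_doc(t), teacher.make_doc(t)) for t in texts]
+
+optimizer = student.resume_training()
+losses = student_pipe.distill(teacher.get_pipe("parser"), examples, sgd=optimizer)
+```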
+ +> #### Example +> +> ```python +> teacher_pipe = teacher.add_pipe("parser") +> student_pipe = student.add_pipe("parser") +> optimizer = nlp.resume_training() +> losses = student.distill(teacher_pipe, examples, sgd=optimizer) +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~ | +| `examples` | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | Dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | + ## DependencyParser.pipe {id="pipe",tag="method"} Apply the pipe to a stream of documents. This usually happens under the hood @@ -169,12 +202,6 @@ arguments it receives via the [`[initialize.components]`](/api/data-formats#config-initialize) block in the config. - - -This method was previously called `begin_training`. - - - > #### Example > > ```python @@ -274,6 +301,27 @@ predicted scores. | `scores` | Scores representing the model's predictions. ~~StateClass~~ | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | +## DependencyParser.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"} + +Calculate the loss and its gradient for the batch of student scores relative to +the teacher scores. + +> #### Example +> +> ```python +> teacher_parser = teacher.get_pipe("parser") +> student_parser = student.add_pipe("parser") +> student_scores = student_parser.predict([eg.predicted for eg in examples]) +> teacher_scores = teacher_parser.predict([eg.predicted for eg in examples]) +> loss, d_loss = student_parser.get_teacher_student_loss(teacher_scores, student_scores) +> ``` + +| Name | Description | +| ---------------- | --------------------------------------------------------------------------- | +| `teacher_scores` | Scores representing the teacher model's predictions. | +| `student_scores` | Scores representing the student model's predictions. | +| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | + ## DependencyParser.create_optimizer {id="create_optimizer",tag="method"} Create an [`Optimizer`](https://thinc.ai/docs/api-optimizers) for the pipeline diff --git a/website/docs/api/doc.mdx b/website/docs/api/doc.mdx index a5f3de6be..fca056ed0 100644 --- a/website/docs/api/doc.mdx +++ b/website/docs/api/doc.mdx @@ -37,7 +37,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the | `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~ | | `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | | _keyword-only_ | | -| `user\_data` | Optional extra data to attach to the Doc. 
~~Dict~~ | +| `user_data` | Optional extra data to attach to the Doc. ~~Dict~~ | | `tags` 3 | A list of strings, of the same length as `words`, to assign as `token.tag` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | | `pos` 3 | A list of strings, of the same length as `words`, to assign as `token.pos` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | | `morphs` 3 | A list of strings, of the same length as `words`, to assign as `token.morph` for each word. Defaults to `None`. ~~Optional[List[str]]~~ | @@ -209,15 +209,17 @@ alignment mode `"strict". > assert span.text == "New York" > ``` -| Name | Description | -| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `start` | The index of the first character of the span. ~~int~~ | -| `end` | The index of the last character after the span. ~~int~~ | -| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | -| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | -| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | -| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | -| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | +| Name | Description | +| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `start` | The index of the first character of the span. ~~int~~ | +| `end` | The index of the last character after the span. ~~int~~ | +| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | +| _keyword-only_ | | +| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | +| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | +| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | +| `span_id` 3.3.1 | An identifier to associate with the span. ~~Union[int, str]~~ | +| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | ## Doc.set_ents {id="set_ents",tag="method",version="3"} @@ -652,11 +654,10 @@ the [`TextCategorizer`](/api/textcategorizer). ## Doc.noun_chunks {id="noun_chunks",tag="property",model="parser"} -Iterate over the base noun phrases in the document. Yields base noun-phrase -`Span` objects, if the document has been syntactically parsed. A base noun -phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be -nested within it – so no NP-level coordination, no prepositional phrases, and no -relative clauses. 
+Returns a tuple of the base noun phrases in the doc, if the document has been +syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that +does not permit other NPs to be nested within it – so no NP-level coordination, +no prepositional phrases, and no relative clauses. To customize the noun chunk iterator in a loaded pipeline, modify [`nlp.vocab.get_noun_chunks`](/api/vocab#attributes). If the `noun_chunk` @@ -673,13 +674,13 @@ implemented for the given language, a `NotImplementedError` is raised. > assert chunks[1].text == "another phrase" > ``` -| Name | Description | -| ---------- | ------------------------------------- | -| **YIELDS** | Noun chunks in the document. ~~Span~~ | +| Name | Description | +| ----------- | -------------------------------------------- | +| **RETURNS** | Noun chunks in the document. ~~Tuple[Span]~~ | ## Doc.sents {id="sents",tag="property",model="sentences"} -Iterate over the sentences in the document. Sentence spans have no label. +Returns a tuple of the sentences in the document. Sentence spans have no label. This property is only available when [sentence boundaries](/usage/linguistic-features#sbd) have been set on the @@ -695,9 +696,9 @@ will raise an error otherwise. > assert [s.root.text for s in sents] == ["is", "'s"] > ``` -| Name | Description | -| ---------- | ----------------------------------- | -| **YIELDS** | Sentences in the document. ~~Span~~ | +| Name | Description | +| ----------- | ------------------------------------------ | +| **RETURNS** | Sentences in the document. ~~Tuple[Span]~~ | ## Doc.has_vector {id="has_vector",tag="property",model="vectors"} @@ -751,22 +752,22 @@ The L2 norm of the document's vector representation. ## Attributes {id="attributes"} -| Name | Description | -| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `text` | A string representation of the document text. ~~str~~ | -| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ | -| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ | -| `vocab` | The store of lexical types. ~~Vocab~~ | -| `tensor` | Container for dense vector representations. ~~numpy.ndarray~~ | -| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ | -| `lang` | Language of the document's vocabulary. ~~int~~ | -| `lang_` | Language of the document's vocabulary. ~~str~~ | -| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ | -| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ | -| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ | -| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ | -| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ | -| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). 
~~Underscore~~ | +| Name | Description | +| ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| `text` | A string representation of the document text. ~~str~~ | +| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ | +| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ | +| `vocab` | The store of lexical types. ~~Vocab~~ | +| `tensor` | Container for dense vector representations. ~~numpy.ndarray~~ | +| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ | +| `lang` | Language of the document's vocabulary. ~~int~~ | +| `lang_` | Language of the document's vocabulary. ~~str~~ | +| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ | +| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ | +| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ | +| `has_unknown_spaces` | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~ | +| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ | +| `activations` 4.0 | A dictionary of activations per trainable pipe (available when the `save_activations` option of a pipe is enabled). ~~Dict[str, Option[Any]]~~ | ## Serialization fields {id="serialization-fields"} @@ -784,7 +785,6 @@ serialization by passing in the string names via the `exclude` argument. | Name | Description | | ------------------ | --------------------------------------------- | | `text` | The value of the `Doc.text` attribute. | -| `sentiment` | The value of the `Doc.sentiment` attribute. | | `tensor` | The value of the `Doc.tensor` attribute. | | `user_data` | The value of the `Doc.user_data` dictionary. | | `user_data_keys` | The keys of the `Doc.user_data` dictionary. | diff --git a/website/docs/api/edittreelemmatizer.mdx b/website/docs/api/edittreelemmatizer.mdx index 82967482c..c8b5c7180 100644 --- a/website/docs/api/edittreelemmatizer.mdx +++ b/website/docs/api/edittreelemmatizer.mdx @@ -44,14 +44,15 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("trainable_lemmatizer", config=config, name="lemmatizer") > ``` -| Setting | Description | -| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `model` | A model instance that predicts the edit tree probabilities. The output vectors should match the number of edit trees in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | -| `backoff` | ~~Token~~ attribute to use when no applicable edit tree is found. Defaults to `orth`. ~~str~~ | -| `min_tree_freq` | Minimum frequency of an edit tree in the training set to be used. Defaults to `3`. ~~int~~ | -| `overwrite` | Whether existing annotation is overwritten. 
Defaults to `False`. ~~bool~~ | -| `top_k` | The number of most probable edit trees to try before resorting to `backoff`. Defaults to `1`. ~~int~~ | -| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"lemma"`. ~~Optional[Callable]~~ | +| Setting | Description | +| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | A model instance that predicts the edit tree probabilities. The output vectors should match the number of edit trees in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | +| `backoff` | ~~Token~~ attribute to use when no applicable edit tree is found. Defaults to `orth`. ~~str~~ | +| `min_tree_freq` | Minimum frequency of an edit tree in the training set to be used. Defaults to `3`. ~~int~~ | +| `overwrite` | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | +| `top_k` | The number of most probable edit trees to try before resorting to `backoff`. Defaults to `1`. ~~int~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"lemma"`. ~~Optional[Callable]~~ | +| `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"tree_ids"`. ~~Union[bool, list[str]]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/edit_tree_lemmatizer.py @@ -114,6 +115,39 @@ and all pipeline components are applied to the `Doc` in order. Both | `doc` | The document to process. ~~Doc~~ | | **RETURNS** | The processed document. ~~Doc~~ | +## EditTreeLemmatizer.distill {id="distill", tag="method,experimental", version="4"} + +Train a pipe (the student) on the predictions of another pipe (the teacher). The +student is typically trained on the probability distribution of the teacher, but +details may differ per pipe. The goal of distillation is to transfer knowledge +from the teacher to the student. + +The distillation is performed on ~~Example~~ objects. The `Example.reference` +and `Example.predicted` ~~Doc~~s must have the same number of tokens and the +same orthography. Even though the reference does not need have to have gold +annotations, the teacher could adds its own annotations when necessary. + +This feature is experimental. + +> #### Example +> +> ```python +> teacher_pipe = teacher.add_pipe("trainable_lemmatizer") +> student_pipe = student.add_pipe("trainable_lemmatizer") +> optimizer = nlp.resume_training() +> losses = student.distill(teacher_pipe, examples, sgd=optimizer) +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~ | +| `examples` | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. 
~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | Dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | + ## EditTreeLemmatizer.pipe {id="pipe",tag="method"} Apply the pipe to a stream of documents. This usually happens under the hood @@ -268,6 +302,27 @@ Create an optimizer for the pipeline component. | ----------- | ---------------------------- | | **RETURNS** | The optimizer. ~~Optimizer~~ | +## EditTreeLemmatizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"} + +Calculate the loss and its gradient for the batch of student scores relative to +the teacher scores. + +> #### Example +> +> ```python +> teacher_lemmatizer = teacher.get_pipe("trainable_lemmatizer") +> student_lemmatizer = student.add_pipe("trainable_lemmatizer") +> student_scores = student_lemmatizer.predict([eg.predicted for eg in examples]) +> teacher_scores = teacher_lemmatizer.predict([eg.predicted for eg in examples]) +> loss, d_loss = student_lemmatizer.get_teacher_student_loss(teacher_scores, student_scores) +> ``` + +| Name | Description | +| ---------------- | --------------------------------------------------------------------------- | +| `teacher_scores` | Scores representing the teacher model's predictions. | +| `student_scores` | Scores representing the student model's predictions. | +| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | + ## EditTreeLemmatizer.use_params {id="use_params",tag="method, contextmanager"} Modify the pipe's model, to use the given parameter values. At the end of the diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx index 5c30d252e..3af7ac4dd 100644 --- a/website/docs/api/entitylinker.mdx +++ b/website/docs/api/entitylinker.mdx @@ -15,7 +15,7 @@ world". It requires a `KnowledgeBase`, as well as a function to generate plausible candidates from that `KnowledgeBase` given a certain textual mention, and a machine learning model to pick the right candidate, given the local context of the mention. `EntityLinker` defaults to using the -[`InMemoryLookupKB`](/api/kb_in_memory) implementation. +[`InMemoryLookupKB`](/api/inmemorylookupkb) implementation. ## Assigned Attributes {id="assigned-attributes"} @@ -53,19 +53,22 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("entity_linker", config=config) > ``` -| Setting | Description | -| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | -| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ | -| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | -| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. 
~~bool~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | -| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | -| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ | -| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | -| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | -| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | -| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | +| Setting | Description | +| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | +| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ | +| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | +| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | +| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | +| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ | +| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | +| `get_candidates_batch` 3.5 | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ | +| `generate_empty_kb` 3.6 | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. 
Defaults to `True`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | +| `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"ents"` and `"scores"`. ~~Union[bool, list[str]]~~ | +| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/entity_linker.py @@ -199,12 +202,6 @@ knowledge base. This argument should be a function that takes a `Vocab` instance and creates the `KnowledgeBase`, ensuring that the strings of the knowledge base are synced with the current vocab. - - -This method was previously called `begin_training`. - - - > #### Example > > ```python diff --git a/website/docs/api/entityrecognizer.mdx b/website/docs/api/entityrecognizer.mdx index c80406a5b..f503cc998 100644 --- a/website/docs/api/entityrecognizer.mdx +++ b/website/docs/api/entityrecognizer.mdx @@ -127,6 +127,39 @@ and all pipeline components are applied to the `Doc` in order. Both | `doc` | The document to process. ~~Doc~~ | | **RETURNS** | The processed document. ~~Doc~~ | +## EntityRecognizer.distill {id="distill", tag="method,experimental", version="4"} + +Train a pipe (the student) on the predictions of another pipe (the teacher). The +student is typically trained on the probability distribution of the teacher, but +details may differ per pipe. The goal of distillation is to transfer knowledge +from the teacher to the student. + +The distillation is performed on ~~Example~~ objects. The `Example.reference` +and `Example.predicted` ~~Doc~~s must have the same number of tokens and the +same orthography. Even though the reference does not need have to have gold +annotations, the teacher could adds its own annotations when necessary. + +This feature is experimental. + +> #### Example +> +> ```python +> teacher_pipe = teacher.add_pipe("ner") +> student_pipe = student.add_pipe("ner") +> optimizer = nlp.resume_training() +> losses = student.distill(teacher_pipe, examples, sgd=optimizer) +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~ | +| `examples` | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | Dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | + ## EntityRecognizer.pipe {id="pipe",tag="method"} Apply the pipe to a stream of documents. This usually happens under the hood @@ -165,12 +198,6 @@ arguments it receives via the [`[initialize.components]`](/api/data-formats#config-initialize) block in the config. - - -This method was previously called `begin_training`. 
- - - > #### Example > > ```python @@ -270,6 +297,27 @@ predicted scores. | `scores` | Scores representing the model's predictions. ~~StateClass~~ | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | +## EntityRecognizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"} + +Calculate the loss and its gradient for the batch of student scores relative to +the teacher scores. + +> #### Example +> +> ```python +> teacher_ner = teacher.get_pipe("ner") +> student_ner = student.add_pipe("ner") +> student_scores = student_ner.predict([eg.predicted for eg in examples]) +> teacher_scores = teacher_ner.predict([eg.predicted for eg in examples]) +> loss, d_loss = student_ner.get_teacher_student_loss(teacher_scores, student_scores) +> ``` + +| Name | Description | +| ---------------- | --------------------------------------------------------------------------- | +| `teacher_scores` | Scores representing the teacher model's predictions. | +| `student_scores` | Scores representing the student model's predictions. | +| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | + ## EntityRecognizer.create_optimizer {id="create_optimizer",tag="method"} Create an optimizer for the pipeline component. diff --git a/website/docs/api/entityruler.mdx b/website/docs/api/entityruler.mdx index 27624398e..adb1f14d4 100644 --- a/website/docs/api/entityruler.mdx +++ b/website/docs/api/entityruler.mdx @@ -1,13 +1,24 @@ --- title: EntityRuler -tag: class -source: spacy/pipeline/entityruler.py version: 2.1 teaser: 'Pipeline component for rule-based named entity recognition' api_string_name: entity_ruler api_trainable: false --- + + +As of spaCy v4, there is no separate `EntityRuler` class. The entity ruler is +implemented as a special case of the `SpanRuler` component. + +See the [migration guide](#migrating) below for differences between the v3 +`EntityRuler` and v4 `SpanRuler` implementations of the `entity_ruler` +component. + +See the [`SpanRuler`](/api/spanruler) API docs for the full API. + + + The entity ruler lets you add spans to the [`Doc.ents`](/api/doc#ents) using token-based rules or exact phrase matches. It can be combined with the statistical [`EntityRecognizer`](/api/entityrecognizer) to boost accuracy, or @@ -64,273 +75,51 @@ how the component should be configured. You can override its settings via the | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ | | `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ | -```python -%%GITHUB_SPACY/spacy/pipeline/entityruler.py +## Migrating from v3 {id="migrating"} + +### Loading patterns + +Unlike the v3 `EntityRuler`, the `SpanRuler` cannot load patterns on +initialization with `SpanRuler(patterns=patterns)` or directly from a JSONL file +path with `SpanRuler.from_disk(jsonl_path)`. Patterns should be loaded from the +JSONL file separately and then added through +[`SpanRuler.initialize`](/api/spanruler#initialize]) or +[`SpanRuler.add_patterns`](/api/spanruler#add_patterns). + +```diff + ruler = nlp.get_pipe("entity_ruler") +- ruler.from_disk("patterns.jsonl") ++ import srsly ++ patterns = srsly.read_jsonl("patterns.jsonl") ++ ruler.add_patterns(patterns) ``` -## EntityRuler.\_\_init\_\_ {id="init",tag="method"} +### Saving patterns -Initialize the entity ruler. 
If patterns are supplied here, they need to be a -list of dictionaries with a `"label"` and `"pattern"` key. A pattern can either -be a token pattern (list) or a phrase pattern (string). For example: -`{"label": "ORG", "pattern": "Apple"}`. +`SpanRuler.to_disk` always saves the full component data to a directory and does +not include an option to save the patterns to a single JSONL file. -> #### Example -> -> ```python -> # Construction via add_pipe -> ruler = nlp.add_pipe("entity_ruler") -> -> # Construction from class -> from spacy.pipeline import EntityRuler -> ruler = EntityRuler(nlp, overwrite_ents=True) -> ``` +```diff + ruler = nlp.get_pipe("entity_ruler") +- ruler.to_disk("patterns.jsonl") ++ import srsly ++ srsly.write_jsonl("patterns.jsonl", ruler.patterns) +``` -| Name | Description | -| ---------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `nlp` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~ | -| `name` 3 | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ | -| _keyword-only_ | | -| `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ | -| `matcher_fuzzy_compare` 3.5 | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ | -| `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ | -| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ | -| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ | -| `patterns` | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~ | -| `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ | +### Accessing token and phrase patterns -## EntityRuler.initialize {id="initialize",tag="method",version="3"} +The separate token patterns and phrase patterns are no longer accessible under +`ruler.token_patterns` or `ruler.phrase_patterns`. You can access the combined +patterns in their original format using the property +[`SpanRuler.patterns`](/api/spanruler#patterns). -Initialize the component with data and used before training to load in rules -from a [pattern file](/usage/rule-based-matching/#entityruler-files). This -method is typically called by [`Language.initialize`](/api/language#initialize) -and lets you customize arguments it receives via the -[`[initialize.components]`](/api/data-formats#config-initialize) block in the -config. 
+### Removing patterns by ID -> #### Example -> -> ```python -> entity_ruler = nlp.add_pipe("entity_ruler") -> entity_ruler.initialize(lambda: [], nlp=nlp, patterns=patterns) -> ``` -> -> ```ini -> ### config.cfg -> [initialize.components.entity_ruler] -> -> [initialize.components.entity_ruler.patterns] -> @readers = "srsly.read_jsonl.v1" -> path = "corpus/entity_ruler_patterns.jsonl -> ``` +[`SpanRuler.remove`](/api/spanruler#remove) removes by label rather than ID. To +remove by ID, use [`SpanRuler.remove_by_id`](/api/spanruler#remove_by_id): -| Name | Description | -| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ | -| _keyword-only_ | | -| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | -| `patterns` | The list of patterns. Defaults to `None`. ~~Optional[Sequence[Dict[str, Union[str, List[Dict[str, Any]]]]]]~~ | - -## EntityRuler.\_\_len\_\_ {id="len",tag="method"} - -The number of all patterns added to the entity ruler. - -> #### Example -> -> ```python -> ruler = nlp.add_pipe("entity_ruler") -> assert len(ruler) == 0 -> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}]) -> assert len(ruler) == 1 -> ``` - -| Name | Description | -| ----------- | ------------------------------- | -| **RETURNS** | The number of patterns. ~~int~~ | - -## EntityRuler.\_\_contains\_\_ {id="contains",tag="method"} - -Whether a label is present in the patterns. - -> #### Example -> -> ```python -> ruler = nlp.add_pipe("entity_ruler") -> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}]) -> assert "ORG" in ruler -> assert not "PERSON" in ruler -> ``` - -| Name | Description | -| ----------- | ----------------------------------------------------- | -| `label` | The label to check. ~~str~~ | -| **RETURNS** | Whether the entity ruler contains the label. ~~bool~~ | - -## EntityRuler.\_\_call\_\_ {id="call",tag="method"} - -Find matches in the `Doc` and add them to the `doc.ents`. Typically, this -happens automatically after the component has been added to the pipeline using -[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized -with `overwrite_ents=True`, existing entities will be replaced if they overlap -with the matches. When matches overlap in a Doc, the entity ruler prioritizes -longer patterns over shorter, and if equal the match occuring first in the Doc -is chosen. - -> #### Example -> -> ```python -> ruler = nlp.add_pipe("entity_ruler") -> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}]) -> -> doc = nlp("A text about Apple.") -> ents = [(ent.text, ent.label_) for ent in doc.ents] -> assert ents == [("Apple", "ORG")] -> ``` - -| Name | Description | -| ----------- | -------------------------------------------------------------------- | -| `doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ | -| **RETURNS** | The modified `Doc` with added entities, if available. ~~Doc~~ | - -## EntityRuler.add_patterns {id="add_patterns",tag="method"} - -Add patterns to the entity ruler. A pattern can either be a token pattern (list -of dicts) or a phrase pattern (string). For more details, see the usage guide on -[rule-based matching](/usage/rule-based-matching). 
- -> #### Example -> -> ```python -> patterns = [ -> {"label": "ORG", "pattern": "Apple"}, -> {"label": "GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]} -> ] -> ruler = nlp.add_pipe("entity_ruler") -> ruler.add_patterns(patterns) -> ``` - -| Name | Description | -| ---------- | ---------------------------------------------------------------- | -| `patterns` | The patterns to add. ~~List[Dict[str, Union[str, List[dict]]]]~~ | - -## EntityRuler.remove {id="remove",tag="method",version="3.2.1"} - -Remove a pattern by its ID from the entity ruler. A `ValueError` is raised if -the ID does not exist. - -> #### Example -> -> ```python -> patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"}] -> ruler = nlp.add_pipe("entity_ruler") -> ruler.add_patterns(patterns) -> ruler.remove("apple") -> ``` - -| Name | Description | -| ---- | ----------------------------------- | -| `id` | The ID of the pattern rule. ~~str~~ | - -## EntityRuler.to_disk {id="to_disk",tag="method"} - -Save the entity ruler patterns to a directory. The patterns will be saved as -newline-delimited JSON (JSONL). If a file with the suffix `.jsonl` is provided, -only the patterns are saved as JSONL. If a directory name is provided, a -`patterns.jsonl` and `cfg` file with the component configuration is exported. - -> #### Example -> -> ```python -> ruler = nlp.add_pipe("entity_ruler") -> ruler.to_disk("/path/to/patterns.jsonl") # saves patterns only -> ruler.to_disk("/path/to/entity_ruler") # saves patterns and config -> ``` - -| Name | Description | -| ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | - -## EntityRuler.from_disk {id="from_disk",tag="method"} - -Load the entity ruler from a path. Expects either a file containing -newline-delimited JSON (JSONL) with one entry per line, or a directory -containing a `patterns.jsonl` file and a `cfg` file with the component -configuration. - -> #### Example -> -> ```python -> ruler = nlp.add_pipe("entity_ruler") -> ruler.from_disk("/path/to/patterns.jsonl") # loads patterns only -> ruler.from_disk("/path/to/entity_ruler") # loads patterns and config -> ``` - -| Name | Description | -| ----------- | ------------------------------------------------------------------------------------------------------------- | -| `path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | -| **RETURNS** | The modified `EntityRuler` object. ~~EntityRuler~~ | - -## EntityRuler.to_bytes {id="to_bytes",tag="method"} - -Serialize the entity ruler patterns to a bytestring. - -> #### Example -> -> ```python -> ruler = nlp.add_pipe("entity_ruler") -> ruler_bytes = ruler.to_bytes() -> ``` - -| Name | Description | -| ----------- | ---------------------------------- | -| **RETURNS** | The serialized patterns. ~~bytes~~ | - -## EntityRuler.from_bytes {id="from_bytes",tag="method"} - -Load the pipe from a bytestring. Modifies the object in place and returns it. - -> #### Example -> -> ```python -> ruler_bytes = ruler.to_bytes() -> ruler = nlp.add_pipe("entity_ruler") -> ruler.from_bytes(ruler_bytes) -> ``` - -| Name | Description | -| ------------ | -------------------------------------------------- | -| `bytes_data` | The bytestring to load. 
~~bytes~~ | -| **RETURNS** | The modified `EntityRuler` object. ~~EntityRuler~~ | - -## EntityRuler.labels {id="labels",tag="property"} - -All labels present in the match patterns. - -| Name | Description | -| ----------- | -------------------------------------- | -| **RETURNS** | The string labels. ~~Tuple[str, ...]~~ | - -## EntityRuler.ent_ids {id="ent_ids",tag="property",version="2.2.2"} - -All entity IDs present in the `id` properties of the match patterns. - -| Name | Description | -| ----------- | ----------------------------------- | -| **RETURNS** | The string IDs. ~~Tuple[str, ...]~~ | - -## EntityRuler.patterns {id="patterns",tag="property"} - -Get all patterns that were added to the entity ruler. - -| Name | Description | -| ----------- | ---------------------------------------------------------------------------------------- | -| **RETURNS** | The original patterns, one dictionary per pattern. ~~List[Dict[str, Union[str, dict]]]~~ | - -## Attributes {id="attributes"} - -| Name | Description | -| ----------------- | --------------------------------------------------------------------------------------------------------------------- | -| `matcher` | The underlying matcher used to process token patterns. ~~Matcher~~ | -| `phrase_matcher` | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~ | -| `token_patterns` | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]~~ | -| `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~ | +```diff + ruler = nlp.get_pipe("entity_ruler") +- ruler.remove("id") ++ ruler.remove_by_id("id") +``` diff --git a/website/docs/api/kb_in_memory.mdx b/website/docs/api/inmemorylookupkb.mdx similarity index 96% rename from website/docs/api/kb_in_memory.mdx rename to website/docs/api/inmemorylookupkb.mdx index e85b63c45..c24fe78d6 100644 --- a/website/docs/api/kb_in_memory.mdx +++ b/website/docs/api/inmemorylookupkb.mdx @@ -43,7 +43,7 @@ The length of the fixed-size entity vectors in the knowledge base. Add an entity to the knowledge base, specifying its corpus frequency and entity vector, which should be of length -[`entity_vector_length`](/api/kb_in_memory#entity_vector_length). +[`entity_vector_length`](/api/inmemorylookupkb#entity_vector_length). > #### Example > @@ -79,8 +79,9 @@ frequency and entity vector for each entity. Add an alias or mention to the knowledge base, specifying its potential KB identifiers and their prior probabilities. The entity identifiers should refer -to entities previously added with [`add_entity`](/api/kb_in_memory#add_entity) -or [`set_entities`](/api/kb_in_memory#set_entities). The sum of the prior +to entities previously added with +[`add_entity`](/api/inmemorylookupkb#add_entity) or +[`set_entities`](/api/inmemorylookupkb#set_entities). The sum of the prior probabilities should not exceed 1. Note that an empty string can not be used as alias. @@ -156,7 +157,7 @@ Get a list of all aliases in the knowledge base. Given a certain textual mention as input, retrieve a list of candidate entities of type [`Candidate`](/api/kb#candidate). Wraps -[`get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates). +[`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates). > #### Example > @@ -174,7 +175,7 @@ of type [`Candidate`](/api/kb#candidate). 
Wraps ## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"} -Same as [`get_candidates()`](/api/kb_in_memory#get_candidates), but for an +Same as [`get_candidates()`](/api/inmemorylookupkb#get_candidates), but for an arbitrary number of mentions. The [`EntityLinker`](/api/entitylinker) component will call `get_candidates_batch()` instead of `get_candidates()`, if the config parameter `candidates_batch_size` is greater or equal than 1. @@ -231,7 +232,7 @@ Given a certain entity ID, retrieve its pretrained entity vector. ## InMemoryLookupKB.get_vectors {id="get_vectors",tag="method"} -Same as [`get_vector()`](/api/kb_in_memory#get_vector), but for an arbitrary +Same as [`get_vector()`](/api/inmemorylookupkb#get_vector), but for an arbitrary number of entity IDs. The default implementation of `get_vectors()` executes `get_vector()` in a loop. diff --git a/website/docs/api/kb.mdx b/website/docs/api/kb.mdx index 887b7fe97..2b0d4d9d6 100644 --- a/website/docs/api/kb.mdx +++ b/website/docs/api/kb.mdx @@ -21,8 +21,8 @@ functions called by the [`EntityLinker`](/api/entitylinker) component. This class was not abstract up to spaCy version 3.5. The `KnowledgeBase` -implementation up to that point is available as `InMemoryLookupKB` from 3.5 -onwards. +implementation up to that point is available as +[`InMemoryLookupKB`](/api/inmemorylookupkb) from 3.5 onwards. @@ -110,14 +110,15 @@ to you. From spaCy 3.5 on `KnowledgeBase` is an abstract class (with -[`InMemoryLookupKB`](/api/kb_in_memory) being a drop-in replacement) to allow -more flexibility in customizing knowledge bases. Some of its methods were moved -to [`InMemoryLookupKB`](/api/kb_in_memory) during this refactoring, one of those -being `get_alias_candidates()`. This method is now available as -[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates). -Note: [`InMemoryLookupKB.get_candidates()`](/api/kb_in_memory#get_candidates) +[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to +allow more flexibility in customizing knowledge bases. Some of its methods were +moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring, +one of those being `get_alias_candidates()`. This method is now available as +[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates). +Note: +[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates) defaults to -[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates). +[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates). ## KnowledgeBase.get_vector {id="get_vector",tag="method"} diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx index a744e89fc..12cea6c07 100644 --- a/website/docs/api/language.mdx +++ b/website/docs/api/language.mdx @@ -259,15 +259,6 @@ either in the [config](/usage/training#config), or by calling [`pipe.add_label`](/api/pipe#add_label) for each possible output label (e.g. for the tagger or textcat). - - -This method was previously called `begin_training`. It now also takes a -**function** that is called with no arguments and returns a sequence of -[`Example`](/api/example) objects instead of tuples of `Doc` and `GoldParse` -objects. - - - > #### Example > > ```python @@ -342,6 +333,34 @@ and custom registered functions if needed. See the | `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. 
~~Optional[Dict[str, Dict[str, Any]]]~~ | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +## Language.distill {id="distill",tag="method,experimental",version="4"} + +Distill the models in a student pipeline from a teacher pipeline. + +> #### Example +> +> ```python +> +> teacher = spacy.load("en_core_web_lg") +> student = English() +> student.add_pipe("tagger") +> student.distill(teacher, examples, sgd=optimizer) +> ``` + +| Name | Description | +| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `teacher` | The teacher pipeline to distill from. ~~Language~~ | +| `examples` | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~ | +| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ | +| `exclude` | Names of components that shouldn't be updated. Defaults to `[]`. ~~Iterable[str]~~ | +| `annotates` | Names of components that should set annotations on the prediced examples after updating. Defaults to `[]`. ~~Iterable[str]~~ | +| `student_to_teacher` | Map student component names to teacher component names, only necessary when the names differ. Defaults to `None`. ~~Optional[Dict[str, str]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | + ## Language.rehearse {id="rehearse",tag="method,experimental",version="3"} Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the diff --git a/website/docs/api/legacy.mdx b/website/docs/api/legacy.mdx index ea6d3a899..70d6223e7 100644 --- a/website/docs/api/legacy.mdx +++ b/website/docs/api/legacy.mdx @@ -225,7 +225,7 @@ the others, but may not be as accurate, especially if texts are short. ### spacy.TransitionBasedParser.v1 {id="TransitionBasedParser_v1"} Identical to -[`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser) +[`spacy.TransitionBasedParser.v3`](/api/architectures#TransitionBasedParser) except the `use_upper` was set to `True` by default. ## Layers {id="layers"} diff --git a/website/docs/api/lexeme.mdx b/website/docs/api/lexeme.mdx index 539f502f0..d5f3e9122 100644 --- a/website/docs/api/lexeme.mdx +++ b/website/docs/api/lexeme.mdx @@ -161,4 +161,3 @@ The L2 norm of the lexeme's vector representation. | `lang_` | Language of the parent vocabulary. ~~str~~ | | `prob` | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). ~~float~~ | | `cluster` | Brown cluster ID. ~~int~~ | -| `sentiment` | A scalar value indicating the positivity or negativity of the lexeme. ~~float~~ | diff --git a/website/docs/api/matcher.mdx b/website/docs/api/matcher.mdx index c66579da8..66954b6c4 100644 --- a/website/docs/api/matcher.mdx +++ b/website/docs/api/matcher.mdx @@ -211,20 +211,6 @@ will be overwritten. 
> matches = matcher(doc) > ``` - - -As of spaCy v3.0, `Matcher.add` takes a list of patterns as the second argument -(instead of a variable number of arguments). The `on_match` callback becomes an -optional keyword argument. - -```diff -patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]] -- matcher.add("GoogleNow", on_match, *patterns) -+ matcher.add("GoogleNow", patterns, on_match=on_match) -``` - - - | Name | Description | | ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | | `match_id` | An ID for the thing you're matching. ~~str~~ | diff --git a/website/docs/api/morphologizer.mdx b/website/docs/api/morphologizer.mdx index f097f2ae3..9514bc773 100644 --- a/website/docs/api/morphologizer.mdx +++ b/website/docs/api/morphologizer.mdx @@ -42,12 +42,13 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("morphologizer", config=config) > ``` -| Setting | Description | -| ---------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | -| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `True`. ~~bool~~ | -| `extend` 3.2 | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ | -| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ | +| Setting | Description | +| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | +| `overwrite` 3.2 | Whether the values of existing features are overwritten. Defaults to `False`. ~~bool~~ | +| `extend` 3.2 | Whether existing feature types (whose values may or may not be overwritten depending on `overwrite`) are preserved. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ | +| `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/morphologizer.pyx @@ -120,6 +121,39 @@ delegate to the [`predict`](/api/morphologizer#predict) and | `doc` | The document to process. ~~Doc~~ | | **RETURNS** | The processed document. 
~~Doc~~ | +## Morphologizer.distill {id="distill", tag="method,experimental", version="4"} + +Train a pipe (the student) on the predictions of another pipe (the teacher). The +student is typically trained on the probability distribution of the teacher, but +details may differ per pipe. The goal of distillation is to transfer knowledge +from the teacher to the student. + +The distillation is performed on ~~Example~~ objects. The `Example.reference` +and `Example.predicted` ~~Doc~~s must have the same number of tokens and the +same orthography. Even though the reference does not need have to have gold +annotations, the teacher could adds its own annotations when necessary. + +This feature is experimental. + +> #### Example +> +> ```python +> teacher_pipe = teacher.add_pipe("morphologizer") +> student_pipe = student.add_pipe("morphologizer") +> optimizer = nlp.resume_training() +> losses = student.distill(teacher_pipe, examples, sgd=optimizer) +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~ | +| `examples` | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | Dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | + ## Morphologizer.pipe {id="pipe",tag="method"} Apply the pipe to a stream of documents. This usually happens under the hood @@ -258,6 +292,27 @@ predicted scores. | `scores` | Scores representing the model's predictions. | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | +## Morphologizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"} + +Calculate the loss and its gradient for the batch of student scores relative to +the teacher scores. + +> #### Example +> +> ```python +> teacher_morphologizer = teacher.get_pipe("morphologizer") +> student_morphologizer = student.add_pipe("morphologizer") +> student_scores = student_morphologizer.predict([eg.predicted for eg in examples]) +> teacher_scores = teacher_morphologizer.predict([eg.predicted for eg in examples]) +> loss, d_loss = student_morphologizer.get_teacher_student_loss(teacher_scores, student_scores) +> ``` + +| Name | Description | +| ---------------- | --------------------------------------------------------------------------- | +| `teacher_scores` | Scores representing the teacher model's predictions. | +| `student_scores` | Scores representing the student model's predictions. | +| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | + ## Morphologizer.create_optimizer {id="create_optimizer",tag="method"} Create an optimizer for the pipeline component. @@ -399,9 +454,9 @@ coarse-grained POS as the feature `POS`. 
> assert "Mood=Ind|POS=VERB|Tense=Past|VerbForm=Fin" in morphologizer.labels > ``` -| Name | Description | -| ----------- | ------------------------------------------------------ | -| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ | +| Name | Description | +| ----------- | --------------------------------------------------------- | +| **RETURNS** | The labels added to the component. ~~Iterable[str, ...]~~ | ## Morphologizer.label_data {id="label_data",tag="property",version="3"} diff --git a/website/docs/api/phrasematcher.mdx b/website/docs/api/phrasematcher.mdx index 14ccefb77..2c5e767dc 100644 --- a/website/docs/api/phrasematcher.mdx +++ b/website/docs/api/phrasematcher.mdx @@ -116,10 +116,10 @@ Check whether the matcher contains rules for a match ID. ## PhraseMatcher.add {id="add",tag="method"} Add a rule to the matcher, consisting of an ID key, one or more patterns, and a -callback function to act on the matches. The callback function will receive the -arguments `matcher`, `doc`, `i` and `matches`. If a pattern already exists for -the given ID, the patterns will be extended. An `on_match` callback will be -overwritten. +optional callback function to act on the matches. The callback function will +receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already +exists for the given ID, the patterns will be extended. An `on_match` callback +will be overwritten. > #### Example > @@ -134,20 +134,6 @@ overwritten. > matches = matcher(doc) > ``` - - -As of spaCy v3.0, `PhraseMatcher.add` takes a list of patterns as the second -argument (instead of a variable number of arguments). The `on_match` callback -becomes an optional keyword argument. - -```diff -patterns = [nlp("health care reform"), nlp("healthcare reform")] -- matcher.add("HEALTH", on_match, *patterns) -+ matcher.add("HEALTH", patterns, on_match=on_match) -``` - - - | Name | Description | | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | | `key` | An ID for the thing you're matching. ~~str~~ | diff --git a/website/docs/api/pipe.mdx b/website/docs/api/pipe.mdx index 4f72ef0ff..197d9af00 100644 --- a/website/docs/api/pipe.mdx +++ b/website/docs/api/pipe.mdx @@ -152,12 +152,6 @@ network, setting up the label scheme based on the data. This method is typically called by [`Language.initialize`](/api/language#initialize). - - -This method was previously called `begin_training`. - - - > #### Example > > ```python @@ -240,6 +234,39 @@ predictions and gold-standard annotations, and update the component's model. | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +## TrainablePipe.distill {id="distill", tag="method,experimental", version="4"} + +Train a pipe (the student) on the predictions of another pipe (the teacher). The +student is typically trained on the probability distribution of the teacher, but +details may differ per pipe. The goal of distillation is to transfer knowledge +from the teacher to the student. + +The distillation is performed on ~~Example~~ objects. The `Example.reference` +and `Example.predicted` ~~Doc~~s must have the same number of tokens and the +same orthography. Even though the reference does not need have to have gold +annotations, the teacher could adds its own annotations when necessary. 
+ +This feature is experimental. + +> #### Example +> +> ```python +> teacher_pipe = teacher.add_pipe("your_custom_pipe") +> student_pipe = student.add_pipe("your_custom_pipe") +> optimizer = nlp.resume_training() +> losses = student.distill(teacher_pipe, examples, sgd=optimizer) +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~ | +| `examples` | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | Dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | + ## TrainablePipe.rehearse {id="rehearse",tag="method,experimental",version="3"} Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the @@ -296,6 +323,34 @@ This method needs to be overwritten with your own custom `get_loss` method. | `scores` | Scores representing the model's predictions. | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | +## TrainablePipe.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"} + +Calculate the loss and its gradient for the batch of student scores relative to +the teacher scores. + + + +This method needs to be overwritten with your own custom +`get_teacher_student_loss` method. + + + +> #### Example +> +> ```python +> teacher_pipe = teacher.get_pipe("your_custom_pipe") +> student_pipe = student.add_pipe("your_custom_pipe") +> student_scores = student_pipe.predict([eg.predicted for eg in examples]) +> teacher_scores = teacher_pipe.predict([eg.predicted for eg in examples]) +> loss, d_loss = student_pipe.get_teacher_student_loss(teacher_scores, student_scores) +> ``` + +| Name | Description | +| ---------------- | --------------------------------------------------------------------------- | +| `teacher_scores` | Scores representing the teacher model's predictions. | +| `student_scores` | Scores representing the student model's predictions. | +| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | + ## TrainablePipe.score {id="score",tag="method",version="3"} Score a batch of examples. diff --git a/website/docs/api/scorer.mdx b/website/docs/api/scorer.mdx index 6f0c95f6f..d72018b90 100644 --- a/website/docs/api/scorer.mdx +++ b/website/docs/api/scorer.mdx @@ -30,7 +30,7 @@ Create a new `Scorer`. | Name | Description | | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `nlp` | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline is constructed using the `default_lang` and `default_pipeline` settings. 
~~Optional[Language]~~ | -| `default_lang` | The language to use for a default pipeline if `nlp` is not provided. Defaults to `xx`. ~~str~~ | +| `default_lang` | The language to use for a default pipeline if `nlp` is not provided. Defaults to `mul`. ~~str~~ | | `default_pipeline` | The pipeline components to use for a default pipeline if `nlp` is not provided. Defaults to `("senter", "tagger", "morphologizer", "parser", "ner", "textcat")`. ~~Iterable[string]~~ | | _keyword-only_ | | | `\*\*kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ | diff --git a/website/docs/api/sentencerecognizer.mdx b/website/docs/api/sentencerecognizer.mdx index 5435399f9..dfb7ed308 100644 --- a/website/docs/api/sentencerecognizer.mdx +++ b/website/docs/api/sentencerecognizer.mdx @@ -39,11 +39,12 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("senter", config=config) > ``` -| Setting | Description | -| ---------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | -| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | -| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~ | +| Setting | Description | +| ----------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~ | +| `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. ~~Union[bool, list[str]]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/senter.pyx @@ -105,6 +106,39 @@ and all pipeline components are applied to the `Doc` in order. Both | `doc` | The document to process. ~~Doc~~ | | **RETURNS** | The processed document. ~~Doc~~ | +## SentenceRecognizer.distill {id="distill", tag="method,experimental", version="4"} + +Train a pipe (the student) on the predictions of another pipe (the teacher). The +student is typically trained on the probability distribution of the teacher, but +details may differ per pipe. The goal of distillation is to transfer knowledge +from the teacher to the student. + +The distillation is performed on ~~Example~~ objects. The `Example.reference` +and `Example.predicted` ~~Doc~~s must have the same number of tokens and the +same orthography. Even though the reference does not need have to have gold +annotations, the teacher could adds its own annotations when necessary. + +This feature is experimental. 
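Assuming `teacher` and `student` pipelines set up as in the example below, a quick and purely illustrative way to sanity-check the distilled component is to compare sentence boundaries on a held-out text:

```python
# Illustrative check only: after distillation, the student's sentence
# boundaries should largely agree with the teacher's.
text = "This is one sentence. This is another one."
teacher_sents = [sent.text for sent in teacher(text).sents]
student_sents = [sent.text for sent in student(text).sents]
print(teacher_sents)
print(student_sents)
```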
+ +> #### Example +> +> ```python +> teacher_pipe = teacher.add_pipe("senter") +> student_pipe = student.add_pipe("senter") +> optimizer = nlp.resume_training() +> losses = student.distill(teacher_pipe, examples, sgd=optimizer) +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~ | +| `examples` | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | Dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | + ## SentenceRecognizer.pipe {id="pipe",tag="method"} Apply the pipe to a stream of documents. This usually happens under the hood @@ -253,6 +287,27 @@ predicted scores. | `scores` | Scores representing the model's predictions. | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | +## SentenceRecognizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"} + +Calculate the loss and its gradient for the batch of student scores relative to +the teacher scores. + +> #### Example +> +> ```python +> teacher_senter = teacher.get_pipe("senter") +> student_senter = student.add_pipe("senter") +> student_scores = student_senter.predict([eg.predicted for eg in examples]) +> teacher_scores = teacher_senter.predict([eg.predicted for eg in examples]) +> loss, d_loss = student_senter.get_teacher_student_loss(teacher_scores, student_scores) +> ``` + +| Name | Description | +| ---------------- | --------------------------------------------------------------------------- | +| `teacher_scores` | Scores representing the teacher model's predictions. | +| `student_scores` | Scores representing the student model's predictions. | +| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | + ## SentenceRecognizer.create_optimizer {id="create_optimizer",tag="method"} Create an optimizer for the pipeline component. diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx index bd7794edc..e1ada3b45 100644 --- a/website/docs/api/span.mdx +++ b/website/docs/api/span.mdx @@ -186,14 +186,17 @@ the character indices don't map to a valid span. > assert span.text == "New York" > ``` -| Name | Description | -| ----------- | ----------------------------------------------------------------------------------------- | -| `start` | The index of the first character of the span. ~~int~~ | -| `end` | The index of the last character after the span. ~~int~~ | -| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | -| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | -| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | -| **RETURNS** | The newly constructed object or `None`. 
~~Optional[Span]~~ | +| Name | Description | +| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `start_idx` | The index of the first character of the span. ~~int~~ | +| `end_idx` | The index of the last character after the span. ~~int~~ | +| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | +| _keyword-only_ | | +| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | +| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | +| `alignment_mode` 3.5.1 | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | +| `span_id` 3.5.1 | An identifier to associate with the span. ~~Union[int, str]~~ | +| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | ## Span.similarity {id="similarity",tag="method",model="vectors"} @@ -272,17 +275,16 @@ The named entities that fall completely within the span. Returns a tuple of > assert ents[0].text == "Mr. Best" > ``` -| Name | Description | -| ----------- | ----------------------------------------------------------------- | -| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ | +| Name | Description | +| ----------- | ------------------------------------------------------------ | +| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span]~~ | ## Span.noun_chunks {id="noun_chunks",tag="property",model="parser"} -Iterate over the base noun phrases in the span. Yields base noun-phrase `Span` -objects, if the document has been syntactically parsed. A base noun phrase, or -"NP chunk", is a noun phrase that does not permit other NPs to be nested within -it – so no NP-level coordination, no prepositional phrases, and no relative -clauses. +Returns a tuple of the base noun phrases in the span if the document has been +syntactically parsed. A base noun phrase, or "NP chunk", is a noun phrase that +does not permit other NPs to be nested within it – so no NP-level coordination, +no prepositional phrases, and no relative clauses. If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data) has not been implemeted for the given language, a `NotImplementedError` is @@ -298,9 +300,9 @@ raised. > assert chunks[0].text == "another phrase" > ``` -| Name | Description | -| ---------- | --------------------------------- | -| **YIELDS** | Noun chunks in the span. ~~Span~~ | +| Name | Description | +| ----------- | ---------------------------------------- | +| **RETURNS** | Noun chunks in the span. ~~Tuple[Span]~~ | ## Span.as_doc {id="as_doc",tag="method"} @@ -522,9 +524,9 @@ sent = doc[sent.start : max(sent.end, span.end)] ## Span.sents {id="sents",tag="property",model="sentences",version="3.2.1"} -Returns a generator over the sentences the span belongs to. This property is -only available when [sentence boundaries](/usage/linguistic-features#sbd) have -been set on the document by the `parser`, `senter`, `sentencizer` or some custom +Returns a tuple of the sentences the span belongs to. 
This property is only +available when [sentence boundaries](/usage/linguistic-features#sbd) have been +set on the document by the `parser`, `senter`, `sentencizer` or some custom function. It will raise an error otherwise. If the span happens to cross sentence boundaries, all sentences the span @@ -538,9 +540,9 @@ overlaps with will be returned. > assert len(span.sents) == 2 > ``` -| Name | Description | -| ----------- | -------------------------------------------------------------------------- | -| **RETURNS** | A generator yielding sentences this `Span` is a part of ~~Iterable[Span]~~ | +| Name | Description | +| ----------- | ------------------------------------------------------------- | +| **RETURNS** | A tuple of sentences this `Span` is a part of ~~Tuple[Span]~~ | ## Attributes {id="attributes"} @@ -561,9 +563,8 @@ overlaps with will be returned. | `lemma_` | The span's lemma. Equivalent to `"".join(token.text_with_ws for token in span)`. ~~str~~ | | `kb_id` | The hash value of the knowledge base ID referred to by the span. ~~int~~ | | `kb_id_` | The knowledge base ID referred to by the span. ~~str~~ | -| `ent_id` | The hash value of the named entity the root token is an instance of. ~~int~~ | -| `ent_id_` | The string ID of the named entity the root token is an instance of. ~~str~~ | +| `ent_id` | Alias for `id`: the hash value of the span's ID. ~~int~~ | +| `ent_id_` | Alias for `id_`: the span's ID. ~~str~~ | | `id` | The hash value of the span's ID. ~~int~~ | | `id_` | The span's ID. ~~str~~ | -| `sentiment` | A scalar value indicating the positivity or negativity of the span. ~~float~~ | | `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ | diff --git a/website/docs/api/spancategorizer.mdx b/website/docs/api/spancategorizer.mdx index f39c0aff9..c51b32671 100644 --- a/website/docs/api/spancategorizer.mdx +++ b/website/docs/api/spancategorizer.mdx @@ -52,14 +52,15 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("spancat", config=config) > ``` -| Setting | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ | -| `model` | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ | -| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ | -| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ | -| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. 
~~Optional[int]~~ | -| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ | +| Setting | Description | +| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ | +| `model` | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ | +| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ | +| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ | +| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ | +| `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"indices"` and `"scores"`. ~~Union[bool, list[str]]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/spancat.py diff --git a/website/docs/api/spanruler.mdx b/website/docs/api/spanruler.mdx index d2d41f620..c1037435b 100644 --- a/website/docs/api/spanruler.mdx +++ b/website/docs/api/spanruler.mdx @@ -13,6 +13,17 @@ The span ruler lets you add spans to [`Doc.spans`](/api/doc#spans) and/or usage examples, see the docs on [rule-based span matching](/usage/rule-based-matching#spanruler). + + +As of spaCy v4, there is no separate `EntityRuler` class. The entity ruler is +implemented as a special case of the `SpanRuler` component. + +See the [migration guide](/api/entityruler#migrating) for differences between +the v3 `EntityRuler` and v4 `SpanRuler` implementations of the `entity_ruler` +component. + + + ## Assigned Attributes {id="assigned-attributes"} Matches will be saved to `Doc.spans[spans_key]` as a diff --git a/website/docs/api/stringstore.mdx b/website/docs/api/stringstore.mdx index 47d3715c1..7e380f5f8 100644 --- a/website/docs/api/stringstore.mdx +++ b/website/docs/api/stringstore.mdx @@ -40,7 +40,8 @@ Get the number of strings in the store. ## StringStore.\_\_getitem\_\_ {id="getitem",tag="method"} -Retrieve a string from a given hash, or vice versa. +Retrieve a string from a given hash. If a string is passed as the input, add it +to the store and return its hash. > #### Example > @@ -51,14 +52,14 @@ Retrieve a string from a given hash, or vice versa. 
> assert stringstore[apple_hash] == "apple" > ``` -| Name | Description | -| -------------- | ----------------------------------------------- | -| `string_or_id` | The value to encode. ~~Union[bytes, str, int]~~ | -| **RETURNS** | The value to be retrieved. ~~Union[str, int]~~ | +| Name | Description | +| ---------------- | ---------------------------------------------------------------------------- | +| `string_or_hash` | The hash value to lookup or the string to store. ~~Union[str, int]~~ | +| **RETURNS** | The stored string or the hash of the newly added string. ~~Union[str, int]~~ | ## StringStore.\_\_contains\_\_ {id="contains",tag="method"} -Check whether a string is in the store. +Check whether a string or a hash is in the store. > #### Example > @@ -68,15 +69,14 @@ Check whether a string is in the store. > assert not "cherry" in stringstore > ``` -| Name | Description | -| ----------- | ----------------------------------------------- | -| `string` | The string to check. ~~str~~ | -| **RETURNS** | Whether the store contains the string. ~~bool~~ | +| Name | Description | +| ---------------- | ------------------------------------------------------- | +| `string_or_hash` | The string or hash to check. ~~Union[str, int]~~ | +| **RETURNS** | Whether the store contains the string or hash. ~~bool~~ | ## StringStore.\_\_iter\_\_ {id="iter",tag="method"} -Iterate over the strings in the store, in order. Note that a newly initialized -store will always include an empty string `""` at position `0`. +Iterate over the stored strings in insertion order. > #### Example > @@ -86,11 +86,59 @@ store will always include an empty string `""` at position `0`. > assert all_strings == ["apple", "orange"] > ``` -| Name | Description | -| ---------- | ------------------------------ | -| **YIELDS** | A string in the store. ~~str~~ | +| Name | Description | +| ----------- | ------------------------------ | +| **RETURNS** | A string in the store. ~~str~~ | -## StringStore.add {id="add",tag="method",version="2"} +## StringStore.items {id="items", tag="method", version="4"} + +Iterate over the stored string-hash pairs in insertion order. + +> #### Example +> +> ```python +> stringstore = StringStore(["apple", "orange"]) +> all_strings_and_hashes = stringstore.items() +> assert all_strings_and_hashes == [("apple", 8566208034543834098), ("orange", 2208928596161743350)] +> ``` + +| Name | Description | +| ----------- | ------------------------------------------------------ | +| **RETURNS** | A list of string-hash pairs. ~~List[Tuple[str, int]]~~ | + +## StringStore.keys {id="keys", tag="method", version="4"} + +Iterate over the stored strings in insertion order. + +> #### Example +> +> ```python +> stringstore = StringStore(["apple", "orange"]) +> all_strings = stringstore.keys() +> assert all_strings == ["apple", "orange"] +> ``` + +| Name | Description | +| ----------- | -------------------------------- | +| **RETURNS** | A list of strings. ~~List[str]~~ | + +## StringStore.values {id="values", tag="method", version="4"} + +Iterate over the stored string hashes in insertion order. + +> #### Example +> +> ```python +> stringstore = StringStore(["apple", "orange"]) +> all_hashes = stringstore.values() +> assert all_hashes == [8566208034543834098, 2208928596161743350] +> ``` + +| Name | Description | +| ----------- | -------------------------------------- | +| **RETURNS** | A list of string hashes. ~~List[int]~~ | + +## StringStore.add {id="add", tag="method"} Add a string to the `StringStore`. 
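Taken together, the updated lookup, membership and iteration behavior described in the sections above can be illustrated with a short sketch (return values as documented; treat the exact types as indicative):

```python
from spacy.strings import StringStore

stringstore = StringStore(["apple", "orange"])

# Passing a string adds it (if necessary) and returns its hash;
# passing a hash returns the stored string.
cherry_hash = stringstore["cherry"]
assert stringstore[cherry_hash] == "cherry"

# Membership checks accept either a string or a hash.
assert "cherry" in stringstore
assert cherry_hash in stringstore

# keys(), values() and items() reflect insertion order.
assert stringstore.keys() == ["apple", "orange", "cherry"]
assert stringstore.items() == list(zip(stringstore.keys(), stringstore.values()))
```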
@@ -110,7 +158,7 @@ Add a string to the `StringStore`. | `string` | The string to add. ~~str~~ | | **RETURNS** | The string's hash value. ~~int~~ | -## StringStore.to_disk {id="to_disk",tag="method",version="2"} +## StringStore.to_disk {id="to_disk",tag="method"} Save the current state to a directory. @@ -124,7 +172,7 @@ Save the current state to a directory. | ------ | ------------------------------------------------------------------------------------------------------------------------------------------ | | `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | -## StringStore.from_disk {id="from_disk",tag="method",version="2"} +## StringStore.from_disk {id="from_disk",tag="method"} Loads state from a directory. Modifies the object in place and returns it. diff --git a/website/docs/api/tagger.mdx b/website/docs/api/tagger.mdx index ee38de81c..35e7a23b1 100644 --- a/website/docs/api/tagger.mdx +++ b/website/docs/api/tagger.mdx @@ -40,12 +40,13 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("tagger", config=config) > ``` -| Setting | Description | -| ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | -| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | -| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~ | -| `neg_prefix` 3.2.1 | The prefix used to specify incorrect tags while training. The tagger will learn not to predict exactly this tag. Defaults to `!`. ~~str~~ | +| Setting | Description | +| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `False`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~ | +| `neg_prefix` 3.2.1 | The prefix used to specify incorrect tags while training. The tagger will learn not to predict exactly this tag. Defaults to `!`. ~~str~~ | +| `save_activations` 4.0 | Save activations in `Doc` when annotating. Saved activations are `"probabilities"` and `"label_ids"`. 
~~Union[bool, list[str]]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/tagger.pyx @@ -104,6 +105,39 @@ and all pipeline components are applied to the `Doc` in order. Both | `doc` | The document to process. ~~Doc~~ | | **RETURNS** | The processed document. ~~Doc~~ | +## Tagger.distill {id="distill", tag="method,experimental", version="4"} + +Train a pipe (the student) on the predictions of another pipe (the teacher). The +student is typically trained on the probability distribution of the teacher, but +details may differ per pipe. The goal of distillation is to transfer knowledge +from the teacher to the student. + +The distillation is performed on ~~Example~~ objects. The `Example.reference` +and `Example.predicted` ~~Doc~~s must have the same number of tokens and the +same orthography. Even though the reference does not need have to have gold +annotations, the teacher could adds its own annotations when necessary. + +This feature is experimental. + +> #### Example +> +> ```python +> teacher_pipe = teacher.add_pipe("tagger") +> student_pipe = student.add_pipe("tagger") +> optimizer = nlp.resume_training() +> losses = student.distill(teacher_pipe, examples, sgd=optimizer) +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~ | +| `examples` | A batch of [`Example`](/api/example) distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and orthography. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | Dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | + ## Tagger.pipe {id="pipe",tag="method"} Apply the pipe to a stream of documents. This usually happens under the hood @@ -141,12 +175,6 @@ arguments it receives via the [`[initialize.components]`](/api/data-formats#config-initialize) block in the config. - - -This method was previously called `begin_training`. - - - > #### Example > > ```python @@ -270,6 +298,27 @@ predicted scores. | `scores` | Scores representing the model's predictions. | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | +## Tagger.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"} + +Calculate the loss and its gradient for the batch of student scores relative to +the teacher scores. + +> #### Example +> +> ```python +> teacher_tagger = teacher.get_pipe("tagger") +> student_tagger = student.add_pipe("tagger") +> student_scores = student_tagger.predict([eg.predicted for eg in examples]) +> teacher_scores = teacher_tagger.predict([eg.predicted for eg in examples]) +> loss, d_loss = student_tagger.get_teacher_student_loss(teacher_scores, student_scores) +> ``` + +| Name | Description | +| ---------------- | --------------------------------------------------------------------------- | +| `teacher_scores` | Scores representing the teacher model's predictions. | +| `student_scores` | Scores representing the student model's predictions. 
| +| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | + ## Tagger.create_optimizer {id="create_optimizer",tag="method"} Create an optimizer for the pipeline component. diff --git a/website/docs/api/textcategorizer.mdx b/website/docs/api/textcategorizer.mdx index a259b7b3c..46cfb9f8c 100644 --- a/website/docs/api/textcategorizer.mdx +++ b/website/docs/api/textcategorizer.mdx @@ -116,14 +116,15 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#create_pipe). -| Name | Description | -| -------------- | -------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| _keyword-only_ | | -| `threshold` | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~ | -| `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ | +| Name | Description | +| ----------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `threshold` | Cutoff to consider a prediction "positive", relevant for `textcat_multilabel` when calculating accuracy scores. ~~float~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ | +| `save_activations` 4.0 | Save activations in `Doc` when annotating. The supported activations is `"probabilities"`. ~~Union[bool, list[str]]~~ | ## TextCategorizer.\_\_call\_\_ {id="call",tag="method"} @@ -186,12 +187,6 @@ arguments it receives via the [`[initialize.components]`](/api/data-formats#config-initialize) block in the config. - - -This method was previously called `begin_training`. - - - > #### Example > > ```python diff --git a/website/docs/api/tok2vec.mdx b/website/docs/api/tok2vec.mdx index a1bb1265e..8b6d2380b 100644 --- a/website/docs/api/tok2vec.mdx +++ b/website/docs/api/tok2vec.mdx @@ -100,6 +100,43 @@ pipeline components are applied to the `Doc` in order. Both | `doc` | The document to process. ~~Doc~~ | | **RETURNS** | The processed document. ~~Doc~~ | +## Tok2Vec.distill {id="distill", tag="method,experimental", version="4"} + +Performs an update of the student pipe's model using the student's distillation +examples and sets the annotations of the teacher's distillation examples using +the teacher pipe. + +Unlike other trainable pipes, the student pipe doesn't directly learn its +representations from the teacher. 
However, since downstream pipes that do +perform distillation expect the tok2vec annotations to be present on the +correct distillation examples, we need to ensure that they are set beforehand. + +The distillation is performed on ~~Example~~ objects. The `Example.reference` +and `Example.predicted` ~~Doc~~s must have the same number of tokens and the +same orthography. Even though the reference does not need have to have gold +annotations, the teacher could adds its own annotations when necessary. + +This feature is experimental. + +> #### Example +> +> ```python +> teacher_pipe = teacher.add_pipe("tok2vec") +> student_pipe = student.add_pipe("tok2vec") +> optimizer = nlp.resume_training() +> losses = student.distill(teacher_pipe, examples, sgd=optimizer) +> ``` + +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- | +| `teacher_pipe` | The teacher pipe to use for prediction. ~~Optional[TrainablePipe]~~ | +| `examples` | Distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | Dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | + ## Tok2Vec.pipe {id="pipe",tag="method"} Apply the pipe to a stream of documents. This usually happens under the hood diff --git a/website/docs/api/token.mdx b/website/docs/api/token.mdx index 63ee1080b..3a817729b 100644 --- a/website/docs/api/token.mdx +++ b/website/docs/api/token.mdx @@ -470,7 +470,6 @@ The L2 norm of the token's vector representation. | `lang_` | Language of the parent document's vocabulary. ~~str~~ | | `prob` | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~ | | `idx` | The character offset of the token within the parent document. ~~int~~ | -| `sentiment` | A scalar value indicating the positivity or negativity of the token. ~~float~~ | | `lex_id` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | | `rank` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | | `cluster` | Brown cluster ID. ~~int~~ | diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx index 9748719d7..921b7a151 100644 --- a/website/docs/api/top-level.mdx +++ b/website/docs/api/top-level.mdx @@ -354,22 +354,22 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="dep", options=options) > ``` -| Name | Description | -| ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- | -| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ | -| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ | -| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. 
Defaults to `True`. ~~bool~~ | -| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ | -| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | -| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ | -| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ | -| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | -| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ | -| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ | -| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ | -| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ | -| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ | -| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ | +| Name | Description | +| ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ | +| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ | +| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ | +| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ | +| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | +| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ | +| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ | +| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | +| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ | +| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ | +| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ | +| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ | +| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ | +| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ | #### Named Entity Visualizer options {id="displacy_options-ent"} @@ -751,14 +751,14 @@ themselves, or be discarded if `discard_oversize` is set to `True`. 
The argument > get_length = null > ``` -| Name | Description | -| ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `seqs` | The sequences to minibatch. ~~Iterable[Any]~~ | -| `size` | The target number of words per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ | -| `tolerance` | What percentage of the size to allow batches to exceed. ~~float~~ | -| `discard_oversize` | Whether to discard sequences that by themselves exceed the tolerated size. ~~bool~~ | -| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ | -| **CREATES** | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~ | +| Name | Description | +| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `seqs` | The sequences to minibatch. ~~Iterable[Any]~~ | +| `size` | The target number of words per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Iterable[int], Schedule]~~ | +| `tolerance` | What percentage of the size to allow batches to exceed. ~~float~~ | +| `discard_oversize` | Whether to discard sequences that by themselves exceed the tolerated size. ~~bool~~ | +| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ | +| **CREATES** | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~ | ### spacy.batch_by_sequence.v1 {id="batch_by_sequence",tag="registered function"} @@ -773,11 +773,11 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument Create a batcher that creates batches of the specified size. -| Name | Description | -| ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `size` | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ | -| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ | -| **CREATES** | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~ | +| Name | Description | +| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `size` | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). 
~~Union[int, Iterable[int], Schedule]~~ | +| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ | +| **CREATES** | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~ | ### spacy.batch_by_padded.v1 {id="batch_by_padded",tag="registered function"} @@ -799,7 +799,7 @@ sequences in the batch. | Name | Description | | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `size` | The largest padded size to batch sequences into. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ | +| `size` | The largest padded size to batch sequences into. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Iterable[int], Schedule]~~ | | `buffer` | The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is very large, the iteration order will be less random, which can result in suboptimal training. ~~int~~ | | `discard_oversize` | Whether to discard sequences that are by themselves longer than the largest padded batch size. ~~bool~~ | | `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ | @@ -921,7 +921,8 @@ backprop passes. Recursively wrap both the models and methods of each pipe using [NVTX](https://nvidia.github.io/NVTX/) range markers. By default, the following methods are wrapped: `pipe`, `predict`, `set_annotations`, `update`, `rehearse`, -`get_loss`, `initialize`, `begin_update`, `finish_update`, `update`. +`get_loss`, `get_teacher_student_loss`, `initialize`, `begin_update`, +`finish_update`, `update`. | Name | Description | | --------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -1400,7 +1401,7 @@ vary on each step. | Name | Description | | ---------- | ------------------------------------------------ | | `items` | The items to batch up. ~~Iterable[Any]~~ | -| `size` | The batch size(s). ~~Union[int, Sequence[int]]~~ | +| `size` | The batch size(s). ~~Union[int, Iterable[int]]~~ | | **YIELDS** | The batches. | ### util.filter_spans {id="util.filter_spans",tag="function",version="2.1.4"} diff --git a/website/docs/api/vectors.mdx b/website/docs/api/vectors.mdx index d6033c096..021484a1b 100644 --- a/website/docs/api/vectors.mdx +++ b/website/docs/api/vectors.mdx @@ -52,7 +52,6 @@ modified later. | `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ | | `data` | The vector data. ~~numpy.ndarray[ndim=2, dtype=float32]~~ | | `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ | -| `name` | A name to identify the vectors table. ~~str~~ | | `mode` 3.2 | Vectors mode: `"default"` or [`"floret"`](https://github.com/explosion/floret) (default: `"default"`). 
~~str~~ | | `minn` 3.2 | The floret char ngram minn (default: `0`). ~~int~~ | | `maxn` 3.2 | The floret char ngram maxn (default: `0`). ~~int~~ | diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx index 131e4ce0a..3faf1f1a0 100644 --- a/website/docs/api/vocab.mdx +++ b/website/docs/api/vocab.mdx @@ -27,7 +27,6 @@ Create the vocabulary. | `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ | | `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ | | `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ | -| `vectors_name` | A name to identify the vectors table. ~~str~~ | | `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ | | `get_noun_chunks` | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ | diff --git a/website/docs/models/index.mdx b/website/docs/models/index.mdx index 371e4460f..366d44f0e 100644 --- a/website/docs/models/index.mdx +++ b/website/docs/models/index.mdx @@ -21,8 +21,8 @@ menu: ## Package naming conventions {id="conventions"} In general, spaCy expects all pipeline packages to follow the naming convention -of `[lang]\_[name]`. For spaCy's pipelines, we also chose to divide the name -into three components: +of `[lang]_[name]`. For spaCy's pipelines, we also chose to divide the name into +three components: 1. **Type:** Capabilities (e.g. `core` for general-purpose pipeline with tagging, parsing, lemmatization and named entity recognition, or `dep` for diff --git a/website/docs/usage/101/_architecture.mdx b/website/docs/usage/101/_architecture.mdx index 5727c6921..35c36088a 100644 --- a/website/docs/usage/101/_architecture.mdx +++ b/website/docs/usage/101/_architecture.mdx @@ -41,25 +41,27 @@ components for different language processing tasks and also allows adding ![The processing pipeline](/images/pipeline.svg) -| Name | Description | -| ----------------------------------------------- | ------------------------------------------------------------------------------------------- | -| [`AttributeRuler`](/api/attributeruler) | Set token attributes using matcher rules. | -| [`DependencyParser`](/api/dependencyparser) | Predict syntactic dependencies. | -| [`EditTreeLemmatizer`](/api/edittreelemmatizer) | Predict base forms of words. | -| [`EntityLinker`](/api/entitylinker) | Disambiguate named entities to nodes in a knowledge base. | -| [`EntityRecognizer`](/api/entityrecognizer) | Predict named entities, e.g. persons or products. | -| [`EntityRuler`](/api/entityruler) | Add entity spans to the `Doc` using token-based rules or exact phrase matches. | -| [`Lemmatizer`](/api/lemmatizer) | Determine the base forms of words using rules and lookups. | -| [`Morphologizer`](/api/morphologizer) | Predict morphological features and coarse-grained part-of-speech tags. | -| [`SentenceRecognizer`](/api/sentencerecognizer) | Predict sentence boundaries. | -| [`Sentencizer`](/api/sentencizer) | Implement rule-based sentence boundary detection that doesn't require the dependency parse. | -| [`Tagger`](/api/tagger) | Predict part-of-speech tags. 
| -| [`TextCategorizer`](/api/textcategorizer) | Predict categories or labels over the whole document. | -| [`Tok2Vec`](/api/tok2vec) | Apply a "token-to-vector" model and set its outputs. | -| [`Tokenizer`](/api/tokenizer) | Segment raw text and create `Doc` objects from the words. | -| [`TrainablePipe`](/api/pipe) | Class that all trainable pipeline components inherit from. | -| [`Transformer`](/api/transformer) | Use a transformer model and set its outputs. | -| [Other functions](/api/pipeline-functions) | Automatically apply something to the `Doc`, e.g. to merge spans of tokens. | +| Component name | Component class | Description | +| ---------------------- | ---------------------------------------------------- | ------------------------------------------------------------------------------------------- | +| `attribute_ruler` | [`AttributeRuler`](/api/attributeruler) | Set token attributes using matcher rules. | +| `entity_linker` | [`EntityLinker`](/api/entitylinker) | Disambiguate named entities to nodes in a knowledge base. | +| `entity_ruler` | [`SpanRuler`](/api/spanruler) | Add entity spans to the `Doc` using token-based rules or exact phrase matches. | +| `lemmatizer` | [`Lemmatizer`](/api/lemmatizer) | Determine the base forms of words using rules and lookups. | +| `morphologizer` | [`Morphologizer`](/api/morphologizer) | Predict morphological features and coarse-grained part-of-speech tags. | +| `ner` | [`EntityRecognizer`](/api/entityrecognizer) | Predict named entities, e.g. persons or products. | +| `parser` | [`DependencyParser`](/api/dependencyparser) | Predict syntactic dependencies. | +| `senter` | [`SentenceRecognizer`](/api/sentencerecognizer) | Predict sentence boundaries. | +| `sentencizer` | [`Sentencizer`](/api/sentencizer) | Implement rule-based sentence boundary detection that doesn't require the dependency parse. | +| `span_ruler` | [`SpanRuler`](/api/spanruler) | Add spans to the `Doc` using token-based rules or exact phrase matches. | +| `tagger` | [`Tagger`](/api/tagger) | Predict part-of-speech tags. | +| `textcat` | [`TextCategorizer`](/api/textcategorizer) | Predict exactly one category or label over a whole document. | +| `textcat_multilabel` | [`MultiLabel_TextCategorizer`](/api/textcategorizer) | Predict 0, 1 or more categories or labels over a whole document. | +| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | Apply a "token-to-vector" model and set its outputs. | +| `tokenizer` | [`Tokenizer`](/api/tokenizer) | Segment raw text and create `Doc` objects from the words. | +| `trainable_lemmatizer` | [`EditTreeLemmatizer`](/api/edittreelemmatizer) | Predict base forms of words. | +| `transformer` | [`Transformer`](/api/transformer) | Use a transformer model and set its outputs. | +| - | [`TrainablePipe`](/api/pipe) | Class that all trainable pipeline components inherit from. | +| - | [Other functions](/api/pipeline-functions) | Automatically apply something to the `Doc`, e.g. to merge spans of tokens. | ### Matchers {id="architecture-matchers"} @@ -79,7 +81,7 @@ operates on a `Doc` and gives you access to the matched tokens **in context**. | ------------------------------------------------ | -------------------------------------------------------------------------------------------------- | | [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. | | [`KnowledgeBase`](/api/kb) | Abstract base class for storage and retrieval of data for entity linking. 
| -| [`InMemoryLookupKB`](/api/kb_in_memory) | Implementation of `KnowledgeBase` storing all data in memory. | +| [`InMemoryLookupKB`](/api/inmemorylookupkb) | Implementation of `KnowledgeBase` storing all data in memory. | | [`Candidate`](/api/kb#candidate) | Object associating a textual mention with a specific entity contained in a `KnowledgeBase`. | | [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. | | [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis. | diff --git a/website/docs/usage/101/_pipelines.mdx b/website/docs/usage/101/_pipelines.mdx index 315291762..e5a08c5e4 100644 --- a/website/docs/usage/101/_pipelines.mdx +++ b/website/docs/usage/101/_pipelines.mdx @@ -51,9 +51,9 @@ example, a custom lemmatizer may need the part-of-speech tags assigned, so it'll only work if it's added after the tagger. The parser will respect pre-defined sentence boundaries, so if a previous component in the pipeline sets them, its dependency predictions may be different. Similarly, it matters if you add the -[`EntityRuler`](/api/entityruler) before or after the statistical entity -recognizer: if it's added before, the entity recognizer will take the existing -entities into account when making predictions. The +[`SpanRuler`](/api/spanruler) before or after the statistical entity recognizer: +if it's added before and it is writing to `doc.ents`, then the entity recognizer +will take those existing entities into account when making predictions. The [`EntityLinker`](/api/entitylinker), which resolves named entities to knowledge base IDs, should be preceded by a pipeline component that recognizes entities such as the [`EntityRecognizer`](/api/entityrecognizer). diff --git a/website/docs/usage/101/_vectors-similarity.mdx b/website/docs/usage/101/_vectors-similarity.mdx index c27f777d8..39ee8e48a 100644 --- a/website/docs/usage/101/_vectors-similarity.mdx +++ b/website/docs/usage/101/_vectors-similarity.mdx @@ -22,17 +22,20 @@ array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01, To make them compact and fast, spaCy's small [pipeline packages](/models) (all -packages that end in `sm`) **don't ship with word vectors**, and only include -context-sensitive **tensors**. This means you can still use the `similarity()` -methods to compare documents, spans and tokens – but the result won't be as -good, and individual tokens won't have any vectors assigned. So in order to use -_real_ word vectors, you need to download a larger pipeline package: +packages that end in `sm`) **don't ship with word vectors**. In order to use +`similarity()`, you need to download a larger pipeline package that includes +vectors: ```diff - python -m spacy download en_core_web_sm -+ python -m spacy download en_core_web_lg ++ python -m spacy download en_core_web_md ``` +In spaCy v3 and earlier, small pipeline packages supported `similarity()` by +backing off to context-sensitive tensors from the `tok2vec` component. These +tensors do not work well for this purpose and this backoff has been removed in +spaCy v4. + Pipeline packages that come with built-in word vectors make them available as @@ -134,6 +137,7 @@ useful for your purpose. 
Here are some important considerations to keep in mind: sense2vec Screenshot [`sense2vec`](https://github.com/explosion/sense2vec) is a library developed by diff --git a/website/docs/usage/embeddings-transformers.mdx b/website/docs/usage/embeddings-transformers.mdx index cf80822fb..0de173a21 100644 --- a/website/docs/usage/embeddings-transformers.mdx +++ b/website/docs/usage/embeddings-transformers.mdx @@ -140,7 +140,7 @@ factory = "tok2vec" factory = "ner" [components.ner.model] -@architectures = "spacy.TransitionBasedParser.v1" +@architectures = "spacy.TransitionBasedParser.v3" [components.ner.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" @@ -156,7 +156,7 @@ same. This makes them fully independent and doesn't require an upstream factory = "ner" [components.ner.model] -@architectures = "spacy.TransitionBasedParser.v1" +@architectures = "spacy.TransitionBasedParser.v3" [components.ner.model.tok2vec] @architectures = "spacy.Tok2Vec.v2" @@ -472,7 +472,7 @@ sneakily delegates to the `Transformer` pipeline component. factory = "ner" [nlp.pipeline.ner.model] -@architectures = "spacy.TransitionBasedParser.v1" +@architectures = "spacy.TransitionBasedParser.v3" state_type = "ner" extra_state_tokens = false hidden_width = 128 diff --git a/website/docs/usage/index.mdx b/website/docs/usage/index.mdx index a5b7990d6..07f2bd282 100644 --- a/website/docs/usage/index.mdx +++ b/website/docs/usage/index.mdx @@ -20,7 +20,7 @@ menu: ## Installation instructions {id="installation"} -spaCy is compatible with **64-bit CPython 3.6+** and runs on **Unix/Linux**, +spaCy is compatible with **64-bit CPython 3.8+** and runs on **Unix/Linux**, **macOS/OS X** and **Windows**. The latest spaCy releases are available over [pip](https://pypi.python.org/pypi/spacy) and [conda](https://anaconda.org/conda-forge/spacy). @@ -290,7 +290,7 @@ You can configure the build process with the following environment variables: | Variable | Description | | -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `SPACY_EXTRAS` | Additional Python packages to install alongside spaCy with optional version specifications. Should be a string that can be passed to `pip install`. See [`Makefile`](%%GITHUB_SPACY/Makefile) for defaults. | -| `PYVER` | The Python version to build against. This version needs to be available on your build and runtime machines. Defaults to `3.6`. | +| `PYVER` | The Python version to build against. This version needs to be available on your build and runtime machines. Defaults to `3.8`. | | `WHEELHOUSE` | Directory to store the wheel files during compilation. Defaults to `./wheelhouse`. | ### Run tests {id="run-tests"} diff --git a/website/docs/usage/layers-architectures.mdx b/website/docs/usage/layers-architectures.mdx index 37f11e8e2..8f6bf3a20 100644 --- a/website/docs/usage/layers-architectures.mdx +++ b/website/docs/usage/layers-architectures.mdx @@ -113,6 +113,7 @@ code. Screenshot of Thinc type checking in VSCode with mypy diff --git a/website/docs/usage/models.mdx b/website/docs/usage/models.mdx index 3b8a5fa3f..5b783002c 100644 --- a/website/docs/usage/models.mdx +++ b/website/docs/usage/models.mdx @@ -74,23 +74,23 @@ your data. 
> ```python > # Standard import -> from spacy.lang.xx import MultiLanguage +> from spacy.lang.mul import MultiLanguage > nlp = MultiLanguage() > > # With lazy-loading -> nlp = spacy.blank("xx") +> nlp = spacy.blank("mul") > ``` spaCy also supports pipelines trained on more than one language. This is especially useful for named entity recognition. The language ID used for -multi-language or language-neutral pipelines is `xx`. The language class, a +multi-language or language-neutral pipelines is `mul`. The language class, a generic subclass containing only the base language data, can be found in -[`lang/xx`](%%GITHUB_SPACY/spacy/lang/xx). +[`lang/mul`](%%GITHUB_SPACY/spacy/lang/mul). To train a pipeline using the neutral multi-language class, you can set -`lang = "xx"` in your [training config](/usage/training#config). You can also +`lang = "mul"` in your [training config](/usage/training#config). You can also \import the `MultiLanguage` class directly, or call -[`spacy.blank("xx")`](/api/top-level#spacy.blank) for lazy-loading. +[`spacy.blank("mul")`](/api/top-level#spacy.blank) for lazy-loading. ### Chinese language support {id="chinese",version="2.3"} @@ -264,18 +264,49 @@ used for training the current [Japanese pipelines](/models/ja). ### Korean language support {id="korean"} -> #### mecab-ko tokenizer +There are currently three built-in options for Korean tokenization, two based on +[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md) and one +using the rule-based tokenizer. + +> #### Default mecab-ko tokenizer > > ```python +> # uses mecab-ko-dic > nlp = spacy.blank("ko") +> +> # with custom mecab args +> mecab_args = "-d /path/to/dicdir -u /path/to/userdic" +> config = {"nlp": {"tokenizer": {"mecab_args": mecab_args}}} +> nlp = spacy.blank("ko", config=config) > ``` -The default MeCab-based Korean tokenizer requires: +The default MeCab-based Korean tokenizer requires the python package +[`mecab-ko`](https://pypi.org/project/mecab-ko/) and no further system +requirements. + +The `natto-py` MeCab-based tokenizer (the previous default for spaCy v3.4 and +earlier) is available as `spacy.KoreanNattoTokenizer.v1`. It requires: - [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md) - [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic) - [natto-py](https://github.com/buruzaemon/natto-py) +To use this tokenizer, edit `[nlp.tokenizer]` in your config: + +> #### natto-py MeCab-ko tokenizer +> +> ```python +> config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}}} +> nlp = spacy.blank("ko", config=config) +> ``` + +```ini +### config.cfg +[nlp] +lang = "ko" +tokenizer = {"@tokenizers" = "spacy.KoreanNattoTokenizer.v1"} +``` + For some Korean datasets and tasks, the [rule-based tokenizer](/usage/linguistic-features#tokenization) is better-suited than MeCab. To configure a Korean pipeline with the rule-based tokenizer: @@ -306,22 +337,6 @@ The easiest way to download a trained pipeline is via spaCy's [`download`](/api/cli#download) command. It takes care of finding the best-matching package compatible with your spaCy installation. -> #### Important note for v3.0 -> -> Note that as of spaCy v3.0, shortcut links like `en` that create (potentially -> brittle) symlinks in your spaCy installation are **deprecated**. 
To download -> and load an installed pipeline package, use its full name: -> -> ```diff -> - python -m spacy download en -> + python -m spacy download en_core_web_sm -> ``` -> -> ```diff -> - nlp = spacy.load("en") -> + nlp = spacy.load("en_core_web_sm") -> ``` - ```bash # Download best-matching version of a package for your spaCy installation $ python -m spacy download en_core_web_sm @@ -452,17 +467,6 @@ spacy.cli.download("en_core_web_sm") To load a pipeline package, use [`spacy.load`](/api/top-level#spacy.load) with the package name or a path to the data directory: -> #### Important note for v3.0 -> -> Note that as of spaCy v3.0, shortcut links like `en` that create (potentially -> brittle) symlinks in your spaCy installation are **deprecated**. To download -> and load an installed pipeline package, use its full name: -> -> ```diff -> - python -m spacy download en -> + python -m spacy download en_core_web_sm -> ``` - ```python import spacy nlp = spacy.load("en_core_web_sm") # load package "en_core_web_sm" diff --git a/website/docs/usage/processing-pipelines.mdx b/website/docs/usage/processing-pipelines.mdx index 307cb9dcb..08cd64aa7 100644 --- a/website/docs/usage/processing-pipelines.mdx +++ b/website/docs/usage/processing-pipelines.mdx @@ -297,13 +297,14 @@ available pipeline components and component functions. > ruler = nlp.add_pipe("entity_ruler") > ``` -| String name | Component | Description | +| Component name | Component class | Description | | ---------------------- | ---------------------------------------------------- | ----------------------------------------------------------------------------------------- | | `tagger` | [`Tagger`](/api/tagger) | Assign part-of-speech-tags. | | `parser` | [`DependencyParser`](/api/dependencyparser) | Assign dependency labels. | | `ner` | [`EntityRecognizer`](/api/entityrecognizer) | Assign named entities. | | `entity_linker` | [`EntityLinker`](/api/entitylinker) | Assign knowledge base IDs to named entities. Should be added after the entity recognizer. | -| `entity_ruler` | [`EntityRuler`](/api/entityruler) | Assign named entities based on pattern rules and dictionaries. | +| `span_ruler` | [`SpanRuler`](/api/spanruler) | Assign spans based on pattern rules and dictionaries. | +| `entity_ruler` | [`SpanRuler`](/api/spanruler) | Assign named entities based on pattern rules and dictionaries. | | `textcat` | [`TextCategorizer`](/api/textcategorizer) | Assign text categories: exactly one category is predicted per document. | | `textcat_multilabel` | [`MultiLabel_TextCategorizer`](/api/textcategorizer) | Assign text categories in a multi-label setting: zero, one or more labels per document. | | `lemmatizer` | [`Lemmatizer`](/api/lemmatizer) | Assign base forms to words using rules and lookups. | @@ -1353,12 +1354,14 @@ For some use cases, it makes sense to also overwrite additional methods to customize how the model is updated from examples, how it's initialized, how the loss is calculated and to add evaluation scores to the training output. 
-| Name | Description | -| ------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. | -| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and can be passed custom arguments via the [`[initialize]`](/api/data-formats#config-initialize) config block that are only loaded during training or when you call [`nlp.initialize`](/api/language#initialize), not at runtime. | -| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. | -| [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. | +| Name | Description | +| ---------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. | +| [`distill`](/api/pipe#distill) | Learn from a teacher pipeline using a batch of [`Doc`](/api/doc) objects and update the component's model. | +| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and can be passed custom arguments via the [`[initialize]`](/api/data-formats#config-initialize) config block that are only loaded during training or when you call [`nlp.initialize`](/api/language#initialize), not at runtime. | +| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. | +| [`get_teacher_student_loss`](/api/pipe#get_teacher_student_loss) | Return a tuple of the loss and the gradient for the student scores relative to the teacher scores. | +| [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. | @@ -1385,8 +1388,8 @@ Writing to a `._` attribute instead of to the `Doc` directly keeps a clearer separation and makes it easier to ensure backwards compatibility. For example, if you've implemented your own `.coref` property and spaCy claims it one day, it'll break your code. 
Similarly, just by looking at the code, you'll -immediately know what's built-in and what's custom – for example, -`doc.sentiment` is spaCy, while `doc._.sent_score` isn't. +immediately know what's built-in and what's custom – for example, `doc.lang` is +spaCy, while `doc._.language` isn't. diff --git a/website/docs/usage/projects.mdx b/website/docs/usage/projects.mdx index 8ec035942..f3cca8013 100644 --- a/website/docs/usage/projects.mdx +++ b/website/docs/usage/projects.mdx @@ -943,7 +943,7 @@ full embedded visualizer, as well as individual components. > $ pip install spacy-streamlit --pre > ``` -![](/images/spacy-streamlit.png) +![Screenshot of the spacy-streamlit package in Streamlit](/images/spacy-streamlit.png) Using [`spacy-streamlit`](https://github.com/explosion/spacy-streamlit), your projects can easily define their own scripts that spin up an interactive diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx index 08d2b3b91..792ec119a 100644 --- a/website/docs/usage/rule-based-matching.mdx +++ b/website/docs/usage/rule-based-matching.mdx @@ -511,7 +511,7 @@ matches = matcher(doc) ``` A very similar logic has been implemented in the built-in -[`EntityRuler`](/api/entityruler) by the way. It also takes care of handling +[`entity_ruler`](/api/entityruler) by the way. It also takes care of handling overlapping matches, which you would otherwise have to take care of yourself. > #### Tip: Visualizing matches @@ -811,6 +811,9 @@ whitespace, making them easy to match as well. ```python {executable="true"} from spacy.lang.en import English from spacy.matcher import Matcher +from spacy.tokens import Doc + +Doc.set_extension("sentiment", default=0.0) nlp = English() # We only want the tokenizer, so no need to load a pipeline matcher = Matcher(nlp.vocab) @@ -826,9 +829,9 @@ neg_patterns = [[{"ORTH": emoji}] for emoji in neg_emoji] def label_sentiment(matcher, doc, i, matches): match_id, start, end = matches[i] if doc.vocab.strings[match_id] == "HAPPY": # Don't forget to get string! - doc.sentiment += 0.1 # Add 0.1 for positive sentiment + doc._.sentiment += 0.1 # Add 0.1 for positive sentiment elif doc.vocab.strings[match_id] == "SAD": - doc.sentiment -= 0.1 # Subtract 0.1 for negative sentiment + doc._.sentiment -= 0.1 # Subtract 0.1 for negative sentiment matcher.add("HAPPY", pos_patterns, on_match=label_sentiment) # Add positive pattern matcher.add("SAD", neg_patterns, on_match=label_sentiment) # Add negative pattern @@ -858,16 +861,17 @@ the emoji span will make it available as `span._.emoji_desc`. ```python from emojipedia import Emojipedia # Installation: pip install emojipedia -from spacy.tokens import Span # Get the global Span object +from spacy.tokens import Doc, Span # Get the global Doc and Span object Span.set_extension("emoji_desc", default=None) # Register the custom attribute +Doc.set_extension("sentiment", default=0.0) def label_sentiment(matcher, doc, i, matches): match_id, start, end = matches[i] if doc.vocab.strings[match_id] == "HAPPY": # Don't forget to get string! 
- doc.sentiment += 0.1 # Add 0.1 for positive sentiment + doc._.sentiment += 0.1 # Add 0.1 for positive sentiment elif doc.vocab.strings[match_id] == "SAD": - doc.sentiment -= 0.1 # Subtract 0.1 for negative sentiment + doc._.sentiment -= 0.1 # Subtract 0.1 for negative sentiment span = doc[start:end] emoji = Emojipedia.search(span[0].text) # Get data for emoji span._.emoji_desc = emoji.title # Assign emoji description @@ -1096,20 +1100,28 @@ The following operators are supported by the `DependencyMatcher`, most of which come directly from [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): -| Symbol | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------- | -| `A < B` | `A` is the immediate dependent of `B`. | -| `A > B` | `A` is the immediate head of `B`. | -| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | -| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | -| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | -| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | -| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | -| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | -| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| Symbol | Description | +| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | +| `A < B` | `A` is the immediate dependent of `B`. | +| `A > B` | `A` is the immediate head of `B`. | +| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | +| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | +| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | +| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | +| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | +| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | +| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. 
| +| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | +| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | ### Designing dependency matcher patterns {id="dependencymatcher-patterns"} @@ -1298,7 +1310,7 @@ of patterns such as `{}` that match any token in the sentence. ## Rule-based entity recognition {id="entityruler",version="2.1"} -The [`EntityRuler`](/api/entityruler) is a component that lets you add named +The [`entity_ruler`](/api/entityruler) is a component that lets you add named entities based on pattern dictionaries, which makes it easy to combine rule-based and statistical named entity recognition for even more powerful pipelines. @@ -1323,13 +1335,12 @@ pattern. The entity ruler accepts two types of patterns: ### Using the entity ruler {id="entityruler-usage"} -The [`EntityRuler`](/api/entityruler) is a pipeline component that's typically -added via [`nlp.add_pipe`](/api/language#add_pipe). When the `nlp` object is -called on a text, it will find matches in the `doc` and add them as entities to -the `doc.ents`, using the specified pattern label as the entity label. If any -matches were to overlap, the pattern matching most tokens takes priority. If -they also happen to be equally long, then the match occurring first in the `Doc` -is chosen. +The `entity_ruler` is a pipeline component that's typically added via +[`nlp.add_pipe`](/api/language#add_pipe). When the `nlp` object is called on a +text, it will find matches in the `doc` and add them as entities to `doc.ents`, +using the specified pattern label as the entity label. If any matches were to +overlap, the pattern matching most tokens takes priority. If they also happen to +be equally long, then the match occurring first in the `Doc` is chosen. ```python {executable="true"} from spacy.lang.en import English @@ -1365,7 +1376,7 @@ doc = nlp("MyCorp Inc. is a company in the U.S.") print([(ent.text, ent.label_) for ent in doc.ents]) ``` -#### Validating and debugging EntityRuler patterns {id="entityruler-pattern-validation",version="2.1.8"} +#### Validating and debugging entity ruler patterns {id="entityruler-pattern-validation",version="2.1.8"} The entity ruler can validate patterns against a JSON schema with the config setting `"validate"`. See details under @@ -1377,9 +1388,9 @@ ruler = nlp.add_pipe("entity_ruler", config={"validate": True}) ### Adding IDs to patterns {id="entityruler-ent-ids",version="2.2.2"} -The [`EntityRuler`](/api/entityruler) can also accept an `id` attribute for each -pattern. Using the `id` attribute allows multiple patterns to be associated with -the same entity. +The [`entity_ruler`](/api/entityruler) can also accept an `id` attribute for +each pattern. Using the `id` attribute allows multiple patterns to be associated +with the same entity. 
```python {executable="true"} from spacy.lang.en import English @@ -1392,16 +1403,16 @@ patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"}, ruler.add_patterns(patterns) doc1 = nlp("Apple is opening its first big office in San Francisco.") -print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents]) +print([(ent.text, ent.label_, ent.id_) for ent in doc1.ents]) doc2 = nlp("Apple is opening its first big office in San Fran.") -print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents]) +print([(ent.text, ent.label_, ent.id_) for ent in doc2.ents]) ``` -If the `id` attribute is included in the [`EntityRuler`](/api/entityruler) -patterns, the `ent_id_` property of the matched entity is set to the `id` given -in the patterns. So in the example above it's easy to identify that "San -Francisco" and "San Fran" are both the same entity. +If the `id` attribute is included in the [`entity_ruler`](/api/entityruler) +patterns, the `id_` property of the matched entity is set to the `id` given in +the patterns. So in the example above it's easy to identify that "San Francisco" +and "San Fran" are both the same entity. ### Using pattern files {id="entityruler-files"} @@ -1424,13 +1435,13 @@ new_ruler = nlp.add_pipe("entity_ruler").from_disk("./patterns.jsonl") If you're using the [Prodigy](https://prodi.gy) annotation tool, you might recognize these pattern files from bootstrapping your named entity and text -classification labelling. The patterns for the `EntityRuler` follow the same +classification labelling. The patterns for the `entity_ruler` follow the same syntax, so you can use your existing Prodigy pattern files in spaCy, and vice versa. -When you save out an `nlp` object that has an `EntityRuler` added to its +When you save out an `nlp` object that has an `entity_ruler` added to its pipeline, its patterns are automatically exported to the pipeline directory: ```python @@ -1442,8 +1453,8 @@ nlp.to_disk("/path/to/pipeline") The saved pipeline now includes the `"entity_ruler"` in its [`config.cfg`](/api/data-formats#config) and the pipeline directory contains a -file `entityruler.jsonl` with the patterns. When you load the pipeline back in, -all pipeline components will be restored and deserialized – including the entity +file `patterns.jsonl` with the patterns. When you load the pipeline back in, all +pipeline components will be restored and deserialized – including the entity ruler. This lets you ship powerful pipeline packages with binary weights _and_ rules included! @@ -1453,9 +1464,9 @@ rules included! When using a large amount of **phrase patterns** (roughly > 10000) it's useful to understand how the `add_patterns` function of the entity ruler works. For -each **phrase pattern**, the EntityRuler calls the nlp object to construct a doc -object. This happens in case you try to add the EntityRuler at the end of an -existing pipeline with, for example, a POS tagger and want to extract matches +each **phrase pattern**, the entity ruler calls the nlp object to construct a +doc object. This happens in case you try to add the entity ruler at the end of +an existing pipeline with, for example, a POS tagger and want to extract matches based on the pattern's POS signature. In this case you would pass a config value of `"phrase_matcher_attr": "POS"` for the entity ruler. 
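As a minimal sketch of that setup (assuming a pipeline such as `en_core_web_sm` that assigns `Token.pos_`, and that the hypothetical variable names are illustrative only), this could look like:

```python
import spacy

# Load a pipeline whose tagger assigns Token.pos_ before the ruler is added
nlp = spacy.load("en_core_web_sm")

# Phrase patterns are processed by the full pipeline, so they carry a POS
# signature and are matched on the POS attribute instead of the verbatim text
ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "POS"})
```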
diff --git a/website/docs/usage/saving-loading.mdx b/website/docs/usage/saving-loading.mdx index e0daebe35..cdc587273 100644 --- a/website/docs/usage/saving-loading.mdx +++ b/website/docs/usage/saving-loading.mdx @@ -187,13 +187,13 @@ the data to and from a JSON file. > #### Real-world example > -> To see custom serialization methods in action, check out the new -> [`EntityRuler`](/api/entityruler) component and its -> [source](%%GITHUB_SPACY/spacy/pipeline/entityruler.py). Patterns added to the +> To see custom serialization methods in action, check out the +> [`SpanRuler`](/api/spanruler) component and its +> [source](%%GITHUB_SPACY/spacy/pipeline/span_ruler.py). Patterns added to the > component will be saved to a `.jsonl` file if the pipeline is serialized to > disk, and to a bytestring if the pipeline is serialized to bytes. This allows -> saving out a pipeline with a rule-based entity recognizer and including all -> rules _with_ the component data. +> saving out a pipeline with rule-based components _with_ all the component +> data. ```python {highlight="16-23,25-30"} import json @@ -304,6 +304,28 @@ installed in the same environment – that's it. | `spacy_lookups` | Group of entry points for custom [`Lookups`](/api/lookups), including lemmatizer data. Used by spaCy's [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) package. | | [`spacy_displacy_colors`](#entry-points-displacy) | Group of entry points of custom label colors for the [displaCy visualizer](/usage/visualizers#ent). The key name doesn't matter, but it should point to a dict of labels and color values. Useful for custom models that predict different entity types. | +### Loading probability tables into existing models + +You can load a probability table from [spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an existing spaCy model like `en_core_web_sm`. + +```python +# Requirements: pip install spacy-lookups-data +import spacy +from spacy.lookups import load_lookups +nlp = spacy.load("en_core_web_sm") +lookups = load_lookups("en", ["lexeme_prob"]) +nlp.vocab.lookups.add_table("lexeme_prob", lookups.get_table("lexeme_prob")) +``` + +When training a model from scratch you can also specify probability tables in the `config.cfg`. + +```ini {title="config.cfg (excerpt)"} +[initialize.lookups] +@misc = "spacy.LookupsDataLoader.v1" +lang = ${nlp.lang} +tables = ["lexeme_prob"] +``` + ### Custom components via entry points {id="entry-points-components"} When you load a pipeline, spaCy will generally use its `config.cfg` to set up diff --git a/website/docs/usage/spacy-101.mdx b/website/docs/usage/spacy-101.mdx index a02e73508..6d444a1e9 100644 --- a/website/docs/usage/spacy-101.mdx +++ b/website/docs/usage/spacy-101.mdx @@ -567,7 +567,10 @@ If you would like to use the spaCy logo on your site, please get in touch and ask us first. 
However, if you want to show support and tell others that your project is using spaCy, you can grab one of our **spaCy badges** here: - +Built with spaCy ```markdown [![Built with spaCy](https://img.shields.io/badge/built%20with-spaCy-09a3d5.svg)](https://spacy.io) @@ -575,8 +578,9 @@ project is using spaCy, you can grab one of our **spaCy badges** here: Made with love and spaCy ```markdown -[![Built with spaCy](https://img.shields.io/badge/made%20with%20❤%20and-spaCy-09a3d5.svg)](https://spacy.io) +[![Made with love and spaCy](https://img.shields.io/badge/made%20with%20❤%20and-spaCy-09a3d5.svg)](https://spacy.io) ``` diff --git a/website/docs/usage/training.mdx b/website/docs/usage/training.mdx index bae9ef326..4918ed6d9 100644 --- a/website/docs/usage/training.mdx +++ b/website/docs/usage/training.mdx @@ -422,7 +422,7 @@ your components during training, and the most common scenarios are: 2. Update an existing **trained component** with more examples. 3. Include an existing trained component without updating it. 4. Include a non-trainable component, like a rule-based - [`EntityRuler`](/api/entityruler) or [`Sentencizer`](/api/sentencizer), or a + [`SpanRuler`](/api/spanruler) or [`Sentencizer`](/api/sentencizer), or a fully [custom component](/usage/processing-pipelines#custom-components). If a component block defines a `factory`, spaCy will look it up in the diff --git a/website/docs/usage/v3-5.mdx b/website/docs/usage/v3-5.mdx new file mode 100644 index 000000000..3ca64f8a2 --- /dev/null +++ b/website/docs/usage/v3-5.mdx @@ -0,0 +1,230 @@ +--- +title: What's New in v3.5 +teaser: New features and how to upgrade +menu: + - ['New Features', 'features'] + - ['Upgrading Notes', 'upgrading'] +--- + +## New features {id="features",hidden="true"} + +spaCy v3.5 introduces three new CLI commands, `apply`, `benchmark` and +`find-threshold`, adds fuzzy matching, provides improvements to our entity +linking functionality, and includes a range of language updates and bug fixes. + +### New CLI commands {id="cli"} + +#### apply CLI + +The [`apply` CLI](/api/cli#apply) can be used to apply a pipeline to one or more +`.txt`, `.jsonl` or `.spacy` input files, saving the annotated docs in a single +`.spacy` file. + +```bash +$ spacy apply en_core_web_sm my_texts/ output.spacy +``` + +#### benchmark CLI + +The [`benchmark` CLI](/api/cli#benchmark) has been added to extend the existing +`evaluate` functionality with a wider range of profiling subcommands. + +The `benchmark accuracy` CLI is introduced as an alias for `evaluate`. The new +`benchmark speed` CLI performs warmup rounds before measuring the speed in words +per second on batches of randomly shuffled documents from the provided data. + +```bash +$ spacy benchmark speed my_pipeline data.spacy +``` + +The output is the mean performance using batches (`nlp.pipe`) with a 95% +confidence interval, e.g., profiling `en_core_web_sm` on CPU: + +```none +Outliers: 2.0%, extreme outliers: 0.0% +Mean: 18904.1 words/s (95% CI: -256.9 +244.1) +``` + +#### find-threshold CLI + +The [`find-threshold` CLI](/api/cli#find-threshold) runs a series of trials +across threshold values from `0.0` to `1.0` and identifies the best threshold +for the provided score metric. 
+ +The following command runs 20 trials for the `spancat` component in +`my_pipeline`, recording the `spans_sc_f` score for each value of the threshold +`[components.spancat.threshold]` from `0.0` to `1.0`: + +```bash +$ spacy find-threshold my_pipeline data.spacy spancat threshold spans_sc_f --n_trials 20 +``` + +The `find-threshold` CLI can be used with `textcat_multilabel`, `spancat` and +custom components with thresholds that are applied while predicting or scoring. + +### Fuzzy matching {id="fuzzy"} + +New `FUZZY` operators support [fuzzy matching](/usage/rule-based-matching#fuzzy) +with the `Matcher`. By default, the `FUZZY` operator allows a Levenshtein edit +distance of 2 and up to 30% of the pattern string length. `FUZZY1`..`FUZZY9` can +be used to specify the exact number of allowed edits. + +```python +# Match lowercase with fuzzy matching (allows up to 3 edits) +pattern = [{"LOWER": {"FUZZY": "definitely"}}] + +# Match custom attribute values with fuzzy matching (allows up to 3 edits) +pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}] + +# Match with exact Levenshtein edit distance limits (allows up to 4 edits) +pattern = [{"_": {"country": {"FUZZY4": "Kyrgyzstan"}}}] +``` + +Note that `FUZZY` uses Levenshtein edit distance rather than Damerau-Levenshtein +edit distance, so a transposition like `teh` for `the` counts as two edits, one +insertion and one deletion. + +If you'd prefer an alternate fuzzy matching algorithm, you can provide your own +custom method to the `Matcher` or as a config option for an entity ruler and +span ruler. + +### FUZZY and REGEX with lists {id="fuzzy-regex-lists"} + +The `FUZZY` and `REGEX` operators are also now supported for lists with `IN` and +`NOT_IN`: + +```python +pattern = [{"TEXT": {"FUZZY": {"IN": ["awesome", "cool", "wonderful"]}}}] +pattern = [{"TEXT": {"REGEX": {"NOT_IN": ["^awe(some)?$", "^wonder(ful)?"]}}}] +``` + +### Entity linking generalization {id="el"} + +The knowledge base used for entity linking is now easier to customize and has a +new default implementation [`InMemoryLookupKB`](/api/inmemorylookupkb). + +### Additional features and improvements {id="additional-features-and-improvements"} + +- Language updates: + - Extended support for Slovenian + - Fixed lookup fallback for French and Catalan lemmatizers + - Switch Russian and Ukrainian lemmatizers to `pymorphy3` + - Support for editorial punctuation in Ancient Greek + - Update to Russian tokenizer exceptions + - Small fix for Dutch stop words +- Allow up to `typer` v0.7.x, `mypy` 0.990 and `typing_extensions` v4.4.x. +- New `spacy.ConsoleLogger.v3` with expanded progress + [tracking](/api/top-level#ConsoleLogger). +- Improved scoring behavior for `textcat` with `spacy.textcat_scorer.v2` and + `spacy.textcat_multilabel_scorer.v2`. +- Updates so that downstream components can train properly on a frozen `tok2vec` + or `transformer` layer. +- Allow interpolation of variables in directory names in projects. +- Support for local file system [remotes](/usage/projects#remote) for projects. +- Improve UX around `displacy.serve` when the default port is in use. +- Optional `before_update` callback that is invoked at the start of each + [training step](/api/data-formats#config-training). +- Improve performance of `SpanGroup` and fix typing issues for `SpanGroup` and + `Span` objects. +- Patch a + [security vulnerability](https://github.com/advisories/GHSA-gw9q-c7gh-j9vm) in + extracting tar files. +- Add equality definition for `Vectors`. 
+- Ensure `Vocab.to_disk` respects the exclude setting for `lookups` and + `vectors`. +- Correctly handle missing annotations in the edit tree lemmatizer. + +### Trained pipeline updates {id="pipelines"} + +- The CNN pipelines add `IS_SPACE` as a `tok2vec` feature for `tagger` and + `morphologizer` components to improve tagging of non-whitespace vs. whitespace + tokens. +- The transformer pipelines require `spacy-transformers` v1.2, which uses the + exact alignment from `tokenizers` for fast tokenizers instead of the heuristic + alignment from `spacy-alignments`. For all trained pipelines except + `ja_core_news_trf`, the alignments between spaCy tokens and transformer tokens + may be slightly different. More details about the `spacy-transformers` changes + in the + [v1.2.0 release notes](https://github.com/explosion/spacy-transformers/releases/tag/v1.2.0). + +## Notes about upgrading from v3.4 {id="upgrading"} + +### Validation of textcat values {id="textcat-validation"} + +An error is now raised when unsupported values are given as input to train a +`textcat` or `textcat_multilabel` model - ensure that values are `0.0` or `1.0` +as explained in the [docs](/api/textcategorizer#assigned-attributes). + +### Using the default knowledge base + +As `KnowledgeBase` is now an abstract class, you should call the constructor of +the new `InMemoryLookupKB` instead when you want to use spaCy's default KB +implementation: + +```diff +- kb = KnowledgeBase() ++ kb = InMemoryLookupKB() +``` + +If you've written a custom KB that inherits from `KnowledgeBase`, you'll need to +implement its abstract methods, or alternatively inherit from `InMemoryLookupKB` +instead. + +### Updated scorers for tokenization and textcat {id="scores"} + +We fixed a bug that inflated the `token_acc` scores in v3.0-v3.4. The reported +`token_acc` will drop from v3.4 to v3.5, but if `token_p/r/f` stay the same, +your tokenization performance has not changed from v3.4. + +For new `textcat` or `textcat_multilabel` configs, the new default `v2` scorers: + +- ignore `threshold` for `textcat`, so the reported `cats_p/r/f` may increase + slightly in v3.5 even though the underlying predictions are unchanged +- report the performance of only the **final** `textcat` or `textcat_multilabel` + component in the pipeline by default +- allow custom scorers to be used to score multiple `textcat` and + `textcat_multilabel` components with `Scorer.score_cats` by restricting the + evaluation to the component's provided labels + +### Pipeline package version compatibility {id="version-compat"} + +> #### Using legacy implementations +> +> In spaCy v3, you'll still be able to load and reference legacy implementations +> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the +> components or architectures change and newer versions are available in the +> core library. + +When you're loading a pipeline package trained with an earlier version of spaCy +v3, you will see a warning telling you that the pipeline may be incompatible. +This doesn't necessarily have to be true, but we recommend running your +pipelines against your test suite or evaluation data to make sure there are no +unexpected results. + +If you're using one of the [trained pipelines](/models) we provide, you should +run [`spacy download`](/api/cli#download) to update to the latest version. To +see an overview of all installed packages and their compatibility, you can run +[`spacy validate`](/api/cli#validate). 
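As a quick sketch of that check (assuming `en_core_web_sm` is one of your installed pipelines), the update and compatibility overview could look like:

```bash
# Update the pipeline package to the latest compatible version
$ python -m spacy download en_core_web_sm

# List installed pipeline packages and whether they are compatible
# with the currently installed spaCy version
$ python -m spacy validate
```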
+ +If you've trained your own custom pipeline and you've confirmed that it's still +working as expected, you can update the spaCy version requirements in the +[`meta.json`](/api/data-formats#meta): + +```diff +- "spacy_version": ">=3.4.0,<3.5.0", ++ "spacy_version": ">=3.4.0,<3.6.0", +``` + +### Updating v3.4 configs + +To update a config from spaCy v3.4 with the new v3.5 settings, run +[`init fill-config`](/api/cli#init-fill-config): + +```cli +$ python -m spacy init fill-config config-v3.4.cfg config-v3.5.cfg +``` + +In many cases ([`spacy train`](/api/cli#train), +[`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in +automatically, but you'll need to fill in the new settings to run +[`debug config`](/api/cli#debug) and [`debug data`](/api/cli#debug-data). diff --git a/website/docs/usage/visualizers.mdx b/website/docs/usage/visualizers.mdx index f1ff6dd3d..c372744de 100644 --- a/website/docs/usage/visualizers.mdx +++ b/website/docs/usage/visualizers.mdx @@ -58,12 +58,12 @@ arcs. -| Argument | Description | -| --------- | ----------------------------------------------------------------------------------------- | -| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | -| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ | -| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ | -| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | +| Argument | Description | +| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | +| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ | +| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ | +| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | For a list of all available options, see the [`displacy` API documentation](/api/top-level#displacy_options). @@ -437,6 +437,6 @@ Alternatively, if you're using [Streamlit](https://streamlit.io), check out the helps you integrate spaCy visualizations into your apps. It includes a full embedded visualizer, as well as individual components. -![](/images/spacy-streamlit.png) +![Screenshot of the spacy-streamlit package in Streamlit](/images/spacy-streamlit.png) diff --git a/website/meta/languages.json b/website/meta/languages.json index 46c0d3adb..eeb3a74b7 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -165,7 +165,7 @@ "has_examples": true }, { - "code": "is", + "code": "isl", "name": "Icelandic" }, { @@ -434,9 +434,9 @@ ] }, { - "code": "xx", + "code": "mul", "name": "Multi-language", - "models": ["xx_ent_wiki_sm", "xx_sent_ud_sm"], + "models": ["mul_ent_wiki_sm", "mul_sent_ud_sm"], "example": "This is a sentence about Facebook." 
}, { diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 339e4085b..b5c555da6 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -13,7 +13,8 @@ { "text": "New in v3.1", "url": "/usage/v3-1" }, { "text": "New in v3.2", "url": "/usage/v3-2" }, { "text": "New in v3.3", "url": "/usage/v3-3" }, - { "text": "New in v3.4", "url": "/usage/v3-4" } + { "text": "New in v3.4", "url": "/usage/v3-4" }, + { "text": "New in v3.5", "url": "/usage/v3-5" } ] }, { @@ -129,6 +130,7 @@ "items": [ { "text": "Attributes", "url": "/api/attributes" }, { "text": "Corpus", "url": "/api/corpus" }, + { "text": "InMemoryLookupKB", "url": "/api/inmemorylookupkb" }, { "text": "KnowledgeBase", "url": "/api/kb" }, { "text": "Lookups", "url": "/api/lookups" }, { "text": "MorphAnalysis", "url": "/api/morphology#morphanalysis" }, diff --git a/website/meta/site.json b/website/meta/site.json index 5dcb89443..3d4f2d5ee 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -27,7 +27,7 @@ "indexName": "spacy" }, "binderUrl": "explosion/spacy-io-binder", - "binderVersion": "3.4", + "binderVersion": "3.5", "sections": [ { "id": "usage", "title": "Usage Documentation", "theme": "blue" }, { "id": "models", "title": "Models Documentation", "theme": "blue" }, diff --git a/website/meta/universe.json b/website/meta/universe.json index 43a78d609..e35a4f045 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1015,12 +1015,13 @@ "author_links": { "github": "mholtzscher" }, - "category": ["pipeline"] + "category": ["pipeline"], + "spacy_version": 2 }, { "id": "spacy_cld", "title": "spaCy-CLD", - "slogan": "Add language detection to your spaCy pipeline using CLD2", + "slogan": "Add language detection to your spaCy v2 pipeline using CLD2", "description": "spaCy-CLD operates on `Doc` and `Span` spaCy objects. When called on a `Doc` or `Span`, the object is given two attributes: `languages` (a list of up to 3 language codes) and `language_scores` (a dictionary mapping language codes to confidence scores between 0 and 1).\n\nspacy-cld is a little extension that wraps the [PYCLD2](https://github.com/aboSamoor/pycld2) Python library, which in turn wraps the [Compact Language Detector 2](https://github.com/CLD2Owners/cld2) C library originally built at Google for the Chromium project. CLD2 uses character n-grams as features and a Naive Bayes classifier to identify 80+ languages from Unicode text strings (or XML/HTML). It can detect up to 3 different languages in a given document, and reports a confidence score (reported in with each language.", "github": "nickdavidhaynes/spacy-cld", "pip": "spacy_cld", @@ -1040,7 +1041,8 @@ "author_links": { "github": "nickdavidhaynes" }, - "category": ["pipeline"] + "category": ["pipeline"], + "spacy_version": 2 }, { "id": "spacy-iwnlp", @@ -1114,7 +1116,8 @@ "github": "sammous" }, "category": ["pipeline"], - "tags": ["pos", "lemmatizer", "french"] + "tags": ["pos", "lemmatizer", "french"], + "spacy_version": 2 }, { "id": "lemmy", @@ -1308,8 +1311,8 @@ }, { "id": "neuralcoref", - "slogan": "State-of-the-art coreference resolution based on neural nets and spaCy", - "description": "This coreference resolution module is based on the super fast [spaCy](https://spacy.io/) parser and uses the neural net scoring model described in [Deep Reinforcement Learning for Mention-Ranking Coreference Models](http://cs.stanford.edu/people/kevclark/resources/clark-manning-emnlp2016-deep.pdf) by Kevin Clark and Christopher D. Manning, EMNLP 2016. 
Since ✨Neuralcoref v2.0, you can train the coreference resolution system on your own dataset — e.g., another language than English! — **provided you have an annotated dataset**. Note that to use neuralcoref with spaCy > 2.1.0, you'll have to install neuralcoref from source.", + "slogan": "State-of-the-art coreference resolution based on neural nets and spaCy v2", + "description": "This coreference resolution module is based on the super fast spaCy parser and uses the neural net scoring model described in [Deep Reinforcement Learning for Mention-Ranking Coreference Models](http://cs.stanford.edu/people/kevclark/resources/clark-manning-emnlp2016-deep.pdf) by Kevin Clark and Christopher D. Manning, EMNLP 2016. Since ✨Neuralcoref v2.0, you can train the coreference resolution system on your own dataset — e.g., another language than English! — **provided you have an annotated dataset**. Note that to use neuralcoref with spaCy > 2.1.0, you'll have to install neuralcoref from source, and v3+ is not supported.", "github": "huggingface/neuralcoref", "thumb": "https://i.imgur.com/j6FO9O6.jpg", "code_example": [ @@ -1330,7 +1333,8 @@ "github": "huggingface" }, "category": ["standalone", "conversational", "models"], - "tags": ["coref"] + "tags": ["coref"], + "spacy_version": 2 }, { "id": "neuralcoref-vizualizer", @@ -1406,7 +1410,7 @@ "import spacy", "import explacy", "", - "nlp = spacy.load('en')", + "nlp = spacy.load('en_core_web_sm')", "explacy.print_parse_info(nlp, 'The salad was surprisingly tasty.')" ], "author": "Tyler Neylon", @@ -2377,7 +2381,7 @@ "author": "Nikita Kitaev", "author_links": { "github": "nikitakit", - "website": " http://kitaev.io" + "website": "http://kitaev.io" }, "category": ["research", "pipeline"] }, diff --git a/website/pages/_app.tsx b/website/pages/_app.tsx index 8db80a672..a837d9ce8 100644 --- a/website/pages/_app.tsx +++ b/website/pages/_app.tsx @@ -17,7 +17,7 @@ export default function App({ Component, pageProps }: AppProps) { diff --git a/website/pages/index.tsx b/website/pages/index.tsx index 4c0932926..fc0dba378 100644 --- a/website/pages/index.tsx +++ b/website/pages/index.tsx @@ -13,7 +13,7 @@ import { LandingBanner, } from '../src/components/landing' import { H2 } from '../src/components/typography' -import { InlineCode } from '../src/components/code' +import { InlineCode } from '../src/components/inlineCode' import { Ul, Li } from '../src/components/list' import Button from '../src/components/button' import Link from '../src/components/link' @@ -89,8 +89,8 @@ const Landing = () => { - Since its release in 2015, spaCy has become an industry standard with - a huge ecosystem. Choose from a variety of plugins, integrate with your machine + Since its release in 2015, spaCy has become an industry standard with a huge + ecosystem. Choose from a variety of plugins, integrate with your machine learning stack and build custom components and workflows. @@ -162,7 +162,7 @@ const Landing = () => { small >

- + { - +

diff --git a/website/src/components/accordion.js b/website/src/components/accordion.js index 504f415a5..9ff145bd2 100644 --- a/website/src/components/accordion.js +++ b/website/src/components/accordion.js @@ -33,7 +33,7 @@ export default function Accordion({ title, id, expanded = false, spaced = false, event.stopPropagation()} > ¶ diff --git a/website/src/components/card.js b/website/src/components/card.js index 9eb597b7b..ef43eb866 100644 --- a/website/src/components/card.js +++ b/website/src/components/card.js @@ -1,6 +1,7 @@ import React from 'react' import PropTypes from 'prop-types' import classNames from 'classnames' +import ImageNext from 'next/image' import Link from './link' import { H5 } from './typography' @@ -10,7 +11,7 @@ export default function Card({ title, to, image, header, small, onClick, childre return (

{header && ( - + {header} )} @@ -18,18 +19,17 @@ export default function Card({ title, to, image, header, small, onClick, childre
{image && (
- {/* eslint-disable-next-line @next/next/no-img-element */} - +
)} {title && ( - + {title} )}
)} - + {children}
diff --git a/website/src/components/code.js b/website/src/components/code.js index 51067115b..09c2fabfc 100644 --- a/website/src/components/code.js +++ b/website/src/components/code.js @@ -14,96 +14,16 @@ import 'prismjs/components/prism-markdown.min.js' import 'prismjs/components/prism-python.min.js' import 'prismjs/components/prism-yaml.min.js' -import CUSTOM_TYPES from '../../meta/type-annotations.json' -import { isString, htmlToReact } from './util' +import { isString } from './util' import Link, { OptionalLink } from './link' import GitHubCode from './github' -import Juniper from './juniper' import classes from '../styles/code.module.sass' import siteMetadata from '../../meta/site.json' import { binderBranch } from '../../meta/dynamicMeta.mjs' +import dynamic from 'next/dynamic' -const WRAP_THRESHOLD = 30 const CLI_GROUPS = ['init', 'debug', 'project', 'ray', 'huggingface-hub'] -const CodeBlock = (props) => ( -
-        
-    
-) - -export default CodeBlock - -export const Pre = (props) => { - return
{props.children}
-} - -export const InlineCode = ({ wrap = false, className, children, ...props }) => { - const codeClassNames = classNames(classes['inline-code'], className, { - [classes['wrap']]: wrap || (isString(children) && children.length >= WRAP_THRESHOLD), - }) - return ( - - {children} - - ) -} - -InlineCode.propTypes = { - wrap: PropTypes.bool, - className: PropTypes.string, - children: PropTypes.node, -} - -function linkType(el, showLink = true) { - if (!isString(el) || !el.length) return el - const elStr = el.trim() - if (!elStr) return el - const typeUrl = CUSTOM_TYPES[elStr] - const url = typeUrl == true ? DEFAULT_TYPE_URL : typeUrl - const ws = el[0] == ' ' - return url && showLink ? ( - - {ws && ' '} - - {elStr} - - - ) : ( - el - ) -} - -export const TypeAnnotation = ({ lang = 'python', link = true, children }) => { - // Hacky, but we're temporarily replacing a dot to prevent it from being split during highlighting - const TMP_DOT = '۔' - const code = Array.isArray(children) ? children.join('') : children || '' - const [rawText, meta] = code.split(/(?= \(.+\)$)/) - const rawStr = rawText.replace(/\./g, TMP_DOT) - const rawHtml = - lang === 'none' || !code ? code : Prism.highlight(rawStr, Prism.languages[lang], lang) - const html = rawHtml.replace(new RegExp(TMP_DOT, 'g'), '.').replace(/\n/g, ' ') - const result = htmlToReact(html) - const elements = Array.isArray(result) ? result : [result] - const annotClassNames = classNames( - 'type-annotation', - `language-${lang}`, - classes['inline-code'], - classes['type-annotation'], - { - [classes['wrap']]: code.length >= WRAP_THRESHOLD, - } - ) - return ( - - {elements.map((el, i) => ( - {linkType(el, !!link)} - ))} - {meta && {meta}} - - ) -} - const splitLines = (children) => { const listChildrenPerLine = [] @@ -235,7 +155,7 @@ const handlePromot = ({ lineFlat, prompt }) => { {j !== 0 && ' '} - @@ -288,7 +208,7 @@ const addLineHighlight = (children, highlight) => { }) } -export const CodeHighlighted = ({ children, highlight, lang }) => { +const CodeHighlighted = ({ children, highlight, lang }) => { const [html, setHtml] = useState() useEffect( @@ -305,7 +225,7 @@ export const CodeHighlighted = ({ children, highlight, lang }) => { return <>{html} } -export class Code extends React.Component { +export default class Code extends React.Component { static defaultProps = { lang: 'none', executable: null, @@ -354,6 +274,8 @@ export class Code extends React.Component { } } +const JuniperDynamic = dynamic(() => import('./juniper')) + const JuniperWrapper = ({ title, lang, children }) => { const { binderUrl, binderVersion } = siteMetadata const juniperTitle = title || 'Editable Code' @@ -363,13 +285,13 @@ const JuniperWrapper = ({ title, lang, children }) => { {juniperTitle} spaCy v{binderVersion} · Python 3 · via{' '} - + Binder - { }} > {children} - + ) } diff --git a/website/src/components/codeBlock.js b/website/src/components/codeBlock.js new file mode 100644 index 000000000..d990b93dd --- /dev/null +++ b/website/src/components/codeBlock.js @@ -0,0 +1,14 @@ +import React from 'react' +import Code from './codeDynamic' +import classes from '../styles/code.module.sass' + +export const Pre = (props) => { + return
{props.children}
+} + +const CodeBlock = (props) => ( +
+        
+    
+) +export default CodeBlock diff --git a/website/src/components/codeDynamic.js b/website/src/components/codeDynamic.js new file mode 100644 index 000000000..8c9483567 --- /dev/null +++ b/website/src/components/codeDynamic.js @@ -0,0 +1,5 @@ +import dynamic from 'next/dynamic' + +export default dynamic(() => import('./code'), { + loading: () =>
Loading...
, +}) diff --git a/website/src/components/copy.js b/website/src/components/copy.js index 4caabac98..bc7327115 100644 --- a/website/src/components/copy.js +++ b/website/src/components/copy.js @@ -14,7 +14,7 @@ export function copyToClipboard(ref, callback) { } } -export default function CopyInput({ text, prefix }) { +export default function CopyInput({ text, description, prefix }) { const isClient = typeof window !== 'undefined' const [supportsCopy, setSupportsCopy] = useState(false) @@ -41,6 +41,7 @@ export default function CopyInput({ text, prefix }) { defaultValue={text} rows={1} onClick={selectText} + aria-label={description} /> {supportsCopy && (