diff --git a/.gitignore b/.gitignore index 087163761..4dbcd67f7 100644 --- a/.gitignore +++ b/.gitignore @@ -18,8 +18,7 @@ website/.npm website/logs *.log npm-debug.log* -website/www/ -website/_deploy.sh +quickstart-training-generator.js # Cython / C extensions cythonize.json diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6b7881dd2..0abde2abf 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -5,7 +5,7 @@ Thanks for your interest in contributing to spaCy 🎉 The project is maintained by [@honnibal](https://github.com/honnibal) and [@ines](https://github.com/ines), and we'll do our best to help you get started. This page will give you a quick -overview of how things are organised and most importantly, how to get involved. +overview of how things are organized and most importantly, how to get involved. ## Table of contents @@ -43,33 +43,33 @@ can also submit a [regression test](#fixing-bugs) straight away. When you're opening an issue to report the bug, simply refer to your pull request in the issue body. A few more tips: -- **Describing your issue:** Try to provide as many details as possible. What - exactly goes wrong? _How_ is it failing? Is there an error? - "XY doesn't work" usually isn't that helpful for tracking down problems. Always - remember to include the code you ran and if possible, extract only the relevant - parts and don't just dump your entire script. This will make it easier for us to - reproduce the error. +- **Describing your issue:** Try to provide as many details as possible. What + exactly goes wrong? _How_ is it failing? Is there an error? + "XY doesn't work" usually isn't that helpful for tracking down problems. Always + remember to include the code you ran and if possible, extract only the relevant + parts and don't just dump your entire script. This will make it easier for us to + reproduce the error. 
-- **Getting info about your spaCy installation and environment:** If you're - using spaCy v1.7+, you can use the command line interface to print details and - even format them as Markdown to copy-paste into GitHub issues: - `python -m spacy info --markdown`. +- **Getting info about your spaCy installation and environment:** If you're + using spaCy v1.7+, you can use the command line interface to print details and + even format them as Markdown to copy-paste into GitHub issues: + `python -m spacy info --markdown`. -- **Checking the model compatibility:** If you're having problems with a - [statistical model](https://spacy.io/models), it may be because the - model is incompatible with your spaCy installation. In spaCy v2.0+, you can check - this on the command line by running `python -m spacy validate`. +- **Checking the model compatibility:** If you're having problems with a + [statistical model](https://spacy.io/models), it may be because the + model is incompatible with your spaCy installation. In spaCy v2.0+, you can check + this on the command line by running `python -m spacy validate`. -- **Sharing a model's output, like dependencies and entities:** spaCy v2.0+ - comes with [built-in visualizers](https://spacy.io/usage/visualizers) that - you can run from within your script or a Jupyter notebook. For some issues, it's - helpful to **include a screenshot** of the visualization. You can simply drag and - drop the image into GitHub's editor and it will be uploaded and included. +- **Sharing a model's output, like dependencies and entities:** spaCy v2.0+ + comes with [built-in visualizers](https://spacy.io/usage/visualizers) that + you can run from within your script or a Jupyter notebook. For some issues, it's + helpful to **include a screenshot** of the visualization. You can simply drag and + drop the image into GitHub's editor and it will be uploaded and included. 
-- **Sharing long blocks of code or logs:** If you need to include long code, - logs or tracebacks, you can wrap them in `
<details>` and `</details>
`. This - [collapses the content](https://developer.mozilla.org/en/docs/Web/HTML/Element/details) - so it only becomes visible on click, making the issue easier to read and follow. +- **Sharing long blocks of code or logs:** If you need to include long code, + logs or tracebacks, you can wrap them in `
<details>` and `</details>
`. This + [collapses the content](https://developer.mozilla.org/en/docs/Web/HTML/Element/details) + so it only becomes visible on click, making the issue easier to read and follow. ### Issue labels @@ -94,39 +94,39 @@ shipped in the core library, and what could be provided in other packages. Our philosophy is to prefer a smaller core library. We generally ask the following questions: -- **What would this feature look like if implemented in a separate package?** - Some features would be very difficult to implement externally – for example, - changes to spaCy's built-in methods. In contrast, a library of word - alignment functions could easily live as a separate package that depended on - spaCy — there's little difference between writing `import word_aligner` and - `import spacy.word_aligner`. spaCy v2.0+ makes it easy to implement - [custom pipeline components](https://spacy.io/usage/processing-pipelines#custom-components), - and add your own attributes, properties and methods to the `Doc`, `Token` and - `Span`. If you're looking to implement a new spaCy feature, starting with a - custom component package is usually the best strategy. You won't have to worry - about spaCy's internals and you can test your module in an isolated - environment. And if it works well, we can always integrate it into the core - library later. +- **What would this feature look like if implemented in a separate package?** + Some features would be very difficult to implement externally – for example, + changes to spaCy's built-in methods. In contrast, a library of word + alignment functions could easily live as a separate package that depended on + spaCy — there's little difference between writing `import word_aligner` and + `import spacy.word_aligner`. spaCy v2.0+ makes it easy to implement + [custom pipeline components](https://spacy.io/usage/processing-pipelines#custom-components), + and add your own attributes, properties and methods to the `Doc`, `Token` and + `Span`. 
If you're looking to implement a new spaCy feature, starting with a + custom component package is usually the best strategy. You won't have to worry + about spaCy's internals and you can test your module in an isolated + environment. And if it works well, we can always integrate it into the core + library later. -- **Would the feature be easier to implement if it relied on "heavy" dependencies spaCy doesn't currently require?** - Python has a very rich ecosystem. Libraries like scikit-learn, SciPy, Gensim or - TensorFlow/Keras do lots of useful things — but we don't want to have them as - dependencies. If the feature requires functionality in one of these libraries, - it's probably better to break it out into a different package. +- **Would the feature be easier to implement if it relied on "heavy" dependencies spaCy doesn't currently require?** + Python has a very rich ecosystem. Libraries like scikit-learn, SciPy, Gensim or + TensorFlow/Keras do lots of useful things — but we don't want to have them as + dependencies. If the feature requires functionality in one of these libraries, + it's probably better to break it out into a different package. -- **Is the feature orthogonal to the current spaCy functionality, or overlapping?** - spaCy strongly prefers to avoid having 6 different ways of doing the same thing. - As better techniques are developed, we prefer to drop support for "the old way". - However, it's rare that one approach _entirely_ dominates another. It's very - common that there's still a use-case for the "obsolete" approach. For instance, - [WordNet](https://wordnet.princeton.edu/) is still very useful — but word - vectors are better for most use-cases, and the two approaches to lexical - semantics do a lot of the same things. spaCy therefore only supports word - vectors, and support for WordNet is currently left for other packages. 
+- **Is the feature orthogonal to the current spaCy functionality, or overlapping?** + spaCy strongly prefers to avoid having 6 different ways of doing the same thing. + As better techniques are developed, we prefer to drop support for "the old way". + However, it's rare that one approach _entirely_ dominates another. It's very + common that there's still a use-case for the "obsolete" approach. For instance, + [WordNet](https://wordnet.princeton.edu/) is still very useful — but word + vectors are better for most use-cases, and the two approaches to lexical + semantics do a lot of the same things. spaCy therefore only supports word + vectors, and support for WordNet is currently left for other packages. -- **Do you need the feature to get basic things done?** We do want spaCy to be - at least somewhat self-contained. If we keep needing some feature in our - recipes, that does provide some argument for bringing it "in house". +- **Do you need the feature to get basic things done?** We do want spaCy to be + at least somewhat self-contained. If we keep needing some feature in our + recipes, that does provide some argument for bringing it "in house". ### Getting started @@ -195,7 +195,7 @@ modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.** ### Code formatting [`black`](https://github.com/ambv/black) is an opinionated Python code -formatter, optimised to produce readable code and small diffs. You can run +formatter, optimized to produce readable code and small diffs. You can run `black` from the command-line, or via your code editor. 
For example, if you're using [Visual Studio Code](https://code.visualstudio.com/), you can add the following to your `settings.json` to use `black` for formatting and auto-format @@ -203,10 +203,10 @@ your files on save: ```json { - "python.formatting.provider": "black", - "[python]": { - "editor.formatOnSave": true - } + "python.formatting.provider": "black", + "[python]": { + "editor.formatOnSave": true + } } ``` @@ -216,7 +216,7 @@ list of available editor integrations. #### Disabling formatting There are a few cases where auto-formatting doesn't improve readability – for -example, in some of the the language data files like the `tag_map.py`, or in +example, in some of the language data files like the `tag_map.py`, or in the tests that construct `Doc` objects from lists of words and other labels. Wrapping a block in `# fmt: off` and `# fmt: on` lets you disable formatting for that particular code. Here's an example: @@ -286,7 +286,7 @@ Code that interacts with the file-system should accept objects that follow the If the function is user-facing and takes a path as an argument, it should check whether the path is provided as a string. Strings should be converted to `pathlib.Path` objects. Serialization and deserialization functions should always -accept **file-like objects**, as it makes the library io-agnostic. Working on +accept **file-like objects**, as it makes the library IO-agnostic. Working on buffers makes the code more general, easier to test, and compatible with Python 3's asynchronous IO. @@ -384,7 +384,7 @@ of Python and C++, with additional complexity and syntax from numpy. The many "traps for new players". Working in Cython is very rewarding once you're over the initial learning curve. As with C and C++, the first way you write something in Cython will often be the performance-optimal approach. In contrast, -Python optimisation generally requires a lot of experimentation. 
Is it faster to +Python optimization generally requires a lot of experimentation. Is it faster to have an `if item in my_dict` check, or to use `.get()`? What about `try`/`except`? Does this numpy operation create a copy? There's no way to guess the answers to these questions, and you'll usually be dissatisfied with your results — so @@ -397,10 +397,10 @@ Python. If it's not fast enough the first time, just switch to Cython. ### Resources to get you started -- [PEP 8 Style Guide for Python Code](https://www.python.org/dev/peps/pep-0008/) (python.org) -- [Official Cython documentation](http://docs.cython.org/en/latest/) (cython.org) -- [Writing C in Cython](https://explosion.ai/blog/writing-c-in-cython) (explosion.ai) -- [Multi-threading spaCy’s parser and named entity recogniser](https://explosion.ai/blog/multithreading-with-cython) (explosion.ai) +- [PEP 8 Style Guide for Python Code](https://www.python.org/dev/peps/pep-0008/) (python.org) +- [Official Cython documentation](http://docs.cython.org/en/latest/) (cython.org) +- [Writing C in Cython](https://explosion.ai/blog/writing-c-in-cython) (explosion.ai) +- [Multi-threading spaCy’s parser and named entity recognizer](https://explosion.ai/blog/multithreading-with-cython) (explosion.ai) ## Adding tests @@ -412,7 +412,7 @@ name. For example, tests for the `Tokenizer` can be found in all test files and test functions need to be prefixed with `test_`. When adding tests, make sure to use descriptive names, keep the code short and -concise and only test for one behaviour at a time. Try to `parametrize` test +concise and only test for one behavior at a time. Try to `parametrize` test cases wherever possible, use our pre-defined fixtures for spaCy components and avoid unnecessary imports. @@ -440,25 +440,25 @@ simply click on the "Suggest edits" button at the bottom of a page. 
We're very excited about all the new possibilities for **community extensions** and plugins in spaCy v2.0, and we can't wait to see what you build with it! -- An extension or plugin should add substantial functionality, be - **well-documented** and **open-source**. It should be available for users to download - and install as a Python package – for example via [PyPi](http://pypi.python.org). +- An extension or plugin should add substantial functionality, be + **well-documented** and **open-source**. It should be available for users to download + and install as a Python package – for example via [PyPi](http://pypi.python.org). -- Extensions that write to `Doc`, `Token` or `Span` attributes should be wrapped - as [pipeline components](https://spacy.io/usage/processing-pipelines#custom-components) - that users can **add to their processing pipeline** using `nlp.add_pipe()`. +- Extensions that write to `Doc`, `Token` or `Span` attributes should be wrapped + as [pipeline components](https://spacy.io/usage/processing-pipelines#custom-components) + that users can **add to their processing pipeline** using `nlp.add_pipe()`. -- When publishing your extension on GitHub, **tag it** with the topics - [`spacy`](https://github.com/topics/spacy?o=desc&s=stars) and - [`spacy-extensions`](https://github.com/topics/spacy-extension?o=desc&s=stars) - to make it easier to find. Those are also the topics we're linking to from the - spaCy website. If you're sharing your project on Twitter, feel free to tag - [@spacy_io](https://twitter.com/spacy_io) so we can check it out. +- When publishing your extension on GitHub, **tag it** with the topics + [`spacy`](https://github.com/topics/spacy?o=desc&s=stars) and + [`spacy-extensions`](https://github.com/topics/spacy-extension?o=desc&s=stars) + to make it easier to find. Those are also the topics we're linking to from the + spaCy website. 
If you're sharing your project on Twitter, feel free to tag + [@spacy_io](https://twitter.com/spacy_io) so we can check it out. -- Once your extension is published, you can open an issue on the - [issue tracker](https://github.com/explosion/spacy/issues) to suggest it for the - [resources directory](https://spacy.io/usage/resources#extensions) on the - website. +- Once your extension is published, you can open an issue on the + [issue tracker](https://github.com/explosion/spacy/issues) to suggest it for the + [resources directory](https://spacy.io/usage/resources#extensions) on the + website. 📖 **For more tips and best practices, see the [checklist for developing spaCy extensions](https://spacy.io/usage/processing-pipelines#extensions).** diff --git a/MANIFEST.in b/MANIFEST.in index ef42138f1..b4887cdb8 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -5,5 +5,5 @@ include README.md include pyproject.toml recursive-exclude spacy/lang *.json recursive-include spacy/lang *.json.gz -recursive-include spacy/cli *.json +recursive-include spacy/cli *.json *.yml recursive-include licenses * diff --git a/README.md b/README.md index 1fece1e5a..cef2a1fdd 100644 --- a/README.md +++ b/README.md @@ -49,9 +49,8 @@ It's commercial open-source software, released under the MIT license. ## 💬 Where to ask questions -The spaCy project is maintained by [@honnibal](https://github.com/honnibal) and -[@ines](https://github.com/ines), along with core contributors -[@svlandeg](https://github.com/svlandeg) and +The spaCy project is maintained by [@honnibal](https://github.com/honnibal), +[@ines](https://github.com/ines), [@svlandeg](https://github.com/svlandeg) and [@adrianeboyd](https://github.com/adrianeboyd). Please understand that we won't be able to provide individual support via email. 
We also believe that help is much more valuable if it's shared publicly, so that more people can benefit from diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py index 11ad564ec..362057b37 100644 --- a/bin/ud/ud_train.py +++ b/bin/ud/ud_train.py @@ -15,7 +15,8 @@ import spacy.util from bin.ud import conll17_ud_eval from spacy.tokens import Token, Doc from spacy.gold import Example -from spacy.util import compounding, minibatch, minibatch_by_words +from spacy.util import compounding, minibatch +from spacy.gold.batchers import minibatch_by_words from spacy.pipeline._parser_internals.nonproj import projectivize from spacy.matcher import Matcher from spacy import displacy diff --git a/examples/training/create_kb.py b/examples/training/create_kb.py index 0c6e29226..a455c8d7e 100644 --- a/examples/training/create_kb.py +++ b/examples/training/create_kb.py @@ -48,8 +48,7 @@ def main(model, output_dir=None): # You can change the dimension of vectors in your KB by using an encoder that changes the dimensionality. # For simplicity, we'll just use the original vector dimension here instead. 
vectors_dim = nlp.vocab.vectors.shape[1] - kb = KnowledgeBase(entity_vector_length=vectors_dim) - kb.initialize(nlp.vocab) + kb = KnowledgeBase(nlp.vocab, entity_vector_length=vectors_dim) # set up the data entity_ids = [] @@ -81,7 +80,7 @@ def main(model, output_dir=None): if not output_dir.exists(): output_dir.mkdir() kb_path = str(output_dir / "kb") - kb.dump(kb_path) + kb.to_disk(kb_path) print() print("Saved KB to", kb_path) @@ -96,9 +95,8 @@ def main(model, output_dir=None): print("Loading vocab from", vocab_path) print("Loading KB from", kb_path) vocab2 = Vocab().from_disk(vocab_path) - kb2 = KnowledgeBase(entity_vector_length=1) - kb.initialize(vocab2) - kb2.load_bulk(kb_path) + kb2 = KnowledgeBase(vocab2, entity_vector_length=1) + kb2.from_disk(kb_path) print() _print_kb(kb2) diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py index 8a69ae39c..d2bd61e5b 100644 --- a/examples/training/train_entity_linker.py +++ b/examples/training/train_entity_linker.py @@ -83,7 +83,7 @@ def main(kb_path, vocab_path, output_dir=None, n_iter=50): if "entity_linker" not in nlp.pipe_names: print("Loading Knowledge Base from '%s'" % kb_path) cfg = { - "kb": { + "kb_loader": { "@assets": "spacy.KBFromFile.v1", "vocab_path": vocab_path, "kb_path": kb_path, diff --git a/netlify.toml b/netlify.toml index 6afa5ed7e..2f3e350e6 100644 --- a/netlify.toml +++ b/netlify.toml @@ -36,11 +36,11 @@ redirects = [ {from = "/docs/api/features", to = "/models/#architecture", force = true}, {from = "/docs/api/philosophy", to = "/usage/spacy-101", force = true}, {from = "/docs/usage/showcase", to = "/universe", force = true}, - {from = "/tutorials/load-new-word-vectors", to = "/usage/vectors-similarity#custom", force = true}, + {from = "/tutorials/load-new-word-vectors", to = "/usage/linguistic-features", force = true}, {from = "/tutorials", to = "/usage/examples", force = true}, # Old documentation pages (v2.x) {from = "/usage/adding-languages", to = 
"/usage/linguistic-features", force = true}, - {from = "/usage/vectors-similarity", to = "/usage/vectors-embeddings", force = true}, + {from = "/usage/vectors-similarity", to = "/usage/linguistic-features#vectors-similarity", force = true}, {from = "/api/goldparse", to = "/api/top-level", force = true}, {from = "/api/goldcorpus", to = "/api/corpus", force = true}, {from = "/api/annotation", to = "/api/data-formats", force = true}, diff --git a/pyproject.toml b/pyproject.toml index 1b4972bd5..77deb44b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,9 +6,10 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.0.0a27,<8.0.0a30", + "thinc>=8.0.0a29,<8.0.0a40", "blis>=0.4.0,<0.5.0", "pytokenizations", - "smart_open>=2.0.0,<3.0.0" + "smart_open>=2.0.0,<3.0.0", + "pathy" ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index b4901a692..5aafd83dd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.0.0a27,<8.0.0a30 +thinc>=8.0.0a29,<8.0.0a40 blis>=0.4.0,<0.5.0 ml_datasets>=0.1.1 murmurhash>=0.28.0,<1.1.0 @@ -9,6 +9,7 @@ wasabi>=0.7.1,<1.1.0 srsly>=2.1.0,<3.0.0 catalogue>=0.0.7,<1.1.0 typer>=0.3.0,<0.4.0 +pathy # Third party dependencies numpy>=1.15.0 requests>=2.13.0,<3.0.0 diff --git a/setup.cfg b/setup.cfg index a34c34e23..8b4819ed8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,18 +34,19 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.0.0a27,<8.0.0a30 + thinc>=8.0.0a29,<8.0.0a40 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.0a27,<8.0.0a30 + thinc>=8.0.0a29,<8.0.0a40 blis>=0.4.0,<0.5.0 wasabi>=0.7.1,<1.1.0 srsly>=2.1.0,<3.0.0 catalogue>=0.0.7,<1.1.0 typer>=0.3.0,<0.4.0 + pathy # Third-party dependencies tqdm>=4.38.0,<5.0.0 numpy>=1.15.0 diff --git a/spacy/about.py 
b/spacy/about.py index 5ed46bbe4..da3e32805 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a7" +__version__ = "3.0.0a10" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 2b21e2f2b..4c3adc5d3 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -21,6 +21,8 @@ from .project.clone import project_clone # noqa: F401 from .project.assets import project_assets # noqa: F401 from .project.run import project_run # noqa: F401 from .project.dvc import project_update_dvc # noqa: F401 +from .project.push import project_push # noqa: F401 +from .project.pull import project_pull # noqa: F401 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 5613fa317..b527ac2a0 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -1,4 +1,5 @@ -from typing import Dict, Any, Union, List, Optional +from typing import Dict, Any, Union, List, Optional, TYPE_CHECKING +import sys from pathlib import Path from wasabi import msg import srsly @@ -8,11 +9,13 @@ from typer.main import get_command from contextlib import contextmanager from thinc.config import Config, ConfigValidationError from configparser import InterpolationError -import sys from ..schemas import ProjectConfigSchema, validate from ..util import import_file +if TYPE_CHECKING: + from pathy import Pathy # noqa: F401 + PROJECT_FILE = "project.yml" PROJECT_LOCK = "project.lock" @@ -68,11 +71,12 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]: opt = args.pop(0) err = f"Invalid CLI argument '{opt}'" if opt.startswith("--"): # new argument - opt = opt.replace("--", "").replace("-", "_") + opt = opt.replace("--", "") if "." 
not in opt: msg.fail(f"{err}: can't override top-level section", exits=1) if "=" in opt: # we have --opt=value opt, value = opt.split("=", 1) + opt = opt.replace("-", "_") else: if not args or args[0].startswith("--"): # flag with no value value = "true" @@ -92,11 +96,12 @@ def parse_config_overrides(args: List[str]) -> Dict[str, Any]: return result -def load_project_config(path: Path) -> Dict[str, Any]: +def load_project_config(path: Path, interpolate: bool = True) -> Dict[str, Any]: """Load the project.yml file from a directory and validate it. Also make sure that all directories defined in the config exist. path (Path): The path to the project directory. + interpolate (bool): Whether to substitute project variables. RETURNS (Dict[str, Any]): The loaded project.yml. """ config_path = path / PROJECT_FILE @@ -109,16 +114,34 @@ def load_project_config(path: Path) -> Dict[str, Any]: msg.fail(invalid_err, e, exits=1) errors = validate(ProjectConfigSchema, config) if errors: - msg.fail(invalid_err, "\n".join(errors), exits=1) + msg.fail(invalid_err) + print("\n".join(errors)) + sys.exit(1) validate_project_commands(config) # Make sure directories defined in config exist for subdir in config.get("directories", []): dir_path = path / subdir if not dir_path.exists(): dir_path.mkdir(parents=True) + if interpolate: + err = "project.yml validation error" + with show_validation_error(title=err, hint_fill=False): + config = substitute_project_variables(config) return config +def substitute_project_variables(config: Dict[str, Any], overrides: Dict = {}): + key = "vars" + config.setdefault(key, {}) + config[key].update(overrides) + # Need to put variables in the top scope again so we can have a top-level + # section "project" (otherwise, a list of commands in the top scope wouldn't) + # be allowed by Thinc's config system + cfg = Config({"project": config, key: config[key]}) + interpolated = cfg.interpolate() + return dict(interpolated["project"]) + + def 
validate_project_commands(config: Dict[str, Any]) -> None: """Check that project commands and workflows are valid, don't contain duplicates, don't clash and only refer to commands that exist. @@ -229,3 +252,39 @@ def get_sourced_components(config: Union[Dict[str, Any], Config]) -> List[str]: for name, cfg in config.get("components", {}).items() if "factory" not in cfg and "source" in cfg ] + + +def upload_file(src: Path, dest: Union[str, "Pathy"]) -> None: + """Upload a file. + + src (Path): The source path. + dest (Union[str, Pathy]): The destination path or URL to upload to. + """ + dest = ensure_pathy(dest) + with dest.open(mode="wb") as output_file: + with src.open(mode="rb") as input_file: + output_file.write(input_file.read()) + + +def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None: + """Download a file using smart_open. + + src (Union[str, Pathy]): The source path or URL of the file. + dest (Path): The destination path. + force (bool): Whether to force download even if file exists. + If False, the download will be skipped. 
+ """ + if dest.exists() and not force: + return None + src = ensure_pathy(src) + with src.open(mode="rb") as input_file: + with dest.open(mode="wb") as output_file: + output_file.write(input_file.read()) + + +def ensure_pathy(path): + """Temporary helper to prevent importing Pathy globally (which can cause + slow and annoying Google Cloud warning).""" + from pathy import Pathy # noqa: F811 + + return Pathy(path) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 27cf033c4..b23705311 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -3,7 +3,7 @@ from pathlib import Path from collections import Counter import sys import srsly -from wasabi import Printer, MESSAGES, msg, diff_strings +from wasabi import Printer, MESSAGES, msg import typer from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides @@ -32,8 +32,6 @@ def debug_config_cli( ctx: typer.Context, # This is only used to read additional arguments config_path: Path = Arg(..., help="Path to config file", exists=True), code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - auto_fill: bool = Opt(False, "--auto-fill", "-F", help="Whether or not to auto-fill the config with built-in defaults if possible"), - diff: bool = Opt(False, "--diff", "-D", help="Show a visual diff if config was auto-filled") # fmt: on ): """Debug a config.cfg file and show validation errors. 
The command will @@ -49,18 +47,8 @@ def debug_config_cli( import_code(code_path) with show_validation_error(config_path): config = util.load_config(config_path, overrides=overrides) - nlp, _ = util.load_model_from_config(config, auto_fill=auto_fill) - if auto_fill: - orig_config = config.to_str() - filled_config = nlp.config.to_str() - if orig_config == filled_config: - msg.good("Original config is valid, no values were auto-filled") - else: - msg.good("Auto-filled config is valid") - if diff: - print(diff_strings(config.to_str(), nlp.config.to_str())) - else: - msg.good("Original config is valid") + nlp, _ = util.load_model_from_config(config) + msg.good("Original config is valid") @debug_cli.command( diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index cf8f513fc..3847c74f3 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -70,7 +70,7 @@ def evaluate( corpus = Corpus(data_path, gold_preproc=gold_preproc) nlp = util.load_model(model) dev_dataset = list(corpus(nlp)) - scores = nlp.evaluate(dev_dataset, verbose=False) + scores = nlp.evaluate(dev_dataset) metrics = { "TOK": "token_acc", "TAG": "tag_acc", diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py index 7d80eb289..94e0bd6fc 100644 --- a/spacy/cli/init_config.py +++ b/spacy/cli/init_config.py @@ -3,17 +3,17 @@ from enum import Enum from pathlib import Path from wasabi import Printer, diff_strings from thinc.api import Config -from pydantic import BaseModel import srsly import re from .. 
import util +from ..schemas import RecommendationSchema from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND -TEMPLATE_ROOT = Path(__file__).parent / "templates" -TEMPLATE_PATH = TEMPLATE_ROOT / "quickstart_training.jinja" -RECOMMENDATIONS_PATH = TEMPLATE_ROOT / "quickstart_training_recommendations.json" +ROOT = Path(__file__).parent / "templates" +TEMPLATE_PATH = ROOT / "quickstart_training.jinja" +RECOMMENDATIONS = srsly.read_yaml(ROOT / "quickstart_training_recommendations.yml") class Optimizations(str, Enum): @@ -21,25 +21,10 @@ class Optimizations(str, Enum): accuracy = "accuracy" -class RecommendationsTrfItem(BaseModel): - name: str - size_factor: int - - -class RecommendationsTrf(BaseModel): - efficiency: RecommendationsTrfItem - accuracy: RecommendationsTrfItem - - -class RecommendationSchema(BaseModel): - word_vectors: Optional[str] = None - transformer: Optional[RecommendationsTrf] = None - - @init_cli.command("config") def init_config_cli( # fmt: off - output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True), + output_file: Path = Arg(..., help="File to save config.cfg to or - for stdout (will only output config and no additional logging info)", allow_dash=True), lang: Optional[str] = Opt("en", "--lang", "-l", help="Two-letter code of the language to use"), pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include in the model (without 'tok2vec' or 'transformer')"), optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). 
This will impact the choice of architecture, pretrained weights and related hyperparameters."), @@ -111,14 +96,11 @@ def init_config( from jinja2 import Template except ImportError: msg.fail("This command requires jinja2", "pip install jinja2", exits=1) - recommendations = srsly.read_json(RECOMMENDATIONS_PATH) - lang_defaults = util.get_lang_class(lang).Defaults - has_letters = lang_defaults.writing_system.get("has_letters", True) - # Filter out duplicates since tok2vec and transformer are added by template - pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")] - reco = RecommendationSchema(**recommendations.get(lang, {})).dict() with TEMPLATE_PATH.open("r") as f: template = Template(f.read()) + # Filter out duplicates since tok2vec and transformer are added by template + pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")] + reco = RecommendationSchema(**RECOMMENDATIONS.get(lang, {})).dict() variables = { "lang": lang, "components": pipeline, @@ -126,8 +108,15 @@ def init_config( "hardware": "cpu" if cpu else "gpu", "transformer_data": reco["transformer"], "word_vectors": reco["word_vectors"], - "has_letters": has_letters, + "has_letters": reco["has_letters"], } + if variables["transformer_data"] and not has_spacy_transformers(): + msg.warn( + "To generate a more effective transformer-based config (GPU-only), " + "install the spacy-transformers package and re-run this command. " + "The config generated now does not use transformers." 
+ ) + variables["transformer_data"] = None base_template = template.render(variables).strip() # Giving up on getting the newlines right in jinja for now base_template = re.sub(r"\n\n\n+", "\n\n", base_template) @@ -144,8 +133,6 @@ def init_config( for label, value in use_case.items(): msg.text(f"- {label}: {value}") use_transformer = bool(template_vars.use_transformer) - if use_transformer: - require_spacy_transformers(msg) with show_validation_error(hint_fill=False): config = util.load_config_from_str(base_template) nlp, _ = util.load_model_from_config(config, auto_fill=True) @@ -167,12 +154,10 @@ def save_config(config: Config, output_file: Path, is_stdout: bool = False) -> N print(f"{COMMAND} train {output_file.parts[-1]} {' '.join(variables)}") -def require_spacy_transformers(msg: Printer) -> None: +def has_spacy_transformers() -> bool: try: import spacy_transformers # noqa: F401 + + return True except ImportError: - msg.fail( - "Using a transformer-based pipeline requires spacy-transformers " - "to be installed.", - exits=1, - ) + return False diff --git a/spacy/cli/package.py b/spacy/cli/package.py index a1162f3e1..523e8a99a 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -229,6 +229,7 @@ if __name__ == '__main__': TEMPLATE_MANIFEST = """ include meta.json +include config.cfg """.strip() diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py index 3be784e04..60cf95160 100644 --- a/spacy/cli/project/assets.py +++ b/spacy/cli/project/assets.py @@ -4,10 +4,10 @@ from wasabi import msg import re import shutil import requests -import smart_open from ...util import ensure_path, working_dir from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum +from .._util import download_file # TODO: find a solution for caches @@ -44,16 +44,14 @@ def project_assets(project_dir: Path) -> None: if not assets: msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0) msg.info(f"Fetching {len(assets)} asset(s)") - 
variables = config.get("variables", {}) for asset in assets: - dest = asset["dest"].format(**variables) + dest = asset["dest"] url = asset.get("url") checksum = asset.get("checksum") if not url: # project.yml defines asset without URL that the user has to place check_private_asset(dest, checksum) continue - url = url.format(**variables) fetch_asset(project_path, url, dest, checksum) @@ -132,15 +130,3 @@ def convert_asset_url(url: str) -> str: ) return converted return url - - -def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None: - """Download a file using smart_open. - - url (str): The URL of the file. - dest (Path): The destination path. - chunk_size (int): The size of chunks to read/write. - """ - with smart_open.open(url, mode="rb") as input_file: - with dest.open(mode="wb") as output_file: - output_file.write(input_file.read()) diff --git a/spacy/cli/project/dvc.py b/spacy/cli/project/dvc.py index 7386339d9..e0f6cd430 100644 --- a/spacy/cli/project/dvc.py +++ b/spacy/cli/project/dvc.py @@ -99,7 +99,6 @@ def update_dvc_config( if ref_hash == config_hash and not force: return False # Nothing has changed in project.yml, don't need to update dvc_config_path.unlink() - variables = config.get("variables", {}) dvc_commands = [] config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} for name in workflows[workflow]: @@ -122,7 +121,7 @@ def update_dvc_config( dvc_commands.append(join_command(full_cmd)) with working_dir(path): dvc_flags = {"--verbose": verbose, "--quiet": silent} - run_dvc_commands(dvc_commands, variables, flags=dvc_flags) + run_dvc_commands(dvc_commands, flags=dvc_flags) with dvc_config_path.open("r+", encoding="utf8") as f: content = f.read() f.seek(0, 0) @@ -131,23 +130,16 @@ def update_dvc_config( def run_dvc_commands( - commands: List[str] = tuple(), - variables: Dict[str, str] = {}, - flags: Dict[str, bool] = {}, + commands: List[str] = tuple(), flags: Dict[str, bool] = {}, ) -> None: """Run a sequence of DVC 
commands in a subprocess, in order. commands (List[str]): The string commands without the leading "dvc". - variables (Dict[str, str]): Dictionary of variable names, mapped to their - values. Will be used to substitute format string variables in the - commands. flags (Dict[str, bool]): Conditional flags to be added to command. Makes it easier to pass flags like --quiet that depend on a variable or command-line setting while avoiding lots of nested conditionals. """ for command in commands: - # Substitute variables, e.g. "./{NAME}.json" - command = command.format(**variables) command = split_command(command) dvc_command = ["dvc", *command] # Add the flags if they are set to True diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py new file mode 100644 index 000000000..73cb46bb7 --- /dev/null +++ b/spacy/cli/project/pull.py @@ -0,0 +1,38 @@ +from pathlib import Path +from wasabi import msg +from .remote_storage import RemoteStorage +from .remote_storage import get_command_hash +from .._util import project_cli, Arg +from .._util import load_project_config + + +@project_cli.command("pull") +def project_pull_cli( + # fmt: off + remote: str = Arg("default", help="Name or path of remote storage"), + project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), + # fmt: on +): + """Retrieve any precomputed outputs from a remote storage that are available. + You can alias remotes in your project.yml by mapping them to storage paths. + A storage can be anything that the smart-open library can upload to, e.g. 
+ gcs, aws, ssh, local directories etc + """ + for url, output_path in project_pull(project_dir, remote): + if url is not None: + msg.good(f"Pulled {output_path} from {url}") + + +def project_pull(project_dir: Path, remote: str, *, verbose: bool = False): + config = load_project_config(project_dir) + if remote in config.get("remotes", {}): + remote = config["remotes"][remote] + storage = RemoteStorage(project_dir, remote) + for cmd in config.get("commands", []): + deps = [project_dir / dep for dep in cmd.get("deps", [])] + if any(not dep.exists() for dep in deps): + continue + cmd_hash = get_command_hash("", "", deps, cmd["script"]) + for output_path in cmd.get("outputs", []): + url = storage.pull(output_path, command_hash=cmd_hash) + yield url, output_path diff --git a/spacy/cli/project/push.py b/spacy/cli/project/push.py new file mode 100644 index 000000000..e09ee6e1a --- /dev/null +++ b/spacy/cli/project/push.py @@ -0,0 +1,51 @@ +from pathlib import Path +from wasabi import msg +from .remote_storage import RemoteStorage +from .remote_storage import get_content_hash, get_command_hash +from .._util import load_project_config +from .._util import project_cli, Arg + + +@project_cli.command("push") +def project_push_cli( + # fmt: off + remote: str = Arg("default", help="Name or path of remote storage"), + project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False), + # fmt: on +): + """Persist outputs to a remote storage. You can alias remotes in your project.yml + by mapping them to storage paths. A storage can be anything that the smart-open + library can upload to, e.g. gcs, aws, ssh, local directories etc + """ + for output_path, url in project_push(project_dir, remote): + if url is None: + msg.info(f"Skipping {output_path}") + else: + msg.good(f"Pushed {output_path} to {url}") + + +def project_push(project_dir: Path, remote: str): + """Persist outputs to a remote storage. 
You can alias remotes in your project.yml + by mapping them to storage paths. A storage can be anything that the smart-open + library can upload to, e.g. gcs, aws, ssh, local directories etc + """ + config = load_project_config(project_dir) + if remote in config.get("remotes", {}): + remote = config["remotes"][remote] + storage = RemoteStorage(project_dir, remote) + for cmd in config.get("commands", []): + deps = [project_dir / dep for dep in cmd.get("deps", [])] + if any(not dep.exists() for dep in deps): + continue + cmd_hash = get_command_hash( + "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"] + ) + for output_path in cmd.get("outputs", []): + output_loc = project_dir / output_path + if output_loc.exists(): + url = storage.push( + output_path, + command_hash=cmd_hash, + content_hash=get_content_hash(output_loc), + ) + yield output_path, url diff --git a/spacy/cli/project/remote_storage.py b/spacy/cli/project/remote_storage.py new file mode 100644 index 000000000..e7e7cbbe8 --- /dev/null +++ b/spacy/cli/project/remote_storage.py @@ -0,0 +1,169 @@ +from typing import Optional, List, Dict, TYPE_CHECKING +import os +import site +import hashlib +import urllib.parse +import tarfile +from pathlib import Path + +from .._util import get_hash, get_checksum, download_file, ensure_pathy +from ...util import make_tempdir + +if TYPE_CHECKING: + from pathy import Pathy # noqa: F401 + + +class RemoteStorage: + """Push and pull outputs to and from a remote file storage. + + Remotes can be anything that `smart-open` can support: AWS, GCS, file system, + ssh, etc. + """ + + def __init__(self, project_root: Path, url: str, *, compression="gz"): + self.root = project_root + self.url = ensure_pathy(url) + self.compression = compression + + def push(self, path: Path, command_hash: str, content_hash: str) -> "Pathy": + """Compress a file or directory within a project and upload it to a remote + storage. If an object exists at the full URL, nothing is done. 
+ + Within the remote storage, files are addressed by their project path + (url encoded) and two user-supplied hashes, representing their creation + context and their file contents. If the URL already exists, the data is + not uploaded. Paths are archived and compressed prior to upload. + """ + loc = self.root / path + if not loc.exists(): + raise IOError(f"Cannot push {loc}: does not exist.") + url = self.make_url(path, command_hash, content_hash) + if url.exists(): + return None + tmp: Path + with make_tempdir() as tmp: + tar_loc = tmp / self.encode_name(str(path)) + mode_string = f"w:{self.compression}" if self.compression else "w" + with tarfile.open(tar_loc, mode=mode_string) as tar_file: + tar_file.add(str(loc), arcname=str(path)) + with tar_loc.open(mode="rb") as input_file: + with url.open(mode="wb") as output_file: + output_file.write(input_file.read()) + return url + + def pull( + self, + path: Path, + *, + command_hash: Optional[str] = None, + content_hash: Optional[str] = None, + ) -> Optional["Pathy"]: + """Retrieve a file from the remote cache. If the file already exists, + nothing is done. + + If the command_hash and/or content_hash are specified, only matching + results are returned. If no results are available, an error is raised. + """ + dest = self.root / path + if dest.exists(): + return None + url = self.find(path, command_hash=command_hash, content_hash=content_hash) + if url is None: + return url + else: + # Make sure the destination exists + if not dest.parent.exists(): + dest.parent.mkdir(parents=True) + tmp: Path + with make_tempdir() as tmp: + tar_loc = tmp / url.parts[-1] + download_file(url, tar_loc) + mode_string = f"r:{self.compression}" if self.compression else "r" + with tarfile.open(tar_loc, mode=mode_string) as tar_file: + # This requires that the path is added correctly, relative + # to root. 
This is how we set things up in push() + tar_file.extractall(self.root) + return url + + def find( + self, + path: Path, + *, + command_hash: Optional[str] = None, + content_hash: Optional[str] = None, + ) -> Optional["Pathy"]: + """Find the best matching version of a file within the storage, + or `None` if no match can be found. If both the creation and content hash + are specified, only exact matches will be returned. Otherwise, the most + recent matching file is preferred. + """ + name = self.encode_name(str(path)) + if command_hash is not None and content_hash is not None: + url = self.make_url(path, command_hash, content_hash) + urls = [url] if url.exists() else [] + elif command_hash is not None: + urls = list((self.url / name / command_hash).iterdir()) + else: + urls = list((self.url / name).iterdir()) + if content_hash is not None: + urls = [url for url in urls if url.parts[-1] == content_hash] + return urls[-1] if urls else None + + def make_url(self, path: Path, command_hash: str, content_hash: str) -> "Pathy": + """Construct a URL from a subpath, a creation hash and a content hash.""" + return self.url / self.encode_name(str(path)) / command_hash / content_hash + + def encode_name(self, name: str) -> str: + """Encode a subpath into a URL-safe name.""" + return urllib.parse.quote_plus(name) + + +def get_content_hash(loc: Path) -> str: + return get_checksum(loc) + + +def get_command_hash( + site_hash: str, env_hash: str, deps: List[Path], cmd: List[str] +) -> str: + """Create a hash representing the execution of a command. This includes the + currently installed packages, whatever environment variables have been marked + as relevant, and the command. 
+ """ + hashes = [site_hash, env_hash] + [get_checksum(dep) for dep in sorted(deps)] + hashes.extend(cmd) + creation_bytes = "".join(hashes).encode("utf8") + return hashlib.md5(creation_bytes).hexdigest() + + +def get_site_hash(): + """Hash the current Python environment's site-packages contents, including + the name and version of the libraries. The list we're hashing is what + `pip freeze` would output. + """ + site_dirs = site.getsitepackages() + if site.ENABLE_USER_SITE: + site_dirs.extend(site.getusersitepackages()) + packages = set() + for site_dir in site_dirs: + site_dir = Path(site_dir) + for subpath in site_dir.iterdir(): + if subpath.parts[-1].endswith("dist-info"): + packages.add(subpath.parts[-1].replace(".dist-info", "")) + package_bytes = "".join(sorted(packages)).encode("utf8") + return hashlib.md5sum(package_bytes).hexdigest() + + +def get_env_hash(env: Dict[str, str]) -> str: + """Construct a hash of the environment variables that will be passed into + the commands. + + Values in the env dict may be references to the current os.environ, using + the syntax $ENV_VAR to mean os.environ[ENV_VAR] + """ + env_vars = {} + for key, value in env.items(): + if value.startswith("$"): + env_vars[key] = os.environ.get(value[1:], "") + else: + env_vars[key] = value + return get_hash(env_vars) diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index 5c66095aa..6e1deeeee 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -44,7 +44,6 @@ def project_run( dry (bool): Perform a dry run and don't execute commands. 
""" config = load_project_config(project_dir) - variables = config.get("variables", {}) commands = {cmd["name"]: cmd for cmd in config.get("commands", [])} workflows = config.get("workflows", {}) validate_subcommand(commands.keys(), workflows.keys(), subcommand) @@ -54,22 +53,20 @@ def project_run( project_run(project_dir, cmd, force=force, dry=dry) else: cmd = commands[subcommand] - variables = config.get("variables", {}) for dep in cmd.get("deps", []): - dep = dep.format(**variables) if not (project_dir / dep).exists(): err = f"Missing dependency specified by command '{subcommand}': {dep}" err_kwargs = {"exits": 1} if not dry else {} msg.fail(err, **err_kwargs) with working_dir(project_dir) as current_dir: - rerun = check_rerun(current_dir, cmd, variables) + rerun = check_rerun(current_dir, cmd) if not rerun and not force: msg.info(f"Skipping '{cmd['name']}': nothing changed") else: msg.divider(subcommand) - run_commands(cmd["script"], variables, dry=dry) + run_commands(cmd["script"], dry=dry) if not dry: - update_lockfile(current_dir, cmd, variables) + update_lockfile(current_dir, cmd) def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: @@ -115,23 +112,15 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None: def run_commands( - commands: List[str] = tuple(), - variables: Dict[str, Any] = {}, - silent: bool = False, - dry: bool = False, + commands: List[str] = tuple(), silent: bool = False, dry: bool = False, ) -> None: """Run a sequence of commands in a subprocess, in order. commands (List[str]): The string commands. - variables (Dict[str, Any]): Dictionary of variable names, mapped to their - values. Will be used to substitute format string variables in the - commands. silent (bool): Don't print the commands. dry (bool): Perform a dry run and don't execut anything. """ for command in commands: - # Substitute variables, e.g. 
"./{NAME}.json" - command = command.format(**variables) command = split_command(command) # Not sure if this is needed or a good idea. Motivation: users may often # use commands in their config that reference "python" and we want to @@ -173,15 +162,12 @@ def validate_subcommand( ) -def check_rerun( - project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any] -) -> bool: +def check_rerun(project_dir: Path, command: Dict[str, Any]) -> bool: """Check if a command should be rerun because its settings or inputs/outputs changed. project_dir (Path): The current project directory. command (Dict[str, Any]): The command, as defined in the project.yml. - variables (Dict[str, Any]): The variables defined in the project.yml. RETURNS (bool): Whether to re-run the command. """ lock_path = project_dir / PROJECT_LOCK @@ -197,19 +183,16 @@ def check_rerun( # If the entry in the lockfile matches the lockfile entry that would be # generated from the current command, we don't rerun because it means that # all inputs/outputs, hashes and scripts are the same and nothing changed - return get_hash(get_lock_entry(project_dir, command, variables)) != get_hash(entry) + return get_hash(get_lock_entry(project_dir, command)) != get_hash(entry) -def update_lockfile( - project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any] -) -> None: +def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None: """Update the lockfile after running a command. Will create a lockfile if it doesn't yet exist and will add an entry for the current command, its script and dependencies/outputs. project_dir (Path): The current project directory. command (Dict[str, Any]): The command, as defined in the project.yml. - variables (Dict[str, Any]): The variables defined in the project.yml. 
""" lock_path = project_dir / PROJECT_LOCK if not lock_path.exists(): @@ -217,13 +200,11 @@ def update_lockfile( data = {} else: data = srsly.read_yaml(lock_path) - data[command["name"]] = get_lock_entry(project_dir, command, variables) + data[command["name"]] = get_lock_entry(project_dir, command) srsly.write_yaml(lock_path, data) -def get_lock_entry( - project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any] -) -> Dict[str, Any]: +def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]: """Get a lockfile entry for a given command. An entry includes the command, the script (command steps) and a list of dependencies and outputs with their paths and file hashes, if available. The format is based on the @@ -231,12 +212,11 @@ def get_lock_entry( project_dir (Path): The current project directory. command (Dict[str, Any]): The command, as defined in the project.yml. - variables (Dict[str, Any]): The variables defined in the project.yml. RETURNS (Dict[str, Any]): The lockfile entry. """ - deps = get_fileinfo(project_dir, command.get("deps", []), variables) - outs = get_fileinfo(project_dir, command.get("outputs", []), variables) - outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []), variables) + deps = get_fileinfo(project_dir, command.get("deps", [])) + outs = get_fileinfo(project_dir, command.get("outputs", [])) + outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", [])) return { "cmd": f"{COMMAND} run {command['name']}", "script": command["script"], @@ -245,20 +225,16 @@ def get_lock_entry( } -def get_fileinfo( - project_dir: Path, paths: List[str], variables: Dict[str, Any] -) -> List[Dict[str, str]]: +def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, str]]: """Generate the file information for a list of paths (dependencies, outputs). Includes the file path and the file's checksum. project_dir (Path): The current project directory. paths (List[str]): The file paths. 
- variables (Dict[str, Any]): The variables defined in the project.yml. RETURNS (List[Dict[str, str]]): The lockfile entry for a file. """ data = [] for path in paths: - path = path.format(**variables) file_path = project_dir / path md5 = get_checksum(file_path) if file_path.exists() else None data.append({"path": path, "md5": md5}) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 4f5a2226e..0071f1b1a 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -105,10 +105,10 @@ factory = "tok2vec" [components.tok2vec.model.embed] @architectures = "spacy.MultiHashEmbed.v1" -width = ${components.tok2vec.model.encode:width} +width = ${components.tok2vec.model.encode.width} rows = {{ 2000 if optimize == "efficiency" else 7000 }} -also_embed_subwords = {{ true if has_letters else false }} -also_use_static_vectors = {{ true if optimize == "accuracy" else false }} +also_embed_subwords = {{ "true" if has_letters else "false" }} +also_use_static_vectors = {{ "true" if optimize == "accuracy" else "false" }} [components.tok2vec.model.encode] @architectures = "spacy.MaxoutWindowEncoder.v1" @@ -127,7 +127,7 @@ nO = null [components.tagger.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode:width} +width = ${components.tok2vec.model.encode.width} {%- endif %} {% if "parser" in components -%} @@ -144,7 +144,7 @@ nO = null [components.parser.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode:width} +width = ${components.tok2vec.model.encode.width} {%- endif %} {% if "ner" in components %} @@ -161,7 +161,7 @@ nO = null [components.ner.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode:width} +width = ${components.tok2vec.model.encode.width} {% endif %} {% endif %} @@ -194,12 +194,12 @@ initial_rate = 5e-5 
[training.train_corpus] @readers = "spacy.Corpus.v1" -path = ${paths:train} -max_length = {{ 500 if hardware == "gpu" else 0 }} +path = ${paths.train} +max_length = {{ 500 if hardware == "gpu" else 2000 }} [training.dev_corpus] @readers = "spacy.Corpus.v1" -path = ${paths:dev} +path = ${paths.dev} max_length = 0 {% if use_transformer %} diff --git a/spacy/cli/templates/quickstart_training_recommendations.json b/spacy/cli/templates/quickstart_training_recommendations.json deleted file mode 100644 index 8a3acc438..000000000 --- a/spacy/cli/templates/quickstart_training_recommendations.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "en": { - "word_vectors": "en_vectors_web_lg", - "transformer": { - "efficiency": { "name": "roberta-base", "size_factor": 3 }, - "accuracy": { "name": "roberta-base", "size_factor": 3 } - } - }, - "de": { - "word_vectors": null, - "transformer": null - } -} diff --git a/spacy/cli/templates/quickstart_training_recommendations.yml b/spacy/cli/templates/quickstart_training_recommendations.yml new file mode 100644 index 000000000..206e69954 --- /dev/null +++ b/spacy/cli/templates/quickstart_training_recommendations.yml @@ -0,0 +1,103 @@ +# Recommended settings and available resources for each language, if available. +# Not all languages have recommended word vectors or transformers and for some, +# the recommended transformer for efficiency and accuracy may be the same. 
+en: + word_vectors: en_vectors_web_lg + transformer: + efficiency: + name: roberta-base + size_factor: 3 + accuracy: + name: roberta-base + size_factor: 3 +de: + word_vectors: null + transformer: + efficiency: + name: bert-base-german-cased + size_factor: 3 + accuracy: + name: bert-base-german-cased + size_factor: 3 +fr: + word_vectors: null + transformer: + efficiency: + name: camembert-base + size_factor: 3 + accuracy: + name: camembert-base + size_factor: 3 +es: + word_vectors: null + transformer: + efficiency: + name: mrm8488/RuPERTa-base + size_factor: 3 + accuracy: + name: mrm8488/RuPERTa-base + size_factor: 3 +sv: + word_vectors: null + transformer: + efficiency: + name: KB/bert-base-swedish-cased + size_factor: 3 + accuracy: + name: KB/bert-base-swedish-cased + size_factor: 3 +fi: + word_vectors: null + transformer: + efficiency: + name: TurkuNLP/bert-base-finnish-cased-v1 + size_factor: 3 + accuracy: + name: TurkuNLP/bert-base-finnish-cased-v1 + size_factor: 3 +el: + word_vectors: null + transformer: + efficiency: + name: nlpaueb/bert-base-greek-uncased-v1 + size_factor: 3 + accuracy: + name: nlpaueb/bert-base-greek-uncased-v1 + size_factor: 3 +tr: + word_vectors: null + transformer: + efficiency: + name: dbmdz/bert-base-turkish-cased + size_factor: 3 + accuracy: + name: dbmdz/bert-base-turkish-cased + size_factor: 3 +zh: + word_vectors: null + transformer: + efficiency: + name: bert-base-chinese + size_factor: 3 + accuracy: + name: bert-base-chinese + size_factor: 3 + has_letters: false +ar: + word_vectors: null + transformer: + efficiency: + name: asafaya/bert-base-arabic + size_factor: 3 + accuracy: + name: asafaya/bert-base-arabic + size_factor: 3 +pl: + word_vectors: null + transformer: + efficiency: + name: dkleczek/bert-base-polish-cased-v1 + size_factor: 3 + accuracy: + name: dkleczek/bert-base-polish-cased-v1 + size_factor: 3 diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 375e64ffd..15188cd4e 100644 --- a/spacy/cli/train.py +++ 
b/spacy/cli/train.py @@ -75,7 +75,9 @@ def train( msg.info("Using CPU") msg.info(f"Loading config and nlp from: {config_path}") with show_validation_error(config_path): - config = util.load_config(config_path, overrides=config_overrides) + config = util.load_config( + config_path, overrides=config_overrides, interpolate=True + ) if config.get("training", {}).get("seed") is not None: fix_random_seed(config["training"]["seed"]) # Use original config here before it's resolved to functions @@ -162,13 +164,14 @@ def train( progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False) except Exception as e: if output_path is not None: + # We don't want to swallow the traceback if we don't have a + # specific error. msg.warn( f"Aborting and saving the final best model. " - f"Encountered exception: {str(e)}", - exits=1, + f"Encountered exception: {str(e)}" ) - else: - raise e + nlp.to_disk(output_path / "model-final") + raise e finally: if output_path is not None: final_model_path = output_path / "model-final" @@ -207,7 +210,9 @@ def create_evaluation_callback( scores = nlp.evaluate(dev_examples) # Calculate a weighted sum based on score_weights for the main score try: - weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights) + weighted_score = sum( + scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights + ) except KeyError as e: keys = list(scores.keys()) err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys) @@ -235,7 +240,7 @@ def train_while_improving( with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`, where info is a dict, and is_best_checkpoint is in [True, False, None] -- None indicating that the iteration was not evaluated as a checkpoint. - The evaluation is conducted by calling the evaluate callback, which should + The evaluation is conducted by calling the evaluate callback. Positional arguments: nlp: The spaCy pipeline to evaluate. 
@@ -377,7 +382,8 @@ def setup_printer( try: scores = [ - "{0:.2f}".format(float(info["other_scores"][col])) for col in score_cols + "{0:.2f}".format(float(info["other_scores"].get(col, 0.0))) + for col in score_cols ] except KeyError as e: raise KeyError( @@ -403,7 +409,7 @@ def update_meta( ) -> None: nlp.meta["performance"] = {} for metric in training["score_weights"]: - nlp.meta["performance"][metric] = info["other_scores"][metric] + nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0) for pipe_name in nlp.pipe_names: nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name] diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 8aadad668..3eab21888 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -23,12 +23,12 @@ after_pipeline_creation = null # Training hyper-parameters and additional features. [training] -seed = ${system:seed} +seed = ${system.seed} dropout = 0.1 accumulate_gradient = 1 # Extra resources for transfer-learning or pseudo-rehearsal -init_tok2vec = ${paths:init_tok2vec} -raw_text = ${paths:raw} +init_tok2vec = ${paths.init_tok2vec} +raw_text = ${paths.raw} vectors = null # Controls early-stopping. 0 or -1 mean unlimited. patience = 1600 @@ -42,7 +42,7 @@ frozen_components = [] [training.train_corpus] @readers = "spacy.Corpus.v1" -path = ${paths:train} +path = ${paths.train} # Whether to train on sequences with 'gold standard' sentence boundaries # and tokens. If you set this to true, take care to ensure your run-time # data is passed in sentence-by-sentence via some prior preprocessing. @@ -54,7 +54,7 @@ limit = 0 [training.dev_corpus] @readers = "spacy.Corpus.v1" -path = ${paths:dev} +path = ${paths.dev} # Whether to train on sequences with 'gold standard' sentence boundaries # and tokens. If you set this to true, take care to ensure your run-time # data is passed in sentence-by-sentence via some prior preprocessing. 
@@ -98,8 +98,8 @@ max_length = 500 dropout = 0.2 n_save_every = null batch_size = 3000 -seed = ${system:seed} -use_pytorch_for_gpu_memory = ${system:use_pytorch_for_gpu_memory} +seed = ${system.seed} +use_pytorch_for_gpu_memory = ${system.use_pytorch_for_gpu_memory} tok2vec_model = "components.tok2vec.model" [pretraining.objective] diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 3f885f09f..2df2bd61c 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -18,7 +18,7 @@ RENDER_WRAPPER = None def render( - docs: Union[Iterable[Doc], Doc], + docs: Union[Iterable[Union[Doc, Span]], Doc, Span], style: str = "dep", page: bool = False, minify: bool = False, diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 69f6df8f0..07550f9aa 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -252,8 +252,10 @@ class EntityRenderer: colors.update(user_color) colors.update(options.get("colors", {})) self.default_color = DEFAULT_ENTITY_COLOR - self.colors = colors + self.colors = {label.upper(): color for label, color in colors.items()} self.ents = options.get("ents", None) + if self.ents is not None: + self.ents = [ent.upper() for ent in self.ents] self.direction = DEFAULT_DIR self.lang = DEFAULT_LANG template = options.get("template") diff --git a/spacy/displacy/templates.py b/spacy/displacy/templates.py index ff99000f4..b9cbf717b 100644 --- a/spacy/displacy/templates.py +++ b/spacy/displacy/templates.py @@ -51,14 +51,14 @@ TPL_ENTS = """ TPL_ENT = """ {text} - {label} + {label} """ TPL_ENT_RTL = """ {text} - {label} + {label} """ diff --git a/spacy/errors.py b/spacy/errors.py index 26c0dba29..d1e9489d1 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -78,10 +78,11 @@ class Warnings: "are currently: {langs}") # TODO: fix numbering after merging develop into master + W090 = ("Could not locate any binary .spacy files in path '{path}'.") W091 = ("Could not clean/remove the temp directory at {dir}: 
{msg}.") W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.") W093 = ("Could not find any data to train the {name} on. Is your " - "input data correctly formatted ?") + "input data correctly formatted?") W094 = ("Model '{model}' ({model_version}) specifies an under-constrained " "spaCy version requirement: {version}. This can lead to compatibility " "problems with older versions, or as new spaCy versions are " @@ -476,6 +477,10 @@ class Errors: E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") # TODO: fix numbering after merging develop into master + E928 = ("A 'KnowledgeBase' should be written to / read from a file, but the " + "provided argument {loc} is an existing directory.") + E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does " + "not seem to exist.") E930 = ("Received invalid get_examples callback in {name}.begin_training. " "Expected function that returns an iterable of Example objects but " "got: {obj}") @@ -503,8 +508,6 @@ class Errors: "not found in pipeline. Available components: {opts}") E945 = ("Can't copy pipeline component '{name}' from source. Expected loaded " "nlp object, but got: {source}") - E946 = ("The Vocab for the knowledge base is not initialized. Did you forget to " - "call kb.initialize()?") E947 = ("Matcher.add received invalid 'greedy' argument: expected " "a string value from {expected} but got: '{arg}'") E948 = ("Matcher.add received invalid 'patterns' argument: expected " @@ -600,7 +603,8 @@ class Errors: "\"en_core_web_sm\" will copy the component from that model.\n\n{config}") E985 = ("Can't load model from config file: no 'nlp' section found.\n\n{config}") E986 = ("Could not create any training batches: check your input. " - "Perhaps discard_oversize should be set to False ?") + "Are the train and dev paths defined? " + "Is 'discard_oversize' set appropriately? 
") E987 = ("The text of an example training instance is either a Doc or " "a string, but found {type} instead.") E988 = ("Could not parse any training examples. Ensure the data is " @@ -610,8 +614,6 @@ class Errors: "of the training data in spaCy 3.0 onwards. The 'update' " "function should now be called with a batch of 'Example' " "objects, instead of (text, annotation) tuples. ") - E990 = ("An entity linking component needs to be initialized with a " - "KnowledgeBase object, but found {type} instead.") E991 = ("The function 'select_pipes' should be called with either a " "'disable' argument to list the names of the pipe components " "that should be disabled, or with an 'enable' argument that " diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 774c3b840..1046da1e6 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -1,8 +1,10 @@ +import warnings from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable from pathlib import Path from .. import util from .example import Example +from ..errors import Warnings from ..tokens import DocBin, Doc from ..vocab import Vocab @@ -10,6 +12,8 @@ if TYPE_CHECKING: # This lets us add type hints for mypy etc. 
without causing circular imports from ..language import Language # noqa: F401 +FILE_TYPE = ".spacy" + @util.registry.readers("spacy.Corpus.v1") def create_docbin_reader( @@ -53,8 +57,9 @@ class Corpus: @staticmethod def walk_corpus(path: Union[str, Path]) -> List[Path]: path = util.ensure_path(path) - if not path.is_dir(): + if not path.is_dir() and path.parts[-1].endswith(FILE_TYPE): return [path] + orig_path = path paths = [path] locs = [] seen = set() @@ -66,8 +71,10 @@ class Corpus: continue elif path.is_dir(): paths.extend(path.iterdir()) - elif path.parts[-1].endswith(".spacy"): + elif path.parts[-1].endswith(FILE_TYPE): locs.append(path) + if len(locs) == 0: + warnings.warn(Warnings.W090.format(path=orig_path)) return locs def __call__(self, nlp: "Language") -> Iterator[Example]: @@ -135,7 +142,7 @@ class Corpus: i = 0 for loc in locs: loc = util.ensure_path(loc) - if loc.parts[-1].endswith(".spacy"): + if loc.parts[-1].endswith(FILE_TYPE): doc_bin = DocBin().from_disk(loc) docs = doc_bin.get_docs(vocab) for doc in docs: diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 53038b5db..695693666 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -140,7 +140,7 @@ cdef class KnowledgeBase: self._entries.push_back(entry) self._aliases_table.push_back(alias) - cpdef load_bulk(self, loc) + cpdef from_disk(self, loc) cpdef set_entities(self, entity_list, freq_list, vector_list) diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 9035f7e6a..3b8017a0c 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,4 +1,5 @@ # cython: infer_types=True, profile=True +from typing import Iterator from cymem.cymem cimport Pool from preshed.maps cimport PreshMap from cpython.exc cimport PyErr_SetFromErrno @@ -64,6 +65,16 @@ cdef class Candidate: return self.prior_prob +def get_candidates(KnowledgeBase kb, span) -> Iterator[Candidate]: + """ + Return candidate entities for a given span by using the text of the span as the alias + and fetching appropriate entries from the index. 
+ This particular function is optimized to work with the built-in KB functionality, + but any other custom candidate generation method can be used in combination with the KB as well. + """ + return kb.get_alias_candidates(span.text) + + cdef class KnowledgeBase: """A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases, to support entity linking of named entities to real-world concepts. @@ -71,25 +82,16 @@ cdef class KnowledgeBase: DOCS: https://spacy.io/api/kb """ - def __init__(self, entity_vector_length): - """Create a KnowledgeBase. Make sure to call kb.initialize() before using it.""" + def __init__(self, Vocab vocab, entity_vector_length): + """Create a KnowledgeBase.""" self.mem = Pool() self.entity_vector_length = entity_vector_length - self._entry_index = PreshMap() self._alias_index = PreshMap() - self.vocab = None - - - def initialize(self, Vocab vocab): self.vocab = vocab self.vocab.strings.add("") self._create_empty_vectors(dummy_hash=self.vocab.strings[""]) - def require_vocab(self): - if self.vocab is None: - raise ValueError(Errors.E946) - @property def entity_vector_length(self): """RETURNS (uint64): length of the entity vectors""" @@ -102,14 +104,12 @@ cdef class KnowledgeBase: return len(self._entry_index) def get_entity_strings(self): - self.require_vocab() return [self.vocab.strings[x] for x in self._entry_index] def get_size_aliases(self): return len(self._alias_index) def get_alias_strings(self): - self.require_vocab() return [self.vocab.strings[x] for x in self._alias_index] def add_entity(self, unicode entity, float freq, vector[float] entity_vector): @@ -117,7 +117,6 @@ cdef class KnowledgeBase: Add an entity to the KB, optionally specifying its log probability based on corpus frequency Return the hash of the entity ID/name at the end. 
""" - self.require_vocab() cdef hash_t entity_hash = self.vocab.strings.add(entity) # Return if this entity was added before @@ -140,7 +139,6 @@ cdef class KnowledgeBase: return entity_hash cpdef set_entities(self, entity_list, freq_list, vector_list): - self.require_vocab() if len(entity_list) != len(freq_list) or len(entity_list) != len(vector_list): raise ValueError(Errors.E140) @@ -176,12 +174,10 @@ cdef class KnowledgeBase: i += 1 def contains_entity(self, unicode entity): - self.require_vocab() cdef hash_t entity_hash = self.vocab.strings.add(entity) return entity_hash in self._entry_index def contains_alias(self, unicode alias): - self.require_vocab() cdef hash_t alias_hash = self.vocab.strings.add(alias) return alias_hash in self._alias_index @@ -190,7 +186,6 @@ cdef class KnowledgeBase: For a given alias, add its potential entities and prior probabilies to the KB. Return the alias_hash at the end """ - self.require_vocab() # Throw an error if the length of entities and probabilities are not the same if not len(entities) == len(probabilities): raise ValueError(Errors.E132.format(alias=alias, @@ -234,7 +229,6 @@ cdef class KnowledgeBase: Throw an error if this entity+prior prob would exceed the sum of 1. For efficiency, it's best to use the method `add_alias` as much as possible instead of this one. """ - self.require_vocab() # Check if the alias exists in the KB cdef hash_t alias_hash = self.vocab.strings[alias] if not alias_hash in self._alias_index: @@ -274,14 +268,12 @@ cdef class KnowledgeBase: alias_entry.probs = probs self._aliases_table[alias_index] = alias_entry - - def get_candidates(self, unicode alias): + def get_alias_candidates(self, unicode alias) -> Iterator[Candidate]: """ Return candidate entities for an alias. Each candidate defines the entity, the original alias, and the prior probability of that alias resolving to that entity. If the alias is not known in the KB, and empty list is returned. 
""" - self.require_vocab() cdef hash_t alias_hash = self.vocab.strings[alias] if not alias_hash in self._alias_index: return [] @@ -298,7 +290,6 @@ cdef class KnowledgeBase: if entry_index != 0] def get_vector(self, unicode entity): - self.require_vocab() cdef hash_t entity_hash = self.vocab.strings[entity] # Return an empty list if this entity is unknown in this KB @@ -311,7 +302,6 @@ cdef class KnowledgeBase: def get_prior_prob(self, unicode entity, unicode alias): """ Return the prior probability of a given alias being linked to a given entity, or return 0.0 when this combination is not known in the knowledge base""" - self.require_vocab() cdef hash_t alias_hash = self.vocab.strings[alias] cdef hash_t entity_hash = self.vocab.strings[entity] @@ -329,8 +319,7 @@ cdef class KnowledgeBase: return 0.0 - def dump(self, loc): - self.require_vocab() + def to_disk(self, loc): cdef Writer writer = Writer(loc) writer.write_header(self.get_size_entities(), self.entity_vector_length) @@ -370,7 +359,7 @@ cdef class KnowledgeBase: writer.close() - cpdef load_bulk(self, loc): + cpdef from_disk(self, loc): cdef hash_t entity_hash cdef hash_t alias_hash cdef int64_t entry_index @@ -462,12 +451,11 @@ cdef class KnowledgeBase: cdef class Writer: def __init__(self, object loc): - if path.exists(loc): - assert not path.isdir(loc), f"{loc} is directory" if isinstance(loc, Path): loc = bytes(loc) if path.exists(loc): - assert not path.isdir(loc), "%s is directory." 
% loc + if path.isdir(loc): + raise ValueError(Errors.E928.format(loc=loc)) cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc self._fp = fopen(bytes_loc, 'wb') if not self._fp: @@ -511,8 +499,10 @@ cdef class Reader: def __init__(self, object loc): if isinstance(loc, Path): loc = bytes(loc) - assert path.exists(loc) - assert not path.isdir(loc) + if not path.exists(loc): + raise ValueError(Errors.E929.format(loc=loc)) + if path.isdir(loc): + raise ValueError(Errors.E928.format(loc=loc)) cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc self._fp = fopen(bytes_loc, 'rb') if not self._fp: diff --git a/spacy/language.py b/spacy/language.py index b67c55e3b..57abcca0e 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -439,8 +439,6 @@ class Language: assigns: Iterable[str] = tuple(), requires: Iterable[str] = tuple(), retokenizes: bool = False, - scores: Iterable[str] = tuple(), - default_score_weights: Dict[str, float] = SimpleFrozenDict(), func: Optional[Callable[[Doc], Doc]] = None, ) -> Callable: """Register a new pipeline component. Can be used for stateless function @@ -456,12 +454,6 @@ class Language: e.g. "token.ent_id". Used for pipeline analyis. retokenizes (bool): Whether the component changes the tokenization. Used for pipeline analysis. - scores (Iterable[str]): All scores set by the component if it's trainable, - e.g. ["ents_f", "ents_r", "ents_p"]. - default_score_weights (Dict[str, float]): The scores to report during - training, and their default weight towards the final score used to - select the best model. Weights should sum to 1.0 per component and - will be combined and normalized for the whole pipeline. func (Optional[Callable]): Factory function if not used as a decorator. 
DOCS: https://spacy.io/api/language#component @@ -482,8 +474,6 @@ class Language: assigns=assigns, requires=requires, retokenizes=retokenizes, - scores=scores, - default_score_weights=default_score_weights, func=factory_func, ) return component_func @@ -782,9 +772,15 @@ class Language: self.remove_pipe(name) if not len(self.pipeline) or pipe_index == len(self.pipeline): # we have no components to insert before/after, or we're replacing the last component - self.add_pipe(factory_name, name=name) + self.add_pipe(factory_name, name=name, config=config, validate=validate) else: - self.add_pipe(factory_name, name=name, before=pipe_index) + self.add_pipe( + factory_name, + name=name, + before=pipe_index, + config=config, + validate=validate, + ) def rename_pipe(self, old_name: str, new_name: str) -> None: """Rename a pipeline component. @@ -1112,7 +1108,6 @@ class Language: self, examples: Iterable[Example], *, - verbose: bool = False, batch_size: int = 256, scorer: Optional[Scorer] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, @@ -1121,7 +1116,6 @@ class Language: """Evaluate a model's pipeline components. examples (Iterable[Example]): `Example` objects. - verbose (bool): Print debugging information. batch_size (int): Batch size to use. scorer (Optional[Scorer]): Scorer to use. If not passed in, a new one will be created. 
@@ -1140,7 +1134,6 @@ class Language: scorer_cfg = {} if scorer is None: kwargs = dict(scorer_cfg) - kwargs.setdefault("verbose", verbose) kwargs.setdefault("nlp", self) scorer = Scorer(**kwargs) texts = [eg.reference.text for eg in examples] @@ -1163,8 +1156,7 @@ class Language: docs = list(docs) end_time = timer() for i, (doc, eg) in enumerate(zip(docs, examples)): - if verbose: - print(doc) + util.logger.debug(doc) eg.predicted = doc results = scorer.score(examples) n_words = sum(len(eg.predicted) for eg in examples) diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index f96d50a7b..6792f3e59 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -1,9 +1,9 @@ -from typing import Optional +from typing import Optional, Callable, Iterable from thinc.api import chain, clone, list2ragged, reduce_mean, residual from thinc.api import Model, Maxout, Linear from ...util import registry -from ...kb import KnowledgeBase +from ...kb import KnowledgeBase, Candidate, get_candidates from ...vocab import Vocab @@ -25,15 +25,23 @@ def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model: @registry.assets.register("spacy.KBFromFile.v1") -def load_kb(vocab_path: str, kb_path: str) -> KnowledgeBase: - vocab = Vocab().from_disk(vocab_path) - kb = KnowledgeBase(entity_vector_length=1) - kb.initialize(vocab) - kb.load_bulk(kb_path) - return kb +def load_kb(kb_path: str) -> Callable[[Vocab], KnowledgeBase]: + def kb_from_file(vocab): + kb = KnowledgeBase(vocab, entity_vector_length=1) + kb.from_disk(kb_path) + return kb + + return kb_from_file @registry.assets.register("spacy.EmptyKB.v1") -def empty_kb(entity_vector_length: int) -> KnowledgeBase: - kb = KnowledgeBase(entity_vector_length=entity_vector_length) - return kb +def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]: + def empty_kb_factory(vocab): + return KnowledgeBase(vocab=vocab, entity_vector_length=entity_vector_length) + 
+ return empty_kb_factory + + +@registry.assets.register("spacy.CandidateGenerator.v1") +def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]: + return get_candidates diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 35bf2906e..d92c700ba 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -6,7 +6,7 @@ from thinc.api import CosineDistance, get_array_module, Model, Optimizer, Config from thinc.api import set_dropout_rate import warnings -from ..kb import KnowledgeBase +from ..kb import KnowledgeBase, Candidate from ..tokens import Doc from .pipe import Pipe, deserialize_config from ..language import Language @@ -32,35 +32,30 @@ subword_features = true """ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] -default_kb_config = """ -[kb] -@assets = "spacy.EmptyKB.v1" -entity_vector_length = 64 -""" -DEFAULT_NEL_KB = Config().from_str(default_kb_config)["kb"] - @Language.factory( "entity_linker", requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"], assigns=["token.ent_kb_id"], default_config={ - "kb": DEFAULT_NEL_KB, + "kb_loader": {"@assets": "spacy.EmptyKB.v1", "entity_vector_length": 64}, "model": DEFAULT_NEL_MODEL, "labels_discard": [], "incl_prior": True, "incl_context": True, + "get_candidates": {"@assets": "spacy.CandidateGenerator.v1"}, }, ) def make_entity_linker( nlp: Language, name: str, model: Model, - kb: KnowledgeBase, + kb_loader: Callable[[Vocab], KnowledgeBase], *, labels_discard: Iterable[str], incl_prior: bool, incl_context: bool, + get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]], ): """Construct an EntityLinker component. 
@@ -76,10 +71,11 @@ def make_entity_linker( nlp.vocab, model, name, - kb=kb, + kb_loader=kb_loader, labels_discard=labels_discard, incl_prior=incl_prior, incl_context=incl_context, + get_candidates=get_candidates, ) @@ -97,10 +93,11 @@ class EntityLinker(Pipe): model: Model, name: str = "entity_linker", *, - kb: KnowledgeBase, + kb_loader: Callable[[Vocab], KnowledgeBase], labels_discard: Iterable[str], incl_prior: bool, incl_context: bool, + get_candidates: Callable[[KnowledgeBase, "Span"], Iterable[Candidate]], ) -> None: """Initialize an entity linker. @@ -108,7 +105,7 @@ class EntityLinker(Pipe): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. - kb (KnowledgeBase): The KnowledgeBase holding all entities and their aliases. + kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates a KnowledgeBase from a Vocab instance. labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction. incl_prior (bool): Whether or not to include prior probabilities from the KB in the model. incl_context (bool): Whether or not to include the local context in the model. 
@@ -119,17 +116,12 @@ class EntityLinker(Pipe): self.model = model self.name = name cfg = { - "kb": kb, "labels_discard": list(labels_discard), "incl_prior": incl_prior, "incl_context": incl_context, } - if not isinstance(kb, KnowledgeBase): - raise ValueError(Errors.E990.format(type=type(self.kb))) - kb.initialize(vocab) - self.kb = kb - if "kb" in cfg: - del cfg["kb"] # we don't want to duplicate its serialization + self.kb = kb_loader(self.vocab) + self.get_candidates = get_candidates self.cfg = dict(cfg) self.distance = CosineDistance(normalize=False) # how many neightbour sentences to take into account @@ -326,10 +318,11 @@ class EntityLinker(Pipe): end_token = sentences[end_sentence].end sent_doc = doc[start_token:end_token].as_doc() # currently, the context is the same for each entity in a sentence (should be refined) - sentence_encoding = self.model.predict([sent_doc])[0] - xp = get_array_module(sentence_encoding) - sentence_encoding_t = sentence_encoding.T - sentence_norm = xp.linalg.norm(sentence_encoding_t) + xp = self.model.ops.xp + if self.cfg.get("incl_context"): + sentence_encoding = self.model.predict([sent_doc])[0] + sentence_encoding_t = sentence_encoding.T + sentence_norm = xp.linalg.norm(sentence_encoding_t) for ent in sent.ents: entity_count += 1 to_discard = self.cfg.get("labels_discard", []) @@ -337,7 +330,7 @@ class EntityLinker(Pipe): # ignoring this entity - setting to NIL final_kb_ids.append(self.NIL) else: - candidates = self.kb.get_candidates(ent.text) + candidates = self.get_candidates(self.kb, ent) if not candidates: # no prediction possible for this entity - setting to NIL final_kb_ids.append(self.NIL) @@ -421,10 +414,9 @@ class EntityLinker(Pipe): DOCS: https://spacy.io/api/entitylinker#to_disk """ serialize = {} - self.cfg["entity_width"] = self.kb.entity_vector_length serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["vocab"] = lambda p: self.vocab.to_disk(p) - serialize["kb"] = lambda p: self.kb.dump(p) + 
serialize["kb"] = lambda p: self.kb.to_disk(p) serialize["model"] = lambda p: self.model.to_disk(p) util.to_disk(path, serialize, exclude) @@ -446,15 +438,10 @@ class EntityLinker(Pipe): except AttributeError: raise ValueError(Errors.E149) from None - def load_kb(p): - self.kb = KnowledgeBase(entity_vector_length=self.cfg["entity_width"]) - self.kb.initialize(self.vocab) - self.kb.load_bulk(p) - deserialize = {} deserialize["vocab"] = lambda p: self.vocab.from_disk(p) deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p)) - deserialize["kb"] = load_kb + deserialize["kb"] = lambda p: self.kb.from_disk(p) deserialize["model"] = load_model util.from_disk(path, deserialize, exclude) return self diff --git a/spacy/pipeline/nn_parser.pyx b/spacy/pipeline/nn_parser.pyx deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 9070329e8..2a4274597 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -68,7 +68,6 @@ class Tagger(Pipe): name (str): The component instance name, used to add entries to the losses during training. labels (List): The set of labels. Defaults to None. - set_morphology (bool): Whether to set morphological features. DOCS: https://spacy.io/api/tagger#init """ diff --git a/spacy/schemas.py b/spacy/schemas.py index 0f2a35c60..170342b54 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -167,18 +167,20 @@ class ModelMetaSchema(BaseModel): lang: StrictStr = Field(..., title="Two-letter language code, e.g. 'en'") name: StrictStr = Field(..., title="Model name") version: StrictStr = Field(..., title="Model version") - spacy_version: Optional[StrictStr] = Field(None, title="Compatible spaCy version identifier") - parent_package: Optional[StrictStr] = Field("spacy", title="Name of parent spaCy package, e.g. 
spacy or spacy-nightly") - pipeline: Optional[List[StrictStr]] = Field([], title="Names of pipeline components") - description: Optional[StrictStr] = Field(None, title="Model description") - license: Optional[StrictStr] = Field(None, title="Model license") - author: Optional[StrictStr] = Field(None, title="Model author name") - email: Optional[StrictStr] = Field(None, title="Model author email") - url: Optional[StrictStr] = Field(None, title="Model author URL") - sources: Optional[Union[List[StrictStr], Dict[str, str]]] = Field(None, title="Training data sources") - vectors: Optional[Dict[str, Any]] = Field(None, title="Included word vectors") - accuracy: Optional[Dict[str, Union[float, int]]] = Field(None, title="Accuracy numbers") - speed: Optional[Dict[str, Union[float, int]]] = Field(None, title="Speed evaluation numbers") + spacy_version: StrictStr = Field("", title="Compatible spaCy version identifier") + parent_package: StrictStr = Field("spacy", title="Name of parent spaCy package, e.g. 
spacy or spacy-nightly") + pipeline: List[StrictStr] = Field([], title="Names of pipeline components") + description: StrictStr = Field("", title="Model description") + license: StrictStr = Field("", title="Model license") + author: StrictStr = Field("", title="Model author name") + email: StrictStr = Field("", title="Model author email") + url: StrictStr = Field("", title="Model author URL") + sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources") + vectors: Dict[str, Any] = Field({}, title="Included word vectors") + labels: Dict[str, Dict[str, List[str]]] = Field({}, title="Component labels, keyed by component name") + accuracy: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy numbers") + speed: Dict[str, Union[float, int]] = Field({}, title="Speed evaluation numbers") + spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used") # fmt: on @@ -301,7 +303,7 @@ class ProjectConfigCommand(BaseModel): class ProjectConfigSchema(BaseModel): # fmt: off - variables: Dict[StrictStr, Union[str, int, float, bool]] = Field({}, title="Optional variables to substitute in commands") + vars: Dict[StrictStr, Any] = Field({}, title="Optional variables to substitute in commands") assets: List[ProjectConfigAsset] = Field([], title="Data assets") workflows: Dict[StrictStr, List[StrictStr]] = Field({}, title="Named workflows, mapped to list of project commands to run in order") commands: List[ProjectConfigCommand] = Field([], title="Project command shortucts") @@ -309,3 +311,22 @@ class ProjectConfigSchema(BaseModel): class Config: title = "Schema for project configuration file" + + +# Recommendations for init config workflows + + +class RecommendationTrfItem(BaseModel): + name: str + size_factor: int + + +class RecommendationTrf(BaseModel): + efficiency: RecommendationTrfItem + accuracy: RecommendationTrfItem + + +class RecommendationSchema(BaseModel): + word_vectors: Optional[str] = None + 
transformer: Optional[RecommendationTrf] = None + has_letters: bool = True diff --git a/spacy/scorer.py b/spacy/scorer.py index d77881ad0..dc017f82f 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -2,7 +2,7 @@ from typing import Optional, Iterable, Dict, Any, Callable, Tuple, TYPE_CHECKING import numpy as np from .gold import Example -from .tokens import Token, Doc +from .tokens import Token, Doc, Span from .errors import Errors from .util import get_lang_class from .morphology import Morphology @@ -250,15 +250,16 @@ class Scorer: examples: Iterable[Example], attr: str, *, - getter: Callable[[Doc, str], Any] = getattr, + getter: Callable[[Doc, str], Iterable[Span]] = getattr, **cfg, ) -> Dict[str, Any]: """Returns PRF scores for labeled spans. examples (Iterable[Example]): Examples to score attr (str): The attribute to score. - getter (Callable[[Doc, str], Any]): Defaults to getattr. If provided, - getter(doc, attr) should return the spans for the individual doc. + getter (Callable[[Doc, str], Iterable[Span]]): Defaults to getattr. If + provided, getter(doc, attr) should return the spans for the + individual doc. RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under the keys attr_p/r/f and the per-type PRF scores under attr_per_type. @@ -444,7 +445,7 @@ class Scorer: *, getter: Callable[[Token, str], Any] = getattr, head_attr: str = "head", - head_getter: Callable[[Token, str], Any] = getattr, + head_getter: Callable[[Token, str], Token] = getattr, ignore_labels: Tuple[str] = tuple(), **cfg, ) -> Dict[str, Any]: @@ -458,7 +459,7 @@ class Scorer: individual token. head_attr (str): The attribute containing the head token. Defaults to 'head'. - head_getter (Callable[[Token, str], Any]): Defaults to getattr. If provided, + head_getter (Callable[[Token, str], Token]): Defaults to getattr. If provided, head_getter(token, attr) should return the value of the head for an individual token. 
ignore_labels (Tuple): Labels to ignore while scoring (e.g., punct). diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index b3fb6d0fc..4385d2bf9 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1,6 +1,7 @@ +from typing import Callable, Iterable import pytest -from spacy.kb import KnowledgeBase +from spacy.kb import KnowledgeBase, get_candidates, Candidate from spacy import util, registry from spacy.gold import Example @@ -21,8 +22,7 @@ def assert_almost_equal(a, b): def test_kb_valid_entities(nlp): """Test the valid construction of a KB with 3 entities and two aliases""" - mykb = KnowledgeBase(entity_vector_length=3) - mykb.initialize(nlp.vocab) + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) # adding entities mykb.add_entity(entity="Q1", freq=19, entity_vector=[8, 4, 3]) @@ -51,8 +51,7 @@ def test_kb_valid_entities(nlp): def test_kb_invalid_entities(nlp): """Test the invalid construction of a KB with an alias linked to a non-existing entity""" - mykb = KnowledgeBase(entity_vector_length=1) - mykb.initialize(nlp.vocab) + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities mykb.add_entity(entity="Q1", freq=19, entity_vector=[1]) @@ -68,8 +67,7 @@ def test_kb_invalid_entities(nlp): def test_kb_invalid_probabilities(nlp): """Test the invalid construction of a KB with wrong prior probabilities""" - mykb = KnowledgeBase(entity_vector_length=1) - mykb.initialize(nlp.vocab) + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities mykb.add_entity(entity="Q1", freq=19, entity_vector=[1]) @@ -83,8 +81,7 @@ def test_kb_invalid_probabilities(nlp): def test_kb_invalid_combination(nlp): """Test the invalid construction of a KB with non-matching entity and probability lists""" - mykb = KnowledgeBase(entity_vector_length=1) - mykb.initialize(nlp.vocab) + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding 
entities mykb.add_entity(entity="Q1", freq=19, entity_vector=[1]) @@ -100,8 +97,7 @@ def test_kb_invalid_combination(nlp): def test_kb_invalid_entity_vector(nlp): """Test the invalid construction of a KB with non-matching entity vector lengths""" - mykb = KnowledgeBase(entity_vector_length=3) - mykb.initialize(nlp.vocab) + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) # adding entities mykb.add_entity(entity="Q1", freq=19, entity_vector=[1, 2, 3]) @@ -117,14 +113,14 @@ def test_kb_default(nlp): assert len(entity_linker.kb) == 0 assert entity_linker.kb.get_size_entities() == 0 assert entity_linker.kb.get_size_aliases() == 0 - # default value from pipeline.entity_linker + # 64 is the default value from pipeline.entity_linker assert entity_linker.kb.entity_vector_length == 64 def test_kb_custom_length(nlp): """Test that the default (empty) KB can be configured with a custom entity length""" entity_linker = nlp.add_pipe( - "entity_linker", config={"kb": {"entity_vector_length": 35}} + "entity_linker", config={"kb_loader": {"entity_vector_length": 35}} ) assert len(entity_linker.kb) == 0 assert entity_linker.kb.get_size_entities() == 0 @@ -141,7 +137,7 @@ def test_kb_undefined(nlp): def test_kb_empty(nlp): """Test that the EL can't train with an empty KB""" - config = {"kb": {"@assets": "spacy.EmptyKB.v1", "entity_vector_length": 342}} + config = {"kb_loader": {"@assets": "spacy.EmptyKB.v1", "entity_vector_length": 342}} entity_linker = nlp.add_pipe("entity_linker", config=config) assert len(entity_linker.kb) == 0 with pytest.raises(ValueError): @@ -150,8 +146,13 @@ def test_kb_empty(nlp): def test_candidate_generation(nlp): """Test correct candidate generation""" - mykb = KnowledgeBase(entity_vector_length=1) - mykb.initialize(nlp.vocab) + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + doc = nlp("douglas adam Adam shrubbery") + + douglas_ent = doc[0:1] + adam_ent = doc[1:2] + Adam_ent = doc[2:3] + shrubbery_ent = doc[3:4] # adding entities 
mykb.add_entity(entity="Q1", freq=27, entity_vector=[1]) @@ -163,21 +164,76 @@ def test_candidate_generation(nlp): mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) # test the size of the relevant candidates - assert len(mykb.get_candidates("douglas")) == 2 - assert len(mykb.get_candidates("adam")) == 1 - assert len(mykb.get_candidates("shrubbery")) == 0 + assert len(get_candidates(mykb, douglas_ent)) == 2 + assert len(get_candidates(mykb, adam_ent)) == 1 + assert len(get_candidates(mykb, Adam_ent)) == 0 # default case sensitive + assert len(get_candidates(mykb, shrubbery_ent)) == 0 # test the content of the candidates - assert mykb.get_candidates("adam")[0].entity_ == "Q2" - assert mykb.get_candidates("adam")[0].alias_ == "adam" - assert_almost_equal(mykb.get_candidates("adam")[0].entity_freq, 12) - assert_almost_equal(mykb.get_candidates("adam")[0].prior_prob, 0.9) + assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2" + assert get_candidates(mykb, adam_ent)[0].alias_ == "adam" + assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12) + assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9) + + +def test_el_pipe_configuration(nlp): + """Test correct candidate generation as part of the EL pipe""" + nlp.add_pipe("sentencizer") + pattern = {"label": "PERSON", "pattern": [{"LOWER": "douglas"}]} + ruler = nlp.add_pipe("entity_ruler") + ruler.add_patterns([pattern]) + + @registry.assets.register("myAdamKB.v1") + def mykb() -> Callable[["Vocab"], KnowledgeBase]: + def create_kb(vocab): + kb = KnowledgeBase(vocab, entity_vector_length=1) + kb.add_entity(entity="Q2", freq=12, entity_vector=[2]) + kb.add_entity(entity="Q3", freq=5, entity_vector=[3]) + kb.add_alias( + alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1] + ) + return kb + + return create_kb + + # run an EL pipe without a trained context encoder, to check the candidate generation step only + nlp.add_pipe( + "entity_linker", + 
config={"kb_loader": {"@assets": "myAdamKB.v1"}, "incl_context": False}, + ) + # With the default get_candidates function, matching is case-sensitive + text = "Douglas and douglas are not the same." + doc = nlp(text) + assert doc[0].ent_kb_id_ == "NIL" + assert doc[1].ent_kb_id_ == "" + assert doc[2].ent_kb_id_ == "Q2" + + def get_lowercased_candidates(kb, span): + return kb.get_alias_candidates(span.text.lower()) + + @registry.assets.register("spacy.LowercaseCandidateGenerator.v1") + def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]: + return get_lowercased_candidates + + # replace the pipe with a new one with with a different candidate generator + nlp.replace_pipe( + "entity_linker", + "entity_linker", + config={ + "kb_loader": {"@assets": "myAdamKB.v1"}, + "incl_context": False, + "get_candidates": {"@assets": "spacy.LowercaseCandidateGenerator.v1"}, + }, + ) + doc = nlp(text) + assert doc[0].ent_kb_id_ == "Q2" + assert doc[1].ent_kb_id_ == "" + assert doc[2].ent_kb_id_ == "Q2" def test_append_alias(nlp): """Test that we can append additional alias-entity pairs""" - mykb = KnowledgeBase(entity_vector_length=1) - mykb.initialize(nlp.vocab) + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities mykb.add_entity(entity="Q1", freq=27, entity_vector=[1]) @@ -189,26 +245,25 @@ def test_append_alias(nlp): mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) # test the size of the relevant candidates - assert len(mykb.get_candidates("douglas")) == 2 + assert len(mykb.get_alias_candidates("douglas")) == 2 # append an alias mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2) # test the size of the relevant candidates has been incremented - assert len(mykb.get_candidates("douglas")) == 3 + assert len(mykb.get_alias_candidates("douglas")) == 3 # append the same alias-entity pair again should not work (will throw a warning) with pytest.warns(UserWarning): mykb.append_alias(alias="douglas", 
entity="Q1", prior_prob=0.3) # test the size of the relevant candidates remained unchanged - assert len(mykb.get_candidates("douglas")) == 3 + assert len(mykb.get_alias_candidates("douglas")) == 3 def test_append_invalid_alias(nlp): """Test that append an alias will throw an error if prior probs are exceeding 1""" - mykb = KnowledgeBase(entity_vector_length=1) - mykb.initialize(nlp.vocab) + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities mykb.add_entity(entity="Q1", freq=27, entity_vector=[1]) @@ -228,16 +283,18 @@ def test_preserving_links_asdoc(nlp): """Test that Span.as_doc preserves the existing entity links""" @registry.assets.register("myLocationsKB.v1") - def dummy_kb() -> KnowledgeBase: - mykb = KnowledgeBase(entity_vector_length=1) - mykb.initialize(nlp.vocab) - # adding entities - mykb.add_entity(entity="Q1", freq=19, entity_vector=[1]) - mykb.add_entity(entity="Q2", freq=8, entity_vector=[1]) - # adding aliases - mykb.add_alias(alias="Boston", entities=["Q1"], probabilities=[0.7]) - mykb.add_alias(alias="Denver", entities=["Q2"], probabilities=[0.6]) - return mykb + def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]: + def create_kb(vocab): + mykb = KnowledgeBase(vocab, entity_vector_length=1) + # adding entities + mykb.add_entity(entity="Q1", freq=19, entity_vector=[1]) + mykb.add_entity(entity="Q2", freq=8, entity_vector=[1]) + # adding aliases + mykb.add_alias(alias="Boston", entities=["Q1"], probabilities=[0.7]) + mykb.add_alias(alias="Denver", entities=["Q2"], probabilities=[0.6]) + return mykb + + return create_kb # set up pipeline with NER (Entity Ruler) and NEL (prior probability only, model not trained) nlp.add_pipe("sentencizer") @@ -247,7 +304,7 @@ def test_preserving_links_asdoc(nlp): ] ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) - el_config = {"kb": {"@assets": "myLocationsKB.v1"}, "incl_prior": False} + el_config = {"kb_loader": {"@assets": "myLocationsKB.v1"}, "incl_prior": False} el_pipe = 
nlp.add_pipe("entity_linker", config=el_config, last=True) el_pipe.begin_training(lambda: []) el_pipe.incl_context = False @@ -331,24 +388,28 @@ def test_overfitting_IO(): train_examples.append(Example.from_dict(doc, annotation)) @registry.assets.register("myOverfittingKB.v1") - def dummy_kb() -> KnowledgeBase: - # create artificial KB - assign same prior weight to the two russ cochran's - # Q2146908 (Russ Cochran): American golfer - # Q7381115 (Russ Cochran): publisher - mykb = KnowledgeBase(entity_vector_length=3) - mykb.initialize(nlp.vocab) - mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) - mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) - mykb.add_alias( - alias="Russ Cochran", - entities=["Q2146908", "Q7381115"], - probabilities=[0.5, 0.5], - ) - return mykb + def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]: + def create_kb(vocab): + # create artificial KB - assign same prior weight to the two russ cochran's + # Q2146908 (Russ Cochran): American golfer + # Q7381115 (Russ Cochran): publisher + mykb = KnowledgeBase(vocab, entity_vector_length=3) + mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) + mykb.add_alias( + alias="Russ Cochran", + entities=["Q2146908", "Q7381115"], + probabilities=[0.5, 0.5], + ) + return mykb + + return create_kb # Create the Entity Linker component and add it to the pipeline nlp.add_pipe( - "entity_linker", config={"kb": {"@assets": "myOverfittingKB.v1"}}, last=True + "entity_linker", + config={"kb_loader": {"@assets": "myOverfittingKB.v1"}}, + last=True, ) # train the NEL pipe diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index 9948f6bcd..aa682fefe 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -356,13 +356,13 @@ def test_language_factories_combine_score_weights(weights, 
expected): def test_language_factories_scores(): name = "test_language_factories_scores" - func = lambda doc: doc + func = lambda nlp, name: lambda doc: doc weights1 = {"a1": 0.5, "a2": 0.5} weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1} - Language.component( + Language.factory( f"{name}1", scores=list(weights1), default_score_weights=weights1, func=func, ) - Language.component( + Language.factory( f"{name}2", scores=list(weights2), default_score_weights=weights2, func=func, ) meta1 = Language.get_factory_meta(f"{name}1") diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 0141708b4..feb11cabc 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -78,6 +78,14 @@ def test_replace_last_pipe(nlp): assert nlp.pipe_names == ["sentencizer", "ner"] +def test_replace_pipe_config(nlp): + nlp.add_pipe("entity_linker") + nlp.add_pipe("sentencizer") + assert nlp.get_pipe("entity_linker").cfg["incl_prior"] == True + nlp.replace_pipe("entity_linker", "entity_linker", config={"incl_prior": False}) + assert nlp.get_pipe("entity_linker").cfg["incl_prior"] == False + + @pytest.mark.parametrize("old_name,new_name", [("old_pipe", "new_pipe")]) def test_rename_pipe(nlp, old_name, new_name): with pytest.raises(ValueError): diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py index 1e655851f..d16ecc1e6 100644 --- a/spacy/tests/regression/test_issue4501-5000.py +++ b/spacy/tests/regression/test_issue4501-5000.py @@ -65,7 +65,7 @@ def test_issue4590(en_vocab): def test_issue4651_with_phrase_matcher_attr(): - """Test that the EntityRuler PhraseMatcher is deserialize correctly using + """Test that the EntityRuler PhraseMatcher is deserialized correctly using the method from_disk when the EntityRuler argument phrase_matcher_attr is specified. 
""" @@ -87,7 +87,7 @@ def test_issue4651_with_phrase_matcher_attr(): def test_issue4651_without_phrase_matcher_attr(): - """Test that the EntityRuler PhraseMatcher is deserialize correctly using + """Test that the EntityRuler PhraseMatcher is deserialized correctly using the method from_disk when the EntityRuler argument phrase_matcher_attr is not specified. """ @@ -139,8 +139,7 @@ def test_issue4665(): def test_issue4674(): """Test that setting entities with overlapping identifiers does not mess up IO""" nlp = English() - kb = KnowledgeBase(entity_vector_length=3) - kb.initialize(nlp.vocab) + kb = KnowledgeBase(nlp.vocab, entity_vector_length=3) vector1 = [0.9, 1.1, 1.01] vector2 = [1.8, 2.25, 2.01] with pytest.warns(UserWarning): @@ -156,10 +155,9 @@ def test_issue4674(): if not dir_path.exists(): dir_path.mkdir() file_path = dir_path / "kb" - kb.dump(str(file_path)) - kb2 = KnowledgeBase(entity_vector_length=3) - kb2.initialize(nlp.vocab) - kb2.load_bulk(str(file_path)) + kb.to_disk(str(file_path)) + kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=3) + kb2.from_disk(str(file_path)) assert kb2.get_size_entities() == 1 diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 93069d9a3..2ac886625 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -1,3 +1,4 @@ +from typing import Callable import warnings from unittest import TestCase import pytest @@ -70,13 +71,15 @@ def entity_linker(): nlp = Language() @registry.assets.register("TestIssue5230KB.v1") - def dummy_kb() -> KnowledgeBase: - kb = KnowledgeBase(entity_vector_length=1) - kb.initialize(nlp.vocab) - kb.add_entity("test", 0.0, zeros((1, 1), dtype="f")) - return kb + def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]: + def create_kb(vocab): + kb = KnowledgeBase(vocab, entity_vector_length=1) + kb.add_entity("test", 0.0, zeros((1, 1), dtype="f")) + return kb - config = {"kb": {"@assets": 
"TestIssue5230KB.v1"}} + return create_kb + + config = {"kb_loader": {"@assets": "TestIssue5230KB.v1"}} entity_linker = nlp.add_pipe("entity_linker", config=config) # need to add model for two reasons: # 1. no model leads to error in serialization, @@ -121,19 +124,17 @@ def test_writer_with_path_py35(): def test_save_and_load_knowledge_base(): nlp = Language() - kb = KnowledgeBase(entity_vector_length=1) - kb.initialize(nlp.vocab) + kb = KnowledgeBase(nlp.vocab, entity_vector_length=1) with make_tempdir() as d: path = d / "kb" try: - kb.dump(path) + kb.to_disk(path) except Exception as e: pytest.fail(str(e)) try: - kb_loaded = KnowledgeBase(entity_vector_length=1) - kb_loaded.initialize(nlp.vocab) - kb_loaded.load_bulk(path) + kb_loaded = KnowledgeBase(nlp.vocab, entity_vector_length=1) + kb_loaded.from_disk(path) except Exception as e: pytest.fail(str(e)) diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 1de137e81..f2b496d71 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -20,11 +20,11 @@ dev = "" [training.train_corpus] @readers = "spacy.Corpus.v1" -path = ${paths:train} +path = ${paths.train} [training.dev_corpus] @readers = "spacy.Corpus.v1" -path = ${paths:dev} +path = ${paths.dev} [training.batcher] @batchers = "batch_by_words.v1" @@ -57,7 +57,7 @@ factory = "tagger" [components.tagger.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model:width} +width = ${components.tok2vec.model.width} """ @@ -284,13 +284,13 @@ def test_config_overrides(): def test_config_interpolation(): config = Config().from_str(nlp_config_string, interpolate=False) - assert config["training"]["train_corpus"]["path"] == "${paths:train}" + assert config["training"]["train_corpus"]["path"] == "${paths.train}" interpolated = config.interpolate() assert interpolated["training"]["train_corpus"]["path"] == "" nlp = 
English.from_config(config) - assert nlp.config["training"]["train_corpus"]["path"] == "${paths:train}" + assert nlp.config["training"]["train_corpus"]["path"] == "${paths.train}" # Ensure that variables are preserved in nlp config - width = "${components.tok2vec.model:width}" + width = "${components.tok2vec.model.width}" assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width interpolated2 = nlp.config.interpolate() diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 3f33c6f06..3cf5485d7 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -1,4 +1,8 @@ -from spacy.util import ensure_path +from typing import Callable + +from spacy import util +from spacy.lang.en import English +from spacy.util import ensure_path, registry from spacy.kb import KnowledgeBase from ..util import make_tempdir @@ -15,20 +19,16 @@ def test_serialize_kb_disk(en_vocab): if not dir_path.exists(): dir_path.mkdir() file_path = dir_path / "kb" - kb1.dump(str(file_path)) - - kb2 = KnowledgeBase(entity_vector_length=3) - kb2.initialize(en_vocab) - kb2.load_bulk(str(file_path)) + kb1.to_disk(str(file_path)) + kb2 = KnowledgeBase(vocab=en_vocab, entity_vector_length=3) + kb2.from_disk(str(file_path)) # final assertions _check_kb(kb2) def _get_dummy_kb(vocab): - kb = KnowledgeBase(entity_vector_length=3) - kb.initialize(vocab) - + kb = KnowledgeBase(vocab, entity_vector_length=3) kb.add_entity(entity="Q53", freq=33, entity_vector=[0, 5, 3]) kb.add_entity(entity="Q17", freq=2, entity_vector=[7, 1, 0]) kb.add_entity(entity="Q007", freq=7, entity_vector=[0, 0, 7]) @@ -61,7 +61,7 @@ def _check_kb(kb): assert alias_string not in kb.get_alias_strings() # check candidates & probabilities - candidates = sorted(kb.get_candidates("double07"), key=lambda x: x.entity_) + candidates = 
sorted(kb.get_alias_candidates("double07"), key=lambda x: x.entity_) assert len(candidates) == 2 assert candidates[0].entity_ == "Q007" @@ -75,3 +75,47 @@ def _check_kb(kb): assert candidates[1].entity_vector == [7, 1, 0] assert candidates[1].alias_ == "double07" assert 0.099 < candidates[1].prior_prob < 0.101 + + +def test_serialize_subclassed_kb(): + """Check that IO of a custom KB works fine as part of an EL pipe.""" + + class SubKnowledgeBase(KnowledgeBase): + def __init__(self, vocab, entity_vector_length, custom_field): + super().__init__(vocab, entity_vector_length) + self.custom_field = custom_field + + @registry.assets.register("spacy.CustomKB.v1") + def custom_kb( + entity_vector_length: int, custom_field: int + ) -> Callable[["Vocab"], KnowledgeBase]: + def custom_kb_factory(vocab): + return SubKnowledgeBase( + vocab=vocab, + entity_vector_length=entity_vector_length, + custom_field=custom_field, + ) + + return custom_kb_factory + + nlp = English() + config = { + "kb_loader": { + "@assets": "spacy.CustomKB.v1", + "entity_vector_length": 342, + "custom_field": 666, + } + } + entity_linker = nlp.add_pipe("entity_linker", config=config) + assert type(entity_linker.kb) == SubKnowledgeBase + assert entity_linker.kb.entity_vector_length == 342 + assert entity_linker.kb.custom_field == 666 + + # Make sure the custom KB is serialized correctly + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + entity_linker2 = nlp2.get_pipe("entity_linker") + assert type(entity_linker2.kb) == SubKnowledgeBase + assert entity_linker2.kb.entity_vector_length == 342 + assert entity_linker2.kb.custom_field == 666 diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 1da257fd5..104c7c516 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -2,14 +2,16 @@ import pytest from spacy.gold import docs_to_json, biluo_tags_from_offsets from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs 
from spacy.lang.en import English -from spacy.schemas import ProjectConfigSchema, validate +from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate from spacy.cli.pretrain import make_docs -from spacy.cli.init_config import init_config, RECOMMENDATIONS_PATH -from spacy.cli.init_config import RecommendationSchema +from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli._util import validate_project_commands, parse_config_overrides -from spacy.util import get_lang_class +from spacy.cli._util import load_project_config, substitute_project_variables +from thinc.config import ConfigValidationError import srsly +from .util import make_tempdir + def test_cli_converters_conllu2json(): # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu @@ -296,6 +298,24 @@ def test_project_config_validation2(config, n_errors): assert len(errors) == n_errors +def test_project_config_interpolation(): + variables = {"a": 10, "b": {"c": "foo", "d": True}} + commands = [ + {"name": "x", "script": ["hello ${vars.a} ${vars.b.c}"]}, + {"name": "y", "script": ["${vars.b.c} ${vars.b.d}"]}, + ] + project = {"commands": commands, "vars": variables} + with make_tempdir() as d: + srsly.write_yaml(d / "project.yml", project) + cfg = load_project_config(d) + assert cfg["commands"][0]["script"][0] == "hello 10 foo" + assert cfg["commands"][1]["script"][0] == "foo true" + commands = [{"name": "x", "script": ["hello ${vars.a} ${vars.b.e}"]}] + project = {"commands": commands, "vars": variables} + with pytest.raises(ConfigValidationError): + substitute_project_variables(project) + + @pytest.mark.parametrize( "args,expected", [ @@ -335,7 +355,5 @@ def test_init_config(lang, pipeline, optimize): def test_model_recommendations(): - recommendations = srsly.read_json(RECOMMENDATIONS_PATH) - for lang, data in recommendations.items(): - assert get_lang_class(lang) + for lang, data in 
RECOMMENDATIONS.items(): assert RecommendationSchema(**data) diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index adac0f7c3..1fa0eeaa1 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -1,6 +1,6 @@ import pytest from spacy import displacy -from spacy.displacy.render import DependencyRenderer +from spacy.displacy.render import DependencyRenderer, EntityRenderer from spacy.tokens import Span from spacy.lang.fa import Persian @@ -97,3 +97,17 @@ def test_displacy_render_wrapper(en_vocab): assert html.endswith("/div>TEST") # Restore displacy.set_render_wrapper(lambda html: html) + + +def test_displacy_options_case(): + ents = ["foo", "BAR"] + colors = {"FOO": "red", "bar": "green"} + renderer = EntityRenderer({"ents": ents, "colors": colors}) + text = "abcd" + labels = ["foo", "bar", "FOO", "BAR"] + spans = [{"start": i, "end": i + 1, "label": labels[i]} for i in range(len(text))] + result = renderer.render_ents("abcde", spans, None).split("\n\n") + assert "red" in result[0] and "foo" in result[0] + assert "green" in result[1] and "bar" in result[1] + assert "red" in result[2] and "FOO" in result[2] + assert "green" in result[3] and "BAR" in result[3] diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index a13299fff..9fda1800b 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -47,9 +47,9 @@ cdef class Tokenizer: `infix_finditer` (callable): A function matching the signature of `re.compile(string).finditer` to find infixes. token_match (callable): A boolean function matching strings to be - recognised as tokens. + recognized as tokens. url_match (callable): A boolean function matching strings to be - recognised as tokens after considering prefixes and suffixes. + recognized as tokens after considering prefixes and suffixes. 
EXAMPLE: >>> tokenizer = Tokenizer(nlp.vocab) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 15dafb86d..cd080bf35 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -102,8 +102,7 @@ cdef class Doc: Construction 2 >>> from spacy.tokens import Doc - >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], - >>> spaces=[True, False, False]) + >>> doc = Doc(nlp.vocab, words=["hello", "world", "!"], spaces=[True, False, False]) DOCS: https://spacy.io/api/doc """ @@ -1194,8 +1193,7 @@ cdef class Doc: retokenizer.merge(span, attributes[i]) def to_json(self, underscore=None): - """Convert a Doc to JSON. The format it produces will be the new format - for the `spacy train` command (not implemented yet). + """Convert a Doc to JSON. underscore (list): Optional list of string names of custom doc._. attributes. Attribute values need to be JSON-serializable. Values will diff --git a/spacy/util.py b/spacy/util.py index 3cf165a4f..736f4d805 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1,5 +1,5 @@ from typing import List, Union, Dict, Any, Optional, Iterable, Callable, Tuple -from typing import Iterator, Type, Pattern, TYPE_CHECKING +from typing import Iterator, Type, Pattern, Generator, TYPE_CHECKING from types import ModuleType import os import importlib @@ -249,7 +249,16 @@ def load_model_from_package( disable: Iterable[str] = tuple(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), ) -> "Language": - """Load a model from an installed package.""" + """Load a model from an installed package. + + name (str): The package name. + vocab (Vocab / True): Optional vocab to pass in on initialization. If True, + a new Vocab object will be created. + disable (Iterable[str]): Names of pipeline components to disable. + config (Dict[str, Any] / Config): Config overrides as nested dict or dict + keyed by section values in dot notation. + RETURNS (Language): The loaded nlp object. 
+ """ cls = importlib.import_module(name) return cls.load(vocab=vocab, disable=disable, config=config) @@ -263,7 +272,17 @@ def load_model_from_path( config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), ) -> "Language": """Load a model from a data directory path. Creates Language class with - pipeline from config.cfg and then calls from_disk() with path.""" + pipeline from config.cfg and then calls from_disk() with path. + + name (str): Package name or model path. + meta (Dict[str, Any]): Optional model meta. + vocab (Vocab / True): Optional vocab to pass in on initialization. If True, + a new Vocab object will be created. + disable (Iterable[str]): Names of pipeline components to disable. + config (Dict[str, Any] / Config): Config overrides as nested dict or dict + keyed by section values in dot notation. + RETURNS (Language): The loaded nlp object. + """ if not model_path.exists(): raise IOError(Errors.E052.format(path=model_path)) if not meta: @@ -284,6 +303,15 @@ def load_model_from_config( ) -> Tuple["Language", Config]: """Create an nlp object from a config. Expects the full config file including a section "nlp" containing the settings for the nlp object. + + name (str): Package name or model path. + meta (Dict[str, Any]): Optional model meta. + vocab (Vocab / True): Optional vocab to pass in on initialization. If True, + a new Vocab object will be created. + disable (Iterable[str]): Names of pipeline components to disable. + auto_fill (bool): Whether to auto-fill config with missing defaults. + validate (bool): Whether to show config validation errors. + RETURNS (Language): The loaded nlp object. """ if "nlp" not in config: raise ValueError(Errors.E985.format(config=config)) @@ -308,6 +336,13 @@ def load_model_from_init_py( ) -> "Language": """Helper function to use in the `load()` method of a model package's __init__.py. + + vocab (Vocab / True): Optional vocab to pass in on initialization. If True, + a new Vocab object will be created. 
+ disable (Iterable[str]): Names of pipeline components to disable. + config (Dict[str, Any] / Config): Config overrides as nested dict or dict + keyed by section values in dot notation. + RETURNS (Language): The loaded nlp object. """ model_path = Path(init_file).parent meta = get_model_meta(model_path) @@ -325,7 +360,14 @@ def load_config( overrides: Dict[str, Any] = SimpleFrozenDict(), interpolate: bool = False, ) -> Config: - """Load a config file. Takes care of path validation and section order.""" + """Load a config file. Takes care of path validation and section order. + + path (Union[str, Path]): Path to the config file. + overrides: (Dict[str, Any]): Config overrides as nested dict or + dict keyed by section values in dot notation. + interpolate (bool): Whether to interpolate and resolve variables. + RETURNS (Config): The loaded config. + """ config_path = ensure_path(path) if not config_path.exists() or not config_path.is_file(): raise IOError(Errors.E053.format(path=config_path, name="config.cfg")) @@ -337,7 +379,12 @@ def load_config( def load_config_from_str( text: str, overrides: Dict[str, Any] = SimpleFrozenDict(), interpolate: bool = False ): - """Load a full config from a string.""" + """Load a full config from a string. Wrapper around Thinc's Config.from_str. + + text (str): The string config to load. + interpolate (bool): Whether to interpolate and resolve variables. + RETURNS (Config): The loaded config. + """ return Config(section_order=CONFIG_SECTION_ORDER).from_str( text, overrides=overrides, interpolate=interpolate, ) @@ -435,19 +482,18 @@ def get_base_version(version: str) -> str: return Version(version).base_version -def get_model_meta(path: Union[str, Path]) -> Dict[str, Any]: - """Get model meta.json from a directory path and validate its contents. +def load_meta(path: Union[str, Path]) -> Dict[str, Any]: + """Load a model meta.json from a path and validate its contents. - path (str / Path): Path to model directory. 
- RETURNS (Dict[str, Any]): The model's meta data. + path (Union[str, Path]): Path to meta.json. + RETURNS (Dict[str, Any]): The loaded meta. """ - model_path = ensure_path(path) - if not model_path.exists(): - raise IOError(Errors.E052.format(path=model_path)) - meta_path = model_path / "meta.json" - if not meta_path.is_file(): - raise IOError(Errors.E053.format(path=meta_path, name="meta.json")) - meta = srsly.read_json(meta_path) + path = ensure_path(path) + if not path.parent.exists(): + raise IOError(Errors.E052.format(path=path.parent)) + if not path.exists() or not path.is_file(): + raise IOError(Errors.E053.format(path=path, name="meta.json")) + meta = srsly.read_json(path) for setting in ["lang", "name", "version"]: if setting not in meta or not meta[setting]: raise ValueError(Errors.E054.format(setting=setting)) @@ -471,6 +517,16 @@ def get_model_meta(path: Union[str, Path]) -> Dict[str, Any]: return meta +def get_model_meta(path: Union[str, Path]) -> Dict[str, Any]: + """Get model meta.json from a directory path and validate its contents. + + path (str / Path): Path to model directory. + RETURNS (Dict[str, Any]): The model's meta data. + """ + model_path = ensure_path(path) + return load_meta(model_path / "meta.json") + + def is_package(name: str) -> bool: """Check if string maps to a package installed via pip. @@ -554,7 +610,7 @@ def working_dir(path: Union[str, Path]) -> None: @contextmanager -def make_tempdir() -> None: +def make_tempdir() -> Generator[Path, None, None]: """Execute a block in a temporary directory and remove the directory and its contents at the end of the with block. @@ -886,6 +942,15 @@ def escape_html(text: str) -> str: def get_words_and_spaces( words: Iterable[str], text: str ) -> Tuple[List[str], List[bool]]: + """Given a list of words and a text, reconstruct the original tokens and + return a list of words and spaces that can be used to create a Doc. 
This + can help recover destructive tokenization that didn't preserve any + whitespace information. + + words (Iterable[str]): The words. + text (str): The original text. + RETURNS (Tuple[List[str], List[bool]]): The words and spaces. + """ if "".join("".join(words).split()) != "".join(text.split()): raise ValueError(Errors.E194.format(text=text, words=words)) text_words = [] diff --git a/website/README.md b/website/README.md index c1f6e5805..f3a64d1cb 100644 --- a/website/README.md +++ b/website/README.md @@ -75,7 +75,8 @@ import { H1, H2, H3, H4, H5, Label, InlineList, Comment } from Headlines are set in [HK Grotesk](http://cargocollective.com/hanken/HK-Grotesk-Open-Source-Font) by Hanken Design. All other body text and code uses the best-matching default -system font to provide a "native" reading experience. +system font to provide a "native" reading experience. All code uses the +[JetBrains Mono](https://www.jetbrains.com/lp/mono/) typeface by JetBrains. @@ -106,7 +107,7 @@ Tags are also available as standalone `` components. | Argument | Example | Result | | -------- | -------------------------- | ----------------------------------------- | | `tag` | `{tag="method"}` | method | -| `new` | `{new="2"}` | 2 | +| `new` | `{new="3"}` | 3 | | `model` | `{model="tagger, parser"}` | tagger, parser | | `hidden` | `{hidden="true"}` | | @@ -130,6 +131,8 @@ Special link styles are used depending on the link URL. - [I am a regular external link](https://explosion.ai) - [I am a link to the documentation](/api/doc) +- [I am a link to an architecture](/api/architectures#HashEmbedCNN) +- [I am a link to a model](/models/en#en_core_web_sm) - [I am a link to GitHub](https://github.com/explosion/spaCy) ### Abbreviations {#abbr} @@ -188,18 +191,20 @@ the buttons are implemented as styled links instead of native button elements. +
+ ## Components -### Table +### Table {#table} > #### Markdown > > ```markdown_ > | Header 1 | Header 2 | -> | --- | --- | +> | -------- | -------- | > | Column 1 | Column 2 | > ``` > @@ -213,7 +218,7 @@ the buttons are implemented as styled links instead of native button elements. > ``` Tables are used to present data and API documentation. Certain keywords can be -used to mark a footer row with a distinct style, for example to visualise the +used to mark a footer row with a distinct style, for example to visualize the return values of a documented function. | Header 1 | Header 2 | Header 3 | Header 4 | @@ -224,7 +229,73 @@ return values of a documented function. | Column 1 | Column 2 | Column 3 | Column 4 | | **RETURNS** | Column 2 | Column 3 | Column 4 | -### List +Tables also support optional "divider" rows that are typically used to denote +keyword-only arguments in API documentation. To turn a row into a dividing +headline, it should only include content in its first cell, and its value should +be italicized: + +> #### Markdown +> +> ```markdown_ +> | Header 1 | Header 2 | Header 3 | +> | -------- | -------- | -------- | +> | Column 1 | Column 2 | Column 3 | +> | _Hello_ | | | +> | Column 1 | Column 2 | Column 3 | +> ``` + +| Header 1 | Header 2 | Header 3 | +| -------- | -------- | -------- | +| Column 1 | Column 2 | Column 3 | +| _Hello_ | | | +| Column 1 | Column 2 | Column 3 | + +### Type Annotations {#type-annotations} + +> #### Markdown +> +> ```markdown_ +> ~~Model[List[Doc], Floats2d]~~ +> ``` +> +> #### JSX +> +> ```markup +> Model[List[Doc], Floats2d] +> ``` + +Type annotations are special inline code blocks that are used to describe Python +types in the [type hints](https://docs.python.org/3/library/typing.html) format. +The special component will split the type, apply syntax highlighting and link +all types that specify links in `meta/type-annotations.json`. Types can link to +internal or external documentation pages.
To make it easy to represent the type +annotations in Markdown, the rendering "hijacks" the `~~` tags that would +typically be converted to a `<del>` element – but in this case, text surrounded +by `~~` becomes a type annotation. + +- ~~Dict[str, List[Union[Doc, Span]]]~~ +- ~~Model[List[Doc], List[numpy.ndarray]]~~ + +Type annotations support a special visual style in tables and will render as a +separate row, under the cell text. This allows the API docs to display complex +types without taking up too much space in the cell. The type annotation should +always be the **last element** in the row. + +> #### Markdown +> +> ```markdown_ +> | Header 1 | Header 2 | +> | -------- | ----------------------- | +> | Column 1 | Column 2 ~~List[Doc]~~ | +> ``` + +| Name | Description | +| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. ~~Model[List[Doc], FullTransformerBatch]~~ | +| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs and can set additional annotations on the `Doc`. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ | + +### List {#list} > #### Markdown > @@ -255,7 +326,7 @@ automatically. 3. Lorem ipsum dolor 4. consectetur adipiscing elit -### Aside +### Aside {#aside} > #### Markdown > @@ -280,7 +351,7 @@ To make them easier to use in Markdown, paragraphs formatted as blockquotes will turn into asides by default. Level 4 headlines (with a leading `####`) will become aside titles. -### Code Block +### Code Block {#code-block} > #### Markdown > @@ -387,7 +458,7 @@ original file is shown at the top of the widget.
https://github.com/explosion/spaCy/tree/master/spacy/language.py ``` -### Infobox +### Infobox {#infobox} import Infobox from 'components/infobox' @@ -425,7 +496,7 @@ blocks.
-### Accordion +### Accordion {#accordion} import Accordion from 'components/accordion' diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index cc6f44fcc..3089fa1b3 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -11,9 +11,17 @@ menu: - ['Entity Linking', 'entitylinker'] --- -TODO: intro and how architectures work, link to -[`registry`](/api/top-level#registry), -[custom models](/usage/training#custom-models) usage etc. +A **model architecture** is a function that wires up a +[`Model`](https://thinc.ai/docs/api-model) instance, which you can then use in a +pipeline component or as a layer of a larger network. This page documents +spaCy's built-in architectures that are used for different NLP tasks. All +trainable [built-in components](/api#architecture-pipeline) expect a `model` +argument defined in the config and document their default architecture. +Custom architectures can be registered using the +[`@spacy.registry.architectures`](/api/top-level#registry) decorator and used as +part of the [training config](/usage/training#custom-functions). Also see the +usage documentation on +[layers and model architectures](/usage/layers-architectures). ## Tok2Vec architectures {#tok2vec-arch source="spacy/ml/models/tok2vec.py"} @@ -33,18 +41,19 @@ TODO: intro and how architectures work, link to > subword_features = true > ``` -Build spaCy's 'standard' tok2vec layer, which uses hash embedding with subword +Build spaCy's "standard" embedding layer, which uses hash embedding with subword features and a CNN with layer-normalized maxout.
-| Name | Type | Description | -| -------------------- | ---- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `width` | int | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. | -| `depth` | int | The number of convolutional layers to use. Recommended values are between `2` and `8`. | -| `embed_size` | int | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. | -| `window_size` | int | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * (window_size * 2 + 1)`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. | -| `maxout_pieces` | int | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. | -| `subword_features` | bool | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. | -| `pretrained_vectors` | bool | Whether to also use static vectors. 
| +| Name | Description | +| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ | +| `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ | +| `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ | +| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * (window_size * 2 + 1)`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ | +| `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ | +| `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ | +| `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | ### spacy.Tok2Vec.v1 {#Tok2Vec} @@ -67,10 +76,11 @@ Construct a tok2vec model out of embedding and encoding subnetworks. See the ["Embed, Encode, Attend, Predict"](https://explosion.ai/blog/deep-learning-formula-nlp) blog post for background. 
-| Name | Type | Description | -| -------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `embed` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Embed tokens into context-independent word vector representations. For example, [CharacterEmbed](/api/architectures#CharacterEmbed) or [MultiHashEmbed](/api/architectures#MultiHashEmbed) | -| `encode` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Floats2d]`. **Output:** `List[Floats2d]`. Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder). | +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `embed` | Embed tokens into context-independent word vector representations. For example, [CharacterEmbed](/api/architectures#CharacterEmbed) or [MultiHashEmbed](/api/architectures#MultiHashEmbed). ~~Model[List[Doc], List[Floats2d]]~~ | +| `encode` | Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder). ~~Model[List[Floats2d], List[Floats2d]]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | ### spacy.Tok2VecListener.v1 {#Tok2VecListener} @@ -92,7 +102,7 @@ blog post for background. 
> > [components.tagger.model.tok2vec] > @architectures = "spacy.Tok2VecListener.v1" -> width = ${components.tok2vec.model:width} +> width = ${components.tok2vec.model.width} > ``` A listener is used as a sublayer within a component such as a @@ -108,10 +118,11 @@ Instead of defining its own `Tok2Vec` instance, a model architecture like [Tagger](/api/architectures#tagger) can define a listener as its `tok2vec` argument that connects to the shared `tok2vec` component in the pipeline. -| Name | Type | Description | -| ---------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `width` | int | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. | -| `upstream` | str | A string to identify the "upstream" `Tok2Vec` component to communicate with. The upstream name should either be the wildcard string `"*"`, or the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. | +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `width` | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. ~~int~~ | +| `upstream` | A string to identify the "upstream" `Tok2Vec` component to communicate with. The upstream name should either be the wildcard string `"*"`, or the name of the `Tok2Vec` component. 
You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | ### spacy.MultiHashEmbed.v1 {#MultiHashEmbed} @@ -134,12 +145,13 @@ definitions depending on the `Vocab` of the `Doc` object passed in. Vectors from pretrained static vectors can also be incorporated into the concatenated representation. -| Name | Type | Description | -| ------------------------- | ---- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `width` | int | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. | -| `rows` | int | The number of rows for the embedding tables. Can be low, due to the hashing trick. Embeddings for prefix, suffix and word shape use half as many rows. Recommended values are between `2000` and `10000`. | -| `also_embed_subwords` | bool | Whether to use the `PREFIX`, `SUFFIX` and `SHAPE` features in the embeddings. If not using these, you may need more rows in your hash embeddings, as there will be increased chance of collisions. | -| `also_use_static_vectors` | bool | Whether to also use static word vectors. Requires a vectors table to be loaded in the [Doc](/api/doc) objects' vocab. | +| Name | Description | +| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. ~~int~~ | +| `rows` | The number of rows for the embedding tables. Can be low, due to the hashing trick. 
Embeddings for prefix, suffix and word shape use half as many rows. Recommended values are between `2000` and `10000`. ~~int~~ | +| `also_embed_subwords` | Whether to use the `PREFIX`, `SUFFIX` and `SHAPE` features in the embeddings. If not using these, you may need more rows in your hash embeddings, as there will be increased chance of collisions. ~~bool~~ | +| `also_use_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [Doc](/api/doc) objects' vocab. ~~bool~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | ### spacy.CharacterEmbed.v1 {#CharacterEmbed} @@ -170,12 +182,13 @@ concatenated. A hash-embedded vector of the `NORM` of the word is also concatenated on, and the result is then passed through a feed-forward network to construct a single vector to represent the information. -| Name | Type | Description | -| ------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `width` | int | The width of the output vector and the `NORM` hash embedding. | -| `rows` | int | The number of rows in the `NORM` hash embedding table. | -| `nM` | int | The dimensionality of the character embeddings. Recommended values are between `16` and `64`. | -| `nC` | int | The number of UTF-8 bytes to embed per word. Recommended values are between `3` and `8`, although it may depend on the length of words in the language. | +| Name | Description | +| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `width` | The width of the output vector and the `NORM` hash embedding. ~~int~~ | +| `rows` | The number of rows in the `NORM` hash embedding table. ~~int~~ | +| `nM` | The dimensionality of the character embeddings. 
Recommended values are between `16` and `64`. ~~int~~ | +| `nC` | The number of UTF-8 bytes to embed per word. Recommended values are between `3` and `8`, although it may depend on the length of words in the language. ~~int~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | ### spacy.MaxoutWindowEncoder.v1 {#MaxoutWindowEncoder} @@ -193,12 +206,13 @@ construct a single vector to represent the information. Encode context using convolutions with maxout activation, layer normalization and residual connections. -| Name | Type | Description | -| --------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `width` | int | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. | -| `window_size` | int | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. | -| `maxout_pieces` | int | The number of maxout pieces to use. Recommended values are `2` or `3`. | -| `depth` | int | The number of convolutional layers. Recommended value is `4`. | +| Name | Description | +| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `width` | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. ~~int~~ | +| `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. 
~~int~~ | +| `maxout_pieces` | The number of maxout pieces to use. Recommended values are `2` or `3`. ~~int~~ | +| `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Floats2d], List[Floats2d]]~~ | ### spacy.MishWindowEncoder.v1 {#MishWindowEncoder} @@ -216,11 +230,12 @@ Encode context using convolutions with [`Mish`](https://thinc.ai/docs/api-layers#mish) activation, layer normalization and residual connections. -| Name | Type | Description | -| ------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `width` | int | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. | -| `window_size` | int | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. | -| `depth` | int | The number of convolutional layers. Recommended value is `4`. | +| Name | Description | +| ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `width` | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. ~~int~~ | +| `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~ | +| `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ | +| **CREATES** | The model using the architecture. 
~~Model[List[Floats2d], List[Floats2d]]~~ | ### spacy.TorchBiLSTMEncoder.v1 {#TorchBiLSTMEncoder} @@ -237,18 +252,58 @@ and residual connections. Encode context using bidirectional LSTM layers. Requires [PyTorch](https://pytorch.org). -| Name | Type | Description | -| ------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `width` | int | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. | -| `window_size` | int | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. | -| `depth` | int | The number of convolutional layers. Recommended value is `4`. | +| Name | Description | +| ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `width` | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. ~~int~~ | +| `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~ | +| `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ | +| **CREATES** | The model using the architecture. 
~~Model[List[Floats2d], List[Floats2d]]~~ | + +### spacy.StaticVectors.v1 {#StaticVectors} + +> #### Example config +> +> ```ini +> [model] +> @architectures = "spacy.StaticVectors.v1" +> nO = null +> nM = null +> dropout = 0.2 +> key_attr = "ORTH" +> +> [model.init_W] +> @initializers = "glorot_uniform_init.v1" +> ``` + +Embed [`Doc`](/api/doc) objects with their vocab's vectors table, applying a +learned linear projection to control the dimensionality. See the documentation +on [static vectors](/usage/embeddings-transformers#static-vectors) for details. + +| Name | Description | +| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `nO` | The output width of the layer, after the linear projection. ~~Optional[int]~~ | +| `nM` | The width of the static vectors. ~~Optional[int]~~ | +| `dropout` | Optional dropout rate. If set, it's applied per dimension over the whole batch. Defaults to `None`. ~~Optional[float]~~ | +| `init_W` | The [initialization function](https://thinc.ai/docs/api-initializers). Defaults to [`glorot_uniform_init`](https://thinc.ai/docs/api-initializers#glorot_uniform_init). ~~Callable[[Ops, Tuple[int, ...]], FloatsXd]~~ | +| `key_attr` | Defaults to `"ORTH"`. ~~str~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~ | ## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"} The following architectures are provided by the package [`spacy-transformers`](https://github.com/explosion/spacy-transformers). See the -[usage documentation](/usage/transformers) for how to integrate the -architectures into your training config.
+[usage documentation](/usage/embeddings-transformers#transformers) for how to +integrate the architectures into your training config. + + + +Note that in order to use these architectures in your config, you need to +install the +[`spacy-transformers`](https://github.com/explosion/spacy-transformers) package. See the +[installation docs](/usage/embeddings-transformers#transformers-installation) +for details and system requirements. + + ### spacy-transformers.TransformerModel.v1 {#TransformerModel} @@ -266,13 +321,30 @@ architectures into your training config. > stride = 96 > ``` - +Load and wrap a transformer model from the +[HuggingFace `transformers`](https://huggingface.co/transformers) library. You +can use any transformer that has pretrained weights and a PyTorch implementation. +The `name` variable is passed through to the underlying library, so it can be +either a string or a path. If it's a string, the pretrained weights will be +downloaded via the transformers library if they are not already available +locally. -| Name | Type | Description | -| ------------------ | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `name` | str | Any model name that can be loaded by [`transformers.AutoModel`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoModel). | -| `get_spans` | `Callable` | Function that takes a batch of [`Doc`](/api/doc) object and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. | -| `tokenizer_config` | `Dict[str, Any]` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer).
+In order to support longer documents, the +[TransformerModel](/api/architectures#TransformerModel) layer allows you to pass +in a `get_spans` function that will divide up the [`Doc`](/api/doc) objects +before passing them through the transformer. Your spans are allowed to overlap +or exclude tokens. This layer is usually used directly by the +[`Transformer`](/api/transformer) component, which allows you to share the +transformer weights across your pipeline. For a layer that's configured for use +in other components, see +[Tok2VecTransformer](/api/architectures#Tok2VecTransformer). + +| Name | Description | +| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | Any model name that can be loaded by [`transformers.AutoModel`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoModel). ~~str~~ | +| `get_spans` | Function that takes a batch of [`Doc`](/api/doc) objects and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. ~~Callable[[List[Doc]], List[Span]]~~ | +| `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], FullTransformerBatch]~~ | ### spacy-transformers.Tok2VecListener.v1 {#transformers-Tok2VecListener} @@ -297,10 +369,11 @@ operate over wordpieces, which usually don't align one-to-one against spaCy tokens. The layer therefore requires a reduction operation in order to calculate a single token vector given zero or more wordpiece vectors.
-| Name | Type | Description | -| ------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `pooling` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** [`Ragged`](https://thinc.ai/docs/api-types#ragged). **Output:** [`Floats2d`](https://thinc.ai/docs/api-types#types) | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. | -| `grad_factor` | float | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. | +| Name | Description | +| ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `pooling` | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. ~~Model[Ragged, Floats2d]~~ | +| `grad_factor` | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. ~~float~~ | +| **CREATES** | The model using the architecture. 
~~Model[List[Doc], List[Floats2d]]~~ | ### spacy-transformers.Tok2VecTransformer.v1 {#Tok2VecTransformer} @@ -320,12 +393,13 @@ Use a transformer as a [`Tok2Vec`](/api/tok2vec) layer directly. This does object, but it's a **simpler solution** if you only need the transformer within one component. -| Name | Type | Description | -| ------------------ | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `get_spans` | callable | Function that takes a batch of [`Doc`](/api/doc) object and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. | -| `tokenizer_config` | `Dict[str, Any]` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). | -| `pooling` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** [`Ragged`](https://thinc.ai/docs/api-types#ragged). **Output:** [`Floats2d`](https://thinc.ai/docs/api-types#types) | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. | -| `grad_factor` | float | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. 
| +| Name | Description | +| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_spans` | Function that takes a batch of [`Doc`](/api/doc) objects and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. ~~Callable[[List[Doc]], List[Span]]~~ | +| `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ | +| `pooling` | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. ~~Model[Ragged, Floats2d]~~ | +| `grad_factor` | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. ~~float~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | ## Parser & NER architectures {#parser} @@ -351,7 +425,7 @@ one component. > subword_features = true > ``` -Build a transition-based parser model. Can apply to NER or dependency-parsing. +Build a transition-based parser model. Can apply to NER or dependency parsing. Transition-based parsing is an approach to structured prediction where the task of predicting the structure is mapped to a series of state transitions.
You might find [this tutorial](https://explosion.ai/blog/parsing-english-in-python) @@ -368,14 +442,15 @@ consists of either two or three subnetworks: state representation. If not present, the output from the lower model is used as action scores directly. -| Name | Type | Description | -| ------------------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Subnetwork to map tokens into vector representations. | -| `nr_feature_tokens` | int | The number of tokens in the context to use to construct the state vector. Valid choices are `1`, `2`, `3`, `6`, `8` and `13`. The `2`, `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. | -| `hidden_width` | int | The width of the hidden layer. | -| `maxout_pieces` | int | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. | -| `use_upper` | bool | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. 
| -| `nO` | int | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. | +| Name | Description | +| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | +| `nr_feature_tokens` | The number of tokens in the context to use to construct the state vector. Valid choices are `1`, `2`, `3`, `6`, `8` and `13`. The `2`, `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. ~~int~~ | +| `hidden_width` | The width of the hidden layer. ~~int~~ | +| `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ | +| `use_upper` | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ | +| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ | +| **CREATES** | The model using the architecture. 
~~Model[List[Doc], List[List[Floats2d]]]~~ | ### spacy.BILUOTagger.v1 {#BILUOTagger source="spacy/ml/models/simple_ner.py"} @@ -402,9 +477,10 @@ generally results in better linear separation between classes, especially for non-CRF models, because there are more distinct classes for the different situations ([Ratinov et al., 2009](https://www.aclweb.org/anthology/W09-1119/)). -| Name | Type | Description | -| --------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------- | -| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Subnetwork to map tokens into vector representations. | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------ | +| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | ### spacy.IOBTagger.v1 {#IOBTagger source="spacy/ml/models/simple_ner.py"} @@ -427,9 +503,10 @@ spans into tags assigned to each token. The first token of a span is given the tag B-LABEL, and subsequent tokens are given the tag I-LABEL. All other tokens are assigned the tag O. -| Name | Type | Description | -| --------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------- | -| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Subnetwork to map tokens into vector representations. | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------ | +| `tok2vec` | Subnetwork to map tokens into vector representations. 
~~Model[List[Doc], List[Floats2d]]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | ## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"} @@ -450,10 +527,11 @@ Build a tagger model, using a provided token-to-vector component. The tagger model simply adds a linear layer with softmax activation to predict scores given the token vectors. -| Name | Type | Description | -| --------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------- | -| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Subnetwork to map tokens into vector representations. | -| `nO` | int | The number of tags to output. Inferred from the data if `None`. | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------ | +| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | +| `nO` | The number of tags to output. Inferred from the data if `None`. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | ## Text classification architectures {#textcat source="spacy/ml/models/textcat.py"} @@ -470,9 +548,6 @@ specific data and challenge. ### spacy.TextCatEnsemble.v1 {#TextCatEnsemble} -Stacked ensemble of a bag-of-words model and a neural network model. The neural -network has an internal CNN Tok2Vec layer and uses attention. - > #### Example Config > > ```ini @@ -489,18 +564,21 @@ network has an internal CNN Tok2Vec layer and uses attention. 
> nO = null > ``` -| Name | Type | Description | -| --------------------------- | ----- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. | -| `pretrained_vectors` | bool | Whether or not pretrained vectors will be used in addition to the feature vectors. | -| `width` | int | Output dimension of the feature encoding step. | -| `embed_size` | int | Input dimension of the feature encoding step. | -| `conv_depth` | int | Depth of the Tok2Vec layer. | -| `window_size` | int | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. | -| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. | -| `dropout` | float | The dropout rate. | -| `nO` | int | Output dimension, determined by the number of different labels. If not set, the the [`TextCategorizer`](/api/textcategorizer) component will set it when | -| `begin_training` is called. | +Stacked ensemble of a bag-of-words model and a neural network model. The neural +network has an internal CNN Tok2Vec layer and uses attention. + +| Name | Description | +| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | +| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~ | +| `width` | Output dimension of the feature encoding step. ~~int~~ | +| `embed_size` | Input dimension of the feature encoding step. 
~~int~~ | +| `conv_depth` | Depth of the tok2vec layer. ~~int~~ | +| `window_size` | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~ | +| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ | +| `dropout` | The dropout rate. ~~float~~ | +| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | ### spacy.TextCatCNN.v1 {#TextCatCNN} @@ -527,11 +605,12 @@ A neural network model where token vectors are calculated using a CNN. The vectors are mean pooled and used as features in a feed-forward network. This architecture is usually less accurate than the ensemble, but runs faster. -| Name | Type | Description | -| ------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. | -| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | The [`tok2vec`](#tok2vec) layer of the model. | -| `nO` | int | Output dimension, determined by the number of different labels. If not set, the the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. 
| +| Name | Description | +| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | +| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ | +| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | ### spacy.TextCatBOW.v1 {#TextCatBOW} @@ -549,12 +628,13 @@ architecture is usually less accurate than the ensemble, but runs faster. An ngram "bag-of-words" model. This architecture should run much faster than the others, but may not be as accurate, especially if texts are short. -| Name | Type | Description | -| ------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. | -| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. | -| `no_output_layer` | float | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes=True`, else `Logistic`. | -| `nO` | int | Output dimension, determined by the number of different labels. If not set, the the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. 
| +| Name | Description | +| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ | +| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ | +| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ | +| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | ## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"} @@ -571,9 +651,6 @@ into the "real world". This requires 3 main components: ### spacy.EntityLinker.v1 {#EntityLinker} -The `EntityLinker` model architecture is a `Thinc` `Model` with a Linear output -layer. - > #### Example Config > > ```ini @@ -599,27 +676,28 @@ layer. > @assets = "spacy.CandidateGenerator.v1" > ``` -| Name | Type | Description | -| --------- | ------------------------------------------ | ---------------------------------------------------------------------------------------- | -| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | The [`tok2vec`](#tok2vec) layer of the model. | -| `nO` | int | Output dimension, determined by the length of the vectors encoding each entity in the KB | +The `EntityLinker` model architecture is a Thinc `Model` with a +[`Linear`](https://thinc.ai/docs/api-layers#linear) output layer. -If the `nO` dimension is not set, the Entity Linking component will set it when -`begin_training` is called. 
+| Name | Description | +| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ | +| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `begin_training` is called. ~~Optional[int]~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | ### spacy.EmptyKB.v1 {#EmptyKB} A function that creates a default, empty `KnowledgeBase` from a [`Vocab`](/api/vocab) instance. -| Name | Type | Description | -| ---------------------- | ---- | ------------------------------------------------------------------------- | -| `entity_vector_length` | int | The length of the vectors encoding each entity in the KB - 64 by default. | +| Name | Description | +| ---------------------- | ----------------------------------------------------------------------------------- | +| `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ | ### spacy.CandidateGenerator.v1 {#CandidateGenerator} A function that takes as input a [`KnowledgeBase`](/api/kb) and a [`Span`](/api/span) object denoting a named entity, and returns a list of -plausible [`Candidate` objects](/api/kb/#candidate_init). The default +plausible [`Candidate`](/api/kb/#candidate) objects. The default `CandidateGenerator` simply uses the text of a mention to find its potential aliases in the `KnowledgeBase`. Note that this function is case-dependent. 
diff --git a/website/docs/api/attributeruler.md b/website/docs/api/attributeruler.md index e2f009cad..98f267e87 100644 --- a/website/docs/api/attributeruler.md +++ b/website/docs/api/attributeruler.md @@ -31,10 +31,10 @@ how the component should be configured. You can override its settings via the > nlp.add_pipe("attribute_ruler", config=config) > ``` -| Setting | Type | Description | Default | -| --------------- | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------- | ------- | -| `pattern_dicts` | `Iterable[dict]` | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](#add) (`patterns`/`attrs`/`index`) to add as patterns. | `None` | -| `validate` | bool | Whether patterns should be validated (passed to the `Matcher`). | `False` | +| Setting | Description | +| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `pattern_dicts` | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](/api/attributeruler#add) (`patterns`/`attrs`/`index`) to add as patterns. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~ | +| `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. 
~~bool~~ | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/attributeruler.py @@ -47,10 +47,10 @@ be a list of dictionaries with `"patterns"`, `"attrs"`, and optional `"index"` keys, e.g.: ```python -pattern_dicts = \[ - {"patterns": \[\[{"TAG": "VB"}\]\], "attrs": {"POS": "VERB"}}, - {"patterns": \[\[{"LOWER": "an"}\]\], "attrs": {"LEMMA": "a"}}, -\] +pattern_dicts = [ + {"patterns": [[{"TAG": "VB"}]], "attrs": {"POS": "VERB"}}, + {"patterns": [[{"LOWER": "an"}]], "attrs": {"LEMMA": "a"}}, +] ``` > #### Example @@ -60,23 +60,23 @@ pattern_dicts = \[ > attribute_ruler = nlp.add_pipe("attribute_ruler") > ``` -| Name | Type | Description | -| --------------- | ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. | -| `name` | str | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. | -| _keyword-only_ | | | -| `pattern_dicts` | `Iterable[Dict]]` | Optional patterns to load in on initialization. Defaults to `None`. | -| `validate` | bool | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. | +| Name | Description | +| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary to pass to the matcher. ~~Vocab~~ | +| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. 
~~str~~ | +| _keyword-only_ | | +| `pattern_dicts` | Optional patterns to load in on initialization. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~ | +| `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ | ## AttributeRuler.\_\_call\_\_ {#call tag="method"} Apply the attribute ruler to a Doc, setting token attributes for tokens matched by the provided patterns. -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------ | -| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. | -| **RETURNS** | `Doc` | The modified `Doc` with added entities, if available. | +| Name | Description | +| ----------- | -------------------------------- | +| `doc` | The document to process. ~~Doc~~ | +| **RETURNS** | The processed document. ~~Doc~~ | ## AttributeRuler.add {#add tag="method"} @@ -95,11 +95,11 @@ may be negative to index from the end of the span. > attribute_ruler.add(patterns=patterns, attrs=attrs) > ``` -| Name | Type | Description | -| -------- | ---------------------- | ----------------------------------------------------------------------------------------------------------------------- | -| patterns | `Iterable[List[Dict]]` | A list of Matcher patterns. | -| attrs | dict | The attributes to assign to the target token in the matched span. | -| index | int | The index of the token in the matched span to modify. May be negative to index from the end of the span. Defaults to 0. | +| Name | Description | +| ---------- | --------------------------------------------------------------------------------------------------------------------------------- | +| `patterns` | The `Matcher` patterns to add. ~~Iterable[List[Dict[Union[int, str], Any]]]~~ | +| `attrs` | The attributes to assign to the target token in the matched span. 
~~Dict[str, Any]~~ | +| `index` | The index of the token in the matched span to modify. May be negative to index from the end of the span. Defaults to `0`. ~~int~~ | ## AttributeRuler.add_patterns {#add_patterns tag="method"} @@ -107,52 +107,52 @@ may be negative to index from the end of the span. > > ```python > attribute_ruler = nlp.add_pipe("attribute_ruler") -> pattern_dicts = \[ +> pattern_dicts = [ > { -> "patterns": \[\[{"TAG": "VB"}\]\], +> "patterns": [[{"TAG": "VB"}]], > "attrs": {"POS": "VERB"} > }, > { -> "patterns": \[\[{"LOWER": "two"}, {"LOWER": "apples"}\]\], +> "patterns": [[{"LOWER": "two"}, {"LOWER": "apples"}]], > "attrs": {"LEMMA": "apple"}, > "index": -1 > }, -> \] +> ] > attribute_ruler.add_patterns(pattern_dicts) > ``` Add patterns from a list of pattern dicts with the keys as the arguments to -[`AttributeRuler.add`](#add). +[`AttributeRuler.add`](/api/attributeruler#add). -| Name | Type | Description | -| --------------- | ----------------- | -------------------- | -| `pattern_dicts` | `Iterable[Dict]]` | The patterns to add. | +| Name | Description | +| --------------- | -------------------------------------------------------------------------- | +| `pattern_dicts` | The patterns to add. ~~Iterable[Dict[str, Union[List[dict], dict, int]]]~~ | ## AttributeRuler.patterns {#patterns tag="property"} Get all patterns that have been added to the attribute ruler in the `patterns_dict` format accepted by -[`AttributeRuler.add_patterns`](#add_patterns). +[`AttributeRuler.add_patterns`](/api/attributeruler#add_patterns). -| Name | Type | Description | -| ----------- | ------------ | ------------------------------------------ | -| **RETURNS** | `List[dict]` | The patterns added to the attribute ruler. | +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------------- | +| **RETURNS** | The patterns added to the attribute ruler. 
~~List[Dict[str, Union[List[dict], dict, int]]]~~ | ## AttributeRuler.load_from_tag_map {#load_from_tag_map tag="method"} Load attribute ruler patterns from a tag map. -| Name | Type | Description | -| --------- | ---- | ------------------------------------------------------------------------------------------ | -| `tag_map` | dict | The tag map that maps fine-grained tags to coarse-grained tags and morphological features. | +| Name | Description | +| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------ | +| `tag_map` | The tag map that maps fine-grained tags to coarse-grained tags and morphological features. ~~Dict[str, Dict[Union[int, str], Union[int, str]]]~~ | ## AttributeRuler.load_from_morph_rules {#load_from_morph_rules tag="method"} Load attribute ruler patterns from morph rules. -| Name | Type | Description | -| ------------- | ---- | -------------------------------------------------------------------------------------------------------------------- | -| `morph_rules` | dict | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. | +| Name | Description | +| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. ~~Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ | ## AttributeRuler.to_disk {#to_disk tag="method"} @@ -165,11 +165,11 @@ Serialize the pipe to disk. 
> attribute_ruler.to_disk("/path/to/attribute_ruler") > ``` -| Name | Type | Description | -| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | ## AttributeRuler.from_disk {#from_disk tag="method"} @@ -182,12 +182,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > attribute_ruler.from_disk("/path/to/attribute_ruler") > ``` -| Name | Type | Description | -| -------------- | ---------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `AttributeRuler` | The modified `AttributeRuler` object. | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. 
~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The modified `AttributeRuler` object. ~~AttributeRuler~~ | ## AttributeRuler.to_bytes {#to_bytes tag="method"} @@ -200,11 +200,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------- | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `AttributeRuler` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The serialized form of the `AttributeRuler` object. ~~bytes~~ | ## AttributeRuler.from_bytes {#from_bytes tag="method"} @@ -218,12 +218,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > attribute_ruler.from_bytes(attribute_ruler_bytes) > ``` -| Name | Type | Description | -| -------------- | ---------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `AttributeRuler` | The `AttributeRuler` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| `bytes_data` | The data to load from. 
~~bytes~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The `AttributeRuler` object. ~~AttributeRuler~~ | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index be7a2b499..7ce95c019 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -3,17 +3,17 @@ title: Command Line Interface teaser: Download, train and package models, and debug spaCy source: spacy/cli menu: - - ['Download', 'download'] - - ['Info', 'info'] - - ['Validate', 'validate'] - - ['Init', 'init'] - - ['Convert', 'convert'] - - ['Debug', 'debug'] - - ['Train', 'train'] - - ['Pretrain', 'pretrain'] - - ['Evaluate', 'evaluate'] - - ['Package', 'package'] - - ['Project', 'project'] + - ['download', 'download'] + - ['info', 'info'] + - ['validate', 'validate'] + - ['init', 'init'] + - ['convert', 'convert'] + - ['debug', 'debug'] + - ['train', 'train'] + - ['pretrain', 'pretrain'] + - ['evaluate', 'evaluate'] + - ['package', 'package'] + - ['project', 'project'] --- spaCy's CLI provides a range of helpful commands for downloading and training @@ -22,7 +22,7 @@ list of available commands, you can type `python -m spacy --help`. You can also add the `--help` flag to any command or subcommand to see the description, available arguments and usage. -## Download {#download} +## download {#download tag="command"} Download [models](/usage/models) for spaCy. The downloader finds the best-matching compatible version and uses `pip install` to download the model as @@ -39,41 +39,41 @@ the model name to be specified with its version (e.g. `en_core_web_sm-2.2.0`). > to a local PyPi installation and fetching it straight from there. This will > also allow you to add it as a versioned package dependency to your project. 
-```bash -$ python -m spacy download [model] [--direct] [pip args] +```cli +$ python -m spacy download [model] [--direct] [pip_args] ``` -| Argument | Type | Description | -| ------------------------------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `model` | positional | Model name, e.g. [`en_core_web_sm`](/models/en#en_core_web_sm). | -| `--direct`, `-d` | flag | Force direct download of exact model version. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| pip args 2.1 | option / flag | Additional installation options to be passed to `pip install` when installing the model package. For example, `--user` to install to the user home directory or `--no-deps` to not install model dependencies. | -| **CREATES** | directory | The installed model package in your `site-packages` directory. | +| Name | Description | +| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | Model name, e.g. [`en_core_web_sm`](/models/en#en_core_web_sm). ~~str (positional)~~ | +| `--direct`, `-d` | Force direct download of exact model version. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| pip args 2.1 | Additional installation options to be passed to `pip install` when installing the model package. For example, `--user` to install to the user home directory or `--no-deps` to not install model dependencies. ~~Any (option/flag)~~ | +| **CREATES** | The installed model package in your `site-packages` directory. 
| -## Info {#info} +## info {#info tag="command"} Print information about your spaCy installation, models and local setup, and generate [Markdown](https://en.wikipedia.org/wiki/Markdown)-formatted markup to copy-paste into [GitHub issues](https://github.com/explosion/spaCy/issues). -```bash +```cli $ python -m spacy info [--markdown] [--silent] ``` -```bash +```cli $ python -m spacy info [model] [--markdown] [--silent] ``` -| Argument | Type | Description | -| ------------------------------------------------ | ---------- | ---------------------------------------------- | -| `model` | positional | A model, i.e. package name or path (optional). | -| `--markdown`, `-md` | flag | Print information as Markdown. | -| `--silent`, `-s` 2.0.12 | flag | Don't print anything, just return the values. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| **PRINTS** | `stdout` | Information about your spaCy installation. | +| Name | Description | +| ------------------------------------------------ | ------------------------------------------------------------------------------ | +| `model` | A model, i.e. package name or path (optional). ~~Optional[str] \(positional)~~ | +| `--markdown`, `-md` | Print information as Markdown. ~~bool (flag)~~ | +| `--silent`, `-s` 2.0.12 | Don't print anything, just return the values. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **PRINTS** | Information about your spaCy installation. | -## Validate {#validate new="2"} +## validate {#validate new="2" tag="command"} Find all models installed in the current environment and check whether they are compatible with the currently installed version of spaCy. Should be run after @@ -88,20 +88,20 @@ and command for updating are shown. > suite, to ensure all models are up to date before proceeding. If incompatible > models are found, it will return `1`. 
-```bash +```cli $ python -m spacy validate ``` -| Argument | Type | Description | -| ---------- | -------- | --------------------------------------------------------- | -| **PRINTS** | `stdout` | Details about the compatibility of your installed models. | +| Name | Description | +| ---------- | --------------------------------------------------------- | +| **PRINTS** | Details about the compatibility of your installed models. | -## Init {#init new="3"} +## init {#init new="3"} The `spacy init` CLI includes helpful commands for initializing training config files and model directories. -### init config {#init-config new="3"} +### init config {#init-config new="3" tag="command"} Initialize and save a [`config.cfg` file](/usage/training#config) using the **recommended settings** for your use case. It works just like the @@ -111,25 +111,25 @@ config. The settings you specify will impact the suggested model architectures and pipeline setup, as well as the hyperparameters. You can also adjust and customize those settings in your config file later. -> ```bash -> ### Example {wrap="true"} +> #### Example +> +> ```cli > $ python -m spacy init config config.cfg --lang en --pipeline ner,textcat --optimize accuracy > ``` -```bash -$ python -m spacy init config [output_file] [--lang] [--pipeline] -[--optimize] [--cpu] +```cli +$ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [--cpu] ``` -| Argument | Type | Description | -| ------------------ | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `output_file` | positional | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. 
| -| `--lang`, `-l` | option | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. | -| `--pipeline`, `-p` | option | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include in the model. Defaults to `"tagger,parser,ner"`. | -| `--optimize`, `-o` | option | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. | -| `--cpu`, `-C` | flag | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| **CREATES** | file | The config file for training. | +| Name | Description | +| ------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | +| `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ | +| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include in the model. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ | +| `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. 
Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ | +| `--cpu`, `-C` | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | The config file for training. | ### init fill-config {#init-fill-config new="3"} @@ -143,33 +143,32 @@ be created, and their signatures are used to find the defaults. If your config contains a problem that can't be resolved automatically, spaCy will show you a validation error with more details. -> ```bash -> ### Example {wrap="true"} +> #### Example +> +> ```cli > $ python -m spacy init fill-config base.cfg config.cfg > ``` -```bash +```cli $ python -m spacy init fill-config [base_path] [output_file] [--diff] ``` -| Argument | Type | Description | -| -------------- | ---------- | ------------------------------------------------------------------------------------------------------------- | -| `base_path` | positional | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). | -| `output_file` | positional | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. | -| `--diff`, `-D` | flag | Print a visual diff highlighting the changes. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| **CREATES** | file | Complete and auto-filled config file for training. | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------------------------------------------- | +| `base_path` | Path to base config to fill, e.g. 
generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ | +| `output_file` | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ | +| `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | Complete and auto-filled config file for training. | -### init model {#init-model new="2"} - - +### init model {#init-model new="2" tag="command"} Create a new model directory from raw data, like word frequencies, Brown -clusters and word vectors. This command is similar to the `spacy model` command -in v1.x. Note that in order to populate the model's vocab, you need to pass in a -JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) as -`--jsonl-loc` with optional `id` values that correspond to the vectors table. -Just loading in vectors will not automatically populate the vocab. +clusters and word vectors. Note that in order to populate the model's vocab, you +need to pass in a JSONL-formatted +[vocabulary file](/api/data-formats#vocab-jsonl) as `--jsonl-loc` with optional +`id` values that correspond to the vectors table. Just loading in vectors will +not automatically populate the vocab. @@ -177,24 +176,23 @@ The `init-model` command is now available as a subcommand of `spacy init`. 
-```bash -$ python -m spacy init model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] -[--prune-vectors] +```cli +$ python -m spacy init model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] [--prune-vectors] ``` -| Argument | Type | Description | -| ------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | -| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | -| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. | -| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | -| `--truncate-vectors`, `-t` 2.3 | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | -| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | -| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| **CREATES** | model | A spaCy model containing the vocab and vectors. 
| +| Name | Description | +| ------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~ | +| `output_dir` | Model output directory. Will be created if it doesn't exist. ~~Path (positional)~~ | +| `--jsonl-loc`, `-j` | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. ~~Optional[Path] \(option)~~ | +| `--vectors-loc`, `-v` | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Optional[Path] \(option)~~ | +| `--truncate-vectors`, `-t` 2.3 | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ | +| `--prune-vectors`, `-V` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ | +| `--vectors-name`, `-vn` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~str (option)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | A spaCy model containing the vocab and vectors. | -## Convert {#convert} +## convert {#convert tag="command"} Convert files into spaCy's [binary training data format](/api/data-formats#binary-training), a serialized @@ -202,28 +200,26 @@ Convert files into spaCy's management functions. The converter can be specified on the command line, or chosen based on the file extension of the input file. 
-```bash -$ python -m spacy convert [input_file] [output_dir] [--converter] -[--file-type] [--n-sents] [--seg-sents] [--model] [--morphology] -[--merge-subtokens] [--ner-map] [--lang] +```cli +$ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type] [--n-sents] [--seg-sents] [--model] [--morphology] [--merge-subtokens] [--ner-map] [--lang] ``` -| Argument | Type | Description | -| ------------------------------------------------ | ---------- | ------------------------------------------------------------------------------------------------------------------------ | -| `input_file` | positional | Input file. | -| `output_dir` | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. | -| `--converter`, `-c` 2 | option | Name of converter to use (see below). | -| `--file-type`, `-t` 2.1 | option | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. | -| `--n-sents`, `-n` | option | Number of sentences per document. | -| `--seg-sents`, `-s` 2.2 | flag | Segment sentences (for `-c ner`) | -| `--model`, `-b` 2.2 | option | Model for parser-based sentence segmentation (for `-s`) | -| `--morphology`, `-m` | option | Enable appending morphology to tags. | -| `--ner-map`, `-nm` | option | NER tag mapping (as JSON-encoded dict of entity types). | -| `--lang`, `-l` 2.1 | option | Language code (if tokenizer required). | -| `--help`, `-h` | flag | Show help message and available arguments. | -| **CREATES** | binary | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train). | +| Name | Description | +| ------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- | +| `input_file` | Input file. 
~~Path (positional)~~ | +| `output_dir` | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(positional)~~ | +| `--converter`, `-c` 2 | Name of converter to use (see below). ~~str (option)~~ | +| `--file-type`, `-t` 2.1 | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ | +| `--n-sents`, `-n` | Number of sentences per document. ~~int (option)~~ | +| `--seg-sents`, `-s` 2.2 | Segment sentences (for `--converter ner`). ~~bool (flag)~~ | +| `--model`, `-b` 2.2 | Model for parser-based sentence segmentation (for `--seg-sents`). ~~Optional[str](option)~~ | +| `--morphology`, `-m` | Enable appending morphology to tags. ~~bool (flag)~~ | +| `--ner-map`, `-nm` | NER tag mapping (as JSON-encoded dict of entity types). ~~Optional[Path](option)~~ | +| `--lang`, `-l` 2.1 | Language code (if tokenizer required). ~~Optional[str] \(option)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train). | -### Converters +### Converters {#converters} | ID | Description | | ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -233,12 +229,12 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] | `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. 
Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | | `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | -## Debug {#debug new="3"} +## debug {#debug new="3"} The `spacy debug` CLI includes helpful commands for debugging and profiling your configs, data and implementations. -### debug config {#debug-config} +### debug config {#debug-config new="3" tag="command"} Debug a [`config.cfg` file](/usage/training#config) and show validation errors. The command will create all objects in the tree and validate them. Note that @@ -246,15 +242,15 @@ some config validation errors are blocking and will prevent the rest of the config from being resolved. This means that you may not see all validation errors at once and some issues are only shown once previous errors have been fixed. To auto-fill a partial config and save the result, you can use the -[`init config`](/api/cli#init-config) command. +[`init fill-config`](/api/cli#init-fill-config) command. 
-```bash -$ python -m spacy debug config [config_path] [--code_path] [--output] [--auto_fill] [--diff] [overrides] +```cli +$ python -m spacy debug config [config_path] [--code_path] [overrides] ``` > #### Example > -> ```bash +> ```cli > $ python -m spacy debug config ./config.cfg > ``` @@ -277,18 +273,15 @@ python -m spacy init fill-config tmp/starter-config_invalid.cfg --base tmp/start -| Argument | Type | Default | Description | -| --------------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------- | -| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. | -| `--code_path`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. | -| `--auto_fill`, `-F` | option | Whether or not to auto-fill the config with built-in defaults if possible. If `False`, the provided config needs to be complete. | -| `--output_path`, `-o` | option | Output path where the filled config can be stored. Use '-' for standard output. | -| `--diff`, `-D` | option | `Show a visual diff if config was auto-filled. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| overrides | option / flag | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. | -| **PRINTS** | stdout | Config validation errors, if available. 
| +| Name | Description | +| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ | +| `--code_path`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | +| **PRINTS** | Config validation errors, if available. | -### debug data {#debug-data} +### debug data {#debug-data tag="command"} Analyze, debug, and validate your training and development data. Get useful stats, and find problems like invalid entity annotations, cyclic dependencies, @@ -303,14 +296,13 @@ takes the same arguments as `train` and reads settings off the -```bash -$ python -m spacy debug data [config_path] [--code] [--ignore-warnings] -[--verbose] [--no-format] [overrides] +```cli +$ python -m spacy debug data [config_path] [--code] [--ignore-warnings] [--verbose] [--no-format] [overrides] ``` > #### Example > -> ```bash +> ```cli > $ python -m spacy debug data ./config.cfg > ``` @@ -453,18 +445,18 @@ will not be available. 
-| Argument | Type | Description | -| -------------------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. | -| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. | -| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. | -| `--verbose`, `-V` | flag | Print additional information and explanations. | -| `--no-format`, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| overrides | option / flag | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. | -| **PRINTS** | stdout | Debugging information. | +| Name | Description | +| -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ | +| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--ignore-warnings`, `-IW` | Ignore warnings, only show stats and errors. ~~bool (flag)~~ | +| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ | +| `--no-format`, `-NF` | Don't pretty-print the results. 
Use this if you want to write to a file. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | +| **PRINTS** | Debugging information. | -### debug profile {#debug-profile} +### debug profile {#debug-profile tag="command"} Profile which functions take the most time in a spaCy pipeline. Input should be formatted as one JSON object per line with a key `"text"`. It can either be @@ -478,26 +470,25 @@ The `profile` command is now available as a subcommand of `spacy debug`. -```bash +```cli $ python -m spacy debug profile [model] [inputs] [--n-texts] ``` -| Argument | Type | Description | -| ----------------- | ---------- | ----------------------------------------------------------------- | -| `model` | positional | A loadable spaCy model. | -| `inputs` | positional | Optional path to input file, or `-` for standard input. | -| `--n-texts`, `-n` | option | Maximum number of texts to use if available. Defaults to `10000`. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| **PRINTS** | stdout | Profiling information for the model. | +| Name | Description | +| ----------------- | ---------------------------------------------------------------------------------- | +| `model` | A loadable spaCy model. ~~str (positional)~~ | +| `inputs` | Optional path to input file, or `-` for standard input. ~~Path (positional)~~ | +| `--n-texts`, `-n` | Maximum number of texts to use if available. Defaults to `10000`. ~~int (option)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **PRINTS** | Profiling information for the model. 
| -### debug model {#debug-model} +### debug model {#debug-model new="3" tag="command"} Debug a Thinc [`Model`](https://thinc.ai/docs/api-model) by running it on a sample text and checking how it updates its internal weights and parameters. -```bash -$ python -m spacy debug model [config_path] [component] [--layers] [-DIM] -[-PAR] [-GRAD] [-ATTR] [-P0] [-P1] [-P2] [P3] [--gpu-id] +```cli +$ python -m spacy debug model [config_path] [component] [--layers] [-DIM] [-PAR] [-GRAD] [-ATTR] [-P0] [-P1] [-P2] [P3] [--gpu-id] ``` @@ -507,7 +498,7 @@ model ("Step 0"), which helps us to understand the internal structure of the Neural Network, and to focus on specific layers that we want to inspect further (see next example). -```bash +```cli $ python -m spacy debug model ./config.cfg tagger -P0 ``` @@ -553,7 +544,7 @@ an all-zero matrix determined by the `nO` and `nI` dimensions. After a first training step (Step 2), this matrix has clearly updated its values through the training feedback loop. -```bash +```cli $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P2 ``` @@ -596,23 +587,24 @@ $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P -| Argument | Type | Description | -| ----------------------- | ---------- | ----------------------------------------------------------------------------------------------------- | -| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. | | -| `component` | positional | Name of the pipeline component of which the model should be analyzed. | Β  | -| `--layers`, `-l` | option | Comma-separated names of layer IDs to print. | | -| `--dimensions`, `-DIM` | option | Show dimensions of each layer. | -| `--parameters`, `-PAR` | option | Show parameters of each layer. | -| `--gradients`, `-GRAD` | option | Show gradients of each layer. | -| `--attributes`, `-ATTR` | option | Show attributes of each layer. 
| -| `--print-step0`, `-P0` | option | Print model before training. | -| `--print-step1`, `-P1` | option | Print model after initialization. | -| `--print-step2`, `-P2` | option | Print model after training. | -| `--print-step3`, `-P3` | option | Print final predictions. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| **PRINTS** | stdout | Debugging information. | +| Name | Description | +| ----------------------- | --------------------------------------------------------------------------------------------------------------------------- | +| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ | +| `component` | Name of the pipeline component of which the model should be analyzed. ~~str (positional)~~ | +| `--layers`, `-l` | Comma-separated names of layer IDs to print. ~~str (option)~~ | +| `--dimensions`, `-DIM` | Show dimensions of each layer. ~~bool (flag)~~ | +| `--parameters`, `-PAR` | Show parameters of each layer. ~~bool (flag)~~ | +| `--gradients`, `-GRAD` | Show gradients of each layer. ~~bool (flag)~~ | +| `--attributes`, `-ATTR` | Show attributes of each layer. ~~bool (flag)~~ | +| `--print-step0`, `-P0` | Print model before training. ~~bool (flag)~~ | +| `--print-step1`, `-P1` | Print model after initialization. ~~bool (flag)~~ | +| `--print-step2`, `-P2` | Print model after training. ~~bool (flag)~~ | +| `--print-step3`, `-P3` | Print final predictions. ~~bool (flag)~~ | +| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **PRINTS** | Debugging information. | -## Train {#train} +## train {#train tag="command"} Train a model. Expects data in spaCy's [binary format](/api/data-formats#training) and a @@ -620,9 +612,9 @@ Train a model. 
Expects data in spaCy's Will save out the best model from all epochs, as well as the final model. The `--code` argument can be used to provide a Python file that's imported before the training process starts. This lets you register -[custom functions](/usage/training#custom-models) and architectures and refer to -them in your config, all while still using spaCy's built-in `train` workflow. If -you need to manage complex multi-step training workflows, check out the new +[custom functions](/usage/training#custom-functions) and architectures and refer +to them in your config, all while still using spaCy's built-in `train` workflow. +If you need to manage complex multi-step training workflows, check out the new [spaCy projects](/usage/projects). @@ -636,21 +628,21 @@ in the section `[paths]`. -```bash +```cli $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides] ``` -| Argument | Type | Description | -| ----------------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. | -| `--output`, `-o` | positional | Directory to store model in. Will be created if it doesn't exist. | -| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. | -| `--verbose`, `-V` | flag | Show more detailed messages during training. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| overrides | option / flag | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. | -| **CREATES** | model | The final model and the best model. 
| +| Name | Description | +| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ | +| `--output`, `-o` | Directory to store model in. Will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ | +| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | +| **CREATES** | The final model and the best model. | -## Pretrain {#pretrain new="2.1" tag="experimental"} +## pretrain {#pretrain new="2.1" tag="command,experimental"} Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline components on [raw text](/api/data-formats#pretrain), using an approximate @@ -673,24 +665,23 @@ the [data format](/api/data-formats#config) for details. 
-```bash -$ python -m spacy pretrain [texts_loc] [output_dir] [config_path] -[--code] [--resume-path] [--epoch-resume] [overrides] +```cli +$ python -m spacy pretrain [texts_loc] [output_dir] [config_path] [--code] [--resume-path] [--epoch-resume] [overrides] ``` -| Argument | Type | Description | -| ----------------------- | ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `texts_loc` | positional | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](/api/data-formats#pretrain) for details. | -| `output_dir` | positional | Directory to write models to on each epoch. | -| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. | -| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. | -| `--resume-path`, `-r` | option | Path to pretrained weights from which to resume pretraining. | -| `--epoch-resume`, `-er` | option | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| overrides | option / flag | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. | -| **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. 
| +| Name | Description | +| ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `texts_loc` | Path to JSONL file with raw texts to learn from, with text provided as the key `"text"` or tokens as the key `"tokens"`. [See here](/api/data-formats#pretrain) for details. ~~Path (positional)~~ | +| `output_dir` | Directory to write models to on each epoch. ~~Path (positional)~~ | +| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ | +| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ | +| `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ | +| **CREATES** | The pretrained weights that can be used to initialize `spacy train`. | -## Evaluate {#evaluate new="2"} +## evaluate {#evaluate new="2" tag="command"} Evaluate a model. Expects a loadable spaCy model and evaluation data in the [binary `.spacy` format](/api/data-formats#binary-training). The @@ -702,32 +693,31 @@ skew. To render a sample of dependency parses in a HTML file using the [displaCy visualizations](/usage/visualizers), set as output directory as the `--displacy-path` argument. 
-```bash -$ python -m spacy evaluate [model] [data_path] [--output] [--gold-preproc] -[--gpu-id] [--displacy-path] [--displacy-limit] +```cli +$ python -m spacy evaluate [model] [data_path] [--output] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit] ``` -| Argument | Type | Description | -| ------------------------- | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `model` | positional | Model to evaluate. Can be a package or a path to a model data directory. | -| `data_path` | positional | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). | -| `--output`, `-o` | option | Output JSON file for metrics. If not set, no metrics will be exported. | -| `--gold-preproc`, `-G` | flag | Use gold preprocessing. | -| `--gpu-id`, `-g` | option | GPU to use, if any. Defaults to `-1` for CPU. | -| `--displacy-path`, `-dp` | option | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. | -| `--displacy-limit`, `-dl` | option | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| **CREATES** | `stdout`, JSON, HTML | Training results and optional metrics and visualizations. | +| Name | Description | +| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `model` | Model to evaluate. Can be a package or a path to a model data directory. ~~str (positional)~~ | +| `data_path` | Location of evaluation data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ | +| `--output`, `-o` | Output JSON file for metrics. 
If not set, no metrics will be exported. ~~Optional[Path] \(option)~~ | +| `--gold-preproc`, `-G` | Use gold preprocessing. ~~bool (flag)~~ | +| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | +| `--displacy-path`, `-dp` | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. ~~Optional[Path] \(option)~~ | +| `--displacy-limit`, `-dl` | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. ~~int (option)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | Training results and optional metrics and visualizations. | -## Package {#package} +## package {#package tag="command"} Generate an installable [model Python package](/usage/training#models-generating) from an existing model -data directory. All data files are copied over. If the path to a `meta.json` is -supplied, or a `meta.json` is found in the input directory, this file is used. -Otherwise, the data can be entered directly from the command line. spaCy will -then create a `.tar.gz` archive file that you can distribute and install with -`pip install`. +data directory. All data files are copied over. If the path to a +[`meta.json`](/api/data-formats#meta) is supplied, or a `meta.json` is found in +the input directory, this file is used. Otherwise, the data can be entered +directly from the command line. spaCy will then create a `.tar.gz` archive file +that you can distribute and install with `pip install`. @@ -737,38 +727,37 @@ this, you can set the `--no-sdist` flag. 
-```bash -$ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] -[--no-sdist] [--version] [--force] +```cli +$ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--no-sdist] [--version] [--force] ``` > #### Example > -> ```bash -> python -m spacy package /input /output -> cd /output/en_model-0.0.0 -> pip install dist/en_model-0.0.0.tar.gz +> ```cli +> $ python -m spacy package /input /output +> $ cd /output/en_model-0.0.0 +> $ pip install dist/en_model-0.0.0.tar.gz > ``` -| Argument | Type | Description | -| ------------------------------------------------ | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `input_dir` | positional | Path to directory containing model data. | -| `output_dir` | positional | Directory to create package folder in. | -| `--meta-path`, `-m` 2 | option | Path to `meta.json` file (optional). | -| `--create-meta`, `-C` 2 | flag | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. | -| `--no-sdist`, `-NS`, | flag | Don't build the `.tar.gz` sdist automatically. Can be set if you want to run this step manually. | -| `--version`, `-v` 3 | option | Package version to override in meta. Useful when training new versions, as it doesn't require editing the meta template. | -| `--force`, `-f` | flag | Force overwriting of existing folder in output directory. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| **CREATES** | directory | A Python package containing the spaCy model. 
| +| Name | Description | +| ------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `input_dir` | Path to directory containing model data. ~~Path (positional)~~ | +| `output_dir` | Directory to create package folder in. ~~Path (positional)~~ | +| `--meta-path`, `-m` 2 | Path to [`meta.json`](/api/data-formats#meta) file (optional). ~~Optional[Path] \(option)~~ | +| `--create-meta`, `-C` 2 | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. ~~bool (flag)~~ | +| `--no-sdist`, `-NS`, | Don't build the `.tar.gz` sdist automatically. Can be set if you want to run this step manually. ~~bool (flag)~~ | +| `--version`, `-v` 3 | Package version to override in meta. Useful when training new versions, as it doesn't require editing the meta template. ~~Optional[str] \(option)~~ | +| `--force`, `-f` | Force overwriting of existing folder in output directory. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | A Python package containing the spaCy model. | -## Project {#project new="3"} +## project {#project new="3"} The `spacy project` CLI includes subcommands for working with [spaCy projects](/usage/projects), end-to-end workflows for building and deploying custom spaCy models. -### project clone {#project-clone} +### project clone {#project-clone tag="command"} Clone a project template from a Git repository. Calls into `git` under the hood and uses the sparse checkout feature, so you're only downloading what you need. 
@@ -779,31 +768,31 @@ can provide any other repo (public or private) that you have access to using the -```bash +```cli $ python -m spacy project clone [name] [dest] [--repo] ``` > #### Example > -> ```bash +> ```cli > $ python -m spacy project clone some_example > ``` > > Clone from custom repo: > -> ```bash +> ```cli > $ python -m spacy project clone template --repo https://github.com/your_org/your_repo > ``` -| Argument | Type | Description | -| -------------- | ---------- | ---------------------------------------------------------------------------------------------------------------------------- | -| `name` | positional | The name of the template to clone, relative to the repo. Can be a top-level directory or a subdirectory like `dir/template`. | -| `dest` | positional | Where to clone the project. Defaults to current working directory. | -| `--repo`, `-r` | option | The repository to clone from. Can be any public or private Git repo you have access to. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| **CREATES** | directory | The cloned [project directory](/usage/projects#project-files). | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | The name of the template to clone, relative to the repo. Can be a top-level directory or a subdirectory like `dir/template`. ~~str (positional)~~ | +| `dest` | Where to clone the project. Defaults to current working directory. ~~Path (positional)~~ | +| `--repo`, `-r` | The repository to clone from. Can be any public or private Git repo you have access to. ~~str (option)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | The cloned [project directory](/usage/projects#project-files). 
| -### project assets {#project-assets} +### project assets {#project-assets tag="command"} Fetch project assets like datasets and pretrained weights. Assets are defined in the `assets` section of the [`project.yml`](/usage/projects#project-yml). If a @@ -814,23 +803,23 @@ considered "private" and you have to take care of putting them into the destination directory yourself. If a local path is provided, the asset is copied into the current project. -```bash +```cli $ python -m spacy project assets [project_dir] ``` > #### Example > -> ```bash +> ```cli > $ python -m spacy project assets > ``` -| Argument | Type | Description | -| -------------- | ---------- | ----------------------------------------------------------------- | -| `project_dir` | positional | Path to project directory. Defaults to current working directory. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| **CREATES** | files | Downloaded or copied assets defined in the `project.yml`. | +| Name | Description | +| -------------- | --------------------------------------------------------------------------------------- | +| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | Downloaded or copied assets defined in the `project.yml`. | -### project run {#project-run} +### project run {#project-run tag="command"} Run a named command or workflow defined in the [`project.yml`](/usage/projects#project-yml). If a workflow name is specified, @@ -839,26 +828,112 @@ all commands in the workflow are run, in order. If commands define re-run if state has changed. For example, if the input dataset changes, a preprocessing command that depends on those files will be re-run. 
-```bash +```cli $ python -m spacy project run [subcommand] [project_dir] [--force] [--dry] ``` > #### Example > -> ```bash +> ```cli > $ python -m spacy project run train > ``` -| Argument | Type | Description | -| --------------- | ---------- | ----------------------------------------------------------------- | -| `subcommand` | positional | Name of the command or workflow to run. | -| `project_dir` | positional | Path to project directory. Defaults to current working directory. | -| `--force`, `-F` | flag | Force re-running steps, even if nothing changed. | -| `--dry`, `-D` | flag | Β Perform a dry run and don't execute scripts. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| **EXECUTES** | script | The command defined in the `project.yml`. | +| Name | Description | +| --------------- | --------------------------------------------------------------------------------------- | +| `subcommand` | Name of the command or workflow to run. ~~str (positional)~~ | +| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | +| `--force`, `-F` | Force re-running steps, even if nothing changed. ~~bool (flag)~~ | +| `--dry`, `-D` | Perform a dry run and don't execute scripts. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **EXECUTES** | The command defined in the `project.yml`. | -### project dvc {#project-dvc} +### project push {#project-push tag="command"} + +Upload all available files or directories listed in the `outputs` section of +commands to a remote storage. Outputs are archived and compressed prior to +upload, and addressed in the remote storage using the output's relative path +(URL encoded), a hash of its command string and dependencies, and a hash of its +file contents. This means `push` should **never overwrite** a file in your +remote. If all the hashes match, the contents are the same and nothing happens. 
+If the contents are different, the new version of the file is uploaded. Deleting +obsolete files is left up to you. + +Remotes can be defined in the `remotes` section of the +[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses the +[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to +communicate with the remote storages, so you can use any protocol that +`smart-open` supports, including [S3](https://aws.amazon.com/s3/), +[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although +you may need to install extra dependencies to use certain protocols. + +```cli +$ python -m spacy project push [remote] [project_dir] +``` + +> #### Example +> +> ```cli +> $ python -m spacy project push my_bucket +> ``` +> +> ```yaml +> ### project.yml +> remotes: +> my_bucket: 's3://my-spacy-bucket' +> ``` + +| Name | Description | +| -------------- | --------------------------------------------------------------------------------------- | +| `remote` | The name of the remote to upload to. Defaults to `"default"`. ~~str (positional)~~ | +| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **UPLOADS** | All project outputs that exist and are not already stored in the remote. | + +### project pull {#project-pull tag="command"} + +Download all files or directories listed as `outputs` for commands, unless they +are already present locally. When searching for files in the remote, `pull` +won't just look at the output path, but will also consider the **command +string** and the **hashes of the dependencies**. For instance, let's say you've +previously pushed a model checkpoint to the remote, but now you've changed some +hyper-parameters. Because you've changed the inputs to the command, if you run +`pull`, you won't retrieve the stale result. 
If you train your model and push +the outputs to the remote, the outputs will be saved alongside the prior +outputs, so if you change the config back, you'll be able to fetch back the +result. + +Remotes can be defined in the `remotes` section of the +[`project.yml`](/usage/projects#project-yml). Under the hood, spaCy uses the +[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to +communicate with the remote storages, so you can use any protocol that +`smart-open` supports, including [S3](https://aws.amazon.com/s3/), +[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although +you may need to install extra dependencies to use certain protocols. + +```cli +$ python -m spacy project pull [remote] [project_dir] +``` + +> #### Example +> +> ```cli +> $ python -m spacy project pull my_bucket +> ``` +> +> ```yaml +> ### project.yml +> remotes: +> my_bucket: 's3://my-spacy-bucket' +> ``` + +| Name | Description | +| -------------- | --------------------------------------------------------------------------------------- | +| `remote` | The name of the remote to download from. Defaults to `"default"`. ~~str (positional)~~ | +| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **DOWNLOADS** | All project outputs that do not exist locally and can be found in the remote. | + +### project dvc {#project-dvc tag="command"} Auto-generate [Data Version Control](https://dvc.org) (DVC) config file. 
Calls [`dvc run`](https://dvc.org/doc/command-reference/run) with `--no-exec` under @@ -878,23 +953,23 @@ You'll also need to add the assets you want to track with -```bash +```cli $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose] ``` > #### Example > -> ```bash -> git init -> dvc init -> python -m spacy project dvc all +> ```cli +> $ git init +> $ dvc init +> $ python -m spacy project dvc all > ``` -| Argument | Type | Description | -| ----------------- | ---------- | --------------------------------------------------------------------------------------------- | -| `project_dir` | positional | Path to project directory. Defaults to current working directory. | -| `workflow` | positional | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. | -| `--force`, `-F` | flag | Force-updating config file. | -| `--verbose`, `-V` | flag | Β Print more output generated by DVC. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| **CREATES** | file | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. | +| Name | Description | +| ----------------- | ----------------------------------------------------------------------------------------------------------------- | +| `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ | +| `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(positional)~~ | +| `--force`, `-F` | Force-updating config file. ~~bool (flag)~~ | +| `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. 
| diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index 5f9fd49db..86cfa9121 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -17,7 +17,7 @@ customize the data loading during training, you can register your own or evaluation data. It takes the same arguments as the `Corpus` class and returns a callable that yields [`Example`](/api/example) objects. You can replace it with your own registered function in the -[`@readers` registry](/api/top-level#regsitry) to customize the data loading and +[`@readers` registry](/api/top-level#registry) to customize the data loading and streaming. > #### Example config @@ -28,18 +28,18 @@ streaming. > > [training.train_corpus] > @readers = "spacy.Corpus.v1" -> path = ${paths:train} +> path = ${paths.train} > gold_preproc = false > max_length = 0 > limit = 0 > ``` -| Name | Type | Description | -| --------------- | ------ | ----------------------------------------------------------------------------------------------------------------------------------------------- | -| `path` | `Path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). | -| Β `gold_preproc` | bool | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. | -| `max_length` | int | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. | -| `limit` | int | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. | +| Name | Description | +| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). 
~~Path~~ | +| Β `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ | +| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ | +| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | ```python https://github.com/explosion/spaCy/blob/develop/spacy/gold/corpus.py @@ -67,13 +67,13 @@ train/test skew. > corpus = Corpus("./data", limit=10) > ``` -| Name | Type | Description | -| --------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | The directory or filename to read from. | -| _keyword-only_ | | | -| Β `gold_preproc` | bool | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. | -| `max_length` | int | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. | -| `limit` | int | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. | +| Name | Description | +| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- | +| `path` | The directory or filename to read from. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| Β `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. ~~bool~~ | +| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. 
~~int~~ | +| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ | ## Corpus.\_\_call\_\_ {#call tag="method"} @@ -90,7 +90,7 @@ Yield examples from the data. > train_data = corpus(nlp) > ``` -| Name | Type | Description | -| ---------- | ---------- | ------------------------- | -| `nlp` | `Language` | The current `nlp` object. | -| **YIELDS** | `Example` | The examples. | +| Name | Description | +| ---------- | -------------------------------------- | +| `nlp` | The current `nlp` object. ~~Language~~ | +| **YIELDS** | The examples. ~~Example~~ | diff --git a/website/docs/api/cython-classes.md b/website/docs/api/cython-classes.md index 6e54fb112..a4ecf294a 100644 --- a/website/docs/api/cython-classes.md +++ b/website/docs/api/cython-classes.md @@ -23,13 +23,13 @@ accessed from Python. For the Python documentation, see [`Doc`](/api/doc). ### Attributes {#doc_attributes} -| Name | Type | Description | -| ------------ | ------------ | ----------------------------------------------------------------------------------------- | -| `mem` | `cymem.Pool` | A memory pool. Allocated memory will be freed once the `Doc` object is garbage collected. | -| `vocab` | `Vocab` | A reference to the shared `Vocab` object. | -| `c` | `TokenC*` | A pointer to a [`TokenC`](/api/cython-structs#tokenc) struct. | -| `length` | `int` | The number of tokens in the document. | -| `max_length` | `int` | The underlying size of the `Doc.c` array. | +| Name | Description | +| ------------ | -------------------------------------------------------------------------------------------------------- | +| `mem` | A memory pool. Allocated memory will be freed once the `Doc` object is garbage collected. ~~cymem.Pool~~ | +| `vocab` | A reference to the shared `Vocab` object. ~~Vocab~~ | +| `c` | A pointer to a [`TokenC`](/api/cython-structs#tokenc) struct. ~~TokenC\*~~ | +| `length` | The number of tokens in the document. 
~~int~~ | +| `max_length` | The underlying size of the `Doc.c` array. ~~int~~ | ### Doc.push_back {#doc_push_back tag="method"} @@ -50,10 +50,10 @@ Append a token to the `Doc`. The token can be provided as a > assert doc.text == "hello " > ``` -| Name | Type | Description | -| ------------ | --------------- | ----------------------------------------- | -| `lex_or_tok` | `LexemeOrToken` | The word to append to the `Doc`. | -| `has_space` | `bint` | Whether the word has trailing whitespace. | +| Name | Description | +| ------------ | -------------------------------------------------- | +| `lex_or_tok` | The word to append to the `Doc`. ~~LexemeOrToken~~ | +| `has_space` | Whether the word has trailing whitespace. ~~bint~~ | ## Token {#token tag="cdef class" source="spacy/tokens/token.pxd"} @@ -70,12 +70,12 @@ accessed from Python. For the Python documentation, see [`Token`](/api/token). ### Attributes {#token_attributes} -| Name | Type | Description | -| ------- | --------- | ------------------------------------------------------------- | -| `vocab` | `Vocab` | A reference to the shared `Vocab` object. | -| `c` | `TokenC*` | A pointer to a [`TokenC`](/api/cython-structs#tokenc) struct. | -| `i` | `int` | The offset of the token within the document. | -| `doc` | `Doc` | The parent document. | +| Name | Description | +| ------- | -------------------------------------------------------------------------- | +| `vocab` | A reference to the shared `Vocab` object. ~~Vocab~~ | +| `c` | A pointer to a [`TokenC`](/api/cython-structs#tokenc) struct. ~~TokenC\*~~ | +| `i` | The offset of the token within the document. ~~int~~ | +| `doc` | The parent document. ~~Doc~~ | ### Token.cinit {#token_cinit tag="method"} @@ -87,12 +87,12 @@ Create a `Token` object from a `TokenC*` pointer. 
> token = Token.cinit(&doc.c[3], doc, 3) > ``` -| Name | Type | Description | -| -------- | --------- | ------------------------------------------------------------ | -| `vocab` | `Vocab` | A reference to the shared `Vocab`. | -| `c` | `TokenC*` | A pointer to a [`TokenC`](/api/cython-structs#tokenc)struct. | -| `offset` | `int` | The offset of the token within the document. | -| `doc` | `Doc` | The parent document. | +| Name | Description | +| -------- | -------------------------------------------------------------------------- | +| `vocab` | A reference to the shared `Vocab`. ~~Vocab~~ | +| `c` | A pointer to a [`TokenC`](/api/cython-structs#tokenc) struct. ~~TokenC\*~~ | +| `offset` | The offset of the token within the document. ~~int~~ | +| `doc` | The parent document. ~~Doc~~ | ## Span {#span tag="cdef class" source="spacy/tokens/span.pxd"} @@ -107,14 +107,14 @@ accessed from Python. For the Python documentation, see [`Span`](/api/span). ### Attributes {#span_attributes} -| Name | Type | Description | -| ------------ | -------------------------------------- | ------------------------------------------------------- | -| `doc` | `Doc` | The parent document. | -| `start` | `int` | The index of the first token of the span. | -| `end` | `int` | The index of the first token after the span. | -| `start_char` | `int` | The index of the first character of the span. | -| `end_char` | `int` | The index of the last character of the span. | -| `label` | `attr_t` | A label to attach to the span, e.g. for named entities. | +| Name | Description | +| ------------ | ----------------------------------------------------------------------------- | +| `doc` | The parent document. ~~Doc~~ | +| `start` | The index of the first token of the span. ~~int~~ | +| `end` | The index of the first token after the span. ~~int~~ | +| `start_char` | The index of the first character of the span. ~~int~~ | +| `end_char` | The index of the last character of the span. 
~~int~~ | +| `label` | A label to attach to the span, e.g. for named entities. ~~attr_t (uint64_t)~~ | ## Lexeme {#lexeme tag="cdef class" source="spacy/lexeme.pxd"} @@ -129,11 +129,11 @@ accessed from Python. For the Python documentation, see [`Lexeme`](/api/lexeme). ### Attributes {#lexeme_attributes} -| Name | Type | Description | -| ------- | -------------------------------------- | --------------------------------------------------------------- | -| `c` | `LexemeC*` | A pointer to a [`LexemeC`](/api/cython-structs#lexemec) struct. | -| `vocab` | `Vocab` | A reference to the shared `Vocab` object. | -| `orth` | `attr_t` | ID of the verbatim text content. | +| Name | Description | +| ------- | ----------------------------------------------------------------------------- | +| `c` | A pointer to a [`LexemeC`](/api/cython-structs#lexemec) struct. ~~LexemeC\*~~ | +| `vocab` | A reference to the shared `Vocab` object. ~~Vocab~~ | +| `orth` | ID of the verbatim text content. ~~attr_t (uint64_t)~~ | ## Vocab {#vocab tag="cdef class" source="spacy/vocab.pxd"} @@ -149,11 +149,11 @@ accessed from Python. For the Python documentation, see [`Vocab`](/api/vocab). ### Attributes {#vocab_attributes} -| Name | Type | Description | -| --------- | ------------- | ------------------------------------------------------------------------------------------- | -| `mem` | `cymem.Pool` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. | -| `strings` | `StringStore` | A `StringStore` that maps string to hash values and vice versa. | -| `length` | `int` | The number of entries in the vocabulary. | +| Name | Description | +| --------- | ---------------------------------------------------------------------------------------------------------- | +| `mem` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ | +| `strings` | A `StringStore` that maps string to hash values and vice versa. 
~~StringStore~~ | +| `length` | The number of entries in the vocabulary. ~~int~~ | ### Vocab.get {#vocab_get tag="method"} @@ -166,11 +166,11 @@ vocabulary. > lexeme = vocab.get(vocab.mem, "hello") > ``` -| Name | Type | Description | -| ----------- | ---------------- | ------------------------------------------------------------------------------------------- | -| `mem` | `cymem.Pool` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. | -| `string` | str | The string of the word to look up. | -| **RETURNS** | `const LexemeC*` | The lexeme in the vocabulary. | +| Name | Description | +| ----------- | ---------------------------------------------------------------------------------------------------------- | +| `mem` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ | +| `string` | The string of the word to look up. ~~str~~ | +| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ | ### Vocab.get_by_orth {#vocab_get_by_orth tag="method"} @@ -183,11 +183,11 @@ vocabulary. > lexeme = vocab.get_by_orth(doc[0].lex.norm) > ``` -| Name | Type | Description | -| ----------- | -------------------------------------- | ------------------------------------------------------------------------------------------- | -| `mem` | `cymem.Pool` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. | -| `orth` | `attr_t` | ID of the verbatim text content. | -| **RETURNS** | `const LexemeC*` | The lexeme in the vocabulary. | +| Name | Description | +| ----------- | ---------------------------------------------------------------------------------------------------------- | +| `mem` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ | +| `orth` | ID of the verbatim text content. ~~attr_t (uint64_t)~~ | +| **RETURNS** | The lexeme in the vocabulary. 
~~const LexemeC\*~~ | ## StringStore {#stringstore tag="cdef class" source="spacy/strings.pxd"} @@ -203,7 +203,7 @@ accessed from Python. For the Python documentation, see ### Attributes {#stringstore_attributes} -| Name | Type | Description | -| ------ | ------------------------------------------------------ | ------------------------------------------------------------------------------------------------ | -| `mem` | `cymem.Pool` | A memory pool. Allocated memory will be freed once the`StringStore` object is garbage collected. | -| `keys` | `vector[hash_t]` | A list of hash values in the `StringStore`. | +| Name | Description | +| ------ | ---------------------------------------------------------------------------------------------------------------- | +| `mem` | A memory pool. Allocated memory will be freed once the `StringStore` object is garbage collected. ~~cymem.Pool~~ | +| `keys` | A list of hash values in the `StringStore`. ~~vector[hash_t] \(vector[uint64_t])~~ | diff --git a/website/docs/api/cython-structs.md b/website/docs/api/cython-structs.md index 8ee1f1b9a..4c8514b64 100644 --- a/website/docs/api/cython-structs.md +++ b/website/docs/api/cython-structs.md @@ -18,26 +18,26 @@ Cython data container for the `Token` object. > token_ptr = &doc.c[3] > ``` -| Name | Type | Description | -| ------------ | -------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `lex` | `const LexemeC*` | A pointer to the lexeme for the token. | -| `morph` | `uint64_t` | An ID allowing lookup of morphological attributes. | -| `pos` | `univ_pos_t` | Coarse-grained part-of-speech tag. 
| -| `spacy` | `bint` | A binary value indicating whether the token has trailing whitespace. | -| `tag` | `attr_t` | Fine-grained part-of-speech tag. | -| `idx` | `int` | The character offset of the token within the parent document. | -| `lemma` | `attr_t` | Base form of the token, with no inflectional suffixes. | -| `sense` | `attr_t` | Space for storing a word sense ID, currently unused. | -| `head` | `int` | Offset of the syntactic parent relative to the token. | -| `dep` | `attr_t` | Syntactic dependency relation. | -| `l_kids` | `uint32_t` | Number of left children. | -| `r_kids` | `uint32_t` | Number of right children. | -| `l_edge` | `uint32_t` | Offset of the leftmost token of this token's syntactic descendants. | -| `r_edge` | `uint32_t` | Offset of the rightmost token of this token's syntactic descendants. | -| `sent_start` | `int` | Ternary value indicating whether the token is the first word of a sentence. `0` indicates a missing value, `-1` indicates `False` and `1` indicates `True`. The default value, 0, is interpreted as no sentence break. Sentence boundary detectors will usually set 0 for all tokens except tokens that follow a sentence boundary. | -| `ent_iob` | `int` | IOB code of named entity tag. `0` indicates a missing value, `1` indicates `I`, `2` indicates `0` and `3` indicates `B`. | -| `ent_type` | `attr_t` | Named entity type. | -| `ent_id` | `attr_t` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | +| Name | Description | +| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `lex` | A pointer to the lexeme for the token. 
~~const LexemeC\*~~ | +| `morph` | An ID allowing lookup of morphological attributes. ~~uint64_t~~ | +| `pos` | Coarse-grained part-of-speech tag. ~~univ_pos_t~~ | +| `spacy` | A binary value indicating whether the token has trailing whitespace. ~~bint~~ | +| `tag` | Fine-grained part-of-speech tag. ~~attr_t (uint64_t)~~ | +| `idx` | The character offset of the token within the parent document. ~~int~~ | +| `lemma` | Base form of the token, with no inflectional suffixes. ~~attr_t (uint64_t)~~ | +| `sense` | Space for storing a word sense ID, currently unused. ~~attr_t (uint64_t)~~ | +| `head` | Offset of the syntactic parent relative to the token. ~~int~~ | +| `dep` | Syntactic dependency relation. ~~attr_t (uint64_t)~~ | +| `l_kids` | Number of left children. ~~uint32_t~~ | +| `r_kids` | Number of right children. ~~uint32_t~~ | +| `l_edge` | Offset of the leftmost token of this token's syntactic descendants. ~~uint32_t~~ | +| `r_edge` | Offset of the rightmost token of this token's syntactic descendants. ~~uint32_t~~ | +| `sent_start` | Ternary value indicating whether the token is the first word of a sentence. `0` indicates a missing value, `-1` indicates `False` and `1` indicates `True`. The default value, 0, is interpreted as no sentence break. Sentence boundary detectors will usually set 0 for all tokens except tokens that follow a sentence boundary. ~~int~~ | +| `ent_iob` | IOB code of named entity tag. `0` indicates a missing value, `1` indicates `I`, `2` indicates `O` and `3` indicates `B`. ~~int~~ | +| `ent_type` | Named entity type. ~~attr_t (uint64_t)~~ | +| `ent_id` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~attr_t (uint64_t)~~ | ### Token.get_struct_attr {#token_get_struct_attr tag="staticmethod, nogil" source="spacy/tokens/token.pxd"} @@ -52,11 +52,11 @@ Get the value of an attribute from the `TokenC` struct by attribute ID. 
> is_alpha = Token.get_struct_attr(&doc.c[3], IS_ALPHA) > ``` -| Name | Type | Description | -| ----------- | -------------------------------------- | -------------------------------------------------------------------------------------- | -| `token` | `const TokenC*` | A pointer to a `TokenC` struct. | -| `feat_name` | `attr_id_t` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. | -| **RETURNS** | `attr_t` | The value of the attribute. | +| Name | Description | +| ----------- | ---------------------------------------------------------------------------------------------------- | +| `token` | A pointer to a `TokenC` struct. ~~const TokenC\*~~ | +| `feat_name` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. ~~attr_id_t~~ | +| **RETURNS** | The value of the attribute. ~~attr_t (uint64_t)~~ | ### Token.set_struct_attr {#token_set_struct_attr tag="staticmethod, nogil" source="spacy/tokens/token.pxd"} @@ -72,11 +72,11 @@ Set the value of an attribute of the `TokenC` struct by attribute ID. > Token.set_struct_attr(token, TAG, 0) > ``` -| Name | Type | Description | -| ----------- | -------------------------------------- | -------------------------------------------------------------------------------------- | -| `token` | `const TokenC*` | A pointer to a `TokenC` struct. | -| `feat_name` | `attr_id_t` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. | -| `value` | `attr_t` | The value to set. | +| Name | Description | +| ----------- | ---------------------------------------------------------------------------------------------------- | +| `token` | A pointer to a `TokenC` struct. ~~const TokenC\*~~ | +| `feat_name` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. ~~attr_id_t~~ | +| `value` | The value to set. 
~~attr_t (uint64_t)~~ | ### token_by_start {#token_by_start tag="function" source="spacy/tokens/doc.pxd"} @@ -93,12 +93,12 @@ Find a token in a `TokenC*` array by the offset of its first character. > assert token_by_start(doc.c, doc.length, 4) == -1 > ``` -| Name | Type | Description | -| ------------ | --------------- | --------------------------------------------------------- | -| `tokens` | `const TokenC*` | A `TokenC*` array. | -| `length` | `int` | The number of tokens in the array. | -| `start_char` | `int` | The start index to search for. | -| **RETURNS** | `int` | The index of the token in the array or `-1` if not found. | +| Name | Description | +| ------------ | ----------------------------------------------------------------- | +| `tokens` | A `TokenC*` array. ~~const TokenC\*~~ | +| `length` | The number of tokens in the array. ~~int~~ | +| `start_char` | The start index to search for. ~~int~~ | +| **RETURNS** | The index of the token in the array or `-1` if not found. ~~int~~ | ### token_by_end {#token_by_end tag="function" source="spacy/tokens/doc.pxd"} @@ -115,12 +115,12 @@ Find a token in a `TokenC*` array by the offset of its final character. > assert token_by_end(doc.c, doc.length, 1) == -1 > ``` -| Name | Type | Description | -| ----------- | --------------- | --------------------------------------------------------- | -| `tokens` | `const TokenC*` | A `TokenC*` array. | -| `length` | `int` | The number of tokens in the array. | -| `end_char` | `int` | The end index to search for. | -| **RETURNS** | `int` | The index of the token in the array or `-1` if not found. | +| Name | Description | +| ----------- | ----------------------------------------------------------------- | +| `tokens` | A `TokenC*` array. ~~const TokenC\*~~ | +| `length` | The number of tokens in the array. ~~int~~ | +| `end_char` | The end index to search for. ~~int~~ | +| **RETURNS** | The index of the token in the array or `-1` if not found. 
~~int~~ | ### set_children_from_heads {#set_children_from_heads tag="function" source="spacy/tokens/doc.pxd"} @@ -143,10 +143,10 @@ attribute, in order to make the parse tree navigation consistent. > assert doc.c[3].l_kids == 1 > ``` -| Name | Type | Description | -| -------- | --------------- | ---------------------------------- | -| `tokens` | `const TokenC*` | A `TokenC*` array. | -| `length` | `int` | The number of tokens in the array. | +| Name | Description | +| -------- | ------------------------------------------ | +| `tokens` | A `TokenC*` array. ~~const TokenC\*~~ | +| `length` | The number of tokens in the array. ~~int~~ | ## LexemeC {#lexemec tag="C struct" source="spacy/structs.pxd"} @@ -160,17 +160,17 @@ struct. > lex = doc.c[3].lex > ``` -| Name | Type | Description | -| ----------- | --------------------------------------- | -------------------------------------------------------------------------------------------------------------------------- | -| `flags` | `flags_t` | Bit-field for binary lexical flag values. | -| `id` | `attr_t` | Usually used to map lexemes to rows in a matrix, e.g. for word vectors. Does not need to be unique, so currently misnamed. | -| `length` | `attr_t` | Number of unicode characters in the lexeme. | -| `orth` | `attr_t` | ID of the verbatim text content. | -| `lower` | `attr_t` | ID of the lowercase form of the lexeme. | -| `norm` | `attr_t` | ID of the lexeme's norm, i.e. a normalized form of the text. | -| `shape` | `attr_t` | Transform of the lexeme's string, to show orthographic features. | -| `prefix` | `attr_t` | Length-N substring from the start of the lexeme. Defaults to `N=1`. | -| `suffix` | `attr_t` | Length-N substring from the end of the lexeme. Defaults to `N=3`. | +| Name | Description | +| -------- | ------------------------------------------------------------------------------------------------------------------------------------------------ | +| `flags` | Bit-field for binary lexical flag values. 
~~flags_t (uint64_t)~~ | +| `id` | Usually used to map lexemes to rows in a matrix, e.g. for word vectors. Does not need to be unique, so currently misnamed. ~~attr_t (uint64_t)~~ | +| `length` | Number of unicode characters in the lexeme. ~~attr_t (uint64_t)~~ | +| `orth` | ID of the verbatim text content. ~~attr_t (uint64_t)~~ | +| `lower` | ID of the lowercase form of the lexeme. ~~attr_t (uint64_t)~~ | +| `norm` | ID of the lexeme's norm, i.e. a normalized form of the text. ~~attr_t (uint64_t)~~ | +| `shape` | Transform of the lexeme's string, to show orthographic features. ~~attr_t (uint64_t)~~ | +| `prefix` | Length-N substring from the start of the lexeme. Defaults to `N=1`. ~~attr_t (uint64_t)~~ | +| `suffix` | Length-N substring from the end of the lexeme. Defaults to `N=3`. ~~attr_t (uint64_t)~~ | ### Lexeme.get_struct_attr {#lexeme_get_struct_attr tag="staticmethod, nogil" source="spacy/lexeme.pxd"} @@ -186,11 +186,11 @@ Get the value of an attribute from the `LexemeC` struct by attribute ID. > is_alpha = Lexeme.get_struct_attr(lexeme, IS_ALPHA) > ``` -| Name | Type | Description | -| ----------- | -------------------------------------- | -------------------------------------------------------------------------------------- | -| `lex` | `const LexemeC*` | A pointer to a `LexemeC` struct. | -| `feat_name` | `attr_id_t` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. | -| **RETURNS** | `attr_t` | The value of the attribute. | +| Name | Description | +| ----------- | ---------------------------------------------------------------------------------------------------- | +| `lex` | A pointer to a `LexemeC` struct. ~~const LexemeC\*~~ | +| `feat_name` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. ~~attr_id_t~~ | +| **RETURNS** | The value of the attribute. 
~~attr_t (uint64_t)~~ | ### Lexeme.set_struct_attr {#lexeme_set_struct_attr tag="staticmethod, nogil" source="spacy/lexeme.pxd"} @@ -206,11 +206,11 @@ Set the value of an attribute of the `LexemeC` struct by attribute ID. > Lexeme.set_struct_attr(lexeme, NORM, lexeme.lower) > ``` -| Name | Type | Description | -| ----------- | -------------------------------------- | -------------------------------------------------------------------------------------- | -| `lex` | `const LexemeC*` | A pointer to a `LexemeC` struct. | -| `feat_name` | `attr_id_t` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. | -| `value` | `attr_t` | The value to set. | +| Name | Description | +| ----------- | ---------------------------------------------------------------------------------------------------- | +| `lex` | A pointer to a `LexemeC` struct. ~~const LexemeC\*~~ | +| `feat_name` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. ~~attr_id_t~~ | +| `value` | The value to set. ~~attr_t (uint64_t)~~ | ### Lexeme.c_check_flag {#lexeme_c_check_flag tag="staticmethod, nogil" source="spacy/lexeme.pxd"} @@ -226,11 +226,11 @@ Check the value of a binary flag attribute. > is_stop = Lexeme.c_check_flag(lexeme, IS_STOP) > ``` -| Name | Type | Description | -| ----------- | ---------------- | ------------------------------------------------------------------------------- | -| `lexeme` | `const LexemeC*` | A pointer to a `LexemeC` struct. | -| `flag_id` | `attr_id_t` | The ID of the flag to look up. The flag IDs are enumerated in `spacy.typedefs`. | -| **RETURNS** | `bint` | The boolean value of the flag. | +| Name | Description | +| ----------- | --------------------------------------------------------------------------------------------- | +| `lexeme` | A pointer to a `LexemeC` struct. ~~const LexemeC\*~~ | +| `flag_id` | The ID of the flag to look up. The flag IDs are enumerated in `spacy.typedefs`. 
~~attr_id_t~~ | +| **RETURNS** | The boolean value of the flag. ~~bint~~ | ### Lexeme.c_set_flag {#lexeme_c_set_flag tag="staticmethod, nogil" source="spacy/lexeme.pxd"} @@ -246,8 +246,8 @@ Set the value of a binary flag attribute. > Lexeme.c_set_flag(lexeme, IS_STOP, 0) > ``` -| Name | Type | Description | -| --------- | ---------------- | ------------------------------------------------------------------------------- | -| `lexeme` | `const LexemeC*` | A pointer to a `LexemeC` struct. | -| `flag_id` | `attr_id_t` | The ID of the flag to look up. The flag IDs are enumerated in `spacy.typedefs`. | -| `value` | `bint` | The value to set. | +| Name | Description | +| --------- | --------------------------------------------------------------------------------------------- | +| `lexeme` | A pointer to a `LexemeC` struct. ~~const LexemeC\*~~ | +| `flag_id` | The ID of the flag to look up. The flag IDs are enumerated in `spacy.typedefs`. ~~attr_id_t~~ | +| `value` | The value to set. ~~bint~~ | diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 6245c219f..727c0f35c 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -5,7 +5,8 @@ menu: - ['Training Config', 'config'] - ['Training Data', 'training'] - ['Pretraining Data', 'pretraining'] - - ['Vocabulary', 'vocab'] + - ['Vocabulary', 'vocab-jsonl'] + - ['Model Meta', 'meta'] --- This section documents input and output formats of data used by spaCy, including @@ -73,15 +74,15 @@ your config and check that it's valid, you can run the Defines the `nlp` object, its tokenizer and [processing pipeline](/usage/processing-pipelines) component names. 
-| Name | Type | Description | Default | -| ------------------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------- | -| `lang` | str | The language code to use. | `null` | -| `pipeline` | `List[str]` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). | `[]` | -| `load_vocab_data` | bool | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. | `true` | -| `before_creation` | callable | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. | `null` | -| `after_creation` | callable | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. | `null` | -| `after_pipeline_creation` | callable | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. | `null` | -| `tokenizer` | callable | The tokenizer to use. | [`Tokenizer`](/api/tokenizer) | +| Name | Description | +| ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~ | +| `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). 
Defaults to `[]`. ~~List[str]~~ | +| `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~ | +| `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ | +| `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | +| `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | +| `tokenizer` | The tokenizer to use. Defaults to [`Tokenizer`](/api/tokenizer). ~~Callable[[str], Doc]~~ | ### components {#config-components tag="section"} @@ -110,15 +111,15 @@ model to copy components from). See the docs on ### paths, system {#config-variables tag="variables"} These sections define variables that can be referenced across the other sections -as variables. For example `${paths:train}` uses the value of `train` defined in +as variables. For example `${paths.train}` uses the value of `train` defined in the block `[paths]`. If your config includes custom registered functions that need paths, you can define them here. All config values can also be [overwritten](/usage/training#config-overrides) on the CLI when you run [`spacy train`](/api/cli#train), which is especially relevant for data paths that you don't want to hard-code in your config file. 
-```bash -$ python -m spacy train ./config.cfg --paths.train ./corpus/train.spacy +```cli +$ python -m spacy train config.cfg --paths.train ./corpus/train.spacy ``` ### training {#config-training tag="section"} @@ -126,26 +127,24 @@ $ python -m spacy train ./config.cfg --paths.train ./corpus/train.spacy This section defines settings and controls for the training and evaluation process that are used when you run [`spacy train`](/api/cli#train). - - -| Name | Type | Description | Default | -| --------------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------- | -| `seed` | int | The random seed. | `${system:seed}` | -| `dropout` | float | The dropout rate. | `0.1` | -| `accumulate_gradient` | int | Whether to divide the batch up into substeps. | `1` | -| `init_tok2vec` | str | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). | `${paths:init_tok2vec}` | -| `raw_text` | str | | `${paths:raw}` | -| `vectors` | str | | `null` | -| `patience` | int | How many steps to continue without improvement in evaluation score. | `1600` | -| `max_epochs` | int | Maximum number of epochs to train for. | `0` | -| `max_steps` | int | Maximum number of update steps to train for. | `20000` | -| `eval_frequency` | int | How often to evaluate during training (steps). | `200` | -| `score_weights` | `Dict[str, float]` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. | `{}` | -| `frozen_components` | `List[str]` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. 
| `[]` | -| `train_corpus` | callable | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. | [`Corpus`](/api/corpus) | -| `dev_corpus` | callable | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. | [`Corpus`](/api/corpus) | -| `batcher` | callable | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. | [`batch_by_words`](/api/top-level#batch_by_words) | -| `optimizer` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. | [`Adam`](https://thinc.ai/docs/api-optimizers#adam) | +| Name | Description | +| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | +| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc]], Iterator[List[Doc]]]~~ | +| `dev_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/corpus). ~~Callable[[Language], Iterator[Example]]~~ | +| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | +| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | +| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). 
Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | +| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | +| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | +| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | +| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | +| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | +| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | +| `train_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/corpus). ~~Callable[[Language], Iterator[Example]]~~ | +| `vectors` | Model name or path to model containing pretrained word vectors to use, e.g. created with [`init model`](/api/cli#init-model). Defaults to `null`. ~~Optional[str]~~ | ### pretraining {#config-pretraining tag="section,optional"} @@ -153,19 +152,19 @@ This section is optional and defines settings and controls for [language model pretraining](/usage/training#pretraining). It's used when you run [`spacy pretrain`](/api/cli#pretrain). -| Name | Type | Description | Default | -| ---------------------------- | --------------------------------------------------- | ----------------------------------------------------------------------------- | --------------------------------------------------- | -| `max_epochs` | int | Maximum number of epochs. 
| `1000` | -| `min_length` | int | Minimum length of examples. | `5` | -| `max_length` | int | Maximum length of examples. | `500` | -| `dropout` | float | The dropout rate. | `0.2` | -| `n_save_every` | int | Saving frequency. | `null` | -| `batch_size` | int / `Sequence[int]` | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). | `3000` | -| `seed` | int | The random seed. | `${system.seed}` | -| `use_pytorch_for_gpu_memory` | bool | Allocate memory via PyTorch. | `${system:use_pytorch_for_gpu_memory}` | -| `tok2vec_model` | str | tok2vec model section in the config. | `"components.tok2vec.model"` | -| `objective` | dict | The pretraining objective. | `{"type": "characters", "n_characters": 4}` | -| `optimizer` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | [`Adam`](https://thinc.ai/docs/api-optimizers#adam) | +| Name | Description | +| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------- | +| `max_epochs` | Maximum number of epochs. Defaults to `1000`. ~~int~~ | +| `min_length` | Minimum length of examples. Defaults to `5`. ~~int~~ | +| `max_length` | Maximum length of examples. Defaults to `500`. ~~int~~ | +| `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ | +| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ | +| `batch_size` | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). Defaults to `3000`. ~~Union[int, Sequence[int]]~~ | +| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | +| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. Defaults to variable `${system.use_pytorch_for_gpu_memory}`. ~~bool~~ | +| `tok2vec_model` | The model section of the embedding component in the config. Defaults to `"components.tok2vec.model"`. ~~str~~ | +| `objective` | The pretraining objective. 
Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ | +| `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | ## Training data {#training} @@ -208,8 +207,8 @@ objects to JSON, you can now serialize them directly using the [`spacy convert`](/api/cli) lets you convert your JSON data to the new `.spacy` format: -```bash -$ python -m spacy convert ./data.json ./output +```cli +$ python -m spacy convert ./data.json ./output.spacy ``` @@ -313,22 +312,22 @@ to keep track of your settings and hyperparameters and your own > } > ``` -| Name | Type | Description | -| ------------- | ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `text` | str | Raw text. | -| `words` | `List[str]` | List of gold-standard tokens. | -| `lemmas` | `List[str]` | List of lemmas. | -| `spaces` | `List[bool]` | List of boolean values indicating whether the corresponding tokens is followed by a space or not. | -| `tags` | `List[str]` | List of fine-grained [POS tags](/usage/linguistic-features#pos-tagging). | -| `pos` | `List[str]` | List of coarse-grained [POS tags](/usage/linguistic-features#pos-tagging). | -| `morphs` | `List[str]` | List of [morphological features](/usage/linguistic-features#rule-based-morphology). | -| `sent_starts` | `List[bool]` | List of boolean values indicating whether each token is the first of a sentence or not. | -| `deps` | `List[str]` | List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head. | -| `heads` | `List[int]` | List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text. 
| -| `entities` | `List[str]` | **Option 1:** List of [BILUO tags](/usage/linguistic-features#accessing-ner) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens. | -| `entities` | `List[Tuple[int, int, str]]` | **Option 2:** List of `"(start, end, label)"` tuples defining all entities in the text. | -| `cats` | `Dict[str, float]` | Dictionary of `label`/`value` pairs indicating how relevant a certain [text category](/api/textcategorizer) is for the text. | -| `links` | `Dict[(int, int), Dict]` | Dictionary of `offset`/`dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The character offsets are linked to a dictionary of relevant knowledge base IDs. | +| Name | Description | +| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `text` | Raw text. ~~str~~ | +| `words` | List of gold-standard tokens. ~~List[str]~~ | +| `lemmas` | List of lemmas. ~~List[str]~~ | +| `spaces` | List of boolean values indicating whether the corresponding token is followed by a space or not. ~~List[bool]~~ | +| `tags` | List of fine-grained [POS tags](/usage/linguistic-features#pos-tagging). ~~List[str]~~ | +| `pos` | List of coarse-grained [POS tags](/usage/linguistic-features#pos-tagging). ~~List[str]~~ | +| `morphs` | List of [morphological features](/usage/linguistic-features#rule-based-morphology). ~~List[str]~~ | +| `sent_starts` | List of boolean values indicating whether each token is the first of a sentence or not. ~~List[bool]~~ | +| `deps` | List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head. ~~List[str]~~ | +| `heads` | List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text. 
~~List[int]~~ | +| `entities` | **Option 1:** List of [BILUO tags](/usage/linguistic-features#accessing-ner) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens. ~~List[str]~~ | +| `entities` | **Option 2:** List of `"(start, end, label)"` tuples defining all entities in the text. ~~List[Tuple[int, int, str]]~~ | +| `cats` | Dictionary of `label`/`value` pairs indicating how relevant a certain [text category](/api/textcategorizer) is for the text. ~~Dict[str, float]~~ | +| `links` | Dictionary of `offset`/`dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The character offsets are linked to a dictionary of relevant knowledge base IDs. ~~Dict[Tuple[int, int], Dict]~~ | @@ -372,11 +371,11 @@ example = Example.from_dict(doc, gold_dict) ## Pretraining data {#pretraining} -The [`spacy pretrain`](/api/cli#pretrain) command lets you pretrain the tok2vec -layer of pipeline components from raw text. Raw text can be provided as a -`.jsonl` (newline-delimited JSON) file containing one input text per line -(roughly paragraph length is good). Optionally, custom tokenization can be -provided. +The [`spacy pretrain`](/api/cli#pretrain) command lets you pretrain the +"token-to-vector" embedding layer of pipeline components from raw text. Raw text +can be provided as a `.jsonl` (newline-delimited JSON) file containing one input +text per line (roughly paragraph length is good). Optionally, custom +tokenization can be provided. > #### Tip: Writing JSONL > @@ -390,10 +389,10 @@ provided. > srsly.write_jsonl("/path/to/text.jsonl", data) > ``` -| Key | Type | Description | -| -------- | ---- | ---------------------------------------------------------- | -| `text` | str | The raw input text. Is not required if `tokens` available. | -| `tokens` | list | Optional tokenization, one string per token. 
| +| Key | Description | +| -------- | --------------------------------------------------------------------- | +| `text` | The raw input text. Is not required if `tokens` is available. ~~str~~ | +| `tokens` | Optional tokenization, one string per token. ~~List[str]~~ | ```json ### Example @@ -406,7 +405,7 @@ provided. ## Lexical data for vocabulary {#vocab-jsonl new="2"} To populate a model's vocabulary, you can use the -[`spacy init-model`](/api/cli#init-model) command and load in a +[`spacy init model`](/api/cli#init-model) command and load in a [newline-delimited JSON](http://jsonlines.org/) (JSONL) file containing one lexical entry per line via the `--jsonl-loc` option. The first line defines the language and vocabulary settings. All other lines are expected to be JSON @@ -457,3 +456,75 @@ Here's an example of the 20 most frequent lexemes in the English training data: ```json https://github.com/explosion/spaCy/tree/master/examples/training/vocab-data.jsonl ``` + +## Model meta {#meta} + +The model meta is available as the file `meta.json` and exported automatically +when you save an `nlp` object to disk. Its contents are available as +[`nlp.meta`](/api/language#meta). + + + +As of spaCy v3.0, the `meta.json` **isn't** used to construct the language class +and pipeline anymore and only contains meta information for reference and for +creating a Python package with [`spacy package`](/api/cli#package). How to set +up the `nlp` object is now defined in the +[`config.cfg`](/api/data-formats#config), which includes detailed information +about the pipeline components and their model architectures, and all other +settings and hyperparameters used to train the model. It's the **single source +of truth** used for loading a model. 
+ + + +> #### Example +> +> ```json +> { +> "name": "example_model", +> "lang": "en", +> "version": "1.0.0", +> "spacy_version": ">=3.0.0,<3.1.0", +> "parent_package": "spacy", +> "description": "Example model for spaCy", +> "author": "You", +> "email": "you@example.com", +> "url": "https://example.com", +> "license": "CC BY-SA 3.0", +> "sources": [{ "name": "My Corpus", "license": "MIT" }], +> "vectors": { "width": 0, "vectors": 0, "keys": 0, "name": null }, +> "pipeline": ["tok2vec", "ner", "textcat"], +> "labels": { +> "ner": ["PERSON", "ORG", "PRODUCT"], +> "textcat": ["POSITIVE", "NEGATIVE"] +> }, +> "accuracy": { +> "ents_f": 82.7300930714, +> "ents_p": 82.135523614, +> "ents_r": 83.3333333333, +> "textcat_score": 88.364323811 +> }, +> "speed": { "cpu": 7667.8, "gpu": null, "nwords": 10329 }, +> "spacy_git_version": "61dfdd9fb" +> } +> ``` + +| Name | Description | +| ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `"en"`. ~~str~~ | +| `name` | Model name, e.g. `"core_web_sm"`. The final model package name will be `{lang}_{name}`. Defaults to `"model"`. ~~str~~ | +| `version` | Model version. Will be used to version a Python package created with [`spacy package`](/api/cli#package). Defaults to `"0.0.0"`. ~~str~~ | +| `spacy_version` | spaCy version range the model is compatible with. Defaults to the spaCy version used to create the model, up to next minor version, which is the default compatibility for the available [pretrained models](/models). For instance, a model trained with v3.0.0 will have the version range `">=3.0.0,<3.1.0"`. 
~~str~~ | +| `parent_package` | Name of the spaCy package. Typically `"spacy"` or `"spacy_nightly"`. Defaults to `"spacy"`. ~~str~~ | +| `description` | Model description. Also used for Python package. Defaults to `""`. ~~str~~ | +| `author` | Model author name. Also used for Python package. Defaults to `""`. ~~str~~ | +| `email` | Model author email. Also used for Python package. Defaults to `""`. ~~str~~ | +| `url` | Model author URL. Also used for Python package. Defaults to `""`. ~~str~~ | +| `license` | Model license. Also used for Python package. Defaults to `""`. ~~str~~ | +| `sources` | Data sources used to train the model. Typically a list of dicts with the keys `"name"`, `"url"`, `"author"` and `"license"`. [See here](https://github.com/explosion/spacy-models/tree/master/meta) for examples. Defaults to `None`. ~~Optional[List[Dict[str, str]]]~~ | +| `vectors` | Information about the word vectors included with the model. Typically a dict with the keys `"width"`, `"vectors"` (number of vectors), `"keys"` and `"name"`. ~~Dict[str, Any]~~ | +| `pipeline` | Names of the pipeline components in the model, in order. Corresponds to [`nlp.pipe_names`](/api/language#pipe_names). Only exists for reference and is not used to create the components. This information is defined in the [`config.cfg`](/api/data-formats#config). Defaults to `[]`. ~~List[str]~~ | +| `labels` | Label schemes of the trained pipeline components, keyed by component name. Corresponds to [`nlp.pipe_labels`](/api/language#pipe_labels). [See here](https://github.com/explosion/spacy-models/tree/master/meta) for examples. Defaults to `{}`. ~~Dict[str, Dict[str, List[str]]]~~ | +| `accuracy` | Training accuracy, added automatically by [`spacy train`](/api/cli#train). Dictionary of [score names](/usage/training#metrics) mapped to scores. Defaults to `{}`. ~~Dict[str, Union[float, Dict[str, float]]]~~ | +| `speed` | Model speed, added automatically by [`spacy train`](/api/cli#train). 
Typically a dictionary with the keys `"cpu"`, `"gpu"` and `"nwords"` (words per second). Defaults to `{}`. ~~Dict[str, Optional[Union[float, str]]]~~ | +| `spacy_git_version` <Tag variant="new">3</Tag> | Git commit of [`spacy`](https://github.com/explosion/spaCy) used to create the model. ~~str~~ | +| other | Any other custom meta information you want to add. The data is preserved in [`nlp.meta`](/api/language#meta). ~~Any~~ | diff --git a/website/docs/api/dependencymatcher.md b/website/docs/api/dependencymatcher.md index 4f192783f..2fb903100 100644 --- a/website/docs/api/dependencymatcher.md +++ b/website/docs/api/dependencymatcher.md @@ -44,18 +44,18 @@ A pattern added to the `DependencyMatcher` consists of a list of dictionaries, with each dictionary describing a node to match. Each pattern should have the following top-level keys: -| Name | Type | Description | -| --------- | ---- | --------------------------------------------------------------------------------------------------------------------------- | -| `PATTERN` | dict | The token attributes to match in the same format as patterns provided to the regular token-based [`Matcher`](/api/matcher). | -| `SPEC` | dict | The relationships of the nodes in the subtree that should be matched. | +| Name | Description | +| --------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| `PATTERN` | The token attributes to match in the same format as patterns provided to the regular token-based [`Matcher`](/api/matcher). ~~Dict[str, Any]~~ | +| `SPEC` | The relationships of the nodes in the subtree that should be matched. 
~~Dict[str, str]~~ | The `SPEC` includes the following fields: -| Name | Type | Description | -| ------------ | ---- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `NODE_NAME` | str | A unique name for this node to refer to it in other specs. | -| `NBOR_RELOP` | str | A [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html) operator that describes how the two nodes are related. | -| `NBOR_NAME` | str | The unique name of the node that this node is connected to. | +| Name | Description | +| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `NODE_NAME` | A unique name for this node to refer to it in other specs. ~~str~~ | +| `NBOR_RELOP` | A [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html) operator that describes how the two nodes are related. ~~str~~ | +| `NBOR_NAME` | The unique name of the node that this node is connected to. ~~str~~ | ## DependencyMatcher.\_\_init\_\_ {#init tag="method"} @@ -68,9 +68,9 @@ Create a rule-based `DependencyMatcher`. > matcher = DependencyMatcher(nlp.vocab) > ``` -| Name | Type | Description | -| ------- | ------- | ------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | +| Name | Description | +| ------- | ----------------------------------------------------------------------------------------------------- | +| `vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. 
~~Vocab~~ | ## DependencyMatcher.\_\call\_\_ {#call tag="method"} @@ -79,9 +79,9 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`. > #### Example > > ```python -> from spacy.matcher import Matcher +> from spacy.matcher import DependencyMatcher > -> matcher = Matcher(nlp.vocab) +> matcher = DependencyMatcher(nlp.vocab) > pattern = [ > {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}}, > {"SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}}, @@ -91,10 +91,10 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`. > matches = matcher(doc) > ``` -| Name | Type | Description | -| ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `doclike` | `Doc`/`Span` | The `Doc` or `Span` to match over. | -| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ | +| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ | ## DependencyMatcher.\_\_len\_\_ {#len tag="method"} @@ -115,9 +115,9 @@ number of individual patterns. > assert len(matcher) == 1 > ``` -| Name | Type | Description | -| ----------- | ---- | -------------------- | -| **RETURNS** | int | The number of rules. 
| +| Name | Description | +| ----------- | ---------------------------- | +| **RETURNS** | The number of rules. ~~int~~ | ## DependencyMatcher.\_\_contains\_\_ {#contains tag="method"} @@ -132,10 +132,10 @@ Check whether the matcher contains rules for a match ID. > assert "Rule" in matcher > ``` -| Name | Type | Description | -| ----------- | ---- | ----------------------------------------------------- | -| `key` | str | The match ID. | -| **RETURNS** | bool | Whether the matcher contains rules for this match ID. | +| Name | Description | +| ----------- | -------------------------------------------------------------- | +| `key` | The match ID. ~~str~~ | +| **RETURNS** | Whether the matcher contains rules for this match ID. ~~bool~~ | ## DependencyMatcher.add {#add tag="method"} @@ -151,16 +151,16 @@ will be overwritten. > def on_match(matcher, doc, id, matches): > print('Matched!', matches) > -> matcher = Matcher(nlp.vocab) +> matcher = DependencyMatcher(nlp.vocab) > matcher.add("TEST_PATTERNS", patterns) > ``` -| Name | Type | Description | -| -------------- | ------------------ | --------------------------------------------------------------------------------------------- | -| `match_id` | str | An ID for the thing you're matching. | -| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. | -| _keyword-only_ | | | -| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `match_id` | An ID for the thing you're matching. ~~str~~ | +| `patterns` | Match pattern. A pattern consists of a list of dicts, where each dict describes a `"PATTERN"` and `"SPEC"`. 
~~List[List[Dict[str, dict]]]~~ | +| _keyword-only_ | | +| `on_match` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. ~~Optional[Callable[[Matcher, Doc, int, List[tuple]], Any]]~~ | ## DependencyMatcher.remove {#remove tag="method"} @@ -176,9 +176,9 @@ exist. > assert "Rule" not in matcher > ``` -| Name | Type | Description | -| ----- | ---- | ------------------------- | -| `key` | str | The ID of the match rule. | +| Name | Description | +| ----- | --------------------------------- | +| `key` | The ID of the match rule. ~~str~~ | ## DependencyMatcher.get {#get tag="method"} @@ -192,7 +192,7 @@ Retrieve the pattern stored for a key. Returns the rule as an > on_match, patterns = matcher.get("Rule") > ``` -| Name | Type | Description | -| ----------- | ----- | --------------------------------------------- | -| `key` | str | The ID of the match rule. | -| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. | +| Name | Description | +| ----------- | --------------------------------------------------------------------------------------------- | +| `key` | The ID of the match rule. ~~str~~ | +| **RETURNS** | The rule, as an `(on_match, patterns)` tuple. ~~Tuple[Optional[Callable], List[List[dict]]]~~ | diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index c7af8ffae..7a09a840a 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -48,13 +48,13 @@ architectures and their arguments and hyperparameters. 
> nlp.add_pipe("parser", config=config) > ``` -| Setting | Type | Description | Default | -| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------- | -| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | `None` | -| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. | `100` | -| `learn_tokens` | bool | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. | `False` | -| `min_action_freq` | int | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. | `30` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) | +| Setting | Description | +| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. 
~~Optional[List[str]]~~ | +| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | +| `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ | +| `min_action_freq` | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. Defaults to `30`. ~~int~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/dep_parser.pyx @@ -81,16 +81,16 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). -| Name | Type | Description | -| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. 
| -| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | -| _keyword-only_ | | | -| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. | -| `learn_tokens` | bool | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. | -| `min_action_freq` | int | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. | +| Name | Description | +| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| `moves` | A list of transition names. Inferred from the data if not provided. ~~Optional[List[str]]~~ | +| _keyword-only_ | | +| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. 
~~int~~ | +| `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. ~~bool~~ | +| `min_action_freq` | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. ~~int~~ | ## DependencyParser.\_\_call\_\_ {#call tag="method"} @@ -111,10 +111,10 @@ and all pipeline components are applied to the `Doc` in order. Both > processed = parser(doc) > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------ | -| `doc` | `Doc` | The document to process. | -| **RETURNS** | `Doc` | The processed document. | +| Name | Description | +| ----------- | -------------------------------- | +| `doc` | The document to process. ~~Doc~~ | +| **RETURNS** | The processed document. ~~Doc~~ | ## DependencyParser.pipe {#pipe tag="method"} @@ -133,12 +133,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and > pass > ``` -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------ | -| `stream` | `Iterable[Doc]` | A stream of documents. | -| _keyword-only_ | | | -| `batch_size` | int | The number of texts to buffer. Defaults to `128`. | -| **YIELDS** | `Doc` | Processed documents in the order of the original text. | +| Name | Description | +| -------------- | ------------------------------------------------------------- | +| `docs` | A stream of documents. ~~Iterable[Doc]~~ | +| _keyword-only_ | | +| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | +| **YIELDS** | The processed documents in order. ~~Doc~~ | ## DependencyParser.begin_training {#begin_training tag="method"} @@ -158,13 +158,13 @@ setting up the label scheme based on the data. 
> optimizer = parser.begin_training(lambda: [], pipeline=nlp.pipeline) > ``` -| Name | Type | Description | -| -------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. | -| _keyword-only_ | | | -| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/dependencyparser#create_optimizer) if not set. | -| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| **RETURNS** | The optimizer. ~~Optimizer~~ | ## DependencyParser.predict {#predict tag="method"} @@ -178,10 +178,10 @@ modifying them. > scores = parser.predict([doc1, doc2]) > ``` -| Name | Type | Description | -| ----------- | ------------------- | ---------------------------------------------- | -| `docs` | `Iterable[Doc]` | The documents to predict. | -| **RETURNS** | `syntax.StateClass` | A helper class for the parse state (internal). 
| +| Name | Description | +| ----------- | ------------------------------------------------------------- | +| `docs` | The documents to predict. ~~Iterable[Doc]~~ | +| **RETURNS** | A helper class for the parse state (internal). ~~StateClass~~ | ## DependencyParser.set_annotations {#set_annotations tag="method"} @@ -195,10 +195,10 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores. > parser.set_annotations([doc1, doc2], scores) > ``` -| Name | Type | Description | -| -------- | ------------------- | ---------------------------------------------------------- | -| `docs` | `Iterable[Doc]` | The documents to modify. | -| `scores` | `syntax.StateClass` | The scores to set, produced by `DependencyParser.predict`. | +| Name | Description | +| -------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| `docs` | The documents to modify. ~~Iterable[Doc]~~ | +| `scores` | The scores to set, produced by `DependencyParser.predict`. Returns an internal helper class for the parse state. ~~List[StateClass]~~ | ## DependencyParser.update {#update tag="method"} @@ -214,15 +214,15 @@ model. Delegates to [`predict`](/api/dependencyparser#predict) and > losses = parser.update(examples, sgd=optimizer) > ``` -| Name | Type | Description | -| ----------------- | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | -| _keyword-only_ | | | -| `drop` | float | The dropout rate. | -| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/dependencyparser#set_annotations). | -| `sgd` | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. 
| -| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | -| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | +| Name | Description | +| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## DependencyParser.get_loss {#get_loss tag="method"} @@ -237,11 +237,11 @@ predicted scores. > loss, d_loss = parser.get_loss(examples, scores) > ``` -| Name | Type | Description | -| ----------- | --------------------- | --------------------------------------------------- | -| `examples` | `Iterable[Example]` | The batch of examples. | -| `scores` | `syntax.StateClass` | Scores representing the model's predictions. | -| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. | +| Name | Description | +| ----------- | --------------------------------------------------------------------------- | +| `examples` | The batch of examples. ~~Iterable[Example]~~ | +| `scores` | Scores representing the model's predictions. ~~StateClass~~ | +| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. 
~~Tuple[float, float]~~ | ## DependencyParser.score {#score tag="method" new="3"} @@ -253,10 +253,10 @@ Score a batch of examples. > scores = parser.score(examples) > ``` -| Name | Type | Description | -| ----------- | ------------------- | -------------------------------------------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | The examples to score. | -| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans) and [`Scorer.score_deps`](/api/scorer#score_deps). | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `examples` | The examples to score. ~~Iterable[Example]~~ | +| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans) and [`Scorer.score_deps`](/api/scorer#score_deps). ~~Dict[str, Union[float, Dict[str, float]]]~~ | ## DependencyParser.create_optimizer {#create_optimizer tag="method"} @@ -270,9 +270,9 @@ component. > optimizer = parser.create_optimizer() > ``` -| Name | Type | Description | -| ----------- | --------------------------------------------------- | -------------- | -| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| Name | Description | +| ----------- | ---------------------------- | +| **RETURNS** | The optimizer. ~~Optimizer~~ | ## DependencyParser.use_params {#use_params tag="method, contextmanager"} @@ -287,9 +287,9 @@ context, the original parameters are restored. > parser.to_disk("/best_model") > ``` -| Name | Type | Description | -| -------- | ---- | ----------------------------------------- | -| `params` | dict | The parameter values to use in the model. 
| +| Name | Description | +| -------- | -------------------------------------------------- | +| `params` | The parameter values to use in the model. ~~dict~~ | ## DependencyParser.add_label {#add_label tag="method"} @@ -302,10 +302,10 @@ Add a new label to the pipe. > parser.add_label("MY_LABEL") > ``` -| Name | Type | Description | -| ----------- | ---- | --------------------------------------------------- | -| `label` | str | The label to add. | -| **RETURNS** | int | `0` if the label is already present, otherwise `1`. | +| Name | Description | +| ----------- | ----------------------------------------------------------- | +| `label` | The label to add. ~~str~~ | +| **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ | ## DependencyParser.to_disk {#to_disk tag="method"} @@ -318,11 +318,11 @@ Serialize the pipe to disk. > parser.to_disk("/path/to/parser") > ``` -| Name | Type | Description | -| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | ## DependencyParser.from_disk {#from_disk tag="method"} @@ -335,12 +335,12 @@ Load the pipe from disk. Modifies the object in place and returns it. 
> parser.from_disk("/path/to/parser") > ``` -| Name | Type | Description | -| -------------- | ------------------ | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The modified `DependencyParser` object. ~~DependencyParser~~ | ## DependencyParser.to_bytes {#to_bytes tag="method"} @@ -353,11 +353,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------- | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `DependencyParser` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The serialized form of the `DependencyParser` object. ~~bytes~~ | ## DependencyParser.from_bytes {#from_bytes tag="method"} @@ -371,12 +371,12 @@ Load the pipe from a bytestring. 
Modifies the object in place and returns it. > parser.from_bytes(parser_bytes) > ``` -| Name | Type | Description | -| -------------- | ------------------ | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `DependencyParser` | The `DependencyParser` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| `bytes_data` | The data to load from. ~~bytes~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The `DependencyParser` object. ~~DependencyParser~~ | ## DependencyParser.labels {#labels tag="property"} @@ -389,9 +389,9 @@ The labels currently added to the component. > assert "MY_LABEL" in parser.labels > ``` -| Name | Type | Description | -| ----------- | ----- | ---------------------------------- | -| **RETURNS** | tuple | The labels added to the component. | +| Name | Description | +| ----------- | ------------------------------------------------------ | +| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 45bfa31a2..3c4825f0d 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -30,11 +30,11 @@ Construct a `Doc` object. 
The most common way to get a `Doc` object is via the > doc = Doc(nlp.vocab, words=words, spaces=spaces) > ``` -| Name | Type | Description | -| -------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `words` | iterable | A list of strings to add to the container. | -| `spaces` | iterable | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. | +| Name | Description | +| -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | A storage container for lexical types. ~~Vocab~~ | +| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ | +| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ | ## Doc.\_\_getitem\_\_ {#getitem tag="method"} @@ -52,10 +52,10 @@ Negative indexing is supported, and follows the usual Python semantics, i.e. > assert span.text == "it back" > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------- | -| `i` | int | The index of the token. | -| **RETURNS** | `Token` | The token at `doc[i]`. | +| Name | Description | +| ----------- | -------------------------------- | +| `i` | The index of the token. ~~int~~ | +| **RETURNS** | The token at `doc[i]`. ~~Token~~ | Get a [`Span`](/api/span) object, starting at position `start` (token index) and ending at position `end` (token index). 
For instance, `doc[2:5]` produces a span @@ -64,10 +64,10 @@ are not supported, as `Span` objects must be contiguous (cannot have gaps). You can use negative indices and open-ended ranges, which have their normal Python semantics. -| Name | Type | Description | -| ----------- | ------ | --------------------------------- | -| `start_end` | tuple | The slice of the document to get. | -| **RETURNS** | `Span` | The span at `doc[start:end]`. | +| Name | Description | +| ----------- | ----------------------------------------------------- | +| `start_end` | The slice of the document to get. ~~Tuple[int, int]~~ | +| **RETURNS** | The span at `doc[start:end]`. ~~Span~~ | ## Doc.\_\_iter\_\_ {#iter tag="method"} @@ -85,9 +85,9 @@ main way annotations are accessed from Python. If faster-than-Python speeds are required, you can instead access the annotations as a numpy array, or access the underlying C data directly from Cython. -| Name | Type | Description | -| ---------- | ------- | ----------------- | -| **YIELDS** | `Token` | A `Token` object. | +| Name | Description | +| ---------- | --------------------------- | +| **YIELDS** | A `Token` object. ~~Token~~ | ## Doc.\_\_len\_\_ {#len tag="method"} @@ -100,9 +100,9 @@ Get the number of tokens in the document. > assert len(doc) == 7 > ``` -| Name | Type | Description | -| ----------- | ---- | ------------------------------------- | -| **RETURNS** | int | The number of tokens in the document. | +| Name | Description | +| ----------- | --------------------------------------------- | +| **RETURNS** | The number of tokens in the document. 
~~int~~ | ## Doc.set_extension {#set_extension tag="classmethod" new="2"} @@ -120,14 +120,14 @@ details, see the documentation on > assert doc._.has_city > ``` -| Name | Type | Description | -| --------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `name` | str | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `doc._.my_attr`. | -| `default` | - | Optional default value of the attribute if no getter or method is defined. | -| `method` | callable | Set a custom method on the object, for example `doc._.compare(other_doc)`. | -| `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. | -| `setter` | callable | Setter function that takes the `Doc` and a value, and modifies the object. Is called when the user writes to the `Doc._` attribute. | -| `force` | bool | Force overwriting existing attribute. | +| Name | Description | +| --------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `doc._.my_attr`. ~~str~~ | +| `default` | Optional default value of the attribute if no getter or method is defined. ~~Optional[Any]~~ | +| `method` | Set a custom method on the object, for example `doc._.compare(other_doc)`. ~~Optional[Callable[[Doc, ...], Any]]~~ | +| `getter` | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. ~~Optional[Callable[[Doc], Any]]~~ | +| `setter` | Setter function that takes the `Doc` and a value, and modifies the object. Is called when the user writes to the `Doc._` attribute. 
~~Optional[Callable[[Doc, Any], None]]~~ | +| `force` | Force overwriting existing attribute. ~~bool~~ | ## Doc.get_extension {#get_extension tag="classmethod" new="2"} @@ -144,10 +144,10 @@ Look up a previously registered extension by name. Returns a 4-tuple > assert extension == (False, None, None, None) > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------- | -| `name` | str | Name of the extension. | -| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | Name of the extension. ~~str~~ | +| **RETURNS** | A `(default, method, getter, setter)` tuple of the extension. ~~Tuple[Optional[Any], Optional[Callable], Optional[Callable], Optional[Callable]]~~ | ## Doc.has_extension {#has_extension tag="classmethod" new="2"} @@ -161,10 +161,10 @@ Check whether an extension has been registered on the `Doc` class. > assert Doc.has_extension("has_city") > ``` -| Name | Type | Description | -| ----------- | ---- | ------------------------------------------ | -| `name` | str | Name of the extension to check. | -| **RETURNS** | bool | Whether the extension has been registered. | +| Name | Description | +| ----------- | --------------------------------------------------- | +| `name` | Name of the extension to check. ~~str~~ | +| **RETURNS** | Whether the extension has been registered. ~~bool~~ | ## Doc.remove_extension {#remove_extension tag="classmethod" new="2.0.12"} @@ -179,10 +179,10 @@ Remove a previously registered extension. > assert not Doc.has_extension("has_city") > ``` -| Name | Type | Description | -| ----------- | ----- | --------------------------------------------------------------------- | -| `name` | str | Name of the extension. 
| -| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | +| Name | Description | +| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | Name of the extension. ~~str~~ | +| **RETURNS** | A `(default, method, getter, setter)` tuple of the removed extension. ~~Tuple[Optional[Any], Optional[Callable], Optional[Callable], Optional[Callable]]~~ | ## Doc.char_span {#char_span tag="method" new="2"} @@ -197,14 +197,14 @@ the character indices don't map to a valid span. > assert span.text == "New York" > ``` -| Name | Type | Description | -| ------------------------------------ | ---------------------------------------- | --------------------------------------------------------------------- | -| `start` | int | The index of the first character of the span. | -| `end` | int | The index of the last character after the span. | -| `label` | uint64 / str | A label to attach to the span, e.g. for named entities. | -| `kb_id` 2.2 | uint64 / str | An ID from a knowledge base to capture the meaning of a named entity. | -| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. | -| **RETURNS** | `Span` | The newly constructed object or `None`. | +| Name | Description | +| ------------------------------------ | ----------------------------------------------------------------------------------------- | +| `start` | The index of the first character of the span. ~~int~~ | +| `end` | The index of the last character after the span. ~~int~~ | +| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | +| `kb_id` 2.2 | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | +| `vector` | A meaning representation of the span. 
~~numpy.ndarray[ndim=1, dtype=float32]~~ | +| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | ## Doc.similarity {#similarity tag="method" model="vectors"} @@ -221,10 +221,10 @@ using an average of word vectors. > assert apples_oranges == oranges_apples > ``` -| Name | Type | Description | -| ----------- | ----- | -------------------------------------------------------------------------------------------- | -| `other` | - | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. | -| **RETURNS** | float | A scalar similarity score. Higher is more similar. | +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------------------------------------------------- | +| `other` | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. ~~Union[Doc, Span, Token, Lexeme]~~ | +| **RETURNS** | A scalar similarity score. Higher is more similar. ~~float~~ | ## Doc.count_by {#count_by tag="method"} @@ -237,15 +237,15 @@ attribute ID. > ```python > from spacy.attrs import ORTH > doc = nlp("apple apple orange banana") -> assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2} +> assert doc.count_by(ORTH) == {7024: 1, 119552: 1, 2087: 2} > doc.to_array([ORTH]) > # array([[11880], [11880], [7561], [12800]]) > ``` -| Name | Type | Description | -| ----------- | ---- | -------------------------------------------------- | -| `attr_id` | int | The attribute ID | -| **RETURNS** | dict | A dictionary mapping attributes to integer counts. | +| Name | Description | +| ----------- | --------------------------------------------------------------------- | +| `attr_id` | The attribute ID. ~~int~~ | +| **RETURNS** | A dictionary mapping attributes to integer counts. ~~Dict[int, int]~~ | ## Doc.get_lca_matrix {#get_lca_matrix tag="method"} @@ -261,9 +261,9 @@ ancestor is found, e.g. if span excludes a necessary ancestor. 
> # array([[0, 1, 1, 1], [1, 1, 1, 1], [1, 1, 2, 3], [1, 1, 3, 3]], dtype=int32) > ``` -| Name | Type | Description | -| ----------- | -------------------------------------- | ----------------------------------------------- | -| **RETURNS** | `numpy.ndarray[ndim=2, dtype="int32"]` | The lowest common ancestor matrix of the `Doc`. | +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------- | +| **RETURNS** | The lowest common ancestor matrix of the `Doc`. ~~numpy.ndarray[ndim=2, dtype=int32]~~ | ## Doc.to_array {#to_array tag="method"} @@ -288,10 +288,10 @@ Returns a 2D array with one row per token and one column per attribute (when > np_array = doc.to_array("POS") > ``` -| Name | Type | Description | -| ----------- | ---------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------- | -| `attr_ids` | list or int or string | A list of attributes (int IDs or string names) or a single attribute (int ID or string name) | -| **RETURNS** | `numpy.ndarray[ndim=2, dtype="uint64"]` or `numpy.ndarray[ndim=1, dtype="uint64"]` | The exported attributes as a numpy array. | +| Name | Description | +| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------- | +| `attr_ids` | A list of attributes (int IDs or string names) or a single attribute (int ID or string name). ~~Union[int, str, List[Union[int, str]]]~~ | +| **RETURNS** | The exported attributes as a numpy array. ~~Union[numpy.ndarray[ndim=2, dtype=uint64], numpy.ndarray[ndim=1, dtype=uint64]]~~ | ## Doc.from_array {#from_array tag="method"} @@ -310,14 +310,14 @@ array of attributes. 
> assert doc[0].pos_ == doc2[0].pos_ > ``` -| Name | Type | Description | -| ----------- | -------------------------------------- | ------------------------------------------------------------------------- | -| `attrs` | list | A list of attribute ID ints. | -| `array` | `numpy.ndarray[ndim=2, dtype="int32"]` | The attribute values to load. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Doc` | Itself. | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------- | +| `attrs` | A list of attribute ID ints. ~~List[int]~~ | +| `array` | The attribute values to load. ~~numpy.ndarray[ndim=2, dtype=int32]~~ | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The `Doc` itself. ~~Doc~~ | -## Doc.from_docs {#from_docs tag="staticmethod"} +## Doc.from_docs {#from_docs tag="staticmethod" new="3"} Concatenate multiple `Doc` objects to form a new one. Raises an error if the `Doc` objects do not all share the same `Vocab`. @@ -337,12 +337,12 @@ Concatenate multiple `Doc` objects to form a new one. Raises an error if the > [str(ent) for doc in docs for ent in doc.ents] > ``` -| Name | Type | Description | -| ------------------- | ----- | ----------------------------------------------------------------------------------------------- | -| `docs` | list | A list of `Doc` objects. | -| `ensure_whitespace` | bool | Insert a space between two adjacent docs whenever the first doc does not end in whitespace. | -| `attrs` | list | Optional list of attribute ID ints or attribute name strings. | -| **RETURNS** | `Doc` | The new `Doc` object that is containing the other docs or `None`, if `docs` is empty or `None`. 
| +| Name | Description | +| ------------------- | ----------------------------------------------------------------------------------------------------------------- | +| `docs` | A list of `Doc` objects. ~~List[Doc]~~ | +| `ensure_whitespace` | Insert a space between two adjacent docs whenever the first doc does not end in whitespace. ~~bool~~ | +| `attrs` | Optional list of attribute ID ints or attribute name strings. ~~Optional[List[Union[str, int]]]~~ | +| **RETURNS** | The new `Doc` object that contains the other docs, or `None` if `docs` is empty or `None`. ~~Optional[Doc]~~ | ## Doc.to_disk {#to_disk tag="method" new="2"} @@ -354,11 +354,11 @@ Save the current state to a directory. > doc.to_disk("/path/to/doc") > ``` -| Name | Type | Description | -| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | ## Doc.from_disk {#from_disk tag="method" new="2"} @@ -372,12 +372,12 @@ Loads state from a directory. Modifies the object in place and returns it. 
> doc = Doc(Vocab()).from_disk("/path/to/doc") > ``` -| Name | Type | Description | -| -------------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Doc` | The modified `Doc` object. | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The modified `Doc` object. ~~Doc~~ | ## Doc.to_bytes {#to_bytes tag="method"} @@ -390,11 +390,11 @@ Serialize, i.e. export the document contents to a binary string. > doc_bytes = doc.to_bytes() > ``` -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------- | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | A losslessly serialized copy of the `Doc`, including all annotations. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | A losslessly serialized copy of the `Doc`, including all annotations. ~~bytes~~ | ## Doc.from_bytes {#from_bytes tag="method"} @@ -410,12 +410,12 @@ Deserialize, i.e. import the document contents from a binary string. 
> assert doc.text == doc2.text > ``` -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------- | -| `data` | bytes | The string to load from. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Doc` | The `Doc` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| `data` | The string to load from. ~~bytes~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The `Doc` object. ~~Doc~~ | ## Doc.retokenize {#retokenize tag="contextmanager" new="2.1"} @@ -433,9 +433,9 @@ invalidated, although they may accidentally continue to work. > retokenizer.merge(doc[0:2]) > ``` -| Name | Type | Description | -| ----------- | ------------- | ---------------- | -| **RETURNS** | `Retokenizer` | The retokenizer. | +| Name | Description | +| ----------- | -------------------------------- | +| **RETURNS** | The retokenizer. ~~Retokenizer~~ | ### Retokenizer.merge {#retokenizer.merge tag="method"} @@ -454,10 +454,10 @@ dictionary mapping attribute names to values as the `"_"` key. > retokenizer.merge(doc[2:4], attrs=attrs) > ``` -| Name | Type | Description | -| ------- | ------ | -------------------------------------- | -| `span` | `Span` | The span to merge. | -| `attrs` | dict | Attributes to set on the merged token. | +| Name | Description | +| ------- | --------------------------------------------------------------------- | +| `span` | The span to merge. ~~Span~~ | +| `attrs` | Attributes to set on the merged token. 
~~Dict[Union[str, int], Any]~~ | ### Retokenizer.split {#retokenizer.split tag="method"} @@ -488,33 +488,12 @@ underlying lexeme (if they're context-independent lexical attributes like > retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs) > ``` -| Name | Type | Description | -| ------- | ------- | ----------------------------------------------------------------------------------------------------------- | -| `token` | `Token` | The token to split. | -| `orths` | list | The verbatim text of the split tokens. Needs to match the text of the original token. | -| `heads` | list | List of `token` or `(token, subtoken)` tuples specifying the tokens to attach the newly split subtokens to. | -| `attrs` | dict | Attributes to set on all split tokens. Attribute names mapped to list of per-token attribute values. | - -## Doc.merge {#merge tag="method"} - -Retokenize the document, such that the span at `doc.text[start_idx : end_idx]` -is merged into a single token. If `start_idx` and `end_idx` do not mark start -and end token boundaries, the document remains unchanged. - -> #### Example -> -> ```python -> doc = nlp("Los Angeles start.") -> doc.merge(0, len("Los Angeles"), "NNP", "Los Angeles", "GPE") -> assert [t.text for t in doc] == ["Los Angeles", "start", "."] -> ``` - -| Name | Type | Description | -| -------------- | ------- | ------------------------------------------------------------------------------------------------------------------------- | -| `start_idx` | int | The character index of the start of the slice to merge. | -| `end_idx` | int | The character index after the end of the slice to merge. | -| `**attributes` | - | Attributes to assign to the merged token. By default, attributes are inherited from the syntactic root token of the span. 
| -| **RETURNS** | `Token` | The newly merged token, or `None` if the start and end indices did not fall at token boundaries | +| Name | Description | +| ------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | +| `token` | The token to split. ~~Token~~ | +| `orths` | The verbatim text of the split tokens. Needs to match the text of the original token. ~~List[str]~~ | +| `heads` | List of `token` or `(token, subtoken)` tuples specifying the tokens to attach the newly split subtokens to. ~~List[Union[Token, Tuple[Token, int]]]~~ | +| `attrs` | Attributes to set on all split tokens. Attribute names mapped to list of per-token attribute values. ~~Dict[Union[str, int], List[Any]]~~ | ## Doc.ents {#ents tag="property" model="NER"} @@ -531,9 +510,9 @@ objects, if the entity recognizer has been applied. > assert ents[0].text == "Mr. Best" > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------ | -| **RETURNS** | tuple | Entities in the document, one `Span` per entity. | +| Name | Description | +| ----------- | --------------------------------------------------------------------- | +| **RETURNS** | Entities in the document, one `Span` per entity. ~~Tuple[Span, ...]~~ | ## Doc.noun_chunks {#noun_chunks tag="property" model="parser"} @@ -552,9 +531,9 @@ relative clauses. > assert chunks[1].text == "another phrase" > ``` -| Name | Type | Description | -| ---------- | ------ | ---------------------------- | -| **YIELDS** | `Span` | Noun chunks in the document. | +| Name | Description | +| ---------- | ------------------------------------- | +| **YIELDS** | Noun chunks in the document. ~~Span~~ | ## Doc.sents {#sents tag="property" model="parser"} @@ -572,9 +551,9 @@ will be unavailable. 
> assert [s.root.text for s in sents] == ["is", "'s"] > ``` -| Name | Type | Description | -| ---------- | ------ | -------------------------- | -| **YIELDS** | `Span` | Sentences in the document. | +| Name | Description | +| ---------- | ----------------------------------- | +| **YIELDS** | Sentences in the document. ~~Span~~ | ## Doc.has_vector {#has_vector tag="property" model="vectors"} @@ -587,9 +566,9 @@ A boolean value indicating whether a word vector is associated with the object. > assert doc.has_vector > ``` -| Name | Type | Description | -| ----------- | ---- | ------------------------------------------------ | -| **RETURNS** | bool | Whether the document has a vector data attached. | +| Name | Description | +| ----------- | --------------------------------------------------------- | +| **RETURNS** | Whether the document has vector data attached. ~~bool~~ | ## Doc.vector {#vector tag="property" model="vectors"} @@ -604,9 +583,9 @@ vectors. > assert doc.vector.shape == (300,) > ``` -| Name | Type | Description | -| ----------- | ---------------------------------------- | ------------------------------------------------------- | -| **RETURNS** | `numpy.ndarray[ndim=1, dtype="float32"]` | A 1D numpy array representing the document's semantics. | +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------------------- | +| **RETURNS** | A 1-dimensional array representing the document's vector. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | ## Doc.vector_norm {#vector_norm tag="property" model="vectors"} @@ -622,32 +601,32 @@ The L2 norm of the document's vector representation. > assert doc1.vector_norm != doc2.vector_norm > ``` -| Name | Type | Description | -| ----------- | ----- | ----------------------------------------- | -| **RETURNS** | float | The L2 norm of the vector representation. 
| +| Name | Description | +| ----------- | --------------------------------------------------- | +| **RETURNS** | The L2 norm of the vector representation. ~~float~~ | ## Attributes {#attributes} -| Name | Type | Description | -| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `text` | str | A string representation of the document text. | -| `text_with_ws` | str | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. | -| `mem` | `Pool` | The document's local memory heap, for all C data it owns. | -| `vocab` | `Vocab` | The store of lexical types. | -| `tensor` 2 | `ndarray` | Container for dense vector representations. | -| `cats` 2 | dict | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. | -| `user_data` | - | A generic storage area, for user custom data. | -| `lang` 2.1 | int | Language of the document's vocabulary. | -| `lang_` 2.1 | str | Language of the document's vocabulary. | -| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. | -| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. | -| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. | -| `is_nered` 2.1 | bool | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. | -| `sentiment` | float | The document's positivity/negativity score, if available. | -| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. 
| -| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. | -| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. | -| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | +| Name | Description | +| --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `text` | A string representation of the document text. ~~str~~ | +| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ | +| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ | +| `vocab` | The store of lexical types. ~~Vocab~~ | +| `tensor` 2 | Container for dense vector representations. ~~numpy.ndarray~~ | +| `cats` 2 | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. ~~Dict[str, float]~~ | +| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ | +| `lang` 2.1 | Language of the document's vocabulary. ~~int~~ | +| `lang_` 2.1 | Language of the document's vocabulary. ~~str~~ | +| `is_tagged` | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. ~~bool~~ | +| `is_parsed` | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. ~~bool~~ | +| `is_sentenced` | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. ~~bool~~ | +| `is_nered` 2.1 | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. 
~~bool~~ | +| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ | +| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ | +| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ | +| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ | +| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/docbin.md b/website/docs/api/docbin.md index ced742045..03aff2f6e 100644 --- a/website/docs/api/docbin.md +++ b/website/docs/api/docbin.md @@ -44,11 +44,11 @@ Create a `DocBin` object to hold serialized annotations. > doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"]) > ``` -| Argument | Type | Description | -| ----------------- | --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `attrs` | `Iterable[str]` | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. | -| `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. | -| `docs` | `Iterable[Doc]` | `Doc` objects to add on initialization. 
| +| Argument | Description | +| ----------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `attrs` | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. ~~Iterable[str]~~ | +| `store_user_data` | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. ~~bool~~ | +| `docs` | `Doc` objects to add on initialization. ~~Iterable[Doc]~~ | ## DocBin.\_\len\_\_ {#len tag="method"} @@ -63,9 +63,9 @@ Get the number of `Doc` objects that were added to the `DocBin`. > assert len(doc_bin) == 1 > ``` -| Argument | Type | Description | -| ----------- | ---- | ------------------------------------------- | -| **RETURNS** | int | The number of `Doc`s added to the `DocBin`. | +| Argument | Description | +| ----------- | --------------------------------------------------- | +| **RETURNS** | The number of `Doc`s added to the `DocBin`. ~~int~~ | ## DocBin.add {#add tag="method"} @@ -79,9 +79,9 @@ Add a `Doc`'s annotations to the `DocBin` for serialization. > doc_bin.add(doc) > ``` -| Argument | Type | Description | -| -------- | ----- | ------------------------ | -| `doc` | `Doc` | The `Doc` object to add. | +| Argument | Description | +| -------- | -------------------------------- | +| `doc` | The `Doc` object to add. ~~Doc~~ | ## DocBin.get_docs {#get_docs tag="method"} @@ -93,15 +93,15 @@ Recover `Doc` objects from the annotations, using the given vocab. 
> docs = list(doc_bin.get_docs(nlp.vocab)) > ``` -| Argument | Type | Description | -| ---------- | ------- | ------------------ | -| `vocab` | `Vocab` | The shared vocab. | -| **YIELDS** | `Doc` | The `Doc` objects. | +| Argument | Description | +| ---------- | --------------------------- | +| `vocab` | The shared vocab. ~~Vocab~~ | +| **YIELDS** | The `Doc` objects. ~~Doc~~ | ## DocBin.merge {#merge tag="method"} Extend the annotations of this `DocBin` with the annotations from another. Will -raise an error if the pre-defined attrs of the two `DocBin`s don't match. +raise an error if the pre-defined `attrs` of the two `DocBin`s don't match. > #### Example > @@ -114,9 +114,9 @@ raise an error if the pre-defined attrs of the two `DocBin`s don't match. > assert len(doc_bin1) == 2 > ``` -| Argument | Type | Description | -| -------- | -------- | ------------------------------------------- | -| `other` | `DocBin` | The `DocBin` to merge into the current bin. | +| Argument | Description | +| -------- | ------------------------------------------------------ | +| `other` | The `DocBin` to merge into the current bin. ~~DocBin~~ | ## DocBin.to_bytes {#to_bytes tag="method"} @@ -130,9 +130,9 @@ Serialize the `DocBin`'s annotations to a bytestring. > doc_bin_bytes = doc_bin.to_bytes() > ``` -| Argument | Type | Description | -| ----------- | ----- | ------------------------ | -| **RETURNS** | bytes | The serialized `DocBin`. | +| Argument | Description | +| ----------- | ---------------------------------- | +| **RETURNS** | The serialized `DocBin`. ~~bytes~~ | ## DocBin.from_bytes {#from_bytes tag="method"} @@ -145,10 +145,10 @@ Deserialize the `DocBin`'s annotations from a bytestring. > new_doc_bin = DocBin().from_bytes(doc_bin_bytes) > ``` -| Argument | Type | Description | -| ------------ | -------- | ---------------------- | -| `bytes_data` | bytes | The data to load from. | -| **RETURNS** | `DocBin` | The loaded `DocBin`. 
| +| Argument | Description | +| ------------ | -------------------------------- | +| `bytes_data` | The data to load from. ~~bytes~~ | +| **RETURNS** | The loaded `DocBin`. ~~DocBin~~ | ## DocBin.to_disk {#to_disk tag="method" new="3"} @@ -164,9 +164,9 @@ and the result can be used as the input data for > doc_bin.to_disk("./data.spacy") > ``` -| Argument | Type | Description | -| -------- | ------------ | ----------------------------------------------------- | -| `path` | str / `Path` | The file path, typically with the `.spacy` extension. | +| Argument | Description | +| -------- | -------------------------------------------------------------------------- | +| `path` | The file path, typically with the `.spacy` extension. ~~Union[str, Path]~~ | ## DocBin.from_disk {#from_disk tag="method" new="3"} @@ -178,7 +178,7 @@ Load a serialized `DocBin` from a file. Typically uses the `.spacy` extension. > doc_bin = DocBin().from_disk("./data.spacy") > ``` -| Argument | Type | Description | -| ----------- | ------------ | ----------------------------------------------------- | -| `path` | str / `Path` | The file path, typically with the `.spacy` extension. | -| **RETURNS** | `DocBin` | The loaded `DocBin`. | +| Argument | Description | +| ----------- | -------------------------------------------------------------------------- | +| `path` | The file path, typically with the `.spacy` extension. ~~Union[str, Path]~~ | +| **RETURNS** | The loaded `DocBin`. ~~DocBin~~ | diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index fa8918dba..679c3c0c2 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -40,14 +40,14 @@ architectures and their arguments and hyperparameters. 
> nlp.add_pipe("entity_linker", config=config) > ``` -| Setting | Type | Description | Default | -| ---------------- | -------------------------------------------------------- | --------------------------------------------------------------------------- | ------------------------------------------------------ | -| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. | `[]` | -| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. | `True` | -| `incl_context` | bool | Whether or not to include the local context in the model. | `True` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) | -| `kb_loader` | `Callable[[Vocab], KnowledgeBase]` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. | An empty KnowledgeBase with `entity_vector_length` 64. | -| `get_candidates` | `Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]` | Function that generates plausible candidates for a given `Span` object. | Built-in dictionary-lookup function. | +| Setting | Description | +| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | +| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | +| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). 
~~Model~~ | +| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. Defaults to [EmptyKB](/api/architectures#EmptyKB), a function returning an empty `KnowledgeBase` with an `entity_vector_length` of `64`. ~~Callable[[Vocab], KnowledgeBase]~~ | +| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py @@ -66,7 +66,7 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py > entity_linker = nlp.add_pipe("entity_linker", config=config) > > # Construction via add_pipe with custom KB and candidate generation -> config = {"kb_loader": {"@assets": "my_kb.v1"}, "get_candidates": {"@assets": "my_candidates.v1"},} +> config = {"kb": {"@assets": "my_kb.v1"}} > entity_linker = nlp.add_pipe("entity_linker", config=config) > > # Construction from class @@ -76,22 +76,21 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and -[`nlp.add_pipe`](/api/language#add_pipe). +[`nlp.add_pipe`](/api/language#add_pipe). Note that both the internal +`KnowledgeBase` as well as the Candidate generator can be customized by +providing custom registered functions. -Note that both the internal KB as well as the Candidate generator can be -customized by providing custom registered functions. - -| Name | Type | Description | -| ---------------- | -------------------------------------------------------- | ------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. 
| -| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | -| _keyword-only_ | | | -| `kb_loader` | `Callable[[Vocab], KnowledgeBase]` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. | -| `get_candidates` | `Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]` | Function that generates plausible candidates for a given `Span` object. | -| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. | -| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. | -| `incl_context` | bool | Whether or not to include the local context in the model. | +| Name | Description | +| ---------------- | -------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `kb_loader` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. ~~Callable[[Vocab], KnowledgeBase]~~ | +| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | +| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ | +| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ | +| `incl_context` | Whether or not to include the local context in the model. 
~~bool~~ | ## EntityLinker.\_\_call\_\_ {#call tag="method"} @@ -111,10 +110,10 @@ delegate to the [`predict`](/api/entitylinker#predict) and > processed = entity_linker(doc) > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------ | -| `doc` | `Doc` | The document to process. | -| **RETURNS** | `Doc` | The processed document. | +| Name | Description | +| ----------- | -------------------------------- | +| `doc` | The document to process. ~~Doc~~ | +| **RETURNS** | The processed document. ~~Doc~~ | ## EntityLinker.pipe {#pipe tag="method"} @@ -133,12 +132,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and > pass > ``` -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------ | -| `stream` | `Iterable[Doc]` | A stream of documents. | -| _keyword-only_ | | | -| `batch_size` | int | The number of texts to buffer. Defaults to `128`. | -| **YIELDS** | `Doc` | Processed documents in the order of the original text. | +| Name | Description | +| -------------- | ------------------------------------------------------------- | +| `stream` | A stream of documents. ~~Iterable[Doc]~~ | +| _keyword-only_ | | +| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | +| **YIELDS** | The processed documents in order. ~~Doc~~ | ## EntityLinker.begin_training {#begin_training tag="method"} @@ -158,13 +157,13 @@ setting up the label scheme based on the data. > optimizer = entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline) > ``` -| Name | Type | Description | -| -------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. 
| -| _keyword-only_ | | | -| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/dependencyparser#create_optimizer) if not set. | -| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| **RETURNS** | The optimizer. ~~Optimizer~~ | ## EntityLinker.predict {#predict tag="method"} @@ -179,10 +178,10 @@ if there is no prediction. > kb_ids = entity_linker.predict([doc1, doc2]) > ``` -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------ | -| `docs` | `Iterable[Doc]` | The documents to predict. | -| **RETURNS** | `List[str]` | The predicted KB identifiers for the entities in the `docs`. | +| Name | Description | +| ----------- | ------------------------------------------- | +| `docs` | The documents to predict. ~~Iterable[Doc]~~ | +| **RETURNS** | The predicted KB identifiers for the entities in the `docs`. ~~List[str]~~ | ## EntityLinker.set_annotations {#set_annotations tag="method"} @@ -197,10 +196,10 @@ entities. 
> entity_linker.set_annotations([doc1, doc2], kb_ids) > ``` -| Name | Type | Description | -| -------- | --------------- | ------------------------------------------------------------------------------------------------- | -| `docs` | `Iterable[Doc]` | The documents to modify. | -| `kb_ids` | `List[str]` | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. | +| Name | Description | +| -------- | --------------------------------------------------------------------------------------------------------------- | +| `docs` | The documents to modify. ~~Iterable[Doc]~~ | +| `kb_ids` | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. ~~List[str]~~ | ## EntityLinker.update {#update tag="method"} @@ -216,15 +215,15 @@ pipe's entity linking model and context encoder. Delegates to > losses = entity_linker.update(examples, sgd=optimizer) > ``` -| Name | Type | Description | -| ----------------- | --------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | -| _keyword-only_ | | | -| `drop` | float | The dropout rate. | -| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/textcategorizer#set_annotations). | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | -| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | -| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. 
| +| Name | Description | +| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## EntityLinker.create_optimizer {#create_optimizer tag="method"} @@ -237,9 +236,9 @@ Create an optimizer for the pipeline component. > optimizer = entity_linker.create_optimizer() > ``` -| Name | Type | Description | -| ----------- | --------------------------------------------------- | -------------- | -| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| Name | Description | +| ----------- | ---------------------------- | +| **RETURNS** | The optimizer. ~~Optimizer~~ | ## EntityLinker.use_params {#use_params tag="method, contextmanager"} @@ -254,9 +253,9 @@ context, the original parameters are restored. > entity_linker.to_disk("/best_model") > ``` -| Name | Type | Description | -| -------- | ---- | ----------------------------------------- | -| `params` | dict | The parameter values to use in the model. | +| Name | Description | +| -------- | -------------------------------------------------- | +| `params` | The parameter values to use in the model. ~~dict~~ | ## EntityLinker.to_disk {#to_disk tag="method"} @@ -269,11 +268,11 @@ Serialize the pipe to disk. 
> entity_linker.to_disk("/path/to/entity_linker") > ``` -| Name | Type | Description | -| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | ## EntityLinker.from_disk {#from_disk tag="method"} @@ -286,12 +285,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > entity_linker.from_disk("/path/to/entity_linker") > ``` -| Name | Type | Description | -| -------------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. 
~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The modified `EntityLinker` object. ~~EntityLinker~~ | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 8d30463ff..b6b9caa84 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -41,11 +41,11 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("ner", config=config) > ``` -| Setting | Type | Description | Default | -| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------- | -| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | -| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. | `100` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) | +| Setting | Description | +| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. 
~~Optional[List[str]]~~ | +| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/ner.pyx @@ -72,14 +72,14 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). -| Name | Type | Description | -| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | -| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | -| _keyword-only_ | | | -| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. 
| +| Name | Description | +| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| `moves` | A list of transition names. Inferred from the data if not provided. ~~Optional[List[str]]~~ | +| _keyword-only_ | | +| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. ~~int~~ | ## EntityRecognizer.\_\_call\_\_ {#call tag="method"} @@ -100,10 +100,10 @@ and all pipeline components are applied to the `Doc` in order. Both > processed = ner(doc) > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------ | -| `doc` | `Doc` | The document to process. | -| **RETURNS** | `Doc` | The processed document. | +| Name | Description | +| ----------- | -------------------------------- | +| `doc` | The document to process. ~~Doc~~ | +| **RETURNS** | The processed document. ~~Doc~~ | ## EntityRecognizer.pipe {#pipe tag="method"} @@ -122,12 +122,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and > pass > ``` -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------ | -| `docs` | `Iterable[Doc]` | A stream of documents. | -| _keyword-only_ | | | -| `batch_size` | int | The number of texts to buffer. Defaults to `128`. 
| -| **YIELDS** | `Doc` | Processed documents in the order of the original text. | +| Name | Description | +| -------------- | ------------------------------------------------------------- | +| `docs` | A stream of documents. ~~Iterable[Doc]~~ | +| _keyword-only_ | | +| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | +| **YIELDS** | The processed documents in order. ~~Doc~~ | ## EntityRecognizer.begin_training {#begin_training tag="method"} @@ -147,13 +147,13 @@ setting up the label scheme based on the data. > optimizer = ner.begin_training(lambda: [], pipeline=nlp.pipeline) > ``` -| Name | Type | Description | -| -------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. | -| _keyword-only_ | | | -| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/entityrecognizer#create_optimizer) if not set. | -| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | +| `sgd` | An optimizer. 
Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| **RETURNS** | The optimizer. ~~Optimizer~~ | ## EntityRecognizer.predict {#predict tag="method"} @@ -167,10 +167,10 @@ modifying them. > scores = ner.predict([doc1, doc2]) > ``` -| Name | Type | Description | -| ----------- | ------------------ | ---------------------------------------------------------------------------------------------------------- | -| `docs` | `Iterable[Doc]` | The documents to predict. | -| **RETURNS** | `List[StateClass]` | List of `syntax.StateClass` objects. `syntax.StateClass` is a helper class for the parse state (internal). | +| Name | Description | +| ----------- | ------------------------------------------------------------- | +| `docs` | The documents to predict. ~~Iterable[Doc]~~ | +| **RETURNS** | A helper class for the parse state (internal). ~~StateClass~~ | ## EntityRecognizer.set_annotations {#set_annotations tag="method"} @@ -184,10 +184,10 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores. > ner.set_annotations([doc1, doc2], scores) > ``` -| Name | Type | Description | -| -------- | ------------------ | ---------------------------------------------------------- | -| `docs` | `Iterable[Doc]` | The documents to modify. | -| `scores` | `List[StateClass]` | The scores to set, produced by `EntityRecognizer.predict`. | +| Name | Description | +| -------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| `docs` | The documents to modify. ~~Iterable[Doc]~~ | +| `scores` | The scores to set, produced by `EntityRecognizer.predict`. Returns an internal helper class for the parse state. ~~List[StateClass]~~ | ## EntityRecognizer.update {#update tag="method"} @@ -203,15 +203,15 @@ model. 
Delegates to [`predict`](/api/entityrecognizer#predict) and > losses = ner.update(examples, sgd=optimizer) > ``` -| Name | Type | Description | -| ----------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | -| _keyword-only_ | | | -| `drop` | float | The dropout rate. | -| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/entityrecognizer#set_annotations). | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | -| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | -| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | +| Name | Description | +| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## EntityRecognizer.get_loss {#get_loss tag="method"} @@ -226,11 +226,11 @@ predicted scores.
> loss, d_loss = ner.get_loss(examples, scores) > ``` -| Name | Type | Description | -| ----------- | --------------------- | --------------------------------------------------- | -| `examples` | `Iterable[Example]` | The batch of examples. | -| `scores` | `List[StateClass]` | Scores representing the model's predictions. | -| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. | +| Name | Description | +| ----------- | --------------------------------------------------------------------------- | +| `examples` | The batch of examples. ~~Iterable[Example]~~ | +| `scores` | Scores representing the model's predictions. ~~StateClass~~ | +| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | ## EntityRecognizer.score {#score tag="method" new="3"} @@ -242,10 +242,10 @@ Score a batch of examples. > scores = ner.score(examples) > ``` -| Name | Type | Description | -| ----------- | ------------------- | ------------------------------------------------------------------------ | -| `examples` | `Iterable[Example]` | The examples to score. | -| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). | +| Name | Description | +| ----------- | ---------------------------------------------------------------------------------------------------------------------- | +| `examples` | The examples to score. ~~Iterable[Example]~~ | +| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]]~~ | ## EntityRecognizer.create_optimizer {#create_optimizer tag="method"} @@ -258,9 +258,9 @@ Create an optimizer for the pipeline component. > optimizer = ner.create_optimizer() > ``` -| Name | Type | Description | -| ----------- | --------------------------------------------------- | -------------- | -| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. 
| +| Name | Description | +| ----------- | ---------------------------- | +| **RETURNS** | The optimizer. ~~Optimizer~~ | ## EntityRecognizer.use_params {#use_params tag="method, contextmanager"} @@ -275,9 +275,9 @@ context, the original parameters are restored. > ner.to_disk("/best_model") > ``` -| Name | Type | Description | -| -------- | ---- | ----------------------------------------- | -| `params` | dict | The parameter values to use in the model. | +| Name | Description | +| -------- | -------------------------------------------------- | +| `params` | The parameter values to use in the model. ~~dict~~ | ## EntityRecognizer.add_label {#add_label tag="method"} @@ -290,10 +290,10 @@ Add a new label to the pipe. > ner.add_label("MY_LABEL") > ``` -| Name | Type | Description | -| ----------- | ---- | --------------------------------------------------- | -| `label` | str | The label to add. | -| **RETURNS** | int | `0` if the label is already present, otherwise `1`. | +| Name | Description | +| ----------- | ----------------------------------------------------------- | +| `label` | The label to add. ~~str~~ | +| **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ | ## EntityRecognizer.to_disk {#to_disk tag="method"} @@ -306,11 +306,11 @@ Serialize the pipe to disk. > ner.to_disk("/path/to/ner") > ``` -| Name | Type | Description | -| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. 
| +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | ## EntityRecognizer.from_disk {#from_disk tag="method"} @@ -323,12 +323,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > ner.from_disk("/path/to/ner") > ``` -| Name | Type | Description | -| -------------- | ------------------ | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The modified `EntityRecognizer` object. ~~EntityRecognizer~~ | ## EntityRecognizer.to_bytes {#to_bytes tag="method"} @@ -341,11 +341,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. 
-| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------- | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `EntityRecognizer` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The serialized form of the `EntityRecognizer` object. ~~bytes~~ | ## EntityRecognizer.from_bytes {#from_bytes tag="method"} @@ -359,12 +359,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > ner.from_bytes(ner_bytes) > ``` -| Name | Type | Description | -| -------------- | ------------------ | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `EntityRecognizer` | The `EntityRecognizer` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| `bytes_data` | The data to load from. ~~bytes~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The `EntityRecognizer` object. ~~EntityRecognizer~~ | ## EntityRecognizer.labels {#labels tag="property"} @@ -377,9 +377,9 @@ The labels currently added to the component. 
> assert "MY_LABEL" in ner.labels > ``` -| Name | Type | Description | -| ----------- | ----- | ---------------------------------- | -| **RETURNS** | tuple | The labels added to the component. | +| Name | Description | +| ----------- | ------------------------------------------------------ | +| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index 1b98a659d..454b2a04b 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -34,12 +34,12 @@ how the component should be configured. You can override its settings via the > nlp.add_pipe("entity_ruler", config=config) > ``` -| Setting | Type | Description | Default | -| --------------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------- | ------- | -| `phrase_matcher_attr` | str | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. | `None` | -| `validate` | bool | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). | `False` | -| `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. | `False` | -| `ent_id_sep` | str | Separator used internally for entity IDs. | `"||"` | +| Setting | Description | +| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. 
~~Optional[Union[int, str]]~~ | +| `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ | +| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ | +| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"||"`. ~~str~~ | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entityruler.py @@ -63,16 +63,16 @@ be a token pattern (list) or a phrase pattern (string). For example: > ruler = EntityRuler(nlp, overwrite_ents=True) > ``` -| Name | Type | Description | -| --------------------------------- | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `nlp` | `Language` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. | -| `name` 3 | str | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. | -| _keyword-only_ | | | -| `phrase_matcher_attr` | int / str | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. | -| `validate` | bool | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. | -| `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. | -| `ent_id_sep` | str | Separator used internally for entity IDs. Defaults to `"||"`. | -| `patterns` | iterable | Optional patterns to load in on initialization. 
| +| Name | Description | +| --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `nlp` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~ | +| `name` 3 | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ | +| _keyword-only_ | | +| `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ | +| `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ | +| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ | +| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"||"`. ~~str~~ | +| `patterns` | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~ | ## EntityRuler.\_\len\_\_ {#len tag="method"} @@ -87,9 +87,9 @@ The number of all patterns added to the entity ruler. > assert len(ruler) == 1 > ``` -| Name | Type | Description | -| ----------- | ---- | ----------------------- | -| **RETURNS** | int | The number of patterns. | +| Name | Description | +| ----------- | ------------------------------- | +| **RETURNS** | The number of patterns. ~~int~~ | ## EntityRuler.\_\_contains\_\_ {#contains tag="method"} @@ -104,10 +104,10 @@ Whether a label is present in the patterns. 
> assert not "PERSON" in ruler > ``` -| Name | Type | Description | -| ----------- | ---- | -------------------------------------------- | -| `label` | str | The label to check. | -| **RETURNS** | bool | Whether the entity ruler contains the label. | +| Name | Description | +| ----------- | ----------------------------------------------------- | +| `label` | The label to check. ~~str~~ | +| **RETURNS** | Whether the entity ruler contains the label. ~~bool~~ | ## EntityRuler.\_\_call\_\_ {#call tag="method"} @@ -130,10 +130,10 @@ is chosen. > assert ents == [("Apple", "ORG")] > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------ | -| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. | -| **RETURNS** | `Doc` | The modified `Doc` with added entities, if available. | +| Name | Description | +| ----------- | -------------------------------------------------------------------- | +| `doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ | +| **RETURNS** | The modified `Doc` with added entities, if available. ~~Doc~~ | ## EntityRuler.add_patterns {#add_patterns tag="method"} @@ -152,9 +152,9 @@ of dicts) or a phrase pattern (string). For more details, see the usage guide on > ruler.add_patterns(patterns) > ``` -| Name | Type | Description | -| ---------- | ---- | -------------------- | -| `patterns` | list | The patterns to add. | +| Name | Description | +| ---------- | ---------------------------------------------------------------- | +| `patterns` | The patterns to add. ~~List[Dict[str, Union[str, List[dict]]]]~~ | ## EntityRuler.to_disk {#to_disk tag="method"} @@ -171,9 +171,9 @@ only the patterns are saved as JSONL. 
If a directory name is provided, a > ruler.to_disk("/path/to/entity_ruler") # saves patterns and config > ``` -| Name | Type | Description | -| ------ | ------------ | ----------------------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Description | +| ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | ## EntityRuler.from_disk {#from_disk tag="method"} @@ -190,10 +190,10 @@ configuration. > ruler.from_disk("/path/to/entity_ruler") # loads patterns and config > ``` -| Name | Type | Description | -| ----------- | ------------- | ---------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------- | +| `path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| **RETURNS** | The modified `EntityRuler` object. ~~EntityRuler~~ | ## EntityRuler.to_bytes {#to_bytes tag="method"} @@ -206,9 +206,9 @@ Serialize the entity ruler patterns to a bytestring. > ruler_bytes = ruler.to_bytes() > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------ | -| **RETURNS** | bytes | The serialized patterns. 
| +| Name | Description | +| ----------- | ---------------------------------- | +| **RETURNS** | The serialized patterns. ~~bytes~~ | ## EntityRuler.from_bytes {#from_bytes tag="method"} @@ -222,40 +222,40 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > ruler.from_bytes(ruler_bytes) > ``` -| Name | Type | Description | -| ------------ | ------------- | ---------------------------------- | -| `bytes_data` | bytes | The bytestring to load. | -| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. | +| Name | Description | +| ------------ | -------------------------------------------------- | +| `bytes_data` | The bytestring to load. ~~bytes~~ | +| **RETURNS** | The modified `EntityRuler` object. ~~EntityRuler~~ | ## EntityRuler.labels {#labels tag="property"} All labels present in the match patterns. -| Name | Type | Description | -| ----------- | ----- | ------------------ | -| **RETURNS** | tuple | The string labels. | +| Name | Description | +| ----------- | -------------------------------------- | +| **RETURNS** | The string labels. ~~Tuple[str, ...]~~ | ## EntityRuler.ent_ids {#labels tag="property" new="2.2.2"} -All entity ids present in the match patterns `id` properties. +All entity IDs present in the `id` properties of the match patterns. -| Name | Type | Description | -| ----------- | ----- | ------------------- | -| **RETURNS** | tuple | The string ent_ids. | +| Name | Description | +| ----------- | ----------------------------------- | +| **RETURNS** | The string IDs. ~~Tuple[str, ...]~~ | ## EntityRuler.patterns {#patterns tag="property"} Get all patterns that were added to the entity ruler. -| Name | Type | Description | -| ----------- | ---- | -------------------------------------------------- | -| **RETURNS** | list | The original patterns, one dictionary per pattern. 
| +| Name | Description | +| ----------- | ---------------------------------------------------------------------------------------- | +| **RETURNS** | The original patterns, one dictionary per pattern. ~~List[Dict[str, Union[str, dict]]]~~ | ## Attributes {#attributes} -| Name | Type | Description | -| ----------------- | ------------------------------------- | ---------------------------------------------------------------- | -| `matcher` | [`Matcher`](/api/matcher) | The underlying matcher used to process token patterns. | -| `phrase_matcher` | [`PhraseMatcher`](/api/phrasematcher) | The underlying phrase matcher, used to process phrase patterns. | -| `token_patterns` | dict | The token patterns present in the entity ruler, keyed by label. | -| `phrase_patterns` | dict | The phrase patterns present in the entity ruler, keyed by label. | +| Name | Description | +| ----------------- | --------------------------------------------------------------------------------------------------------------------- | +| `matcher` | The underlying matcher used to process token patterns. ~~Matcher~~ | +| `phrase_matcher` | The underlying phrase matcher, used to process phrase patterns. ~~PhraseMatcher~~ | +| `token_patterns` | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]]~~ | +| `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~ | diff --git a/website/docs/api/example.md b/website/docs/api/example.md index 190729490..2434cce43 100644 --- a/website/docs/api/example.md +++ b/website/docs/api/example.md @@ -8,9 +8,9 @@ new: 3.0 An `Example` holds the information for one training instance. It stores two `Doc` objects: one for holding the gold-standard reference data, and one for -holding the predictions of the pipeline. An [`Alignment`](#alignment-object) -object stores the alignment between these two documents, as they can differ in -tokenization.
+holding the predictions of the pipeline. An +[`Alignment`](/api/example#alignment-object) object stores the alignment between +these two documents, as they can differ in tokenization. ## Example.\_\_init\_\_ {#init tag="method"} @@ -31,12 +31,12 @@ both documents. > example = Example(predicted, reference) > ``` -| Name | Type | Description | -| -------------- | ----------- | ------------------------------------------------------------------------------------------------ | -| `predicted` | `Doc` | The document containing (partial) predictions. Can not be `None`. | -| `reference` | `Doc` | The document containing gold-standard annotations. Can not be `None`. | -| _keyword-only_ | | | -| `alignment` | `Alignment` | An object holding the alignment between the tokens of the `predicted` and `reference` documents. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `predicted` | The document containing (partial) predictions. Can not be `None`. ~~Doc~~ | +| `reference` | The document containing gold-standard annotations. Can not be `None`. ~~Doc~~ | +| _keyword-only_ | | +| `alignment` | An object holding the alignment between the tokens of the `predicted` and `reference` documents. ~~Optional[Alignment]~~ | ## Example.from_dict {#from_dict tag="classmethod"} @@ -56,11 +56,11 @@ see the [training format documentation](/api/data-formats#dict-input). > example = Example.from_dict(predicted, {"words": token_ref, "tags": tags_ref}) > ``` -| Name | Type | Description | -| -------------- | ---------------- | ----------------------------------------------------------------- | -| `predicted` | `Doc` | The document containing (partial) predictions. Can not be `None`. | -| `example_dict` | `Dict[str, obj]` | The gold-standard annotations as a dictionary. Can not be `None`. | -| **RETURNS** | `Example` | The newly constructed object. 
| +| Name | Description | +| -------------- | ------------------------------------------------------------------------- | +| `predicted` | The document containing (partial) predictions. Can not be `None`. ~~Doc~~ | +| `example_dict` | The gold-standard annotations as a dictionary. Can not be `None`. ~~Dict[str, Any]~~ | +| **RETURNS** | The newly constructed object. ~~Example~~ | ## Example.text {#text tag="property"} @@ -72,12 +72,14 @@ The text of the `predicted` document in this `Example`. > raw_text = example.text > ``` -| Name | Type | Description | -| ----------- | ---- | ------------------------------------- | -| **RETURNS** | str | The text of the `predicted` document. | +| Name | Description | +| ----------- | --------------------------------------------- | +| **RETURNS** | The text of the `predicted` document. ~~str~~ | ## Example.predicted {#predicted tag="property"} +The `Doc` holding the predictions. Occasionally also referred to as `example.x`. + > #### Example > > ```python @@ -86,14 +88,15 @@ The text of the `predicted` document in this `Example`. > set_annotations(docs, predictions) > ``` -The `Doc` holding the predictions. Occassionally also refered to as `example.x`. - -| Name | Type | Description | -| ----------- | ----- | ---------------------------------------------- | -| **RETURNS** | `Doc` | The document containing (partial) predictions. | +| Name | Description | +| ----------- | ------------------------------------------------------ | +| **RETURNS** | The document containing (partial) predictions. ~~Doc~~ | ## Example.reference {#reference tag="property"} +The `Doc` holding the gold-standard annotations. Occasionally also referred to +as `example.y`. + > #### Example > > ```python @@ -102,15 +105,15 @@ The `Doc` holding the predictions. Occassionally also refered to as `example.x`. > gold_labels[i][j] = eg.reference.cats.get(label, 0.0) > ``` -The `Doc` holding the gold-standard annotations.
Occassionally also refered to -as `example.y`. - -| Name | Type | Description | -| ----------- | ----- | -------------------------------------------------- | -| **RETURNS** | `Doc` | The document containing gold-standard annotations. | +| Name | Description | +| ----------- | ---------------------------------------------------------- | +| **RETURNS** | The document containing gold-standard annotations. ~~Doc~~ | ## Example.alignment {#alignment tag="property"} +The [`Alignment`](/api/example#alignment-object) object mapping the tokens of +the `predicted` document to those of the `reference` document. + > #### Example > > ```python @@ -122,15 +125,15 @@ as `example.y`. > assert list(alignment.y2x.data) == [[0], [1], [2], [2]] > ``` -The `Alignment` object mapping the tokens of the `predicted` document to those -of the `reference` document. - -| Name | Type | Description | -| ----------- | ----------- | -------------------------------------------------- | -| **RETURNS** | `Alignment` | The document containing gold-standard annotations. | +| Name | Description | +| ----------- | ---------------------------------------------------------------- | +| **RETURNS** | The object holding the alignment between the tokens of the `predicted` and `reference` documents. ~~Alignment~~ | ## Example.get_aligned {#get_aligned tag="method"} +Get the aligned view of a certain token attribute, denoted by its int ID or +string name. + > #### Example > > ```python @@ -141,17 +144,18 @@ of the `reference` document. > assert example.get_aligned("TAG", as_string=True) == ["VERB", "DET", "NOUN"] > ``` -Get the aligned view of a certain token attribute, denoted by its int ID or -string name. - -| Name | Type | Description | Default | -| ----------- | -------------------------- | ------------------------------------------------------------------ | ------- | -| `field` | int or str | Attribute ID or string name | | -| `as_string` | bool | Whether or not to return the list of values as strings.
| `False` | -| **RETURNS** | `List[int]` or `List[str]` | List of integer values, or string values if `as_string` is `True`. | | +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------------------- | +| `field` | Attribute ID or string name. ~~Union[int, str]~~ | +| `as_string` | Whether or not to return the list of values as strings. Defaults to `False`. ~~bool~~ | +| **RETURNS** | List of integer values, or string values if `as_string` is `True`. ~~Union[List[int], List[str]]~~ | ## Example.get_aligned_parse {#get_aligned_parse tag="method"} +Get the aligned view of the dependency parse. If `projectivize` is set to +`True`, non-projective dependency trees are made projective through the +Pseudo-Projective Dependency Parsing algorithm by Nivre and Nilsson (2005). + > #### Example > > ```python @@ -161,17 +165,16 @@ string name. > assert proj_heads == [3, 2, 3, 0, 3] > ``` -Get the aligned view of the dependency parse. If `projectivize` is set to -`True`, non-projective dependency trees are made projective through the -Pseudo-Projective Dependency Parsing algorithm by Nivre and Nilsson (2005). - -| Name | Type | Description | Default | -| -------------- | -------------------------- | ------------------------------------------------------------------ | ------- | -| `projectivize` | bool | Whether or not to projectivize the dependency trees | `True` | -| **RETURNS** | `List[int]` or `List[str]` | List of integer values, or string values if `as_string` is `True`. | | +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------- | +| `projectivize` | Whether or not to projectivize the dependency trees. Defaults to `True`. ~~bool~~ | +| **RETURNS** | List of integer values, or string values if `as_string` is `True`. 
~~Union[List[int], List[str]]~~ | ## Example.get_aligned_ner {#get_aligned_ner tag="method"} +Get the aligned view of the NER +[BILUO](/usage/linguistic-features#accessing-ner) tags. + > #### Example > > ```python @@ -184,15 +187,16 @@ Pseudo-Projective Dependency Parsing algorithm by Nivre and Nilsson (2005). > assert ner_tags == ["B-PERSON", "L-PERSON", "O", "O", "U-LOC"] > ``` -Get the aligned view of the NER -[BILUO](/usage/linguistic-features#accessing-ner) tags. - -| Name | Type | Description | -| ----------- | ----------- | ----------------------------------------------------------------------------------- | -| **RETURNS** | `List[str]` | List of BILUO values, denoting whether tokens are part of an NER annotation or not. | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------- | +| **RETURNS** | List of BILUO values, denoting whether tokens are part of an NER annotation or not. ~~List[str]~~ | ## Example.get_aligned_spans_y2x {#get_aligned_spans_y2x tag="method"} +Get the aligned view of any set of [`Span`](/api/span) objects defined over +[`Example.reference`](/api/example#reference). The resulting span indices will +align to the tokenization in [`Example.predicted`](/api/example#predicted). + > #### Example > > ```python @@ -207,17 +211,19 @@ Get the aligned view of the NER > assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1)] > ``` -Get the aligned view of any set of [`Span`](/api/span) objects defined over -`example.reference`. The resulting span indices will align to the tokenization -in `example.predicted`. - -| Name | Type | Description | -| ----------- | ---------------- | --------------------------------------------------------------- | -| `y_spans` | `Iterable[Span]` | `Span` objects aligned to the tokenization of `self.reference`. | -| **RETURNS** | `Iterable[Span]` | `Span` objects aligned to the tokenization of `self.predicted`. 
| +| Name | Description | +| ----------- | ----------------------------------------------------------------------------- | +| `y_spans` | `Span` objects aligned to the tokenization of `reference`. ~~Iterable[Span]~~ | +| **RETURNS** | `Span` objects aligned to the tokenization of `predicted`. ~~List[Span]~~ | ## Example.get_aligned_spans_x2y {#get_aligned_spans_x2y tag="method"} +Get the aligned view of any set of [`Span`](/api/span) objects defined over +[`Example.predicted`](/api/example#predicted). The resulting span indices will +align to the tokenization in [`Example.reference`](/api/example#reference). This +method is particularly useful to assess the accuracy of predicted entities +against the original gold-standard annotation. + > #### Example > > ```python @@ -232,15 +238,10 @@ in `example.predicted`. > assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2)] > ``` -Get the aligned view of any set of [`Span`](/api/span) objects defined over -`example.predicted`. The resulting span indices will align to the tokenization -in `example.reference`. This method is particularly useful to assess the -accuracy of predicted entities against the original gold-standard annotation. - -| Name | Type | Description | -| ----------- | ---------------- | --------------------------------------------------------------- | -| `x_spans` | `Iterable[Span]` | `Span` objects aligned to the tokenization of `self.predicted`. | -| **RETURNS** | `Iterable[Span]` | `Span` objects aligned to the tokenization of `self.reference`. | +| Name | Description | +| ----------- | ----------------------------------------------------------------------------- | +| `x_spans` | `Span` objects aligned to the tokenization of `predicted`. ~~Iterable[Span]~~ | +| **RETURNS** | `Span` objects aligned to the tokenization of `reference`. ~~List[Span]~~ | ## Example.to_dict {#to_dict tag="method"} @@ -253,12 +254,14 @@ reference annotation contained in this `Example`. 
> eg_dict = example.to_dict() > ``` -| Name | Type | Description | -| ----------- | ---------------- | ------------------------------------------------------ | -| **RETURNS** | `Dict[str, Any]` | Dictionary representation of the reference annotation. | +| Name | Description | +| ----------- | ------------------------------------------------------------------------- | +| **RETURNS** | Dictionary representation of the reference annotation. ~~Dict[str, Any]~~ | ## Example.split_sents {#split_sents tag="method"} +Split one `Example` into multiple `Example` objects, one for each sentence. + > #### Example > > ```python @@ -271,11 +274,9 @@ reference annotation contained in this `Example`. > assert split_examples[1].text == "had lots of fun" > ``` -Split one `Example` into multiple `Example` objects, one for each sentence. - -| Name | Type | Description | -| ----------- | --------------- | ---------------------------------------------------------- | -| **RETURNS** | `List[Example]` | List of `Example` objects, one for each original sentence. | +| Name | Description | +| ----------- | ---------------------------------------------------------------------------- | +| **RETURNS** | List of `Example` objects, one for each original sentence. ~~List[Example]~~ | ## Alignment {#alignment-object new="3"} @@ -283,10 +284,10 @@ Calculate alignment tables between two tokenizations. ### Alignment attributes {#alignment-attributes"} -| Name | Type | Description | -| ----- | -------------------------------------------------- | ---------------------------------------------------------- | -| `x2y` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | The `Ragged` object holding the alignment from `x` to `y`. | -| `y2x` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | The `Ragged` object holding the alignment from `y` to `x`. 
| +| Name | Description | +| ----- | --------------------------------------------------------------------- | +| `x2y` | The `Ragged` object holding the alignment from `x` to `y`. ~~Ragged~~ | +| `y2x` | The `Ragged` object holding the alignment from `y` to `x`. ~~Ragged~~ | @@ -314,8 +315,8 @@ tokenizations add up to the same string. For example, you'll be able to align ### Alignment.from_strings {#classmethod tag="function"} -| Name | Type | Description | -| ----------- | ----------- | ----------------------------------------------- | -| `A` | list | String values of candidate tokens to align. | -| `B` | list | String values of reference tokens to align. | -| **RETURNS** | `Alignment` | An `Alignment` object describing the alignment. | +| Name | Description | +| ----------- | ------------------------------------------------------------- | +| `A` | String values of candidate tokens to align. ~~List[str]~~ | +| `B` | String values of reference tokens to align. ~~List[str]~~ | +| **RETURNS** | An `Alignment` object describing the alignment. ~~Alignment~~ | diff --git a/website/docs/api/kb.md b/website/docs/api/kb.md index 7b2c4edf4..855dead27 100644 --- a/website/docs/api/kb.md +++ b/website/docs/api/kb.md @@ -9,7 +9,7 @@ new: 2.2 --- The `KnowledgeBase` object provides a method to generate -[`Candidate`](/api/kb/#candidate_init) objects, which are plausible external +[`Candidate`](/api/kb/#candidate) objects, which are plausible external identifiers given a certain textual mention. Each such `Candidate` holds information from the relevant KB entities, such as its frequency in text and possible aliases. Each entity in the knowledge base also has a pretrained entity @@ -27,18 +27,18 @@ Create the knowledge base. > kb = KnowledgeBase(vocab=vocab, entity_vector_length=64) > ``` -| Name | Type | Description | -| ---------------------- | ------- | ---------------------------------------- | -| `vocab` | `Vocab` | A `Vocab` object. 
| -| `entity_vector_length` | int | Length of the fixed-size entity vectors. | +| Name | Description | +| ---------------------- | ------------------------------------------------ | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `entity_vector_length` | Length of the fixed-size entity vectors. ~~int~~ | ## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"} The length of the fixed-size entity vectors in the knowledge base. -| Name | Type | Description | -| ----------- | ---- | ---------------------------------------- | -| **RETURNS** | int | Length of the fixed-size entity vectors. | +| Name | Description | +| ----------- | ------------------------------------------------ | +| **RETURNS** | Length of the fixed-size entity vectors. ~~int~~ | ## KnowledgeBase.add_entity {#add_entity tag="method"} @@ -53,11 +53,11 @@ vector, which should be of length > kb.add_entity(entity="Q463035", freq=111, entity_vector=vector2) > ``` -| Name | Type | Description | -| --------------- | ------ | ----------------------------------------------- | -| `entity` | str | The unique entity identifier | -| `freq` | float | The frequency of the entity in a typical corpus | -| `entity_vector` | vector | The pretrained vector of the entity | +| Name | Description | +| --------------- | ---------------------------------------------------------- | +| `entity` | The unique entity identifier. ~~str~~ | +| `freq` | The frequency of the entity in a typical corpus. ~~float~~ | +| `entity_vector` | The pretrained vector of the entity. ~~numpy.ndarray~~ | ## KnowledgeBase.set_entities {#set_entities tag="method"} @@ -70,11 +70,11 @@ frequency and entity vector for each entity. 
> kb.set_entities(entity_list=["Q42", "Q463035"], freq_list=[32, 111], vector_list=[vector1, vector2]) > ``` -| Name | Type | Description | -| ------------- | -------- | --------------------------------- | -| `entity_list` | iterable | List of unique entity identifiers | -| `freq_list` | iterable | List of entity frequencies | -| `vector_list` | iterable | List of entity vectors | +| Name | Description | +| ------------- | ---------------------------------------------------------------- | +| `entity_list` | List of unique entity identifiers. ~~Iterable[Union[str, int]]~~ | +| `freq_list` | List of entity frequencies. ~~Iterable[int]~~ | +| `vector_list` | List of entity vectors. ~~Iterable[numpy.ndarray]~~ | ## KnowledgeBase.add_alias {#add_alias tag="method"} @@ -90,11 +90,11 @@ should not exceed 1. > kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3]) > ``` -| Name | Type | Description | -| --------------- | -------- | -------------------------------------------------- | -| `alias` | str | The textual mention or alias | -| `entities` | iterable | The potential entities that the alias may refer to | -| `probabilities` | iterable | The prior probabilities of each entity | +| Name | Description | +| --------------- | --------------------------------------------------------------------------------- | +| `alias` | The textual mention or alias. ~~str~~ | +| `entities` | The potential entities that the alias may refer to. ~~Iterable[Union[str, int]]~~ | +| `probabilities` | The prior probabilities of each entity. ~~Iterable[float]~~ | ## KnowledgeBase.\_\_len\_\_ {#len tag="method"} @@ -106,9 +106,9 @@ Get the total number of entities in the knowledge base. > total_entities = len(kb) > ``` -| Name | Type | Description | -| ----------- | ---- | --------------------------------------------- | -| **RETURNS** | int | The number of entities in the knowledge base. 
| +| Name | Description | +| ----------- | ----------------------------------------------------- | +| **RETURNS** | The number of entities in the knowledge base. ~~int~~ | ## KnowledgeBase.get_entity_strings {#get_entity_strings tag="method"} @@ -120,9 +120,9 @@ Get a list of all entity IDs in the knowledge base. > all_entities = kb.get_entity_strings() > ``` -| Name | Type | Description | -| ----------- | ---- | ------------------------------------------- | -| **RETURNS** | list | The list of entities in the knowledge base. | +| Name | Description | +| ----------- | --------------------------------------------------------- | +| **RETURNS** | The list of entities in the knowledge base. ~~List[str]~~ | ## KnowledgeBase.get_size_aliases {#get_size_aliases tag="method"} @@ -134,9 +134,9 @@ Get the total number of aliases in the knowledge base. > total_aliases = kb.get_size_aliases() > ``` -| Name | Type | Description | -| ----------- | ---- | -------------------------------------------- | -| **RETURNS** | int | The number of aliases in the knowledge base. | +| Name | Description | +| ----------- | ---------------------------------------------------- | +| **RETURNS** | The number of aliases in the knowledge base. ~~int~~ | ## KnowledgeBase.get_alias_strings {#get_alias_strings tag="method"} @@ -148,14 +148,14 @@ Get a list of all aliases in the knowledge base. > all_aliases = kb.get_alias_strings() > ``` -| Name | Type | Description | -| ----------- | ---- | ------------------------------------------ | -| **RETURNS** | list | The list of aliases in the knowledge base. | +| Name | Description | +| ----------- | -------------------------------------------------------- | +| **RETURNS** | The list of aliases in the knowledge base. ~~List[str]~~ | ## KnowledgeBase.get_candidates {#get_candidates tag="method"} Given a certain textual mention as input, retrieve a list of candidate entities -of type [`Candidate`](/api/kb/#candidate_init). 
+of type [`Candidate`](/api/kb/#candidate). > #### Example > @@ -163,10 +163,10 @@ of type [`Candidate`](/api/kb/#candidate_init). > candidates = kb.get_candidates("Douglas") > ``` -| Name | Type | Description | -| ----------- | -------- | ---------------------------------------- | -| `alias` | str | The textual mention or alias | -| **RETURNS** | iterable | The list of relevant `Candidate` objects | +| Name | Description | +| ----------- | ------------------------------------- | +| `alias` | The textual mention or alias. ~~str~~ | +| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ | ## KnowledgeBase.get_vector {#get_vector tag="method"} @@ -178,10 +178,10 @@ Given a certain entity ID, retrieve its pretrained entity vector. > vector = kb.get_vector("Q42") > ``` -| Name | Type | Description | -| ----------- | ------ | ----------------- | -| `entity` | str | The entity ID | -| **RETURNS** | vector | The entity vector | +| Name | Description | +| ----------- | ------------------------------------ | +| `entity` | The entity ID. ~~str~~ | +| **RETURNS** | The entity vector. ~~numpy.ndarray~~ | ## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"} @@ -194,27 +194,27 @@ probability of the fact that the mention links to the entity ID. > probability = kb.get_prior_prob("Q42", "Douglas") > ``` -| Name | Type | Description | -| ----------- | ----- | -------------------------------------------------------------- | -| `entity` | str | The entity ID | -| `alias` | str | The textual mention or alias | -| **RETURNS** | float | The prior probability of the `alias` referring to the `entity` | +| Name | Description | +| ----------- | ------------------------------------------------------------------------- | +| `entity` | The entity ID. ~~str~~ | +| `alias` | The textual mention or alias. ~~str~~ | +| **RETURNS** | The prior probability of the `alias` referring to the `entity`. 
~~float~~ | -## KnowledgeBase.dump {#dump tag="method"} +## KnowledgeBase.to_disk {#to_disk tag="method"} Save the current state of the knowledge base to a directory. > #### Example > > ```python -> kb.dump(loc) +> kb.to_disk(loc) > ``` -| Name | Type | Description | -| ----- | ------------ | --------------------------------------------------------------------------------------------------------------------- | -| `loc` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Description | +| ----- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `loc` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | -## KnowledgeBase.load_bulk {#load_bulk tag="method"} +## KnowledgeBase.from_disk {#from_disk tag="method"} Restore the state of the knowledge base from a given directory. Note that the [`Vocab`](/api/vocab) should also be the same as the one used to create the KB. @@ -226,15 +226,23 @@ Restore the state of the knowledge base from a given directory. Note that the > from spacy.vocab import Vocab > vocab = Vocab().from_disk("/path/to/vocab") > kb = KnowledgeBase(vocab=vocab, entity_vector_length=64) -> kb.load_bulk("/path/to/kb") +> kb.from_disk("/path/to/kb") > ``` -| Name | Type | Description | -| ----------- | --------------- | -------------------------------------------------------------------------- | -| `loc` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `KnowledgeBase` | The modified `KnowledgeBase` object. | +| Name | Description | +| ----------- | ----------------------------------------------------------------------------------------------- | +| `loc` | A path to a directory. 
Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ | -## Candidate.\_\_init\_\_ {#candidate_init tag="method"} +## Candidate {#candidate tag="class"} + +A `Candidate` object refers to a textual mention (alias) that may or may not be +resolved to a specific entity from a `KnowledgeBase`. This will be used as input +for the entity linking algorithm which will disambiguate the various candidates +to the correct one. Each candidate `(alias, entity)` pair is assigned a +certain prior probability. + +### Candidate.\_\_init\_\_ {#candidate-init tag="method"} Construct a `Candidate` object. Usually this constructor is not called directly, but instead these objects are returned by the @@ -247,22 +255,22 @@ but instead these objects are returned by the > candidate = Candidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob) > ``` -| Name | Type | Description | -| ------------- | --------------- | -------------------------------------------------------------- | -| `kb` | `KnowledgeBase` | The knowledge base that defined this candidate. | -| `entity_hash` | int | The hash of the entity's KB ID. | -| `entity_freq` | float | The entity frequency as recorded in the KB. | -| `alias_hash` | int | The hash of the textual mention or alias. | -| `prior_prob` | float | The prior probability of the `alias` referring to the `entity` | +| Name | Description | +| ------------- | ------------------------------------------------------------------------- | +| `kb` | The knowledge base that defined this candidate. ~~KnowledgeBase~~ | +| `entity_hash` | The hash of the entity's KB ID. ~~int~~ | +| `entity_freq` | The entity frequency as recorded in the KB. ~~float~~ | +| `alias_hash` | The hash of the textual mention or alias. ~~int~~ | +| `prior_prob` | The prior probability of the `alias` referring to the `entity`. 
~~float~~ | -## Candidate attributes {#candidate_attributes} +## Candidate attributes {#candidate-attributes} -| Name | Type | Description | -| --------------- | ------ | -------------------------------------------------------------- | -| `entity` | int | The entity's unique KB identifier | -| `entity_` | str | The entity's unique KB identifier | -| `alias` | int | The alias or textual mention | -| `alias_` | str | The alias or textual mention | -| `prior_prob` | long | The prior probability of the `alias` referring to the `entity` | -| `entity_freq` | long | The frequency of the entity in a typical corpus | -| `entity_vector` | vector | The pretrained vector of the entity | +| Name | Description | +| --------------- | ------------------------------------------------------------------------ | +| `entity` | The entity's unique KB identifier. ~~int~~ | +| `entity_` | The entity's unique KB identifier. ~~str~~ | +| `alias` | The alias or textual mention. ~~int~~ | +| `alias_` | The alias or textual mention. ~~str~~ | +| `prior_prob` | The prior probability of the `alias` referring to the `entity`. ~~long~~ | +| `entity_freq` | The frequency of the entity in a typical corpus. ~~long~~ | +| `entity_vector` | The pretrained vector of the entity. ~~numpy.ndarray~~ | diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 41d660421..34e3569a7 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -32,15 +32,15 @@ Initialize a `Language` object. > nlp = Language(Vocab()) > ``` -| Name | Type | Description | -| ------------------ | ----------- | ------------------------------------------------------------------------------------------ | -| `vocab` | `Vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. | -| _keyword-only_ | | | -| `max_length` | int | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. 
| -| `meta` | dict | Custom meta data for the `Language` class. Is written to by models to add model meta data. | -| `create_tokenizer` | Β `Callable` | Optional function that receives the `nlp` object and returns a tokenizer. | +| Name | Description | +| ------------------ | ------------------------------------------------------------------------------------------------------------------------ | +| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ | +| _keyword-only_ | | +| `max_length` | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. ~~int~~ | +| `meta` | Custom meta data for the `Language` class. Is written to by models to add model meta data. ~~dict~~ | +| `create_tokenizer` | Optional function that receives the `nlp` object and returns a tokenizer. ~~Callable[[Language], Callable[[str], Doc]]~~ | -## Language.from_config {#from_config tag="classmethod"} +## Language.from_config {#from_config tag="classmethod" new="3"} Create a `Language` object from a loaded config. Will set up the tokenizer and language data, add pipeline components based on the pipeline and components @@ -58,14 +58,14 @@ model under the hood based on its [`config.cfg`](/api/data-formats#config). > nlp = Language.from_config(config) > ``` -| Name | Type | Description | -| -------------- | ---------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- | -| `config` | `Dict[str, Any]` / [`Config`](https://thinc.ai/docs/api-config#config) | The loaded config. | -| _keyword-only_ | | -| `disable` | `Iterable[str]` | List of pipeline component names to disable. | -| `auto_fill` | bool | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. 
| -| `validate` | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. | -| **RETURNS** | `Language` | The initialized object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ | +| `config` | The loaded config. ~~Union[Dict[str, Any], Config]~~ | +| _keyword-only_ | | +| `disable` | List of pipeline component names to disable. ~~Iterable[str]~~ | +| `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ | +| `validate` | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ | +| **RETURNS** | The initialized object. ~~Language~~ | ## Language.component {#component tag="classmethod" new="3"} @@ -94,16 +94,14 @@ decorator. For more details and examples, see the > Language.component("my_component2", func=my_component) > ``` -| Name | Type | Description | -| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `name` | str | The name of the component factory. | -| _keyword-only_ | | | -| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis).. | -| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | -| `retokenizes` | bool | Whether the component changes tokenization. 
Used for [pipe analysis](/usage/processing-pipelines#analysis). | -| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | -| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. | -| `func` | `Optional[Callable]` | Optional function if not used a a decorator. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `name` | The name of the component factory. ~~str~~ | +| _keyword-only_ | | +| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | +| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | +| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ | +| `func` | Optional function if not used as a decorator. ~~Optional[Callable[[Doc], Doc]]~~ | ## Language.factory {#factory tag="classmethod"} @@ -141,17 +139,17 @@ examples, see the > ) > ``` -| Name | Type | Description | -| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `name` | str | The name of the component factory. 
| -| _keyword-only_ | | | -| `default_config` | `Dict[str, any]` | The default config, describing the default values of the factory arguments. | -| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | -| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | -| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). | -| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | -| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. | -| `func` | `Optional[Callable]` | Optional function if not used a a decorator. | +| Name | Description | +| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `name` | The name of the component factory. ~~str~~ | +| _keyword-only_ | | +| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ | +| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | +| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. 
Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | +| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ | +| `scores` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | +| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ | +| `func` | Optional function if not used as a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ | ## Language.\_\_call\_\_ {#call tag="method"} @@ -165,13 +163,13 @@ contain arbitrary whitespace. Alignment into the original string is preserved. > assert (doc[0].text, doc[0].head.tag_) == ("An", "NN") > ``` -| Name | Type | Description | -| --------------- | ----------------- | ------------------------------------------------------------------------------------------------------ | -| `text` | str | The text to be processed. | -| _keyword-only_ | | | -| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | -| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. | -| **RETURNS** | [`Doc`](/api/doc) | A container for accessing the annotations. | +| Name | Description | +| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| `text` | The text to be processed. ~~str~~ | +| _keyword-only_ | | +| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). 
~~List[str]~~ | +| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ | +| **RETURNS** | A container for accessing the annotations. ~~Doc~~ | ## Language.pipe {#pipe tag="method"} @@ -186,17 +184,17 @@ more efficient than processing texts one-by-one. > assert doc.is_parsed > ``` -| Name | Type | Description | -| ------------------------------------------ | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `texts` | `Iterable[str]` | A sequence of strings. | -| _keyword-only_ | | | -| `as_tuples` | bool | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. | -| `batch_size` | int | The number of texts to buffer. | -| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | -| `cleanup` | bool | If `True`, unneeded strings are freed to control memory use. Experimental. | -| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. | -| `n_process` 2.2.2 | int | Number of processors to use, only supported in Python 3. Defaults to `1`. | -| **YIELDS** | `Doc` | Documents in the order of the original text. | +| Name | Description | +| ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `texts` | A sequence of strings. ~~Iterable[str]~~ | +| _keyword-only_ | | +| `as_tuples` | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. 
~~bool~~ | +| `batch_size` | The number of texts to buffer. ~~int~~ | +| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ | +| `cleanup` | If `True`, unneeded strings are freed to control memory use. Experimental. ~~bool~~ | +| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ | +| `n_process` 2.2.2 | Number of processors to use. Defaults to `1`. ~~int~~ | +| **YIELDS** | Documents in the order of the original text. ~~Doc~~ | ## Language.begin_training {#begin_training tag="method"} @@ -225,12 +223,12 @@ tuples of `Doc` and `GoldParse` objects. > optimizer = nlp.begin_training(get_examples) > ``` -| Name | Type | Description | -| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- | -| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. | -| _keyword-only_ | | | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/language#create_optimizer) if not set. | -| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Optional[Callable[[], Iterable[Example]]]~~ | +| _keyword-only_ | | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| **RETURNS** | The optimizer. 
~~Optimizer~~ | ## Language.resume_training {#resume_training tag="method,experimental" new="3"} @@ -248,11 +246,11 @@ a batch of [Example](/api/example) objects. > nlp.rehearse(examples, sgd=optimizer) > ``` -| Name | Type | Description | -| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- | -| _keyword-only_ | | | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/language#create_optimizer) if not set. | -| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| **RETURNS** | The optimizer. ~~Optimizer~~ | ## Language.update {#update tag="method"} @@ -282,15 +280,15 @@ and custom registered functions if needed. See the > nlp.update([example], sgd=optimizer) > ``` -| Name | Type | Description | -| --------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------ | -| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | -| _keyword-only_ | | | -| `drop` | float | The dropout rate. | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | -| `losses` | `Dict[str, float]` | Dictionary to update with the loss, keyed by pipeline component. | -| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. | -| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. 
| +| Name | Description | +| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~ | +| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## Language.rehearse {#rehearse tag="method,experimental" new="3"} @@ -305,14 +303,14 @@ the "catastrophic forgetting" problem. This feature is experimental. > losses = nlp.rehearse(examples, sgd=optimizer) > ``` -| Name | Type | Description | -| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | -| _keyword-only_ | | | -| `drop` | float | The dropout rate. | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | -| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | -| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------- | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. 
~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## Language.evaluate {#evaluate tag="method"} @@ -328,20 +326,19 @@ objects instead of tuples of `Doc` and `GoldParse` objects. > #### Example > > ```python -> scores = nlp.evaluate(examples, verbose=True) +> scores = nlp.evaluate(examples) > print(scores) > ``` -| Name | Type | Description | -| --------------- | ------------------------------- | ------------------------------------------------------------------------------------------------------ | -| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | -| _keyword-only_ | | | -| `verbose` | bool | Print debugging information. | -| `batch_size` | int | The batch size to use. | -| `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. | -| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. | -| `scorer_cfg` | `Dict[str, Any]` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. | -| **RETURNS** | `Dict[str, Union[float, dict]]` | A dictionary of evaluation scores. | +| Name | Description | +| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `batch_size` | The batch size to use. ~~int~~ | +| `scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. 
~~Optional[Scorer]~~ | +| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ | +| `scorer_cfg` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. ~~Optional[Dict[str, Any]]~~ | +| **RETURNS** | A dictionary of evaluation scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ | ## Language.use_params {#use_params tag="contextmanager, method"} @@ -356,9 +353,9 @@ their original weights after the block. > nlp.to_disk("/tmp/checkpoint") > ``` -| Name | Type | Description | -| -------- | ---- | --------------------------------------------- | -| `params` | dict | A dictionary of parameters keyed by model ID. | +| Name | Description | +| -------- | ------------------------------------------------------ | +| `params` | A dictionary of parameters keyed by model ID. ~~dict~~ | ## Language.create_pipe {#create_pipe tag="method" new="2"} @@ -380,14 +377,14 @@ To create a component and add it to the pipeline, you should always use > parser = nlp.create_pipe("parser") > ``` -| Name | Type | Description | -| ------------------------------------- | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `factory_name` | str | Name of the registered component factory. | -| `name` | str | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. | -| _keyword-only_ | | | -| `config` 3 | `Dict[str, Any]` | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. | -| `validate` 3 | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. | -| **RETURNS** | callable | The pipeline component. 
| +| Name | Description | +| ------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `factory_name` | Name of the registered component factory. ~~str~~ | +| `name` | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. ~~Optional[str]~~ | +| _keyword-only_ | | +| `config` 3 | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ | +| `validate` 3 | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ | +| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ | ## Language.add_pipe {#add_pipe tag="method" new="2"} @@ -423,19 +420,19 @@ component, adds it to the pipeline and returns it. > nlp.add_pipe("ner", source=source_nlp) > ``` -| Name | Type | Description | -| -------------------------------------- | ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `factory_name` | str | Name of the registered component factory. | -| `name` | str | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. | -| _keyword-only_ | | | -| `before` | str / int | Component name or index to insert component directly before. | -| `after` | str / int | Component name or index to insert component directly after: | -| `first` | bool | Insert component first / not first in the pipeline. 
| -| `last` | bool | Insert component last / not last in the pipeline. | -| `config` 3 | `Dict[str, Any]` | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. | -| `source` 3 | `Language` | Optional source model to copy component from. If a source is provided, the `factory_name` is interpreted as the name of the component in the source pipeline. Make sure that the vocab, vectors and settings of the source model match the target model. | -| `validate` 3 | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. | -| **RETURNS** 3 | callable | The pipeline component. | +| Name | Description | +| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `factory_name` | Name of the registered component factory. ~~str~~ | +| `name` | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. ~~Optional[str]~~ | +| _keyword-only_ | | +| `before` | Component name or index to insert component directly before. ~~Optional[Union[str, int]]~~ | +| `after` | Component name or index to insert component directly after. ~~Optional[Union[str, int]]~~ | +| `first` | Insert component first / not first in the pipeline. ~~Optional[bool]~~ | +| `last` | Insert component last / not last in the pipeline. ~~Optional[bool]~~ | +| `config` 3 | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ | +| `source` 3 | Optional source model to copy component from. 
If a source is provided, the `factory_name` is interpreted as the name of the component in the source pipeline. Make sure that the vocab, vectors and settings of the source model match the target model. ~~Optional[Language]~~ | +| `validate` 3 | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ | +| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ | ## Language.has_factory {#has_factory tag="classmethod" new="3"} @@ -459,10 +456,10 @@ the `Language` base class, available to all subclasses. > assert not Language.has_factory("component") > ``` -| Name | Type | Description | -| ----------- | ---- | ---------------------------------------------------------- | -| `name` | str | Name of the pipeline factory to check. | -| **RETURNS** | bool | Whether a factory of that name is registered on the class. | +| Name | Description | +| ----------- | ------------------------------------------------------------------- | +| `name` | Name of the pipeline factory to check. ~~str~~ | +| **RETURNS** | Whether a factory of that name is registered on the class. ~~bool~~ | ## Language.has_pipe {#has_pipe tag="method" new="2"} @@ -481,10 +478,10 @@ Check whether a component is present in the pipeline. Equivalent to > assert nlp.has_pipe("my_component") > ``` -| Name | Type | Description | -| ----------- | ---- | -------------------------------------------------------- | -| `name` | str | Name of the pipeline component to check. | -| **RETURNS** | bool | Whether a component of that name exists in the pipeline. | +| Name | Description | +| ----------- | ----------------------------------------------------------------- | +| `name` | Name of the pipeline component to check. ~~str~~ | +| **RETURNS** | Whether a component of that name exists in the pipeline. ~~bool~~ | ## Language.get_pipe {#get_pipe tag="method" new="2"} @@ -497,28 +494,37 @@ Get a pipeline component for a given component name. 
> custom_component = nlp.get_pipe("custom_component") > ``` -| Name | Type | Description | -| ----------- | -------- | -------------------------------------- | -| `name` | str | Name of the pipeline component to get. | -| **RETURNS** | callable | The pipeline component. | +| Name | Description | +| ----------- | ------------------------------------------------ | +| `name` | Name of the pipeline component to get. ~~str~~ | +| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ | ## Language.replace_pipe {#replace_pipe tag="method" new="2"} Replace a component in the pipeline. + + +As of v3.0, the `Language.replace_pipe` method doesn't take callables anymore +and instead expects the **name of a component factory** registered using +[`@Language.component`](/api/language#component) or +[`@Language.factory`](/api/language#factory). + + + > #### Example > > ```python > nlp.replace_pipe("parser", my_custom_parser) > ``` -| Name | Type | Description | -| ------------------------------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `name` | str | Name of the component to replace. | -| `component` | callable | The pipeline component to insert. | -| _keyword-only_ | | | -| `config` 3 | `Dict[str, Any]` | Optional config parameters to use for the new component. Will be merged with the `default_config` specified by the component factory. | -| `validate` 3 | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. | +| Name | Description | +| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `name` | Name of the component to replace. ~~str~~ | +| `component` | The factory name of the component to insert. 
~~str~~ | +| _keyword-only_ | | +| `config` 3 | Optional config parameters to use for the new component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ | +| `validate` 3 | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ | ## Language.rename_pipe {#rename_pipe tag="method" new="2"} @@ -533,10 +539,10 @@ added to the pipeline, you can also use the `name` argument on > nlp.rename_pipe("parser", "spacy_parser") > ``` -| Name | Type | Description | -| ---------- | ---- | -------------------------------- | -| `old_name` | str | Name of the component to rename. | -| `new_name` | str | New name of the component. | +| Name | Description | +| ---------- | ---------------------------------------- | +| `old_name` | Name of the component to rename. ~~str~~ | +| `new_name` | New name of the component. ~~str~~ | ## Language.remove_pipe {#remove_pipe tag="method" new="2"} @@ -550,10 +556,10 @@ component function. > assert name == "parser" > ``` -| Name | Type | Description | -| ----------- | ----- | ----------------------------------------------------- | -| `name` | str | Name of the component to remove. | -| **RETURNS** | tuple | A `(name, component)` tuple of the removed component. | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------ | +| `name` | Name of the component to remove. ~~str~~ | +| **RETURNS** | A `(name, component)` tuple of the removed component. 
~~Tuple[str, Callable[[Doc], Doc]]~~ | ## Language.select_pipes {#select_pipes tag="contextmanager, method" new="3"} @@ -589,12 +595,12 @@ As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`: -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------------------ | -| _keyword-only_ | | | -| `disable` | str / list | Name(s) of pipeline components to disable. | -| `enable` | str / list | Names(s) of pipeline components that will not be disabled. | -| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------ | +| _keyword-only_ | | +| `disable` | Name(s) of pipeline components to disable. ~~Optional[Union[str, Iterable[str]]]~~ | +| `enable` | Name(s) of pipeline components that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~ | +| **RETURNS** | The disabled pipes that can be restored by calling the object's `.restore()` method. ~~DisabledPipes~~ | ## Language.get_factory_meta {#get_factory_meta tag="classmethod" new="3"} @@ -613,10 +619,10 @@ information about the component and its default provided by the > print(factory_meta.default_config) > ``` -| Name | Type | Description | -| ----------- | ----------------------------- | ------------------ | -| `name` | str | The factory name. | -| **RETURNS** | [`FactoryMeta`](#factorymeta) | Â The factory meta. | +| Name | Description | +| ----------- | --------------------------------- | +| `name` | The factory name. ~~str~~ | +| **RETURNS** | The factory meta. 
~~FactoryMeta~~ | ## Language.get_pipe_meta {#get_pipe_meta tag="method" new="3"} @@ -636,10 +642,10 @@ contains the information about the component and its default provided by the > print(factory_meta.default_config) > ``` -| Name | Type | Description | -| ----------- | ----------------------------- | ---------------------------- | -| `name` | str | The pipeline component name. | -| **RETURNS** | [`FactoryMeta`](#factorymeta) | Β The factory meta. | +| Name | Description | +| ----------- | ------------------------------------ | +| `name` | The pipeline component name. ~~str~~ | +| **RETURNS** | The factory meta. ~~FactoryMeta~~ | ## Language.analyze_pipes {#analyze_pipes tag="method" new="3"} @@ -725,18 +731,18 @@ token.ent_iob, token.ent_type -| Name | Type | Description | -| -------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| _keyword-only_ | | | -| `keys` | `List[str]` | The values to display in the table. Corresponds to attributes of the [`FactoryMeta`](/api/language#factorymeta). Defaults to `["assigns", "requires", "scores", "retokenizes"]`. | -| `pretty` | bool | Pretty-print the results as a table. Defaults to `False`. | -| **RETURNS** | dict | Dictionary containing the pipe analysis, keyed by `"summary"` (component meta by pipe), `"problems"` (attribute names by pipe) and `"attrs"` (pipes that assign and require an attribute, keyed by attribute). | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `keys` | The values to display in the table. 
Corresponds to attributes of the [`FactoryMeta`](/api/language#factorymeta). Defaults to `["assigns", "requires", "scores", "retokenizes"]`. ~~List[str]~~ | +| `pretty` | Pretty-print the results as a table. Defaults to `False`. ~~bool~~ | +| **RETURNS** | Dictionary containing the pipe analysis, keyed by `"summary"` (component meta by pipe), `"problems"` (attribute names by pipe) and `"attrs"` (pipes that assign and require an attribute, keyed by attribute). ~~Optional[Dict[str, Any]]~~ | ## Language.meta {#meta tag="property"} Custom meta data for the Language class. If a model is loaded, contains meta data of the model. The `Language.meta` is also what's serialized as the -`meta.json` when you save an `nlp` object to disk. +[`meta.json`](/api/data-formats#meta) when you save an `nlp` object to disk. > #### Example > @@ -744,9 +750,9 @@ data of the model. The `Language.meta` is also what's serialized as the > print(nlp.meta) > ``` -| Name | Type | Description | -| ----------- | ---- | -------------- | -| **RETURNS** | dict | The meta data. | +| Name | Description | +| ----------- | --------------------------------- | +| **RETURNS** | The meta data. ~~Dict[str, Any]~~ | ## Language.config {#config tag="property" new="3"} @@ -765,9 +771,9 @@ subclass of the built-in `dict`. It supports the additional methods `to_disk` > print(nlp.config.to_str()) > ``` -| Name | Type | Description | -| ----------- | --------------------------------------------------- | ----------- | -| **RETURNS** | [`Config`](https://thinc.ai/docs/api-config#config) | The config. | +| Name | Description | +| ----------- | ---------------------- | +| **RETURNS** | The config. ~~Config~~ | ## Language.to_disk {#to_disk tag="method" new="2"} @@ -780,11 +786,11 @@ the model**. 
> nlp.to_disk("/path/to/models") > ``` -| Name | Type | Description | -| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | ## Language.from_disk {#from_disk tag="method" new="2"} @@ -806,12 +812,12 @@ loaded object. > nlp = English().from_disk("/path/to/en_model") > ``` -| Name | Type | Description | -| -------------- | --------------- | ----------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Language` | The modified `Language` object. | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. 
~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The modified `Language` object. ~~Language~~ | ## Language.to_bytes {#to_bytes tag="method"} @@ -823,11 +829,11 @@ Serialize the current state to a binary string. > nlp_bytes = nlp.to_bytes() > ``` -| Name | Type | Description | -| -------------- | --------------- | ----------------------------------------------------------------------------------------- | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Language` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------ | +| _keyword-only_ | | +| `exclude` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The serialized form of the `Language` object. ~~bytes~~ | ## Language.from_bytes {#from_bytes tag="method"} @@ -845,35 +851,35 @@ available to the loaded object. > nlp2.from_bytes(nlp_bytes) > ``` -| Name | Type | Description | -| -------------- | --------------- | ----------------------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Language` | The `Language` object. | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------------------- | +| `bytes_data` | The data to load from. 
~~bytes~~ | +| _keyword-only_ | | +| `exclude` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The `Language` object. ~~Language~~ | ## Attributes {#attributes} -| Name | Type | Description | -| --------------------------------------------- | ---------------------- | ---------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | A container for the lexical types. | -| `tokenizer` | `Tokenizer` | The tokenizer. | -| `make_doc` | `Callable` | Callable that takes a string and returns a `Doc`. | -| `pipeline` | `List[str, Callable]` | List of `(name, component)` tuples describing the current processing pipeline, in order. | -| `pipe_names` 2 | `List[str]` | List of pipeline component names, in order. | -| `pipe_labels` 2.2 | `Dict[str, List[str]]` | List of labels set by the pipeline components, if available, keyed by component name. | -| `pipe_factories` 2.2 | `Dict[str, str]` | Dictionary of pipeline component names, mapped to their factory names. | -| `factories` | `Dict[str, Callable]` | All available factory functions, keyed by name. | -| `factory_names` 3 | `List[str]` | List of all available factory names. | -| `path` 2 | `Path` | Path to the model data directory, if a model is loaded. Otherwise `None`. | +| Name | Description | +| --------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | A container for the lexical types. ~~Vocab~~ | +| `tokenizer` | The tokenizer. ~~Tokenizer~~ | +| `make_doc` | Callable that takes a string and returns a `Doc`. ~~Callable[[str], Doc]~~ | +| `pipeline` | List of `(name, component)` tuples describing the current processing pipeline, in order. ~~List[Tuple[str, Callable[[Doc], Doc]]]~~ | +| `pipe_names` 2 | List of pipeline component names, in order. 
~~List[str]~~ | +| `pipe_labels` 2.2 | List of labels set by the pipeline components, if available, keyed by component name. ~~Dict[str, List[str]]~~ | +| `pipe_factories` 2.2 | Dictionary of pipeline component names, mapped to their factory names. ~~Dict[str, str]~~ | +| `factories` | All available factory functions, keyed by name. ~~Dict[str, Callable[[...], Callable[[Doc], Doc]]]~~ | +| `factory_names` 3 | List of all available factory names. ~~List[str]~~ | +| `path` 2 | Path to the model data directory, if a model is loaded. Otherwise `None`. ~~Optional[Path]~~ | ## Class attributes {#class-attributes} -| Name | Type | Description | -| ---------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. | -| `lang` | str | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). | -| `default_config` | dict | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](https://github.com/explosion/spaCy/tree/develop/spacy/default_config.cfg). | +| Name | Description | +| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `Defaults` | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ | +| `lang` | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). ~~str~~ | +| `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). 
Defaults to [`default_config.cfg`](https://github.com/explosion/spaCy/tree/develop/spacy/default_config.cfg). ~~Config~~ | ## Defaults {#defaults} @@ -906,17 +912,17 @@ customize the default language data: > config = Config().from_str(DEFAULT_CONFIG) > ``` -| Name | Description | -| --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `stop_words` | List of stop words, used for `Token.is_stop`.
**Example:** [`stop_words.py`][stop_words.py] | -| `tokenizer_exceptions` | Tokenizer exception rules, string mapped to list of token attributes.
**Example:** [`de/tokenizer_exceptions.py`][de/tokenizer_exceptions.py] | -| `prefixes`, `suffixes`, `infixes` | Prefix, suffix and infix rules for the default tokenizer.
**Example:** [`puncutation.py`][punctuation.py] | -| `token_match` | Optional regex for matching strings that should never be split, overriding the infix rules.
**Example:** [`fr/tokenizer_exceptions.py`][fr/tokenizer_exceptions.py] | -| `url_match` | Regular expression for matching URLs. Prefixes and suffixes are removed before applying the match.
**Example:** [`tokenizer_exceptions.py`][tokenizer_exceptions.py] | -| `lex_attr_getters` | Custom functions for setting lexical attributes on tokens, e.g. `like_num`.
**Example:** [`lex_attrs.py`][lex_attrs.py] | -| `syntax_iterators` | Functions that compute views of a `Doc` object based on its syntax. At the moment, only used for [noun chunks](/usage/linguistic-features#noun-chunks).
**Example:** [`syntax_iterators.py`][syntax_iterators.py]. | -| `writing_system` | Information about the language's writing system, available via `Vocab.writing_system`. Defaults to: `{"direction": "ltr", "has_case": True, "has_letters": True}.`.
**Example:** [`zh/__init__.py`][zh/__init__.py] | -| `config` | Default [config](/usage/training#config) added to `nlp.config`. This can include references to custom tokenizers or lemmatizers.
**Example:** [`zh/__init__.py`][zh/__init__.py] | +| Name | Description | +| --------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `stop_words` | List of stop words, used for `Token.is_stop`.
**Example:** [`stop_words.py`][stop_words.py] ~~Set[str]~~ | +| `tokenizer_exceptions` | Tokenizer exception rules, string mapped to list of token attributes.
**Example:** [`de/tokenizer_exceptions.py`][de/tokenizer_exceptions.py] ~~Dict[str, List[dict]]~~ | +| `prefixes`, `suffixes`, `infixes` | Prefix, suffix and infix rules for the default tokenizer.
**Example:** [`punctuation.py`][punctuation.py] ~~Optional[List[Union[str, Pattern]]]~~ | +| `token_match` | Optional regex for matching strings that should never be split, overriding the infix rules.
**Example:** [`fr/tokenizer_exceptions.py`][fr/tokenizer_exceptions.py] ~~Optional[Pattern]~~ | +| `url_match` | Regular expression for matching URLs. Prefixes and suffixes are removed before applying the match.
**Example:** [`tokenizer_exceptions.py`][tokenizer_exceptions.py] ~~Optional[Pattern]~~ | +| `lex_attr_getters` | Custom functions for setting lexical attributes on tokens, e.g. `like_num`.
**Example:** [`lex_attrs.py`][lex_attrs.py] ~~Dict[int, Callable[[str], Any]]~~ | +| `syntax_iterators` | Functions that compute views of a `Doc` object based on its syntax. At the moment, only used for [noun chunks](/usage/linguistic-features#noun-chunks).
**Example:** [`syntax_iterators.py`][syntax_iterators.py]. ~~Dict[str, Callable[[Union[Doc, Span]], Iterator[Span]]]~~ | +| `writing_system` | Information about the language's writing system, available via `Vocab.writing_system`. Defaults to: `{"direction": "ltr", "has_case": True, "has_letters": True}`.
**Example:** [`zh/__init__.py`][zh/__init__.py] ~~Dict[str, Any]~~ | +| `config` | Default [config](/usage/training#config) added to `nlp.config`. This can include references to custom tokenizers or lemmatizers.
**Example:** [`zh/__init__.py`][zh/__init__.py] ~~Config~~ | [stop_words.py]: https://github.com/explosion/spaCy/tree/master/spacy/lang/en/stop_words.py @@ -948,12 +954,12 @@ serialization by passing in the string names via the `exclude` argument. > nlp.from_disk("./model-data", exclude=["ner"]) > ``` -| Name | Description | -| ----------- | -------------------------------------------------- | -| `vocab` | The shared [`Vocab`](/api/vocab). | -| `tokenizer` | Tokenization rules and exceptions. | -| `meta` | The meta data, available as `Language.meta`. | -| ... | String names of pipeline components, e.g. `"ner"`. | +| Name | Description | +| ----------- | ------------------------------------------------------------------ | +| `vocab` | The shared [`Vocab`](/api/vocab). | +| `tokenizer` | Tokenization rules and exceptions. | +| `meta` | The meta data, available as [`Language.meta`](/api/language#meta). | +| ... | String names of pipeline components, e.g. `"ner"`. | ## FactoryMeta {#factorymeta new="3" tag="dataclass"} @@ -963,12 +969,12 @@ provided by the [`@Language.component`](/api/language#component) or component is defined and stored on the `Language` class for each component instance and factory instance. -| Name | Type | Description | -| ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `factory` | str | The name of the registered component factory. | -| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. | -| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | -| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. 
`["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis).Β  | -| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis).Β  | -| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | -| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. | +| Name | Description | +| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `factory` | The name of the registered component factory. ~~str~~ | +| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ | +| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ | +| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~Β  | +| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~Β  | +| `scores` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). 
~~Iterable[str]~~ | +| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ | diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index f1242d193..8417fd5e8 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -36,11 +36,9 @@ tags is available in the pipeline and runs _before_ the lemmatizer. The default config is defined by the pipeline component factory and describes how the component should be configured. You can override its settings via the `config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your -[`config.cfg` for training](/usage/training#config). - -For examples of the lookups data formats used by the lookup and rule-based -lemmatizers, see the -[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) repo. +[`config.cfg` for training](/usage/training#config). For examples of the lookups +data formats used by the lookup and rule-based lemmatizers, see +[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). > #### Example > @@ -49,12 +47,12 @@ lemmatizers, see the > nlp.add_pipe("lemmatizer", config=config) > ``` -| Setting | Type | Description | Default | -| ----------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -| `mode` | str | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. | `"lookup"` | -| `lookups` | [`Lookups`](/api/lookups) | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from `spacy-lookups-data`. 
| `None` | -| `overwrite` | bool | Whether to overwrite existing lemmas. | `False` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Not yet implemented:** the model to use. | `None` | +| Setting | Description | +| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ | +| `lookups` | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ | +| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ | +| `model` | **Not yet implemented:** the model to use. ~~Model~~ | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/lemmatizer.py @@ -77,15 +75,15 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). -| Name | Type | Description | -| -------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | [`Vocab`](/api/vocab) | The vocab. | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model (not yet implemented). | -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | -| _keyword-only_ | | | -| mode | str | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. 
| -| lookups | [`Lookups`](/api/lookups) | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. | -| overwrite | bool | Whether to overwrite existing lemmas. | +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | **Not yet implemented:** The model to use. ~~Model~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ | +| `lookups` | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. ~~Optional[Lookups]~~ | +| `overwrite` | Whether to overwrite existing lemmas. ~~bool~~ | ## Lemmatizer.\_\_call\_\_ {#call tag="method"} @@ -102,10 +100,10 @@ and all pipeline components are applied to the `Doc` in order. > processed = lemmatizer(doc) > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------ | -| `doc` | `Doc` | The document to process. | -| **RETURNS** | `Doc` | The processed document. | +| Name | Description | +| ----------- | -------------------------------- | +| `doc` | The document to process. ~~Doc~~ | +| **RETURNS** | The processed document. ~~Doc~~ | ## Lemmatizer.pipe {#pipe tag="method"} @@ -121,12 +119,12 @@ applied to the `Doc` in order. > pass > ``` -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------ | -| `stream` | `Iterable[Doc]` | A stream of documents. | -| _keyword-only_ | | | -| `batch_size` | int | The number of texts to buffer. Defaults to `128`.
| -| **YIELDS** | `Doc` | Processed documents in the order of the original text. | +| Name | Description | +| -------------- | ------------------------------------------------------------- | +| `stream` | A stream of documents. ~~Iterable[Doc]~~ | +| _keyword-only_ | | +| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | +| **YIELDS** | The processed documents in order. ~~Doc~~ | ## Lemmatizer.lookup_lemmatize {#lookup_lemmatize tag="method"} @@ -134,39 +132,39 @@ Lemmatize a token using a lookup-based approach. If no lemma is found, the original string is returned. Languages can provide a [lookup table](/usage/adding-languages#lemmatizer) via the `Lookups`. -| Name | Type | Description | -| ----------- | --------------------- | ------------------------------------- | -| `token` | [`Token`](/api/token) | The token to lemmatize. | -| **RETURNS** | `List[str]` | A list containing one or more lemmas. | +| Name | Description | +| ----------- | --------------------------------------------------- | +| `token` | The token to lemmatize. ~~Token~~ | +| **RETURNS** | A list containing one or more lemmas. ~~List[str]~~ | ## Lemmatizer.rule_lemmatize {#rule_lemmatize tag="method"} Lemmatize a token using a rule-based approach. Typically relies on POS tags. -| Name | Type | Description | -| ----------- | --------------------- | ------------------------------------- | -| `token` | [`Token`](/api/token) | The token to lemmatize. | -| **RETURNS** | `List[str]` | A list containing one or more lemmas. | +| Name | Description | +| ----------- | --------------------------------------------------- | +| `token` | The token to lemmatize. ~~Token~~ | +| **RETURNS** | A list containing one or more lemmas. ~~List[str]~~ | ## Lemmatizer.is_base_form {#is_base_form tag="method"} Check whether we're dealing with an uninflected paradigm, so we can avoid lemmatization entirely. 
-| Name | Type | Description | -| ----------- | --------------------- | ------------------------------------------------------------------------------------------------------- | -| `token` | [`Token`](/api/token) | The token to analyze. | -| **RETURNS** | bool | Whether the token's attributes (e.g., part-of-speech tag, morphological features) describe a base form. | +| Name | Description | +| ----------- | ---------------------------------------------------------------------------------------------------------------- | +| `token` | The token to analyze. ~~Token~~ | +| **RETURNS** | Whether the token's attributes (e.g., part-of-speech tag, morphological features) describe a base form. ~~bool~~ | ## Lemmatizer.get_lookups_config {#get_lookups_config tag="classmethod"} Returns the lookups configuration settings for a given mode for use in -[`Lemmatizer.load_lookups`](#load_lookups). +[`Lemmatizer.load_lookups`](/api/lemmatizer#load_lookups). -| Name | Type | Description | -| ----------- | ---- | ------------------------------------------------- | -| `mode` | str | The lemmatizer mode. | -| **RETURNS** | dict | The lookups configuration settings for this mode. | +| Name | Description | +| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `mode` | The lemmatizer mode. ~~str~~ | +| **RETURNS** | The lookups configuration settings for this mode. Includes the keys `"required_tables"` and `"optional_tables"`, mapped to a list of table string names. ~~Dict[str, List[str]]~~ | ## Lemmatizer.load_lookups {#load_lookups tag="classmethod"} @@ -174,12 +172,12 @@ Load and validate lookups tables. If the provided lookups is `None`, load the default lookups tables according to the language and mode settings. Confirm that all required tables for the language and mode are present. 
-| Name | Type | Description | -| ----------- | ------------------------- | ---------------------------------------------------------------------------- | -| `lang` | str | The language. | -| `mode` | str | The lemmatizer mode. | -| `lookups` | [`Lookups`](/api/lookups) | The provided lookups, may be `None` if the default lookups should be loaded. | -| **RETURNS** | [`Lookups`](/api/lookups) | The lookups object. | +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------------------- | +| `lang` | The language. ~~str~~ | +| `mode` | The lemmatizer mode. ~~str~~ | +| `lookups` | The provided lookups, may be `None` if the default lookups should be loaded. ~~Optional[Lookups]~~ | +| **RETURNS** | The lookups. ~~Lookups~~ | ## Lemmatizer.to_disk {#to_disk tag="method"} @@ -192,11 +190,11 @@ Serialize the pipe to disk. > lemmatizer.to_disk("/path/to/lemmatizer") > ``` -| Name | Type | Description | -| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | ## Lemmatizer.from_disk {#from_disk tag="method"} @@ -209,12 +207,12 @@ Load the pipe from disk. 
Modifies the object in place and returns it. > lemmatizer.from_disk("/path/to/lemmatizer") > ``` -| Name | Type | Description | -| -------------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Lemmatizer` | The modified `Lemmatizer` object. | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The modified `Lemmatizer` object. ~~Lemmatizer~~ | ## Lemmatizer.to_bytes {#to_bytes tag="method"} @@ -227,11 +225,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------- | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Lemmatizer` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The serialized form of the `Lemmatizer` object. ~~bytes~~ | ## Lemmatizer.from_bytes {#from_bytes tag="method"} @@ -245,27 +243,20 @@ Load the pipe from a bytestring. 
Modifies the object in place and returns it. > lemmatizer.from_bytes(lemmatizer_bytes) > ``` -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Lemmatizer` | The `Lemmatizer` object. | - -## Lemmatizer.mode {#mode tag="property"} - -The lemmatizer mode. - -| Name | Type | Description | -| ----------- | ----- | -------------------- | -| **RETURNS** | `str` | The lemmatizer mode. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| `bytes_data` | The data to load from. ~~bytes~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The `Lemmatizer` object. ~~Lemmatizer~~ | ## Attributes {#attributes} -| Name | Type | Description | -| --------- | --------------------------------- | ------------------- | -| `vocab` | The shared [`Vocab`](/api/vocab). | -| `lookups` | [`Lookups`](/api/lookups) | The lookups object. | +| Name | Description | +| --------- | ------------------------------------------- | +| `vocab` | The shared [`Vocab`](/api/vocab). ~~Vocab~~ | +| `lookups` | The lookups object. ~~Lookups~~ | +| `mode` | The lemmatizer mode. ~~str~~ | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md index 625a26412..a7e1d1ca0 100644 --- a/website/docs/api/lexeme.md +++ b/website/docs/api/lexeme.md @@ -13,10 +13,10 @@ lemmatization depends on the part-of-speech tag). Create a `Lexeme` object. -| Name | Type | Description | -| ------- | ------- | -------------------------- | -| `vocab` | `Vocab` | The parent vocabulary. 
| -| `orth` | int | The orth id of the lexeme. | +| Name | Description | +| ------- | ---------------------------------- | +| `vocab` | The parent vocabulary. ~~Vocab~~ | +| `orth` | The orth id of the lexeme. ~~int~~ | ## Lexeme.set_flag {#set_flag tag="method"} @@ -29,10 +29,10 @@ Change the value of a boolean flag. > nlp.vocab["spaCy"].set_flag(COOL_FLAG, True) > ``` -| Name | Type | Description | -| --------- | ---- | ------------------------------------ | -| `flag_id` | int | The attribute ID of the flag to set. | -| `value` | bool | The new value of the flag. | +| Name | Description | +| --------- | -------------------------------------------- | +| `flag_id` | The attribute ID of the flag to set. ~~int~~ | +| `value` | The new value of the flag. ~~bool~~ | ## Lexeme.check_flag {#check_flag tag="method"} @@ -46,10 +46,10 @@ Check the value of a boolean flag. > assert nlp.vocab["spaCy"].check_flag(MY_LIBRARY) == True > ``` -| Name | Type | Description | -| ----------- | ---- | -------------------------------------- | -| `flag_id` | int | The attribute ID of the flag to query. | -| **RETURNS** | bool | The value of the flag. | +| Name | Description | +| ----------- | ---------------------------------------------- | +| `flag_id` | The attribute ID of the flag to query. ~~int~~ | +| **RETURNS** | The value of the flag. ~~bool~~ | ## Lexeme.similarity {#similarity tag="method" model="vectors"} @@ -65,10 +65,10 @@ Compute a semantic similarity estimate. Defaults to cosine over vectors. > assert apple_orange == orange_apple > ``` -| Name | Type | Description | -| ----------- | ----- | -------------------------------------------------------------------------------------------- | -| other | - | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. | -| **RETURNS** | float | A scalar similarity score. Higher is more similar. 
| +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------------------------------------------------- | +| other | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. ~~Union[Doc, Span, Token, Lexeme]~~ | +| **RETURNS** | A scalar similarity score. Higher is more similar. ~~float~~ | ## Lexeme.has_vector {#has_vector tag="property" model="vectors"} @@ -81,9 +81,9 @@ A boolean value indicating whether a word vector is associated with the lexeme. > assert apple.has_vector > ``` -| Name | Type | Description | -| ----------- | ---- | ---------------------------------------------- | -| **RETURNS** | bool | Whether the lexeme has a vector data attached. | +| Name | Description | +| ----------- | ------------------------------------------------------- | +| **RETURNS** | Whether the lexeme has a vector data attached. ~~bool~~ | ## Lexeme.vector {#vector tag="property" model="vectors"} @@ -97,9 +97,9 @@ A real-valued meaning representation. > assert apple.vector.shape == (300,) > ``` -| Name | Type | Description | -| ----------- | ---------------------------------------- | ----------------------------------------------------- | -| **RETURNS** | `numpy.ndarray[ndim=1, dtype='float32']` | A 1D numpy array representing the lexeme's semantics. | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------ | +| **RETURNS** | A 1-dimensional array representing the lexeme's vector. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | ## Lexeme.vector_norm {#vector_norm tag="property" model="vectors"} @@ -115,50 +115,50 @@ The L2 norm of the lexeme's vector representation. > assert apple.vector_norm != pasta.vector_norm > ``` -| Name | Type | Description | -| ----------- | ----- | ----------------------------------------- | -| **RETURNS** | float | The L2 norm of the vector representation. 
| +| Name | Description | +| ----------- | --------------------------------------------------- | +| **RETURNS** | The L2 norm of the vector representation. ~~float~~ | ## Attributes {#attributes} -| Name | Type | Description | -| -------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `vocab` | `Vocab` | The lexeme's vocabulary. | -| `text` | str | Verbatim text content. | -| `orth` | int | ID of the verbatim text content. | -| `orth_` | str | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. | -| `rank` | int | Sequential ID of the lexemes's lexical type, used to index into tables, e.g. for word vectors. | -| `flags` | int | Container of the lexeme's binary flags. | -| `norm` | int | The lexemes's norm, i.e. a normalized form of the lexeme text. | -| `norm_` | str | The lexemes's norm, i.e. a normalized form of the lexeme text. | -| `lower` | int | Lowercase form of the word. | -| `lower_` | str | Lowercase form of the word. | -| `shape` | int | Transform of the words's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | -| `shape_` | str | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | -| `prefix` | int | Length-N substring from the start of the word. Defaults to `N=1`. | -| `prefix_` | str | Length-N substring from the start of the word. Defaults to `N=1`. 
| -| `suffix` | int | Length-N substring from the end of the word. Defaults to `N=3`. | -| `suffix_` | str | Length-N substring from the start of the word. Defaults to `N=3`. | -| `is_alpha` | bool | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. | -| `is_ascii` | bool | Does the lexeme consist of ASCII characters? Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. | -| `is_digit` | bool | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. | -| `is_lower` | bool | Is the lexeme in lowercase? Equivalent to `lexeme.text.islower()`. | -| `is_upper` | bool | Is the lexeme in uppercase? Equivalent to `lexeme.text.isupper()`. | -| `is_title` | bool | Is the lexeme in titlecase? Equivalent to `lexeme.text.istitle()`. | -| `is_punct` | bool | Is the lexeme punctuation? | -| `is_left_punct` | bool | Is the lexeme a left punctuation mark, e.g. `(`? | -| `is_right_punct` | bool | Is the lexeme a right punctuation mark, e.g. `)`? | -| `is_space` | bool | Does the lexeme consist of whitespace characters? Equivalent to `lexeme.text.isspace()`. | -| `is_bracket` | bool | Is the lexeme a bracket? | -| `is_quote` | bool | Is the lexeme a quotation mark? | -| `is_currency` 2.0.8 | bool | Is the lexeme a currency symbol? | -| `like_url` | bool | Does the lexeme resemble a URL? | -| `like_num` | bool | Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc. | -| `like_email` | bool | Does the lexeme resemble an email address? | -| `is_oov` | bool | Does the lexeme have a word vector? | -| `is_stop` | bool | Is the lexeme part of a "stop list"? | -| `lang` | int | Language of the parent vocabulary. | -| `lang_` | str | Language of the parent vocabulary. | -| `prob` | float | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). | -| `cluster` | int | Brown cluster ID. 
| -| `sentiment` | float | A scalar value indicating the positivity or negativity of the lexeme. | +| Name | Description | +| -------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The lexeme's vocabulary. ~~Vocab~~ | +| `text` | Verbatim text content. ~~str~~ | +| `orth` | ID of the verbatim text content. ~~int~~ | +| `orth_` | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. ~~str~~ | +| `rank` | Sequential ID of the lexeme's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | +| `flags` | Container of the lexeme's binary flags. ~~int~~ | +| `norm` | The lexeme's norm, i.e. a normalized form of the lexeme text. ~~int~~ | +| `norm_` | The lexeme's norm, i.e. a normalized form of the lexeme text. ~~str~~ | +| `lower` | Lowercase form of the word. ~~int~~ | +| `lower_` | Lowercase form of the word. ~~str~~ | +| `shape` | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~int~~ | +| `shape_` | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~str~~ | +| `prefix` | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~ | +| `prefix_` | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~ | +| `suffix` | Length-N substring from the end of the word. Defaults to `N=3`.
~~int~~ | +| `suffix_` | Length-N substring from the end of the word. Defaults to `N=3`. ~~str~~ | +| `is_alpha` | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. ~~bool~~ | +| `is_ascii` | Does the lexeme consist of ASCII characters? Equivalent to `not any(ord(c) >= 128 for c in lexeme.text)`. ~~bool~~ | +| `is_digit` | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. ~~bool~~ | +| `is_lower` | Is the lexeme in lowercase? Equivalent to `lexeme.text.islower()`. ~~bool~~ | +| `is_upper` | Is the lexeme in uppercase? Equivalent to `lexeme.text.isupper()`. ~~bool~~ | +| `is_title` | Is the lexeme in titlecase? Equivalent to `lexeme.text.istitle()`. ~~bool~~ | +| `is_punct` | Is the lexeme punctuation? ~~bool~~ | +| `is_left_punct` | Is the lexeme a left punctuation mark, e.g. `(`? ~~bool~~ | +| `is_right_punct` | Is the lexeme a right punctuation mark, e.g. `)`? ~~bool~~ | +| `is_space` | Does the lexeme consist of whitespace characters? Equivalent to `lexeme.text.isspace()`. ~~bool~~ | +| `is_bracket` | Is the lexeme a bracket? ~~bool~~ | +| `is_quote` | Is the lexeme a quotation mark? ~~bool~~ | +| `is_currency` 2.0.8 | Is the lexeme a currency symbol? ~~bool~~ | +| `like_url` | Does the lexeme resemble a URL? ~~bool~~ | +| `like_num` | Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~ | +| `like_email` | Does the lexeme resemble an email address? ~~bool~~ | +| `is_oov` | Is the lexeme out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~ | +| `is_stop` | Is the lexeme part of a "stop list"? ~~bool~~ | +| `lang` | Language of the parent vocabulary. ~~int~~ | +| `lang_` | Language of the parent vocabulary. ~~str~~ | +| `prob` | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). ~~float~~ | +| `cluster` | Brown cluster ID. ~~int~~ | +| `sentiment` | A scalar value indicating the positivity or negativity of the lexeme.
~~float~~ | diff --git a/website/docs/api/lookups.md b/website/docs/api/lookups.md index 099b5306e..9565e478f 100644 --- a/website/docs/api/lookups.md +++ b/website/docs/api/lookups.md @@ -24,10 +24,6 @@ Create a `Lookups` object. > lookups = Lookups() > ``` -| Name | Type | Description | -| ----------- | --------- | ----------------------------- | -| **RETURNS** | `Lookups` | The newly constructed object. | - ## Lookups.\_\_len\_\_ {#len tag="method"} Get the current number of tables in the lookups. @@ -39,9 +35,9 @@ Get the current number of tables in the lookups. > assert len(lookups) == 0 > ``` -| Name | Type | Description | -| ----------- | ---- | ------------------------------------ | -| **RETURNS** | int | The number of tables in the lookups. | +| Name | Description | +| ----------- | -------------------------------------------- | +| **RETURNS** | The number of tables in the lookups. ~~int~~ | ## Lookups.\_\contains\_\_ {#contains tag="method"} @@ -56,10 +52,10 @@ Check if the lookups contain a table of a given name. Delegates to > assert "some_table" in lookups > ``` -| Name | Type | Description | -| ----------- | ---- | ----------------------------------------------- | -| `name` | str | Name of the table. | -| **RETURNS** | bool | Whether a table of that name is in the lookups. | +| Name | Description | +| ----------- | -------------------------------------------------------- | +| `name` | Name of the table. ~~str~~ | +| **RETURNS** | Whether a table of that name is in the lookups. ~~bool~~ | ## Lookups.tables {#tables tag="property"} @@ -73,9 +69,9 @@ Get the names of all tables in the lookups. > assert lookups.tables == ["some_table"] > ``` -| Name | Type | Description | -| ----------- | ---- | ----------------------------------- | -| **RETURNS** | list | Names of the tables in the lookups. | +| Name | Description | +| ----------- | ------------------------------------------------- | +| **RETURNS** | Names of the tables in the lookups. 
~~List[str]~~ | ## Lookups.add_table {#add_table tag="method"} @@ -89,11 +85,11 @@ exists. > lookups.add_table("some_table", {"foo": "bar"}) > ``` -| Name | Type | Description | -| ----------- | ----------------------------- | ---------------------------------- | -| `name` | str | Unique name of the table. | -| `data` | dict | Optional data to add to the table. | -| **RETURNS** | [`Table`](/api/lookups#table) | The newly added table. | +| Name | Description | +| ----------- | ------------------------------------------- | +| `name` | Unique name of the table. ~~str~~ | +| `data` | Optional data to add to the table. ~~dict~~ | +| **RETURNS** | The newly added table. ~~Table~~ | ## Lookups.get_table {#get_table tag="method"} @@ -108,10 +104,10 @@ Get a table from the lookups. Raises an error if the table doesn't exist. > assert table["foo"] == "bar" > ``` -| Name | Type | Description | -| ----------- | ----------------------------- | ------------------ | -| `name` | str | Name of the table. | -| **RETURNS** | [`Table`](/api/lookups#table) | The table. | +| Name | Description | +| ----------- | -------------------------- | +| `name` | Name of the table. ~~str~~ | +| **RETURNS** | The table. ~~Table~~ | ## Lookups.remove_table {#remove_table tag="method"} @@ -126,10 +122,10 @@ Remove a table from the lookups. Raises an error if the table doesn't exist. > assert "some_table" not in lookups > ``` -| Name | Type | Description | -| ----------- | ----------------------------- | ---------------------------- | -| `name` | str | Name of the table to remove. | -| **RETURNS** | [`Table`](/api/lookups#table) | The removed table. | +| Name | Description | +| ----------- | ------------------------------------ | +| `name` | Name of the table to remove. ~~str~~ | +| **RETURNS** | The removed table. ~~Table~~ | ## Lookups.has_table {#has_table tag="method"} @@ -144,10 +140,10 @@ Check if the lookups contain a table of a given name. 
Equivalent to > assert lookups.has_table("some_table") > ``` -| Name | Type | Description | -| ----------- | ---- | ----------------------------------------------- | -| `name` | str | Name of the table. | -| **RETURNS** | bool | Whether a table of that name is in the lookups. | +| Name | Description | +| ----------- | -------------------------------------------------------- | +| `name` | Name of the table. ~~str~~ | +| **RETURNS** | Whether a table of that name is in the lookups. ~~bool~~ | ## Lookups.to_bytes {#to_bytes tag="method"} @@ -159,9 +155,9 @@ Serialize the lookups to a bytestring. > lookup_bytes = lookups.to_bytes() > ``` -| Name | Type | Description | -| ----------- | ----- | ----------------------- | -| **RETURNS** | bytes | The serialized lookups. | +| Name | Description | +| ----------- | --------------------------------- | +| **RETURNS** | The serialized lookups. ~~bytes~~ | ## Lookups.from_bytes {#from_bytes tag="method"} @@ -175,10 +171,10 @@ Load the lookups from a bytestring. > lookups.from_bytes(lookup_bytes) > ``` -| Name | Type | Description | -| ------------ | --------- | ---------------------- | -| `bytes_data` | bytes | The data to load from. | -| **RETURNS** | `Lookups` | The loaded lookups. | +| Name | Description | +| ------------ | -------------------------------- | +| `bytes_data` | The data to load from. ~~bytes~~ | +| **RETURNS** | The loaded lookups. ~~Lookups~~ | ## Lookups.to_disk {#to_disk tag="method"} @@ -191,9 +187,9 @@ which will be created if it doesn't exist. > lookups.to_disk("/path/to/lookups") > ``` -| Name | Type | Description | -| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. 
| +| Name | Description | +| ------ | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | ## Lookups.from_disk {#from_disk tag="method"} @@ -208,10 +204,10 @@ the file doesn't exist. > lookups.from_disk("/path/to/lookups") > ``` -| Name | Type | Description | -| ----------- | ------------ | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `Lookups` | The loaded lookups. | +| Name | Description | +| ----------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| **RETURNS** | The loaded lookups. ~~Lookups~~ | ## Table {#table tag="class, ordererddict"} @@ -236,9 +232,9 @@ Initialize a new table. > assert table["foo"] == "bar" > ``` -| Name | Type | Description | -| ------ | ---- | ---------------------------------- | -| `name` | str | Optional table name for reference. | +| Name | Description | +| ------ | ------------------------------------------ | +| `name` | Optional table name for reference. ~~str~~ | ### Table.from_dict {#table.from_dict tag="classmethod"} @@ -252,11 +248,11 @@ Initialize a new table from a dict. > table = Table.from_dict(data, name="some_table") > ``` -| Name | Type | Description | -| ----------- | ------- | ---------------------------------- | -| `data` | dict | The dictionary. | -| `name` | str | Optional table name for reference. | -| **RETURNS** | `Table` | The newly constructed object. 
| +| Name | Description | +| ----------- | ------------------------------------------ | +| `data` | The dictionary. ~~dict~~ | +| `name` | Optional table name for reference. ~~str~~ | +| **RETURNS** | The newly constructed object. ~~Table~~ | ### Table.set {#table.set tag="method"} @@ -272,10 +268,10 @@ Set a new key / value pair. String keys will be hashed. Same as > assert table["foo"] == "bar" > ``` -| Name | Type | Description | -| ------- | --------- | ----------- | -| `key` | str / int | The key. | -| `value` | - | The value. | +| Name | Description | +| ------- | ---------------------------- | +| `key` | The key. ~~Union[str, int]~~ | +| `value` | The value. | ### Table.to_bytes {#table.to_bytes tag="method"} @@ -287,9 +283,9 @@ Serialize the table to a bytestring. > table_bytes = table.to_bytes() > ``` -| Name | Type | Description | -| ----------- | ----- | --------------------- | -| **RETURNS** | bytes | The serialized table. | +| Name | Description | +| ----------- | ------------------------------- | +| **RETURNS** | The serialized table. ~~bytes~~ | ### Table.from_bytes {#table.from_bytes tag="method"} @@ -303,15 +299,15 @@ Load a table from a bytestring. > table.from_bytes(table_bytes) > ``` -| Name | Type | Description | -| ------------ | ------- | ----------------- | -| `bytes_data` | bytes | The data to load. | -| **RETURNS** | `Table` | The loaded table. | +| Name | Description | +| ------------ | --------------------------- | +| `bytes_data` | The data to load. ~~bytes~~ | +| **RETURNS** | The loaded table. ~~Table~~ | ### Attributes {#table-attributes} -| Name | Type | Description | -| -------------- | --------------------------- | ----------------------------------------------------- | -| `name` | str | Table name. | -| `default_size` | int | Default size of bloom filters if no data is provided. | -| `bloom` | `preshed.bloom.BloomFilter` | The bloom filters. 
| +| Name | Description | +| -------------- | ------------------------------------------------------------- | +| `name` | Table name. ~~str~~ | +| `default_size` | Default size of bloom filters if no data is provided. ~~int~~ | +| `bloom` | The bloom filters. ~~preshed.BloomFilter~~ | diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index b481f1972..f259174e2 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -30,20 +30,20 @@ pattern keys correspond to a number of [`Token` attributes](/api/token#attributes). The supported attributes for rule-based matching are: -| Attribute | Type | Β Description | -| -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ | -| `ORTH` | str | The exact verbatim text of a token. | -| `TEXT` 2.1 | str | The exact verbatim text of a token. | -| `LOWER` | str | The lowercase form of the token text. | -| Β `LENGTH` | int | The length of the token text. | -| Β `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. | -| Β `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. | -| Β `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. | -| Β `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. | -| Β `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. | -| `ENT_TYPE` | str | The token's entity label. | -| `_` 2.1 | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). | -| `OP` | str | Operator or quantifier to determine how often to match a token pattern. 
| +| Attribute | Description | +| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- | +| `ORTH` | The exact verbatim text of a token. ~~str~~ | +| `TEXT` 2.1 | The exact verbatim text of a token. ~~str~~ | +| `LOWER` | The lowercase form of the token text. ~~str~~ | +| `LENGTH` | The length of the token text. ~~int~~ | +| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ | +| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ | +| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ | +| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ | +| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~ | +| `ENT_TYPE` | The token's entity label. ~~str~~ | +| `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ | +| `OP` | Operator or quantifier to determine how often to match a token pattern. ~~str~~ | Operators and quantifiers define **how often** a token pattern should be matched: @@ -75,11 +75,11 @@ it compares to another value. > ] > ``` -| Attribute | Type | Description | -| -------------------------- | ---------- | --------------------------------------------------------------------------------- | -| `IN` | any | Attribute value is member of a list. | -| `NOT_IN` | any | Attribute value is _not_ member of a list. | -| `==`, `>=`, `<=`, `>`, `<` | int, float | Attribute value is equal, greater or equal, smaller or equal, greater or smaller.
| +| Attribute | Description | +| -------------------------- | ------------------------------------------------------------------------------------------------------- | +| `IN` | Attribute value is member of a list. ~~Any~~ | +| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ | +| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ | ## Matcher.\_\_init\_\_ {#init tag="method"} @@ -95,10 +95,10 @@ string where an integer is expected) or unexpected property names. > matcher = Matcher(nlp.vocab) > ``` -| Name | Type | Description | -| --------------------------------------- | ------- | ------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | -| `validate` 2.1 | bool | Validate all patterns added to this matcher. | +| Name | Description | +| --------------------------------------- | ----------------------------------------------------------------------------------------------------- | +| `vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. ~~Vocab~~ | +| `validate` 2.1 | Validate all patterns added to this matcher. ~~bool~~ | ## Matcher.\_\_call\_\_ {#call tag="method"} @@ -116,10 +116,10 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`. > matches = matcher(doc) > ``` -| Name | Type | Description | -| ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `doclike` | `Doc`/`Span` | The `Doc` or `Span` to match over. | -| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. 
The `match_id` is the ID of the added match pattern. | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ | +| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ | ## Matcher.pipe {#pipe tag="method"} @@ -134,13 +134,13 @@ Match a stream of documents, yielding them in turn. > pass > ``` -| Name | Type | Description | -| --------------------------------------------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `docs` | iterable | A stream of documents or spans. | -| `batch_size` | int | The number of documents to accumulate into a working set. | -| `return_matches` 2.1 | bool | Yield the match lists along with the docs, making results `(doc, matches)` tuples. | -| `as_tuples` | bool | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. | -| **YIELDS** | `Doc` | Documents, in order. | +| Name | Description | +| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `docs` | A stream of documents or spans.
~~Iterable[Union[Doc, Span]]~~ | +| `batch_size` | The number of documents to accumulate into a working set. ~~int~~ | +| `return_matches` 2.1 | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ | +| `as_tuples` | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ | +| **YIELDS** | Documents, in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ | ## Matcher.\_\_len\_\_ {#len tag="method" new="2"} @@ -157,9 +157,9 @@ patterns. > assert len(matcher) == 1 > ``` -| Name | Type | Description | -| ----------- | ---- | -------------------- | -| **RETURNS** | int | The number of rules. | +| Name | Description | +| ----------- | ---------------------------- | +| **RETURNS** | The number of rules. ~~int~~ | ## Matcher.\_\_contains\_\_ {#contains tag="method" new="2"} @@ -174,10 +174,10 @@ Check whether the matcher contains rules for a match ID. > assert "Rule" in matcher > ``` -| Name | Type | Description | -| ----------- | ---- | ----------------------------------------------------- | -| `key` | str | The match ID. | -| **RETURNS** | bool | Whether the matcher contains rules for this match ID. | +| Name | Description | +| ----------- | -------------------------------------------------------------- | +| `key` | The match ID. ~~str~~ | +| **RETURNS** | Whether the matcher contains rules for this match ID. ~~bool~~ | ## Matcher.add {#add tag="method" new="2"} @@ -217,13 +217,13 @@ patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
-| Name | Type | Description | -| ----------------------------------- | ------------------ | --------------------------------------------------------------------------------------------- | -| `match_id` | str | An ID for the thing you're matching. | -| `patterns` | `List[List[dict]]` | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. | -| _keyword-only_ | | | -| `on_match` | callable / `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | -| `greedy` 3 | str | Optional filter for greedy matches. Can either be `"FIRST"` or `"LONGEST"`. | +| Name | Description | +| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `match_id` | An ID for the thing you're matching. ~~str~~ | +| `patterns` | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. ~~List[List[Dict[str, Any]]]~~ | +| _keyword-only_ | | +| `on_match` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. ~~Optional[Callable[[Matcher, Doc, int, List[tuple]], Any]]~~ | +| `greedy` 3 | Optional filter for greedy matches. Can either be `"FIRST"` or `"LONGEST"`. ~~Optional[str]~~ | ## Matcher.remove {#remove tag="method" new="2"} @@ -239,9 +239,9 @@ exist. > assert "Rule" not in matcher > ``` -| Name | Type | Description | -| ----- | ---- | ------------------------- | -| `key` | str | The ID of the match rule. | +| Name | Description | +| ----- | --------------------------------- | +| `key` | The ID of the match rule. ~~str~~ | ## Matcher.get {#get tag="method" new="2"} @@ -255,7 +255,7 @@ Retrieve the pattern stored for a key.
Returns the rule as an > on_match, patterns = matcher.get("Rule") > ``` -| Name | Type | Description | -| ----------- | ----- | --------------------------------------------- | -| `key` | str | The ID of the match rule. | -| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. | +| Name | Description | +| ----------- | --------------------------------------------------------------------------------------------- | +| `key` | The ID of the match rule. ~~str~~ | +| **RETURNS** | The rule, as an `(on_match, patterns)` tuple. ~~Tuple[Optional[Callable], List[List[dict]]]~~ | diff --git a/website/docs/api/morphanalysis.md b/website/docs/api/morphanalysis.md deleted file mode 100644 index 4df9a3f7f..000000000 --- a/website/docs/api/morphanalysis.md +++ /dev/null @@ -1,142 +0,0 @@ ---- -title: MorphAnalysis -tag: class -source: spacy/tokens/morphanalysis.pyx ---- - -Stores a single morphological analysis. - -## MorphAnalysis.\_\_init\_\_ {#init tag="method"} - -Initialize a MorphAnalysis object from a UD FEATS string or a dictionary of -morphological features. - -> #### Example -> -> ```python -> from spacy.tokens import MorphAnalysis -> -> feats = "Feat1=Val1|Feat2=Val2" -> m = MorphAnalysis(nlp.vocab, feats) -> ``` - -| Name | Type | Description | -| ---------- | ------------------ | --------------------------- | -| `vocab` | `Vocab` | The vocab. | -| `features` | `Union[Dict, str]` | The morphological features. | - -## MorphAnalysis.\_\_contains\_\_ {#contains tag="method"} - -Whether a feature/value pair is in the analysis. - -> #### Example -> -> ```python -> feats = "Feat1=Val1,Val2|Feat2=Val2" -> morph = MorphAnalysis(nlp.vocab, feats) -> assert "Feat1=Val1" in morph -> ``` - -| Name | Type | Description | -| ----------- | ----- | ------------------------------------- | -| **RETURNS** | `str` | A feature/value pair in the analysis. | - -## MorphAnalysis.\_\_iter\_\_ {#iter tag="method"} - -Iterate over the feature/value pairs in the analysis. 
- -> #### Example -> -> ```python -> feats = "Feat1=Val1,Val3|Feat2=Val2" -> morph = MorphAnalysis(nlp.vocab, feats) -> assert list(morph) == ["Feat1=Va1", "Feat1=Val3", "Feat2=Val2"] -> ``` - -| Name | Type | Description | -| ---------- | ----- | ------------------------------------- | -| **YIELDS** | `str` | A feature/value pair in the analysis. | - -## MorphAnalysis.\_\_len\_\_ {#len tag="method"} - -Returns the number of features in the analysis. - -> #### Example -> -> ```python -> feats = "Feat1=Val1,Val2|Feat2=Val2" -> morph = MorphAnalysis(nlp.vocab, feats) -> assert len(morph) == 3 -> ``` - -| Name | Type | Description | -| ----------- | ----- | --------------------------------------- | -| **RETURNS** | `int` | The number of features in the analysis. | - -## MorphAnalysis.\_\_str\_\_ {#str tag="method"} - -Returns the morphological analysis in the UD FEATS string format. - -> #### Example -> -> ```python -> feats = "Feat1=Val1,Val2|Feat2=Val2" -> morph = MorphAnalysis(nlp.vocab, feats) -> assert str(morph) == feats -> ``` - -| Name | Type | Description | -| ----------- | ----- | -------------------------------- | -| **RETURNS** | `str` | The analysis in UD FEATS format. | - -## MorphAnalysis.get {#get tag="method"} - -Retrieve values for a feature by field. - -> #### Example -> -> ```python -> feats = "Feat1=Val1,Val2" -> morph = MorphAnalysis(nlp.vocab, feats) -> assert morph.get("Feat1") == ["Val1", "Val2"] -> ``` - -| Name | Type | Description | -| ----------- | ------ | ---------------------------------- | -| `field` | `str` | The field to retrieve. | -| **RETURNS** | `list` | A list of the individual features. | - -## MorphAnalysis.to_dict {#to_dict tag="method"} - -Produce a dict representation of the analysis, in the same format as the tag -map. 
- -> #### Example -> -> ```python -> feats = "Feat1=Val1,Val2|Feat2=Val2" -> morph = MorphAnalysis(nlp.vocab, feats) -> assert morph.to_dict() == {"Feat1": "Val1,Val2", "Feat2": "Val2"} -> ``` - -| Name | Type | Description | -| ----------- | ------ | ---------------------------------------- | -| **RETURNS** | `dict` | The dict representation of the analysis. | - -## MorphAnalysis.from_id {#from_id tag="classmethod"} - -Create a morphological analysis from a given hash ID. - -> #### Example -> -> ```python -> feats = "Feat1=Val1|Feat2=Val2" -> hash = nlp.vocab.strings[feats] -> morph = MorphAnalysis.from_id(nlp.vocab, hash) -> assert str(morph) == feats -> ``` - -| Name | Type | Description | -| ------- | ------- | -------------------------------- | -| `vocab` | `Vocab` | The vocab. | -| `key` | `int` | The hash of the features string. | diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index 12d3050f6..069856ea3 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -32,9 +32,9 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("morphologizer", config=config) > ``` -| Setting | Type | Description | Default | -| ------- | ------------------------------------------ | ----------------- | ----------------------------------- | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [Tagger](/api/architectures#Tagger) | +| Setting | Description | +| ------- | ------------------------------------------------------------------------------------------------------- | +| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/morphologizer.pyx @@ -42,7 +42,9 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/morphologizer.pyx ## Morphologizer.\_\_init\_\_ {#init tag="method"} -Initialize the morphologizer. 
+Create a new pipeline instance. In your application, you would normally use a +shortcut for this and instantiate the component using its string name and +[`nlp.add_pipe`](/api/language#add_pipe). > #### Example > @@ -59,18 +61,14 @@ Initialize the morphologizer. > morphologizer = Morphologizer(nlp.vocab, model) > ``` -Create a new pipeline instance. In your application, you would normally use a -shortcut for this and instantiate the component using its string name and -[`nlp.add_pipe`](/api/language#add_pipe). - -| Name | Type | Description | -| -------------- | ------- | ------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | -| _keyword-only_ | | | -| `labels_morph` | dict | Mapping of morph + POS tags to morph labels. | -| `labels_pos` | dict | Mapping of morph + POS tags to POS tags. | +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `labels_morph` | Mapping of morph + POS tags to morph labels. ~~Dict[str, str]~~ | +| `labels_pos` | Mapping of morph + POS tags to POS tags. 
~~Dict[str, str]~~ | ## Morphologizer.\_\_call\_\_ {#call tag="method"} @@ -90,10 +88,10 @@ delegate to the [`predict`](/api/morphologizer#predict) and > processed = morphologizer(doc) > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------ | -| `doc` | `Doc` | The document to process. | -| **RETURNS** | `Doc` | The processed document. | +| Name | Description | +| ----------- | -------------------------------- | +| `doc` | The document to process. ~~Doc~~ | +| **RETURNS** | The processed document. ~~Doc~~ | ## Morphologizer.pipe {#pipe tag="method"} @@ -112,12 +110,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and > pass > ``` -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------ | -| `stream` | `Iterable[Doc]` | A stream of documents. | -| _keyword-only_ | | | -| `batch_size` | int | The number of texts to buffer. Defaults to `128`. | -| **YIELDS** | `Doc` | Processed documents in the order of the original text. | +| Name | Description | +| -------------- | ------------------------------------------------------------- | +| `stream` | A stream of documents. ~~Iterable[Doc]~~ | +| _keyword-only_ | | +| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | +| **YIELDS** | The processed documents in order. ~~Doc~~ | ## Morphologizer.begin_training {#begin_training tag="method"} @@ -138,13 +136,13 @@ setting up the label scheme based on the data. > optimizer = morphologizer.begin_training(lambda: [], pipeline=nlp.pipeline) > ``` -| Name | Type | Description | -| -------------- | --------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. 
| -| _keyword-only_ | | | -| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/sentencerecognizer#create_optimizer) if not set. | -| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | | +| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| **RETURNS** | The optimizer. ~~Optimizer~~ | ## Morphologizer.predict {#predict tag="method"} @@ -158,10 +156,10 @@ modifying them. > scores = morphologizer.predict([doc1, doc2]) > ``` -| Name | Type | Description | -| ----------- | --------------- | ----------------------------------------- | -| `docs` | `Iterable[Doc]` | The documents to predict. | -| **RETURNS** | - | The model's prediction for each document. | +| Name | Description | +| ----------- | ------------------------------------------- | +| `docs` | The documents to predict. ~~Iterable[Doc]~~ | +| **RETURNS** | The model's prediction for each document. | ## Morphologizer.set_annotations {#set_annotations tag="method"} @@ -175,10 +173,10 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores. 
> morphologizer.set_annotations([doc1, doc2], scores) > ``` -| Name | Type | Description | -| -------- | --------------- | ------------------------------------------------------- | -| `docs` | `Iterable[Doc]` | The documents to modify. | -| `scores` | - | The scores to set, produced by `Morphologizer.predict`. | +| Name | Description | +| -------- | ------------------------------------------------------- | +| `docs` | The documents to modify. ~~Iterable[Doc]~~ | +| `scores` | The scores to set, produced by `Morphologizer.predict`. | ## Morphologizer.update {#update tag="method"} @@ -195,15 +193,15 @@ Delegates to [`predict`](/api/morphologizer#predict) and > losses = morphologizer.update(examples, sgd=optimizer) > ``` -| Name | Type | Description | -| ----------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ | -| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | -| _keyword-only_ | | | -| `drop` | float | The dropout rate. | -| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/sentencerecognizer#set_annotations). | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | -| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. | -| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | +| Name | Description | +| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | | +| `drop` | The dropout rate. 
~~float~~ | +| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## Morphologizer.get_loss {#get_loss tag="method"} @@ -218,11 +216,11 @@ predicted scores. > loss, d_loss = morphologizer.get_loss(examples, scores) > ``` -| Name | Type | Description | -| ----------- | --------------------- | --------------------------------------------------- | -| `examples` | `Iterable[Example]` | The batch of examples. | -| `scores` | - | Scores representing the model's predictions. | -| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. | +| Name | Description | +| ----------- | --------------------------------------------------------------------------- | +| `examples` | The batch of examples. ~~Iterable[Example]~~ | +| `scores` | Scores representing the model's predictions. | +| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | ## Morphologizer.create_optimizer {#create_optimizer tag="method"} @@ -235,9 +233,9 @@ Create an optimizer for the pipeline component. > optimizer = morphologizer.create_optimizer() > ``` -| Name | Type | Description | -| ----------- | --------------------------------------------------- | -------------- | -| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| Name | Description | +| ----------- | ---------------------------- | +| **RETURNS** | The optimizer. ~~Optimizer~~ | ## Morphologizer.use_params {#use_params tag="method, contextmanager"} @@ -252,9 +250,9 @@ context, the original parameters are restored. 
> morphologizer.to_disk("/best_model") > ``` -| Name | Type | Description | -| -------- | ---- | ----------------------------------------- | -| `params` | dict | The parameter values to use in the model. | +| Name | Description | +| -------- | -------------------------------------------------- | +| `params` | The parameter values to use in the model. ~~dict~~ | ## Morphologizer.add_label {#add_label tag="method"} @@ -268,10 +266,10 @@ both `pos` and `morph`, the label should include the UPOS as the feature `POS`. > morphologizer.add_label("Mood=Ind|POS=VERB|Tense=Past|VerbForm=Fin") > ``` -| Name | Type | Description | -| ----------- | ---- | --------------------------------------------------- | -| `label` | str | The label to add. | -| **RETURNS** | int | `0` if the label is already present, otherwise `1`. | +| Name | Description | +| ----------- | ----------------------------------------------------------- | +| `label` | The label to add. ~~str~~ | +| **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ | ## Morphologizer.to_disk {#to_disk tag="method"} @@ -284,11 +282,11 @@ Serialize the pipe to disk. > morphologizer.to_disk("/path/to/morphologizer") > ``` -| Name | Type | Description | -| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. 
~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | ## Morphologizer.from_disk {#from_disk tag="method"} @@ -301,12 +299,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > morphologizer.from_disk("/path/to/morphologizer") > ``` -| Name | Type | Description | -| -------------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Morphologizer` | The modified `Morphologizer` object. | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The modified `Morphologizer` object. ~~Morphologizer~~ | ## Morphologizer.to_bytes {#to_bytes tag="method"} @@ -319,11 +317,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------- | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Morphologizer` object. 
| +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The serialized form of the `Morphologizer` object. ~~bytes~~ | ## Morphologizer.from_bytes {#from_bytes tag="method"} @@ -337,19 +335,19 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > morphologizer.from_bytes(morphologizer_bytes) > ``` -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Morphologizer` | The `Morphologizer` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| `bytes_data` | The data to load from. ~~bytes~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The `Morphologizer` object. ~~Morphologizer~~ | ## Morphologizer.labels {#labels tag="property"} -The labels currently added to the component in Universal Dependencies -[FEATS format](https://universaldependencies.org/format.html#morphological-annotation). -Note that even for a blank component, this will always include the internal -empty label `_`. If POS features are used, the labels will include the +The labels currently added to the component in the Universal Dependencies +[FEATS](https://universaldependencies.org/format.html#morphological-annotation) +format. Note that even for a blank component, this will always include the +internal empty label `_`. 
If POS features are used, the labels will include the coarse-grained POS as the feature `POS`. > #### Example @@ -359,9 +357,9 @@ coarse-grained POS as the feature `POS`. > assert "Mood=Ind|POS=VERB|Tense=Past|VerbForm=Fin" in morphologizer.labels > ``` -| Name | Type | Description | -| ----------- | ----- | ---------------------------------- | -| **RETURNS** | tuple | The labels added to the component. | +| Name | Description | +| ----------- | ------------------------------------------------------ | +| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/morphology.md b/website/docs/api/morphology.md index 3c5bf6fe4..5d5324061 100644 --- a/website/docs/api/morphology.md +++ b/website/docs/api/morphology.md @@ -7,7 +7,8 @@ source: spacy/morphology.pyx Store the possible morphological analyses for a language, and index them by hash. To save space on each token, tokens only know the hash of their morphological analysis, so queries of morphological attributes are delegated to -this class. +this class. See [`MorphAnalysis`](/api/morphology#morphanalysis) for the +container storing a single morphological analysis. ## Morphology.\_\_init\_\_ {#init tag="method"} @@ -21,15 +22,17 @@ Create a Morphology object. > morphology = Morphology(strings) > ``` -| Name | Type | Description | -| --------- | ------------- | ----------------- | -| `strings` | `StringStore` | The string store. | +| Name | Description | +| --------- | --------------------------------- | +| `strings` | The string store. ~~StringStore~~ | ## Morphology.add {#add tag="method"} Insert a morphological analysis in the morphology table, if not already present. -The morphological analysis may be provided in the UD FEATS format as a string or -in the tag map dictionary format. Returns the hash of the new analysis. 
+The morphological analysis may be provided in the Universal Dependencies +[FEATS](https://universaldependencies.org/format.html#morphological-annotation) +format as a string or in the tag map dictionary format. Returns the hash of the +new analysis. > #### Example > @@ -39,9 +42,9 @@ in the tag map dictionary format. Returns the hash of the new analysis. > assert hash == nlp.vocab.strings[feats] > ``` -| Name | Type | Description | -| ---------- | ------------------ | --------------------------- | -| `features` | `Union[Dict, str]` | The morphological features. | +| Name | Description | +| ---------- | ------------------------------------------------ | +| `features` | The morphological features. ~~Union[Dict, str]~~ | ## Morphology.get {#get tag="method"} @@ -53,16 +56,20 @@ in the tag map dictionary format. Returns the hash of the new analysis. > assert nlp.vocab.morphology.get(hash) == feats > ``` -Get the FEATS string for the hash of the morphological analysis. +Get the +[FEATS](https://universaldependencies.org/format.html#morphological-annotation) +string for the hash of the morphological analysis. -| Name | Type | Description | -| ------- | ---- | --------------------------------------- | -| `morph` | int | The hash of the morphological analysis. | +| Name | Description | +| ------- | ----------------------------------------------- | +| `morph` | The hash of the morphological analysis. ~~int~~ | ## Morphology.feats_to_dict {#feats_to_dict tag="staticmethod"} -Convert a string FEATS representation to a dictionary of features and values in -the same format as the tag map. +Convert a string +[FEATS](https://universaldependencies.org/format.html#morphological-annotation) +representation to a dictionary of features and values in the same format as the +tag map. > #### Example > @@ -72,14 +79,16 @@ the same format as the tag map. 
> assert d == {"Feat1": "Val1", "Feat2": "Val2"} > ``` -| Name | Type | Description | -| ----------- | ---- | ------------------------------------------------------------------ | -| `feats` | str | The morphological features in Universal Dependencies FEATS format. | -| **RETURNS** | dict | The morphological features as a dictionary. | +| Name | Description | +| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- | +| `feats` | The morphological features in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ | +| **RETURNS** | The morphological features as a dictionary. ~~Dict[str, str]~~ | ## Morphology.dict_to_feats {#dict_to_feats tag="staticmethod"} -Convert a dictionary of features and values to a string FEATS representation. +Convert a dictionary of features and values to a string +[FEATS](https://universaldependencies.org/format.html#morphological-annotation) +representation. > #### Example > @@ -89,15 +98,157 @@ Convert a dictionary of features and values to a string FEATS representation. > assert f == "Feat1=Val1|Feat2=Val2" > ``` -| Name | Type | Description | -| ------------ | ----------------- | --------------------------------------------------------------------- | -| `feats_dict` | `Dict[str, Dict]` | The morphological features as a dictionary. | -| **RETURNS** | str | The morphological features as in Universal Dependencies FEATS format. | +| Name | Description | +| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `feats_dict` | The morphological features as a dictionary. ~~Dict[str, str]~~ | +| **RETURNS** | The morphological features as in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. 
~~str~~ | ## Attributes {#attributes} -| Name | Type | Description | -| ------------- | ----- | -------------------------------------------- | -| `FEATURE_SEP` | `str` | The FEATS feature separator. Default is `|`. | -| `FIELD_SEP` | `str` | The FEATS field separator. Default is `=`. | -| `VALUE_SEP` | `str` | The FEATS value separator. Default is `,`. | +| Name | Description | +| ------------- | ------------------------------------------------------------------------------------------------------------------------------ | +| `FEATURE_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) feature separator. Default is `|`. ~~str~~ | +| `FIELD_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) field separator. Default is `=`. ~~str~~ | +| `VALUE_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) value separator. Default is `,`. ~~str~~ | + +## MorphAnalysis {#morphanalysis tag="class" source="spacy/tokens/morphanalysis.pyx"} + +Stores a single morphological analysis. + +### MorphAnalysis.\_\_init\_\_ {#morphanalysis-init tag="method"} + +Initialize a MorphAnalysis object from a Universal Dependencies +[FEATS](https://universaldependencies.org/format.html#morphological-annotation) +string or a dictionary of morphological features. + +> #### Example +> +> ```python +> from spacy.tokens import MorphAnalysis +> +> feats = "Feat1=Val1|Feat2=Val2" +> m = MorphAnalysis(nlp.vocab, feats) +> ``` + +| Name | Description | +| ---------- | ---------------------------------------------------------- | +| `vocab` | The vocab. ~~Vocab~~ | +| `features` | The morphological features. ~~Union[Dict[str, str], str]~~ | + +### MorphAnalysis.\_\_contains\_\_ {#morphanalysis-contains tag="method"} + +Whether a feature/value pair is in the analysis. 
+ +> #### Example +> +> ```python +> feats = "Feat1=Val1,Val2|Feat2=Val2" +> morph = MorphAnalysis(nlp.vocab, feats) +> assert "Feat1=Val1" in morph +> ``` + +| Name | Description | +| ----------- | --------------------------------------------- | +| **RETURNS** | Whether the feature/value pair is contained in the analysis. ~~bool~~ | + +### MorphAnalysis.\_\_iter\_\_ {#morphanalysis-iter tag="method"} + +Iterate over the feature/value pairs in the analysis. + +> #### Example +> +> ```python +> feats = "Feat1=Val1,Val3|Feat2=Val2" +> morph = MorphAnalysis(nlp.vocab, feats) +> assert list(morph) == ["Feat1=Val1", "Feat1=Val3", "Feat2=Val2"] +> ``` + +| Name | Description | +| ---------- | --------------------------------------------- | +| **YIELDS** | A feature/value pair in the analysis. ~~str~~ | + +### MorphAnalysis.\_\_len\_\_ {#morphanalysis-len tag="method"} + +Returns the number of features in the analysis. + +> #### Example +> +> ```python +> feats = "Feat1=Val1,Val2|Feat2=Val2" +> morph = MorphAnalysis(nlp.vocab, feats) +> assert len(morph) == 3 +> ``` + +| Name | Description | +| ----------- | ----------------------------------------------- | +| **RETURNS** | The number of features in the analysis. ~~int~~ | + +### MorphAnalysis.\_\_str\_\_ {#morphanalysis-str tag="method"} + +Returns the morphological analysis in the Universal Dependencies +[FEATS](https://universaldependencies.org/format.html#morphological-annotation) +string format. + +> #### Example +> +> ```python +> feats = "Feat1=Val1,Val2|Feat2=Val2" +> morph = MorphAnalysis(nlp.vocab, feats) +> assert str(morph) == feats +> ``` + +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| **RETURNS** | The analysis in the Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format.
~~str~~ | + +### MorphAnalysis.get {#morphanalysis-get tag="method"} + +Retrieve values for a feature by field. + +> #### Example +> +> ```python +> feats = "Feat1=Val1,Val2" +> morph = MorphAnalysis(nlp.vocab, feats) +> assert morph.get("Feat1") == ["Val1", "Val2"] +> ``` + +| Name | Description | +| ----------- | ------------------------------------------------ | +| `field` | The field to retrieve. ~~str~~ | +| **RETURNS** | A list of the individual features. ~~List[str]~~ | + +### MorphAnalysis.to_dict {#morphanalysis-to_dict tag="method"} + +Produce a dict representation of the analysis, in the same format as the tag +map. + +> #### Example +> +> ```python +> feats = "Feat1=Val1,Val2|Feat2=Val2" +> morph = MorphAnalysis(nlp.vocab, feats) +> assert morph.to_dict() == {"Feat1": "Val1,Val2", "Feat2": "Val2"} +> ``` + +| Name | Description | +| ----------- | ----------------------------------------------------------- | +| **RETURNS** | The dict representation of the analysis. ~~Dict[str, str]~~ | + +### MorphAnalysis.from_id {#morphanalysis-from_id tag="classmethod"} + +Create a morphological analysis from a given hash ID. + +> #### Example +> +> ```python +> feats = "Feat1=Val1|Feat2=Val2" +> hash = nlp.vocab.strings[feats] +> morph = MorphAnalysis.from_id(nlp.vocab, hash) +> assert str(morph) == feats +> ``` + +| Name | Description | +| ------- | ---------------------------------------- | +| `vocab` | The vocab. ~~Vocab~~ | +| `key` | The hash of the features string. ~~int~~ | diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md index 71c7a463b..143eb9edf 100644 --- a/website/docs/api/phrasematcher.md +++ b/website/docs/api/phrasematcher.md @@ -36,11 +36,11 @@ be shown. 
> matcher = PhraseMatcher(nlp.vocab) > ``` -| Name | Type | Description | -| --------------------------------------- | --------- | ------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | -| `attr` 2.1 | int / str | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. | -| `validate` 2.1 | bool | Validate patterns added to the matcher. | +| Name | Description | +| --------------------------------------- | ------------------------------------------------------------------------------------------------------ | +| `vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. ~~Vocab~~ | +| `attr` 2.1 | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. ~~Union[int, str]~~ | +| `validate` 2.1 | Validate patterns added to the matcher. ~~bool~~ | ## PhraseMatcher.\_\_call\_\_ {#call tag="method"} @@ -57,10 +57,10 @@ Find all token sequences matching the supplied patterns on the `Doc`. > matches = matcher(doc) > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `doc` | `Doc` | The document to match over. | -| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. | +| Name | Description | +| ----------- | ----------------------------------- | +| `doc` | The document to match over. ~~Doc~~ | +| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern.
~~List[Tuple[int, int, int]]~~ | @@ -87,11 +87,13 @@ Match a stream of documents, yielding them in turn. > pass > ``` -| Name | Type | Description | -| ------------ | -------- | --------------------------------------------------------- | -| `docs` | iterable | A stream of documents. | -| `batch_size` | int | The number of documents to accumulate into a working set. | -| **YIELDS** | `Doc` | Documents, in order. | +| Name | Description | +| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `docs` | A stream of documents. ~~Iterable[Doc]~~ | +| `batch_size` | The number of documents to accumulate into a working set. ~~int~~ | +| `return_matches` 2.1 | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ | +| `as_tuples` | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ | +| **YIELDS** | Documents and optional matches or context in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ | ## PhraseMatcher.\_\_len\_\_ {#len tag="method"} @@ -108,9 +110,9 @@ patterns. > assert len(matcher) == 1 > ``` -| Name | Type | Description | -| ----------- | ---- | -------------------- | -| **RETURNS** | int | The number of rules. | +| Name | Description | +| ----------- | ---------------------------- | +| **RETURNS** | The number of rules. ~~int~~ | ## PhraseMatcher.\_\_contains\_\_ {#contains tag="method"} @@ -125,10 +127,10 @@ Check whether the matcher contains rules for a match ID. 
> assert "OBAMA" in matcher > ``` -| Name | Type | Description | -| ----------- | ---- | ----------------------------------------------------- | -| `key` | str | The match ID. | -| **RETURNS** | bool | Whether the matcher contains rules for this match ID. | +| Name | Description | +| ----------- | -------------------------------------------------------------- | +| `key` | The match ID. ~~str~~ | +| **RETURNS** | Whether the matcher contains rules for this match ID. ~~bool~~ | ## PhraseMatcher.add {#add tag="method"} @@ -165,12 +167,12 @@ patterns = [nlp("health care reform"), nlp("healthcare reform")] -| Name | Type | Description | -| -------------- | ------------------ | --------------------------------------------------------------------------------------------- | -| `match_id` | str | An ID for the thing you're matching. | -| `docs` | list | `Doc` objects of the phrases to match. | -| _keyword-only_ | | | -| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `match_id` | An ID for the thing you're matching. ~~str~~ | +| `docs` | `Doc` objects of the phrases to match. ~~List[Doc]~~ | +| _keyword-only_ | | +| `on_match` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. ~~Optional[Callable[[Matcher, Doc, int, List[tuple]], Any]]~~ | ## PhraseMatcher.remove {#remove tag="method" new="2.2"} @@ -187,6 +189,6 @@ does not exist. > assert "OBAMA" not in matcher > ``` -| Name | Type | Description | -| ----- | ---- | ------------------------- | -| `key` | str | The ID of the match rule. | +| Name | Description | +| ----- | --------------------------------- | +| `key` | The ID of the match rule.
~~str~~ | diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md index 8302c2e8b..9c3a4104e 100644 --- a/website/docs/api/pipe.md +++ b/website/docs/api/pipe.md @@ -45,12 +45,12 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#create_pipe). -| Name | Type | Description | -| ------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | -| `**cfg` | | Additional config parameters and settings. Will be available as the dictionary `Pipe.cfg` and is serialized with the component. | +| Name | Description | +| ------- | ------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], Any]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| `**cfg` | Additional config parameters and settings. Will be available as the dictionary `Pipe.cfg` and is serialized with the component. | ## Pipe.\_\_call\_\_ {#call tag="method"} @@ -70,10 +70,10 @@ and all pipeline components are applied to the `Doc` in order. Both > processed = pipe(doc) > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------ | -| `doc` | `Doc` | The document to process. | -| **RETURNS** | `Doc` | The processed document. 
| +| Name | Description | +| ----------- | -------------------------------- | +| `doc` | The document to process. ~~Doc~~ | +| **RETURNS** | The processed document. ~~Doc~~ | ## Pipe.pipe {#pipe tag="method"} @@ -91,12 +91,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and > pass > ``` -| Name | Type | Description | -| -------------- | --------------- | ----------------------------------------------------- | -| `stream` | `Iterable[Doc]` | A stream of documents. | -| _keyword-only_ | | | -| `batch_size` | int | The number of documents to buffer. Defaults to `128`. | -| **YIELDS** | `Doc` | The processed documents in order. | +| Name | Description | +| -------------- | ------------------------------------------------------------- | +| `stream` | A stream of documents. ~~Iterable[Doc]~~ | +| _keyword-only_ | | +| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | +| **YIELDS** | The processed documents in order. ~~Doc~~ | ## Pipe.begin_training {#begin_training tag="method"} @@ -116,13 +116,13 @@ setting up the label scheme based on the data. > optimizer = pipe.begin_training(lambda: [], pipeline=nlp.pipeline) > ``` -| Name | Type | Description | -| -------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- | -| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. | -| _keyword-only_ | | | -| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/pipe#create_optimizer) if not set. | -| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. 
| +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| **RETURNS** | The optimizer. ~~Optimizer~~ | ## Pipe.predict {#predict tag="method"} @@ -142,10 +142,10 @@ This method needs to be overwritten with your own custom `predict` method. > scores = pipe.predict([doc1, doc2]) > ``` -| Name | Type | Description | -| ----------- | --------------- | ----------------------------------------- | -| `docs` | `Iterable[Doc]` | The documents to predict. | -| **RETURNS** | - | The model's prediction for each document. | +| Name | Description | +| ----------- | ------------------------------------------- | +| `docs` | The documents to predict. ~~Iterable[Doc]~~ | +| **RETURNS** | The model's prediction for each document. | ## Pipe.set_annotations {#set_annotations tag="method"} @@ -166,10 +166,10 @@ method. > pipe.set_annotations(docs, scores) > ``` -| Name | Type | Description | -| -------- | --------------- | ---------------------------------------------- | -| `docs` | `Iterable[Doc]` | The documents to modify. | -| `scores` | - | The scores to set, produced by `Pipe.predict`. | +| Name | Description | +| -------- | ------------------------------------------------ | +| `docs` | The documents to modify. ~~Iterable[Doc]~~ | +| `scores` | The scores to set, produced by `Pipe.predict`.
| ## Pipe.update {#update tag="method"} @@ -184,15 +184,15 @@ predictions and gold-standard annotations, and update the component's model. > losses = pipe.update(examples, sgd=optimizer) > ``` -| Name | Type | Description | -| ----------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | -| _keyword-only_ | | | -| `drop` | float | The dropout rate. | -| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/pipe#set_annotations). | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | -| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | -| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | +| Name | Description | +| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary.
~~Dict[str, float]~~ | ## Pipe.rehearse {#rehearse tag="method,experimental" new="3"} @@ -208,14 +208,14 @@ the "catastrophic forgetting" problem. This feature is experimental. > losses = pipe.rehearse(examples, sgd=optimizer) > ``` -| Name | Type | Description | -| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | -| _keyword-only_ | | | -| `drop` | float | The dropout rate. | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | -| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | -| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## Pipe.get_loss {#get_loss tag="method"} @@ -230,11 +230,11 @@ predicted scores. > loss, d_loss = ner.get_loss(examples, scores) > ``` -| Name | Type | Description | -| ----------- | --------------------- | --------------------------------------------------- | -| `examples` | `Iterable[Example]` | The batch of examples. | -| `scores` | | Scores representing the model's predictions.
| -| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. | +| Name | Description | +| ----------- | --------------------------------------------------------------------------- | +| `examples` | The batch of examples. ~~Iterable[Example]~~ | +| `scores` | Scores representing the model's predictions. | +| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | ## Pipe.score {#score tag="method" new="3"} @@ -246,10 +246,10 @@ Score a batch of examples. > scores = pipe.score(examples) > ``` -| Name | Type | Description | -| ----------- | ------------------- | --------------------------------------------------------- | -| `examples` | `Iterable[Example]` | The examples to score. | -| **RETURNS** | `Dict[str, Any]` | The scores, e.g. produced by the [`Scorer`](/api/scorer). | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------- | +| `examples` | The examples to score. ~~Iterable[Example]~~ | +| **RETURNS** | The scores, e.g. produced by the [`Scorer`](/api/scorer). ~~Dict[str, Union[float, Dict[str, float]]]~~ | ## Pipe.create_optimizer {#create_optimizer tag="method"} @@ -263,26 +263,9 @@ Create an optimizer for the pipeline component. Defaults to > optimizer = pipe.create_optimizer() > ``` -| Name | Type | Description | -| ----------- | --------------------------------------------------- | -------------- | -| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | - -## Pipe.add_label {#add_label tag="method"} - -Add a new label to the pipe. It's possible to extend pretrained models with new -labels, but care should be taken to avoid the "catastrophic forgetting" problem. 
- -> #### Example -> -> ```python -> pipe = nlp.add_pipe("your_custom_pipe") -> pipe.add_label("MY_LABEL") -> ``` - -| Name | Type | Description | -| ----------- | ---- | --------------------------------------------------- | -| `label` | str | The label to add. | -| **RETURNS** | int | `0` if the label is already present, otherwise `1`. | +| Name | Description | +| ----------- | ---------------------------- | +| **RETURNS** | The optimizer. ~~Optimizer~~ | ## Pipe.use_params {#use_params tag="method, contextmanager"} @@ -297,9 +280,26 @@ context, the original parameters are restored. > pipe.to_disk("/best_model") > ``` -| Name | Type | Description | -| -------- | ---- | ----------------------------------------- | -| `params` | dict | The parameter values to use in the model. | +| Name | Description | +| -------- | -------------------------------------------------- | +| `params` | The parameter values to use in the model. ~~dict~~ | + +## Pipe.add_label {#add_label tag="method"} + +Add a new label to the pipe. It's possible to extend pretrained models with new +labels, but care should be taken to avoid the "catastrophic forgetting" problem. + +> #### Example +> +> ```python +> pipe = nlp.add_pipe("your_custom_pipe") +> pipe.add_label("MY_LABEL") +> ``` + +| Name | Description | +| ----------- | ----------------------------------------------------------- | +| `label` | The label to add. ~~str~~ | +| **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ | ## Pipe.to_disk {#to_disk tag="method"} @@ -312,11 +312,11 @@ Serialize the pipe to disk. > pipe.to_disk("/path/to/pipe") > ``` -| Name | Type | Description | -| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. 
| -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | ## Pipe.from_disk {#from_disk tag="method"} @@ -329,12 +329,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > pipe.from_disk("/path/to/pipe") > ``` -| Name | Type | Description | -| -------------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Pipe` | The modified pipe. | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The modified pipe. ~~Pipe~~ | ## Pipe.to_bytes {#to_bytes tag="method"} @@ -347,11 +347,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. 
-| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------- | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the pipe. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The serialized form of the pipe. ~~bytes~~ | ## Pipe.from_bytes {#from_bytes tag="method"} @@ -365,21 +365,21 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > pipe.from_bytes(pipe_bytes) > ``` -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Pipe` | The pipe. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| `bytes_data` | The data to load from. ~~bytes~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The pipe. ~~Pipe~~ | ## Attributes {#attributes} -| Name | Type | Description | -| ------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------- | -| `vocab` | [`Vocab`](/api/vocab) | The shared vocabulary that's passed in on initialization. | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model powering the component. 
| -| `name` | str | The name of the component instance in the pipeline. Can be used in the losses. | -| `cfg` | dict | Keyword arguments passed to [`Pipe.__init__`](/api/pipe#init). Will be serialized with the component. | +| Name | Description | +| ------- | ------------------------------------------------------------------------------------------------------------------------ | +| `vocab` | The shared vocabulary that's passed in on initialization. ~~Vocab~~ | +| `model` | The model powering the component. ~~Model[List[Doc], Any]~~ | +| `name` | The name of the component instance in the pipeline. Can be used in the losses. ~~str~~ | +| `cfg` | Keyword arguments passed to [`Pipe.__init__`](/api/pipe#init). Will be serialized with the component. ~~Dict[str, Any]~~ | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/pipeline-functions.md b/website/docs/api/pipeline-functions.md index 5c2eb2b97..0dc03a16a 100644 --- a/website/docs/api/pipeline-functions.md +++ b/website/docs/api/pipeline-functions.md @@ -33,10 +33,10 @@ all other components. -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------ | -| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. | -| **RETURNS** | `Doc` | The modified `Doc` with merged noun chunks. | +| Name | Description | +| ----------- | -------------------------------------------------------------------- | +| `doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ | +| **RETURNS** | The modified `Doc` with merged noun chunks. ~~Doc~~ | ## merge_entities {#merge_entities tag="function"} @@ -63,10 +63,10 @@ components to the end of the pipeline and after all other components. -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------ | -| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. 
| -| **RETURNS** | `Doc` | The modified `Doc` with merged entities. | +| Name | Description | +| ----------- | -------------------------------------------------------------------- | +| `doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ | +| **RETURNS** | The modified `Doc` with merged entities. ~~Doc~~ | ## merge_subtokens {#merge_subtokens tag="function" new="2.1"} @@ -102,8 +102,8 @@ end of the pipeline and after all other components. -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------ | -| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. | -| `label` | str | The subtoken dependency label. Defaults to `"subtok"`. | -| **RETURNS** | `Doc` | The modified `Doc` with merged subtokens. | +| Name | Description | +| ----------- | -------------------------------------------------------------------- | +| `doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ | +| `label` | The subtoken dependency label. Defaults to `"subtok"`. ~~str~~ | +| **RETURNS** | The modified `Doc` with merged subtokens. ~~Doc~~ | diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index 2f37843a0..1c0895bcf 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -27,9 +27,9 @@ Create a new `Scorer`. > scorer = Scorer(nlp) > ``` -| Name | Type | Description | -| ----- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `nlp` | Language | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. 
| +| Name | Description | +| ----- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `nlp` | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. ~~Language~~ | ## Scorer.score {#score tag="method"} @@ -55,10 +55,10 @@ attribute being scored: > scores = scorer.score(examples) > ``` -| Name | Type | Description | -| ----------- | ------------------- | --------------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | -| **RETURNS** | `Dict` | A dictionary of scores. | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| **RETURNS** | A dictionary of scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ | ## Scorer.score_tokenization {#score_tokenization tag="staticmethod" new="3"} @@ -74,10 +74,10 @@ Scores the tokenization: > scores = Scorer.score_tokenization(examples) > ``` -| Name | Type | Description | -| ----------- | ------------------- | --------------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. 
| -| **RETURNS** | `Dict` | A dictionary containing the scores `token_acc`, `token_p`, `token_r`, `token_f`. | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| **RETURNS** | A dictionary containing the scores `token_acc`, `token_p`, `token_r`, `token_f`. ~~Dict[str, float]~~ | ## Scorer.score_token_attr {#score_token_attr tag="staticmethod" new="3"} @@ -90,18 +90,19 @@ Scores a single token attribute. > print(scores["pos_acc"]) > ``` -| Name | Type | Description | -| -------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | -| `attr` | `str` | The attribute to score. | -| _keyword-only_ | | | -| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. | -| **RETURNS** | `Dict[str, float]` | A dictionary containing the score `{attr}_acc`. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| `attr` | The attribute to score. ~~str~~ | +| _keyword-only_ | | +| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | +| **RETURNS** | A dictionary containing the score `{attr}_acc`.
~~Dict[str, float]~~ | ## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod" new="3"} -Scores a single token attribute per feature for a token attribute in -[UFEATS](https://universaldependencies.org/format.html#morphological-annotation) +Scores a single token attribute per feature for a token attribute in the +Universal Dependencies +[FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. > #### Example @@ -111,13 +112,13 @@ format. > print(scores["morph_per_feat"]) > ``` -| Name | Type | Description | -| -------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | -| `attr` | `str` | The attribute to score. | -| _keyword-only_ | | | -| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. | -| **RETURNS** | `Dict` | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| `attr` | The attribute to score. ~~str~~ | +| _keyword-only_ | | +| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | +| **RETURNS** | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. 
~~Dict[str, Dict[str, float]]~~ | ## Scorer.score_spans {#score_spans tag="staticmethod" new="3"} @@ -130,13 +131,13 @@ Returns PRF scores for labeled or unlabeled spans. > print(scores["ents_f"]) > ``` -| Name | Type | Description | -| -------------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | -| `attr` | `str` | The attribute to score. | -| _keyword-only_ | | | -| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. | -| **RETURNS** | `Dict` | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| `attr` | The attribute to score. ~~str~~ | +| _keyword-only_ | | +| `getter` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~ | +| **RETURNS** | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ | ## Scorer.score_deps {#score_deps tag="staticmethod" new="3"} @@ -159,16 +160,16 @@ Calculate the UAS, LAS, and LAS per type scores for dependency parses. 
> print(scores["dep_uas"], scores["dep_las"]) > ``` -| Name | Type | Description | -| --------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | -| `attr` | `str` | The attribute containing the dependency label. | -| _keyword-only_ | | | -| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. | -| `head_attr` | `str` | The attribute containing the head token. | -| `head_getter` | `callable` | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. | -| `ignore_labels` | `Tuple` | Labels to ignore while scoring (e.g., `punct`). | -| **RETURNS** | `Dict` | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. | +| Name | Description | +| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| `attr` | The attribute to score. ~~str~~ | +| _keyword-only_ | | +| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | +| `head_attr` | The attribute containing the head token. ~~str~~ | +| `head_getter` | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. ~~Callable[[Token, str], Token]~~ | +| `ignore_labels` | Labels to ignore while scoring (e.g. `"punct"`).
~~Iterable[str]~~ | +| **RETURNS** | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ | ## Scorer.score_cats {#score_cats tag="staticmethod" new="3"} @@ -195,13 +196,13 @@ depends on the scorer settings: > print(scores["cats_macro_auc"]) > ``` -| Name | Type | Description | -| ---------------- | ------------------- | ------------------------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | -| `attr` | `str` | The attribute to score. | -| _keyword-only_ | | | -| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. | -| labels | `Iterable[str]` | The set of possible labels. Defaults to `[]`. | -| `multi_label` | `bool` | Whether the attribute allows multiple labels. Defaults to `True`. | -| `positive_label` | `str` | The positive label for a binary task with exclusive classes. Defaults to `None`. | -| **RETURNS** | `Dict` | A dictionary containing the scores, with inapplicable scores as `None`. | +| Name | Description | +| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| `attr` | The attribute to score. ~~str~~ | +| _keyword-only_ | | +| `getter` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. ~~Callable[[Doc, str], Dict[str, float]]~~ | +| `labels` | The set of possible labels. Defaults to `[]`. ~~Iterable[str]~~ | +| `multi_label` | Whether the attribute allows multiple labels. Defaults to `True`.
~~bool~~ | +| `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~ | +| **RETURNS** | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~ | diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md index cefdbea88..06bef32ba 100644 --- a/website/docs/api/sentencerecognizer.md +++ b/website/docs/api/sentencerecognizer.md @@ -29,9 +29,9 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("senter", config=config) > ``` -| Setting | Type | Description | Default | -| ------- | ------------------------------------------ | ----------------- | ----------------------------------- | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [Tagger](/api/architectures#Tagger) | +| Setting | Description | +| ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/senter.pyx @@ -60,11 +60,11 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). -| Name | Type | Description | -| ------- | ------- | ------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. 
| +| Name | Description | +| ------- | -------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | ## SentenceRecognizer.\_\_call\_\_ {#call tag="method"} @@ -85,10 +85,10 @@ and all pipeline components are applied to the `Doc` in order. Both > processed = senter(doc) > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------ | -| `doc` | `Doc` | The document to process. | -| **RETURNS** | `Doc` | The processed document. | +| Name | Description | +| ----------- | -------------------------------- | +| `doc` | The document to process. ~~Doc~~ | +| **RETURNS** | The processed document. ~~Doc~~ | ## SentenceRecognizer.pipe {#pipe tag="method"} @@ -107,12 +107,12 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the > pass > ``` -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------ | -| `stream` | `Iterable[Doc]` | A stream of documents. | -| _keyword-only_ | | | -| `batch_size` | int | The number of texts to buffer. Defaults to `128`. | -| **YIELDS** | `Doc` | Processed documents in the order of the original text. | +| Name | Description | +| -------------- | ------------------------------------------------------------- | +| `stream` | A stream of documents. ~~Iterable[Doc]~~ | +| _keyword-only_ | | +| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | +| **YIELDS** | The processed documents in order. ~~Doc~~ | ## SentenceRecognizer.begin_training {#begin_training tag="method"} @@ -132,13 +132,13 @@ setting up the label scheme based on the data. 
> optimizer = senter.begin_training(lambda: [], pipeline=nlp.pipeline) > ``` -| Name | Type | Description | -| -------------- | --------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------- | -| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. | -| _keyword-only_ | | | -| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/sentencerecognizer#create_optimizer) if not set. | -| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| **RETURNS** | The optimizer. ~~Optimizer~~ | ## SentenceRecognizer.predict {#predict tag="method"} @@ -152,10 +152,10 @@ modifying them. > scores = senter.predict([doc1, doc2]) > ``` -| Name | Type | Description | -| ----------- | --------------- | ----------------------------------------- | -| `docs` | `Iterable[Doc]` | The documents to predict. | -| **RETURNS** | - | The model's prediction for each document. 
| +| Name | Description | +| ----------- | ------------------------------------------- | +| `docs` | The documents to predict. ~~Iterable[Doc]~~ | +| **RETURNS** | The model's prediction for each document. | ## SentenceRecognizer.set_annotations {#set_annotations tag="method"} @@ -169,10 +169,10 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores. > senter.set_annotations([doc1, doc2], scores) > ``` -| Name | Type | Description | -| -------- | --------------- | ------------------------------------------------------------ | -| `docs` | `Iterable[Doc]` | The documents to modify. | -| `scores` | - | The scores to set, produced by `SentenceRecognizer.predict`. | +| Name | Description | +| -------- | ------------------------------------------------------------ | +| `docs` | The documents to modify. ~~Iterable[Doc]~~ | +| `scores` | The scores to set, produced by `SentenceRecognizer.predict`. | ## SentenceRecognizer.update {#update tag="method"} @@ -189,15 +189,15 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and > losses = senter.update(examples, sgd=optimizer) > ``` -| Name | Type | Description | -| ----------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ | -| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | -| _keyword-only_ | | | -| `drop` | float | The dropout rate. | -| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/sentencerecognizer#set_annotations). | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | -| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. | -| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. 
| +| Name | Description | +| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## SentenceRecognizer.rehearse {#rehearse tag="method,experimental" new="3"} @@ -213,14 +213,14 @@ the "catastrophic forgetting" problem. This feature is experimental. > losses = senter.rehearse(examples, sgd=optimizer) > ``` -| Name | Type | Description | -| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | -| _keyword-only_ | | | -| `drop` | float | The dropout rate. | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | -| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | -| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from.
~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## SentenceRecognizer.get_loss {#get_loss tag="method"} @@ -235,11 +235,11 @@ predicted scores. > loss, d_loss = senter.get_loss(examples, scores) > ``` -| Name | Type | Description | -| ----------- | --------------------- | --------------------------------------------------- | -| `examples` | `Iterable[Example]` | The batch of examples. | -| `scores` | - | Scores representing the model's predictions. | -| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. | +| Name | Description | +| ----------- | --------------------------------------------------------------------------- | +| `examples` | The batch of examples. ~~Iterable[Example]~~ | +| `scores` | Scores representing the model's predictions. | +| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | ## SentenceRecognizer.score {#score tag="method" new="3"} @@ -251,10 +251,10 @@ Score a batch of examples. > scores = senter.score(examples) > ``` -| Name | Type | Description | -| ----------- | ------------------- | ------------------------------------------------------------------------ | -| `examples` | `Iterable[Example]` | The examples to score. | -| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). | +| Name | Description | +| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The examples to score.
~~Iterable[Example]~~ | +| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, float]~~ | ## SentenceRecognizer.create_optimizer {#create_optimizer tag="method"} @@ -267,9 +267,9 @@ Create an optimizer for the pipeline component. > optimizer = senter.create_optimizer() > ``` -| Name | Type | Description | -| ----------- | --------------------------------------------------- | -------------- | -| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| Name | Description | +| ----------- | ---------------------------- | +| **RETURNS** | The optimizer. ~~Optimizer~~ | ## SentenceRecognizer.use_params {#use_params tag="method, contextmanager"} @@ -284,9 +284,9 @@ context, the original parameters are restored. > senter.to_disk("/best_model") > ``` -| Name | Type | Description | -| -------- | ---- | ----------------------------------------- | -| `params` | dict | The parameter values to use in the model. | +| Name | Description | +| -------- | -------------------------------------------------- | +| `params` | The parameter values to use in the model. ~~dict~~ | ## SentenceRecognizer.to_disk {#to_disk tag="method"} @@ -299,11 +299,11 @@ Serialize the pipe to disk. > senter.to_disk("/path/to/senter") > ``` -| Name | Type | Description | -| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude.
| +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | ## SentenceRecognizer.from_disk {#from_disk tag="method"} @@ -316,12 +316,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > senter.from_disk("/path/to/senter") > ``` -| Name | Type | Description | -| -------------- | -------------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `SentenceRecognizer` | The modified `SentenceRecognizer` object. | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The modified `SentenceRecognizer` object. ~~SentenceRecognizer~~ | ## SentenceRecognizer.to_bytes {#to_bytes tag="method"} @@ -334,11 +334,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. 
-| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------- | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `SentenceRecognizer` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The serialized form of the `SentenceRecognizer` object. ~~bytes~~ | ## SentenceRecognizer.from_bytes {#from_bytes tag="method"} @@ -352,12 +352,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > senter.from_bytes(senter_bytes) > ``` -| Name | Type | Description | -| -------------- | -------------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `SentenceRecognizer` | The `SentenceRecognizer` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| `bytes_data` | The data to load from. ~~bytes~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The `SentenceRecognizer` object. 
~~SentenceRecognizer~~ | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md index 198215cfa..8104b1151 100644 --- a/website/docs/api/sentencizer.md +++ b/website/docs/api/sentencizer.md @@ -28,9 +28,9 @@ how the component should be configured. You can override its settings via the > nlp.add_pipe("entity_ruler", config=config) > ``` -| Setting | Type | Description | Default | -| ------------- | ----------- | ---------------------------------------------------------------------------------------------------------- | ------- | -| `punct_chars` | `List[str]` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults if not set. | `None` | +| Setting | Description | +| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults if not set. Defaults to `None`. ~~Optional[List[str]]~~ | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/sentencizer.pyx @@ -51,10 +51,10 @@ Initialize the sentencizer. > sentencizer = Sentencizer() > ``` -| Name | Type | Description | -| -------------- | ----------- | ----------------------------------------------------------------------------------------------- | -| _keyword-only_ | | | -| `punct_chars` | `List[str]` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults. | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults.
~~Optional[List[str]]~~ | ```python ### punct_chars defaults @@ -87,10 +87,10 @@ the component has been added to the pipeline using > assert len(list(doc.sents)) == 2 > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------ | -| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. | -| **RETURNS** | `Doc` | The modified `Doc` with added sentence boundaries. | +| Name | Description | +| ----------- | -------------------------------------------------------------------- | +| `doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ | +| **RETURNS** | The modified `Doc` with added sentence boundaries. ~~Doc~~ | ## Sentencizer.pipe {#pipe tag="method"} @@ -106,12 +106,12 @@ applied to the `Doc` in order. > pass > ``` -| Name | Type | Description | -| -------------- | --------------- | ----------------------------------------------------- | -| `stream` | `Iterable[Doc]` | A stream of documents. | -| _keyword-only_ | | | -| `batch_size` | int | The number of documents to buffer. Defaults to `128`. | -| **YIELDS** | `Doc` | The processed documents in order. | +| Name | Description | +| -------------- | ------------------------------------------------------------- | +| `stream` | A stream of documents. ~~Iterable[Doc]~~ | +| _keyword-only_ | | +| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | +| **YIELDS** | The processed documents in order. ~~Doc~~ | ## Sentencizer.score {#score tag="method" new="3"} @@ -123,10 +123,10 @@ Score a batch of examples. > scores = sentencizer.score(examples) > ``` -| Name | Type | Description | -| ----------- | ------------------- | ------------------------------------------------------------------------ | -| `examples` | `Iterable[Example]` | The examples to score. | -| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). 
| +| Name | Description | +| ----------- | --------------------------------------------------------------------------------------------------------------------- | +| `examples` | The examples to score. ~~Iterable[Example]~~ | +| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]]~~ | ## Sentencizer.to_disk {#to_disk tag="method"} @@ -142,9 +142,9 @@ a file `sentencizer.json`. This also happens automatically when you save an > sentencizer.to_disk("/path/to/sentencizer.json") > ``` -| Name | Type | Description | -| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a JSON file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Description | +| ------ | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a JSON file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | ## Sentencizer.from_disk {#from_disk tag="method"} @@ -159,10 +159,10 @@ added to its pipeline. > sentencizer.from_disk("/path/to/sentencizer.json") > ``` -| Name | Type | Description | -| ----------- | ------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. | +| Name | Description | +| ----------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| **RETURNS** | The modified `Sentencizer` object.
~~Sentencizer~~ | ## Sentencizer.to_bytes {#to_bytes tag="method"} @@ -176,9 +176,9 @@ Serialize the sentencizer settings to a bytestring. > sentencizer_bytes = sentencizer.to_bytes() > ``` -| Name | Type | Description | -| ----------- | ----- | -------------------- | -| **RETURNS** | bytes | The serialized data. | +| Name | Description | +| ----------- | ------------------------------ | +| **RETURNS** | The serialized data. ~~bytes~~ | ## Sentencizer.from_bytes {#from_bytes tag="method"} @@ -192,7 +192,7 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > sentencizer.from_bytes(sentencizer_bytes) > ``` -| Name | Type | Description | -| ------------ | ------------- | ---------------------------------- | -| `bytes_data` | bytes | The bytestring to load. | -| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. | +| Name | Description | +| ------------ | -------------------------------------------------- | +| `bytes_data` | The bytestring to load. ~~bytes~~ | +| **RETURNS** | The modified `Sentencizer` object. ~~Sentencizer~~ | diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 9237b5538..1c7bc9592 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -18,14 +18,14 @@ Create a Span object from the slice `doc[start : end]`. > assert [t.text for t in span] == ["it", "back", "!"] > ``` -| Name | Type | Description | -| -------- | ---------------------------------------- | --------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The parent document. | -| `start` | int | The index of the first token of the span. | -| `end` | int | The index of the first token after the span. | -| `label` | int / str | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a string. | -| `kb_id` | int / str | A knowledge base ID to attach to the span, e.g. for named entities. 
The ID can be an integer or a string. | -| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. | +| Name | Description | +| -------- | --------------------------------------------------------------------------------------- | +| `doc` | The parent document. ~~Doc~~ | +| `start` | The index of the first token of the span. ~~int~~ | +| `end` | The index of the first token after the span. ~~int~~ | +| `label` | A label to attach to the span, e.g. for named entities. ~~Union[str, int]~~ | +| `kb_id` | A knowledge base ID to attach to the span, e.g. for named entities. ~~Union[str, int]~~ | +| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | ## Span.\_\_getitem\_\_ {#getitem tag="method"} @@ -39,10 +39,10 @@ Get a `Token` object. > assert span[1].text == "back" > ``` -| Name | Type | Description | -| ----------- | ------- | --------------------------------------- | -| `i` | int | The index of the token within the span. | -| **RETURNS** | `Token` | The token at `span[i]`. | +| Name | Description | +| ----------- | ----------------------------------------------- | +| `i` | The index of the token within the span. ~~int~~ | +| **RETURNS** | The token at `span[i]`. ~~Token~~ | Get a `Span` object. @@ -54,10 +54,10 @@ Get a `Span` object. > assert span[1:3].text == "back!" > ``` -| Name | Type | Description | -| ----------- | ------ | -------------------------------- | -| `start_end` | tuple | The slice of the span to get. | -| **RETURNS** | `Span` | The span at `span[start : end]`. | +| Name | Description | +| ----------- | ------------------------------------------------- | +| `start_end` | The slice of the span to get. ~~Tuple[int, int]~~ | +| **RETURNS** | The span at `span[start : end]`. ~~Span~~ | ## Span.\_\_iter\_\_ {#iter tag="method"} @@ -71,9 +71,9 @@ Iterate over `Token` objects. 
> assert [t.text for t in span] == ["it", "back", "!"] > ``` -| Name | Type | Description | -| ---------- | ------- | ----------------- | -| **YIELDS** | `Token` | A `Token` object. | +| Name | Description | +| ---------- | --------------------------- | +| **YIELDS** | A `Token` object. ~~Token~~ | ## Span.\_\_len\_\_ {#len tag="method"} @@ -87,9 +87,9 @@ Get the number of tokens in the span. > assert len(span) == 3 > ``` -| Name | Type | Description | -| ----------- | ---- | --------------------------------- | -| **RETURNS** | int | The number of tokens in the span. | +| Name | Description | +| ----------- | ----------------------------------------- | +| **RETURNS** | The number of tokens in the span. ~~int~~ | ## Span.set_extension {#set_extension tag="classmethod" new="2"} @@ -107,14 +107,14 @@ For details, see the documentation on > assert doc[1:4]._.has_city > ``` -| Name | Type | Description | -| --------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `name` | str | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `span._.my_attr`. | -| `default` | - | Optional default value of the attribute if no getter or method is defined. | -| `method` | callable | Set a custom method on the object, for example `span._.compare(other_span)`. | -| `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. | -| `setter` | callable | Setter function that takes the `Span` and a value, and modifies the object. Is called when the user writes to the `Span._` attribute. | -| `force` | bool | Force overwriting existing attribute. 
| +| Name | Description | +| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `span._.my_attr`. ~~str~~ | +| `default` | Optional default value of the attribute if no getter or method is defined. ~~Optional[Any]~~ | +| `method` | Set a custom method on the object, for example `span._.compare(other_span)`. ~~Optional[Callable[[Span, ...], Any]]~~ | +| `getter` | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. ~~Optional[Callable[[Span], Any]]~~ | +| `setter` | Setter function that takes the `Span` and a value, and modifies the object. Is called when the user writes to the `Span._` attribute. ~~Optional[Callable[[Span, Any], None]]~~ | +| `force` | Force overwriting existing attribute. ~~bool~~ | ## Span.get_extension {#get_extension tag="classmethod" new="2"} @@ -131,10 +131,10 @@ Look up a previously registered extension by name. Returns a 4-tuple > assert extension == (False, None, None, None) > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------- | -| `name` | str | Name of the extension. | -| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | Name of the extension. ~~str~~ | +| **RETURNS** | A `(default, method, getter, setter)` tuple of the extension. 
~~Tuple[Optional[Any], Optional[Callable], Optional[Callable], Optional[Callable]]~~ | ## Span.has_extension {#has_extension tag="classmethod" new="2"} @@ -148,10 +148,10 @@ Check whether an extension has been registered on the `Span` class. > assert Span.has_extension("is_city") > ``` -| Name | Type | Description | -| ----------- | ---- | ------------------------------------------ | -| `name` | str | Name of the extension to check. | -| **RETURNS** | bool | Whether the extension has been registered. | +| Name | Description | +| ----------- | --------------------------------------------------- | +| `name` | Name of the extension to check. ~~str~~ | +| **RETURNS** | Whether the extension has been registered. ~~bool~~ | ## Span.remove_extension {#remove_extension tag="classmethod" new="2.0.12"} @@ -166,10 +166,10 @@ Remove a previously registered extension. > assert not Span.has_extension("is_city") > ``` -| Name | Type | Description | -| ----------- | ----- | --------------------------------------------------------------------- | -| `name` | str | Name of the extension. | -| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | +| Name | Description | +| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | Name of the extension. ~~str~~ | +| **RETURNS** | A `(default, method, getter, setter)` tuple of the removed extension. ~~Tuple[Optional[Any], Optional[Callable], Optional[Callable], Optional[Callable]]~~ | ## Span.char_span {#char_span tag="method" new="2.2.4"} @@ -184,14 +184,14 @@ the character indices don't map to a valid span. > assert span.text == "New York" > ``` -| Name | Type | Description | -| ----------- | ---------------------------------------- | --------------------------------------------------------------------- | -| `start` | int | The index of the first character of the span. 
| -| `end` | int | The index of the last character after the span. | -| `label` | uint64 / str | A label to attach to the span, e.g. for named entities. | -| `kb_id` | uint64 / str | An ID from a knowledge base to capture the meaning of a named entity. | -| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. | -| **RETURNS** | `Span` | The newly constructed object or `None`. | +| Name | Description | +| ------------------------------------ | ----------------------------------------------------------------------------------------- | +| `start` | The index of the first character of the span. ~~int~~ | +| `end` | The index of the last character after the span. ~~int~~ | +| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | +| `kb_id` <Tag variant="new">2.2</Tag> | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | +| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | +| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | ## Span.similarity {#similarity tag="method" model="vectors"} @@ -209,10 +209,10 @@ using an average of word vectors. > assert apples_oranges == oranges_apples > ``` -| Name | Type | Description | -| ----------- | ----- | -------------------------------------------------------------------------------------------- | -| `other` | - | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. | -| **RETURNS** | float | A scalar similarity score. Higher is more similar. | +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------------------------------------------------- | +| `other` | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. ~~Union[Doc, Span, Token, Lexeme]~~ | +| **RETURNS** | A scalar similarity score. Higher is more similar.
~~float~~ | ## Span.get_lca_matrix {#get_lca_matrix tag="method"} @@ -229,9 +229,9 @@ ancestor is found, e.g. if span excludes a necessary ancestor. > # array([[0, 0, 0], [0, 1, 2], [0, 2, 2]], dtype=int32) > ``` -| Name | Type | Description | -| ----------- | -------------------------------------- | ------------------------------------------------ | -| **RETURNS** | `numpy.ndarray[ndim=2, dtype="int32"]` | The lowest common ancestor matrix of the `Span`. | +| Name | Description | +| ----------- | --------------------------------------------------------------------------------------- | +| **RETURNS** | The lowest common ancestor matrix of the `Span`. ~~numpy.ndarray[ndim=2, dtype=int32]~~ | ## Span.to_array {#to_array tag="method" new="2"} @@ -249,10 +249,10 @@ shape `(N, M)`, where `N` is the length of the document. The values will be > np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) > ``` -| Name | Type | Description | -| ----------- | ----------------------------- | -------------------------------------------------------------------------------------------------------- | -| `attr_ids` | list | A list of attribute ID ints. | -| **RETURNS** | `numpy.ndarray[long, ndim=2]` | A feature matrix, with one row per word, and one column per attribute indicated in the input `attr_ids`. | +| Name | Description | +| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------- | +| `attr_ids` | A list of attributes (int IDs or string names) or a single attribute (int ID or string name). ~~Union[int, str, List[Union[int, str]]]~~ | +| **RETURNS** | The exported attributes as a numpy array. ~~Union[numpy.ndarray[ndim=2, dtype=uint64], numpy.ndarray[ndim=1, dtype=uint64]]~~ | ## Span.ents {#ents tag="property" new="2.0.13" model="ner"} @@ -270,9 +270,9 @@ if the entity recognizer has been applied. > assert ents[0].text == "Mr. 
Best" > ``` -| Name | Type | Description | -| ----------- | ----- | -------------------------------------------- | -| **RETURNS** | tuple | Entities in the span, one `Span` per entity. | +| Name | Description | +| ----------- | ----------------------------------------------------------------- | +| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ | ## Span.as_doc {#as_doc tag="method"} @@ -287,10 +287,10 @@ Create a new `Doc` object corresponding to the `Span`, with a copy of the data. > assert doc2.text == "New York" > ``` -| Name | Type | Description | -| ---------------- | ----- | ---------------------------------------------------- | -| `copy_user_data` | bool | Whether or not to copy the original doc's user data. | -| **RETURNS** | `Doc` | A `Doc` object of the `Span`'s content. | +| Name | Description | +| ---------------- | ------------------------------------------------------------- | +| `copy_user_data` | Whether or not to copy the original doc's user data. ~~bool~~ | +| **RETURNS** | A `Doc` object of the `Span`'s content. ~~Doc~~ | ## Span.root {#root tag="property" model="parser"} @@ -309,9 +309,9 @@ taken. > assert new_york.root.text == "York" > ``` -| Name | Type | Description | -| ----------- | ------- | --------------- | -| **RETURNS** | `Token` | The root token. | +| Name | Description | +| ----------- | ------------------------- | +| **RETURNS** | The root token. ~~Token~~ | ## Span.conjuncts {#conjuncts tag="property" model="parser"} @@ -325,9 +325,9 @@ A tuple of tokens coordinated to `span.root`. > assert [t.text for t in apples_conjuncts] == ["oranges"] > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------- | -| **RETURNS** | `tuple` | The coordinated tokens. | +| Name | Description | +| ----------- | --------------------------------------------- | +| **RETURNS** | The coordinated tokens. 
~~Tuple[Token, ...]~~ | ## Span.lefts {#lefts tag="property" model="parser"} @@ -341,9 +341,9 @@ Tokens that are to the left of the span, whose heads are within the span. > assert lefts == ["New"] > ``` -| Name | Type | Description | -| ---------- | ------- | ------------------------------------ | -| **YIELDS** | `Token` | A left-child of a token of the span. | +| Name | Description | +| ---------- | ---------------------------------------------- | +| **YIELDS** | A left-child of a token of the span. ~~Token~~ | ## Span.rights {#rights tag="property" model="parser"} @@ -357,9 +357,9 @@ Tokens that are to the right of the span, whose heads are within the span. > assert rights == ["in"] > ``` -| Name | Type | Description | -| ---------- | ------- | ------------------------------------- | -| **YIELDS** | `Token` | A right-child of a token of the span. | +| Name | Description | +| ---------- | ----------------------------------------------- | +| **YIELDS** | A right-child of a token of the span. ~~Token~~ | ## Span.n_lefts {#n_lefts tag="property" model="parser"} @@ -373,9 +373,9 @@ the span. > assert doc[3:7].n_lefts == 1 > ``` -| Name | Type | Description | -| ----------- | ---- | -------------------------------- | -| **RETURNS** | int | The number of left-child tokens. | +| Name | Description | +| ----------- | ---------------------------------------- | +| **RETURNS** | The number of left-child tokens. ~~int~~ | ## Span.n_rights {#n_rights tag="property" model="parser"} @@ -389,9 +389,9 @@ the span. > assert doc[2:4].n_rights == 1 > ``` -| Name | Type | Description | -| ----------- | ---- | --------------------------------- | -| **RETURNS** | int | The number of right-child tokens. | +| Name | Description | +| ----------- | ----------------------------------------- | +| **RETURNS** | The number of right-child tokens. ~~int~~ | ## Span.subtree {#subtree tag="property" model="parser"} @@ -405,9 +405,9 @@ Tokens within the span and tokens which descend from them. 
> assert subtree == ["Give", "it", "back", "!"] > ``` -| Name | Type | Description | -| ---------- | ------- | ------------------------------------------------- | -| **YIELDS** | `Token` | A token within the span, or a descendant from it. | +| Name | Description | +| ---------- | ----------------------------------------------------------- | +| **YIELDS** | A token within the span, or a descendant from it. ~~Token~~ | ## Span.has_vector {#has_vector tag="property" model="vectors"} @@ -420,9 +420,9 @@ A boolean value indicating whether a word vector is associated with the object. > assert doc[1:].has_vector > ``` -| Name | Type | Description | -| ----------- | ---- | -------------------------------------------- | -| **RETURNS** | bool | Whether the span has a vector data attached. | +| Name | Description | +| ----------- | ----------------------------------------------------- | +| **RETURNS** | Whether the span has a vector data attached. ~~bool~~ | ## Span.vector {#vector tag="property" model="vectors"} @@ -437,9 +437,9 @@ vectors. > assert doc[1:].vector.shape == (300,) > ``` -| Name | Type | Description | -| ----------- | ---------------------------------------- | --------------------------------------------------- | -| **RETURNS** | `numpy.ndarray[ndim=1, dtype="float32"]` | A 1D numpy array representing the span's semantics. | +| Name | Description | +| ----------- | ----------------------------------------------------------------------------------------------- | +| **RETURNS** | A 1-dimensional array representing the span's vector. ~~`numpy.ndarray[ndim=1, dtype=float32]~~ | ## Span.vector_norm {#vector_norm tag="property" model="vectors"} @@ -454,31 +454,31 @@ The L2 norm of the span's vector representation. > assert doc[1:].vector_norm != doc[2:].vector_norm > ``` -| Name | Type | Description | -| ----------- | ----- | ----------------------------------------- | -| **RETURNS** | float | The L2 norm of the vector representation. 
| +| Name | Description | +| ----------- | --------------------------------------------------- | +| **RETURNS** | The L2 norm of the vector representation. ~~float~~ | ## Attributes {#attributes} -| Name | Type | Description | -| --------------------------------------- | ------------ | -------------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The parent document. | -| `tensor` 2.1.7 | `ndarray` | The span's slice of the parent `Doc`'s tensor. | -| `sent` | `Span` | The sentence span that this span is a part of. | -| `start` | int | The token offset for the start of the span. | -| `end` | int | The token offset for the end of the span. | -| `start_char` | int | The character offset for the start of the span. | -| `end_char` | int | The character offset for the end of the span. | -| `text` | str | A string representation of the span text. | -| `text_with_ws` | str | The text content of the span with a trailing whitespace character if the last token has one. | -| `orth` | int | ID of the verbatim text content. | -| `orth_` | str | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. | -| `label` | int | The hash value of the span's label. | -| `label_` | str | The span's label. | -| `lemma_` | str | The span's lemma. | -| `kb_id` | int | The hash value of the knowledge base ID referred to by the span. | -| `kb_id_` | str | The knowledge base ID referred to by the span. | -| `ent_id` | int | The hash value of the named entity the token is an instance of. | -| `ent_id_` | str | The string ID of the named entity the token is an instance of. | -| `sentiment` | float | A scalar value indicating the positivity or negativity of the span. | -| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). 
| +| Name | Description | +| --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- | +| `doc` | The parent document. ~~Doc~~ | +| `tensor` 2.1.7 | The span's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~ | +| `sent` | The sentence span that this span is a part of. ~~Span~~ | +| `start` | The token offset for the start of the span. ~~int~~ | +| `end` | The token offset for the end of the span. ~~int~~ | +| `start_char` | The character offset for the start of the span. ~~int~~ | +| `end_char` | The character offset for the end of the span. ~~int~~ | +| `text` | A string representation of the span text. ~~str~~ | +| `text_with_ws` | The text content of the span with a trailing whitespace character if the last token has one. ~~str~~ | +| `orth` | ID of the verbatim text content. ~~int~~ | +| `orth_` | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. ~~str~~ | +| `label` | The hash value of the span's label. ~~int~~ | +| `label_` | The span's label. ~~str~~ | +| `lemma_` | The span's lemma. Equivalent to `"".join(token.text_with_ws for token in span)`. ~~str~~ | +| `kb_id` | The hash value of the knowledge base ID referred to by the span. ~~int~~ | +| `kb_id_` | The knowledge base ID referred to by the span. ~~str~~ | +| `ent_id` | The hash value of the named entity the token is an instance of. ~~int~~ | +| `ent_id_` | The string ID of the named entity the token is an instance of. ~~str~~ | +| `sentiment` | A scalar value indicating the positivity or negativity of the span. ~~float~~ | +| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). 
~~Underscore~~ | diff --git a/website/docs/api/stringstore.md b/website/docs/api/stringstore.md index b66d755ed..d5f78dbab 100644 --- a/website/docs/api/stringstore.md +++ b/website/docs/api/stringstore.md @@ -19,9 +19,9 @@ Create the `StringStore`. > stringstore = StringStore(["apple", "orange"]) > ``` -| Name | Type | Description | -| --------- | -------- | ------------------------------------------ | -| `strings` | iterable | A sequence of strings to add to the store. | +| Name | Description | +| --------- | ---------------------------------------------------------------------- | +| `strings` | A sequence of strings to add to the store. ~~Optional[Iterable[str]]~~ | ## StringStore.\_\_len\_\_ {#len tag="method"} @@ -34,9 +34,9 @@ Get the number of strings in the store. > assert len(stringstore) == 2 > ``` -| Name | Type | Description | -| ----------- | ---- | ----------------------------------- | -| **RETURNS** | int | The number of strings in the store. | +| Name | Description | +| ----------- | ------------------------------------------- | +| **RETURNS** | The number of strings in the store. ~~int~~ | ## StringStore.\_\_getitem\_\_ {#getitem tag="method"} @@ -51,10 +51,10 @@ Retrieve a string from a given hash, or vice versa. > assert stringstore[apple_hash] == "apple" > ``` -| Name | Type | Description | -| -------------- | -------------------- | -------------------------- | -| `string_or_id` | bytes, str or uint64 | The value to encode. | -| **RETURNS** | str or int | The value to be retrieved. | +| Name | Description | +| -------------- | ----------------------------------------------- | +| `string_or_id` | The value to encode. ~~Union[bytes, str, int]~~ | +| **RETURNS** | The value to be retrieved. ~~Union[str, int]~~ | ## StringStore.\_\_contains\_\_ {#contains tag="method"} @@ -68,15 +68,15 @@ Check whether a string is in the store. 
> assert not "cherry" in stringstore > ``` -| Name | Type | Description | -| ----------- | ---- | -------------------------------------- | -| `string` | str | The string to check. | -| **RETURNS** | bool | Whether the store contains the string. | +| Name | Description | +| ----------- | ----------------------------------------------- | +| `string` | The string to check. ~~str~~ | +| **RETURNS** | Whether the store contains the string. ~~bool~~ | ## StringStore.\_\_iter\_\_ {#iter tag="method"} Iterate over the strings in the store, in order. Note that a newly initialized -store will always include an empty string `''` at position `0`. +store will always include an empty string `""` at position `0`. > #### Example > @@ -86,9 +86,9 @@ store will always include an empty string `''` at position `0`. > assert all_strings == ["apple", "orange"] > ``` -| Name | Type | Description | -| ---------- | ---- | ---------------------- | -| **YIELDS** | str | A string in the store. | +| Name | Description | +| ---------- | ------------------------------ | +| **YIELDS** | A string in the store. ~~str~~ | ## StringStore.add {#add tag="method" new="2"} @@ -105,10 +105,10 @@ Add a string to the `StringStore`. > assert stringstore["banana"] == banana_hash > ``` -| Name | Type | Description | -| ----------- | ------ | ------------------------ | -| `string` | str | The string to add. | -| **RETURNS** | uint64 | The string's hash value. | +| Name | Description | +| ----------- | -------------------------------- | +| `string` | The string to add. ~~str~~ | +| **RETURNS** | The string's hash value. ~~int~~ | ## StringStore.to_disk {#to_disk tag="method" new="2"} @@ -120,9 +120,9 @@ Save the current state to a directory. 
> stringstore.to_disk("/path/to/strings") > ``` -| Name | Type | Description | -| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Description | +| ------ | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | ## StringStore.from_disk {#from_disk tag="method" new="2"} @@ -135,10 +135,10 @@ Loads state from a directory. Modifies the object in place and returns it. > stringstore = StringStore().from_disk("/path/to/strings") > ``` -| Name | Type | Description | -| ----------- | ------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `StringStore` | The modified `StringStore` object. | +| Name | Description | +| ----------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| **RETURNS** | The modified `StringStore` object. ~~StringStore~~ | ## StringStore.to_bytes {#to_bytes tag="method"} @@ -150,9 +150,9 @@ Serialize the current state to a binary string. > store_bytes = stringstore.to_bytes() > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------ | -| **RETURNS** | bytes | The serialized form of the `StringStore` object. 
| +| Name | Description | +| ----------- | ---------------------------------------------------------- | +| **RETURNS** | The serialized form of the `StringStore` object. ~~bytes~~ | ## StringStore.from_bytes {#from_bytes tag="method"} @@ -166,10 +166,10 @@ Load state from a binary string. > new_store = StringStore().from_bytes(store_bytes) > ``` -| Name | Type | Description | -| ------------ | ------------- | ------------------------- | -| `bytes_data` | bytes | The data to load from. | -| **RETURNS** | `StringStore` | The `StringStore` object. | +| Name | Description | +| ------------ | ----------------------------------------- | +| `bytes_data` | The data to load from. ~~bytes~~ | +| **RETURNS** | The `StringStore` object. ~~StringStore~~ | ## Utilities {#util} @@ -184,7 +184,7 @@ Get a 64-bit hash for a given string. > assert hash_string("apple") == 8566208034543834098 > ``` -| Name | Type | Description | -| ----------- | ------ | ------------------- | -| `string` | str | The string to hash. | -| **RETURNS** | uint64 | The hash. | +| Name | Description | +| ----------- | --------------------------- | +| `string` | The string to hash. ~~str~~ | +| **RETURNS** | The hash. ~~int~~ | diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index 9761dea15..b255b2261 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -28,10 +28,10 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("tagger", config=config) > ``` -| Setting | Type | Description | Default | -| ---------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------- | -| `set_morphology` | bool | Whether to set morphological features. 
| `False` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). | [Tagger](/api/architectures#Tagger) | +| Setting | Description | +| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `set_morphology` | Whether to set morphological features. Defaults to `False`. ~~bool~~ | +| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tagger.pyx @@ -58,13 +58,13 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). -| Name | Type | Description | -| ---------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). 
| -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | -| _keyword-only_ | | | -| `set_morphology` | bool | Whether to set morphological features. | +| Name | Description | +| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `set_morphology` | Whether to set morphological features. ~~bool~~ | ## Tagger.\_\_call\_\_ {#call tag="method"} @@ -84,10 +84,10 @@ and all pipeline components are applied to the `Doc` in order. Both > processed = tagger(doc) > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------ | -| `doc` | `Doc` | The document to process. | -| **RETURNS** | `Doc` | The processed document. | +| Name | Description | +| ----------- | -------------------------------- | +| `doc` | The document to process. ~~Doc~~ | +| **RETURNS** | The processed document. ~~Doc~~ | ## Tagger.pipe {#pipe tag="method"} @@ -105,12 +105,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and > pass > ``` -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------ | -| `stream` | `Iterable[Doc]` | A stream of documents. | -| _keyword-only_ | | | -| `batch_size` | int | The number of texts to buffer. Defaults to `128`. 
| -| **YIELDS** | `Doc` | Processed documents in the order of the original text. | +| Name | Description | +| -------------- | ------------------------------------------------------------- | +| `stream` | A stream of documents. ~~Iterable[Doc]~~ | +| _keyword-only_ | | +| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | +| **YIELDS** | The processed documents in order. ~~Doc~~ | ## Tagger.begin_training {#begin_training tag="method"} @@ -130,13 +130,13 @@ setting up the label scheme based on the data. > optimizer = tagger.begin_training(lambda: [], pipeline=nlp.pipeline) > ``` -| Name | Type | Description | -| -------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- | -| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. | -| _keyword-only_ | | | -| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/tagger#create_optimizer) if not set. | -| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. 
~~Optional[Optimizer]~~ | +| **RETURNS** | The optimizer. ~~Optimizer~~ | ## Tagger.predict {#predict tag="method"} @@ -150,10 +150,10 @@ modifying them. > scores = tagger.predict([doc1, doc2]) > ``` -| Name | Type | Description | -| ----------- | --------------- | ----------------------------------------- | -| `docs` | `Iterable[Doc]` | The documents to predict. | -| **RETURNS** | - | The model's prediction for each document. | +| Name | Description | +| ----------- | ------------------------------------------- | +| `docs` | The documents to predict. ~~Iterable[Doc]~~ | +| **RETURNS** | The model's prediction for each document. | ## Tagger.set_annotations {#set_annotations tag="method"} @@ -167,10 +167,10 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores. > tagger.set_annotations([doc1, doc2], scores) > ``` -| Name | Type | Description | -| -------- | --------------- | ------------------------------------------------ | -| `docs` | `Iterable[Doc]` | The documents to modify. | -| `scores` | - | The scores to set, produced by `Tagger.predict`. | +| Name | Description | +| -------- | ------------------------------------------------ | +| `docs` | The documents to modify. ~~Iterable[Doc]~~ | +| `scores` | The scores to set, produced by `Tagger.predict`. | ## Tagger.update {#update tag="method"} @@ -187,15 +187,15 @@ Delegates to [`predict`](/api/tagger#predict) and > losses = tagger.update(examples, sgd=optimizer) > ``` -| Name | Type | Description | -| ----------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | -| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | -| _keyword-only_ | | | -| `drop` | float | The dropout rate. 
| -| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/tagger#set_annotations). | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | -| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. | -| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | +| Name | Description | +| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | | +| `drop` | The dropout rate. ~~float~~ | +| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## Tagger.rehearse {#rehearse tag="method,experimental" new="3"} @@ -211,14 +211,14 @@ the "catastrophic forgetting" problem. This feature is experimental. > losses = tagger.rehearse(examples, sgd=optimizer) > ``` -| Name | Type | Description | -| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | -| _keyword-only_ | | | -| `drop` | float | The dropout rate. | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. 
| -| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | -| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## Tagger.get_loss {#get_loss tag="method"} @@ -233,11 +233,11 @@ predicted scores. > loss, d_loss = tagger.get_loss(examples, scores) > ``` -| Name | Type | Description | -| ----------- | --------------------- | --------------------------------------------------- | -| `examples` | `Iterable[Example]` | The batch of examples. | -| `scores` | - | Scores representing the model's predictions. | -| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. | +| Name | Description | +| ----------- | --------------------------------------------------------------------------- | +| `examples` | The batch of examples. ~~Iterable[Example]~~ | +| `scores` | Scores representing the model's predictions. | +| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | ## Tagger.score {#score tag="method" new="3"} @@ -249,10 +249,10 @@ Score a batch of examples. 
> scores = tagger.score(examples) > ``` -| Name | Type | Description | -| ----------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | -| `examples` | `Iterable[Example]` | The examples to score. | -| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"`, `"tag"` and `"lemma"`. | +| Name | Description | +| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | The examples to score. ~~Iterable[Example]~~ | +| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"`, `"tag"` and `"lemma"`. ~~Dict[str, float]~~ | ## Tagger.create_optimizer {#create_optimizer tag="method"} @@ -265,9 +265,9 @@ Create an optimizer for the pipeline component. > optimizer = tagger.create_optimizer() > ``` -| Name | Type | Description | -| ----------- | --------------------------------------------------- | -------------- | -| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| Name | Description | +| ----------- | ---------------------------- | +| **RETURNS** | The optimizer. ~~Optimizer~~ | ## Tagger.use_params {#use_params tag="method, contextmanager"} @@ -282,9 +282,9 @@ context, the original parameters are restored. > tagger.to_disk("/best_model") > ``` -| Name | Type | Description | -| -------- | ---- | ----------------------------------------- | -| `params` | dict | The parameter values to use in the model. | +| Name | Description | +| -------- | -------------------------------------------------- | +| `params` | The parameter values to use in the model. ~~dict~~ | ## Tagger.add_label {#add_label tag="method"} @@ -297,10 +297,10 @@ Add a new label to the pipe. 
> tagger.add_label("MY_LABEL") > ``` -| Name | Type | Description | -| ----------- | ---- | --------------------------------------------------- | -| `label` | str | The label to add. | -| **RETURNS** | int | `0` if the label is already present, otherwise `1`. | +| Name | Description | +| ----------- | ----------------------------------------------------------- | +| `label` | The label to add. ~~str~~ | +| **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ | ## Tagger.to_disk {#to_disk tag="method"} @@ -313,11 +313,11 @@ Serialize the pipe to disk. > tagger.to_disk("/path/to/tagger") > ``` -| Name | Type | Description | -| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | ## Tagger.from_disk {#from_disk tag="method"} @@ -330,12 +330,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > tagger.from_disk("/path/to/tagger") > ``` -| Name | Type | Description | -| -------------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. 
| -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tagger` | The modified `Tagger` object. | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The modified `Tagger` object. ~~Tagger~~ | ## Tagger.to_bytes {#to_bytes tag="method"} @@ -348,11 +348,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------- | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Tagger` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The serialized form of the `Tagger` object. ~~bytes~~ | ## Tagger.from_bytes {#from_bytes tag="method"} @@ -366,12 +366,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > tagger.from_bytes(tagger_bytes) > ``` -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. 
| -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tagger` | The `Tagger` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| `bytes_data` | The data to load from. ~~bytes~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The `Tagger` object. ~~Tagger~~ | ## Tagger.labels {#labels tag="property"} @@ -384,9 +384,9 @@ The labels currently added to the component. > assert "MY_LABEL" in tagger.labels > ``` -| Name | Type | Description | -| ----------- | ------------ | ---------------------------------- | -| **RETURNS** | `Tuple[str]` | The labels added to the component. | +| Name | Description | +| ----------- | ------------------------------------------------------ | +| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index 73b50b865..927ac5411 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -35,10 +35,10 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("textcat", config=config) > ``` -| Setting | Type | Description | Default | -| -------- | ------------------------------------------ | --------------------------------------------------------------------------------------- | ----------------------------------------------------- | -| `labels` | `List[str]` | A list of categories to learn. If empty, the model infers the categories from the data. | `[]` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts scores for each category. 
| [TextCatEnsemble](/api/architectures#TextCatEnsemble) | +| Setting | Description | +| -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `labels` | A list of categories to learn. If empty, the model infers the categories from the data. Defaults to `[]`. ~~Iterable[str]~~ | +| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/textcat.py @@ -65,13 +65,13 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#create_pipe). -| Name | Type | Description | -| -------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | -| _keyword-only_ | | | -| `labels` | `Iterable[str]` | The labels to use. | +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `labels` | The labels to use. 
~~Iterable[str]~~ | ## TextCategorizer.\_\_call\_\_ {#call tag="method"} @@ -91,10 +91,10 @@ delegate to the [`predict`](/api/textcategorizer#predict) and > processed = textcat(doc) > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------ | -| `doc` | `Doc` | The document to process. | -| **RETURNS** | `Doc` | The processed document. | +| Name | Description | +| ----------- | -------------------------------- | +| `doc` | The document to process. ~~Doc~~ | +| **RETURNS** | The processed document. ~~Doc~~ | ## TextCategorizer.pipe {#pipe tag="method"} @@ -113,12 +113,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and > pass > ``` -| Name | Type | Description | -| -------------- | --------------- | ----------------------------------------------------- | -| `stream` | `Iterable[Doc]` | A stream of documents. | -| _keyword-only_ | | | -| `batch_size` | int | The number of documents to buffer. Defaults to `128`. | -| **YIELDS** | `Doc` | The processed documents in order. | +| Name | Description | +| -------------- | ------------------------------------------------------------- | +| `stream` | A stream of documents. ~~Iterable[Doc]~~ | +| _keyword-only_ | | +| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | +| **YIELDS** | The processed documents in order. ~~Doc~~ | ## TextCategorizer.begin_training {#begin_training tag="method"} @@ -138,13 +138,13 @@ setting up the label scheme based on the data. > optimizer = textcat.begin_training(lambda: [], pipeline=nlp.pipeline) > ``` -| Name | Type | Description | -| -------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------ | -| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. 
| -| _keyword-only_ | | | -| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/textcategorizer#create_optimizer) if not set. | -| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| **RETURNS** | The optimizer. ~~Optimizer~~ | ## TextCategorizer.predict {#predict tag="method"} @@ -158,10 +158,10 @@ modifying them. > scores = textcat.predict([doc1, doc2]) > ``` -| Name | Type | Description | -| ----------- | --------------- | ----------------------------------------- | -| `docs` | `Iterable[Doc]` | The documents to predict. | -| **RETURNS** | - | The model's prediction for each document. | +| Name | Description | +| ----------- | ------------------------------------------- | +| `docs` | The documents to predict. ~~Iterable[Doc]~~ | +| **RETURNS** | The model's prediction for each document. | ## TextCategorizer.set_annotations {#set_annotations tag="method"} @@ -175,10 +175,10 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores. 
> textcat.set_annotations(docs, scores) > ``` -| Name | Type | Description | -| -------- | --------------- | --------------------------------------------------------- | -| `docs` | `Iterable[Doc]` | The documents to modify. | -| `scores` | - | The scores to set, produced by `TextCategorizer.predict`. | +| Name | Description | +| -------- | --------------------------------------------------------- | +| `docs` | The documents to modify. ~~Iterable[Doc]~~ | +| `scores` | The scores to set, produced by `TextCategorizer.predict`. | ## TextCategorizer.update {#update tag="method"} @@ -195,15 +195,15 @@ Delegates to [`predict`](/api/textcategorizer#predict) and > losses = textcat.update(examples, sgd=optimizer) > ``` -| Name | Type | Description | -| ----------------- | --------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | -| _keyword-only_ | | | -| `drop` | float | The dropout rate. | -| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/textcategorizer#set_annotations). | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | -| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | -| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | +| Name | Description | +| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate.
~~float~~ | +| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## TextCategorizer.rehearse {#rehearse tag="method,experimental" new="3"} @@ -219,14 +219,14 @@ the "catastrophic forgetting" problem. This feature is experimental. > losses = textcat.rehearse(examples, sgd=optimizer) > ``` -| Name | Type | Description | -| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | -| _keyword-only_ | | | -| `drop` | float | The dropout rate. | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | -| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | -| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------ | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key.
~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## TextCategorizer.get_loss {#get_loss tag="method"} @@ -241,11 +241,11 @@ predicted scores. > loss, d_loss = textcat.get_loss(examples, scores) > ``` -| Name | Type | Description | -| ----------- | --------------------- | --------------------------------------------------- | -| `examples` | `Iterable[Example]` | The batch of examples. | -| `scores` | - | Scores representing the model's predictions. | -| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. | +| Name | Description | +| ----------- | --------------------------------------------------------------------------- | +| `examples` | The batch of examples. ~~Iterable[Example]~~ | +| `scores` | Scores representing the model's predictions. | +| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | ## TextCategorizer.score {#score tag="method" new="3"} @@ -257,12 +257,12 @@ Score a batch of examples. > scores = textcat.score(examples) > ``` -| Name | Type | Description | -| ---------------- | ------------------- | ---------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | The examples to score. | -| _keyword-only_ | | | -| `positive_label` | str | Optional positive label. | -| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_cats`](/api/scorer#score_cats). | +| Name | Description | +| ---------------- | -------------------------------------------------------------------------------------------------------------------- | +| `examples` | The examples to score. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `positive_label` | Optional positive label. ~~Optional[str]~~ | +| **RETURNS** | The scores, produced by [`Scorer.score_cats`](/api/scorer#score_cats).
~~Dict[str, Union[float, Dict[str, float]]]~~ | ## TextCategorizer.create_optimizer {#create_optimizer tag="method"} @@ -275,25 +275,9 @@ Create an optimizer for the pipeline component. > optimizer = textcat.create_optimizer() > ``` -| Name | Type | Description | -| ----------- | --------------------------------------------------- | -------------- | -| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | - -## TextCategorizer.add_label {#add_label tag="method"} - -Add a new label to the pipe. - -> #### Example -> -> ```python -> textcat = nlp.add_pipe("textcat") -> textcat.add_label("MY_LABEL") -> ``` - -| Name | Type | Description | -| ----------- | ---- | --------------------------------------------------- | -| `label` | str | The label to add. | -| **RETURNS** | int | `0` if the label is already present, otherwise `1`. | +| Name | Description | +| ----------- | ---------------------------- | +| **RETURNS** | The optimizer. ~~Optimizer~~ | ## TextCategorizer.use_params {#use_params tag="method, contextmanager"} @@ -307,9 +291,25 @@ Modify the pipe's model, to use the given parameter values. > textcat.to_disk("/best_model") > ``` -| Name | Type | Description | -| -------- | ---- | ----------------------------------------- | -| `params` | dict | The parameter values to use in the model. | +| Name | Description | +| -------- | -------------------------------------------------- | +| `params` | The parameter values to use in the model. ~~dict~~ | + +## TextCategorizer.add_label {#add_label tag="method"} + +Add a new label to the pipe. + +> #### Example +> +> ```python +> textcat = nlp.add_pipe("textcat") +> textcat.add_label("MY_LABEL") +> ``` + +| Name | Description | +| ----------- | ----------------------------------------------------------- | +| `label` | The label to add. ~~str~~ | +| **RETURNS** | `0` if the label is already present, otherwise `1`. 
~~int~~ | ## TextCategorizer.to_disk {#to_disk tag="method"} @@ -322,11 +322,11 @@ Serialize the pipe to disk. > textcat.to_disk("/path/to/textcat") > ``` -| Name | Type | Description | -| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | ## TextCategorizer.from_disk {#from_disk tag="method"} @@ -339,12 +339,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > textcat.from_disk("/path/to/textcat") > ``` -| Name | Type | Description | -| -------------- | ----------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. 
~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The modified `TextCategorizer` object. ~~TextCategorizer~~ | ## TextCategorizer.to_bytes {#to_bytes tag="method"} @@ -357,11 +357,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------- | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `TextCategorizer` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The serialized form of the `TextCategorizer` object. ~~bytes~~ | ## TextCategorizer.from_bytes {#from_bytes tag="method"} @@ -375,12 +375,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > textcat.from_bytes(textcat_bytes) > ``` -| Name | Type | Description | -| -------------- | ----------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `TextCategorizer` | The `TextCategorizer` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| `bytes_data` | The data to load from. 
~~bytes~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The `TextCategorizer` object. ~~TextCategorizer~~ | ## TextCategorizer.labels {#labels tag="property"} @@ -393,9 +393,9 @@ The labels currently added to the component. > assert "MY_LABEL" in textcat.labels > ``` -| Name | Type | Description | -| ----------- | ----- | ---------------------------------- | -| **RETURNS** | tuple | The labels added to the component. | +| Name | Description | +| ----------- | ------------------------------------------------------ | +| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md index 4c820c07c..deb8369ab 100644 --- a/website/docs/api/tok2vec.md +++ b/website/docs/api/tok2vec.md @@ -15,7 +15,7 @@ multiple components, e.g. to have one embedding and CNN network shared between a [`EntityRecognizer`](/api/entityrecognizer). In order to use the `Tok2Vec` predictions, subsequent components should use the -[Tok2VecListener](/api/architectures#Tok2VecListener) layer as the tok2vec +[Tok2VecListener](/api/architectures#Tok2VecListener) layer as the `tok2vec` subnetwork of their model. This layer will read data from the `doc.tensor` attribute during prediction. During training, the `Tok2Vec` component will save its prediction and backprop callback for each batch, so that the subsequent @@ -40,9 +40,9 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("tok2vec", config=config) > ``` -| Setting | Type | Description | Default | -| ------- | ------------------------------------------ | ----------------------------------------------------------------------- | ----------------------------------------------- | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. 
The model to use. | [HashEmbedCNN](/api/architectures#HashEmbedCNN) | +| Setting | Description | +| ------- | ------------------------------------------------------------------------------------------------------------------ | +| `model` | The model to use. Defaults to [HashEmbedCNN](/api/architectures#HashEmbedCNN). ~~Model[List[Doc], List[Floats2d]]~~ | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tok2vec.py @@ -69,11 +69,11 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#create_pipe). -| Name | Type | Description | -| ------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | +| Name | Description | +| ------- | ------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | ## Tok2Vec.\_\_call\_\_ {#call tag="method"} @@ -95,10 +95,10 @@ pipeline components are applied to the `Doc` in order. Both > processed = tok2vec(doc) > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------ | -| `doc` | `Doc` | The document to process. | -| **RETURNS** | `Doc` | The processed document.
| +| Name | Description | +| ----------- | -------------------------------- | +| `doc` | The document to process. ~~Doc~~ | +| **RETURNS** | The processed document. ~~Doc~~ | ## Tok2Vec.pipe {#pipe tag="method"} @@ -116,12 +116,12 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods. > pass > ``` -| Name | Type | Description | -| -------------- | --------------- | ----------------------------------------------------- | -| `stream` | `Iterable[Doc]` | A stream of documents. | -| _keyword-only_ | | | -| `batch_size` | int | The number of documents to buffer. Defaults to `128`. | -| **YIELDS** | `Doc` | The processed documents in order. | +| Name | Description | +| -------------- | ------------------------------------------------------------- | +| `stream` | A stream of documents. ~~Iterable[Doc]~~ | +| _keyword-only_ | | +| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | +| **YIELDS** | The processed documents in order. ~~Doc~~ | ## Tok2Vec.begin_training {#begin_training tag="method"} @@ -141,13 +141,13 @@ setting up the label scheme based on the data. > optimizer = tok2vec.begin_training(lambda: [], pipeline=nlp.pipeline) > ``` -| Name | Type | Description | -| -------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- | -| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. | -| _keyword-only_ | | | -| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/tok2vec#create_optimizer) if not set. | -| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. 
| +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| **RETURNS** | The optimizer. ~~Optimizer~~ | ## Tok2Vec.predict {#predict tag="method"} @@ -161,10 +161,10 @@ modifying them. > scores = tok2vec.predict([doc1, doc2]) > ``` -| Name | Type | Description | -| ----------- | --------------- | ----------------------------------------- | -| `docs` | `Iterable[Doc]` | The documents to predict. | -| **RETURNS** | - | The model's prediction for each document. | +| Name | Description | +| ----------- | ------------------------------------------- | +| `docs` | The documents to predict. ~~Iterable[Doc]~~ | +| **RETURNS** | The model's prediction for each document. | ## Tok2Vec.set_annotations {#set_annotations tag="method"} @@ -178,10 +178,10 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores. > tok2vec.set_annotations(docs, scores) > ``` -| Name | Type | Description | -| -------- | --------------- | ------------------------------------------------- | -| `docs` | `Iterable[Doc]` | The documents to modify. | -| `scores` | - | The scores to set, produced by `Tok2Vec.predict`. | +| Name | Description | +| -------- | ------------------------------------------------- | +| `docs` | The documents to modify. ~~Iterable[Doc]~~ | +| `scores` | The scores to set, produced by `Tok2Vec.predict`.
| ## Tok2Vec.update {#update tag="method"} @@ -197,15 +197,15 @@ Delegates to [`predict`](/api/tok2vec#predict). > losses = tok2vec.update(examples, sgd=optimizer) > ``` -| Name | Type | Description | -| ----------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | -| _keyword-only_ | | | -| `drop` | float | The dropout rate. | -| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/tok2vec#set_annotations). | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | -| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | -| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | +| Name | Description | +| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## Tok2Vec.create_optimizer {#create_optimizer tag="method"} @@ -218,9 +218,9 @@ Create an optimizer for the pipeline component.
> optimizer = tok2vec.create_optimizer() > ``` -| Name | Type | Description | -| ----------- | --------------------------------------------------- | -------------- | -| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| Name | Description | +| ----------- | ---------------------------- | +| **RETURNS** | The optimizer. ~~Optimizer~~ | ## Tok2Vec.use_params {#use_params tag="method, contextmanager"} @@ -235,9 +235,9 @@ context, the original parameters are restored. > tok2vec.to_disk("/best_model") > ``` -| Name | Type | Description | -| -------- | ---- | ----------------------------------------- | -| `params` | dict | The parameter values to use in the model. | +| Name | Description | +| -------- | -------------------------------------------------- | +| `params` | The parameter values to use in the model. ~~dict~~ | ## Tok2Vec.to_disk {#to_disk tag="method"} @@ -250,11 +250,11 @@ Serialize the pipe to disk. > tok2vec.to_disk("/path/to/tok2vec") > ``` -| Name | Type | Description | -| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. 
~~Iterable[str]~~ | ## Tok2Vec.from_disk {#from_disk tag="method"} @@ -267,12 +267,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > tok2vec.from_disk("/path/to/tok2vec") > ``` -| Name | Type | Description | -| -------------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tok2Vec` | The modified `Tok2Vec` object. | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The modified `Tok2Vec` object. ~~Tok2Vec~~ | ## Tok2Vec.to_bytes {#to_bytes tag="method"} @@ -285,11 +285,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------- | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Tok2Vec` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The serialized form of the `Tok2Vec` object. 
~~bytes~~ | ## Tok2Vec.from_bytes {#from_bytes tag="method"} @@ -303,12 +303,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > tok2vec.from_bytes(tok2vec_bytes) > ``` -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tok2Vec` | The `Tok2Vec` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| `bytes_data` | The data to load from. ~~bytes~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The `Tok2Vec` object. ~~Tok2Vec~~ | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 6390ab975..0860797aa 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -17,11 +17,11 @@ Construct a `Token` object. > assert token.text == "Give" > ``` -| Name | Type | Description | -| -------- | ------- | ------------------------------------------- | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `doc` | `Doc` | The parent document. | -| `offset` | int | The index of the token within the document. | +| Name | Description | +| -------- | --------------------------------------------------- | +| `vocab` | A storage container for lexical types. ~~Vocab~~ | +| `doc` | The parent document. ~~Doc~~ | +| `offset` | The index of the token within the document. ~~int~~ | ## Token.\_\_len\_\_ {#len tag="method"} @@ -35,9 +35,9 @@ The number of unicode characters in the token, i.e. `token.text`. 
> assert len(token) == 4 > ``` -| Name | Type | Description | -| ----------- | ---- | ---------------------------------------------- | -| **RETURNS** | int | The number of unicode characters in the token. | +| Name | Description | +| ----------- | ------------------------------------------------------ | +| **RETURNS** | The number of unicode characters in the token. ~~int~~ | ## Token.set_extension {#set_extension tag="classmethod" new="2"} @@ -55,14 +55,14 @@ For details, see the documentation on > assert doc[3]._.is_fruit > ``` -| Name | Type | Description | -| --------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------- | -| `name` | str | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `token._.my_attr`. | -| `default` | - | Optional default value of the attribute if no getter or method is defined. | -| `method` | callable | Set a custom method on the object, for example `token._.compare(other_token)`. | -| `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. | -| `setter` | callable | Setter function that takes the `Token` and a value, and modifies the object. Is called when the user writes to the `Token._` attribute. | -| `force` | bool | Force overwriting existing attribute. | +| Name | Description | +| --------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `token._.my_attr`. ~~str~~ | +| `default` | Optional default value of the attribute if no getter or method is defined. ~~Optional[Any]~~ | +| `method` | Set a custom method on the object, for example `token._.compare(other_token)`. 
~~Optional[Callable[[Token, ...], Any]]~~ | +| `getter` | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. ~~Optional[Callable[[Token], Any]]~~ | +| `setter` | Setter function that takes the `Token` and a value, and modifies the object. Is called when the user writes to the `Token._` attribute. ~~Optional[Callable[[Token, Any], None]]~~ | +| `force` | Force overwriting existing attribute. ~~bool~~ | ## Token.get_extension {#get_extension tag="classmethod" new="2"} @@ -79,10 +79,10 @@ Look up a previously registered extension by name. Returns a 4-tuple > assert extension == (False, None, None, None) > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------- | -| `name` | str | Name of the extension. | -| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | Name of the extension. ~~str~~ | +| **RETURNS** | A `(default, method, getter, setter)` tuple of the extension. ~~Tuple[Optional[Any], Optional[Callable], Optional[Callable], Optional[Callable]]~~ | ## Token.has_extension {#has_extension tag="classmethod" new="2"} @@ -96,10 +96,10 @@ Check whether an extension has been registered on the `Token` class. > assert Token.has_extension("is_fruit") > ``` -| Name | Type | Description | -| ----------- | ---- | ------------------------------------------ | -| `name` | str | Name of the extension to check. | -| **RETURNS** | bool | Whether the extension has been registered. | +| Name | Description | +| ----------- | --------------------------------------------------- | +| `name` | Name of the extension to check. ~~str~~ | +| **RETURNS** | Whether the extension has been registered. 
~~bool~~ | ## Token.remove_extension {#remove_extension tag="classmethod" new=""2.0.11""} @@ -114,10 +114,10 @@ Remove a previously registered extension. > assert not Token.has_extension("is_fruit") > ``` -| Name | Type | Description | -| ----------- | ----- | --------------------------------------------------------------------- | -| `name` | str | Name of the extension. | -| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | +| Name | Description | +| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | Name of the extension. ~~str~~ | +| **RETURNS** | A `(default, method, getter, setter)` tuple of the removed extension. ~~Tuple[Optional[Any], Optional[Callable], Optional[Callable], Optional[Callable]]~~ | ## Token.check_flag {#check_flag tag="method"} @@ -132,10 +132,10 @@ Check the value of a boolean flag. > assert token.check_flag(IS_TITLE) == True > ``` -| Name | Type | Description | -| ----------- | ---- | -------------------------------------- | -| `flag_id` | int | The attribute ID of the flag to check. | -| **RETURNS** | bool | Whether the flag is set. | +| Name | Description | +| ----------- | ---------------------------------------------- | +| `flag_id` | The attribute ID of the flag to check. ~~int~~ | +| **RETURNS** | Whether the flag is set. ~~bool~~ | ## Token.similarity {#similarity tag="method" model="vectors"} @@ -150,10 +150,10 @@ Compute a semantic similarity estimate. Defaults to cosine over vectors. > assert apples_oranges == oranges_apples > ``` -| Name | Type | Description | -| ----------- | ----- | -------------------------------------------------------------------------------------------- | -| other | - | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. | -| **RETURNS** | float | A scalar similarity score. 
Higher is more similar. | +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------------------------------------------------- | +| other | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. ~~Union[Doc, Span, Token, Lexeme]~~ | +| **RETURNS** | A scalar similarity score. Higher is more similar. ~~float~~ | ## Token.nbor {#nbor tag="method"} @@ -167,10 +167,10 @@ Get a neighboring token. > assert give_nbor.text == "it" > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------------------------------------------- | -| `i` | int | The relative position of the token to get. Defaults to `1`. | -| **RETURNS** | `Token` | The token at position `self.doc[self.i+i]`. | +| Name | Description | +| ----------- | ------------------------------------------------------------------- | +| `i` | The relative position of the token to get. Defaults to `1`. ~~int~~ | +| **RETURNS** | The token at position `self.doc[self.i+i]`. ~~Token~~ | ## Token.is_ancestor {#is_ancestor tag="method" model="parser"} @@ -186,10 +186,10 @@ dependency tree. > assert give.is_ancestor(it) > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------------------------------------- | -| descendant | `Token` | Another token. | -| **RETURNS** | bool | Whether this token is the ancestor of the descendant. | +| Name | Description | +| ----------- | -------------------------------------------------------------- | +| descendant | Another token. ~~Token~~ | +| **RETURNS** | Whether this token is the ancestor of the descendant. ~~bool~~ | ## Token.ancestors {#ancestors tag="property" model="parser"} @@ -205,9 +205,9 @@ The rightmost token of this token's syntactic descendants. 
> assert [t.text for t in he_ancestors] == ["pleaded"] > ``` -| Name | Type | Description | -| ---------- | ------- | --------------------------------------------------------------------- | -| **YIELDS** | `Token` | A sequence of ancestor tokens such that `ancestor.is_ancestor(self)`. | +| Name | Description | +| ---------- | ------------------------------------------------------------------------------- | +| **YIELDS** | A sequence of ancestor tokens such that `ancestor.is_ancestor(self)`. ~~Token~~ | ## Token.conjuncts {#conjuncts tag="property" model="parser"} @@ -221,9 +221,9 @@ A tuple of coordinated tokens, not including the token itself. > assert [t.text for t in apples_conjuncts] == ["oranges"] > ``` -| Name | Type | Description | -| ----------- | ------- | ----------------------- | -| **RETURNS** | `tuple` | The coordinated tokens. | +| Name | Description | +| ----------- | --------------------------------------------- | +| **RETURNS** | The coordinated tokens. ~~Tuple[Token, ...]~~ | ## Token.children {#children tag="property" model="parser"} @@ -237,9 +237,9 @@ A sequence of the token's immediate syntactic children. > assert [t.text for t in give_children] == ["it", "back", "!"] > ``` -| Name | Type | Description | -| ---------- | ------- | ------------------------------------------- | -| **YIELDS** | `Token` | A child token such that `child.head==self`. | +| Name | Description | +| ---------- | ------------------------------------------------------- | +| **YIELDS** | A child token such that `child.head == self`. ~~Token~~ | ## Token.lefts {#lefts tag="property" model="parser"} @@ -253,9 +253,9 @@ The leftward immediate children of the word, in the syntactic dependency parse. > assert lefts == ["New"] > ``` -| Name | Type | Description | -| ---------- | ------- | -------------------------- | -| **YIELDS** | `Token` | A left-child of the token. 
| +| Name | Description | +| ---------- | ------------------------------------ | +| **YIELDS** | A left-child of the token. ~~Token~~ | ## Token.rights {#rights tag="property" model="parser"} @@ -269,9 +269,9 @@ The rightward immediate children of the word, in the syntactic dependency parse. > assert rights == ["in"] > ``` -| Name | Type | Description | -| ---------- | ------- | --------------------------- | -| **YIELDS** | `Token` | A right-child of the token. | +| Name | Description | +| ---------- | ------------------------------------- | +| **YIELDS** | A right-child of the token. ~~Token~~ | ## Token.n_lefts {#n_lefts tag="property" model="parser"} @@ -285,9 +285,9 @@ dependency parse. > assert doc[3].n_lefts == 1 > ``` -| Name | Type | Description | -| ----------- | ---- | -------------------------------- | -| **RETURNS** | int | The number of left-child tokens. | +| Name | Description | +| ----------- | ---------------------------------------- | +| **RETURNS** | The number of left-child tokens. ~~int~~ | ## Token.n_rights {#n_rights tag="property" model="parser"} @@ -301,9 +301,9 @@ dependency parse. > assert doc[3].n_rights == 1 > ``` -| Name | Type | Description | -| ----------- | ---- | --------------------------------- | -| **RETURNS** | int | The number of right-child tokens. | +| Name | Description | +| ----------- | ----------------------------------------- | +| **RETURNS** | The number of right-child tokens. ~~int~~ | ## Token.subtree {#subtree tag="property" model="parser"} @@ -317,9 +317,9 @@ A sequence containing the token and all the token's syntactic descendants. > assert [t.text for t in give_subtree] == ["Give", "it", "back", "!"] > ``` -| Name | Type | Description | -| ---------- | ------- | -------------------------------------------------------------------------- | -| **YIELDS** | `Token` | A descendant token such that `self.is_ancestor(token)` or `token == self`. 
| +| Name | Description | +| ---------- | ------------------------------------------------------------------------------------ | +| **YIELDS** | A descendant token such that `self.is_ancestor(token)` or `token == self`. ~~Token~~ | ## Token.is_sent_start {#is_sent_start tag="property" new="2"} @@ -334,9 +334,9 @@ unknown. Defaults to `True` for the first token in the `Doc`. > assert not doc[5].is_sent_start > ``` -| Name | Type | Description | -| ----------- | ---- | ------------------------------------ | -| **RETURNS** | bool | Whether the token starts a sentence. | +| Name | Description | +| ----------- | --------------------------------------------- | +| **RETURNS** | Whether the token starts a sentence. ~~bool~~ | ## Token.has_vector {#has_vector tag="property" model="vectors"} @@ -350,9 +350,9 @@ A boolean value indicating whether a word vector is associated with the token. > assert apples.has_vector > ``` -| Name | Type | Description | -| ----------- | ---- | --------------------------------------------- | -| **RETURNS** | bool | Whether the token has a vector data attached. | +| Name | Description | +| ----------- | ------------------------------------------------------ | +| **RETURNS** | Whether the token has a vector data attached. ~~bool~~ | ## Token.vector {#vector tag="property" model="vectors"} @@ -367,9 +367,9 @@ A real-valued meaning representation. > assert apples.vector.shape == (300,) > ``` -| Name | Type | Description | -| ----------- | ---------------------------------------- | ---------------------------------------------------- | -| **RETURNS** | `numpy.ndarray[ndim=1, dtype="float32"]` | A 1D numpy array representing the token's semantics. | +| Name | Description | +| ----------- | ----------------------------------------------------------------------------------------------- | +| **RETURNS** | A 1-dimensional array representing the token's vector. 
~~numpy.ndarray[ndim=1, dtype=float32]~~ | ## Token.vector_norm {#vector_norm tag="property" model="vectors"} @@ -386,80 +386,80 @@ The L2 norm of the token's vector representation. > assert apples.vector_norm != pasta.vector_norm > ``` -| Name | Type | Description | -| ----------- | ----- | ----------------------------------------- | -| **RETURNS** | float | The L2 norm of the vector representation. | +| Name | Description | +| ----------- | --------------------------------------------------- | +| **RETURNS** | The L2 norm of the vector representation. ~~float~~ | ## Attributes {#attributes} -| Name | Type | Description | -| -------------------------------------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The parent document. | -| `lex` 3 | [`Lexeme`](/api/lexeme) | The underlying lexeme. | -| `sent` 2.0.12 | [`Span`](/api/span) | The sentence span that this token is a part of. | -| `text` | str | Verbatim text content. | -| `text_with_ws` | str | Text content, with trailing space character if present. | -| `whitespace_` | str | Trailing space character if present. | -| `orth` | int | ID of the verbatim text content. | -| `orth_` | str | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. | -| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | -| `tensor` 2.1.7 | `ndarray` | The tokens's slice of the parent `Doc`'s tensor. | -| `head` | `Token` | The syntactic parent, or "governor", of this token. | -| `left_edge` | `Token` | The leftmost token of this token's syntactic descendants. | -| `right_edge` | `Token` | The rightmost token of this token's syntactic descendants. | -| `i` | int | The index of the token within the parent document. 
| -| `ent_type` | int | Named entity type. | -| `ent_type_` | str | Named entity type. | -| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | -| `ent_iob_` | str | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. | -| `ent_kb_id` 2.2 | int | Knowledge base ID that refers to the named entity this token is a part of, if any. | -| `ent_kb_id_` 2.2 | str | Knowledge base ID that refers to the named entity this token is a part of, if any. | -| `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | -| `ent_id_` | str | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | -| `lemma` | int | Base form of the token, with no inflectional suffixes. | -| `lemma_` | str | Base form of the token, with no inflectional suffixes. | -| `norm` | int | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | -| `norm_` | str | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | -| `lower` | int | Lowercase form of the token. | -| `lower_` | str | Lowercase form of the token text. Equivalent to `Token.text.lower()`. | -| `shape` | int | Transform of the tokens's string, to show orthographic features. 
Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | -| `shape_` | str | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | -| `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. | -| `prefix_` | str | A length-N substring from the start of the token. Defaults to `N=1`. | -| `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. | -| `suffix_` | str | Length-N substring from the end of the token. Defaults to `N=3`. | -| `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. | -| `is_ascii` | bool | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. | -| `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. | -| `is_lower` | bool | Is the token in lowercase? Equivalent to `token.text.islower()`. | -| `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. | -| `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. | -| `is_punct` | bool | Is the token punctuation? | -| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `"("` ? | -| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `")"` ? | -| `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. | -| `is_bracket` | bool | Is the token a bracket? | -| `is_quote` | bool | Is the token a quotation mark? | -| `is_currency` 2.0.8 | bool | Is the token a currency symbol? 
| -| `like_url` | bool | Does the token resemble a URL? | -| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | -| `like_email` | bool | Does the token resemble an email address? | -| `is_oov` | bool | Does the token have a word vector? | -| `is_stop` | bool | Is the token part of a "stop list"? | -| `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | -| `pos_` | str | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | -| `tag` | int | Fine-grained part-of-speech. | -| `tag_` | str | Fine-grained part-of-speech. | -| `morph` | `MorphAnalysis` | Morphological analysis. | -| `morph_` | str | Morphological analysis in UD FEATS format. | -| `dep` | int | Syntactic dependency relation. | -| `dep_` | str | Syntactic dependency relation. | -| `lang` | int | Language of the parent document's vocabulary. | -| `lang_` | str | Language of the parent document's vocabulary. | -| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). | -| `idx` | int | The character offset of the token within the parent document. | -| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. | -| `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | -| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | -| `cluster` | int | Brown cluster ID. | -| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). 
| +| Name | Description | +| -------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | The parent document. ~~Doc~~ | +| `lex` 3 | The underlying lexeme. ~~Lexeme~~ | +| `sent` 2.0.12 | The sentence span that this token is a part of. ~~Span~~ | +| `text` | Verbatim text content. ~~str~~ | +| `text_with_ws` | Text content, with trailing space character if present. ~~str~~ | +| `whitespace_` | Trailing space character if present. ~~str~~ | +| `orth` | ID of the verbatim text content. ~~int~~ | +| `orth_` | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. ~~str~~ | +| `vocab` | The vocab object of the parent `Doc`. ~~Vocab~~ | +| `tensor` 2.1.7 | The token's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~ | +| `head` | The syntactic parent, or "governor", of this token. ~~Token~~ | +| `left_edge` | The leftmost token of this token's syntactic descendants. ~~Token~~ | +| `right_edge` | The rightmost token of this token's syntactic descendants. ~~Token~~ | +| `i` | The index of the token within the parent document. ~~int~~ | +| `ent_type` | Named entity type. ~~int~~ | +| `ent_type_` | Named entity type. ~~str~~ | +| `ent_iob` | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. ~~int~~ | +| `ent_iob_` | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~ | +| `ent_kb_id` 2.2 | Knowledge base ID that refers to the named entity this token is a part of, if any. 
~~int~~ | +| `ent_kb_id_` 2.2 | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~ | +| `ent_id` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~ | +| `ent_id_` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~ | +| `lemma` | Base form of the token, with no inflectional suffixes. ~~int~~ | +| `lemma_` | Base form of the token, with no inflectional suffixes. ~~str~~ | +| `norm` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions). ~~int~~ | +| `norm_` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions). ~~str~~ | +| `lower` | Lowercase form of the token. ~~int~~ | +| `lower_` | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~ | +| `shape` | Transform of the token's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~int~~ | +| `shape_` | Transform of the token's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~str~~ | +| `prefix` | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~ | +| `prefix_` | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~ | +| `suffix` | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~ | +| `suffix_` | Length-N substring from the end of the token. 
Defaults to `N=3`. ~~str~~ | +| `is_alpha` | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. ~~bool~~ | +| `is_ascii` | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. ~~bool~~ | +| `is_digit` | Does the token consist of digits? Equivalent to `token.text.isdigit()`. ~~bool~~ | +| `is_lower` | Is the token in lowercase? Equivalent to `token.text.islower()`. ~~bool~~ | +| `is_upper` | Is the token in uppercase? Equivalent to `token.text.isupper()`. ~~bool~~ | +| `is_title` | Is the token in titlecase? Equivalent to `token.text.istitle()`. ~~bool~~ | +| `is_punct` | Is the token punctuation? ~~bool~~ | +| `is_left_punct` | Is the token a left punctuation mark, e.g. `"("`? ~~bool~~ | +| `is_right_punct` | Is the token a right punctuation mark, e.g. `")"`? ~~bool~~ | +| `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ | +| `is_bracket` | Is the token a bracket? ~~bool~~ | +| `is_quote` | Is the token a quotation mark? ~~bool~~ | +| `is_currency` 2.0.8 | Is the token a currency symbol? ~~bool~~ | +| `like_url` | Does the token resemble a URL? ~~bool~~ | +| `like_num` | Does the token represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~ | +| `like_email` | Does the token resemble an email address? ~~bool~~ | +| `is_oov` | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~ | +| `is_stop` | Is the token part of a "stop list"? ~~bool~~ | +| `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~int~~ | +| `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~str~~ | +| `tag` | Fine-grained part-of-speech. ~~int~~ | +| `tag_` | Fine-grained part-of-speech. ~~str~~ | +| `morph` 3 | Morphological analysis. 
~~MorphAnalysis~~ | +| `morph_` 3 | Morphological analysis in the Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ | +| `dep` | Syntactic dependency relation. ~~int~~ | +| `dep_` | Syntactic dependency relation. ~~str~~ | +| `lang` | Language of the parent document's vocabulary. ~~int~~ | +| `lang_` | Language of the parent document's vocabulary. ~~str~~ | +| `prob` | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~ | +| `idx` | The character offset of the token within the parent document. ~~int~~ | +| `sentiment` | A scalar value indicating the positivity or negativity of the token. ~~float~~ | +| `lex_id` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | +| `rank` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ | +| `cluster` | Brown cluster ID. ~~int~~ | +| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ | diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index 12edf0033..0158c5589 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -45,15 +45,15 @@ the > tokenizer = nlp.tokenizer > ``` -| Name | Type | Description | -| ---------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------ | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `rules` | dict | Exceptions and special-cases for the tokenizer. | -| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | -| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. 
| -| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | -| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. | -| `url_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. | +| Name | Description | +| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | A storage container for lexical types. ~~Vocab~~ | +| `rules` | Exceptions and special-cases for the tokenizer. ~~Optional[Dict[str, List[Dict[int, str]]]]~~ | +| `prefix_search` | A function matching the signature of `re.compile(string).search` to match prefixes. ~~Optional[Callable[[str], Optional[Match]]]~~ | +| `suffix_search` | A function matching the signature of `re.compile(string).search` to match suffixes. ~~Optional[Callable[[str], Optional[Match]]]~~ | +| `infix_finditer` | A function matching the signature of `re.compile(string).finditer` to find infixes. ~~Optional[Callable[[str], Iterator[Match]]]~~ | +| `token_match` | A function matching the signature of `re.compile(string).match` to find token matches. ~~Optional[Callable[[str], Optional[Match]]]~~ | +| `url_match` | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. ~~Optional[Callable[[str], Optional[Match]]]~~ | ## Tokenizer.\_\_call\_\_ {#call tag="method"} @@ -66,10 +66,10 @@ Tokenize a string. > assert len(tokens) == 4 > ``` -| Name | Type | Description | -| ----------- | ----- | --------------------------------------- | -| `string` | str | The string to tokenize. | -| **RETURNS** | `Doc` | A container for linguistic annotations. 
| +| Name | Description | +| ----------- | ----------------------------------------------- | +| `string` | The string to tokenize. ~~str~~ | +| **RETURNS** | A container for linguistic annotations. ~~Doc~~ | ## Tokenizer.pipe {#pipe tag="method"} @@ -83,40 +83,40 @@ Tokenize a stream of texts. > pass > ``` -| Name | Type | Description | -| ------------ | ----- | ---------------------------------------------------------------------------- | -| `texts` | - | A sequence of unicode texts. | -| `batch_size` | int | The number of texts to accumulate in an internal buffer. Defaults to `1000`. | -| **YIELDS** | `Doc` | A sequence of Doc objects, in order. | +| Name | Description | +| ------------ | ------------------------------------------------------------------------------------ | +| `texts` | A sequence of unicode texts. ~~Iterable[str]~~ | +| `batch_size` | The number of texts to accumulate in an internal buffer. Defaults to `1000`. ~~int~~ | +| **YIELDS** | The tokenized Doc objects, in order. ~~Doc~~ | ## Tokenizer.find_infix {#find_infix tag="method"} Find internal split points of the string. -| Name | Type | Description | -| ----------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------- | -| `string` | str | The string to split. | -| **RETURNS** | list | A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `string` | The string to split. ~~str~~ | +| **RETURNS** | A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. 
~~List[Match]~~ | ## Tokenizer.find_prefix {#find_prefix tag="method"} Find the length of a prefix that should be segmented from the string, or `None` if no prefix rules match. -| Name | Type | Description | -| ----------- | ---- | ------------------------------------------------------ | -| `string` | str | The string to segment. | -| **RETURNS** | int | The length of the prefix if present, otherwise `None`. | +| Name | Description | +| ----------- | ------------------------------------------------------------------------ | +| `string` | The string to segment. ~~str~~ | +| **RETURNS** | The length of the prefix if present, otherwise `None`. ~~Optional[int]~~ | ## Tokenizer.find_suffix {#find_suffix tag="method"} Find the length of a suffix that should be segmented from the string, or `None` if no suffix rules match. -| Name | Type | Description | -| ----------- | ------------ | ------------------------------------------------------ | -| `string` | str | The string to segment. | -| **RETURNS** | int / `None` | The length of the suffix if present, otherwise `None`. | +| Name | Description | +| ----------- | ------------------------------------------------------------------------ | +| `string` | The string to segment. ~~str~~ | +| **RETURNS** | The length of the suffix if present, otherwise `None`. ~~Optional[int]~~ | ## Tokenizer.add_special_case {#add_special_case tag="method"} @@ -134,10 +134,10 @@ and examples. > tokenizer.add_special_case("don't", case) > ``` -| Name | Type | Description | -| ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `string` | str | The string to specially tokenize. | -| `token_attrs` | iterable | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. 
| +| Name | Description | +| ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `string` | The string to specially tokenize. ~~str~~ | +| `token_attrs` | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. ~~Iterable[Dict[int, str]]~~ | ## Tokenizer.explain {#explain tag="method"} @@ -153,10 +153,10 @@ produced are identical to `Tokenizer.__call__` except for whitespace tokens. > assert [t[1] for t in tok_exp] == ["(", "do", "n't", ")"] > ``` -| Name | Type | Description | -| ----------- | ---- | --------------------------------------------------- | -| `string` | str | The string to tokenize with the debugging tokenizer | -| **RETURNS** | list | A list of `(pattern_string, token_string)` tuples | +| Name | Description | +| ----------- | ---------------------------------------------------------------------------- | +| `string` | The string to tokenize with the debugging tokenizer. ~~str~~ | +| **RETURNS** | A list of `(pattern_string, token_string)` tuples. ~~List[Tuple[str, str]]~~ | ## Tokenizer.to_disk {#to_disk tag="method"} @@ -169,11 +169,11 @@ Serialize the tokenizer to disk. > tokenizer.to_disk("/path/to/tokenizer") > ``` -| Name | Type | Description | -| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. 
| +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | ## Tokenizer.from_disk {#from_disk tag="method"} @@ -186,12 +186,12 @@ Load the tokenizer from disk. Modifies the object in place and returns it. > tokenizer.from_disk("/path/to/tokenizer") > ``` -| Name | Type | Description | -| -------------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The modified `Tokenizer` object. ~~Tokenizer~~ | ## Tokenizer.to_bytes {#to_bytes tag="method"} @@ -204,11 +204,11 @@ Load the tokenizer from disk. Modifies the object in place and returns it. Serialize the tokenizer to a bytestring. 
-| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------- | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Tokenizer` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The serialized form of the `Tokenizer` object. ~~bytes~~ | ## Tokenizer.from_bytes {#from_bytes tag="method"} @@ -223,23 +223,23 @@ it. > tokenizer.from_bytes(tokenizer_bytes) > ``` -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tokenizer` | The `Tokenizer` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| `bytes_data` | The data to load from. ~~bytes~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The `Tokenizer` object. ~~Tokenizer~~ | ## Attributes {#attributes} -| Name | Type | Description | -| ---------------- | ------- | -------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | -| `prefix_search` | - | A function to find segment boundaries from the start of a string. 
Returns the length of the segment, or `None`. | -| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. | -| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. | -| `token_match` | - | A function matching the signature of `re.compile(string).match to find token matches. Returns an`re.MatchObject`or`None. | -| `rules` | dict | A dictionary of tokenizer exceptions and special cases. | +| Name | Description | +| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The vocab object of the parent `Doc`. ~~Vocab~~ | +| `prefix_search` | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. ~~Optional[Callable[[str], Optional[Match]]]~~ | +| `suffix_search` | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. ~~Optional[Callable[[str], Optional[Match]]]~~ | +| `infix_finditer` | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) sequence of `re.MatchObject` objects. ~~Optional[Callable[[str], Iterator[Match]]]~~ | +| `token_match` | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. ~~Optional[Callable[[str], Optional[Match]]]~~ | +| `rules` | A dictionary of tokenizer exceptions and special cases. 
~~Optional[Dict[str, List[Dict[int, str]]]]~~ | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 60885f246..797fa0191 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -18,9 +18,10 @@ Load a model using the name of an installed `Path`-like object. spaCy will try resolving the load argument in this order. If a model is loaded from a model name, spaCy will assume it's a Python package and import it and call the model's own `load()` method. If a model is loaded from a -path, spaCy will assume it's a data directory, read the language and pipeline -settings off the meta.json and initialize the `Language` class. The data will be -loaded in via [`Language.from_disk`](/api/language#from_disk). +path, spaCy will assume it's a data directory, load its +[`config.cfg`](/api/data-formats#config) and use the language and pipeline +information to construct the `Language` class. The data will be loaded in via +[`Language.from_disk`](/api/language#from_disk). > #### Example > @@ -32,17 +33,18 @@ loaded in via [`Language.from_disk`](/api/language#from_disk). > nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"]) > ``` -| Name | Type | Description | -| ----------------------------------- | ---------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- | -| `name` | str / `Path` | Model to load, i.e. package name or path. | -| _keyword-only_ | | | -| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | -| `config` 3 | `Dict[str, Any]` / [`Config`](https://thinc.ai/docs/api-config#config) | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. 
| -| **RETURNS** | `Language` | A `Language` object with the loaded model. | +| Name | Description | +| ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `name` | Model to load, i.e. package name or path. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ | +| `config` 3 | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | +| **RETURNS** | A `Language` object with the loaded model. ~~Language~~ | -Essentially, `spacy.load()` is a convenience wrapper that reads the language ID -and pipeline components from a model's `meta.json`, initializes the `Language` -class, loads in the model data and returns it. +Essentially, `spacy.load()` is a convenience wrapper that reads the model's +[`config.cfg`](/api/data-formats#config), uses the language and pipeline +information to construct a `Language` object, loads in the model data and +returns it. ```python ### Abstract example @@ -65,12 +67,12 @@ Create a blank model of a given language class. This function is the twin of > nlp_de = spacy.blank("de") # equivalent to German() > ``` -| Name | Type | Description | -| ----------- | ---------- | ------------------------------------------------------------------------------------------------ | -| `name` | str | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. | -| **RETURNS** | `Language` | An empty `Language` object of the appropriate subclass. 
| +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------------------------- | +| `name` | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~ | +| **RETURNS** | An empty `Language` object of the appropriate subclass. ~~Language~~ | -#### spacy.info {#spacy.info tag="function"} +### spacy.info {#spacy.info tag="function"} The same as the [`info` command](/api/cli#info). Pretty-print information about your installation, models and local setup from within spaCy. To get the model @@ -85,12 +87,12 @@ meta data as a dictionary instead, you can use the `meta` attribute on your > markdown = spacy.info(markdown=True, silent=True) > ``` -| Name | Type | Description | -| -------------- | ---- | ------------------------------------------------ | -| `model` | str | A model, i.e. a package name or path (optional). | -| _keyword-only_ | | | -| `markdown` | bool | Print information as Markdown. | -| `silent` | bool | Don't print anything, just return. | +| Name | Description | +| -------------- | ------------------------------------------------------------------ | +| `model` | A model, i.e. a package name or path (optional). ~~Optional[str]~~ | +| _keyword-only_ | | +| `markdown` | Print information as Markdown. ~~bool~~ | +| `silent` | Don't print anything, just return. ~~bool~~ | ### spacy.explain {#spacy.explain tag="function"} @@ -111,10 +113,10 @@ list of available terms, see > # world NN noun, singular or mass > ``` -| Name | Type | Description | -| ----------- | ---- | -------------------------------------------------------- | -| `term` | str | Term to explain. | -| **RETURNS** | str | The explanation, or `None` if not found in the glossary. | +| Name | Description | +| ----------- | -------------------------------------------------------------------------- | +| `term` | Term to explain. 
~~str~~ | +| **RETURNS** | The explanation, or `None` if not found in the glossary. ~~Optional[str]~~ | ### spacy.prefer_gpu {#spacy.prefer_gpu tag="function" new="2.0.14"} @@ -131,9 +133,9 @@ models. > nlp = spacy.load("en_core_web_sm") > ``` -| Name | Type | Description | -| ----------- | ---- | ------------------------------ | -| **RETURNS** | bool | Whether the GPU was activated. | +| Name | Description | +| ----------- | --------------------------------------- | +| **RETURNS** | Whether the GPU was activated. ~~bool~~ | ### spacy.require_gpu {#spacy.require_gpu tag="function" new="2.0.14"} @@ -150,9 +152,9 @@ and _before_ loading any models. > nlp = spacy.load("en_core_web_sm") > ``` -| Name | Type | Description | -| ----------- | ---- | ----------- | -| **RETURNS** | bool | `True` | +| Name | Description | +| ----------- | --------------- | +| **RETURNS** | `True` ~~bool~~ | ## displaCy {#displacy source="spacy/displacy"} @@ -175,16 +177,16 @@ browser. Will run a simple web server. > displacy.serve([doc1, doc2], style="dep") > ``` -| Name | Type | Description | Default | -| --------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | ----------- | -| `docs` | list, `Doc`, `Span` | Document(s) to visualize. | -| `style` | str | Visualization style, `'dep'` or `'ent'`. | `'dep'` | -| `page` | bool | Render markup as full HTML page. | `True` | -| `minify` | bool | Minify HTML markup. | `False` | -| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` | -| `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` | -| `port` | int | Port to serve visualization. | `5000` | -| `host` | str | Host to serve visualization. 
| `'0.0.0.0'` | +| Name | Description | +| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ | +| `style` | Visualization style, `"dep"` or `"ent"`. Defaults to `"dep"`. ~~str~~ | +| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ | +| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ | +| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ | +| `manual` | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ | +| `port` | Port to serve visualization. Defaults to `5000`. ~~int~~ | +| `host` | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~ | ### displacy.render {#displacy.render tag="method" new="2"} @@ -200,16 +202,16 @@ Render a dependency parse tree or named entity visualization. > html = displacy.render(doc, style="dep") > ``` -| Name | Type | Description | Default | -| ----------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | -| `docs` | list, `Doc`, `Span` | Document(s) to visualize. | -| `style` | str | Visualization style, `'dep'` or `'ent'`. | `'dep'` | -| `page` | bool | Render markup as full HTML page. | `False` | -| `minify` | bool | Minify HTML markup. | `False` | -| `jupyter` | bool | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None`. | `None` | -| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. 
| `{}` | -| `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` | -| **RETURNS** | str | Rendered HTML markup. | +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ | +| `style` | Visualization style, `"dep"` or `"ent"`. Defaults to `"dep"`. ~~str~~ | +| `page` | Render markup as full HTML page. Defaults to `False`. ~~bool~~ | +| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ | +| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ | +| `manual` | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ | +| `jupyter` | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None` (default). ~~Optional[bool]~~ | +| **RETURNS** | The rendered HTML markup. ~~str~~ | ### Visualizer options {#displacy_options} @@ -225,22 +227,22 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="dep", options=options) > ``` -| Name | Type | Description | Default | -| ------------------------------------------ | ---- | --------------------------------------------------------------------------------------------------------------- | ----------------------- | -| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` | -| `add_lemma` 2.2.4 | bool | Print the lemma's in a separate row below the token texts. 
| `False` | -| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` | -| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` | -| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | -| `color` | str | Text color (HEX, RGB or color names). | `'#000000'` | -| `bg` | str | Background color (HEX, RGB or color names). | `'#ffffff'` | -| `font` | str | Font name or font family for all text. | `'Arial'` | -| `offset_x` | int | Spacing on left side of the SVG in px. | `50` | -| `arrow_stroke` | int | Width of arrow path in px. | `2` | -| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) | -| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) | -| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` | -| `distance` | int | Distance between words in px. | `175` / `150` (compact) | +| Name | Description | +| ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- | +| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ | +| `add_lemma` 2.2.4 | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ | +| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ | +| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ | +| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | +| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. 
~~str~~ | +| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ | +| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | +| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ | +| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ | +| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ | +| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ | +| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ | +| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ | #### Named Entity Visualizer options {#displacy_options-ent} @@ -252,11 +254,11 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="ent", options=options) > ``` -| Name | Type | Description | Default | -| --------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ | -| `ents` | list | Entity types to highlight (`None` for all types). | `None` | -| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` | -| `template` 2.2 | str | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. 
| see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) | +| Name | Description | +| --------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `ents` | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~ | +| `colors` | Color overrides. Entity types should be mapped to color names or values. ~~Dict[str, str]~~ | +| `template` 2.2 | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ | By default, displaCy comes with colors for all entity types used by [spaCy models](/models). If you're using custom entity types, you can use the @@ -280,43 +282,44 @@ concept of function registries. spaCy also uses the function registry for language subclasses, model architecture, lookups and pipeline component factories. 
- - > #### Example > > ```python +> from typing import Iterator > import spacy -> from thinc.api import Model > -> @spacy.registry.architectures("CustomNER.v1") -> def custom_ner(n0: int) -> Model: -> return Model("custom", forward, dims={"nO": nO}) +> @spacy.registry.schedules("waltzing.v1") +> def waltzing() -> Iterator[float]: +> i = 0 +> while True: +> yield i % 3 + 1 +> i += 1 > ``` -| Registry name | Description | -| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. | -| `factories` | Registry for functions that create [pipeline components](/usage/processing-pipelines#custom-components). Added automatically when you use the `@spacy.component` decorator and also reads from [entry points](/usage/saving-loading#entry-points) | -| `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. | -| `languages` | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). | -| `lookups` | Registry for large lookup tables available via `vocab.lookups`. | -| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). | -| `assets` | Registry for data assets, knowledge bases etc. | -| `callbacks` | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. 
| -| `readers` | Registry for training and evaluation data readers like [`Corpus`](/api/corpus). | -| `batchers` | Registry for training and evaluation [data batchers](#batchers). | -| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). | -| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). | -| `layers` | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). | -| `losses` | Registry for functions that create [losses](https://thinc.ai/docs/api-loss). | -| `initializers` | Registry for functions that create [initializers](https://thinc.ai/docs/api-initializers). | +| Registry name | Description | +| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. | +| `assets` | Registry for data assets, knowledge bases etc. | +| `batchers` | Registry for training and evaluation [data batchers](#batchers). | +| `callbacks` | Registry for custom callbacks to [modify the `nlp` object](/usage/training#custom-code-nlp-callbacks) before training. | +| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). | +| `factories` | Registry for functions that create [pipeline components](/usage/processing-pipelines#custom-components). Added automatically when you use the `@spacy.component` decorator and also reads from [entry points](/usage/saving-loading#entry-points). 
| +| `initializers` | Registry for functions that create [initializers](https://thinc.ai/docs/api-initializers). | +| `languages` | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). | +| `layers` | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). | +| `lookups` | Registry for large lookup tables available via `vocab.lookups`. | +| `losses` | Registry for functions that create [losses](https://thinc.ai/docs/api-loss). | +| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). | +| `readers` | Registry for training and evaluation data readers like [`Corpus`](/api/corpus). | +| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). | +| `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. | ### spacy-transformers registry {#registry-transformers} The following registries are added by the [`spacy-transformers`](https://github.com/explosion/spacy-transformers) package. See the [`Transformer`](/api/transformer) API reference and -[usage docs](/usage/transformers) for details. +[usage docs](/usage/embeddings-transformers) for details. > #### Example > @@ -338,7 +341,17 @@ See the [`Transformer`](/api/transformer) API reference and ## Batchers {#batchers source="spacy/gold/batchers.py" new="3"} - +A data batcher implements a batching strategy that essentially turns a stream of +items into a stream of batches, with each batch consisting of one item or a list +of items. During training, the models update their weights after processing one +batch at a time. Typical batching strategies include presenting the training +data as a stream of batches with similar sizes, or with increasing batch sizes. 
+See the Thinc documentation on +[`schedules`](https://thinc.ai/docs/api-schedules) for a few standard examples. + +Instead of using one of the built-in batchers listed here, you can also +[implement your own](/usage/training#custom-code-readers-batchers), which may or +may not use a custom schedule. #### batch_by_words.v1 {#batch_by_words tag="registered function"} @@ -359,13 +372,13 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument > get_length = null > ``` -| Name | Type | Description | -| ------------------ | ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `seqs` | `Iterable[Any]` | The sequences to minibatch. | -| `size` | `Iterable[int]` / int | The target number of words per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). | -| `tolerance` | float | What percentage of the size to allow batches to exceed. | -| `discard_oversize` | bool | Whether to discard sequences that by themselves exceed the tolerated size. | -| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. | +| Name | Description | +| ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `seqs` | The sequences to minibatch. ~~Iterable[Any]~~ | +| `size` | The target number of words per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ | +| `tolerance` | What percentage of the size to allow batches to exceed. 
~~float~~ | +| `discard_oversize` | Whether to discard sequences that by themselves exceed the tolerated size. ~~bool~~ | +| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ | #### batch_by_sequence.v1 {#batch_by_sequence tag="registered function"} @@ -380,10 +393,10 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument Create a batcher that creates batches of the specified size. -| Name | Type | Description | -| ------------ | ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `size` | `Iterable[int]` / int | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). | -| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. | +| Name | Description | +| ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `size` | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ | +| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ | #### batch_by_padded.v1 {#batch_by_padded tag="registered function"} @@ -403,12 +416,12 @@ sequences binned by length within a window. The padded size is defined as the maximum length of sequences within the batch multiplied by the number of sequences in the batch. 
-| Name | Type | Description | -| ------------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `size` | `Iterable[int]` / int | The largest padded size to batch sequences into. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). | -| `buffer` | int | The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is very large, the iteration order will be less random, which can result in suboptimal training. | -| `discard_oversize` | bool | Whether to discard sequences that are by themselves longer than the largest padded batch size. | -| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. | +| Name | Description | +| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `size` | The largest padded size to batch sequences into. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ | +| `buffer` | The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is very large, the iteration order will be less random, which can result in suboptimal training. ~~int~~ | +| `discard_oversize` | Whether to discard sequences that are by themselves longer than the largest padded batch size. 
~~bool~~ | +| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ | ## Training data and alignment {#gold source="spacy/gold"} @@ -436,11 +449,11 @@ single-token entity. > assert tags == ["O", "O", "U-LOC", "O"] > ``` -| Name | Type | Description | -| ----------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. | -| `entities` | iterable | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. | -| **RETURNS** | list | str strings, describing the [BILUO](/usage/linguistic-features#accessing-ner) tags. | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `doc` | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. ~~Doc~~ | +| `entities` | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, Union[str, int]]]~~ | +| **RETURNS** | A list of strings, describing the [BILUO](/usage/linguistic-features#accessing-ner) tags. 
~~List[str]~~ | ### gold.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"} @@ -458,11 +471,11 @@ Encode per-token tags following the > assert entities == [(7, 13, "LOC")] > ``` -| Name | Type | Description | -| ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The document that the BILUO tags refer to. | -| `entities` | iterable | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. | -| **RETURNS** | list | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `doc` | The document that the BILUO tags refer to. ~~Doc~~ | +| `entities` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ | +| **RETURNS** | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, str]]~~ | ### gold.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"} @@ -481,11 +494,11 @@ token-based tags, e.g. to overwrite the `doc.ents`. 
> doc.ents = spans_from_biluo_tags(doc, tags) > ``` -| Name | Type | Description | -| ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The document that the BILUO tags refer to. | -| `entities` | iterable | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. | -| **RETURNS** | list | A sequence of `Span` objects with added entity labels. | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `doc` | The document that the BILUO tags refer to. ~~Doc~~ | +| `entities` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ | +| **RETURNS** | A sequence of `Span` objects with added entity labels. ~~List[Span]~~ | ## Utility functions {#util source="spacy/util.py"} @@ -497,14 +510,13 @@ page should be safe to use and we'll try to ensure backwards compatibility. However, we recommend having additional tests in place if your application depends on any of spaCy's utilities. - - ### util.get_lang_class {#util.get_lang_class tag="function"} Import and load a `Language` class. Allows lazy-loading [language data](/usage/adding-languages) and importing languages using the two-letter language code. 
To add a language code for a custom language class, -you can use the [`set_lang_class`](/api/top-level#util.set_lang_class) helper. +you can register it using the [`@registry.languages`](/api/top-level#registry) +decorator. > #### Example > @@ -514,36 +526,14 @@ you can use the [`set_lang_class`](/api/top-level#util.set_lang_class) helper. > lang = lang_class() > ``` -| Name | Type | Description | -| ----------- | ---------- | -------------------------------------- | -| `lang` | str | Two-letter language code, e.g. `'en'`. | -| **RETURNS** | `Language` | Language class. | - -### util.set_lang_class {#util.set_lang_class tag="function"} - -Set a custom `Language` class name that can be loaded via -[`get_lang_class`](/api/top-level#util.get_lang_class). If your model uses a -custom language, this is required so that spaCy can load the correct class from -the two-letter language code. - -> #### Example -> -> ```python -> from spacy.lang.xy import CustomLanguage -> -> util.set_lang_class('xy', CustomLanguage) -> lang_class = util.get_lang_class('xy') -> nlp = lang_class() -> ``` - -| Name | Type | Description | -| ------ | ---------- | -------------------------------------- | -| `name` | str | Two-letter language code, e.g. `'en'`. | -| `cls` | `Language` | The language class, e.g. `English`. | +| Name | Description | +| ----------- | ---------------------------------------------- | +| `lang` | Two-letter language code, e.g. `"en"`. ~~str~~ | +| **RETURNS** | The respective subclass. ~~Language~~ | ### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"} -Check whether a `Language` class is already loaded. `Language` classes are +Check whether a `Language` subclass is already loaded. `Language` subclasses are loaded lazily, to avoid expensive setup code associated with the language data. > #### Example @@ -554,19 +544,19 @@ loaded lazily, to avoid expensive setup code associated with the language data. 
> assert util.lang_class_is_loaded("de") is False > ``` -| Name | Type | Description | -| ----------- | ---- | -------------------------------------- | -| `name` | str | Two-letter language code, e.g. `'en'`. | -| **RETURNS** | bool | Whether the class has been loaded. | +| Name | Description | +| ----------- | ---------------------------------------------- | +| `name` | Two-letter language code, e.g. `"en"`. ~~str~~ | +| **RETURNS** | Whether the class has been loaded. ~~bool~~ | ### util.load_model {#util.load_model tag="function" new="2"} Load a model from a package or data path. If called with a package name, spaCy will assume the model is a Python package and import and call its `load()` method. If called with a path, spaCy will assume it's a data directory, read the -language and pipeline settings from the meta.json and initialize a `Language` -class. The model data will then be loaded in via -[`Language.from_disk()`](/api/language#from_disk). +language and pipeline settings from the [`config.cfg`](/api/data-formats#config) +and create a `Language` object. The model data will then be loaded in via +[`Language.from_disk`](/api/language#from_disk). > #### Example > @@ -576,31 +566,13 @@ class. The model data will then be loaded in via > nlp = util.load_model("/path/to/data") > ``` -| Name | Type | Description | -| ------------- | ---------- | -------------------------------------------------------- | -| `name` | str | Package name or model path. | -| `**overrides` | - | Specific overrides, like pipeline components to disable. | -| **RETURNS** | `Language` | `Language` class with the loaded model. | - -### util.load_model_from_path {#util.load_model_from_path tag="function" new="2"} - -Load a model from a data directory path. Creates the [`Language`](/api/language) -class and pipeline based on the directory's meta.json and then calls -[`from_disk()`](/api/language#from_disk) with the path. 
This function also makes -it easy to test a new model that you haven't packaged yet. - -> #### Example -> -> ```python -> nlp = load_model_from_path("/path/to/data") -> ``` - -| Name | Type | Description | -| ------------- | ---------- | ---------------------------------------------------------------------------------------------------- | -| `model_path` | str | Path to model data directory. | -| `meta` | dict | Model meta data. If `False`, spaCy will try to load the meta from a meta.json in the same directory. | -| `**overrides` | - | Specific overrides, like pipeline components to disable. | -| **RETURNS** | `Language` | `Language` class with the loaded model. | +| Name | Description | +| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | Package name or model path. ~~str~~ | +| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | +| `disable` | Names of pipeline components to disable. ~~Iterable[str]~~ | +| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ | +| **RETURNS** | `Language` object with the loaded model. ~~Language~~ | ### util.load_model_from_init_py {#util.load_model_from_init_py tag="function" new="2"} @@ -616,26 +588,66 @@ A helper function to use in the `load()` method of a model package's > return load_model_from_init_py(__file__, **overrides) > ``` -| Name | Type | Description | -| ------------- | ---------- | -------------------------------------------------------- | -| `init_file` | str | Path to model's `__init__.py`, i.e. `__file__`. | -| `**overrides` | - | Specific overrides, like pipeline components to disable. | -| **RETURNS** | `Language` | `Language` class with the loaded model.
| +| Name | Description | +| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | +| `init_file` | Path to model's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ | +| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ | +| `disable` | Names of pipeline components to disable. ~~Iterable[str]~~ | +| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ | +| **RETURNS** | `Language` object with the loaded model. ~~Language~~ | -### util.get_model_meta {#util.get_model_meta tag="function" new="2"} +### util.load_config {#util.load_config tag="function" new="3"} -Get a model's meta.json from a directory path and validate its contents. +Load a model's [`config.cfg`](/api/data-formats#config) from a file path. The +config typically includes details about the model pipeline and how its +components are created, as well as all training settings and hyperparameters. > #### Example > > ```python -> meta = util.get_model_meta("/path/to/model") +> config = util.load_config("/path/to/model/config.cfg") +> print(config.to_str()) > ``` -| Name | Type | Description | -| ----------- | ------------ | ------------------------ | -| `path` | str / `Path` | Path to model directory. | -| **RETURNS** | dict | The model's meta data. | +| Name | Description | +| ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `path` | Path to the model's `config.cfg`. ~~Union[str, Path]~~ | +| `overrides` | Optional config overrides to replace in loaded config. Can be provided as nested dict, or as flat dict with keys in dot notation, e.g.
`"nlp.pipeline"`. ~~Dict[str, Any]~~ | +| `interpolate` | Whether to interpolate the config and replace variables like `${paths.train}` with their values. Defaults to `False`. ~~bool~~ | +| **RETURNS** | The model's config. ~~Config~~ | + +### util.load_meta {#util.load_meta tag="function" new="3"} + +Get a model's [`meta.json`](/api/data-formats#meta) from a file path and +validate its contents. + +> #### Example +> +> ```python +> meta = util.load_meta("/path/to/model/meta.json") +> ``` + +| Name | Description | +| ----------- | ----------------------------------------------------- | +| `path` | Path to the model's `meta.json`. ~~Union[str, Path]~~ | +| **RETURNS** | The model's meta data. ~~Dict[str, Any]~~ | + +### util.get_installed_models {#util.get_installed_models tag="function" new="3"} + +List all model packages installed in the current environment. This will include +any spaCy model that was packaged with [`spacy package`](/api/cli#package). +Under the hood, model packages expose a Python entry point that spaCy can check, +without having to load the model. + +> #### Example +> +> ```python +> model_names = util.get_installed_models() +> ``` + +| Name | Description | +| ----------- | ---------------------------------------------------------------------------------- | +| **RETURNS** | The string names of the models installed in the current environment. ~~List[str]~~ | ### util.is_package {#util.is_package tag="function"} @@ -649,10 +661,10 @@ Check if string maps to a package installed via pip. Mainly used to validate > util.is_package("xyz") # False > ``` -| Name | Type | Description | -| ----------- | ------ | -------------------------------------------- | -| `name` | str | Name of package. | -| **RETURNS** | `bool` | `True` if installed package, `False` if not. | +| Name | Description | +| ----------- | ----------------------------------------------------- | +| `name` | Name of package. 
~~str~~ | +| **RETURNS** | `True` if installed package, `False` if not. ~~bool~~ | ### util.get_package_path {#util.get_package_path tag="function" new="2"} @@ -666,10 +678,10 @@ Get path to an installed package. Mainly used to resolve the location of > # /usr/lib/python3.6/site-packages/en_core_web_sm > ``` -| Name | Type | Description | -| -------------- | ------ | -------------------------------- | -| `package_name` | str | Name of installed package. | -| **RETURNS** | `Path` | Path to model package directory. | +| Name | Description | +| -------------- | ----------------------------------------- | +| `package_name` | Name of installed package. ~~str~~ | +| **RETURNS** | Path to model package directory. ~~Path~~ | ### util.is_in_jupyter {#util.is_in_jupyter tag="function" new="2"} @@ -686,9 +698,9 @@ detecting the IPython kernel. Mainly used for the > display(HTML(html)) > ``` -| Name | Type | Description | -| ----------- | ---- | ------------------------------------- | -| **RETURNS** | bool | `True` if in Jupyter, `False` if not. | +| Name | Description | +| ----------- | ---------------------------------------------- | +| **RETURNS** | `True` if in Jupyter, `False` if not. ~~bool~~ | ### util.compile_prefix_regex {#util.compile_prefix_regex tag="function"} @@ -702,10 +714,10 @@ Compile a sequence of prefix rules into a regex object. > nlp.tokenizer.prefix_search = prefix_regex.search > ``` -| Name | Type | Description | -| ----------- | ------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | -| `entries` | tuple | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). | -| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object. 
to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). | +| Name | Description | +| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `entries` | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | +| **RETURNS** | The regex object to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~ | ### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"} @@ -719,10 +731,10 @@ Compile a sequence of suffix rules into a regex object. > nlp.tokenizer.suffix_search = suffix_regex.search > ``` -| Name | Type | Description | -| ----------- | ------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | -| `entries` | tuple | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). | -| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object. to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). | +| Name | Description | +| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `entries` | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | +| **RETURNS** | The regex object to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes).
~~Pattern~~ | ### util.compile_infix_regex {#util.compile_infix_regex tag="function"} @@ -736,10 +748,10 @@ Compile a sequence of infix rules into a regex object. > nlp.tokenizer.infix_finditer = infix_regex.finditer > ``` -| Name | Type | Description | -| ----------- | ------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- | -| `entries` | tuple | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). | -| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object. to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). | +| Name | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `entries` | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | +| **RETURNS** | The regex object to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~ | ### util.minibatch {#util.minibatch tag="function" new="2"} @@ -754,11 +766,11 @@ vary on each step. > nlp.update(batch) > ``` -| Name | Type | Description | -| ---------- | -------------- | ---------------------- | -| `items` | iterable | The items to batch up. | -| `size` | int / iterable | The batch size(s). | -| **YIELDS** | list | The batches. | +| Name | Description | +| ---------- | ---------------------------------------- | +| `items` | The items to batch up. ~~Iterable[Any]~~ | +| `size` | The batch size(s). ~~Union[int, Sequence[int]]~~ | +| **YIELDS** | The batches. ~~Iterable[List[Any]]~~
| ### util.filter_spans {#util.filter_spans tag="function" new="2.1.4"} @@ -776,17 +788,30 @@ of one entity) or when merging spans with > filtered = filter_spans(spans) > ``` -| Name | Type | Description | -| ----------- | -------- | -------------------- | -| `spans` | iterable | The spans to filter. | -| **RETURNS** | list | The filtered spans. | +| Name | Description | +| ----------- | --------------------------------------- | +| `spans` | The spans to filter. ~~Iterable[Span]~~ | +| **RETURNS** | The filtered spans. ~~List[Span]~~ | ### util.get_words_and_spaces {#get_words_and_spaces tag="function" new="3"} - +Given a list of words and a text, reconstruct the original tokens and return a +list of words and spaces that can be used to create a [`Doc`](/api/doc#init). +This can help recover destructive tokenization that didn't preserve any +whitespace information. -| Name | Type | Description | -| ----------- | ----- | ----------- | -| `words` | list | | -| `text` | str | | -| **RETURNS** | tuple | | +> #### Example +> +> ```python +> orig_words = ["Hey", ",", "what", "'s", "up", "?"] +> orig_text = "Hey, what's up?" +> words, spaces = get_words_and_spaces(orig_words, orig_text) +> # ['Hey', ',', 'what', "'s", 'up', '?'] +> # [False, True, False, True, False, False] +> ``` + +| Name | Description | +| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | +| `words` | The list of words. ~~Iterable[str]~~ | +| `text` | The original text. ~~str~~ | +| **RETURNS** | A list of words and a list of boolean values indicating whether the word at this position is followed by a space. 
~~Tuple[List[str], List[bool]]~~ | diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index 0e4b066ed..c32651e02 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -41,7 +41,8 @@ token, the spaCy token receives the sum of their values. To access the values, you can use the custom [`Doc._.trf_data`](#custom-attributes) attribute. The package also adds the function registries [`@span_getters`](#span_getters) and [`@annotation_setters`](#annotation_setters) with several built-in registered -functions. For more details, see the [usage documentation](/usage/transformers). +functions. For more details, see the +[usage documentation](/usage/embeddings-transformers). ## Config and implementation {#config} @@ -60,11 +61,11 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("transformer", config=DEFAULT_CONFIG) > ``` -| Setting | Type | Description | Default | -| ------------------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | -| `max_batch_items` | int | Maximum size of a padded batch. | `4096` | -| `annotation_setter` | Callable | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. By default, no additional annotations are set. | `null_annotation_setter` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** [`FullTransformerBatch`](/api/transformer#fulltransformerbatch). 
The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. | [TransformerModel](/api/architectures#TransformerModel) | +| Setting | Description | +| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `max_batch_items` | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ | +| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ | +| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ | ```python https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py @@ -101,14 +102,14 @@ attribute. You can also provide a callback to set additional annotations. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#create_pipe). -| Name | Type | Description | -| ------------------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. 
| -| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** [`FullTransformerBatch`](/api/transformer#fulltransformerbatch). The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. | -| `annotation_setter` | `Callable` | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. By default, no additional annotations are set. | -| _keyword-only_ | | | -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | -| `max_batch_items` | int | Maximum size of a padded batch. Defaults to `128*32`. | +| Name | Description | +| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. ~~Model[List[Doc], FullTransformerBatch]~~ | +| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs and can set additional annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. By default, no annotations are set. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ | +| _keyword-only_ | | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. 
~~str~~ | +| `max_batch_items` | Maximum size of a padded batch. Defaults to `128*32`. ~~int~~ | ## Transformer.\_\_call\_\_ {#call tag="method"} @@ -128,10 +129,10 @@ to the [`predict`](/api/transformer#predict) and > processed = transformer(doc) > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------ | -| `doc` | `Doc` | The document to process. | -| **RETURNS** | `Doc` | The processed document. | +| Name | Description | +| ----------- | -------------------------------- | +| `doc` | The document to process. ~~Doc~~ | +| **RETURNS** | The processed document. ~~Doc~~ | ## Transformer.pipe {#pipe tag="method"} @@ -150,12 +151,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/transformer#call) and > pass > ``` -| Name | Type | Description | -| -------------- | --------------- | ----------------------------------------------------- | -| `stream` | `Iterable[Doc]` | A stream of documents. | -| _keyword-only_ | | | -| `batch_size` | int | The number of documents to buffer. Defaults to `128`. | -| **YIELDS** | `Doc` | The processed documents in order. | +| Name | Description | +| -------------- | ------------------------------------------------------------- | +| `stream` | A stream of documents. ~~Iterable[Doc]~~ | +| _keyword-only_ | | +| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | +| **YIELDS** | The processed documents in order. ~~Doc~~ | ## Transformer.begin_training {#begin_training tag="method"} @@ -175,13 +176,13 @@ setting up the label scheme based on the data. 
> optimizer = trf.begin_training(lambda: [], pipeline=nlp.pipeline) > ``` -| Name | Type | Description | -| -------------- | --------------------------------------------------- | -------------------------------------------------------------------------------------------------------------- | -| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. | -| _keyword-only_ | | | -| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/transformer#create_optimizer) if not set. | -| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| **RETURNS** | The optimizer. ~~Optimizer~~ | ## Transformer.predict {#predict tag="method"} @@ -195,10 +196,10 @@ modifying them. > scores = trf.predict([doc1, doc2]) > ``` -| Name | Type | Description | -| ----------- | --------------- | ----------------------------------------- | -| `docs` | `Iterable[Doc]` | The documents to predict. | -| **RETURNS** | - | The model's prediction for each document. 
| +| Name | Description | +| ----------- | ------------------------------------------- | +| `docs` | The documents to predict. ~~Iterable[Doc]~~ | +| **RETURNS** | The model's prediction for each document. | ## Transformer.set_annotations {#set_annotations tag="method"} @@ -215,10 +216,10 @@ callback is then called, if provided. > trf.set_annotations(docs, scores) > ``` -| Name | Type | Description | -| -------- | --------------- | ----------------------------------------------------- | -| `docs` | `Iterable[Doc]` | The documents to modify. | -| `scores` | - | The scores to set, produced by `Transformer.predict`. | +| Name | Description | +| -------- | ----------------------------------------------------- | +| `docs` | The documents to modify. ~~Iterable[Doc]~~ | +| `scores` | The scores to set, produced by `Transformer.predict`. | ## Transformer.update {#update tag="method"} @@ -244,15 +245,15 @@ and call the optimizer, while the others simply increment the gradients. > losses = trf.update(examples, sgd=optimizer) > ``` -| Name | Type | Description | -| ----------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. | -| _keyword-only_ | | | -| `drop` | float | The dropout rate. | -| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/transformer#set_annotations). | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | -| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. 
| -| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | +| Name | Description | +| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## Transformer.create_optimizer {#create_optimizer tag="method"} @@ -265,9 +266,9 @@ Create an optimizer for the pipeline component. > optimizer = trf.create_optimizer() > ``` -| Name | Type | Description | -| ----------- | --------------------------------------------------- | -------------- | -| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| Name | Description | +| ----------- | ---------------------------- | +| **RETURNS** | The optimizer. ~~Optimizer~~ | ## Transformer.use_params {#use_params tag="method, contextmanager"} @@ -282,9 +283,9 @@ context, the original parameters are restored. > trf.to_disk("/best_model") > ``` -| Name | Type | Description | -| -------- | ---- | ----------------------------------------- | -| `params` | dict | The parameter values to use in the model. 
| +| Name | Description | +| -------- | -------------------------------------------------- | +| `params` | The parameter values to use in the model. ~~dict~~ | ## Transformer.to_disk {#to_disk tag="method"} @@ -297,11 +298,11 @@ Serialize the pipe to disk. > trf.to_disk("/path/to/transformer") > ``` -| Name | Type | Description | -| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | ## Transformer.from_disk {#from_disk tag="method"} @@ -314,12 +315,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > trf.from_disk("/path/to/transformer") > ``` -| Name | Type | Description | -| -------------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tok2Vec` | The modified `Tok2Vec` object. 
| +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The modified `Transformer` object. ~~Transformer~~ | ## Transformer.to_bytes {#to_bytes tag="method"} @@ -332,11 +333,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------- | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Tok2Vec` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The serialized form of the `Transformer` object. ~~bytes~~ | ## Transformer.from_bytes {#from_bytes tag="method"} @@ -350,12 +351,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > trf.from_bytes(trf_bytes) > ``` -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tok2Vec` | The `Tok2Vec` object. 
| +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| `bytes_data` | The data to load from. ~~bytes~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The `Transformer` object. ~~Transformer~~ | ## Serialization fields {#serialization-fields} @@ -386,20 +387,20 @@ by this class. Instances of this class are`typically assigned to the [Doc._.trf_data`](/api/transformer#custom-attributes) extension attribute. -| Name | Type | Description | -| --------- | -------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `tokens` | `Dict` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts, and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. | -| `tensors` | `List[FloatsXd]` | The activations for the Doc from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. | -| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. 
The actual indices are provided at `align[i].dataXd`. | -| `width` | int | The width of the last hidden layer. | +| Name | Description | +| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts, and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ | +| `tensors` | The activations for the Doc from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. ~~List[FloatsXd]~~ | +| `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ | +| `width` | The width of the last hidden layer. ~~int~~ | ### TransformerData.empty {#transformerdata-emoty tag="classmethod"} Create an empty `TransformerData` container. -| Name | Type | Description | -| ----------- | ----------------- | -------------- | -| **RETURNS** | `TransformerData` | The container. | +| Name | Description | +| ----------- | ---------------------------------- | +| **RETURNS** | The container. 
~~TransformerData~~ | ## FullTransformerBatch {#fulltransformerbatch tag="dataclass"} @@ -407,13 +408,13 @@ Holds a batch of input and output objects for a transformer model. The data can then be split to a list of [`TransformerData`](/api/transformer#transformerdata) objects to associate the outputs to each [`Doc`](/api/doc) in the batch. -| Name | Type | Description | -| ---------- | -------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `spans` | `List[List[Span]]` | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each Span can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each Span may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. | -| `tokens` | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) | The output of the tokenizer. | -| `tensors` | `List[torch.Tensor]` | The output of the transformer model. | -| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | Alignment from the spaCy tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. 
| -| `doc_data` | `List[TransformerData]` | The outputs, split per `Doc` object. | +| Name | Description | +| ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `spans` | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each Span can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each Span may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. ~~List[List[Span]]~~ | +| `tokens` | The output of the tokenizer. ~~transformers.BatchEncoding~~ | +| `tensors` | The output of the transformer model. ~~List[torch.Tensor]~~ | +| `align` | Alignment from the spaCy tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ | +| `doc_data` | The outputs, split per `Doc` object. ~~List[TransformerData]~~ | ### FullTransformerBatch.unsplit_by_doc {#fulltransformerbatch-unsplit_by_doc tag="method"} @@ -422,19 +423,19 @@ current object's spans, tokens and alignment. This is used during the backward pass, in order to construct the gradients to pass back into the transformer model. 
-| Name | Type | Description | -| ----------- | ---------------------- | ------------------------------- | -| `arrays` | `List[List[Floats3d]]` | The split batch of activations. | -| **RETURNS** | `FullTransformerBatch` | The transformer batch. | +| Name | Description | +| ----------- | -------------------------------------------------------- | +| `arrays` | The split batch of activations. ~~List[List[Floats3d]]~~ | +| **RETURNS** | The transformer batch. ~~FullTransformerBatch~~ | ### FullTransformerBatch.split_by_doc {#fulltransformerbatch-split_by_doc tag="method"} Split a `TransformerData` object that represents a batch into a list with one `TransformerData` per `Doc`. -| Name | Type | Description | -| ----------- | ----------------------- | ---------------- | -| **RETURNS** | `List[TransformerData]` | The split batch. | +| Name | Description | +| ----------- | ------------------------------------------ | +| **RETURNS** | The split batch. ~~List[TransformerData]~~ | ## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"} @@ -460,10 +461,10 @@ decorator. > return get_sent_spans > ``` -| Name | Type | Description | -| ----------- | ------------------ | ---------------------------------------- | -| `docs` | `Iterable[Doc]` | A batch of `Doc` objects. | -| **RETURNS** | `List[List[Span]]` | The spans to process by the transformer. | +| Name | Description | +| ----------- | ------------------------------------------------------------- | +| `docs` | A batch of `Doc` objects. ~~Iterable[Doc]~~ | +| **RETURNS** | The spans to process by the transformer. ~~List[List[Span]]~~ | ### doc_spans.v1 {#doc_spans tag="registered function"} @@ -510,10 +511,10 @@ than `window` will allow for an overlap, so that some tokens are counted twice. This can be desirable, because it allows all tokens to have both a left and right context. 
-| Name | Type | Description | -| --------- | ---- | ---------------- | -| Β `window` | int | The window size. | -| `stride` | int | The stride size. | +| Name | Description | +| -------- | ------------------------ | +| `window` | The window size. ~~int~~ | +| `stride` | The stride size. ~~int~~ | ## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"} @@ -526,7 +527,7 @@ You can register custom annotation setters using the > #### Example > > ```python -> @registry.annotation_setters("spacy-transformer.null_annotation_setter.v1") +> @registry.annotation_setters("spacy-transformers.null_annotation_setter.v1") > def configure_null_annotation_setter() -> Callable: > def setter(docs: List[Doc], trf_data: FullTransformerBatch) -> None: > pass @@ -534,22 +535,22 @@ You can register custom annotation setters using the > return setter > ``` -| Name | Type | Description | -| ---------- | ---------------------- | ------------------------------------ | -| `docs` | `List[Doc]` | A batch of `Doc` objects. | -| `trf_data` | `FullTransformerBatch` | The transformers data for the batch. | +| Name | Description | +| ---------- | ------------------------------------------------------------- | +| `docs` | A batch of `Doc` objects. ~~List[Doc]~~ | +| `trf_data` | The transformers data for the batch. ~~FullTransformerBatch~~ | The following built-in functions are available: -| Name | Description | -| --------------------------------------------- | ------------------------------------- | -| `spacy-transformer.null_annotation_setter.v1` | Don't set any additional annotations. | +| Name | Description | +| ---------------------------------------------- | ------------------------------------- | +| `spacy-transformers.null_annotation_setter.v1` | Don't set any additional annotations. 
| ## Custom attributes {#custom-attributes} The component sets the following [custom extension attributes](/usage/processing-pipeline#custom-components-attributes): -| Name | Type | Description | -| -------------- | ----------------------------------------------------- | ---------------------------------------------------- | -| `Doc.trf_data` | [`TransformerData`](/api/transformer#transformerdata) | Transformer tokens and outputs for the `Doc` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------ | +| `Doc._.trf_data` | Transformer tokens and outputs for the `Doc` object. ~~TransformerData~~ | diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index bfb49e9a2..7e97b4ca3 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -30,13 +30,13 @@ you can add vectors to later. > vectors = Vectors(data=data, keys=keys) > ``` -| Name | Type | Description | -| -------------- | ---------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| _keyword-only_ | | | -| `shape` | tuple | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. | -| `data` | `ndarray[ndim=1, dtype='float32']` | The vector data. | -| `keys` | iterable | A sequence of keys aligned with the data. | -| `name` | str | A name to identify the vectors table. | +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. 
Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ | +| `data` | The vector data. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | +| `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ | +| `name` | A name to identify the vectors table. ~~str~~ | ## Vectors.\_\_getitem\_\_ {#getitem tag="method"} @@ -51,10 +51,10 @@ raised. > assert cat_vector == nlp.vocab["cat"].vector > ``` -| Name | Type | Description | -| ------- | ---------------------------------- | ------------------------------ | -| `key` | int | The key to get the vector for. | -| returns | `ndarray[ndim=1, dtype='float32']` | The vector for the key. | +| Name | Description | +| ----------- | ---------------------------------------------------------------- | +| `key` | The key to get the vector for. ~~int~~ | +| **RETURNS** | The vector for the key. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | ## Vectors.\_\_setitem\_\_ {#setitem tag="method"} @@ -68,10 +68,10 @@ Set a vector for the given key. > nlp.vocab.vectors[cat_id] = vector > ``` -| Name | Type | Description | -| -------- | ---------------------------------- | ------------------------------ | -| `key` | int | The key to set the vector for. | -| `vector` | `ndarray[ndim=1, dtype='float32']` | The vector to set. | +| Name | Description | +| -------- | ----------------------------------------------------------- | +| `key` | The key to set the vector for. ~~int~~ | +| `vector` | The vector to set. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | ## Vectors.\_\_iter\_\_ {#iter tag="method"} @@ -84,9 +84,9 @@ Iterate over the keys in the table. > print(key, nlp.vocab.strings[key]) > ``` -| Name | Type | Description | -| ---------- | ---- | ------------------- | -| **YIELDS** | int | A key in the table. | +| Name | Description | +| ---------- | --------------------------- | +| **YIELDS** | A key in the table. 
~~int~~ | ## Vectors.\_\_len\_\_ {#len tag="method"} @@ -99,9 +99,9 @@ Return the number of vectors in the table. > assert len(vectors) == 3 > ``` -| Name | Type | Description | -| ----------- | ---- | ----------------------------------- | -| **RETURNS** | int | The number of vectors in the table. | +| Name | Description | +| ----------- | ------------------------------------------- | +| **RETURNS** | The number of vectors in the table. ~~int~~ | ## Vectors.\_\_contains\_\_ {#contains tag="method"} @@ -115,10 +115,10 @@ Check whether a key has been mapped to a vector entry in the table. > assert cat_id in vectors > ``` -| Name | Type | Description | -| ----------- | ---- | ----------------------------------- | -| `key` | int | The key to check. | -| **RETURNS** | bool | Whether the key has a vector entry. | +| Name | Description | +| ----------- | -------------------------------------------- | +| `key` | The key to check. ~~int~~ | +| **RETURNS** | Whether the key has a vector entry. ~~bool~~ | ## Vectors.add {#add tag="method"} @@ -138,13 +138,13 @@ mapping separately. If you need to manage the strings, you should use the > nlp.vocab.vectors.add("dog", row=0) > ``` -| Name | Type | Description | -| -------------- | ---------------------------------- | ----------------------------------------------------- | -| `key` | str / int | The key to add. | -| _keyword-only_ | | | -| `vector` | `ndarray[ndim=1, dtype='float32']` | An optional vector to add for the key. | -| `row` | int | An optional row number of a vector to map the key to. | -| **RETURNS** | int | The row the vector was added to. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------- | +| `key` | The key to add. ~~Union[str, int]~~ | +| _keyword-only_ | | +| `vector` | An optional vector to add for the key. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | +| `row` | An optional row number of a vector to map the key to. 
~~int~~ | +| **RETURNS** | The row the vector was added to. ~~int~~ | ## Vectors.resize {#resize tag="method"} @@ -160,11 +160,11 @@ These removed items are returned as a list of `(key, row)` tuples. > removed = nlp.vocab.vectors.resize((10000, 300)) > ``` -| Name | Type | Description | -| ----------- | ----- | -------------------------------------------------------------------- | -| `shape` | tuple | A `(rows, dims)` tuple describing the number of rows and dimensions. | -| `inplace` | bool | Reallocate the memory. | -| **RETURNS** | list | The removed items as a list of `(key, row)` tuples. | +| Name | Description | +| ----------- | ---------------------------------------------------------------------------------------- | +| `shape` | A `(rows, dims)` tuple describing the number of rows and dimensions. ~~Tuple[int, int]~~ | +| `inplace` | Reallocate the memory. ~~bool~~ | +| **RETURNS** | The removed items as a list of `(key, row)` tuples. ~~List[Tuple[int, int]]~~ | ## Vectors.keys {#keys tag="method"} @@ -177,9 +177,9 @@ A sequence of the keys in the table. > print(key, nlp.vocab.strings[key]) > ``` -| Name | Type | Description | -| ----------- | -------- | ----------- | -| **RETURNS** | iterable | The keys. | +| Name | Description | +| ----------- | --------------------------- | +| **RETURNS** | The keys. ~~Iterable[int]~~ | ## Vectors.values {#values tag="method"} @@ -194,9 +194,9 @@ the length of the vectors table. > print(vector) > ``` -| Name | Type | Description | -| ---------- | ---------------------------------- | ---------------------- | -| **YIELDS** | `ndarray[ndim=1, dtype='float32']` | A vector in the table. | +| Name | Description | +| ---------- | --------------------------------------------------------------- | +| **YIELDS** | A vector in the table. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | ## Vectors.items {#items tag="method"} @@ -209,9 +209,9 @@ Iterate over `(key, vector)` pairs, in order. 
> print(key, nlp.vocab.strings[key], vector) > ``` -| Name | Type | Description | -| ---------- | ----- | -------------------------------- | -| **YIELDS** | tuple | `(key, vector)` pairs, in order. | +| Name | Description | +| ---------- | ------------------------------------------------------------------------------------- | +| **YIELDS** | `(key, vector)` pairs, in order. ~~Tuple[int, numpy.ndarray[ndim=1, dtype=float32]]~~ | ## Vectors.find {#find tag="method"} @@ -226,14 +226,14 @@ Look up one or more keys by row, or vice versa. > keys = nlp.vocab.vectors.find(rows=[18, 256, 985]) > ``` -| Name | Type | Description | -| -------------- | ------------------------------------- | ------------------------------------------------------------------------ | -| _keyword-only_ | | | -| `key` | str / int | Find the row that the given key points to. Returns int, `-1` if missing. | -| `keys` | iterable | Find rows that the keys point to. Returns `ndarray`. | -| `row` | int | Find the first key that points to the row. Returns int. | -| `rows` | iterable | Find the keys that point to the rows. Returns ndarray. | -| **RETURNS** | The requested key, keys, row or rows. | +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `key` | Find the row that the given key points to. Returns int, `-1` if missing. ~~Union[str, int]~~ | +| `keys` | Find rows that the keys point to. Returns `numpy.ndarray`. ~~Iterable[Union[str, int]]~~ | +| `row` | Find the first key that points to the row. Returns integer. ~~int~~ | +| `rows` | Find the keys that point to the rows. Returns `numpy.ndarray`. ~~Iterable[int]~~ | +| **RETURNS** | The requested key, keys, row or rows. ~~Union[int, numpy.ndarray[ndim=1, dtype=float32]]~~ | ## Vectors.shape {#shape tag="property"} @@ -250,9 +250,9 @@ vector table. 
> assert dims == 300 > ``` -| Name | Type | Description | -| ----------- | ----- | ---------------------- | -| **RETURNS** | tuple | A `(rows, dims)` pair. | +| Name | Description | +| ----------- | ------------------------------------------ | +| **RETURNS** | A `(rows, dims)` pair. ~~Tuple[int, int]~~ | ## Vectors.size {#size tag="property"} @@ -265,9 +265,9 @@ The vector size, i.e. `rows * dims`. > assert vectors.size == 150000 > ``` -| Name | Type | Description | -| ----------- | ---- | ---------------- | -| **RETURNS** | int | The vector size. | +| Name | Description | +| ----------- | ------------------------ | +| **RETURNS** | The vector size. ~~int~~ | ## Vectors.is_full {#is_full tag="property"} @@ -283,9 +283,9 @@ If a table is full, it can be resized using > assert vectors.is_full > ``` -| Name | Type | Description | -| ----------- | ---- | ---------------------------------- | -| **RETURNS** | bool | Whether the vectors table is full. | +| Name | Description | +| ----------- | ------------------------------------------- | +| **RETURNS** | Whether the vectors table is full. ~~bool~~ | ## Vectors.n_keys {#n_keys tag="property"} @@ -301,9 +301,9 @@ vectors, they will be counted individually. > assert vectors.n_keys == 0 > ``` -| Name | Type | Description | -| ----------- | ---- | ------------------------------------ | -| **RETURNS** | int | The number of all keys in the table. | +| Name | Description | +| ----------- | -------------------------------------------- | +| **RETURNS** | The number of all keys in the table. ~~int~~ | ## Vectors.most_similar {#most_similar tag="method"} @@ -320,14 +320,14 @@ performed in chunks, to avoid consuming too much memory. You can set the > most_similar = nlp.vocab.vectors.most_similar(queries, n=10) > ``` -| Name | Type | Description | -| -------------- | --------- | ------------------------------------------------------------------ | -| `queries` | `ndarray` | An array with one or more vectors. 
| -| _keyword-only_ | | | -| `batch_size` | int | The batch size to use. Default to `1024`. | -| `n` | int | The number of entries to return for each query. Defaults to `1`. | -| `sort` | bool | Whether to sort the entries returned by score. Defaults to `True`. | -| **RETURNS** | tuple | The most similar entries as a `(keys, best_rows, scores)` tuple. | +| Name | Description | +| -------------- | --------------------------------------------------------------------------- | +| `queries` | An array with one or more vectors. ~~numpy.ndarray~~ | +| _keyword-only_ | | +| `batch_size` | The batch size to use. Defaults to `1024`. ~~int~~ | +| `n` | The number of entries to return for each query. Defaults to `1`. ~~int~~ | +| `sort` | Whether to sort the entries returned by score. Defaults to `True`. ~~bool~~ | +| **RETURNS** | The most similar entries as a `(keys, best_rows, scores)` tuple. ~~Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]~~ | ## Vectors.to_disk {#to_disk tag="method"} @@ -340,9 +340,9 @@ Save the current state to a directory. > > ``` -| Name | Type | Description | -| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Description | +| ------ | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | ## Vectors.from_disk {#from_disk tag="method"} @@ -355,10 +355,10 @@ Loads state from a directory. Modifies the object in place and returns it.
> vectors.from_disk("/path/to/vectors") > ``` -| Name | Type | Description | -| ----------- | ------------ | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `Vectors` | The modified `Vectors` object. | +| Name | Description | +| ----------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| **RETURNS** | The modified `Vectors` object. ~~Vectors~~ | ## Vectors.to_bytes {#to_bytes tag="method"} @@ -370,9 +370,9 @@ Serialize the current state to a binary string. > vectors_bytes = vectors.to_bytes() > ``` -| Name | Type | Description | -| ----------- | ----- | -------------------------------------------- | -| **RETURNS** | bytes | The serialized form of the `Vectors` object. | +| Name | Description | +| ----------- | ------------------------------------------------------ | +| **RETURNS** | The serialized form of the `Vectors` object. ~~bytes~~ | ## Vectors.from_bytes {#from_bytes tag="method"} @@ -387,15 +387,15 @@ Load state from a binary string. > new_vectors.from_bytes(vectors_bytes) > ``` -| Name | Type | Description | -| ----------- | --------- | ---------------------- | -| `data` | bytes | The data to load from. | -| **RETURNS** | `Vectors` | The `Vectors` object. | +| Name | Description | +| ----------- | --------------------------------- | +| `data` | The data to load from. ~~bytes~~ | +| **RETURNS** | The `Vectors` object. ~~Vectors~~ | ## Attributes {#attributes} -| Name | Type | Description | -| --------- | ---------------------------------- | ------------------------------------------------------------------------------- | -| `data` | `ndarray[ndim=1, dtype='float32']` | Stored vectors data. `numpy` is used for CPU vectors, `cupy` for GPU vectors. 
| -| `key2row` | dict | Dictionary mapping word hashes to rows in the `Vectors.data` table. | -| `keys` | `ndarray[ndim=1, dtype='float32']` | Array keeping the keys in order, such that `keys[vectors.key2row[key]] == key`. | +| Name | Description | +| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `data` | Stored vectors data. `numpy` is used for CPU vectors, `cupy` for GPU vectors. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ | +| `key2row` | Dictionary mapping word hashes to rows in the `Vectors.data` table. ~~Dict[int, int]~~ | +| `keys` | Array keeping the keys in order, such that `keys[vectors.key2row[key]] == key`. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ | diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index 7e77762bb..71a678cb3 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -21,14 +21,15 @@ Create the vocabulary. > vocab = Vocab(strings=["hello", "world"]) > ``` -| Name | Type | Description | -| -------------------------------------------- | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `lex_attr_getters` | dict | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. | -| `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. | -| `lookups` | `Lookups` | A [`Lookups`](/api/lookups) that stores the `lemma_\*`, `lexeme_norm` and other large lookup tables. Defaults to `None`. | -| `lookups_extra` 2.3 | `Lookups` | A [`Lookups`](/api/lookups) that stores the optional `lexeme_cluster`/`lexeme_prob`/`lexeme_sentiment`/`lexeme_settings` lookup tables. 
Defaults to `None`. | -| `oov_prob` | float | The default OOV probability. Defaults to `-20.0`. | -| `vectors_name` 2.2 | str | A name to identify the vectors table. | +| Name | Description | +| ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `lex_attr_getters` | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. ~~Optional[Dict[str, Callable[[str], Any]]]~~ | +| `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ | +| `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ | +| `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ | +| `vectors_name` 2.2 | A name to identify the vectors table. ~~str~~ | +| `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ | +| `get_noun_chunks` | A function that yields base noun phrases, used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span]], Iterator[Span]]]~~ | ## Vocab.\_\_len\_\_ {#len tag="method"} @@ -41,9 +42,9 @@ Get the current number of lexemes in the vocabulary. > assert len(nlp.vocab) > 0 > ``` -| Name | Type | Description | -| ----------- | ---- | ---------------------------------------- | -| **RETURNS** | int | The number of lexemes in the vocabulary. | +| Name | Description | +| ----------- | ------------------------------------------------ | +| **RETURNS** | The number of lexemes in the vocabulary. ~~int~~ | ## Vocab.\_\_getitem\_\_ {#getitem tag="method"} @@ -57,10 +58,10 @@ given, a new lexeme is created and stored.
> assert nlp.vocab[apple] == nlp.vocab["apple"] > ``` -| Name | Type | Description | -| -------------- | --------- | ---------------------------------------- | -| `id_or_string` | int / str | The hash value of a word, or its string. | -| **RETURNS** | `Lexeme` | The lexeme indicated by the given ID. | +| Name | Description | +| -------------- | ------------------------------------------------------------ | +| `id_or_string` | The hash value of a word, or its string. ~~Union[int, str]~~ | +| **RETURNS** | The lexeme indicated by the given ID. ~~Lexeme~~ | ## Vocab.\_\_iter\_\_ {#iter tag="method"} @@ -72,9 +73,9 @@ Iterate over the lexemes in the vocabulary. > stop_words = (lex for lex in nlp.vocab if lex.is_stop) > ``` -| Name | Type | Description | -| ---------- | -------- | --------------------------- | -| **YIELDS** | `Lexeme` | An entry in the vocabulary. | +| Name | Description | +| ---------- | -------------------------------------- | +| **YIELDS** | An entry in the vocabulary. ~~Lexeme~~ | ## Vocab.\_\_contains\_\_ {#contains tag="method"} @@ -91,10 +92,10 @@ given string, you need to look it up in > assert oov not in nlp.vocab > ``` -| Name | Type | Description | -| ----------- | ---- | -------------------------------------------------- | -| `string` | str | The ID string. | -| **RETURNS** | bool | Whether the string has an entry in the vocabulary. | +| Name | Description | +| ----------- | ----------------------------------------------------------- | +| `string` | The ID string. ~~str~~ | +| **RETURNS** | Whether the string has an entry in the vocabulary. ~~bool~~ | ## Vocab.add_flag {#add_flag tag="method"} @@ -115,11 +116,11 @@ using `token.check_flag(flag_id)`. 
> assert doc[2].check_flag(MY_PRODUCT) == True > ``` -| Name | Type | Description | -| ------------- | ---- | ----------------------------------------------------------------------------------------------------------------------------------------------- | -| `flag_getter` | dict | A function `f(str) -> bool`, to get the flag value. | -| `flag_id` | int | An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If `-1`, the lowest available bit will be chosen. | -| **RETURNS** | int | The integer ID by which the flag value can be checked. | +| Name | Description | +| ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `flag_getter` | A function that takes the lexeme text and returns the boolean flag value. ~~Callable[[str], bool]~~ | +| `flag_id` | An integer between `1` and `63` (inclusive), specifying the bit at which the flag will be stored. If `-1`, the lowest available bit will be chosen. ~~int~~ | +| **RETURNS** | The integer ID by which the flag value can be checked. ~~int~~ | ## Vocab.reset_vectors {#reset_vectors tag="method" new="2"} @@ -133,11 +134,11 @@ have to call this to change the size of the vectors. Only one of the `width` and > nlp.vocab.reset_vectors(width=300) > ``` -| Name | Type | Description | -| -------------- | ---- | -------------------------------------- | -| _keyword-only_ | | | -| `width` | int | The new width (keyword argument only). | -| `shape` | int | The new shape (keyword argument only). | +| Name | Description | +| -------------- | ---------------------- | +| _keyword-only_ | | +| `width` | The new width. ~~int~~ | +| `shape` | The new shape. ~~int~~ | ## Vocab.prune_vectors {#prune_vectors tag="method" new="2"} @@ -158,11 +159,11 @@ cosines are calculated in minibatches, to reduce memory usage. 
> assert len(nlp.vocab.vectors) <= 1000 > ``` -| Name | Type | Description | -| ------------ | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `nr_row` | int | The number of rows to keep in the vector table. | -| `batch_size` | int | Batch of vectors for calculating the similarities. Larger batch sizes might be faster, while temporarily requiring more memory. | -| **RETURNS** | dict | A dictionary keyed by removed words mapped to `(string, score)` tuples, where `string` is the entry the removed word was mapped to, and `score` the similarity score between the two words. | +| Name | Description | +| ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `nr_row` | The number of rows to keep in the vector table. ~~int~~ | +| `batch_size` | Batch of vectors for calculating the similarities. Larger batch sizes might be faster, while temporarily requiring more memory. ~~int~~ | +| **RETURNS** | A dictionary keyed by removed words mapped to `(string, score)` tuples, where `string` is the entry the removed word was mapped to, and `score` the similarity score between the two words. ~~Dict[str, Tuple[str, float]]~~ | ## Vocab.get_vector {#get_vector tag="method" new="2"} @@ -178,12 +179,12 @@ subword features by average over ngrams of `orth` (introduced in spaCy `v2.1`). > nlp.vocab.get_vector("apple", minn=1, maxn=5) > ``` -| Name | Type | Description | -| ----------------------------------- | ---------------------------------------- | ---------------------------------------------------------------------------------------------- | -| `orth` | int / str | The hash value of a word, or its unicode string. 
| -| `minn` 2.1 | int | Minimum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. | -| `maxn` 2.1 | int | Maximum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. | -| **RETURNS** | `numpy.ndarray[ndim=1, dtype='float32']` | A word vector. Size and shape are determined by the `Vocab.vectors` instance. | +| Name | Description | +| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------- | +| `orth` | The hash value of a word, or its unicode string. ~~Union[int, str]~~ | +| `minn` 2.1 | Minimum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. ~~int~~ | +| `maxn` 2.1 | Maximum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. ~~int~~ | +| **RETURNS** | A word vector. Size and shape are determined by the `Vocab.vectors` instance. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | ## Vocab.set_vector {#set_vector tag="method" new="2"} @@ -196,10 +197,10 @@ or hash value. > nlp.vocab.set_vector("apple", array([...])) > ``` -| Name | Type | Description | -| -------- | ---------------------------------------- | ------------------------------------------------ | -| `orth` | int / str | The hash value of a word, or its unicode string. | -| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | The vector to set. | +| Name | Description | +| -------- | -------------------------------------------------------------------- | +| `orth` | The hash value of a word, or its unicode string. ~~Union[int, str]~~ | +| `vector` | The vector to set. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | ## Vocab.has_vector {#has_vector tag="method" new="2"} @@ -213,10 +214,10 @@ Words can be looked up by string or hash value. 
> vector = nlp.vocab.get_vector("apple") > ``` -| Name | Type | Description | -| ----------- | --------- | ------------------------------------------------ | -| `orth` | int / str | The hash value of a word, or its unicode string. | -| **RETURNS** | bool | Whether the word has a vector. | +| Name | Description | +| ----------- | -------------------------------------------------------------------- | +| `orth` | The hash value of a word, or its unicode string. ~~Union[int, str]~~ | +| **RETURNS** | Whether the word has a vector. ~~bool~~ | ## Vocab.to_disk {#to_disk tag="method" new="2"} @@ -228,11 +229,11 @@ Save the current state to a directory. > nlp.vocab.to_disk("/path/to/vocab") > ``` -| Name | Type | Description | -| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | ## Vocab.from_disk {#from_disk tag="method" new="2"} @@ -245,12 +246,12 @@ Loads state from a directory. Modifies the object in place and returns it. 
> vocab = Vocab().from_disk("/path/to/vocab") > ``` -| Name | Type | Description | -| -------------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Vocab` | The modified `Vocab` object. | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------- | +| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The modified `Vocab` object. ~~Vocab~~ | ## Vocab.to_bytes {#to_bytes tag="method"} @@ -262,11 +263,11 @@ Serialize the current state to a binary string. > vocab_bytes = nlp.vocab.to_bytes() > ``` -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------- | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Vocab` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The serialized form of the `Vocab` object. ~~bytes~~ | ## Vocab.from_bytes {#from_bytes tag="method"} @@ -281,12 +282,12 @@ Load state from a binary string.
> vocab.from_bytes(vocab_bytes) > ``` -| Name | Type | Description | -| -------------- | --------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| _keyword-only_ | | | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Vocab` | The `Vocab` object. | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------- | +| `bytes_data` | The data to load from. ~~bytes~~ | +| _keyword-only_ | | +| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ | +| **RETURNS** | The `Vocab` object. ~~Vocab~~ | ## Attributes {#attributes} @@ -299,13 +300,13 @@ Load state from a binary string. > assert type(PERSON) == int > ``` -| Name | Type | Description | -| --------------------------------------------- | ------------- | ------------------------------------------------------------ | -| `strings` | `StringStore` | A table managing the string-to-int mapping. | -| `vectors` 2 | `Vectors` | A table associating word IDs to word vectors. | -| `vectors_length` | int | Number of dimensions for each word vector. | -| `lookups` | `Lookups` | The available lookup tables in this vocab. | -| `writing_system` 2.1 | dict | A dict with information about the language's writing system. | +| Name | Description | +| --------------------------------------------- | ------------------------------------------------------------------------------- | +| `strings` | A table managing the string-to-int mapping. ~~StringStore~~ | +| `vectors` 2 | A table associating word IDs to word vectors. ~~Vectors~~ | +| `vectors_length` | Number of dimensions for each word vector. ~~int~~ | +| `lookups` | The available lookup tables in this vocab. 
~~Lookups~~ | +| `writing_system` 2.1 | A dict with information about the language's writing system. ~~Dict[str, Any]~~ | ## Serialization fields {#serialization-fields} diff --git a/website/docs/images/layers-architectures.svg b/website/docs/images/layers-architectures.svg new file mode 100644 index 000000000..22e705ba1 --- /dev/null +++ b/website/docs/images/layers-architectures.svg @@ -0,0 +1,97 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/website/docs/images/projects.svg b/website/docs/images/projects.svg new file mode 100644 index 000000000..c7518d445 --- /dev/null +++ b/website/docs/images/projects.svg @@ -0,0 +1,92 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/website/docs/images/sense2vec.jpg b/website/docs/images/sense2vec.jpg new file mode 100644 index 000000000..3a1772582 Binary files /dev/null and b/website/docs/images/sense2vec.jpg differ diff --git a/website/docs/images/thinc_mypy.jpg b/website/docs/images/thinc_mypy.jpg new file mode 100644 index 000000000..c0f7ee636 Binary files /dev/null and b/website/docs/images/thinc_mypy.jpg differ diff --git a/website/docs/images/tok2vec-listener.svg b/website/docs/images/tok2vec-listener.svg new file mode 100644 index 000000000..bb67d2186 --- /dev/null +++ b/website/docs/images/tok2vec-listener.svg @@ -0,0 +1,41 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/website/docs/images/tok2vec.svg b/website/docs/images/tok2vec.svg new file mode 100644 index 000000000..5338b6280 --- /dev/null +++ b/website/docs/images/tok2vec.svg @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/website/docs/usage/101/_architecture.md 
b/website/docs/usage/101/_architecture.md index 2a389cd87..98011f173 100644 --- a/website/docs/usage/101/_architecture.md +++ b/website/docs/usage/101/_architecture.md @@ -73,14 +73,14 @@ operates on a `Doc` and gives you access to the matched tokens **in context**. ### Other classes {#architecture-other} -| Name | Description | -| ------------------------------------- | ---------------------------------------------------------------------------------------------------------------- | -| [`Vocab`](/api/vocab) | The shared vocabulary that stores strings and gives you access to [`Lexeme`](/api/lexeme) objects. | -| [`StringStore`](/api/stringstore) | Map strings to and from hash values. | -| [`Vectors`](/api/vectors) | Container class for vector data keyed by string. | -| [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. | -| [`Morphology`](/api/morphology) | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. | -| [`MorphAnalysis`](/api/morphanalysis) | A morphological analysis. | -| [`KnowledgeBase`](/api/kb) | Storage for entities and aliases of a knowledge base for entity linking. | -| [`Scorer`](/api/scorer) | Compute evaluation scores. | -| [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. | +| Name | Description | +| ------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------- | +| [`Vocab`](/api/vocab) | The shared vocabulary that stores strings and gives you access to [`Lexeme`](/api/lexeme) objects. | +| [`StringStore`](/api/stringstore) | Map strings to and from hash values. | +| [`Vectors`](/api/vectors) | Container class for vector data keyed by string. | +| [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. 
| +| [`Morphology`](/api/morphology) | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. | +| [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis. | +| [`KnowledgeBase`](/api/kb) | Storage for entities and aliases of a knowledge base for entity linking. | +| [`Scorer`](/api/scorer) | Compute evaluation scores. | +| [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. | diff --git a/website/docs/usage/101/_vectors-similarity.md b/website/docs/usage/101/_vectors-similarity.md index 9ff55f815..92df1b331 100644 --- a/website/docs/usage/101/_vectors-similarity.md +++ b/website/docs/usage/101/_vectors-similarity.md @@ -77,26 +77,76 @@ or flagging duplicates. For example, you can suggest a user content that's similar to what they're currently looking at, or label a support ticket as a duplicate if it's very similar to an already existing one. -Each `Doc`, `Span` and `Token` comes with a -[`.similarity()`](/api/token#similarity) method that lets you compare it with -another object, and determine the similarity. Of course similarity is always -subjective – whether "dog" and "cat" are similar really depends on how you're -looking at it. spaCy's similarity model usually assumes a pretty general-purpose -definition of similarity. +Each [`Doc`](/api/doc), [`Span`](/api/span), [`Token`](/api/token) and +[`Lexeme`](/api/lexeme) comes with a [`.similarity`](/api/token#similarity) +method that lets you compare it with another object, and determine the +similarity. Of course similarity is always subjective – whether two words, spans +or documents are similar really depends on how you're looking at it. spaCy's +similarity model usually assumes a pretty general-purpose definition of +similarity. + +> #### πŸ“ Things to try +> +> 1. 
Compare two different tokens and try to find the two most _dissimilar_ +> tokens in the texts with the lowest similarity score (according to the +> vectors). +> 2. Compare the similarity of two [`Lexeme`](/api/lexeme) objects, entries in +> the vocabulary. You can get a lexeme via the `.lex` attribute of a token. +> You should see that the similarity results are identical to the token +> similarity. ```python ### {executable="true"} import spacy nlp = spacy.load("en_core_web_md") # make sure to use larger model! -tokens = nlp("dog cat banana") +doc1 = nlp("I like salty fries and hamburgers.") +doc2 = nlp("Fast food tastes very good.") -for token1 in tokens: - for token2 in tokens: - print(token1.text, token2.text, token1.similarity(token2)) +# Similarity of two documents +print(doc1, "<->", doc2, doc1.similarity(doc2)) +# Similarity of tokens and spans +french_fries = doc1[2:4] +burgers = doc1[5] +print(french_fries, "<->", burgers, french_fries.similarity(burgers)) ``` -In this case, the model's predictions are pretty on point. A dog is very similar -to a cat, whereas a banana is not very similar to either of them. Identical -tokens are obviously 100% similar to each other (just not always exactly `1.0`, -because of vector math and floating point imprecisions). +### What to expect from similarity results {#similarity-expectations} + +Computing similarity scores can be helpful in many situations, but it's also +important to maintain **realistic expectations** about what information it can +provide. Words can be related to each over in many ways, so a single +"similarity" score will always be a **mix of different signals**, and vectors +trained on different data can produce very different results that may not be +useful for your purpose. Here are some important considerations to keep in mind: + +- There's no objective definition of similarity. Whether "I like burgers" and "I + like pasta" is similar **depends on your application**. 
Both talk about food + preferences, which makes them very similar – but if you're analyzing mentions + of food, those sentences are pretty dissimilar, because they talk about very + different foods. +- The similarity of [`Doc`](/api/doc) and [`Span`](/api/span) objects defaults + to the **average** of the token vectors. This means that the vector for "fast + food" is the average of the vectors for "fast" and "food", which isn't + necessarily representative of the phrase "fast food". +- Vector averaging means that the vector of multiple tokens is **insensitive to + the order** of the words. Two documents expressing the same meaning with + dissimilar wording will return a lower similarity score than two documents + that happen to contain the same words while expressing different meanings. + + + +[![](../../images/sense2vec.jpg)](https://github.com/explosion/sense2vec) + +[`sense2vec`](https://github.com/explosion/sense2vec) is a library developed by +us that builds on top of spaCy and lets you train and query more interesting and +detailed word vectors. It combines noun phrases like "fast food" or "fair game" +and includes the part-of-speech tags and entity labels. The library also +includes annotation recipes for our annotation tool [Prodigy](https://prodi.gy) +that let you evaluate vector models and create terminology lists. For more +details, check out +[our blog post](https://explosion.ai/blog/sense2vec-reloaded). To explore the +semantic similarities across all Reddit comments of 2015 and 2019, see the +[interactive demo](https://explosion.ai/demos/sense2vec). 
+ + diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md new file mode 100644 index 000000000..7648a5d45 --- /dev/null +++ b/website/docs/usage/embeddings-transformers.md @@ -0,0 +1,505 @@ +--- +title: Embeddings, Transformers and Transfer Learning +teaser: Using transformer embeddings like BERT in spaCy +menu: + - ['Embedding Layers', 'embedding-layers'] + - ['Transformers', 'transformers'] + - ['Static Vectors', 'static-vectors'] + - ['Pretraining', 'pretraining'] +next: /usage/training +--- + +spaCy supports a number of **transfer and multi-task learning** workflows that +can often help improve your pipeline's efficiency or accuracy. Transfer learning +refers to techniques such as word vector tables and language model pretraining. +These techniques can be used to import knowledge from raw text into your +pipeline, so that your models are able to generalize better from your annotated +examples. + +You can convert **word vectors** from popular tools like +[FastText](https://fasttext.cc) and [Gensim](https://radimrehurek.com/gensim), +or you can load in any pretrained **transformer model** if you install +[`spacy-transformers`](https://github.com/explosion/spacy-transformers). You can +also do your own language model pretraining via the +[`spacy pretrain`](/api/cli#pretrain) command. You can even **share** your +transformer or other contextual embedding model across multiple components, +which can make long pipelines several times more efficient. To use transfer +learning, you'll need at least a few annotated examples for what you're trying +to predict. Otherwise, you could try using a "one-shot learning" approach using +[vectors and similarity](/usage/linguistic-features#vectors-similarity). + + + +The key difference between [word vectors](#word-vectors) and contextual language +models such as [transformers](#transformers) is that word vectors model +**lexical types**, rather than _tokens_. 
If you have a list of terms with no +context around them, a transformer model like BERT can't really help you. BERT +is designed to understand language **in context**, which isn't what you have. A +word vectors table will be a much better fit for your task. However, if you do +have words in context β€” whole sentences or paragraphs of running text β€” word +vectors will only provide a very rough approximation of what the text is about. + +Word vectors are also very computationally efficient, as they map a word to a +vector with a single indexing operation. Word vectors are therefore useful as a +way to **improve the accuracy** of neural network models, especially models that +are small or have received little or no pretraining. In spaCy, word vector +tables are only used as **static features**. spaCy does not backpropagate +gradients to the pretrained word vectors table. The static vectors table is +usually used in combination with a smaller table of learned task-specific +embeddings. + + + + + +Word vectors are not compatible with most [transformer models](#transformers), +but if you're training another type of NLP network, it's almost always worth +adding word vectors to your model. As well as improving your final accuracy, +word vectors often make experiments more consistent, as the accuracy you reach +will be less sensitive to how the network is randomly initialized. High variance +due to random chance can slow down your progress significantly, as you need to +run many experiments to filter the signal from the noise. + +Word vector features need to be enabled prior to training, and the same word +vectors table will need to be available at runtime as well. You cannot add word +vector features once the model has already been trained, and you usually cannot +replace one word vectors table with another without causing a significant loss +of performance. 
+ + + +## Shared embedding layers {#embedding-layers} + +spaCy lets you share a single transformer or other token-to-vector ("tok2vec") +embedding layer between multiple components. You can even update the shared +layer, performing **multi-task learning**. Reusing the tok2vec layer between +components can make your pipeline run a lot faster and result in much smaller +models. However, it can make the pipeline less modular and make it more +difficult to swap components or retrain parts of the pipeline. Multi-task +learning can affect your accuracy (either positively or negatively), and may +require some retuning of your hyper-parameters. + +![Pipeline components using a shared embedding component vs. independent embedding layers](../images/tok2vec.svg) + +| Shared | Independent | +| ------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------- | +| βœ… **smaller:** models only need to include a single copy of the embeddings | ❌ **larger:** models need to include the embeddings for each component | +| βœ… **faster:** embed the documents once for your whole pipeline | ❌ **slower:** rerun the embedding for each component | +| ❌ **less composable:** all components require the same embedding component in the pipeline | βœ… **modular:** components can be moved and swapped freely | + +You can share a single transformer or other tok2vec model between multiple +components by adding a [`Transformer`](/api/transformer) or +[`Tok2Vec`](/api/tok2vec) component near the start of your pipeline. Components +later in the pipeline can "connect" to it by including a **listener layer** like +[Tok2VecListener](/api/architectures#Tok2VecListener) within their model. 
+ +![Pipeline components listening to shared embedding component](../images/tok2vec-listener.svg) + +At the beginning of training, the [`Tok2Vec`](/api/tok2vec) component will grab +a reference to the relevant listener layers in the rest of your pipeline. When +it processes a batch of documents, it will pass forward its predictions to the +listeners, allowing the listeners to **reuse the predictions** when they are +eventually called. A similar mechanism is used to pass gradients from the +listeners back to the model. The [`Transformer`](/api/transformer) component and +[TransformerListener](/api/architectures#TransformerListener) layer do the same +thing for transformer models, but the `Transformer` component will also save the +transformer outputs to the +[`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute, +giving you access to them after the pipeline has finished running. + + + + + +## Using transformer models {#transformers} + +Transformers are a family of neural network architectures that compute **dense, +context-sensitive representations** for the tokens in your documents. Downstream +models in your pipeline can then use these representations as input features to +**improve their predictions**. You can connect multiple components to a single +transformer model, with any or all of those components giving feedback to the +transformer to fine-tune it to your tasks. spaCy's transformer support +interoperates with [PyTorch](https://pytorch.org) and the +[HuggingFace `transformers`](https://huggingface.co/transformers/) library, +giving you access to thousands of pretrained models for your pipelines. There +are many [great guides](http://jalammar.github.io/illustrated-transformer/) to +transformer models, but for practical purposes, you can simply think of them as +a drop-in replacement that let you achieve **higher accuracy** in exchange for +**higher training and runtime costs**. 
+ +### Setup and installation {#transformers-installation} + +> #### System requirements +> +> We recommend an NVIDIA **GPU** with at least **10GB of memory** in order to +> work with transformer models. Make sure your GPU drivers are up to date and +> you have **CUDA v9+** installed. + +> The exact requirements will depend on the transformer model. Training a +> transformer-based model without a GPU will be too slow for most practical +> purposes. +> +> Provisioning a new machine will require about **5GB** of data to be +> downloaded: 3GB CUDA runtime, 800MB PyTorch, 400MB CuPy, 500MB weights, 200MB +> spaCy and dependencies. + +Once you have CUDA installed, you'll need to install two pip packages, +[`cupy`](https://docs.cupy.dev/en/stable/install.html) and +[`spacy-transformers`](https://github.com/explosion/spacy-transformers). `cupy` +is just like `numpy`, but for GPU. The best way to install it is to choose a +wheel that matches the version of CUDA you're using. You may also need to set +the `CUDA_PATH` environment variable if your CUDA runtime is installed in a +non-standard location. Putting it all together, if you had installed CUDA 10.2 +in `/opt/nvidia/cuda`, you would run: + +```bash +### Installation with CUDA +$ export CUDA_PATH="/opt/nvidia/cuda" +$ pip install cupy-cuda102 +$ pip install spacy-transformers +``` + +### Runtime usage {#transformers-runtime} + +Transformer models can be used as **drop-in replacements** for other types of +neural networks, so your spaCy pipeline can include them in a way that's +completely invisible to the user. Users will download, load and use the model in +the standard way, like any other spaCy pipeline. Instead of using the +transformers as subnetworks directly, you can also use them via the +[`Transformer`](/api/transformer) pipeline component. 
+ +![The processing pipeline with the transformer component](../images/pipeline_transformer.svg) + +The `Transformer` component sets the +[`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute, +which lets you access the transformers outputs at runtime. + +```cli +$ python -m spacy download en_core_trf_lg +``` + +```python +### Example +import spacy +from thinc.api import use_pytorch_for_gpu_memory, require_gpu + +# Use the GPU, with memory allocations directed via PyTorch. +# This prevents out-of-memory errors that would otherwise occur from competing +# memory pools. +use_pytorch_for_gpu_memory() +require_gpu(0) + +nlp = spacy.load("en_core_trf_lg") +for doc in nlp.pipe(["some text", "some other text"]): + tokvecs = doc._.trf_data.tensors[-1] +``` + +You can also customize how the [`Transformer`](/api/transformer) component sets +annotations onto the [`Doc`](/api/doc), by customizing the `annotation_setter`. +This callback will be called with the raw input and output data for the whole +batch, along with the batch of `Doc` objects, allowing you to implement whatever +you need. The annotation setter is called with a batch of [`Doc`](/api/doc) +objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) +containing the transformers data for the batch. + +```python +def custom_annotation_setter(docs, trf_data): + # TODO: + ... + +nlp = spacy.load("en_core_trf_lg") +nlp.get_pipe("transformer").annotation_setter = custom_annotation_setter +doc = nlp("This is a text") +print() # TODO: +``` + +### Training usage {#transformers-training} + +The recommended workflow for training is to use spaCy's +[config system](/usage/training#config), usually via the +[`spacy train`](/api/cli#train) command. The training config defines all +component settings and hyperparameters in one place and lets you describe a tree +of objects by referring to creation functions, including functions you register +yourself. 
For details on how to get started with training your own model, check +out the [training quickstart](/usage/training#quickstart). + + + +The `[components]` section in the [`config.cfg`](/api/data-formats#config) +describes the pipeline components and the settings used to construct them, +including their model implementation. Here's a config snippet for the +[`Transformer`](/api/transformer) component, along with matching Python code. In +this case, the `[components.transformer]` block describes the `transformer` +component: + +> #### Python equivalent +> +> ```python +> from spacy_transformers import Transformer, TransformerModel +> from spacy_transformers.annotation_setters import null_annotation_setter +> from spacy_transformers.span_getters import get_doc_spans +> +> trf = Transformer( +> nlp.vocab, +> TransformerModel( +> "bert-base-cased", +> get_spans=get_doc_spans, +> tokenizer_config={"use_fast": True}, +> ), +> annotation_setter=null_annotation_setter, +> max_batch_items=4096, +> ) +> ``` + +```ini +### config.cfg (excerpt) +[components.transformer] +factory = "transformer" +max_batch_items = 4096 + +[components.transformer.model] +@architectures = "spacy-transformers.TransformerModel.v1" +name = "bert-base-cased" +tokenizer_config = {"use_fast": true} + +[components.transformer.model.get_spans] +@span_getters = "doc_spans.v1" + +[components.transformer.annotation_setter] +@annotation_setters = "spacy-transformers.null_annotation_setter.v1" + +``` + +The `[components.transformer.model]` block describes the `model` argument passed +to the transformer component. It's a Thinc +[`Model`](https://thinc.ai/docs/api-model) object that will be passed into the +component. Here, it references the function +[spacy-transformers.TransformerModel.v1](/api/architectures#TransformerModel) +registered in the [`architectures` registry](/api/top-level#registry). 
If a key +in a block starts with `@`, it's **resolved to a function** and all other +settings are passed to the function as arguments. In this case, `name`, +`tokenizer_config` and `get_spans`. + +`get_spans` is a function that takes a batch of `Doc` object and returns lists +of potentially overlapping `Span` objects to process by the transformer. Several +[built-in functions](/api/transformer#span-getters) are available – for example, +to process the whole document or individual sentences. When the config is +resolved, the function is created and passed into the model as an argument. + + + +Remember that the `config.cfg` used for training should contain **no missing +values** and requires all settings to be defined. You don't want any hidden +defaults creeping in and changing your results! spaCy will tell you if settings +are missing, and you can run +[`spacy init fill-config`](/api/cli#init-fill-config) to automatically fill in +all defaults. + + + +### Customizing the settings {#transformers-training-custom-settings} + +To change any of the settings, you can edit the `config.cfg` and re-run the +training. To change any of the functions, like the span getter, you can replace +the name of the referenced function – e.g. `@span_getters = "sent_spans.v1"` to +process sentences. You can also register your own functions using the +`span_getters` registry: + +> #### config.cfg +> +> ```ini +> [components.transformer.model.get_spans] +> @span_getters = "custom_sent_spans" +> ``` + +```python +### code.py +import spacy_transformers + +@spacy_transformers.registry.span_getters("custom_sent_spans") +def configure_custom_sent_spans(): + # TODO: write custom example + def get_sent_spans(docs): + return [list(doc.sents) for doc in docs] + + return get_sent_spans +``` + +To resolve the config during training, spaCy needs to know about your custom +function. You can make it available via the `--code` argument that can point to +a Python file. 
For more details on training with custom code, see the +[training documentation](/usage/training#custom-code). + +```cli +python -m spacy train ./config.cfg --code ./code.py +``` + +### Customizing the model implementations {#training-custom-model} + +The [`Transformer`](/api/transformer) component expects a Thinc +[`Model`](https://thinc.ai/docs/api-model) object to be passed in as its `model` +argument. You're not limited to the implementation provided by +`spacy-transformers` – the only requirement is that your registered function +must return an object of type ~~Model[List[Doc], FullTransformerBatch]~~: that +is, a Thinc model that takes a list of [`Doc`](/api/doc) objects, and returns a +[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) object with the +transformer data. + +The same idea applies to task models that power the **downstream components**. +Most of spaCy's built-in model creation functions support a `tok2vec` argument, +which should be a Thinc layer of type ~~Model[List[Doc], List[Floats2d]]~~. This +is where we'll plug in our transformer model, using the +[Tok2VecListener](/api/architectures#Tok2VecListener) layer, which sneakily +delegates to the `Transformer` pipeline component. + +```ini +### config.cfg (excerpt) {highlight="12"} +[components.ner] +factory = "ner" + +[nlp.pipeline.ner.model] +@architectures = "spacy.TransitionBasedParser.v1" +nr_feature_tokens = 3 +hidden_width = 128 +maxout_pieces = 3 +use_upper = false + +[nlp.pipeline.ner.model.tok2vec] +@architectures = "spacy-transformers.Tok2VecListener.v1" +grad_factor = 1.0 + +[nlp.pipeline.ner.model.tok2vec.pooling] +@layers = "reduce_mean.v1" +``` + +The [Tok2VecListener](/api/architectures#Tok2VecListener) layer expects a +[pooling layer](https://thinc.ai/docs/api-layers#reduction-ops) as the argument +`pooling`, which needs to be of type ~~Model[Ragged, Floats2d]~~. 
This layer +determines how the vector for each spaCy token will be computed from the zero or +more source rows the token is aligned against. Here we use the +[`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean) layer, which +averages the wordpiece rows. We could instead use +[`reduce_max`](https://thinc.ai/docs/api-layers#reduce_max), or a custom +function you write yourself. + +You can have multiple components all listening to the same transformer model, +and all passing gradients back to it. By default, all of the gradients will be +**equally weighted**. You can control this with the `grad_factor` setting, which +lets you reweight the gradients from the different listeners. For instance, +setting `grad_factor = 0` would disable gradients from one of the listeners, +while `grad_factor = 2.0` would multiply them by 2. This is similar to having a +custom learning rate for each component. Instead of a constant, you can also +provide a schedule, allowing you to freeze the shared parameters at the start of +training. + +## Static vectors {#static-vectors} + + + +### Using word vectors in your models {#word-vectors-models} + +Many neural network models are able to use word vector tables as additional +features, which sometimes results in significant improvements in accuracy. +spaCy's built-in embedding layer, +[MultiHashEmbed](/api/architectures#MultiHashEmbed), can be configured to use +word vector tables using the `also_use_static_vectors` flag. This setting is +also available on the [MultiHashEmbedCNN](/api/architectures#MultiHashEmbedCNN) +layer, which builds the default token-to-vector encoding architecture. 
+ +```ini +[tagger.model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = 128 +rows = 7000 +also_embed_subwords = true +also_use_static_vectors = true +``` + + + +The configuration system will look up the string `"spacy.MultiHashEmbed.v1"` in +the `architectures` [registry](/api/top-level#registry), and call the returned +object with the rest of the arguments from the block. This will result in a call +to the +[`MultiHashEmbed`](https://github.com/explosion/spacy/tree/develop/spacy/ml/models/tok2vec.py) +function, which will return a [Thinc](https://thinc.ai) model object with the +type signature ~~Model[List[Doc], List[Floats2d]]~~. Because the embedding layer +takes a list of `Doc` objects as input, it does not need to store a copy of the +vectors table. The vectors will be retrieved from the `Doc` objects that are +passed in, via the `doc.vocab.vectors` attribute. This part of the process is +handled by the [StaticVectors](/api/architectures#StaticVectors) layer. + + + +#### Creating a custom embedding layer {#custom-embedding-layer} + +The [MultiHashEmbed](/api/architectures#StaticVectors) layer is spaCy's +recommended strategy for constructing initial word representations for your +neural network models, but you can also implement your own. You can register any +function to a string name, and then reference that function within your config +(see the [training docs](/usage/training) for more details). 
To try this out, +you can save the following little example to a new Python file: + +```python +from spacy.ml.staticvectors import StaticVectors +from spacy.util import registry + +print("I was imported!") + +@registry.architectures("my_example.MyEmbedding.v1") +def MyEmbedding(output_width: int) -> Model[List[Doc], List[Floats2d]]: + print("I was called!") + return StaticVectors(nO=output_width) +``` + +If you pass the path to your file to the [`spacy train`](/api/cli#train) command +using the `--code` argument, your file will be imported, which means the +decorator registering the function will be run. Your function is now on equal +footing with any of spaCy's built-ins, so you can drop it in instead of any +other model with the same input and output signature. For instance, you could +use it in the tagger model as follows: + +```ini +[tagger.model.tok2vec.embed] +@architectures = "my_example.MyEmbedding.v1" +output_width = 128 +``` + +Now that you have a custom function wired into the network, you can start +implementing the logic you're interested in. For example, let's say you want to +try a relatively simple embedding strategy that makes use of static word +vectors, but combines them via summation with a smaller table of learned +embeddings. 
+ +```python +from thinc.api import add, chain, remap_ids, Embed +from spacy.ml.staticvectors import StaticVectors + +@registry.architectures("my_example.MyEmbedding.v1") +def MyCustomVectors( + output_width: int, + vector_width: int, + embed_rows: int, + key2row: Dict[int, int] +) -> Model[List[Doc], List[Floats2d]]: + return add( + StaticVectors(nO=output_width), + chain( + FeatureExtractor(["ORTH"]), + remap_ids(key2row), + Embed(nO=output_width, nV=embed_rows) + ) + ) +``` + +## Pretraining {#pretraining} + + diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md index bda9f76d6..ede4ab6f9 100644 --- a/website/docs/usage/index.md +++ b/website/docs/usage/index.md @@ -40,7 +40,7 @@ $ pip install -U spacy > After installation you need to download a language model. For more info and > available models, see the [docs on models](/models). > -> ```bash +> ```cli > $ python -m spacy download en_core_web_sm > > >>> import spacy @@ -62,9 +62,9 @@ When using pip it is generally recommended to install packages in a virtual environment to avoid modifying system state: ```bash -python -m venv .env -source .env/bin/activate -pip install spacy +$ python -m venv .env +$ source .env/bin/activate +$ pip install spacy ``` ### conda {#conda} @@ -106,9 +106,9 @@ links created in different virtual environments. It's recommended to run the command with `python -m` to make sure you're executing the correct version of spaCy. -```bash -pip install -U spacy -python -m spacy validate +```cli +$ pip install -U spacy +$ python -m spacy validate ``` ### Run spaCy with GPU {#gpu new="2.0.14"} @@ -156,20 +156,20 @@ system. See notes on [Ubuntu](#source-ubuntu), [macOS / OS X](#source-osx) and [Windows](#source-windows) for details. 
```bash -python -m pip install -U pip # update pip -git clone https://github.com/explosion/spaCy # clone spaCy -cd spaCy # navigate into directory +$ python -m pip install -U pip # update pip +$ git clone https://github.com/explosion/spaCy # clone spaCy +$ cd spaCy # navigate into dir -python -m venv .env # create environment in .env -source .env/bin/activate # activate virtual environment -\export PYTHONPATH=`pwd` # set Python path to spaCy directory -pip install -r requirements.txt # install all requirements -python setup.py build_ext --inplace # compile spaCy +$ python -m venv .env # create environment in .env +$ source .env/bin/activate # activate virtual env +$ export PYTHONPATH=`pwd` # set Python path to spaCy dir +$ pip install -r requirements.txt # install all requirements +$ python setup.py build_ext --inplace # compile spaCy ``` Compared to regular install via pip, the [`requirements.txt`](https://github.com/explosion/spaCy/tree/master/requirements.txt) -additionally installs developer dependencies such as Cython. See the the +additionally installs developer dependencies such as Cython. See the [quickstart widget](#quickstart) to get the right commands for your platform and Python version. @@ -209,20 +209,18 @@ that directory. Don't forget to also install the test utilities via spaCy's [`requirements.txt`](https://github.com/explosion/spaCy/tree/master/requirements.txt): ```bash -python -c "import os; import spacy; print(os.path.dirname(spacy.__file__))" -pip install -r path/to/requirements.txt -python -m pytest [spacy directory] +$ python -c "import os; import spacy; print(os.path.dirname(spacy.__file__))" +$ pip install -r path/to/requirements.txt +$ python -m pytest [spacy directory] ``` Calling `pytest` on the spaCy directory will run only the basic tests. The flag `--slow` is optional and enables additional tests that take longer. 
```bash -# make sure you are using recent pytest version -python -m pip install -U pytest - -python -m pytest [spacy directory] # basic tests -python -m pytest [spacy directory] --slow # basic and slow tests +$ python -m pip install -U pytest # update pytest +$ python -m pytest [spacy directory] # basic tests +$ python -m pytest [spacy directory] --slow # basic and slow tests ``` ## Troubleshooting guide {#troubleshooting} @@ -283,7 +281,7 @@ only 65535 in a narrow unicode build. You can check this by running the following command: ```bash -python -c "import sys; print(sys.maxunicode)" +$ python -c "import sys; print(sys.maxunicode)" ``` If you're running a narrow unicode build, reinstall Python and use a wide @@ -305,8 +303,8 @@ run `source ~/.bash_profile` or `source ~/.zshrc`. Make sure to add **both lines** for `LC_ALL` and `LANG`. ```bash -\export LC_ALL=en_US.UTF-8 -\export LANG=en_US.UTF-8 +$ export LC_ALL=en_US.UTF-8 +$ export LANG=en_US.UTF-8 ``` @@ -370,7 +368,7 @@ from is called `spacy`. So, when using spaCy, never call anything else `spacy`. - + If your training data only contained new entities and you didn't mix in any examples the model previously recognized, it can cause the model to "forget" diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md new file mode 100644 index 000000000..aa398f752 --- /dev/null +++ b/website/docs/usage/layers-architectures.md @@ -0,0 +1,185 @@ +--- +title: Layers and Model Architectures +teaser: Power spaCy components with custom neural networks +menu: + - ['Type Signatures', 'type-sigs'] + - ['Defining Sublayers', 'sublayers'] + - ['PyTorch & TensorFlow', 'frameworks'] + - ['Trainable Components', 'components'] +next: /usage/projects +--- + +​A **model architecture** is a function that wires up a +[Thinc `Model`](https://thinc.ai/docs/api-model) instance, which you can then +use in a component or as a layer of a larger network. 
You can use Thinc as a +thin wrapper around frameworks such as PyTorch, TensorFlow or MXNet, or you can +implement your logic in Thinc directly. ​ spaCy's built-in components will never +construct their `Model` instances themselves, so you won't have to subclass the +component to change its model architecture. You can just **update the config** +so that it refers to a different registered function. Once the component has +been created, its model instance has already been assigned, so you cannot change +its model architecture. The architecture is like a recipe for the network, and +you can't change the recipe once the dish has already been prepared. You have to +make a new one. + +![Diagram of a pipeline component with its model](../images/layers-architectures.svg) + +## Type signatures {#type-sigs} + + + +> #### Example +> +> ```python +> @spacy.registry.architectures.register("spacy.Tagger.v1") +> def build_tagger_model( +> tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None +> ) -> Model[List[Doc], List[Floats2d]]: +> t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None +> output_layer = Softmax(nO, t2v_width, init_W=zero_init) +> softmax = with_array(output_layer) +> model = chain(tok2vec, softmax) +> model.set_ref("tok2vec", tok2vec) +> model.set_ref("softmax", output_layer) +> model.set_ref("output_layer", output_layer) +> return model +> ``` + +​ The Thinc `Model` class is a **generic type** that can specify its input and +output types. Python uses a square-bracket notation for this, so the type +~~Model[List, Dict]~~ says that each batch of inputs to the model will be a +list, and the outputs will be a dictionary. Both `typing.List` and `typing.Dict` +are also generics, allowing you to be more specific about the data. For +instance, you can write ~~Model[List[Doc], Dict[str, float]]~~ to specify that +the model expects a list of [`Doc`](/api/doc) objects as input, and returns a +dictionary mapping strings to floats. 
Some of the most common types you'll see +are: ​ + +| Type | Description | +| ------------------ | ---------------------------------------------------------------------------------------------------- | +| ~~List[Doc]~~ | A batch of [`Doc`](/api/doc) objects. Most components expect their models to take this as input. | +| ~~Floats2d~~ | A two-dimensional `numpy` or `cupy` array of floats. Usually 32-bit. | +| ~~Ints2d~~ | A two-dimensional `numpy` or `cupy` array of integers. Common dtypes include uint64, int32 and int8. | +| ~~List[Floats2d]~~ | A list of two-dimensional arrays, generally with one array per `Doc` and one row per token. | +| ~~Ragged~~ | A container to handle variable-length sequence data in an unpadded contiguous array. | +| ~~Padded~~ | A container to handle variable-length sequence data in a padded contiguous array. | + +The model type signatures help you figure out which model architectures and +components can **fit together**. For instance, the +[`TextCategorizer`](/api/textcategorizer) class expects a model typed +~~Model[List[Doc], Floats2d]~~, because the model will predict one row of +category probabilities per [`Doc`](/api/doc). In contrast, the +[`Tagger`](/api/tagger) class expects a model typed ~~Model[List[Doc], +List[Floats2d]]~~, because it needs to predict one row of probabilities per +token. + +There's no guarantee that two models with the same type signature can be used +interchangeably. There are many other ways they could be incompatible. However, +if the types don't match, they almost surely _won't_ be compatible. This little +bit of validation goes a long way, especially if you +[configure your editor](https://thinc.ai/docs/usage-type-checking) or other +tools to highlight these errors early. Thinc will also verify that your types +match correctly when your config file is processed at the beginning of training.
+ + + +If you're using a modern editor like Visual Studio Code, you can +[set up `mypy`](https://thinc.ai/docs/usage-type-checking#install) with the +custom Thinc plugin and get live feedback about mismatched types as you write +code. + +[![](../images/thinc_mypy.jpg)](https://thinc.ai/docs/usage-type-checking#linting) + + + +## Defining sublayers {#sublayers} + +​ Model architecture functions often accept **sublayers as arguments**, so that +you can try **substituting a different layer** into the network. Depending on +how the architecture function is structured, you might be able to define your +network structure entirely through the [config system](/usage/training#config), +using layers that have already been defined. ​The +[transformers documentation](/usage/embeddings-transformers#transformers) +section shows a common example of swapping in a different sublayer. + +In most neural network models for NLP, the most important parts of the network +are what we refer to as the +[embed and encode](https://explosion.ai/blog/embed-encode-attend-predict) steps. +These steps together compute dense, context-sensitive representations of the +tokens. Most of spaCy's default architectures accept a +[`tok2vec` embedding layer](/api/architectures#tok2vec-arch) as an argument, so +you can control this important part of the network separately. This makes it +easy to **switch between** transformer, CNN, BiLSTM or other feature extraction +approaches. And if you want to define your own solution, all you need to do is +register a ~~Model[List[Doc], List[Floats2d]]~~ architecture function, and +you'll be able to try it out in any of spaCy's components. ​ + + + +### Registering new architectures + +- Recap concept, link to config docs.
​ + +## Wrapping PyTorch, TensorFlow and other frameworks {#frameworks} + + + +Thinc allows you to wrap models written in other machine learning frameworks +like PyTorch, TensorFlow and MXNet using a unified +[`Model`](https://thinc.ai/docs/api-model) API. As well as **wrapping whole +models**, Thinc lets you call into an external framework for just **part of your +model**: you can have a model where you use PyTorch just for the transformer +layers, using "native" Thinc layers to do fiddly input and output +transformations and add on task-specific "heads", as efficiency is less of a +consideration for those parts of the network. + +Thinc uses a special class, [`Shim`](https://thinc.ai/docs/api-model#shim), to +hold references to external objects. This allows each wrapper space to define a +custom type, with whatever attributes and methods are helpful, to assist in +managing the communication between Thinc and the external library. The +[`Model`](https://thinc.ai/docs/api-model#model) class holds `shim` instances in +a separate list, and communicates with the shims about updates, serialization, +changes of device, etc. + +The wrapper will receive each batch of inputs, convert them into a suitable form +for the underlying model instance, and pass them over to the shim, which will +**manage the actual communication** with the model. The output is then passed +back into the wrapper, and converted for use in the rest of the network. The +equivalent procedure happens during backpropagation. Array conversion is handled +via the [DLPack](https://github.com/dmlc/dlpack) standard wherever possible, so +that data can be passed between the frameworks **without copying the data back** +to the host device unnecessarily. 
+ +| Framework | Wrapper layer | Shim | DLPack | +| -------------- | ------------------------------------------------------------------------- | --------------------------------------------------------- | --------------- | +| **PyTorch** | [`PyTorchWrapper`](https://thinc.ai/docs/api-layers#pytorchwrapper) | [`PyTorchShim`](https://thinc.ai/docs/api-model#shims) | ✅ | +| **TensorFlow** | [`TensorFlowWrapper`](https://thinc.ai/docs/api-layers#tensorflowwrapper) | [`TensorFlowShim`](https://thinc.ai/docs/api-model#shims) | ❌ 1 | +| **MXNet** | [`MXNetWrapper`](https://thinc.ai/docs/api-layers#mxnetwrapper) | [`MXNetShim`](https://thinc.ai/docs/api-model#shims) | ✅ | + +1. DLPack support in TensorFlow is now + [available](https://github.com/tensorflow/tensorflow/issues/24453) but + still experimental. + + + +## Models for trainable components {#components} + +- Interaction with `predict`, `get_loss` and `set_annotations` +- Initialization life-cycle with `begin_training`. +- Link to relation extraction notebook.
+ +```python +def update(self, examples): + docs = [ex.predicted for ex in examples] + refs = [ex.reference for ex in examples] + predictions, backprop = self.model.begin_update(docs) + gradient = self.get_loss(predictions, refs) + backprop(gradient) + +def __call__(self, doc): + predictions = self.model([doc]) + self.set_annotations(predictions) +``` diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 589cef44c..f2ec48d63 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -9,6 +9,7 @@ menu: - ['Tokenization', 'tokenization'] - ['Merging & Splitting', 'retokenization'] - ['Sentence Segmentation', 'sbd'] + - ['Vectors & Similarity', 'vectors-similarity'] - ['Language data', 'language-data'] --- @@ -428,7 +429,7 @@ nlp = spacy.load("en_core_web_sm") doc = nlp("fb is hiring a new vice president of global policy") ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] print('Before', ents) -# the model didn't recognise "fb" as an entity :( +# The model didn't recognize "fb" as an entity :( fb_ent = Span(doc, 0, 1, label="ORG") # create a Span for the new entity doc.ents = list(doc.ents) + [fb_ent] @@ -557,11 +558,11 @@ import spacy nlp = spacy.load("my_custom_el_model") doc = nlp("Ada Lovelace was born in London") -# document level +# Document level ents = [(e.text, e.label_, e.kb_id_) for e in doc.ents] print(ents) # [('Ada Lovelace', 'PERSON', 'Q7259'), ('London', 'GPE', 'Q84')] -# token level +# Token level ent_ada_0 = [doc[0].text, doc[0].ent_type_, doc[0].ent_kb_id_] ent_ada_1 = [doc[1].text, doc[1].ent_type_, doc[1].ent_kb_id_] ent_london_5 = [doc[5].text, doc[5].ent_type_, doc[5].ent_kb_id_] @@ -913,12 +914,12 @@ from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS from spacy.util import compile_infix_regex -# default tokenizer +# Default tokenizer 
nlp = spacy.load("en_core_web_sm") doc = nlp("mother-in-law") print([t.text for t in doc]) # ['mother', '-', 'in', '-', 'law'] -# modify tokenizer infix patterns +# Modify tokenizer infix patterns infixes = ( LIST_ELLIPSES + LIST_ICONS @@ -928,8 +929,8 @@ infixes = ( al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES ), r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), - # EDIT: commented out regex that splits on hyphens between letters: - #r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), + # βœ… Commented out regex that splits on hyphens between letters: + # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), ] ) @@ -980,7 +981,7 @@ nlp.tokenizer = my_tokenizer | Argument | Type | Description | | ----------- | ----------------- | ------------------------- | -| `text` | str | The raw text to tokenize. | +| `text` | `str` | The raw text to tokenize. | | **RETURNS** | [`Doc`](/api/doc) | The tokenized document. | #### Example 1: Basic whitespace tokenizer {#custom-tokenizer-example} @@ -1024,10 +1025,10 @@ produced by the tokenizer. > > If you're working with transformer models like BERT, check out the > [`spacy-transformers`](https://github.com/explosion/spacy-transformers) -> extension package and [documentation](/usage/transformers). It includes a -> pipeline component for using pretrained transformer weights and **training -> transformer models** in spaCy, as well as helpful utilities for aligning word -> pieces to linguistic tokenization. +> extension package and [documentation](/usage/embeddings-transformers). It +> includes a pipeline component for using pretrained transformer weights and +> **training transformer models** in spaCy, as well as helpful utilities for +> aligning word pieces to linguistic tokenization. ```python ### Custom BERT word piece tokenizer @@ -1510,7 +1511,7 @@ adding it to the pipeline using [`nlp.add_pipe`](/api/language#add_pipe). 
Here's an example of a component that implements a pre-processing rule for -splitting on `'...'` tokens. The component is added before the parser, which is +splitting on `"..."` tokens. The component is added before the parser, which is then used to further segment the text. That's possible, because `is_sent_start` is only set to `True` for some of the tokens – all others still specify `None` for unset sentence boundaries. This approach can be useful if you want to @@ -1540,6 +1541,135 @@ doc = nlp(text) print("After:", [sent.text for sent in doc.sents]) ``` +## Word vectors and semantic similarity {#vectors-similarity} + +import Vectors101 from 'usage/101/\_vectors-similarity.md' + + + +### Adding word vectors {#adding-vectors} + +Custom word vectors can be trained using a number of open-source libraries, such +as [Gensim](https://radimrehurek.com/gensim), [FastText](https://fasttext.cc), +or Tomas Mikolov's original +[Word2vec implementation](https://code.google.com/archive/p/word2vec/). Most +word vector libraries output an easy-to-read text-based format, where each line +consists of the word followed by its vector. For everyday use, we want to +convert the vectors model into a binary format that loads faster and takes up +less space on disk. The easiest way to do this is the +[`init model`](/api/cli#init-model) command-line utility. This will output a +spaCy model in the directory `/tmp/la_vectors_wiki_lg`, giving you access to +some nice Latin vectors. You can then pass the directory path to +[`spacy.load`](/api/top-level#spacy.load). 
+ +> #### Usage example +> +> ```python +> nlp_latin = spacy.load("/tmp/la_vectors_wiki_lg") +> doc1 = nlp_latin("Caecilius est in horto") +> doc2 = nlp_latin("servus est in atrio") +> doc1.similarity(doc2) +> ``` + +```cli +$ wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.la.300.vec.gz +$ python -m spacy init model en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300.vec.gz +``` + + + +To help you strike a good balance between coverage and memory usage, spaCy's +[`Vectors`](/api/vectors) class lets you map **multiple keys** to the **same +row** of the table. If you're using the +[`spacy init model`](/api/cli#init-model) command to create a vocabulary, +pruning the vectors will be taken care of automatically if you set the +`--prune-vectors` flag. You can also do it manually in the following steps: + +1. Start with a **word vectors model** that covers a huge vocabulary. For + instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg) + model provides 300-dimensional GloVe vectors for over 1 million terms of + English. +2. If your vocabulary has values set for the `Lexeme.prob` attribute, the + lexemes will be sorted by descending probability to determine which vectors + to prune. Otherwise, lexemes will be sorted by their order in the `Vocab`. +3. Call [`Vocab.prune_vectors`](/api/vocab#prune_vectors) with the number of + vectors you want to keep. 
+ +```python +nlp = spacy.load('en_vectors_web_lg') +n_vectors = 105000 # number of vectors to keep +removed_words = nlp.vocab.prune_vectors(n_vectors) + +assert len(nlp.vocab.vectors) <= n_vectors # unique vectors have been pruned +assert nlp.vocab.vectors.n_keys > n_vectors # but not the total entries +``` + +[`Vocab.prune_vectors`](/api/vocab#prune_vectors) reduces the current vector +table to a given number of unique entries, and returns a dictionary containing +the removed words, mapped to `(string, score)` tuples, where `string` is the +entry the removed word was mapped to, and `score` the similarity score between +the two words. + +```python +### Removed words +{ + "Shore": ("coast", 0.732257), + "Precautionary": ("caution", 0.490973), + "hopelessness": ("sadness", 0.742366), + "Continous": ("continuous", 0.732549), + "Disemboweled": ("corpse", 0.499432), + "biostatistician": ("scientist", 0.339724), + "somewheres": ("somewheres", 0.402736), + "observing": ("observe", 0.823096), + "Leaving": ("leaving", 1.0), +} +``` + +In the example above, the vector for "Shore" was removed and remapped to the +vector of "coast", which is deemed about 73% similar. "Leaving" was remapped to +the vector of "leaving", which is identical. If you're using the +[`init model`](/api/cli#init-model) command, you can set the `--prune-vectors` +option to easily reduce the size of the vectors as you add them to a spaCy +model: + +```cli +$ python -m spacy init model en /tmp/la_vectors_web_md --vectors-loc la.300d.vec.tgz --prune-vectors 10000 +``` + +This will create a spaCy model with vectors for the first 10,000 words in the +vectors model. All other words in the vectors model are mapped to the closest +vector among those retained. + + + +### Adding vectors individually {#adding-individual-vectors} + +The `vector` attribute is a **read-only** numpy or cupy array (depending on +whether you've configured spaCy to use GPU memory), with dtype `float32`. 
The +array is read-only so that spaCy can avoid unnecessary copy operations where +possible. You can modify the vectors via the [`Vocab`](/api/vocab) or +[`Vectors`](/api/vectors) table. Using the +[`Vocab.set_vector`](/api/vocab#set_vector) method is often the easiest approach +if you have vectors in an arbitrary format, as you can read in the vectors with +your own logic, and just set them with a simple loop. This method is likely to +be slower than approaches that work with the whole vectors table at once, but +it's a great approach for once-off conversions before you save out your model to +disk. + +```python +### Adding vectors +from spacy.vocab import Vocab + +vector_data = { + "dog": numpy.random.uniform(-1, 1, (300,)), + "cat": numpy.random.uniform(-1, 1, (300,)), + "orange": numpy.random.uniform(-1, 1, (300,)) +} +vocab = Vocab() +for word, vector in vector_data.items(): + vocab.set_vector(word, vector) +``` + ## Language data {#language-data} import LanguageData101 from 'usage/101/\_language-data.md' @@ -1594,9 +1724,8 @@ language name, and even train models with it and refer to it in your > needs to be available during training. You can load a Python file containing > the code using the `--code` argument: > -> ```bash -> ### {wrap="true"} -> $ python -m spacy train config.cfg --code code.py +> ```cli +> python -m spacy train config.cfg --code code.py > ``` ```python diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index 5e58d126d..ec0e02297 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -70,8 +70,7 @@ import Languages from 'widgets/languages.js' > nlp = MultiLanguage() > > # With lazy-loading -> from spacy.util import get_lang_class -> nlp = get_lang_class('xx') +> nlp = spacy.blank("xx") > ``` spaCy also supports models trained on more than one language. This is especially @@ -80,10 +79,10 @@ language-neutral models is `xx`. 
The language class, a generic subclass containing only the base language data, can be found in [`lang/xx`](https://github.com/explosion/spaCy/tree/master/spacy/lang/xx). -To load your model with the neutral, multi-language class, simply set -`"language": "xx"` in your [model package](/usage/training#models-generating)'s -`meta.json`. You can also import the class directly, or call -[`util.get_lang_class()`](/api/top-level#util.get_lang_class) for lazy-loading. +To train a model using the neutral multi-language class, you can set +`lang = "xx"` in your [training config](/usage/training#config). You can also +import the `MultiLanguage` class directly, or call +[`spacy.blank("xx")`](/api/top-level#spacy.blank) for lazy-loading. ### Chinese language support {#chinese new=2.3} @@ -117,15 +116,10 @@ The Chinese language class supports three word segmentation options: -In spaCy v3, the default Chinese word segmenter has switched from Jieba to -character segmentation. - - - - - -Note that [`pkuseg`](https://github.com/lancopku/pkuseg-python) doesn't yet ship -with pre-compiled wheels for Python 3.8. If you're running Python 3.8, you can +In spaCy v3.0, the default Chinese word segmenter has switched from Jieba to +character segmentation. Also note that +[`pkuseg`](https://github.com/lancopku/pkuseg-python) doesn't yet ship with +pre-compiled wheels for Python 3.8. If you're running Python 3.8, you can install it from our fork and compile it locally: ```bash @@ -139,25 +133,25 @@ $ pip install https://github.com/honnibal/pkuseg-python/archive/master.zip The `meta` argument of the `Chinese` language class supports the following tokenizer config settings: -| Name | Type | Description | -| ------------------ | ---- | ------------------------------------------------------------------------------------------------------- | -| `segmenter` | str | Word segmenter: `char`, `jieba` or `pkuseg`. Defaults to `char`.
| -| `pkuseg_model` | str | **Required for `pkuseg`:** Name of a model provided by `pkuseg` or the path to a local model directory. | -| `pkuseg_user_dict` | str | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. | +| Name | Description | +| ------------------ | --------------------------------------------------------------------------------------------------------------- | +| `segmenter` | Word segmenter: `char`, `jieba` or `pkuseg`. Defaults to `char`. ~~str~~ | +| `pkuseg_model` | **Required for `pkuseg`:** Name of a model provided by `pkuseg` or the path to a local model directory. ~~str~~ | +| `pkuseg_user_dict` | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. ~~str~~ | ```python ### Examples # Load "default" model cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"} -nlp = Chinese(meta={"tokenizer": {"config": cfg}}) +nlp = Chinese(config={"tokenizer": {"config": cfg}}) # Load local model cfg = {"segmenter": "pkuseg", "pkuseg_model": "/path/to/pkuseg_model"} -nlp = Chinese(meta={"tokenizer": {"config": cfg}}) +nlp = Chinese(config={"tokenizer": {"config": cfg}}) # Override the user directory cfg = {"segmenter": "pkuseg", "pkuseg_model": "default", "pkuseg_user_dict": "/path"} -nlp = Chinese(meta={"tokenizer": {"config": cfg}}) +nlp = Chinese(config={"tokenizer": {"config": cfg}}) ``` You can also modify the user dictionary on-the-fly: @@ -175,7 +169,7 @@ nlp.tokenizer.pkuseg_update_user_dict([], reset=True) - + The [Chinese models](/models/zh) provided by spaCy include a custom `pkuseg` model trained only on @@ -248,20 +242,20 @@ best-matching model compatible with your spaCy installation. 
> + nlp = spacy.load("en_core_web_sm") > ``` -```bash -# Download best-matching version of specific model for your spaCy installation -python -m spacy download en_core_web_sm +```cli +# Download best-matching version of a model for your spaCy installation +$ python -m spacy download en_core_web_sm # Download exact model version -python -m spacy download en_core_web_sm-2.2.0 --direct +$ python -m spacy download en_core_web_sm-3.0.0 --direct ``` The download command will [install the model](/usage/models#download-pip) via pip and place the package in your `site-packages` directory. -```bash -pip install spacy -python -m spacy download en_core_web_sm +```cli +$ pip install -U spacy +$ python -m spacy download en_core_web_sm ``` ```python @@ -280,10 +274,10 @@ click on the archive link and copy it to your clipboard. ```bash # With external URL -pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz +$ pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz # With local file -pip install /Users/you/en_core_web_sm-3.0.0.tar.gz +$ pip install /Users/you/en_core_web_sm-3.0.0.tar.gz ``` By default, this will install the model into your `site-packages` directory. You @@ -306,14 +300,16 @@ archive consists of a model directory that contains another directory with the model data. ```yaml -### Directory structure {highlight="7"} +### Directory structure {highlight="6"} └── en_core_web_md-3.0.0.tar.gz # downloaded archive - β”œβ”€β”€ meta.json # model meta data β”œβ”€β”€ setup.py # setup file for pip installation + β”œβ”€β”€ meta.json # copy of model meta └── en_core_web_md # πŸ“¦ model package β”œβ”€β”€ __init__.py # init for pip installation - β”œβ”€β”€ meta.json # model meta data └── en_core_web_md-3.0.0 # model data + β”œβ”€β”€ config.cfg # model config + β”œβ”€β”€ meta.json # model meta + └── ... 
# directories with component data ``` You can place the **model package directory** anywhere on your local file diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 00348065c..614f113b3 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1,6 +1,6 @@ --- title: Language Processing Pipelines -next: /usage/vectors-embeddings +next: /usage/embeddings-transformers menu: - ['Processing Text', 'processing'] - ['How Pipelines Work', 'pipelines'] @@ -108,11 +108,11 @@ class, or defined within a [model package](/usage/saving-loading#models). > > [components.tagger] > factory = "tagger" -> # settings for the tagger component +> # Settings for the tagger component > > [components.parser] > factory = "parser" -> # settings for the parser component +> # Settings for the parser component > ``` When you load a model, spaCy first consults the model's @@ -171,11 +171,11 @@ lang = "en" pipeline = ["tagger", "parser", "ner"] data_path = "path/to/en_core_web_sm/en_core_web_sm-2.0.0" -cls = spacy.util.get_lang_class(lang) # 1. Get Language instance, e.g. English() -nlp = cls() # 2. Initialize it +cls = spacy.util.get_lang_class(lang) # 1. Get Language class, e.g. English +nlp = cls() # 2. Initialize it for name in pipeline: - nlp.add_pipe(name) # 3. Add the component to the pipeline -nlp.from_disk(model_data_path) # 4. Load in the binary data + nlp.add_pipe(name) # 3. Add the component to the pipeline +nlp.from_disk(data_path) # 4. Load in the binary data ``` When you call `nlp` on a text, spaCy will **tokenize** it and then **call each @@ -187,9 +187,9 @@ which is then processed by the component next in the pipeline.
```python ### The pipeline under the hood -doc = nlp.make_doc("This is a sentence") # create a Doc from raw text -for name, proc in nlp.pipeline: # iterate over components in order - doc = proc(doc) # apply each component +doc = nlp.make_doc("This is a sentence") # Create a Doc from raw text +for name, proc in nlp.pipeline: # Iterate over components in order + doc = proc(doc) # Apply each component ``` The current processing pipeline is available as `nlp.pipeline`, which returns a @@ -232,7 +232,7 @@ available pipeline components and component functions. | `morphologizer` | [`Morphologizer`](/api/morphologizer) | Assign morphological features and coarse-grained POS tags. | | `senter` | [`SentenceRecognizer`](/api/sentencerecognizer) | Assign sentence boundaries. | | `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. | -| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | | +| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | Assign token-to-vector embeddings. | | `transformer` | [`Transformer`](/api/transformer) | Assign the tokens and outputs of a transformer model. | ### Disabling and modifying pipeline components {#disabling} @@ -265,7 +265,7 @@ for doc in nlp.pipe(texts, disable=["tagger", "parser"]): If you need to **execute more code** with components disabled – e.g. to reset the weights or update only some components during training – you can use the -[`nlp.select_pipes`](/api/language#select_pipes) contextmanager. At the end of +[`nlp.select_pipes`](/api/language#select_pipes) context manager. At the end of the `with` block, the disabled pipeline components will be restored automatically. Alternatively, `select_pipes` returns an object that lets you call its `restore()` method to restore the disabled components when needed. This @@ -274,7 +274,7 @@ blocks. ```python ### Disable for block -# 1. Use as a contextmanager +# 1. 
Use as a context manager with nlp.select_pipes(disable=["tagger", "parser"]): doc = nlp("I won't be tagged and parsed") doc = nlp("I will be tagged and parsed") @@ -324,9 +324,9 @@ pretrained components and new components trained on your data. When reusing components across models, keep in mind that the **vocabulary**, **vectors** and model settings **must match**. If a pretrained model includes -[word vectors](/usage/vectors-embeddings) and the component uses them as -features, the model you copy it to needs to have the _same_ vectors available – -otherwise, it won't be able to make the same predictions. +[word vectors](/usage/linguistic-features#vectors-similarity) and the component +uses them as features, the model you copy it to needs to have the _same_ vectors +available – otherwise, it won't be able to make the same predictions. @@ -473,14 +473,14 @@ only being able to modify it afterwards. > > @Language.component("my_component") > def my_component(doc): -> # do something to the doc here +> # Do something to the doc here > return doc > ``` -| Argument | Type | Description | -| ----------- | ----- | ------------------------------------------------------ | -| `doc` | `Doc` | The `Doc` object processed by the previous component. | -| **RETURNS** | `Doc` | The `Doc` object processed by this pipeline component. | +| Argument | Type | Description | +| ----------- | ----------------- | ------------------------------------------------------ | +| `doc` | [`Doc`](/api/doc) | The `Doc` object processed by the previous component. | +| **RETURNS** | [`Doc`](/api/doc) | The `Doc` object processed by this pipeline component. | The [`@Language.component`](/api/language#component) decorator lets you turn a simple function into a pipeline component. It takes at least one argument, the @@ -502,12 +502,12 @@ last** in the pipeline, or define a **custom name**. 
If no name is set and no > nlp.add_pipe("my_component", before="parser") > ``` -| Argument | Type | Description | -| -------- | --------- | ------------------------------------------------------------------------ | -| `last` | bool | If set to `True`, component is added **last** in the pipeline (default). | -| `first` | bool | If set to `True`, component is added **first** in the pipeline. | -| `before` | str / int | String name or index to add the new component **before**. | -| `after` | str / int | String name or index to add the new component **after**. | +| Argument | Description | +| -------- | --------------------------------------------------------------------------------- | +| `last` | If set to `True`, component is added **last** in the pipeline (default). ~~bool~~ | +| `first` | If set to `True`, component is added **first** in the pipeline. ~~bool~~ | +| `before` | String name or index to add the new component **before**. ~~Union[str, int]~~ | +| `after` | String name or index to add the new component **after**. ~~Union[str, int]~~ | @@ -623,21 +623,19 @@ added to the pipeline: > > @Language.factory("my_component") > def my_component(nlp, name): -> return MyComponent() +> return MyComponent() > ``` -| Argument | Type | Description | -| -------- | --------------------------- | ------------------------------------------------------------------------------------------------------------------------- | -| `nlp` | [`Language`](/api/language) | The current `nlp` object. Can be used to access the | -| `name` | str | The **instance name** of the component in the pipeline. This lets you identify different instances of the same component. | +| Argument | Description | +| -------- | --------------------------------------------------------------------------------------------------------------------------------- | +| `nlp` | The current `nlp` object. Can be used to access the shared vocab. 
~~Language~~ | +| `name` | The **instance name** of the component in the pipeline. This lets you identify different instances of the same component. ~~str~~ | All other settings can be passed in by the user via the `config` argument on [`nlp.add_pipe`](/api/language). The [`@Language.factory`](/api/language#factory) decorator also lets you define a `default_config` that's used as a fallback. - - ```python ### With config {highlight="4,9"} import spacy @@ -688,7 +686,7 @@ make your factory a separate function. That's also how spaCy does it internally. -### Example: Stateful component with settings +### Example: Stateful component with settings {#example-stateful-components} This example shows a **stateful** pipeline component for handling acronyms: based on a dictionary, it will detect acronyms and their expanded forms in both @@ -757,6 +755,85 @@ doc = nlp("LOL, be right back") print(doc._.acronyms) ``` +Many stateful components depend on **data resources** like dictionaries and +lookup tables that should ideally be **configurable**. For example, it makes +sense to make the `DICTIONARY` an argument of the registered function, so the +`AcronymComponent` can be re-used with different data. One logical solution +would be to make it an argument of the component factory, and allow it to be +initialized with different dictionaries. + +> #### Example +> +> Making the data an argument of the registered function would result in output +> like this in your `config.cfg`, which is typically not what you want (and only +> works for JSON-serializable data). +> +> ```ini +> [components.acronyms.dictionary] +> lol = "laugh out loud" +> brb = "be right back" +> ``` + +However, passing in the dictionary directly is problematic, because it means +that if a component saves out its config and settings, the +[`config.cfg`](/usage/training#config) will include a dump of the entire data, +since that's the config the component was created with.
+ +```diff +DICTIONARY = {"lol": "laughing out loud", "brb": "be right back"} +- default_config = {"dictionary": DICTIONARY} +``` + +If what you're passing in isn't JSON-serializable – e.g. a custom object like a +[model](#trainable-components) – saving out the component config becomes +impossible because there's no way for spaCy to know _how_ that object was +created, and what to do to create it again. This makes it much harder to save, +load and train custom models with custom components. A simple solution is to +**register a function** that returns your resources. The +[registry](/api/top-level#registry) lets you **map string names to functions** +that create objects, so given a name and optional arguments, spaCy will know how +to recreate the object. To register a function that returns a custom asset, you +can use the `@spacy.registry.assets` decorator with a single argument, the name: + +```python +### Registered function for assets {highlight="1"} +@spacy.registry.assets("acronyms.slang_dict.v1") +def create_acronyms_slang_dict(): + dictionary = {"lol": "laughing out loud", "brb": "be right back"} + dictionary.update({value: key for key, value in dictionary.items()}) + return dictionary +``` + +In your `default_config` (and later in your +[training config](/usage/training#config)), you can now refer to the function +registered under the name `"acronyms.slang_dict.v1"` using the `@assets` key. +This tells spaCy how to create the value, and when your component is created, +the result of the registered function is passed in as the key `"dictionary"`.
+ +> #### config.cfg +> +> ```ini +> [components.acronyms] +> factory = "acronyms" +> +> [components.acronyms.dictionary] +> @assets = "acronyms.slang_dict.v1" +> ``` + +```diff +- default_config = {"dictionary": DICTIONARY} ++ default_config = {"dictionary": {"@assets": "acronyms.slang_dict.v1"}} +``` + +Using a registered function also means that you can easily include your custom +components in models that you [train](/usage/training). To make sure spaCy knows +where to find your custom `@assets` function, you can pass in a Python file via +the argument `--code`. If someone else is using your component, all they have to +do to customize the data is to register their own function and swap out the +name. Registered functions can also take **arguments** by the way that can be +defined in the config as well – you can read more about this in the docs on +[training with custom code](/usage/training#custom-code). + ### Python type hints and pydantic validation {#type-hints new="3"} spaCy's configs are powered by our machine learning library Thinc's @@ -994,7 +1071,7 @@ loss is calculated and to add evaluation scores to the training output. | [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. | | [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_socre_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. | - + ## Extension attributes {#custom-components-attributes new="2"} @@ -1202,7 +1279,7 @@ document similarity method. Hooks let you customize some of the behaviors of the `Doc`, `Span` or `Token` objects by adding a component to the pipeline.
For instance, to customize the [`Doc.similarity`](/api/doc#similarity) method, you can add a component that -sets a custom function to `doc.user_hooks['similarity']`. The built-in +sets a custom function to `doc.user_hooks["similarity"]`. The built-in `Doc.similarity` method will check the `user_hooks` dict, and delegate to your function if you've set one. Similar results can be achieved by setting functions to `Doc.user_span_hooks` and `Doc.user_token_hooks`. @@ -1332,12 +1409,11 @@ function that takes a `Doc`, modifies it and returns it. - If you're looking to publish a model that depends on a custom pipeline component, you can either **require it** in the model package's dependencies, or – if the component is specific and lightweight – choose to **ship it with - your model package** and add it to the `Language` instance returned by the - model's `load()` method. For examples of this, check out the implementations - of spaCy's - [`load_model_from_init_py`](/api/top-level#util.load_model_from_init_py) - [`load_model_from_path`](/api/top-level#util.load_model_from_path) utility - functions. + your model package**. Just make sure the + [`@Language.component`](/api/language#component) or + [`@Language.factory`](/api/language#factory) decorator that registers the + custom component runs in your model's `__init__.py` or is exposed via an + [entry point](/usage/saving-loading#entry-points). 
- Once you're ready to share your extension with others, make sure to **add docs and installation instructions** (you can always link to this page for more diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index ccf8ec49f..1aaaeb3af 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -5,9 +5,12 @@ menu: - ['Intro & Workflow', 'intro'] - ['Directory & Assets', 'directory'] - ['Custom Projects', 'custom'] + - ['Remote Storage', 'remote'] - ['Integrations', 'integrations'] --- +## Introduction and workflow {#intro hidden="true"} + > #### πŸͺ Project templates > > Our [`projects`](https://github.com/explosion/projects) repo includes various @@ -19,20 +22,18 @@ spaCy projects let you manage and share **end-to-end spaCy workflows** for different **use cases and domains**, and orchestrate training, packaging and serving your custom models. You can start off by cloning a pre-defined project template, adjust it to fit your needs, load in your data, train a model, export -it as a Python package and share the project templates with your team. spaCy -projects can be used via the new [`spacy project`](/api/cli#project) command. -For an overview of the available project templates, check out the -[`projects`](https://github.com/explosion/projects) repo. spaCy projects also -[integrate](#integrations) with many other cool machine learning and data -science tools to track and manage your data and experiments, iterate on demos -and prototypes and ship your models into production. +it as a Python package, upload your outputs to a remote storage and share your +results with your team. spaCy projects can be used via the new +[`spacy project`](/api/cli#project) command and we provide templates in our +[`projects`](https://github.com/explosion/projects) repo. 
-## Introduction and workflow {#intro} - +![Illustration of project workflow and commands](../images/projects.svg) + + spaCy projects make it easy to integrate with many other **awesome tools** in the data science and machine learning ecosystem to track and manage your data @@ -67,8 +69,8 @@ project template and copies the files to a local directory. You can then run the project, e.g. to train a model and edit the commands and scripts to build fully custom workflows. -```bash -$ python -m spacy clone some_example_project +```cli +python -m spacy project clone some_example_project ``` By default, the project will be cloned into the current working directory. You @@ -95,9 +97,9 @@ to download and where to put them. The [`spacy project assets`](/api/cli#project-assets) will fetch the project assets for you: -```bash -cd some_example_project -python -m spacy project assets +```cli +$ cd some_example_project +$ python -m spacy project assets ``` ### 3. Run a command {#run} @@ -123,7 +125,7 @@ Commands consist of one or more steps and can be run with [`spacy project run`](/api/cli#project-run). The following will run the command `preprocess` defined in the `project.yml`: -```bash +```cli $ python -m spacy project run preprocess ``` @@ -153,10 +155,10 @@ other. For instance, to generate a packaged model, you might start by converting your data, then run [`spacy train`](/api/cli#train) to train your model on the converted data and if that's successful, run [`spacy package`](/api/cli#package) to turn the best model artifact into an installable Python package. 
The -following command run the workflow named `all` defined in the `project.yml`, and -execute the commands it specifies, in order: +following command runs the workflow named `all` defined in the `project.yml`, +and executes the commands it specifies, in order: -```bash +```cli $ python -m spacy project run all ``` @@ -169,6 +171,31 @@ advanced data pipelines and track your changes in Git, check out the from a workflow defined in your `project.yml` so you can manage your spaCy project as a DVC repo. +### 5. Optional: Push to remote storage {#push} + +> ```yaml +> ### project.yml +> remotes: +> default: 's3://my-spacy-bucket' +> local: '/mnt/scratch/cache' +> ``` + +After training a model, you can optionally use the +[`spacy project push`](/api/cli#project-push) command to upload your outputs to +a remote storage, using protocols like [S3](https://aws.amazon.com/s3/), +[Google Cloud Storage](https://cloud.google.com/storage) or SSH. This can help +you **export** your model packages, **share** work with your team, or **cache +results** to avoid repeating work. + +```cli +$ python -m spacy project push +``` + +The `remotes` section in your `project.yml` lets you assign names to the +different storages. To download state from a remote storage, you can use the +[`spacy project pull`](/api/cli#project-pull) command. For more details, see the +docs on [remote storage](#remote). + ## Project directory and assets {#directory} ### project.yml {#project-yml} @@ -188,7 +215,7 @@ https://github.com/explosion/spacy-boilerplates/blob/master/ner_fashion/project. 
| Section | Description | | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `variables` | A dictionary of variables that can be referenced in paths, URLs and scripts. For example, `{NAME}` will use the value of the variable `NAME`. | +| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | | `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. | | `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. | | `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. | @@ -197,7 +224,7 @@ https://github.com/explosion/spacy-boilerplates/blob/master/ner_fashion/project. 
### Dependencies and outputs {#deps-outputs} Each command defined in the `project.yml` can optionally define a list of -dependencies and outputs. These are the files the commands requires and creates. +dependencies and outputs. These are the files the command requires and creates. For example, a command for training a model may depend on a [`config.cfg`](/usage/training#config) and the training and evaluation data, and it will export a directory `model-best`, containing the best model, which you @@ -347,9 +374,9 @@ if __name__ == "__main__": In your `project.yml`, you can then run the script by calling `python scripts/custom_evaluation.py` with the function arguments. You can also -use the `variables` section to define reusable variables that will be -substituted in commands, paths and URLs. In this example, the `BATCH_SIZE` is -defined as a variable will be added in place of `{BATCH_SIZE}` in the script. +use the `vars` section to define reusable variables that will be substituted in +commands, paths and URLs. In this example, the batch size is defined as a +variable and will be added in place of `${vars.batch_size}` in the script. > #### Calling into Python > @@ -361,13 +388,13 @@ defined as a variable will be added in place of `{BATCH_SIZE}` in the script. ```yaml ### project.yml -variables: - BATCH_SIZE: 128 +vars: + batch_size: 128 commands: - name: evaluate script: - - 'python scripts/custom_evaluation.py {BATCH_SIZE} ./training/model-best ./corpus/eval.json' + - 'python scripts/custom_evaluation.py ${vars.batch_size} ./training/model-best ./corpus/eval.json' deps: - 'training/model-best' - 'corpus/eval.json' @@ -379,8 +406,8 @@ The [`spacy project clone`](/api/cli#project-clone) command lets you customize the repo to clone from using the `--repo` option. It calls into `git`, so you'll be able to clone from any repo that you have access to, including private repos.
-```bash -$ python -m spacy project your_project --repo https://github.com/you/repo +```cli +python -m spacy project clone your_project --repo https://github.com/you/repo ``` At a minimum, a valid project template needs to contain a @@ -419,6 +446,114 @@ assets: checksum: '5113dc04e03f079525edd8df3f4f39e3' ``` +## Remote Storage {#remote} + +You can persist your project outputs to a remote storage using the +[`project push`](/api/cli#project-push) command. This can help you **export** +your model packages, **share** work with your team, or **cache results** to +avoid repeating work. The [`project pull`](/api/cli#project-pull) command will +download any outputs that are in the remote storage and aren't available +locally. + +You can list one or more remotes in the `remotes` section of your +[`project.yml`](#project-yml) by mapping a string name to the URL of the +storage. Under the hood, spaCy uses the +[`smart-open`](https://github.com/RaRe-Technologies/smart_open) library to +communicate with the remote storages, so you can use any protocol that +`smart-open` supports, including [S3](https://aws.amazon.com/s3/), +[Google Cloud Storage](https://cloud.google.com/storage), SSH and more, although +you may need to install extra dependencies to use certain protocols. + +> #### Example +> +> ```cli +> $ python -m spacy project pull local +> ``` + +```yaml +### project.yml +remotes: + default: 's3://my-spacy-bucket' + local: '/mnt/scratch/cache' + stuff: 'ssh://myserver.example.com/whatever' +``` + + + +Inside the remote storage, spaCy uses a clever **directory structure** to avoid +overwriting files. The top level of the directory structure is a URL-encoded +version of the output's path. Within this directory are subdirectories named +according to a hash of the command string and the command's dependencies. +Finally, within those directories are files, named according to an MD5 hash of +their contents. 
+ + + +```yaml +└── urlencoded_file_path # Path of original file + ├── some_command_hash # Hash of command you ran + │ ├── some_content_hash # Hash of file content + │ └── another_content_hash + └── another_command_hash + └── third_content_hash +``` + + + +For instance, let's say you had the following command in your `project.yml`: + +```yaml +### project.yml +- name: train + help: 'Train a spaCy model using the specified corpus and config' + script: + - 'spacy train ./config.cfg --output training/' + deps: + - 'corpus/train' + - 'corpus/dev' + - 'config.cfg' + outputs: + - 'training/model-best' +``` + +> #### Example +> +> ``` +> └── s3://my-spacy-bucket/training%2Fmodel-best +> └── 1d8cb33a06cc345ad3761c6050934a1b +> └── d8e20c3537a084c5c10d95899fe0b1ff +> ``` + +After you finish training, you run [`project push`](/api/cli#project-push) to +make sure the `training/model-best` output is saved to remote storage. spaCy +will then construct a hash from your command script and the listed dependencies, +`corpus/train`, `corpus/dev` and `config.cfg`, in order to identify the +execution context of your output. It will then compute an MD5 hash of the +`training/model-best` directory, and use those three pieces of information to +construct the storage URL. + +```cli +$ python -m spacy project run train +$ python -m spacy project push +``` + +If you change the command or one of its dependencies (for instance, by editing +the [`config.cfg`](/usage/training#config) file to tune the hyperparameters), a +different creation hash will be calculated, so when you use +[`project push`](/api/cli#project-push) you won't be overwriting your previous +file. The system even supports multiple outputs for the same file and the same +context, which can happen if your training process is not deterministic, or if +you have dependencies that aren't represented in the command.
+ +In summary, the [`spacy project`](/api/cli#project) remote storages are designed +to make a particular set of trade-offs. Priority is placed on **convenience**, +**correctness** and **avoiding data loss**. You can use +[`project push`](/api/cli#project-push) freely, as you'll never overwrite remote +state, and you don't have to come up with names or version numbers. However, +it's up to you to manage the size of your remote storage, and to remove files +that are no longer relevant to you. + ## Integrations {#integrations} ### Data Version Control (DVC) {#dvc} @@ -445,9 +580,9 @@ to include support for remote storage like Google Cloud Storage, S3, Azure, SSH and more. ```bash -pip install dvc # Install DVC -git init # Initialize a Git repo -dvc init # Initialize a DVC project +$ pip install dvc # Install DVC +$ git init # Initialize a Git repo +$ dvc init # Initialize a DVC project ``` @@ -466,8 +601,8 @@ can then manage your spaCy project like any other DVC project, run and [`dvc repro`](https://dvc.org/doc/command-reference/repro) to reproduce the workflow or individual commands. -```bash -$ python -m spacy project dvc [workflow name] +```cli +$ python -m spacy project dvc [workflow_name] ``` @@ -508,23 +643,24 @@ and evaluation set. 
> #### Example usage > -> ```bash +> ```cli > $ python -m spacy project run annotate > ``` ```yaml ### project.yml -variables: - PRODIGY_DATASET: 'ner_articles' - PRODIGY_LABELS: 'PERSON,ORG,PRODUCT' - PRODIGY_MODEL: 'en_core_web_md' +vars: + prodigy: + dataset: 'ner_articles' + labels: 'PERSON,ORG,PRODUCT' + model: 'en_core_web_md' commands: - name: annotate - script: - - 'python -m prodigy ner.correct {PRODIGY_DATASET} ./assets/raw_data.jsonl {PRODIGY_MODEL} --labels {PRODIGY_LABELS}' - - 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner {PRODIGY_DATASET}' + - 'python -m prodigy ner.correct ${vars.prodigy.dataset} ./assets/raw_data.jsonl ${vars.prodigy.model} --labels ${vars.prodigy.labels}' + - 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner ${vars.prodigy.dataset}' - 'python -m spacy convert ./corpus/train.json ./corpus/train.spacy' - 'python -m spacy convert ./corpus/eval.json ./corpus/eval.spacy' - deps: @@ -595,7 +731,7 @@ spacy_streamlit.visualize(MODELS, DEFAULT_TEXT, visualizers=["ner"]) > #### Example usage > -> ```bash +> ```cli > $ python -m spacy project run visualize > ``` @@ -636,8 +772,8 @@ API. > #### Example usage > -> ```bash -> $ python -m spacy project run visualize +> ```cli +> $ python -m spacy project run serve > ``` diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index d7c3d49f8..7fdce032e 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -157,19 +157,20 @@ The available token pattern keys correspond to a number of [`Token` attributes](/api/token#attributes). The supported attributes for rule-based matching are: -| Attribute | Type | Β Description | -| -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ | -| `ORTH` | str | The exact verbatim text of a token. 
| -| `TEXT` 2.1 | str | The exact verbatim text of a token. | -| `LOWER` | str | The lowercase form of the token text. | -| Β `LENGTH` | int | The length of the token text. | -| Β `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. | -| Β `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. | -| Β `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. | -| Β `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. | -| Β `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. | -| `ENT_TYPE` | str | The token's entity label. | -| `_` 2.1 | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). | +| Attribute | Β Description | +| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- | +| `ORTH` | The exact verbatim text of a token. ~~str~~ | +| `TEXT` 2.1 | The exact verbatim text of a token. ~~str~~ | +| `LOWER` | The lowercase form of the token text. ~~str~~ | +| Β `LENGTH` | The length of the token text. ~~int~~ | +| Β `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ | +| Β `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ | +| Β `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ | +| Β `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ | +| Β `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~ | +| `ENT_TYPE` | The token's entity label. 
~~str~~ | +| `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ | +| `OP` | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~ | @@ -231,11 +232,11 @@ following rich comparison attributes are available: > pattern2 = [{"LENGTH": {">=": 10}}] > ``` -| Attribute | Value Type | Description | -| -------------------------- | ---------- | --------------------------------------------------------------------------------- | -| `IN` | any | Attribute value is member of a list. | -| `NOT_IN` | any | Attribute value is _not_ member of a list. | -| `==`, `>=`, `<=`, `>`, `<` | int, float | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. | +| Attribute | Description | +| -------------------------- | ------------------------------------------------------------------------------------------------------- | +| `IN` | Attribute value is member of a list. ~~Any~~ | +| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ | +| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ | #### Regular expressions {#regex new="2.1"} @@ -485,12 +486,12 @@ This allows you to write callbacks that consider the entire set of matched phrases, so that you can resolve overlaps and other conflicts in whatever way you prefer. -| Argument | Type | Description | -| --------- | --------- | -------------------------------------------------------------------------------------------------------------------- | -| `matcher` | `Matcher` | The matcher instance. | -| `doc` | `Doc` | The document the matcher was used on. | -| `i` | int | Index of the current match (`matches[i`]). | -| `matches` | list | Β A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. 
| +| Argument | Description | +| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | +| `matcher` | The matcher instance. ~~Matcher~~ | +| `doc` | The document the matcher was used on. ~~Doc~~ | +| `i` | Index of the current match (`matches[i`]). ~~int~~ | +| `matches` | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. ~~Β List[Tuple[int, int int]]~~ | ### Using custom pipeline components {#matcher-pipeline} @@ -510,21 +511,21 @@ from spacy.language import Language from spacy.matcher import Matcher from spacy.tokens import Token -# We're using a component factory because the component needs to be initialized -# with the shared vocab via the nlp object +# We're using a component factory because the component needs to be +# initialized with the shared vocab via the nlp object @Language.factory("html_merger") def create_bad_html_merger(nlp, name): - return BadHTMLMerger(nlp) + return BadHTMLMerger(nlp.vocab) class BadHTMLMerger: - def __init__(self, nlp): + def __init__(self, vocab): patterns = [ [{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}], [{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}], ] # Register a new token extension to flag bad HTML Token.set_extension("bad_html", default=False) - self.matcher = Matcher(nlp.vocab) + self.matcher = Matcher(vocab) self.matcher.add("BAD_HTML", patterns) def __call__(self, doc): @@ -1095,11 +1096,12 @@ ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}]) nlp.to_disk("/path/to/model") ``` -The saved model now includes the `"entity_ruler"` in its `"pipeline"` setting in -the `meta.json`, and the model directory contains a file `entityruler.jsonl` -with the patterns. When you load the model back in, all pipeline components will -be restored and deserialized – including the entity ruler. 
This lets you ship -powerful model packages with binary weights _and_ rules included! +The saved model now includes the `"entity_ruler"` in its +[`config.cfg`](/api/data-formats#config) and the model directory contains a file +`entityruler.jsonl` with the patterns. When you load the model back in, all +pipeline components will be restored and deserialized – including the entity +ruler. This lets you ship powerful model packages with binary weights _and_ +rules included! ### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"} diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index 904477733..3f9435f5e 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -243,7 +243,7 @@ file `data.json` in its subdirectory: ### Directory structure {highlight="2-3"} └── /path/to/model β”œβ”€β”€ my_component # data serialized by "my_component" - | └── data.json + β”‚ └── data.json β”œβ”€β”€ ner # data for "ner" component β”œβ”€β”€ parser # data for "parser" component β”œβ”€β”€ tagger # data for "tagger" component @@ -551,9 +551,9 @@ setup( ) ``` -After installing the package, the the custom colors will be used when -visualizing text with `displacy`. Whenever the label `SNEK` is assigned, it will -be displayed in `#3dff74`. +After installing the package, the custom colors will be used when visualizing +text with `displacy`. Whenever the label `SNEK` is assigned, it will be +displayed in `#3dff74`. import DisplaCyEntSnekHtml from 'images/displacy-ent-snek.html' @@ -562,16 +562,39 @@ import DisplaCyEntSnekHtml from 'images/displacy-ent-snek.html' ## Saving, loading and distributing models {#models} After training your model, you'll usually want to save its state, and load it -back later. You can do this with the -[`Language.to_disk()`](/api/language#to_disk) method: +back later. 
You can do this with the [`Language.to_disk`](/api/language#to_disk) +method: ```python -nlp.to_disk('/home/me/data/en_example_model') +nlp.to_disk("./en_example_model") ``` -The directory will be created if it doesn't exist, and the whole pipeline will -be written out. To make the model more convenient to deploy, we recommend -wrapping it as a Python package. +The directory will be created if it doesn't exist, and the whole pipeline data, +model meta and model configuration will be written out. To make the model more +convenient to deploy, we recommend wrapping it as a +[Python package](/api/cli#package). + + + +When you save a model in spaCy v3.0+, two files will be exported: a +[`config.cfg`](/api/data-formats#config) based on +[`nlp.config`](/api/language#config) and a [`meta.json`](/api/data-formats#meta) +based on [`nlp.meta`](/api/language#meta). + +- **config**: Configuration used to create the current `nlp` object, its + pipeline components and models, as well as training settings and + hyperparameters. Can include references to registered functions like + [pipeline components](/usage/processing-pipelines#custom-components) or + [model architectures](/api/architectures). Given a config, spaCy is able to + reconstruct the whole tree of objects and the `nlp` object. An exported config + can also be used to [train a model](/usage/training#config) with the same + settings. +- **meta**: Meta information about the model and the Python package, such as the + author information, license, version, data sources and label scheme. This is + mostly used for documentation purposes and for packaging models. It has no + impact on the functionality of the `nlp` object. + + ### Generating a model package {#models-generating} @@ -606,8 +629,8 @@ docs.
> } > ``` -```bash -$ python -m spacy package /home/me/data/en_example_model /home/me/my_models +```cli +$ python -m spacy package ./en_example_model ./my_models ``` This command will create a model package directory and will run @@ -623,6 +646,9 @@ model package that can be installed using `pip install`. β”œβ”€β”€ en_example_model # model directory β”‚ β”œβ”€β”€ __init__.py # init for pip installation β”‚ └── en_example_model-1.0.0 # model data + β”‚ β”œβ”€β”€ config.cfg # model config + β”‚ β”œβ”€β”€ meta.json # model meta + β”‚ └── ... # directories with component data └── dist └── en_example_model-1.0.0.tar.gz # installable package ``` @@ -644,13 +670,25 @@ you can also **ship the code with your model** and include it in the [pipeline components](/usage/processing-pipelines#custom-components) before the `nlp` object is created. + + +While it's no problem to edit the package code or meta information, avoid making +edits to the `config.cfg` **after** training, as this can easily lead to data +incompatibility. For instance, changing an architecture or hyperparameter can +mean that the trained weights are now incompatible. If you want to make +adjustments, you can do so before training. Otherwise, you should always trust +spaCy to export the current state of its `nlp` objects via +[`nlp.config`](/api/language#config). + + + ### Loading a custom model package {#loading} To load a model from a data directory, you can use [`spacy.load()`](/api/top-level#spacy.load) with the local path. This will look -for a meta.json in the directory and use the `lang` and `pipeline` settings to -initialize a `Language` class with a processing pipeline and load in the model -data. +for a `config.cfg` in the directory and use the `lang` and `pipeline` settings +to initialize a `Language` class with a processing pipeline and load in the +model data. 
```python nlp = spacy.load("/path/to/model") diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md index 49cdd96ea..8ea6a6ca0 100644 --- a/website/docs/usage/spacy-101.md +++ b/website/docs/usage/spacy-101.md @@ -160,7 +160,7 @@ the website or company in a specific context. > #### Loading models > -> ```bash +> ```cli > $ python -m spacy download en_core_web_sm > > >>> import spacy @@ -247,7 +247,7 @@ import Vectors101 from 'usage/101/\_vectors-similarity.md' To learn more about word vectors, how to **customize them** and how to load **your own vectors** into spaCy, see the usage guide on -[using word vectors and semantic similarities](/usage/vectors-embeddings). +[using word vectors and semantic similarities](/usage/linguistic-features#vectors-similarity). diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index fc1624ec1..59766bada 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -1,13 +1,12 @@ --- title: Training Models -next: /usage/projects +next: /usage/layers-architectures menu: - ['Introduction', 'basics'] - ['Quickstart', 'quickstart'] - ['Config System', 'config'] - - ['Custom Models', 'custom-models'] - - ['Transfer Learning', 'transfer-learning'] - - ['Parallel Training', 'parallel-training'] + - ['Custom Functions', 'custom-functions'] + # - ['Parallel Training', 'parallel-training'] - ['Internal API', 'api'] --- @@ -30,13 +29,13 @@ ready-to-use spaCy models. -## Quickstart {#quickstart} +## Quickstart {#quickstart tag="new"} The recommended way to train your spaCy models is via the [`spacy train`](/api/cli#train) command on the command line. It only needs a single [`config.cfg`](#config) **configuration file** that includes all settings -and hyperparameters. You can optionally [overwritten](#config-overrides) -settings on the command line, and load in a Python file to register +and hyperparameters. 
You can optionally [overwrite](#config-overrides) settings +on the command line, and load in a Python file to register [custom functions](#custom-code) and architectures. This quickstart widget helps you generate a starter config with the **recommended settings** for your specific use case. It's also available in spaCy as the @@ -66,7 +65,7 @@ the [`init fill-config`](/api/cli#init-fill-config) command to fill in the remaining defaults. Training configs should always be **complete and without hidden defaults**, to keep your experiments reproducible. -```bash +```cli $ python -m spacy init fill-config base_config.cfg config.cfg ``` @@ -76,37 +75,24 @@ $ python -m spacy init fill-config base_config.cfg config.cfg > your training and development data, get useful stats, and find problems like > invalid entity annotations, cyclic dependencies, low data labels and more. > -> ```bash -> $ python -m spacy debug data config.cfg --verbose +> ```cli +> $ python -m spacy debug data config.cfg > ``` Instead of exporting your starter config from the quickstart widget and auto-filling it, you can also use the [`init config`](/api/cli#init-config) -command and specify your requirement and settings and CLI arguments. You can now +command and specify your requirement and settings as CLI arguments. You can now add your data and run [`train`](/api/cli#train) with your config. See the [`convert`](/api/cli#convert) command for details on how to convert your data to spaCy's binary `.spacy` format. You can either include the data paths in the `[paths]` section of your config, or pass them in via the command line. -```bash +```cli $ python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy ``` - - -The easiest way to get started with an end-to-end training process is to clone a -[project](/usage/projects) template. Projects let you manage multi-step -workflows, from data preprocessing to training and packaging your model. 
- - - ## Training config {#config} -> #### Migration from spaCy v2.x -> -> TODO: once we have an answer for how to update the training command -> (`spacy migrate`?), add details here - Training config files include all **settings and hyperparameters** for training your model. Instead of providing lots of arguments on the command line, you only need to pass your `config.cfg` file to [`spacy train`](/api/cli#train). Under @@ -124,14 +110,15 @@ Some of the main advantages and features of spaCy's training config are: functions like [model architectures](/api/architectures), [optimizers](https://thinc.ai/docs/api-optimizers) or [schedules](https://thinc.ai/docs/api-schedules) and define arguments that are - passed into them. You can also register your own functions to define - [custom architectures](#custom-models), reference them in your config and - tweak their parameters. + passed into them. You can also + [register your own functions](#custom-functions) to define custom + architectures or methods, reference them in your config and tweak their + parameters. - **Interpolation.** If you have hyperparameters or other settings used by multiple components, define them once and reference them as [variables](#config-interpolation). - **Reproducibility with no hidden defaults.** The config file is the "single - source of truth" and includes all settings. + source of truth" and includes all settings. - **Automated checks and validation.** When you load a config, spaCy checks if the settings are complete and if all values have the correct types. This lets you catch potential mistakes early. In your custom architectures, you can use @@ -144,7 +131,7 @@ https://github.com/explosion/spaCy/blob/develop/spacy/default_config.cfg Under the hood, the config is parsed into a dictionary. It's divided into sections and subsections, indicated by the square brackets and dot notation. For -example, `[training]` is a section and `[training.batch_size]` a subsections. 
+example, `[training]` is a section and `[training.batch_size]` a subsection. Subsections can define values, just like a dictionary, or use the `@` syntax to refer to [registered functions](#config-functions). This allows the config to not just define static settings, but also construct objects like architectures, @@ -155,7 +142,7 @@ sections of a config file are: | ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `nlp` | Definition of the `nlp` object, its tokenizer and [processing pipeline](/usage/processing-pipelines) component names. | | `components` | Definitions of the [pipeline components](/usage/processing-pipelines) and their models. | -| `paths` | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths:train}`, and can be [overwritten](#config-overrides) on the CLI. | +| `paths` | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths.train}`, and can be [overwritten](#config-overrides) on the CLI. | | `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. | | `training` | Settings and controls for the training and evaluation process. | | `pretraining` | Optional settings and controls for the [language model pretraining](#pretraining). | @@ -186,9 +173,8 @@ For cases like this, you can set additional command-line options starting with `--paths.train ./corpus/train.spacy` sets the `train` value in the `[paths]` block. 
-```bash -$ python -m spacy train config.cfg --paths.train ./corpus/train.spacy ---paths.dev ./corpus/dev.spacy --training.batch_size 128 +```cli +$ python -m spacy train config.cfg --paths.train ./corpus/train.spacy --paths.dev ./corpus/dev.spacy --training.batch_size 128 ``` Only existing sections and values in the config can be overwritten. At the end @@ -225,21 +211,21 @@ passed to the component factory as arguments. This lets you configure the model settings and hyperparameters. If a component block defines a `source`, the component will be copied over from an existing pretrained model, with its existing weights. This lets you include an already trained component in your -model pipeline, or update a pretrained components with more data specific to -your use case. +model pipeline, or update a pretrained component with more data specific to your +use case. ```ini ### config.cfg (excerpt) [components] -# "parser" and "ner" are sourced from pretrained model +# "parser" and "ner" are sourced from a pretrained model [components.parser] source = "en_core_web_sm" [components.ner] source = "en_core_web_sm" -# "textcat" and "custom" are created blank from built-in / custom factory +# "textcat" and "custom" are created blank from a built-in / custom factory [components.textcat] factory = "textcat" @@ -293,12 +279,12 @@ batch_size = 128 ``` To refer to a function instead, you can make `[training.batch_size]` its own -section and use the `@` syntax specify the function and its arguments – in this -case [`compounding.v1`](https://thinc.ai/docs/api-schedules#compounding) defined -in the [function registry](/api/top-level#registry). All other values defined in -the block are passed to the function as keyword arguments when it's initialized. 
-You can also use this mechanism to register -[custom implementations and architectures](#custom-models) and reference them +section and use the `@` syntax to specify the function and its arguments – in +this case [`compounding.v1`](https://thinc.ai/docs/api-schedules#compounding) +defined in the [function registry](/api/top-level#registry). All other values +defined in the block are passed to the function as keyword arguments when it's +initialized. You can also use this mechanism to register +[custom implementations and architectures](#custom-functions) and reference them from your configs. > #### How the config is resolved @@ -324,19 +310,9 @@ compound = 1.001 Another very useful feature of the config system is that it supports variable interpolation for both **values and sections**. This means that you only need to define a setting once and can reference it across your config using the -`${section:value}` or `${section.block}` syntax. In this example, the value of -`seed` is reused within the `[training]` block, and the whole block of -`[training.optimizer]` is reused in `[pretraining]` and will become -`pretraining.optimizer`. - -> #### Note on syntax -> -> There are two different ways to format your variables, depending on whether -> you want to reference a single value or a block. Values are specified after a -> `:`, while blocks are specified with a `.`: -> -> 1. `${section:value}`, `${section.subsection:value}` -> 2. `${section.block}`, `${section.subsection.block}` +`${section.value}` syntax. In this example, the value of `seed` is reused within +the `[training]` block, and the whole block of `[training.optimizer]` is reused +in `[pretraining]` and will become `pretraining.optimizer`. 
```ini ### config.cfg (excerpt) {highlight="5,18"} @@ -344,7 +320,7 @@ define a setting once and can reference it across your config using the seed = 0 [training] -seed = ${system:seed} +seed = ${system.seed} [training.optimizer] @optimizers = "Adam.v1" @@ -368,7 +344,7 @@ to a string. [paths] version = 5 root = "/Users/you/data" -train = "${paths:root}/train_${paths:version}.spacy" +train = "${paths.root}/train_${paths.version}.spacy" # Result: /Users/you/data/train_5.spacy ``` @@ -384,7 +360,42 @@ that reference this variable. ### Model architectures {#model-architectures} - +> #### 💡 Model type annotations +> +> In the documentation and code base, you may come across type annotations and +> descriptions of [Thinc](https://thinc.ai) model types, like ~~Model[List[Doc], +> List[Floats2d]]~~. This so-called generic type describes the layer and its +> input and output type – in this case, it takes a list of `Doc` objects as the +> input and a list of 2-dimensional arrays of floats as the output. You can read +> more about defining Thinc models [here](https://thinc.ai/docs/usage-models). +> Also see the [type checking](https://thinc.ai/docs/usage-type-checking) for +> how to enable linting in your editor to see live feedback if your inputs and +> outputs don't match. + +A **model architecture** is a function that wires up a Thinc +[`Model`](https://thinc.ai/docs/api-model) instance, which you can then use in a +component or as a layer of a larger network. You can use Thinc as a thin +[wrapper around frameworks](https://thinc.ai/docs/usage-frameworks) such as +PyTorch, TensorFlow or MXNet, or you can implement your logic in Thinc +[directly](https://thinc.ai/docs/usage-models). + +spaCy's built-in components will never construct their `Model` instances +themselves, so you won't have to subclass the component to change its model +architecture. You can just **update the config** so that it refers to a +different registered function.
Once the component has been created, its `Model` +instance has already been assigned, so you cannot change its model architecture. +The architecture is like a recipe for the network, and you can't change the +recipe once the dish has already been prepared. You have to make a new one. +spaCy includes a variety of built-in [architectures](/api/architectures) for +different tasks. For example: + +| Architecture | Description | +| ----------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [HashEmbedCNN](/api/architectures#HashEmbedCNN) | Build spaCy’s "standard" embedding layer, which uses hash embedding with subword features and a CNN with layer-normalized maxout. ~~Model[List[Doc], List[Floats2d]]~~ | +| [TransitionBasedParser](/api/architectures#TransitionBasedParser) | Build a [transition-based parser](https://explosion.ai/blog/parsing-english-in-python) model used in the default [`EntityRecognizer`](/api/entityrecognizer) and [`DependencyParser`](/api/dependencyparser). ~~Model[List[Docs], List[List[Floats2d]]]~~ | +| [TextCatEnsemble](/api/architectures#TextCatEnsemble) | Stacked ensemble of a bag-of-words model and a neural network model with an internal CNN embedding layer. Used in the default [`TextCategorizer`](/api/textcategorizer). ~~Model[List[Doc], Floats2d]~~ | + + ### Metrics, training output and weighted scores {#metrics} @@ -430,19 +441,15 @@ components are weighted equally. - - | Name | Description | | -------------------------- | ----------------------------------------------------------------------------------------------------------------------- | | **Loss** | The training loss representing the amount of work left for the optimizer. Should decrease, but usually not to `0`. 
| -| **Precision** (P) | Should increase. | -| **Recall** (R) | Should increase. | -| **F-Score** (F) | The weighted average of precision and recall. Should increase. | +| **Precision** (P) | Percentage of predicted annotations that were correct. Should increase. | +| **Recall** (R) | Percentage of reference annotations recovered. Should increase. | +| **F-Score** (F) | Harmonic mean of precision and recall. Should increase. | | **UAS** / **LAS** | Unlabeled and labeled attachment score for the dependency parser, i.e. the percentage of correct arcs. Should increase. | | **Words per second** (WPS) | Prediction speed in words per second. Should stay stable. | - - Note that if the development data has raw text, some of the gold-standard entities might not align to the predicted tokenization. These tokenization errors are **excluded from the NER evaluation**. If your tokenization makes it @@ -451,14 +458,32 @@ still look good. -## Custom model implementations and architectures {#custom-models} +## Custom Functions {#custom-functions} - +Registered functions in the training config files can refer to built-in +implementations, but you can also plug in fully **custom implementations**. All +you need to do is register your function using the `@spacy.registry` decorator +with the name of the respective [registry](/api/top-level#registry), e.g. +`@spacy.registry.architectures`, and a string name to assign to your function. +Registering custom functions allows you to **plug in models** defined in PyTorch +or TensorFlow, make **custom modifications** to the `nlp` object, create custom +optimizers or schedules, or **stream in data** and preprocess it on the fly +while training. + +Each custom function can have any number of arguments that are passed in via +the [config](#config), just like the built-in functions. If your function defines +**default argument values**, spaCy is able to auto-fill your config when you run +[`init fill-config`](/api/cli#init-fill-config).
If you want to make sure that a +given parameter is always explicitly set in the config, avoid setting a default +value for it. + + ### Training with custom code {#custom-code} -> ```bash -> ### Example {wrap="true"} +> #### Example +> +> ```cli > $ python -m spacy train config.cfg --code functions.py > ``` @@ -485,11 +510,11 @@ language class and `nlp` object at different points of the lifecycle: | `after_creation` | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. Useful for modifying the tokenizer. | | `after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components. | -The `@spacy.registry.callbacks` decorator lets you register that function in the -`callbacks` [registry](/api/top-level#registry) under a given name. You can then -reference the function in a config block using the `@callbacks` key. If a block -contains a key starting with an `@`, it's interpreted as a reference to a -function. Because you've registered the function, spaCy knows how to create it +The `@spacy.registry.callbacks` decorator lets you register your custom function +in the `callbacks` [registry](/api/top-level#registry) under a given name. You +can then reference the function in a config block using the `@callbacks` key. If +a block contains a key starting with an `@`, it's interpreted as a reference to +a function. Because you've registered the function, spaCy knows how to create it when you reference `"customize_language_data"` in your config. Here's an example of a callback that runs before the `nlp` object is created and adds a few custom tokenization rules to the defaults: @@ -564,9 +589,9 @@ spaCy's configs are powered by our machine learning library Thinc's using [`pydantic`](https://github.com/samuelcolvin/pydantic).
If your registered function provides type hints, the values that are passed in will be checked against the expected types. For example, `debug: bool` in the example above will -ensure that the value received as the argument `debug` is an boolean. If the +ensure that the value received as the argument `debug` is a boolean. If the value can't be coerced into a boolean, spaCy will raise an error. -`start: pydantic.StrictBool` will force the value to be an boolean and raise an +`debug: pydantic.StrictBool` will force the value to be a boolean and raise an error if it's not – for instance, if your config defines `1` instead of `true`. @@ -576,9 +601,8 @@ you can now run [`spacy train`](/api/cli#train) and point the argument `--code` to your Python file. Before loading the config, spaCy will import the `functions.py` module and your custom functions will be registered. -```bash -### Training with custom code {wrap="true"} -python -m spacy train config.cfg --output ./output --code ./functions.py +```cli +$ python -m spacy train config.cfg --output ./output --code ./functions.py ``` #### Example: Custom batch size schedule {#custom-code-schedule} @@ -612,9 +636,7 @@ In your config, you can now reference the schedule in the starting with an `@`, it's interpreted as a reference to a function. All other settings in the block will be passed to the function as keyword arguments. Keep in mind that the config shouldn't have any hidden defaults and all arguments on -the functions need to be represented in the config. If your function defines -**default argument values**, spaCy is able to auto-fill your config when you run -[`init fill-config`](/api/cli#init-fill-config). +the functions need to be represented in the config. 
```ini ### config.cfg (excerpt) @@ -626,64 +648,103 @@ factor = 1.005 #### Example: Custom data reading and batching {#custom-code-readers-batchers} - +Some use-cases require **streaming in data** or manipulating datasets on the +fly, rather than generating all data beforehand and storing it to file. Instead +of using the built-in [`Corpus`](/api/corpus) reader, which uses static file +paths, you can create and register a custom function that generates +[`Example`](/api/example) objects. The resulting generator can be infinite. When +using this dataset for training, stopping criteria such as maximum number of +steps, or stopping when the loss does not decrease further, can be used. -### Wrapping PyTorch and TensorFlow {#custom-frameworks} +In this example we assume a custom function `read_custom_data` which loads or +generates texts with relevant text classification annotations. Then, small +lexical variations of the input text are created before generating the final +[`Example`](/api/example) objects. The `@spacy.registry.readers` decorator lets +you register the function creating the custom reader in the `readers` +[registry](/api/top-level#registry) and assign it a string name, so it can be +used in your config. All arguments on the registered function become available +as **config settings** – in this case, `source`. - +> #### config.cfg +> +> ```ini +> [training.train_corpus] +> @readers = "corpus_variants.v1" +> source = "s3://your_bucket/path/data.csv" +> ``` - +```python +### functions.py {highlight="7-8"} +from typing import Callable, Iterator, List +import spacy +from spacy.gold import Example +from spacy.language import Language +import random -Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum -sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat -mattis pretium. 
+@spacy.registry.readers("corpus_variants.v1") +def stream_data(source: str) -> Callable[[Language], Iterator[Example]]: + def generate_stream(nlp): + for text, cats in read_custom_data(source): + # Create a random variant of the example text + i = random.randint(0, len(text) - 1) + variant = text[:i] + text[i].upper() + text[i + 1:] + doc = nlp.make_doc(variant) + example = Example.from_dict(doc, {"cats": cats}) + yield example - + return generate_stream +``` + + + +Remember that a registered function should always be a function that spaCy +**calls to create something**. In this case, it **creates the reader function** +– it's not the reader itself. + + + +We can also customize the **batching strategy** by registering a new batcher +function in the `batchers` [registry](/api/top-level#registry). A batcher turns +a stream of items into a stream of batches. spaCy has several useful built-in +[batching strategies](/api/top-level#batchers) with customizable sizes, but it's +also easy to implement your own. For instance, the following function takes the +stream of generated [`Example`](/api/example) objects, and removes those which +have the same underlying raw text, to avoid duplicates within each batch. Note +that in a more realistic implementation, you'd also want to check whether the +annotations are the same. 
+ +> #### config.cfg +> +> ```ini +> [training.batcher] +> @batchers = "filtering_batch.v1" +> size = 150 +> ``` + +```python +### functions.py +from typing import Callable, Iterable, Iterator, List +import spacy +from spacy.gold import Example + +@spacy.registry.batchers("filtering_batch.v1") +def filter_batch(size: int) -> Callable[[Iterable[Example]], Iterator[List[Example]]]: + def create_filtered_batches(examples): + batch = [] + for eg in examples: + # Remove duplicate examples with the same text from batch + if eg.text not in [x.text for x in batch]: + batch.append(eg) + if len(batch) == size: + yield batch + batch = [] + + return create_filtered_batches +``` ### Defining custom architectures {#custom-architectures} - - -## Transfer learning {#transfer-learning} - -### Using transformer models like BERT {#transformers} - -spaCy v3.0 lets you use almost any statistical model to power your pipeline. You -can use models implemented in a variety of frameworks. A transformer model is -just a statistical model, so the -[`spacy-transformers`](https://github.com/explosion/spacy-transformers) package -actually has very little work to do: it just has to provide a few functions that -do the required plumbing. It also provides a pipeline component, -[`Transformer`](/api/transformer), that lets you do multi-task learning and lets -you save the transformer outputs for later use. - - - -Try out a BERT-based model pipeline using this project template: swap in your -data, edit the settings and hyperparameters and train, evaluate, package and -visualize your model. - - - -For more details on how to integrate transformer models into your training -config and customize the implementations, see the usage guide on -[training transformers](/usage/transformers#training). - -### Pretraining with spaCy {#pretraining} - - - -## Parallel Training with Ray {#parallel-training} - - - - - -Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Phasellus interdum -sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat -mattis pretium. - - + ## Internal training API {#api} @@ -693,9 +754,8 @@ spaCy gives you full control over the training loop. However, for most use cases, it's recommended to train your models via the [`spacy train`](/api/cli#train) command with a [`config.cfg`](#config) to keep track of your settings and hyperparameters, instead of writing your own training -scripts from scratch. -[Custom registered functions](/usage/training/#custom-code) should typically -give you everything you need to train fully custom models with +scripts from scratch. [Custom registered functions](#custom-code) should +typically give you everything you need to train fully custom models with [`spacy train`](/api/cli#train). @@ -705,8 +765,8 @@ called the **gold standard**. It's initialized with a [`Doc`](/api/doc) object that will hold the predictions, and another `Doc` object that holds the gold-standard annotations. It also includes the **alignment** between those two documents if they differ in tokenization. The `Example` class ensures that spaCy -can rely on one **standardized format** that's passed through the pipeline. -Here's an example of a simple `Example` for part-of-speech tags: +can rely on one **standardized format** that's passed through the pipeline. For +instance, let's say we want to define gold-standard part-of-speech tags: ```python words = ["I", "like", "stuff"] @@ -718,9 +778,10 @@ reference = Doc(vocab, words=words).from_array("TAG", numpy.array(tag_ids, dtype example = Example(predicted, reference) ``` -Alternatively, the `reference` `Doc` with the gold-standard annotations can be -created from a dictionary with keyword arguments specifying the annotations, -like `tags` or `entities`. Using the `Example` object and its gold-standard +As this is quite verbose, there's an alternative way to create the reference +`Doc` with the gold-standard annotations. 
The function `Example.from_dict` takes +a dictionary with keyword arguments specifying the annotations, like `tags` or +`entities`. Using the resulting `Example` object and its gold-standard annotations, the model can be updated to learn a sentence of three words with their assigned part-of-speech tags. @@ -745,8 +806,8 @@ example = Example.from_dict(predicted, {"tags": tags}) Here's another example that shows how to define gold-standard named entities. The letters added before the labels refer to the tags of the [BILUO scheme](/usage/linguistic-features#updating-biluo) – `O` is a token -outside an entity, `U` an single entity unit, `B` the beginning of an entity, -`I` a token inside an entity and `L` the last token of an entity. +outside an entity, `U` a single entity unit, `B` the beginning of an entity, `I` +a token inside an entity and `L` the last token of an entity. ```python doc = Doc(nlp.vocab, words=["Facebook", "released", "React", "in", "2014"]) @@ -820,7 +881,7 @@ dictionary of annotations: ```diff text = "Facebook released React in 2014" annotations = {"entities": ["U-ORG", "O", "U-TECHNOLOGY", "O", "U-DATE"]} -+ example = Example.from_dict(nlp.make_doc(text), {"entities": entities}) ++ example = Example.from_dict(nlp.make_doc(text), annotations) - nlp.update([text], [annotations]) + nlp.update([example]) ``` diff --git a/website/docs/usage/transformers.md b/website/docs/usage/transformers.md deleted file mode 100644 index c3130f57b..000000000 --- a/website/docs/usage/transformers.md +++ /dev/null @@ -1,280 +0,0 @@ ---- -title: Transformers -teaser: Using transformer models like BERT in spaCy -menu: - - ['Installation', 'install'] - - ['Runtime Usage', 'runtime'] - - ['Training Usage', 'training'] -next: /usage/training ---- - -## Installation {#install hidden="true"} - -spaCy v3.0 lets you use almost **any statistical model** to power your pipeline. 
-You can use models implemented in a variety of -[frameworks](https://thinc.ai/docs/usage-frameworks), including TensorFlow, -PyTorch and MXNet. To keep things sane, spaCy expects models from these -frameworks to be wrapped with a common interface, using our machine learning -library [Thinc](https://thinc.ai). A transformer model is just a statistical -model, so the -[`spacy-transformers`](https://github.com/explosion/spacy-transformers) package -actually has very little work to do: it just has to provide a few functions that -do the required plumbing. It also provides a pipeline component, -[`Transformer`](/api/transformer), that lets you do multi-task learning and lets -you save the transformer outputs for later use. - -To use transformers with spaCy, you need the -[`spacy-transformers`](https://github.com/explosion/spacy-transformers) package -installed. It takes care of all the setup behind the scenes, and makes sure the -transformer pipeline component is available to spaCy. - -```bash -$ pip install spacy-transformers -``` - -## Runtime usage {#runtime} - -Transformer models can be used as **drop-in replacements** for other types of -neural networks, so your spaCy pipeline can include them in a way that's -completely invisible to the user. Users will download, load and use the model in -the standard way, like any other spaCy pipeline. Instead of using the -transformers as subnetworks directly, you can also use them via the -[`Transformer`](/api/transformer) pipeline component. - -![The processing pipeline with the transformer component](../images/pipeline_transformer.svg) - -The `Transformer` component sets the -[`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute, -which lets you access the transformers outputs at runtime. 
- -```bash -$ python -m spacy download en_core_trf_lg -``` - -```python -### Example -import spacy - -nlp = spacy.load("en_core_trf_lg") -for doc in nlp.pipe(["some text", "some other text"]): - tokvecs = doc._.trf_data.tensors[-1] -``` - -You can also customize how the [`Transformer`](/api/transformer) component sets -annotations onto the [`Doc`](/api/doc), by customizing the `annotation_setter`. -This callback will be called with the raw input and output data for the whole -batch, along with the batch of `Doc` objects, allowing you to implement whatever -you need. The annotation setter is called with a batch of [`Doc`](/api/doc) -objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) -containing the transformers data for the batch. - -```python -def custom_annotation_setter(docs, trf_data): - # TODO: - ... - -nlp = spacy.load("en_core_trf_lg") -nlp.get_pipe("transformer").annotation_setter = custom_annotation_setter -doc = nlp("This is a text") -print() # TODO: -``` - -## Training usage {#training} - -The recommended workflow for training is to use spaCy's -[config system](/usage/training#config), usually via the -[`spacy train`](/api/cli#train) command. The training config defines all -component settings and hyperparameters in one place and lets you describe a tree -of objects by referring to creation functions, including functions you register -yourself. For details on how to get started with training your own model, check -out the [training quickstart](/usage/training#quickstart). - - - -The easiest way to get started is to clone a transformers-based project -template. Swap in your data, edit the settings and hyperparameters and train, -evaluate, package and visualize your model. - - - -The `[components]` section in the [`config.cfg`](/api/data-formats#config) -describes the pipeline components and the settings used to construct them, -including their model implementation. 
Here's a config snippet for the -[`Transformer`](/api/transformer) component, along with matching Python code. In -this case, the `[components.transformer]` block describes the `transformer` -component: - -> #### Python equivalent -> -> ```python -> from spacy_transformers import Transformer, TransformerModel -> from spacy_transformers.annotation_setters import null_annotation_setter -> from spacy_transformers.span_getters import get_doc_spans -> -> trf = Transformer( -> nlp.vocab, -> TransformerModel( -> "bert-base-cased", -> get_spans=get_doc_spans, -> tokenizer_config={"use_fast": True}, -> ), -> annotation_setter=null_annotation_setter, -> max_batch_items=4096, -> ) -> ``` - -```ini -### config.cfg (excerpt) -[components.transformer] -factory = "transformer" -max_batch_items = 4096 - -[components.transformer.model] -@architectures = "spacy-transformers.TransformerModel.v1" -name = "bert-base-cased" -tokenizer_config = {"use_fast": true} - -[components.transformer.model.get_spans] -@span_getters = "doc_spans.v1" - -[components.transformer.annotation_setter] -@annotation_setters = "spacy-transformer.null_annotation_setter.v1" - -``` - -The `[components.transformer.model]` block describes the `model` argument passed -to the transformer component. It's a Thinc -[`Model`](https://thinc.ai/docs/api-model) object that will be passed into the -component. Here, it references the function -[spacy-transformers.TransformerModel.v1](/api/architectures#TransformerModel) -registered in the [`architectures` registry](/api/top-level#registry). If a key -in a block starts with `@`, it's **resolved to a function** and all other -settings are passed to the function as arguments. In this case, `name`, -`tokenizer_config` and `get_spans`. - -`get_spans` is a function that takes a batch of `Doc` object and returns lists -of potentially overlapping `Span` objects to process by the transformer. 
Several -[built-in functions](/api/transformer#span-getters) are available – for example, -to process the whole document or individual sentences. When the config is -resolved, the function is created and passed into the model as an argument. - - - -Remember that the `config.cfg` used for training should contain **no missing -values** and requires all settings to be defined. You don't want any hidden -defaults creeping in and changing your results! spaCy will tell you if settings -are missing, and you can run -[`spacy init fill-config`](/api/cli#init-fill-config) to automatically fill in -all defaults. - - - -### Customizing the settings {#training-custom-settings} - -To change any of the settings, you can edit the `config.cfg` and re-run the -training. To change any of the functions, like the span getter, you can replace -the name of the referenced function – e.g. `@span_getters = "sent_spans.v1"` to -process sentences. You can also register your own functions using the -`span_getters` registry: - -> #### config.cfg -> -> ```ini -> [components.transformer.model.get_spans] -> @span_getters = "custom_sent_spans" -> ``` - -```python -### code.py -import spacy_transformers - -@spacy_transformers.registry.span_getters("custom_sent_spans") -def configure_custom_sent_spans(): - # TODO: write custom example - def get_sent_spans(docs): - return [list(doc.sents) for doc in docs] - - return get_sent_spans -``` - -To resolve the config during training, spaCy needs to know about your custom -function. You can make it available via the `--code` argument that can point to -a Python file. For more details on training with custom code, see the -[training documentation](/usage/training#custom-code). 
- -```bash -$ python -m spacy train ./config.cfg --code ./code.py -``` - -### Customizing the model implementations {#training-custom-model} - -The [`Transformer`](/api/transformer) component expects a Thinc -[`Model`](https://thinc.ai/docs/api-model) object to be passed in as its `model` -argument. You're not limited to the implementation provided by -`spacy-transformers` – the only requirement is that your registered function -must return an object of type `Model[List[Doc], FullTransformerBatch]`: that is, -a Thinc model that takes a list of [`Doc`](/api/doc) objects, and returns a -[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) object with the -transformer data. - -> #### Model type annotations -> -> In the documentation and code base, you may come across type annotations and -> descriptions of [Thinc](https://thinc.ai) model types, like -> `Model[List[Doc], List[Floats2d]]`. This so-called generic type describes the -> layer and its input and output type – in this case, it takes a list of `Doc` -> objects as the input and list of 2-dimensional arrays of floats as the output. -> You can read more about defining Thinc -> models [here](https://thinc.ai/docs/usage-models). Also see the -> [type checking](https://thinc.ai/docs/usage-type-checking) for how to enable -> linting in your editor to see live feedback if your inputs and outputs don't -> match. - -The same idea applies to task models that power the **downstream components**. -Most of spaCy's built-in model creation functions support a `tok2vec` argument, -which should be a Thinc layer of type `Model[List[Doc], List[Floats2d]]`. This -is where we'll plug in our transformer model, using the -[Tok2VecListener](/api/architectures#Tok2VecListener) layer, which sneakily -delegates to the `Transformer` pipeline component. 
- -```ini -### config.cfg (excerpt) {highlight="12"} -[components.ner] -factory = "ner" - -[nlp.pipeline.ner.model] -@architectures = "spacy.TransitionBasedParser.v1" -nr_feature_tokens = 3 -hidden_width = 128 -maxout_pieces = 3 -use_upper = false - -[nlp.pipeline.ner.model.tok2vec] -@architectures = "spacy-transformers.Tok2VecListener.v1" -grad_factor = 1.0 - -[nlp.pipeline.ner.model.tok2vec.pooling] -@layers = "reduce_mean.v1" -``` - -The [Tok2VecListener](/api/architectures#Tok2VecListener) layer expects a -[pooling layer](https://thinc.ai/docs/api-layers#reduction-ops) as the argument -`pooling`, which needs to be of type `Model[Ragged, Floats2d]`. This layer -determines how the vector for each spaCy token will be computed from the zero or -more source rows the token is aligned against. Here we use the -[`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean) layer, which -averages the wordpiece rows. We could instead use `reduce_last`, -[`reduce_max`](https://thinc.ai/docs/api-layers#reduce_max), or a custom -function you write yourself. - - - -You can have multiple components all listening to the same transformer model, -and all passing gradients back to it. By default, all of the gradients will be -**equally weighted**. You can control this with the `grad_factor` setting, which -lets you reweight the gradients from the different listeners. For instance, -setting `grad_factor = 0` would disable gradients from one of the listeners, -while `grad_factor = 2.0` would multiply them by 2. This is similar to having a -custom learning rate for each component. Instead of a constant, you can also -provide a schedule, allowing you to freeze the shared parameters at the start of -training. diff --git a/website/docs/usage/v2.md b/website/docs/usage/v2.md index 59a842968..f7bcc17d3 100644 --- a/website/docs/usage/v2.md +++ b/website/docs/usage/v2.md @@ -218,7 +218,7 @@ available via `token.orth`. 
The new [`Vectors`](/api/vectors) class helps the `Vocab` manage the vectors assigned to strings, and lets you assign vectors individually, or -[load in GloVe vectors](/usage/vectors-embeddings#custom-loading-glove) from a +[load in GloVe vectors](/usage/linguistic-features#adding-vectors) from a directory. To help you strike a good balance between coverage and memory usage, the `Vectors` class lets you map **multiple keys** to the **same row** of the table. If you're using the [`spacy init-model`](/api/cli#init-model) command to diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index a32f9cd86..2a47fd264 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -10,6 +10,32 @@ menu: ## Summary {#summary} + + +
+ +
+ + + +- [Summary](#summary) +- [New features](#features) +- [Training & config system](#features-training) +- [Transformer-based pipelines](#features-transformers) +- [Custom models](#features-custom-models) +- [End-to-end project workflows](#features-projects) +- [New built-in components](#features-pipeline-components) +- [New custom component API](#features-components) +- [Python type hints](#features-types) +- [New methods & attributes](#new-methods) +- [New & updated documentation](#new-docs) +- [Backwards incompatibilities](#incompat) +- [Migrating from spaCy v2.x](#migrating) + + + +
+ ## New Features {#features} ### New training workflow and config system {#features-training} @@ -28,9 +54,11 @@ menu: ### Transformer-based pipelines {#features-transformers} +![Pipeline components listening to shared embedding component](../images/tok2vec-listener.svg) + -- **Usage:** [Transformers](/usage/transformers), +- **Usage:** [Embeddings & Transformers](/usage/embeddings-transformers), [Training models](/usage/training) - **API:** [`Transformer`](/api/transformer), [`TransformerData`](/api/transformer#transformerdata), @@ -38,16 +66,65 @@ menu: - **Architectures: ** [TransformerModel](/api/architectures#TransformerModel), [Tok2VecListener](/api/architectures#transformers-Tok2VecListener), [Tok2VecTransformer](/api/architectures#Tok2VecTransformer) -- **Models:** [`en_core_bert_sm`](/models/en) +- **Models:** [`en_core_trf_lg_sm`](/models/en) - **Implementation:** [`spacy-transformers`](https://github.com/explosion/spacy-transformers) -### Custom models using any framework {#feautres-custom-models} +### Custom models using any framework {#features-custom-models} + + + + + +- **Thinc: ** + [Wrapping PyTorch, TensorFlow & MXNet](https://thinc.ai/docs/usage-frameworks) +- **API:** [Model architectures](/api/architectures), [`Pipe`](/api/pipe) + + ### Manage end-to-end workflows with projects {#features-projects} + + +> #### Example +> +> ```cli +> # Clone a project template +> $ python -m spacy project clone example +> $ cd example +> # Download data assets +> $ python -m spacy project assets +> # Run a workflow +> $ python -m spacy project run train +> ``` + +spaCy projects let you manage and share **end-to-end spaCy workflows** for +different **use cases and domains**, and orchestrate training, packaging and +serving your custom models. 
You can start off by cloning a pre-defined project +template, adjust it to fit your needs, load in your data, train a model, export +it as a Python package, upload your outputs to a remote storage and share your +results with your team. + +![Illustration of project workflow and commands](../images/projects.svg) + +spaCy projects also make it easy to **integrate with other tools** in the data +science and machine learning ecosystem, including [DVC](/usage/projects#dvc) for +data version control, [Prodigy](/usage/projects#prodigy) for creating labelled +data, [Streamlit](/usage/projects#streamlit) for building interactive apps, +[FastAPI](/usage/projects#fastapi) for serving models in production, +[Ray](/usage/projects#ray) for parallel training, +[Weights & Biases](/usage/projects#wandb) for experiment tracking, and more! + + + - **Usage:** [spaCy projects](/usage/projects), @@ -59,13 +136,23 @@ menu: ### New built-in pipeline components {#features-pipeline-components} -| Name | Description | -| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| [`SentenceRecognizer`](/api/sentencerecognizer) | Trainable component for sentence segmentation. | -| [`Morphologizer`](/api/morphologizer) | Trainable component to predict morphological features. | -| [`Lemmatizer`](/api/lemmatizer) | Standalone component for rule-based and lookup lemmatization. | -| [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. | -| [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). 
| +spaCy v3.0 includes several new trainable and rule-based components that you can +add to your pipeline and customize for your use case: + +> #### Example +> +> ```python +> nlp = spacy.blank("en") +> nlp.add_pipe("lemmatizer") +> ``` + +| Name | Description | +| ----------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [`SentenceRecognizer`](/api/sentencerecognizer) | Trainable component for sentence segmentation. | +| [`Morphologizer`](/api/morphologizer) | Trainable component to predict morphological features. | +| [`Lemmatizer`](/api/lemmatizer) | Standalone component for rule-based and lookup lemmatization. | +| [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. | +| [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/embeddings-transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). | @@ -78,15 +165,37 @@ menu: ### New and improved pipeline component APIs {#features-components} -- `Language.factory`, `Language.component` -- `Language.analyze_pipes` -- Adding components from other models +> #### Example +> +> ```python +> @Language.component("my_component") +> def my_component(doc): +> return doc +> +> nlp.add_pipe("my_component") +> nlp.add_pipe("ner", source=other_nlp) +> nlp.analyze_pipes(pretty=True) +> ``` + +Defining, configuring, reusing, training and analyzing pipeline components is +now easier and more convenient. The `@Language.component` and +`@Language.factory` decorators let you register your component, define its +default configuration and meta data, like the attribute values it assigns and +requires. 
Any custom component can be included during training, and sourcing +components from existing pretrained models lets you **mix and match custom +pipelines**. The `nlp.analyze_pipes` method outputs structured information about +the current pipeline and its components, including the attributes they assign, +the scores they compute during training and whether any required attributes +aren't set. - **Usage:** [Custom components](/usage/processing-pipelines#custom_components), - [Defining components during training](/usage/training#config-components) -- **API:** [`Language`](/api/language) + [Defining components for training](/usage/training#config-components) +- **API:** [`@Language.component`](/api/language#component), + [`@Language.factory`](/api/language#factory), + [`Language.add_pipe`](/api/language#add_pipe), + [`Language.analyze_pipes`](/api/language#analyze_pipes) - **Implementation:** [`spacy/language.py`](https://github.com/explosion/spaCy/tree/develop/spacy/language.py) @@ -136,26 +245,72 @@ in your config and see validation errors if the argument values don't match. -### New methods, attributes and commands +### New methods, attributes and commands {#new-methods} The following methods, attributes and commands are new in spaCy v3.0. -| Name | Description | -| ------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). | -| [`Language.select_pipes`](/api/language#select_pipes) | Contextmanager for enabling or disabling specific pipeline components for a block. | -| [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. 
| -| [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a pretrained model and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. | -| [`@Language.factory`](/api/language#factory) [`@Language.component`](/api/language#component) | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions. | -| [`Language.has_factory`](/api/language#has_factory) | Check whether a component factory is registered on a language class.s | -| [`Language.get_factory_meta`](/api/language#get_factory_meta) [`Language.get_pipe_meta`](/api/language#get_factory_meta) | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name. | -| [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. | -| [`Pipe.score`](/api/pipe#score) | Method on trainable pipeline components that returns a dictionary of evaluation scores. | -| [`registry`](/api/top-level#registry) | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config). | -| [`init config`](/api/cli#init-config) | CLI command for initializing a [training config](/usage/training) file with the recommended settings. | -| [`init fill-config`](/api/cli#init-fill-config) | CLI command for auto-filling a partial config with all defaults and missing values. | -| [`debug config`](/api/cli#debug-config) | CLI command for debugging a [training config](/usage/training) file and showing validation errors. | -| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). 
| +| Name | Description | +| ----------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). | +| [`Token.morph`](/api/token#attributes) [`Token.morph_`](/api/token#attributes) | Access a token's morphological analysis. | +| [`Language.select_pipes`](/api/language#select_pipes) | Context manager for enabling or disabling specific pipeline components for a block. | +| [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. | +| [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a pretrained model and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. | +| [`@Language.factory`](/api/language#factory) [`@Language.component`](/api/language#component) | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions. | +| [`Language.has_factory`](/api/language#has_factory) | Check whether a component factory is registered on a language class. | +| [`Language.get_factory_meta`](/api/language#get_factory_meta) [`Language.get_pipe_meta`](/api/language#get_pipe_meta) | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name. | +| [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. 
| +| [`Pipe.score`](/api/pipe#score) | Method on trainable pipeline components that returns a dictionary of evaluation scores. | +| [`registry`](/api/top-level#registry) | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config). | +| [`util.load_meta`](/api/top-level#util.load_meta) [`util.load_config`](/api/top-level#util.load_config) | Updated helpers for loading a model's [`meta.json`](/api/data-formats#meta) and [`config.cfg`](/api/data-formats#config). | +| [`util.get_installed_models`](/api/top-level#util.get_installed_models) | Names of all models installed in the environment. | +| [`init config`](/api/cli#init-config) [`init fill-config`](/api/cli#init-fill-config) [`debug config`](/api/cli#debug-config) | CLI commands for initializing, auto-filling and debugging [training configs](/usage/training). | +| [`project`](/api/cli#project) | Suite of CLI commands for cloning, running and managing [spaCy projects](/usage/projects). | + +### New and updated documentation {#new-docs} + + + +
+ +To help you get started with spaCy v3.0 and the new features, we've added +several new or rewritten documentation pages, including a new usage guide on +[embeddings, transformers and transfer learning](/usage/embeddings-transformers), +a guide on [training models](/usage/training) rewritten from scratch, a page +explaining the new [spaCy projects](/usage/projects) and updated usage +documentation on +[custom pipeline components](/usage/processing-pipelines#custom-components). +We've also added a bunch of new illustrations and new API reference pages +documenting spaCy's machine learning [model architectures](/api/architectures) +and the expected [data formats](/api/data-formats). API pages about +[pipeline components](/api/#architecture-pipeline) now include more information, +like the default config and implementation, and we've adopted a more detailed +format for documenting argument and return types. + +
+ +[![Library architecture](../images/architecture.svg)](/api) + +
+ + + +- **Usage:** [Embeddings & Transformers](/usage/embeddings-transformers), + [Training models](/usage/training), + [Layers & Architectures](/usage/layers-architectures), + [Projects](/usage/projects), + [Custom pipeline components](/usage/processing-pipelines#custom-components), + [Custom tokenizers](/usage/linguistic-features#custom-tokenizer) +- **API Reference:** [Library architecture](/api), + [Model architectures](/api/architectures), [Data formats](/api/data-formats) +- **New Classes:** [`Example`](/api/example), [`Tok2Vec`](/api/tok2vec), + [`Transformer`](/api/transformer), [`Lemmatizer`](/api/lemmatizer), + [`Morphologizer`](/api/morphologizer), + [`AttributeRuler`](/api/attributeruler), + [`SentenceRecognizer`](/api/sentencerecognizer), [`Pipe`](/api/pipe), + [`Corpus`](/api/corpus) + + ## Backwards Incompatibilities {#incompat} @@ -177,18 +332,23 @@ Note that spaCy v3.0 now requires **Python 3.6+**. There can be many [different models](/models) and not just one "English model", so you should always use the full model name like [`en_core_web_sm`](/models/en) explicitly. +- A model's [`meta.json`](/api/data-formats#meta) is now only used to provide + meta information like the model name, author, license and labels. It's **not** + used to construct the processing pipeline anymore. This is all defined in the + [`config.cfg`](/api/data-formats#config), which also includes all settings + used to train the model. - The [`train`](/api/cli#train) and [`pretrain`](/api/cli#pretrain) commands now only take a `config.cfg` file containing the full [training config](/usage/training#config). - [`Language.add_pipe`](/api/language#add_pipe) now takes the **string name** of the component factory instead of the component function. 
-- **Custom pipeline components** now needs to be decorated with the +- **Custom pipeline components** now need to be decorated with the [`@Language.component`](/api/language#component) or [`@Language.factory`](/api/language#factory) decorator. - [`Language.update`](/api/language#update) now takes a batch of [`Example`](/api/example) objects instead of raw texts and annotations, or `Doc` and `GoldParse` objects. -- The `Language.disable_pipes` contextmanager has been replaced by +- The `Language.disable_pipes` context manager has been replaced by [`Language.select_pipes`](/api/language#select_pipes), which can explicitly disable or enable components. - The [`Language.update`](/api/language#update), @@ -208,14 +368,16 @@ Note that spaCy v3.0 now requires **Python 3.6+**. ### Removed or renamed API {#incompat-removed} -| Removed | Replacement | -| -------------------------------------------------------- | ----------------------------------------------------- | -| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes) | -| `GoldParse` | [`Example`](/api/example) | -| `GoldCorpus` | [`Corpus`](/api/corpus) | -| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) | -| `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) | -| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, model symlinks are deprecated | +| Removed | Replacement | +| -------------------------------------------------------- | ------------------------------------------------------------------------------------------ | +| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes) | +| `GoldParse` | [`Example`](/api/example) | +| `GoldCorpus` | [`Corpus`](/api/corpus) | +| `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) | +| `spacy init-model` | [`spacy init model`](/api/cli#init-model) | +| `spacy debug-data` | [`spacy 
debug data`](/api/cli#debug-data) | +| `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) | +| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, model symlinks are deprecated | The following deprecated methods, attributes and arguments were removed in v3.0. Most of them have been **deprecated for a while** and many would previously @@ -231,6 +393,7 @@ on them. | `Language.tagger`, `Language.parser`, `Language.entity` | [`Language.get_pipe`](/api/language#get_pipe) | | keyword-arguments like `vocab=False` on `to_disk`, `from_disk`, `to_bytes`, `from_bytes` | `exclude=["vocab"]` | | `n_threads` argument on [`Tokenizer`](/api/tokenizer), [`Matcher`](/api/matcher), [`PhraseMatcher`](/api/phrasematcher) | `n_process` | +| `verbose` argument on [`Language.evaluate`](/api/language#evaluate) | logging (`DEBUG`) | | `SentenceSegmenter` hook, `SimilarityHook` | [user hooks](/usage/processing-pipelines#custom-components-user-hooks), [`Sentencizer`](/api/sentencizer), [`SentenceRecognizer`](/api/sentenceregognizer) | ## Migrating from v2.x {#migrating} @@ -407,33 +570,39 @@ spaCy v3.0 uses a new serializing a [`DocBin`](/api/docbin), which represents a collection of `Doc` objects. This means that you can train spaCy models using the same format it outputs: annotated `Doc` objects. The binary format is extremely **efficient in -storage**, especially when packing multiple documents together. +storage**, especially when packing multiple documents together. 
You can convert +your existing JSON-formatted data using the [`spacy convert`](/api/cli#convert) +command, which outputs `.spacy` files: -You can convert your existing JSON-formatted data using the -[`spacy convert`](/api/cli#convert) command, which outputs `.spacy` files: - -```bash +```cli $ python -m spacy convert ./training.json ./output ``` #### Training config {#migrating-training-config} The easiest way to get started with a training config is to use the -[`init config`](/api/cli#init-config) command. You can start off with a blank -config for a new model, copy the config from an existing model, or auto-fill a -partial config like a starter config generated by our -[quickstart widget](/usage/training#quickstart). +[`init config`](/api/cli#init-config) command or the +[quickstart widget](/usage/training#quickstart). You can define your +requirements, and it will auto-generate a starter config with the best-matching +default settings. -```bash -python -m spacy init-config ./config.cfg --lang en --pipeline tagger,parser +```cli +$ python -m spacy init config ./config.cfg --lang en --pipeline tagger,parser ``` +If you've exported a starter config from our +[quickstart widget](/usage/training#quickstart), you can use the +[`init fill-config`](/api/cli#init-fill-config) command to fill it with all default +values.
You can then use the auto-generated `config.cfg` for training: + ```diff ### {wrap="true"} - python -m spacy train en ./output ./train.json ./dev.json --pipeline tagger,parser --cnn-window 1 --bilstm-depth 0 + python -m spacy train ./config.cfg --output ./output ``` + + #### Training via the Python API {#migrating-training-python} For most use cases, you **shouldn't** have to write your own training scripts diff --git a/website/docs/usage/vectors-embeddings.md b/website/docs/usage/vectors-embeddings.md deleted file mode 100644 index 823b30c20..000000000 --- a/website/docs/usage/vectors-embeddings.md +++ /dev/null @@ -1,224 +0,0 @@ ---- -title: Vectors and Embeddings -menu: - - ["What's a Word Vector?", 'whats-a-vector'] - - ['Word Vectors', 'vectors'] - - ['Other Embeddings', 'embeddings'] -next: /usage/transformers ---- - -An old idea in linguistics is that you can "know a word by the company it -keeps": that is, word meanings can be understood relationally, based on their -patterns of usage. This idea inspired a branch of NLP research known as -"distributional semantics" that has aimed to compute databases of lexical -knowledge automatically. The [Word2vec](https://en.wikipedia.org/wiki/Word2vec) -family of algorithms are a key milestone in this line of research. For -simplicity, we will refer to a distributional word representation as a "word -vector", and algorithms that computes word vectors (such as -[GloVe](https://nlp.stanford.edu/projects/glove/), -[FastText](https://fasttext.cc), etc.) as "Word2vec algorithms". - -Word vector tables are included in some of the spaCy [model packages](/models) -we distribute, and you can easily create your own model packages with word -vectors you train or download yourself. 
In some cases you can also add word -vectors to an existing pipeline, although each pipeline can only have a single -word vectors table, and a model package that already has word vectors is -unlikely to work correctly if you replace the vectors with new ones. - -## What's a word vector? {#whats-a-vector} - -For spaCy's purposes, a "word vector" is a 1-dimensional slice from a -2-dimensional **vectors table**, with a deterministic mapping from word types to -rows in the table. - -```python -def what_is_a_word_vector( - word_id: int, - key2row: Dict[int, int], - vectors_table: Floats2d, - *, - default_row: int=0 -) -> Floats1d: - return vectors_table[key2row.get(word_id, default_row)] -``` - -Word2vec algorithms try to produce vectors tables that let you estimate useful -relationships between words using simple linear algebra operations. For -instance, you can often find close synonyms of a word by finding the vectors -closest to it by cosine distance, and then finding the words that are mapped to -those neighboring vectors. Word vectors can also be useful as features in -statistical models. - -### Word vectors vs. contextual language models {#vectors-vs-language-models} - -The key difference between word vectors and contextual language models such as -ElMo, BERT and GPT-2 is that word vectors model **lexical types**, rather than -_tokens_. If you have a list of terms with no context around them, a model like -BERT can't really help you. BERT is designed to understand language **in -context**, which isn't what you have. A word vectors table will be a much better -fit for your task. However, if you do have words in context β€” whole sentences or -paragraphs of running text β€” word vectors will only provide a very rough -approximation of what the text is about. - -Word vectors are also very computationally efficient, as they map a word to a -vector with a single indexing operation. 
Word vectors are therefore useful as a -way to **improve the accuracy** of neural network models, especially models that -are small or have received little or no pretraining. In spaCy, word vector -tables are only used as **static features**. spaCy does not backpropagate -gradients to the pretrained word vectors table. The static vectors table is -usually used in combination with a smaller table of learned task-specific -embeddings. - -## Using word vectors directly {#vectors} - -spaCy stores word vector information in the -[`Vocab.vectors`](/api/vocab#attributes) attribute, so you can access the whole -vectors table from most spaCy objects. You can also access the vector for a -[`Doc`](/api/doc), [`Span`](/api/span), [`Token`](/api/token) or -[`Lexeme`](/api/lexeme) instance via the `vector` attribute. If your `Doc` or -`Span` has multiple tokens, the average of the word vectors will be returned, -excluding any "out of vocabulary" entries that have no vector available. If none -of the words have a vector, a zeroed vector will be returned. - -The `vector` attribute is a **read-only** numpy or cupy array (depending on -whether you've configured spaCy to use GPU memory), with dtype `float32`. The -array is read-only so that spaCy can avoid unnecessary copy operations where -possible. You can modify the vectors via the `Vocab` or `Vectors` table. - -### Converting word vectors for use in spaCy - -Custom word vectors can be trained using a number of open-source libraries, such -as [Gensim](https://radimrehurek.com/gensim), [Fast Text](https://fasttext.cc), -or Tomas Mikolov's original -[Word2vec implementation](https://code.google.com/archive/p/word2vec/). Most -word vector libraries output an easy-to-read text-based format, where each line -consists of the word followed by its vector. For everyday use, we want to -convert the vectors model into a binary format that loads faster and takes up -less space on disk. 
The easiest way to do this is the -[`init-model`](/api/cli#init-model) command-line utility: - -```bash -wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.la.300.vec.gz -python -m spacy init-model en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300.vec.gz -``` - -This will output a spaCy model in the directory `/tmp/la_vectors_wiki_lg`, -giving you access to some nice Latin vectors πŸ˜‰ You can then pass the directory -path to [`spacy.load()`](/api/top-level#spacy.load). - -```python -nlp_latin = spacy.load("/tmp/la_vectors_wiki_lg") -doc1 = nlp_latin("Caecilius est in horto") -doc2 = nlp_latin("servus est in atrio") -doc1.similarity(doc2) -``` - -The model directory will have a `/vocab` directory with the strings, lexical -entries and word vectors from the input vectors model. The -[`init-model`](/api/cli#init-model) command supports a number of archive formats -for the word vectors: the vectors can be in plain text (`.txt`), zipped -(`.zip`), or tarred and zipped (`.tgz`). - -### Optimizing vector coverage {#custom-vectors-coverage new="2"} - -To help you strike a good balance between coverage and memory usage, spaCy's -[`Vectors`](/api/vectors) class lets you map **multiple keys** to the **same -row** of the table. If you're using the -[`spacy init-model`](/api/cli#init-model) command to create a vocabulary, -pruning the vectors will be taken care of automatically if you set the -`--prune-vectors` flag. You can also do it manually in the following steps: - -1. Start with a **word vectors model** that covers a huge vocabulary. For - instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg) - model provides 300-dimensional GloVe vectors for over 1 million terms of - English. -2. If your vocabulary has values set for the `Lexeme.prob` attribute, the - lexemes will be sorted by descending probability to determine which vectors - to prune. Otherwise, lexemes will be sorted by their order in the `Vocab`. -3. 
Call [`Vocab.prune_vectors`](/api/vocab#prune_vectors) with the number of - vectors you want to keep. - -```python -nlp = spacy.load('en_vectors_web_lg') -n_vectors = 105000 # number of vectors to keep -removed_words = nlp.vocab.prune_vectors(n_vectors) - -assert len(nlp.vocab.vectors) <= n_vectors # unique vectors have been pruned -assert nlp.vocab.vectors.n_keys > n_vectors # but not the total entries -``` - -[`Vocab.prune_vectors`](/api/vocab#prune_vectors) reduces the current vector -table to a given number of unique entries, and returns a dictionary containing -the removed words, mapped to `(string, score)` tuples, where `string` is the -entry the removed word was mapped to, and `score` the similarity score between -the two words. - -```python -### Removed words -{ - "Shore": ("coast", 0.732257), - "Precautionary": ("caution", 0.490973), - "hopelessness": ("sadness", 0.742366), - "Continous": ("continuous", 0.732549), - "Disemboweled": ("corpse", 0.499432), - "biostatistician": ("scientist", 0.339724), - "somewheres": ("somewheres", 0.402736), - "observing": ("observe", 0.823096), - "Leaving": ("leaving", 1.0), -} -``` - -In the example above, the vector for "Shore" was removed and remapped to the -vector of "coast", which is deemed about 73% similar. "Leaving" was remapped to -the vector of "leaving", which is identical. If you're using the -[`init-model`](/api/cli#init-model) command, you can set the `--prune-vectors` -option to easily reduce the size of the vectors as you add them to a spaCy -model: - -```bash -$ python -m spacy init-model /tmp/la_vectors_web_md --vectors-loc la.300d.vec.tgz --prune-vectors 10000 -``` - -This will create a spaCy model with vectors for the first 10,000 words in the -vectors model. All other words in the vectors model are mapped to the closest -vector among those retained. 
- -### Adding vectors {#adding-vectors} - -```python -### Adding vectors -from spacy.vocab import Vocab - -vector_data = {"dog": numpy.random.uniform(-1, 1, (300,)), - "cat": numpy.random.uniform(-1, 1, (300,)), - "orange": numpy.random.uniform(-1, 1, (300,))} -vocab = Vocab() -for word, vector in vector_data.items(): - vocab.set_vector(word, vector) -``` - -### Using custom similarity methods {#custom-similarity} - -By default, [`Token.vector`](/api/token#vector) returns the vector for its -underlying [`Lexeme`](/api/lexeme), while [`Doc.vector`](/api/doc#vector) and -[`Span.vector`](/api/span#vector) return an average of the vectors of their -tokens. You can customize these behaviors by modifying the `doc.user_hooks`, -`doc.user_span_hooks` and `doc.user_token_hooks` dictionaries. - - - -For more details on **adding hooks** and **overwriting** the built-in `Doc`, -`Span` and `Token` methods, see the usage guide on -[user hooks](/usage/processing-pipelines#custom-components-user-hooks). - - - - - -## Other embeddings {#embeddings} - - diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md index 5db741d52..4ba0112b6 100644 --- a/website/docs/usage/visualizers.md +++ b/website/docs/usage/visualizers.md @@ -58,12 +58,12 @@ arcs.
-| Argument | Type | Description | Default | -| --------- | ---- | ----------------------------------------------------------- | ----------- | -| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | -| `color` | str | Text color (HEX, RGB or color names). | `"#000000"` | -| `bg` | str | Background color (HEX, RGB or color names). | `"#ffffff"` | -| `font` | str | Font name or font family for all text. | `"Arial"` | +| Argument | Description | +| --------- | ----------------------------------------------------------------------------------------- | +| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | +| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ | +| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ | +| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | For a list of all available options, see the [`displacy` API documentation](/api/top-level#displacy_options). @@ -121,10 +121,10 @@ import DisplacyEntHtml from 'images/displacy-ent2.html' The entity visualizer lets you customize the following `options`: -| Argument | Type | Description | Default | -| -------- | ---- | ------------------------------------------------------------------------------------- | ------- | -| `ents` | list | Β Entity types to highlight (`None` for all types). | `None` | -| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` | +| Argument | Description | +| -------- | ------------------------------------------------------------------------------------------------------------- | +| `ents` | Entity types to highlight (`None` for all types). Defaults to `None`. ~~Optional[List[str]]~~ | `None` | +| `colors` | Color overrides. Entity types should be mapped to color names or values. Defaults to `{}`. 
~~Dict[str, str]~~ | If you specify a list of `ents`, only those entity types will be rendered – for example, you can choose to display `PERSON` entities. Internally, the visualizer diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 230900dcd..94fbc2492 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -18,9 +18,17 @@ { "text": "Linguistic Features", "url": "/usage/linguistic-features" }, { "text": "Rule-based Matching", "url": "/usage/rule-based-matching" }, { "text": "Processing Pipelines", "url": "/usage/processing-pipelines" }, - { "text": "Vectors & Embeddings", "url": "/usage/vectors-embeddings" }, - { "text": "Transformers", "url": "/usage/transformers", "tag": "new" }, + { + "text": "Embeddings & Transformers", + "url": "/usage/embeddings-transformers", + "tag": "new" + }, { "text": "Training Models", "url": "/usage/training", "tag": "new" }, + { + "text": "Layers & Model Architectures", + "url": "/usage/layers-architectures", + "tag": "new" + }, { "text": "spaCy Projects", "url": "/usage/projects", "tag": "new" }, { "text": "Saving & Loading", "url": "/usage/saving-loading" }, { "text": "Visualizers", "url": "/usage/visualizers" } @@ -113,7 +121,6 @@ { "text": "Vectors", "url": "/api/vectors" }, { "text": "Lookups", "url": "/api/lookups" }, { "text": "Morphology", "url": "/api/morphology" }, - { "text": "MorphAnalysis", "url": "/api/morphanalysis" }, { "text": "KnowledgeBase", "url": "/api/kb" }, { "text": "Scorer", "url": "/api/scorer" }, { "text": "Corpus", "url": "/api/corpus" } diff --git a/website/meta/type-annotations.json b/website/meta/type-annotations.json new file mode 100644 index 000000000..b1d94403d --- /dev/null +++ b/website/meta/type-annotations.json @@ -0,0 +1,46 @@ +{ + "Doc": "/api/doc", + "Token": "/api/token", + "Span": "/api/span", + "Lexeme": "/api/lexeme", + "Example": "/api/example", + "Alignment": "/api/example#alignment-object", + "Vocab": "/api/vocab", + "StringStore": 
"/api/stringstore", + "Lookups": "/api/lookups", + "Table": "/api/lookups#table", + "Vectors": "/api/vectors", + "Language": "/api/language", + "Defaults": "/api/language#defaults", + "Scorer": "/api/scorer", + "DocBin": "/api/docbin", + "FactoryMeta": "/api/language#factorymeta", + "Tokenizer": "/api/tokenizer", + "MorphAnalysis": "/api/morphology#morphanalysis", + "KnowledgeBase": "/api/kb", + "Candidate": "/api/kb#candidate", + "Matcher": "/api/matcher", + "PhraseMatcher": "/api/phrasematcher", + "TransformerData": "/api/transformer#transformerdata", + "FullTransformerBatch": "/api/transformer#fulltransformerbatch", + "LexemeC": "/api/cython-structs#lexemec", + "TokenC": "/api/cython-structs#tokenc", + "Config": "https://thinc.ai/docs/api-config#config", + "Optimizer": "https://thinc.ai/docs/api-optimizers", + "Model": "https://thinc.ai/docs/api-model", + "Ragged": "https://thinc.ai/docs/api-types#ragged", + "Padded": "https://thinc.ai/docs/api-types#padded", + "Ints2d": "https://thinc.ai/docs/api-types#types", + "Floats2d": "https://thinc.ai/docs/api-types#types", + "Floats3d": "https://thinc.ai/docs/api-types#types", + "FloatsXd": "https://thinc.ai/docs/api-types#types", + "Ops": "https://thinc.ai/docs/api-backends#ops", + "cymem.Pool": "https://github.com/explosion/cymem", + "preshed.BloomFilter": "https://github.com/explosion/preshed", + "transformers.BatchEncoding": "https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding", + "torch.Tensor": "https://pytorch.org/docs/stable/tensors.html", + "numpy.ndarray": "https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html", + "Match": "https://docs.python.org/3/library/re.html#match-objects", + "Pattern": "https://docs.python.org/3/library/re.html#regular-expression-objects", + "Path": "https://docs.python.org/3/library/pathlib.html" +} diff --git a/website/package.json b/website/package.json index e61661c11..9e02cda82 100644 --- a/website/package.json +++ 
b/website/package.json @@ -53,14 +53,14 @@ "remark-react": "^5.0.1" }, "scripts": { - "build": "npm run python:setup && gatsby build", + "build": "npm run python:install && npm run python:setup && gatsby build", "dev": "npm run python:setup && gatsby develop", "dev:nightly": "BRANCH=nightly.spacy.io npm run dev", "lint": "eslint **", "clear": "rm -rf .cache", "test": "echo \"Write tests! -> https://gatsby.app/unit-testing\"", "python:install": "pip install setup/requirements.txt", - "python:setup": "cd setup && ./setup.sh" + "python:setup": "cd setup && sh setup.sh" }, "devDependencies": { "@sindresorhus/slugify": "^0.8.0", diff --git a/website/setup/jinja_to_js.py b/website/setup/jinja_to_js.py index a2c896151..114d0e172 100644 --- a/website/setup/jinja_to_js.py +++ b/website/setup/jinja_to_js.py @@ -2,7 +2,7 @@ # With additional functionality: in/not in, replace, pprint, round, + for lists, # rendering empty dicts # This script is mostly used to generate the JavaScript function for the -# training quicktart widget. +# training quickstart widget. 
import contextlib import json import re @@ -11,7 +11,8 @@ from os import path from io import StringIO from jinja2 import Environment, FileSystemLoader, nodes from pathlib import Path -import typer +import srsly +import sys OPERANDS = { @@ -437,7 +438,8 @@ class JinjaToJS(object): with self._interpolation(): with self._python_bool_wrapper(**kwargs): if node.items: - raise ValueError(f"Can't process non-empty dict in epxression: {node}") + err = f"Can't process non-empty dict in expression: {node}" + raise ValueError(err) self.output.write("{}") def _process_getattr(self, node, **kwargs): @@ -1232,18 +1234,22 @@ class JinjaToJS(object): self.output.write(")") -def main( - # fmt: off - template_path: Path = typer.Argument(..., exists=True, dir_okay=False, help="Path to .jinja file"), - output: Path = typer.Argument(None, help="Path to output module (stdout if unset)"), - data_path: Path = typer.Option(None, "--data", help="Optional JSON file with additional data to be included as DATA") - # fmt: on -): - """Convert a jinja2 template to a JavaScript module.""" +def main(template_path, output=None, data_path=None): + """Convert a jinja2 template to a JavaScript module. + + template_path (Path): Path to .jinja file. + output (Optional[Path]): Path to output .js module (stdout if unset). + data_path (Optional[Path]): Optional JSON or YAML file with additional data + to be included in the JS module as the exported variable DATA.
+ """ data = "{}" if data_path is not None: - with data_path.open("r", encoding="utf8") as f: - data = json.dumps(json.loads(f.read())) # dump and load for compactness + if data_path.suffix in (".yml", ".yaml"): + data = srsly.read_yaml(data_path) + else: + data = srsly.read_json(data_path) + data = srsly.json_dumps(data) # dump and load for compactness + template_path = Path(template_path) tpl_file = template_path.parts[-1] compiler = JinjaToJS(template_path.parent, tpl_file, js_module_format="es6") header = f"// This file was auto-generated by {__file__} based on {tpl_file}" @@ -1258,4 +1264,10 @@ def main( if __name__ == "__main__": - typer.run(main) + args = sys.argv[1:] + if not len(args): + raise ValueError("Need at least one argument: path to .jinja template") + template_path = Path(args[0]) + output = Path(args[1]) if len(args) > 1 else None + data_path = Path(args[2]) if len(args) > 2 else None + main(template_path, output, data_path) diff --git a/website/setup/requirements.txt b/website/setup/requirements.txt index 7ffb6df0b..e7a8e65a7 100644 --- a/website/setup/requirements.txt +++ b/website/setup/requirements.txt @@ -1,3 +1,3 @@ # These are used to compile the training quickstart config jinja2 -typer +srsly diff --git a/website/setup/setup.sh b/website/setup/setup.sh index a6bbd3294..674b25674 100755 --- a/website/setup/setup.sh +++ b/website/setup/setup.sh @@ -1 +1 @@ -python jinja_to_js.py ../../spacy/cli/templates/quickstart_training.jinja ../src/widgets/quickstart-training-generator.js --data ../../spacy/cli/templates/quickstart_training_recommendations.json +python jinja_to_js.py ../../spacy/cli/templates/quickstart_training.jinja ../src/widgets/quickstart-training-generator.js ../../spacy/cli/templates/quickstart_training_recommendations.yml diff --git a/website/src/components/code.js b/website/src/components/code.js index 952014ed5..740544f43 100644 --- a/website/src/components/code.js +++ b/website/src/components/code.js @@ -1,4 +1,4 @@ -import 
React from 'react' +import React, { Fragment } from 'react' import PropTypes from 'prop-types' import classNames from 'classnames' import highlightCode from 'gatsby-remark-prismjs/highlight-code.js' @@ -6,12 +6,13 @@ import rangeParser from 'parse-numeric-range' import { StaticQuery, graphql } from 'gatsby' import { window } from 'browser-monads' +import CUSTOM_TYPES from '../../meta/type-annotations.json' import { isString, htmlToReact } from './util' -import Link from './link' +import Link, { OptionalLink } from './link' import GitHubCode from './github' import classes from '../styles/code.module.sass' -const WRAP_THRESHOLD = 16 +const WRAP_THRESHOLD = 30 export default props => (
@@ -40,6 +41,139 @@ InlineCode.propTypes = {
     children: PropTypes.node,
 }
 
+function linkType(el, showLink = true) { // Resolve a type name against CUSTOM_TYPES; returns `el` unchanged when there is nothing to link
+    if (!isString(el) || !el.length) return el // non-string nodes and empty strings pass through
+    const elStr = el.trim()
+    if (!elStr) return el // whitespace-only strings pass through
+    const typeUrl = CUSTOM_TYPES[elStr] // undefined when the trimmed name has no entry
+    const url = typeUrl == true ? DEFAULT_TYPE_URL : typeUrl // NOTE(review): DEFAULT_TYPE_URL is not declared in the visible part of this diff; it is only read when an entry loosely equals true — confirm
+    const ws = el[0] == ' ' // remember a leading space so it can be emitted before the trimmed name
+    return url && showLink ? ( // NOTE(review): the JSX tags of this branch are missing from this copy (presumably a link around elStr) — confirm against the real file
+        
+            {ws && ' '}
+            
+                {elStr}
+            
+        
+    ) : (
+        el
+    )
+}
+
+export const TypeAnnotation = ({ lang = 'python', link = true, children }) => { // Highlights the annotation text and passes each resulting piece through linkType
+    // Hacky, but we're temporarily replacing a dot to prevent it from being split during highlighting
+    const TMP_DOT = 'Ϋ”' // NOTE(review): this literal looks mis-encoded in this copy — confirm the intended placeholder character
+    const code = Array.isArray(children) ? children.join('') : children || '' // flatten children into one string; falsy children become ''
+    const [rawText, meta] = code.split(/(?= \(.+\)$)/) // lookahead split: a trailing " (...)" suffix, if present, ends up in `meta`
+    const rawStr = rawText.replace(/\./g, TMP_DOT) // swap every dot for the placeholder
+    const rawHtml = lang === 'none' || !code ? code : highlightCode(lang, rawStr) // skip highlighting for lang 'none' or empty input
+    const html = rawHtml.replace(new RegExp(TMP_DOT, 'g'), '.').replace(/\n/g, ' ') // restore the dots and turn newlines into spaces
+    const result = htmlToReact(html)
+    const elements = Array.isArray(result) ? result : [result] // normalize to an array so it can be mapped below
+    const annotClassNames = classNames(
+        'type-annotation',
+        `language-${lang}`,
+        classes.inlineCode,
+        classes.typeAnnotation,
+        {
+            [classes.wrap]: code.length >= WRAP_THRESHOLD, // wrap class applies once the raw text reaches WRAP_THRESHOLD characters
+        }
+    )
+    return (
+        
+            {elements.map((el, i) => (
+                {linkType(el, !!link)}
+            ))}
+            {meta && {meta}}
+        
+    )
+}
+
+function replacePrompt(line, prompt, isFirst = false) { // Strips a leading "<prompt> " from a line and re-emits the line for display
+    let result = line
+    const hasPrompt = result.startsWith(`${prompt} `) // the prompt only counts when it is followed by a space
+    const showPrompt = hasPrompt || isFirst // the first line is treated as prompted even without an explicit prompt
+    if (hasPrompt) result = result.slice(2) // NOTE(review): always drops two characters, so this assumes a one-character prompt — confirm
+    return result && showPrompt ? `${result}` : result // NOTE(review): both branches look identical here; markup inside the template literal appears to be missing from this copy — confirm
+}
+
+function parseArgs(raw) { // Parses a space-separated argument string into an object keyed by option or positional token
+    const commandGroups = ['init', 'debug', 'project'] // tokens that are merged with the following token into a single key
+    let args = raw.split(' ').filter(arg => arg) // drop the empty tokens produced by repeated spaces
+    const result = {}
+    while (args.length) {
+        let opt = args.shift()
+        if (opt.length > 1 && opt.startsWith('-')) {
+            const isFlag = !args.length || (args[0].length > 1 && args[0].startsWith('-')) // a flag is an option with no value token after it
+            result[opt] = isFlag ? true : args.shift() // flags map to true; other options consume the next token as their value
+        } else {
+            const key = commandGroups.includes(opt) ? `${opt} ${args.shift()}` : opt // NOTE(review): if a group name is the last token, args.shift() is undefined and the key becomes e.g. "init undefined"
+            result[key] = null // positional tokens carry no value
+        }
+    }
+    return result
+}
+
+function formatCode(html, lang, prompt) {
+    if (lang === 'cli') {
+        const cliRegex = /^(\$ )?python -m spacy/
+        const lines = html
+            .trim()
+            .split('\n')
+            .map((line, i) => {
+                if (cliRegex.test(line)) {
+                    const text = line.replace(cliRegex, '')
+                    const args = parseArgs(text)
+                    const cmd = Object.keys(args).map((key, i) => {
+                        const value = args[key]
+                        return value === null || value === true || i === 0 ? key : `${key} ${value}`
+                    })
+                    return (
+                        
+                            
+                                python -m
+                            {' '}
+                            spacy{' '}
+                            {cmd.map((item, j) => {
+                                const isCmd = j === 0
+                                const url = isCmd ? `/api/cli#${item.replace(' ', '-')}` : null
+                                const isAbstract = isString(item) && /^\[(.+)\]$/.test(item)
+                                const itemClassNames = classNames(classes.cliArg, {
+                                    [classes.cliArgHighlight]: isCmd,
+                                    [classes.cliArgEmphasis]: isAbstract,
+                                })
+                                const text = isAbstract ? item.slice(1, -1) : item
+                                return (
+                                    
+                                        {j !== 0 && ' '}
+                                        
+                                            
+                                        
+                                    
+                                )
+                            })}
+                        
+                    )
+                }
+                const htmlLine = replacePrompt(highlightCode('bash', line), '$')
+                return htmlToReact(htmlLine)
+            })
+        return lines.map((line, i) => (
+            
+                {i !== 0 && 
} + {line} +
+ )) + } + const result = html + .split('\n') + .map((line, i) => (prompt ? replacePrompt(line, prompt, i === 0) : line)) + .join('\n') + return htmlToReact(result) +} + export class Code extends React.Component { state = { Juniper: null } @@ -87,7 +221,8 @@ export class Code extends React.Component { children, } = this.props const codeClassNames = classNames(classes.code, className, `language-${lang}`, { - [classes.wrap]: !!highlight || !!wrap, + [classes.wrap]: !!highlight || !!wrap || lang === 'cli', + [classes.cli]: lang === 'cli', }) const ghClassNames = classNames(codeClassNames, classes.maxHeight) const { Juniper } = this.state @@ -105,14 +240,14 @@ export class Code extends React.Component { const codeText = Array.isArray(children) ? children.join('') : children || '' const highlightRange = highlight ? rangeParser.parse(highlight).filter(n => n > 0) : [] - const html = lang === 'none' ? codeText : highlightCode(lang, codeText, highlightRange) - + const rawHtml = ['none', 'cli'].includes(lang) + ? codeText + : highlightCode(lang, codeText, highlightRange) + const html = formatCode(rawHtml, lang, prompt) return ( <> {title &&

{title}

} - - {htmlToReact(html)} - + {html} ) } diff --git a/website/src/components/icon.js b/website/src/components/icon.js index 322337955..8dfba7426 100644 --- a/website/src/components/icon.js +++ b/website/src/components/icon.js @@ -53,7 +53,15 @@ const icons = { package: PackageIcon, } -export default function Icon({ name, width = 20, height, inline = false, variant, className }) { +export default function Icon({ + name, + width = 20, + height, + inline = false, + variant, + className, + ...props +}) { const IconComponent = icons[name] const iconClassNames = classNames(classes.root, className, { [classes.inline]: inline, @@ -67,6 +75,7 @@ export default function Icon({ name, width = 20, height, inline = false, variant aria-hidden="true" width={width} height={height || width} + {...props} /> ) } diff --git a/website/src/components/link.js b/website/src/components/link.js index 3644479c5..acded7d0d 100644 --- a/website/src/components/link.js +++ b/website/src/components/link.js @@ -6,7 +6,7 @@ import classNames from 'classnames' import Icon from './icon' import classes from '../styles/link.module.sass' -import { isString } from './util' +import { isString, isImage } from './util' const internalRegex = /(http(s?)):\/\/(prodi.gy|spacy.io|irl.spacy.io|explosion.ai|course.spacy.io)/gi @@ -39,7 +39,7 @@ export default function Link({ const dest = to || href const external = forceExternal || /(http(s?)):\/\//gi.test(dest) const icon = getIcon(dest) - const withIcon = !hidden && !hideIcon && !!icon + const withIcon = !hidden && !hideIcon && !!icon && !isImage(children) const sourceWithText = withIcon && isString(children) const linkClassNames = classNames(classes.root, className, { [classes.hidden]: hidden, diff --git a/website/src/components/quickstart.js b/website/src/components/quickstart.js index f7ab11fa4..6a335d4a0 100644 --- a/website/src/components/quickstart.js +++ b/website/src/components/quickstart.js @@ -117,7 +117,7 @@ const Quickstart = ({ {help && ( {' '} - + 
)} @@ -201,7 +201,7 @@ const Quickstart = ({ className={classes.help} > {' '} - + )} diff --git a/website/src/components/table.js b/website/src/components/table.js index 1a7d460d0..bd3d663f3 100644 --- a/website/src/components/table.js +++ b/website/src/components/table.js @@ -5,23 +5,31 @@ import Icon from './icon' import { isString } from './util' import classes from '../styles/table.module.sass' +const FOOT_ROW_REGEX = /^(RETURNS|YIELDS|CREATES|PRINTS|EXECUTES|UPLOADS|DOWNLOADS)/ + function isNum(children) { return isString(children) && /^\d+[.,]?[\dx]+?(|x|ms|mb|gb|k|m)?$/i.test(children) } -function getCellContent(children) { +function getCellContent(cellChildren) { const icons = { - 'βœ…': { name: 'yes', variant: 'success' }, - '❌': { name: 'no', variant: 'error' }, + 'βœ…': { name: 'yes', variant: 'success', 'aria-label': 'positive' }, + '❌': { name: 'no', variant: 'error', 'aria-label': 'negative' }, } - - if (isString(children) && icons[children.trim()]) { - const iconProps = icons[children.trim()] - return - } - // Work around prettier auto-escape - if (isString(children) && children.startsWith('\\')) { - return children.slice(1) + let children = isString(cellChildren) ? 
[cellChildren] : cellChildren + if (Array.isArray(children)) { + return children.map((child, i) => { + if (isString(child)) { + const icon = icons[child.trim()] + if (icon) { + const props = { ...icon, inline: i < children.length, 'aria-hidden': undefined } + return + } + // Work around prettier auto-escape + if (child.startsWith('\\')) return child.slice(1) + } + return child + }) } return children } @@ -37,7 +45,6 @@ function isDividerRow(children) { } function isFootRow(children) { - const rowRegex = /^(RETURNS|YIELDS|CREATES|PRINTS)/ if (children.length && children[0].props.name === 'td') { const cellChildren = children[0].props.children if ( @@ -46,7 +53,7 @@ function isFootRow(children) { cellChildren.props.children && isString(cellChildren.props.children) ) { - return rowRegex.test(cellChildren.props.children) + return FOOT_ROW_REGEX.test(cellChildren.props.children) } } return false diff --git a/website/src/components/typography.js b/website/src/components/typography.js index 41464473f..d37c345b9 100644 --- a/website/src/components/typography.js +++ b/website/src/components/typography.js @@ -9,7 +9,12 @@ import { isString, github, headingTextClassName } from './util' import classes from '../styles/typography.module.sass' export const H1 = ({ Component = 'h1', className, ...props }) => ( - + ) export const H2 = ({ className, ...props }) => ( @@ -90,6 +95,7 @@ const Headline = ({ source, hidden, action, + permalink = true, className, children, }) => { @@ -102,7 +108,7 @@ const Headline = ({ const tags = tag ? tag.split(',').map(t => t.trim()) : [] return ( - {children} + {children} {tags.map((tag, i) => ( {tag} diff --git a/website/src/components/util.js b/website/src/components/util.js index 844f2c133..a9c6efcf5 100644 --- a/website/src/components/util.js +++ b/website/src/components/util.js @@ -46,6 +46,17 @@ export function isString(obj) { return typeof obj === 'string' || obj instanceof String } +/** + * @param obj - The object to check. 
+ * @returns {boolean} – Whether the object is an image + */ +export function isImage(obj) { + if (!obj || !React.isValidElement(obj)) { + return false + } + return obj.props.name == 'img' || obj.props.className == 'gatsby-resp-image-wrapper' +} + /** * @param obj - The object to check. * @returns {boolean} - Whether the object is empty. diff --git a/website/src/fonts/jetbrainsmono-italic.woff b/website/src/fonts/jetbrainsmono-italic.woff new file mode 100644 index 000000000..f3ddf4db5 Binary files /dev/null and b/website/src/fonts/jetbrainsmono-italic.woff differ diff --git a/website/src/fonts/jetbrainsmono-italic.woff2 b/website/src/fonts/jetbrainsmono-italic.woff2 new file mode 100644 index 000000000..828c42961 Binary files /dev/null and b/website/src/fonts/jetbrainsmono-italic.woff2 differ diff --git a/website/src/styles/aside.module.sass b/website/src/styles/aside.module.sass index 0e73cc61a..1ea3f970a 100644 --- a/website/src/styles/aside.module.sass +++ b/website/src/styles/aside.module.sass @@ -28,7 +28,7 @@ $border-radius: 6px margin-top: 0 !important code - padding: 0 + padding: 0 !important margin: 0 h4 diff --git a/website/src/styles/code.module.sass b/website/src/styles/code.module.sass index e3a27cbba..aa1f499dd 100644 --- a/website/src/styles/code.module.sass +++ b/website/src/styles/code.module.sass @@ -27,7 +27,7 @@ padding: 1.75em 1.5em .code - &[data-prompt]:before, + &[data-prompt]:before, span[data-prompt]:before content: attr(data-prompt) margin-right: 0.65em display: inline-block @@ -56,6 +56,42 @@ --color-inline-code-text: var(--color-back) --color-inline-code-bg: var(--color-dark-secondary) +.type-annotation, + white-space: pre-wrap + font-family: var(--font-code) + + &.wrap + word-wrap: break-word + + a + border: 0 + + // Special style for types in API tables + td:not(:first-child) > &:last-child + display: block + border-top: 1px dotted var(--color-subtle) + border-radius: 0 + background: none + width: calc(100% + 2rem) + margin-left: 
-1rem + padding-left: 1rem + padding-top: 5px + margin-top: 5px + margin-bottom: -5px + + &:before + content: "Type: " + opacity: 0.75 + font-family: var(--font-primary) + color: var(--color-dark-secondary) + font-weight: bold + text-transform: uppercase + margin-right: 5px + +.type-annotation-meta + font-size: 90% + color: var(--color-subtle-dark) + .wrap white-space: pre-wrap word-wrap: anywhere @@ -127,3 +163,31 @@ font-weight: normal padding-top: 0.1rem color: var(--color-subtle-dark) + +.cli + padding-top: calc(var(--spacing-sm) - 6px) + padding-bottom: calc(var(--spacing-sm) - 12px) + + [data-prompt]:before + color: var(--color-subtle) + +.cli-arg + border: 1px solid var(--color-dark) + padding: 1px 6px + margin-bottom: 5px + border-radius: 0.5em + display: inline-block + + a + color: inherit !important + +.cli-arg-highlight + background: var(--color-theme) + border-color: var(--color-theme) + color: var(--color-back) !important + +.cli-arg-subtle + color: var(--syntax-comment) + +.cli-arg-emphasis + font-style: italic diff --git a/website/src/styles/layout.sass b/website/src/styles/layout.sass index 3591fb005..775523190 100644 --- a/website/src/styles/layout.sass +++ b/website/src/styles/layout.sass @@ -157,6 +157,14 @@ font-display: fallback src: url("../fonts/jetbrainsmono-regular.woff") format("woff"), url("../fonts/jetbrainsmono-regular.woff2") format("woff2") +@font-face + font-family: "JetBrains Mono" + font-style: italic + font-weight: 500 + font-display: fallback + src: url("../fonts/jetbrainsmono-italic.woff") format("woff"), url("../fonts/jetbrainsmono-italic.woff2") format("woff2") + + /* Reset */ *, *:before, *:after @@ -358,6 +366,21 @@ body [id]:target &.italic font-style: italic + +[class*="language-"].type-annotation .token + &.builtin, &.boolean, &.number + color: var(--color-inline-code-text) + + &.operator + color: var(--syntax-comment) + +[class*="language-bash"] .token + &.function + color: var(--color-subtle) + + &.operator, &.variable 
+ color: var(--syntax-comment) + // Settings for ini syntax (config files) [class*="language-ini"] color: var(--syntax-comment) @@ -373,7 +396,7 @@ body [id]:target margin-right: -1.5em margin-left: -1.5em padding-right: 1.5em - padding-left: 1.25em + padding-left: 1.2em &:empty:before // Fix issue where empty lines would disappear diff --git a/website/src/styles/readnext.module.sass b/website/src/styles/readnext.module.sass index 23aa7f016..aef91c09e 100644 --- a/website/src/styles/readnext.module.sass +++ b/website/src/styles/readnext.module.sass @@ -12,7 +12,7 @@ background: var(--color-subtle-light) color: var(--color-subtle-dark) border-radius: 50% - padding: 0.5rem + padding: 0.5rem 0.65rem 0.5rem 0 transition: color 0.2s ease float: right margin-left: 3rem diff --git a/website/src/styles/table.module.sass b/website/src/styles/table.module.sass index 6306e2a15..b46c1af92 100644 --- a/website/src/styles/table.module.sass +++ b/website/src/styles/table.module.sass @@ -29,7 +29,8 @@ border: 0 .td - padding: 1rem + padding: 0.9rem 1rem + font-size: 95% &:not(:last-child) border-right: 1px solid var(--color-subtle) diff --git a/website/src/templates/index.js b/website/src/templates/index.js index c97663317..027241d97 100644 --- a/website/src/templates/index.js +++ b/website/src/templates/index.js @@ -20,7 +20,7 @@ import SEO from '../components/seo' import Link from '../components/link' import Section, { Hr } from '../components/section' import { Table, Tr, Th, Td } from '../components/table' -import { Pre, Code, InlineCode } from '../components/code' +import { Pre, Code, InlineCode, TypeAnnotation } from '../components/code' import { Ol, Ul, Li } from '../components/list' import { H2, H3, H4, H5, P, Abbr, Help } from '../components/typography' import Accordion from '../components/accordion' @@ -41,6 +41,7 @@ const mdxComponents = { pre: Pre, code: Code, inlineCode: InlineCode, + del: TypeAnnotation, table: Table, img: Image, tr: Tr, diff --git 
a/website/src/widgets/quickstart-training-generator.js b/website/src/widgets/quickstart-training-generator.js deleted file mode 100644 index c7f856073..000000000 --- a/website/src/widgets/quickstart-training-generator.js +++ /dev/null @@ -1,12 +0,0 @@ -// This file was auto-generated by jinja_to_js.py based on quickstart_training.jinja -import jinjaToJS from "jinja-to-js";export default function templateQuickstartTraining(ctx) { - var __result = ""; - var __tmp; - var __runtime = jinjaToJS.runtime; - var __filters = jinjaToJS.filters; - var __globals = jinjaToJS.globals; - var context = jinjaToJS.createContext(ctx); - var use_transformer = context.transformer_data && context.hardware!=="cpu";var transformer = (use_transformer ? context.transformer_data[context.optimize] : {});__result += "[paths]\ntrain = \"\"\ndev = \"\"\n\n[system]\nuse_pytorch_for_gpu_memory = ";__result += "" + __runtime.escape((__tmp = ((use_transformer ? "true" : "false"))) == null ? "" : __tmp);__result += "\n\n[nlp]\nlang = \"";__result += "" + __runtime.escape((__tmp = (context.lang)) == null ? "" : __tmp);__result += "\"";var full_pipeline = [(use_transformer ? "transformer" : "tok2vec")].concat(context.components);__result += "\npipeline = ";__result += "" + ((__tmp = (JSON.stringify(full_pipeline).split("'").join("\""))) == null ? "" : __tmp);__result += "\ntokenizer = {\"@tokenizers\": \"spacy.Tokenizer.v1\"}\n\n[components]\n\n";if(__runtime.boolean(use_transformer)){__result += "[components.transformer]\nfactory = \"transformer\"\n\n[components.transformer.model]\n@architectures = \"spacy-transformers.TransformerModel.v1\"\nname = \"";__result += "" + __runtime.escape((__tmp = (transformer["name"])) == null ? 
"" : __tmp);__result += "\"\ntokenizer_config = {\"use_fast\": true}\n\n[components.transformer.model.get_spans]\n@span_getters = \"strided_spans.v1\"\nwindow = 128\nstride = 96\n\n";if(context.components.includes("tagger")){__result += "\n[components.tagger]\nfactory = \"tagger\"\n\n[components.tagger.model]\n@architectures = \"spacy.Tagger.v1\"\nnO = null\n\n[components.tagger.model.tok2vec]\n@architectures = \"spacy-transformers.Tok2VecListener.v1\"\ngrad_factor = 1.0\n\n[components.tagger.model.tok2vec.pooling]\n@layers = \"reduce_mean.v1\"";}__result += "\n\n";if(context.components.includes("parser")){__result += "[components.parser]\nfactory = \"parser\"\n\n[components.parser.model]\n@architectures = \"spacy.TransitionBasedParser.v1\"\nnr_feature_tokens = 8\nhidden_width = 128\nmaxout_pieces = 3\nuse_upper = false\nnO = null\n\n[components.parser.model.tok2vec]\n@architectures = \"spacy-transformers.Tok2VecListener.v1\"\ngrad_factor = 1.0\n\n[components.parser.model.tok2vec.pooling]\n@layers = \"reduce_mean.v1\"";}__result += "\n\n";if(context.components.includes("ner")){__result += "[components.ner]\nfactory = \"ner\"\n\n[components.ner.model]\n@architectures = \"spacy.TransitionBasedParser.v1\"\nnr_feature_tokens = 3\nhidden_width = 64\nmaxout_pieces = 2\nuse_upper = false\nnO = null\n\n[components.ner.model.tok2vec]\n@architectures = \"spacy-transformers.Tok2VecListener.v1\"\ngrad_factor = 1.0\n\n[components.ner.model.tok2vec.pooling]\n@layers = \"reduce_mean.v1\"\n";}__result += "\n";} else {if(context.hardware==="gpu"){__result += "# There are no recommended transformer weights available for language '";__result += "" + __runtime.escape((__tmp = (context.lang)) == null ? 
"" : __tmp);__result += "'\n# yet, so the pipeline described here is not transformer-based.";}__result += "\n\n[components.tok2vec]\nfactory = \"tok2vec\"\n\n[components.tok2vec.model]\n@architectures = \"spacy.Tok2Vec.v1\"\n\n[components.tok2vec.model.embed]\n@architectures = \"spacy.MultiHashEmbed.v1\"\nwidth = ${components.tok2vec.model.encode:width}\nrows = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="efficiency" ? 2000 : 7000))) == null ? "" : __tmp);__result += "\nalso_embed_subwords = ";__result += "" + __runtime.escape((__tmp = ((context.has_letters ? true : false))) == null ? "" : __tmp);__result += "\nalso_use_static_vectors = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="accuracy" ? true : false))) == null ? "" : __tmp);__result += "\n\n[components.tok2vec.model.encode]\n@architectures = \"spacy.MaxoutWindowEncoder.v1\"\nwidth = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="efficiency" ? 96 : 256))) == null ? "" : __tmp);__result += "\ndepth = ";__result += "" + __runtime.escape((__tmp = ((context.optimize==="efficiency" ? 4 : 8))) == null ? 
"" : __tmp);__result += "\nwindow_size = 1\nmaxout_pieces = 3\n\n";if(context.components.includes("tagger")){__result += "\n[components.tagger]\nfactory = \"tagger\"\n\n[components.tagger.model]\n@architectures = \"spacy.Tagger.v1\"\nnO = null\n\n[components.tagger.model.tok2vec]\n@architectures = \"spacy.Tok2VecListener.v1\"\nwidth = ${components.tok2vec.model.encode:width}";}__result += "\n\n";if(context.components.includes("parser")){__result += "[components.parser]\nfactory = \"parser\"\n\n[components.parser.model]\n@architectures = \"spacy.TransitionBasedParser.v1\"\nnr_feature_tokens = 8\nhidden_width = 128\nmaxout_pieces = 3\nuse_upper = true\nnO = null\n\n[components.parser.model.tok2vec]\n@architectures = \"spacy.Tok2VecListener.v1\"\nwidth = ${components.tok2vec.model.encode:width}";}__result += "\n\n";if(context.components.includes("ner")){__result += "\n[components.ner]\nfactory = \"ner\"\n\n[components.ner.model]\n@architectures = \"spacy.TransitionBasedParser.v1\"\nnr_feature_tokens = 6\nhidden_width = 64\nmaxout_pieces = 2\nuse_upper = true\nnO = null\n\n[components.ner.model.tok2vec]\n@architectures = \"spacy.Tok2VecListener.v1\"\nwidth = ${components.tok2vec.model.encode:width}\n";}__result += "\n";}__result += "\n\n";__runtime.each(context.components,function(pipe){var __$0 = context.pipe;context.pipe = pipe;__result += "\n";if(!["tagger","parser","ner"].includes(pipe)){__result += "\n";__result += "\n[components.";__result += "" + __runtime.escape((__tmp = (pipe)) == null ? "" : __tmp);__result += "]\nfactory = \"";__result += "" + __runtime.escape((__tmp = (pipe)) == null ? "" : __tmp);__result += "\"\n";}__result += "\n";context.pipe = __$0;});__result += "\n\n[training]\n";if(__runtime.boolean(use_transformer) || context.optimize==="efficiency" || !__runtime.boolean(context.word_vectors)){__result += "vectors = null\n";} else {__result += "vectors = \"";__result += "" + __runtime.escape((__tmp = (context.word_vectors)) == null ? 
"" : __tmp);__result += "\"\n";}if(__runtime.boolean(use_transformer)){__result += "accumulate_gradient = ";__result += "" + __runtime.escape((__tmp = (transformer["size_factor"])) == null ? "" : __tmp);__result += "\n";}__result += "\n\n[training.optimizer]\n@optimizers = \"Adam.v1\"\n\n[training.optimizer.learn_rate]\n@schedules = \"warmup_linear.v1\"\nwarmup_steps = 250\ntotal_steps = 20000\ninitial_rate = 5e-5\n\n[training.train_corpus]\n@readers = \"spacy.Corpus.v1\"\npath = ${paths:train}\nmax_length = ";__result += "" + __runtime.escape((__tmp = ((context.hardware==="gpu" ? 500 : 0))) == null ? "" : __tmp);__result += "\n\n[training.dev_corpus]\n@readers = \"spacy.Corpus.v1\"\npath = ${paths:dev}\nmax_length = 0\n\n";if(__runtime.boolean(use_transformer)){__result += "\n[training.batcher]\n@batchers = \"batch_by_padded.v1\"\ndiscard_oversize = true\nsize = 2000\nbuffer = 256";} else {__result += "\n[training.batcher]\n@batchers = \"batch_by_words.v1\"\ndiscard_oversize = false\ntolerance = 0.2\n\n[training.batcher.size]\n@schedules = \"compounding.v1\"\nstart = 100\nstop = 1000\ncompound = 1.001\n";}__result += "\n\n[training.score_weights]";if(context.components.includes("tagger")){__result += "\ntag_acc = ";__result += "" + __runtime.escape((__tmp = (Math.round((1.0 / __filters.size(context.components)+ Number.EPSILON) * 10**2) / 10**2)) == null ? "" : __tmp);}if(context.components.includes("parser")){__result += "\ndep_uas = 0.0\ndep_las = ";__result += "" + __runtime.escape((__tmp = (Math.round((1.0 / __filters.size(context.components)+ Number.EPSILON) * 10**2) / 10**2)) == null ? "" : __tmp);__result += "\nsents_f = 0.0";}if(context.components.includes("ner")){__result += "\nents_f = ";__result += "" + __runtime.escape((__tmp = (Math.round((1.0 / __filters.size(context.components)+ Number.EPSILON) * 10**2) / 10**2)) == null ? 
"" : __tmp);__result += "\nents_p = 0.0\nents_r = 0.0";} - return __result; -} -export const DATA = {"en": {"word_vectors": "en_vectors_web_lg", "transformer": {"efficiency": {"name": "roberta-base", "size_factor": 3}, "accuracy": {"name": "roberta-base", "size_factor": 3}}}, "de": {"word_vectors": null, "transformer": null}} \ No newline at end of file diff --git a/website/src/widgets/quickstart-training.js b/website/src/widgets/quickstart-training.js index 4e379e5ec..ae8d41b64 100644 --- a/website/src/widgets/quickstart-training.js +++ b/website/src/widgets/quickstart-training.js @@ -4,7 +4,7 @@ import highlightCode from 'gatsby-remark-prismjs/highlight-code.js' import { Quickstart } from '../components/quickstart' import generator, { DATA as GENERATOR_DATA } from './quickstart-training-generator' -import { isString, htmlToReact } from '../components/util' +import { htmlToReact } from '../components/util' const DEFAULT_LANG = 'en' const DEFAULT_HARDWARE = 'gpu' @@ -38,7 +38,8 @@ const DATA = [ { id: 'optimize', title: 'Optimize for', - help: '...', + help: + 'Optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger & slower model). 
Will impact the choice of architecture, pretrained weights and hyperparameters.', options: [ { id: 'efficiency', title: 'efficiency', checked: DEFAULT_OPT === 'efficiency' }, { id: 'accuracy', title: 'accuracy', checked: DEFAULT_OPT === 'accuracy' }, @@ -46,13 +47,6 @@ const DATA = [ }, ] -function stringify(value) { - if (isString(value) && value.startsWith('${')) return value - const string = JSON.stringify(value) - if (Array.isArray(value)) return string.replace(/,/g, ', ') - return string -} - export default function QuickstartTraining({ id, title, download = 'config.cfg' }) { const [lang, setLang] = useState(DEFAULT_LANG) const [components, setComponents] = useState([]) @@ -72,6 +66,7 @@ export default function QuickstartTraining({ id, title, download = 'config.cfg' hardware, transformer_data: reco.transformer, word_vectors: reco.word_vectors, + has_letters: reco.has_letters, }) const rawStr = content.trim().replace(/\n\n\n+/g, '\n\n') const rawContent = `${COMMENT}\n${rawStr}` @@ -84,10 +79,12 @@ export default function QuickstartTraining({ id, title, download = 'config.cfg' query={query} render={({ site }) => { const langs = site.siteMetadata.languages - DATA[0].dropdown = langs.map(({ name, code }) => ({ - id: code, - title: name, - })) + DATA[0].dropdown = langs + .map(({ name, code }) => ({ + id: code, + title: name, + })) + .sort((a, b) => a.title.localeCompare(b.title)) return (