diff --git a/README.md b/README.md index f711ea1b1..1fece1e5a 100644 --- a/README.md +++ b/README.md @@ -194,7 +194,7 @@ pip install https://github.com/explosion/spacy-models/releases/download/en_core_ ### Loading and using models -To load a model, use `spacy.load()` with the model name, a shortcut link or a +To load a model, use `spacy.load()` with the model name or a path to the model data directory. ```python diff --git a/netlify.toml b/netlify.toml index 452b5979a..e646dd8f7 100644 --- a/netlify.toml +++ b/netlify.toml @@ -38,6 +38,8 @@ redirects = [ {from = "/docs/usage/showcase", to = "/universe", force = true}, {from = "/tutorials/load-new-word-vectors", to = "/usage/vectors-similarity#custom", force = true}, {from = "/tutorials", to = "/usage/examples", force = true}, + # Old documentation pages (v2.x) + {from = "/usage/adding-languages", to = "/usage/linguistic-features", force = true}, # Rewrite all other docs pages to / {from = "/docs/*", to = "/:splat"}, # Updated documentation pages diff --git a/spacy/cli/download.py b/spacy/cli/download.py index adc8d09fa..ea5e7a890 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -16,7 +16,7 @@ from ..util import is_package, get_base_version, run_command def download_cli( # fmt: off ctx: typer.Context, - model: str = Arg(..., help="Model to download (shortcut or name)"), + model: str = Arg(..., help="Name of model to download"), direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"), # fmt: on ): diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 0591d19a1..672142f31 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -4,7 +4,6 @@ teaser: Download, train and package models, and debug spaCy source: spacy/cli menu: - ['Download', 'download'] - - ['Link', 'link'] - ['Info', 'info'] - ['Validate', 'validate'] - ['Convert', 'convert'] @@ -16,18 +15,16 @@ menu: - ['Package', 'package'] --- -As of v1.7.0, spaCy comes with new 
command line helpers to download and link -models and show useful debugging information. For a list of available commands, -type `spacy --help`. +For a list of available commands, type `spacy --help`. + + ## Download {#download} Download [models](/usage/models) for spaCy. The downloader finds the -best-matching compatible version, uses `pip install` to download the model as a -package and creates a [shortcut link](/usage/models#usage) if the model was -downloaded via a shortcut. Direct downloads don't perform any compatibility -checks and require the model name to be specified with its version (e.g. -`en_core_web_sm-2.2.0`). +best-matching compatible version and uses `pip install` to download the model as +a package. Direct downloads don't perform any compatibility checks and require +the model name to be specified with its version (e.g. `en_core_web_sm-2.2.0`). > #### Downloading best practices > @@ -43,42 +40,13 @@ checks and require the model name to be specified with its version (e.g. $ python -m spacy download [model] [--direct] [pip args] ``` -| Argument | Type | Description | -| ------------------------------------- | ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `model` | positional | Model name or shortcut (`en`, `de`, `en_core_web_sm`). | -| `--direct`, `-d` | flag | Force direct download of exact model version. | -| pip args 2.1 | - | Additional installation options to be passed to `pip install` when installing the model package. For example, `--user` to install to the user home directory or `--no-deps` to not install model dependencies. | -| `--help`, `-h` | flag | Show help message and available arguments. 
| -| **CREATES** | directory, symlink | The installed model package in your `site-packages` directory and a shortcut link as a symlink in `spacy/data` if installed via shortcut. | - -## Link {#link} - -Create a [shortcut link](/usage/models#usage) for a model, either a Python -package or a local directory. This will let you load models from any location -using a custom name via [`spacy.load()`](/api/top-level#spacy.load). - - - -In spaCy v1.x, you had to use the model data directory to set up a shortcut link -for a local path. As of v2.0, spaCy expects all shortcut links to be **loadable -model packages**. If you want to load a data directory, call -[`spacy.load()`](/api/top-level#spacy.load) or -[`Language.from_disk()`](/api/language#from_disk) with the path, or use the -[`package`](/api/cli#package) command to create a model package. - - - -```bash -$ python -m spacy link [origin] [link_name] [--force] -``` - -| Argument | Type | Description | -| --------------- | ---------- | --------------------------------------------------------------- | -| `origin` | positional | Model name if package, or path to local directory. | -| `link_name` | positional | Name of the shortcut link to create. | -| `--force`, `-f` | flag | Force overwriting of existing link. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| **CREATES** | symlink | A shortcut link of the given name as a symlink in `spacy/data`. | +| Argument | Type | Description | +| ------------------------------------- | ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `model` | positional | Model name, e.g. `en_core_web_sm`. | +| `--direct`, `-d` | flag | Force direct download of exact model version. | +| pip args 2.1 | - | Additional installation options to be passed to `pip install` when installing the model package. 
For example, `--user` to install to the user home directory or `--no-deps` to not install model dependencies. | +| `--help`, `-h` | flag | Show help message and available arguments. | +| **CREATES** | directory | The installed model package in your `site-packages` directory. | ## Info {#info} @@ -94,30 +62,28 @@ $ python -m spacy info [--markdown] [--silent] $ python -m spacy info [model] [--markdown] [--silent] ``` -| Argument | Type | Description | -| ------------------------------------------------ | ---------- | ------------------------------------------------------------- | -| `model` | positional | A model, i.e. shortcut link, package name or path (optional). | -| `--markdown`, `-md` | flag | Print information as Markdown. | -| `--silent`, `-s` 2.0.12 | flag | Don't print anything, just return the values. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| **PRINTS** | `stdout` | Information about your spaCy installation. | +| Argument | Type | Description | +| ------------------------------------------------ | ---------- | ---------------------------------------------- | +| `model` | positional | A model, i.e. package name or path (optional). | +| `--markdown`, `-md` | flag | Print information as Markdown. | +| `--silent`, `-s` 2.0.12 | flag | Don't print anything, just return the values. | +| `--help`, `-h` | flag | Show help message and available arguments. | +| **PRINTS** | `stdout` | Information about your spaCy installation. | ## Validate {#validate new="2"} -Find all models installed in the current environment (both packages and shortcut -links) and check whether they are compatible with the currently installed -version of spaCy. Should be run after upgrading spaCy via `pip install -U spacy` -to ensure that all installed models are can be used with the new version. The -command is also useful to detect out-of-sync model links resulting from links -created in different virtual environments. 
It will show a list of models and -their installed versions. If any model is out of date, the latest compatible -versions and command for updating are shown. +Find all models installed in the current environment and check whether they are +compatible with the currently installed version of spaCy. Should be run after +upgrading spaCy via `pip install -U spacy` to ensure that all installed models +can be used with the new version. It will show a list of models and their +installed versions. If any model is out of date, the latest compatible versions +and command for updating are shown. > #### Automated validation > > You can also use the `validate` command as part of your build process or test > suite, to ensure all models are up to date before proceeding. If incompatible -> models or shortcut links are found, it will return `1`. +> models are found, it will return `1`. ```bash $ python -m spacy validate @@ -526,16 +492,6 @@ JSONL-formatted [vocabulary file](<(/api/annotation#vocab-jsonl)>) as `--jsonl-loc` with optional `id` values that correspond to the vectors table. Just loading in vectors will not automatically populate the vocab. - - -As of v2.1.0, the `--freqs-loc` and `--clusters-loc` are deprecated and have -been replaced with the `--jsonl-loc` argument, which lets you pass in a a -[JSONL](http://jsonlines.org/) file containing one lexical entry per line. For -more details on the format, see the -[annotation specs](/api/annotation#vocab-jsonl). - - - ```bash $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] [--prune-vectors] @@ -569,7 +525,7 @@ $ python -m spacy evaluate [model] [data_path] [--displacy-path] [--displacy-lim | Argument | Type | Description | | ------------------------- | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `model` | positional | Model to evaluate. 
Can be a package or shortcut link name, or a path to a model data directory. | +| `model` | positional | Model to evaluate. Can be a package or a path to a model data directory. | | `data_path` | positional | Location of JSON-formatted evaluation data. | | `--displacy-path`, `-dp` | option | Directory to output rendered parses as HTML. If not set, no visualizations will be generated. | | `--displacy-limit`, `-dl` | option | Number of parses to generate per file. Defaults to `25`. Keep in mind that a significantly higher number might cause the `.html` files to render slowly. | diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 50fb10756..d585cbd25 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -202,8 +202,8 @@ the character indices don't map to a valid span. | ------------------------------------ | ---------------------------------------- | --------------------------------------------------------------------- | | `start` | int | The index of the first character of the span. | | `end` | int | The index of the last character after the span. | -| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. | -| `kb_id` 2.2 | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. | +| `label` | uint64 / str | A label to attach to the span, e.g. for named entities. | +| `kb_id` 2.2 | uint64 / str | An ID from a knowledge base to capture the meaning of a named entity. | | `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | | **RETURNS** | `Span` | The newly constructed object or `None`. | @@ -297,15 +297,6 @@ They'll be added to an `"_"` key in the data, e.g. `"_": {"foo": "bar"}`. | `underscore` | list | Optional list of string names of custom JSON-serializable `doc._.` attributes. | | **RETURNS** | dict | The JSON-formatted data. 
| - - -spaCy previously implemented a `Doc.print_tree` method that returned a similar -JSON-formatted representation of a `Doc`. As of v2.1, this method is deprecated -in favor of `Doc.to_json`. If you need more complex nested representations, you -might want to write your own function to extract the data. - - - ## Doc.to_array {#to_array tag="method"} Export given token attributes to a numpy `ndarray`. If `attr_ids` is a sequence @@ -507,14 +498,6 @@ underlying lexeme (if they're context-independent lexical attributes like ## Doc.merge {#merge tag="method"} - - -As of v2.1.0, `Doc.merge` still works but is considered deprecated. You should -use the new and less error-prone [`Doc.retokenize`](/api/doc#retokenize) -instead. - - - Retokenize the document, such that the span at `doc.text[start_idx : end_idx]` is merged into a single token. If `start_idx` and `end_idx` do not mark start and end token boundaries, the document remains unchanged. @@ -646,26 +629,26 @@ The L2 norm of the document's vector representation. ## Attributes {#attributes} -| Name | Type | Description | -| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `text` | str | A unicode representation of the document text. | -| `text_with_ws` | str | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. | -| `mem` | `Pool` | The document's local memory heap, for all C data it owns. | -| `vocab` | `Vocab` | The store of lexical types. | -| `tensor` 2 | `ndarray` | Container for dense vector representations. | -| `cats` 2 | dict | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. 
| -| `user_data` | - | A generic storage area, for user custom data. | -| `lang` 2.1 | int | Language of the document's vocabulary. | -| `lang_` 2.1 | str | Language of the document's vocabulary. | -| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. | -| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. | -| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. | -| `is_nered` 2.1 | bool | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. | -| `sentiment` | float | The document's positivity/negativity score, if available. | -| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. | -| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. | -| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. | -| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | +| Name | Type | Description | +| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `text` | str | A string representation of the document text. | +| `text_with_ws` | str | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. | +| `mem` | `Pool` | The document's local memory heap, for all C data it owns. | +| `vocab` | `Vocab` | The store of lexical types. | +| `tensor` 2 | `ndarray` | Container for dense vector representations. 
| +| `cats` 2 | dict | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. | +| `user_data` | - | A generic storage area, for user custom data. | +| `lang` 2.1 | int | Language of the document's vocabulary. | +| `lang_` 2.1 | str | Language of the document's vocabulary. | +| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. | +| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. | +| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. | +| `is_nered` 2.1 | bool | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. | +| `sentiment` | float | The document's positivity/negativity score, if available. | +| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. | +| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. | +| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. | +| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). 
| ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/docbin.md b/website/docs/api/docbin.md index 9f12a07e6..fe8c359f7 100644 --- a/website/docs/api/docbin.md +++ b/website/docs/api/docbin.md @@ -22,7 +22,7 @@ the msgpack object has the following structure: "tokens": bytes, # Serialized numpy uint64 array with the token data "spaces": bytes, # Serialized numpy boolean array with spaces data "lengths": bytes, # Serialized numpy int32 array with the doc lengths - "strings": List[unicode] # List of unique strings in the token data + "strings": List[str] # List of unique strings in the token data } ``` diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index 7bee3a77a..1279a3685 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -36,7 +36,7 @@ be a token pattern (list) or a phrase pattern (string). For example: | --------------------- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | | `nlp` | `Language` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. | | `patterns` | iterable | Optional patterns to load in. | -| `phrase_matcher_attr` | int / unicode | Optional attr to pass to the internal [`PhraseMatcher`](/api/phrasematcher). defaults to `None` | +| `phrase_matcher_attr` | int / str | Optional attr to pass to the internal [`PhraseMatcher`](/api/phrasematcher). defaults to `None` | | `validate` | bool | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. | | `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. | | `**cfg` | - | Other config parameters. 
If pipeline component is loaded as part of a model pipeline, this will include all keyword arguments passed to `spacy.load`. | diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md index 23937e702..85b62e074 100644 --- a/website/docs/api/goldparse.md +++ b/website/docs/api/goldparse.md @@ -15,7 +15,7 @@ missing – the gradient for those labels will be zero. | Name | Type | Description | | ----------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `doc` | `Doc` | The document the annotations refer to. | -| `words` | iterable | A sequence of unicode word strings. | +| `words` | iterable | A sequence of word strings. | | `tags` | iterable | A sequence of strings, representing tag annotations. | | `heads` | iterable | A sequence of integers, representing syntactic head offsets. | | `deps` | iterable | A sequence of strings, representing the syntactic relation types. | @@ -136,14 +136,13 @@ The returned tuple contains the following alignment information: Encode labelled spans into per-token tags, using the [BILUO scheme](/api/annotation#biluo) (Begin, In, Last, Unit, Out). Returns a -list of unicode strings, describing the tags. Each tag string will be of the -form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of -`"B"`, `"I"`, `"L"`, `"U"`. The string `"-"` is used where the entity offsets -don't align with the tokenization in the `Doc` object. The training algorithm -will view these as missing values. `O` denotes a non-entity token. `B` denotes -the beginning of a multi-token entity, `I` the inside of an entity of three or -more tokens, and `L` the end of an entity of two or more tokens. `U` denotes a -single-token entity. +list of strings, describing the tags. 
Each tag string will be of the form of +either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, +`"L"`, `"U"`. The string `"-"` is used where the entity offsets don't align with +the tokenization in the `Doc` object. The training algorithm will view these as +missing values. `O` denotes a non-entity token. `B` denotes the beginning of a +multi-token entity, `I` the inside of an entity of three or more tokens, and `L` +the end of an entity of two or more tokens. `U` denotes a single-token entity. > #### Example > diff --git a/website/docs/api/language.md b/website/docs/api/language.md index e1991f260..792f2217d 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -72,17 +72,6 @@ Pipeline components to prevent from being loaded can now be added as a list to Process texts as a stream, and yield `Doc` objects in order. This is usually more efficient than processing texts one-by-one. - - -Early versions of spaCy used simple statistical models that could be efficiently -multi-threaded, as we were able to entirely release Python's global interpreter -lock. The multi-threading was controlled using the `n_threads` keyword argument -to the `.pipe` method. This keyword argument is now deprecated as of v2.1.0. A -new keyword argument, `n_process`, was introduced to control parallel inference -via multiprocessing in v2.2.2. - - - > #### Example > > ```python @@ -91,15 +80,15 @@ via multiprocessing in v2.2.2. > assert doc.is_parsed > ``` -| Name | Type | Description | -| -------------------------------------------- | ----- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `texts` | - | A sequence of unicode objects. | -| `as_tuples` | bool | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. 
| -| `batch_size` | int | The number of texts to buffer. | -| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | -| `component_cfg` 2.1 | dict | Config parameters for specific pipeline components, keyed by component name. | -| `n_process` 2.2.2 | int | Number of processors to use, only supported in Python 3. Defaults to `1`. | -| **YIELDS** | `Doc` | Documents in the order of the original text. | +| Name | Type | Description | +| -------------------------------------------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `texts` | iterable | A sequence of strings. | +| `as_tuples` | bool | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. | +| `batch_size` | int | The number of texts to buffer. | +| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | +| `component_cfg` 2.1 | dict | Config parameters for specific pipeline components, keyed by component name. | +| `n_process` 2.2.2 | int | Number of processors to use, only supported in Python 3. Defaults to `1`. | +| **YIELDS** | `Doc` | Documents in the order of the original text. | ## Language.update {#update tag="method"} @@ -116,7 +105,7 @@ Update the models in the pipeline. | Name | Type | Description | | -------------------------------------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `docs` | iterable | A batch of `Doc` objects or unicode. If unicode, a `Doc` object will be created from the text. | +| `docs` | iterable | A batch of `Doc` objects or strings. 
If strings, a `Doc` object will be created from the text. | | `golds` | iterable | A batch of `GoldParse` objects or dictionaries. Dictionaries will be used to create [`GoldParse`](/api/goldparse) objects. For the available keys and their usage, see [`GoldParse.__init__`](/api/goldparse#init). | | `drop` | float | The dropout rate. | | `sgd` | callable | An optimizer. | @@ -134,14 +123,14 @@ Evaluate a model's pipeline components. > print(scorer.scores) > ``` -| Name | Type | Description | -| -------------------------------------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Name | Type | Description | +| -------------------------------------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `docs_golds` | iterable | Tuples of `Doc` and `GoldParse` objects, such that the `Doc` objects contain the predictions and the `GoldParse` objects the correct annotations. Alternatively, `(text, annotations)` tuples of raw text and a dict (see [simple training style](/usage/training#training-simple-style)). | -| `verbose` | bool | Print debugging information. | -| `batch_size` | int | The batch size to use. | -| `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. | -| `component_cfg` 2.1 | dict | Config parameters for specific pipeline components, keyed by component name. | -| **RETURNS** | Scorer | The scorer containing the evaluation scores. | +| `verbose` | bool | Print debugging information. | +| `batch_size` | int | The batch size to use. | +| `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. 
If not passed in, a new one will be created. | +| `component_cfg` 2.1 | dict | Config parameters for specific pipeline components, keyed by component name. | +| **RETURNS** | Scorer | The scorer containing the evaluation scores. | ## Language.begin_training {#begin_training tag="method"} @@ -400,20 +389,6 @@ loaded object. | `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | | **RETURNS** | `Language` | The modified `Language` object. | - - -As of spaCy v2.0, the `save_to_directory` method has been renamed to `to_disk`, -to improve consistency across classes. Pipeline components to prevent from being -loaded can now be added as a list to `disable` (v2.0) or `exclude` (v2.1), -instead of specifying one keyword argument per component. - -```diff -- nlp = spacy.load("en", tagger=False, entity=False) -+ nlp = English().from_disk("/model", exclude=["tagger", "ner"]) -``` - - - ## Language.to_bytes {#to_bytes tag="method"} Serialize the current state to a binary string. @@ -470,7 +445,7 @@ per component. | ------------------------------------------ | ----------- | ----------------------------------------------------------------------------------------------- | | `vocab` | `Vocab` | A container for the lexical types. | | `tokenizer` | `Tokenizer` | The tokenizer. | -| `make_doc` | `callable` | Callable that takes a unicode text and returns a `Doc`. | +| `make_doc` | `callable` | Callable that takes a string and returns a `Doc`. | | `pipeline` | list | List of `(name, component)` tuples describing the current processing pipeline, in order. | | `pipe_names` 2 | list | List of pipeline component names, in order. | | `pipe_labels` 2.2 | dict | List of labels set by the pipeline components, if available, keyed by component name. 
| diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index 16cd624f5..fa376d246 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -31,20 +31,6 @@ when a `Language` subclass and its `Vocab` is initialized. | `lookups` 2.2 | [`Lookups`](/api/lookups) | The lookups object containing the (optional) tables `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. | | **RETURNS** | `Lemmatizer` | The newly created object. | - - -As of v2.2, the lemmatizer is initialized with a [`Lookups`](/api/lookups) -object containing tables for the different components. This makes it easier for -spaCy to share and serialize rules and lookup tables via the `Vocab`, and allows -users to modify lemmatizer data at runtime by updating `nlp.vocab.lookups`. - -```diff -- lemmatizer = Lemmatizer(rules=lemma_rules) -+ lemmatizer = Lemmatizer(lookups) -``` - - - ## Lemmatizer.\_\_call\_\_ {#call tag="method"} Lemmatize a string. diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md index 3b5c7a661..6e793a7b9 100644 --- a/website/docs/api/phrasematcher.md +++ b/website/docs/api/phrasematcher.md @@ -39,7 +39,7 @@ be shown. | --------------------------------------- | --------------- | ------------------------------------------------------------------------------------------- | | `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | | `max_length` | int | Deprecated argument - the `PhraseMatcher` does not have a phrase length limit anymore. | -| `attr` 2.1 | int / unicode | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. | +| `attr` 2.1 | int / str | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. | | `validate` 2.1 | bool | Validate patterns added to the matcher. | | **RETURNS** | `PhraseMatcher` | The newly constructed object. 
| diff --git a/website/docs/api/span.md b/website/docs/api/span.md index c41d9aa03..4d10c08d9 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -18,15 +18,15 @@ Create a Span object from the slice `doc[start : end]`. > assert [t.text for t in span] == ["it", "back", "!"] > ``` -| Name | Type | Description | -| ----------- | ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The parent document. | -| `start` | int | The index of the first token of the span. | -| `end` | int | The index of the first token after the span. | -| `label` | int / unicode | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a unicode string. | -| `kb_id` | int / unicode | A knowledge base ID to attach to the span, e.g. for named entities. The ID can be an integer or a unicode string. | -| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | -| **RETURNS** | `Span` | The newly constructed object. | +| Name | Type | Description | +| ----------- | ---------------------------------------- | --------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The parent document. | +| `start` | int | The index of the first token of the span. | +| `end` | int | The index of the first token after the span. | +| `label` | int / str | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a string. | +| `kb_id` | int / str | A knowledge base ID to attach to the span, e.g. for named entities. The ID can be an integer or a string. | +| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | +| **RETURNS** | `Span` | The newly constructed object. 
| ## Span.\_\_getitem\_\_ {#getitem tag="method"} @@ -189,8 +189,8 @@ the character indices don't map to a valid span. | ----------- | ---------------------------------------- | --------------------------------------------------------------------- | | `start` | int | The index of the first character of the span. | | `end` | int | The index of the last character after the span. | -| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. | -| `kb_id` | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. | +| `label` | uint64 / str | A label to attach to the span, e.g. for named entities. | +| `kb_id` | uint64 / str | An ID from a knowledge base to capture the meaning of a named entity. | | `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | | **RETURNS** | `Span` | The newly constructed object or `None`. | @@ -255,33 +255,6 @@ shape `(N, M)`, where `N` is the length of the document. The values will be | `attr_ids` | list | A list of attribute ID ints. | | **RETURNS** | `numpy.ndarray[long, ndim=2]` | A feature matrix, with one row per word, and one column per attribute indicated in the input `attr_ids`. | -## Span.merge {#merge tag="method"} - - - -As of v2.1.0, `Span.merge` still works but is considered deprecated. You should -use the new and less error-prone [`Doc.retokenize`](/api/doc#retokenize) -instead. - - - -Retokenize the document, such that the span is merged into a single token. - -> #### Example -> -> ```python -> doc = nlp("I like New York in Autumn.") -> span = doc[2:4] -> span.merge() -> assert len(doc) == 6 -> assert doc[2].text == "New York" -> ``` - -| Name | Type | Description | -| -------------- | ------- | ------------------------------------------------------------------------------------------------------------------------- | -| `**attributes` | - | Attributes to assign to the merged token. 
By default, attributes are inherited from the syntactic root token of the span. | -| **RETURNS** | `Token` | The newly merged token. | - ## Span.ents {#ents tag="property" new="2.0.13" model="ner"} The named entities in the span. Returns a tuple of named entity `Span` objects, @@ -497,7 +470,7 @@ The L2 norm of the span's vector representation. | `end` | int | The token offset for the end of the span. | | `start_char` | int | The character offset for the start of the span. | | `end_char` | int | The character offset for the end of the span. | -| `text` | str | A unicode representation of the span text. | +| `text` | str | A string representation of the span text. | | `text_with_ws` | str | The text content of the span with a trailing whitespace character if the last token has one. | | `orth` | int | ID of the verbatim text content. | | `orth_` | str | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. | diff --git a/website/docs/api/stringstore.md b/website/docs/api/stringstore.md index 922174c78..c00c59832 100644 --- a/website/docs/api/stringstore.md +++ b/website/docs/api/stringstore.md @@ -19,10 +19,10 @@ Create the `StringStore`. > stringstore = StringStore(["apple", "orange"]) > ``` -| Name | Type | Description | -| ----------- | ------------- | -------------------------------------------------- | -| `strings` | iterable | A sequence of unicode strings to add to the store. | -| **RETURNS** | `StringStore` | The newly constructed object. | +| Name | Type | Description | +| ----------- | ------------- | ------------------------------------------ | +| `strings` | iterable | A sequence of strings to add to the store. | +| **RETURNS** | `StringStore` | The newly constructed object. | ## StringStore.\_\_len\_\_ {#len tag="method"} @@ -52,10 +52,10 @@ Retrieve a string from a given hash, or vice versa. 
> assert stringstore[apple_hash] == "apple" > ``` -| Name | Type | Description | -| -------------- | ------------------------ | -------------------------- | -| `string_or_id` | bytes, unicode or uint64 | The value to encode. | -| **RETURNS** | str or int | The value to be retrieved. | +| Name | Type | Description | +| -------------- | -------------------- | -------------------------- | +| `string_or_id` | bytes, str or uint64 | The value to encode. | +| **RETURNS** | str or int | The value to be retrieved. | ## StringStore.\_\_contains\_\_ {#contains tag="method"} diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 9f8594c96..2d25d9db2 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -58,7 +58,7 @@ For details, see the documentation on | Name | Type | Description | | --------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------- | -| `name` | unicode | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `token._.my_attr`. | +| `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `token._.my_attr`. | | `default` | - | Optional default value of the attribute if no getter or method is defined. | | `method` | callable | Set a custom method on the object, for example `token._.compare(other_token)`. | | `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. | @@ -80,10 +80,10 @@ Look up a previously registered extension by name. Returns a 4-tuple > assert extension == (False, None, None, None) > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------------------------- | -| `name` | unicode | Name of the extension. 
| -| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | +| Name | Type | Description | +| ----------- | ----- | ------------------------------------------------------------- | +| `name` | str | Name of the extension. | +| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | ## Token.has_extension {#has_extension tag="classmethod" new="2"} @@ -97,10 +97,10 @@ Check whether an extension has been registered on the `Token` class. > assert Token.has_extension("is_fruit") > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------ | -| `name` | unicode | Name of the extension to check. | -| **RETURNS** | bool | Whether the extension has been registered. | +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------ | +| `name` | str | Name of the extension to check. | +| **RETURNS** | bool | Whether the extension has been registered. | ## Token.remove_extension {#remove_extension tag="classmethod" new=""2.0.11""} @@ -115,10 +115,10 @@ Remove a previously registered extension. > assert not Token.has_extension("is_fruit") > ``` -| Name | Type | Description | -| ----------- | ------- | --------------------------------------------------------------------- | -| `name` | unicode | Name of the extension. | -| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | +| Name | Type | Description | +| ----------- | ----- | --------------------------------------------------------------------- | +| `name` | str | Name of the extension. | +| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | ## Token.check_flag {#check_flag tag="method"} @@ -339,21 +339,6 @@ unknown. Defaults to `True` for the first token in the `Doc`. | ----------- | ---- | ------------------------------------ | | **RETURNS** | bool | Whether the token starts a sentence. 
| - - -As of spaCy v2.0, the `Token.sent_start` property is deprecated and has been -replaced with `Token.is_sent_start`, which returns a boolean value instead of a -misleading `0` for `False` and `1` for `True`. It also now returns `None` if the -answer is unknown, and fixes a quirk in the old logic that would always set the -property to `0` for the first word of the document. - -```diff -- assert doc[4].sent_start == 1 -+ assert doc[4].is_sent_start == True -``` - - - ## Token.has_vector {#has_vector tag="property" model="vectors"} A boolean value indicating whether a word vector is associated with the token. @@ -412,11 +397,11 @@ The L2 norm of the token's vector representation. | -------------------------------------------- | ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `doc` | `Doc` | The parent document. | | `sent` 2.0.12 | `Span` | The sentence span that this token is a part of. | -| `text` | unicode | Verbatim text content. | -| `text_with_ws` | unicode | Text content, with trailing space character if present. | -| `whitespace_` | unicode | Trailing space character if present. | +| `text` | str | Verbatim text content. | +| `text_with_ws` | str | Text content, with trailing space character if present. | +| `whitespace_` | str | Trailing space character if present. | | `orth` | int | ID of the verbatim text content. | -| `orth_` | unicode | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. | +| `orth_` | str | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. | | `vocab` | `Vocab` | The vocab object of the parent `Doc`. | | `tensor` 2.1.7 | `ndarray` | The tokens's slice of the parent `Doc`'s tensor. 
| | `head` | `Token` | The syntactic parent, or "governor", of this token. | @@ -424,25 +409,25 @@ The L2 norm of the token's vector representation. | `right_edge` | `Token` | The rightmost token of this token's syntactic descendants. | | `i` | int | The index of the token within the parent document. | | `ent_type` | int | Named entity type. | -| `ent_type_` | unicode | Named entity type. | +| `ent_type_` | str | Named entity type. | | `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | -| `ent_iob_` | unicode | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. | +| `ent_iob_` | str | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. | | `ent_kb_id` 2.2 | int | Knowledge base ID that refers to the named entity this token is a part of, if any. | -| `ent_kb_id_` 2.2 | unicode | Knowledge base ID that refers to the named entity this token is a part of, if any. | +| `ent_kb_id_` 2.2 | str | Knowledge base ID that refers to the named entity this token is a part of, if any. | | `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | -| `ent_id_` | unicode | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | +| `ent_id_` | str | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | | `lemma` | int | Base form of the token, with no inflectional suffixes. | -| `lemma_` | unicode | Base form of the token, with no inflectional suffixes. 
| +| `lemma_` | str | Base form of the token, with no inflectional suffixes. | | `norm` | int | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | -| `norm_` | unicode | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | +| `norm_` | str | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | | `lower` | int | Lowercase form of the token. | -| `lower_` | unicode | Lowercase form of the token text. Equivalent to `Token.text.lower()`. | +| `lower_` | str | Lowercase form of the token text. Equivalent to `Token.text.lower()`. | | `shape` | int | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | -| `shape_` | unicode | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | +| `shape_` | str | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | | `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. 
| -| `prefix_` | unicode | A length-N substring from the start of the token. Defaults to `N=1`. | +| `prefix_` | str | A length-N substring from the start of the token. Defaults to `N=1`. | | `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. | -| `suffix_` | unicode | Length-N substring from the end of the token. Defaults to `N=3`. | +| `suffix_` | str | Length-N substring from the end of the token. Defaults to `N=3`. | | `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. | | `is_ascii` | bool | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. | | `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. | @@ -459,16 +444,16 @@ The L2 norm of the token's vector representation. | `like_url` | bool | Does the token resemble a URL? | | `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | | `like_email` | bool | Does the token resemble an email address? | -| `is_oov` | bool | Does the token have a word vector? | +| `is_oov` | bool | Is the token out-of-vocabulary (i.e. does it not have a word vector)? | | `is_stop` | bool | Is the token part of a "stop list"? | | `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | -| `pos_` | unicode | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | +| `pos_` | str | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | | `tag` | int | Fine-grained part-of-speech. | -| `tag_` | unicode | Fine-grained part-of-speech. | +| `tag_` | str | Fine-grained part-of-speech. | | `dep` | int | Syntactic dependency relation. | -| `dep_` | unicode | Syntactic dependency relation. 
| | `lang` | int | Language of the parent document's vocabulary. | -| `lang_` | unicode | Language of the parent document's vocabulary. | +| `lang_` | str | Language of the parent document's vocabulary. | | `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). | | `idx` | int | The character offset of the token within the parent document. | | `sentiment` | float | A scalar value indicating the positivity or negativity of the token. | diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index bdd094021..bd6c30d0f 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -11,22 +11,20 @@ menu: ### spacy.load {#spacy.load tag="function" model="any"} -Load a model via its [shortcut link](/usage/models#usage), the name of an -installed [model package](/usage/training#models-generating), a unicode path or -a `Path`-like object. spaCy will try resolving the load argument in this order. -If a model is loaded from a shortcut link or package name, spaCy will assume -it's a Python package and import it and call the model's own `load()` method. If -a model is loaded from a path, spaCy will assume it's a data directory, read the -language and pipeline settings off the meta.json and initialize the `Language` -class. The data will be loaded in via -[`Language.from_disk`](/api/language#from_disk). +Load a model using the name of an installed +[model package](/usage/training#models-generating), a string path or a +`Path`-like object. spaCy will try resolving the load argument in this order. If +a model is loaded from a model name, spaCy will assume it's a Python package and +import it and call the model's own `load()` method. If a model is loaded from a +path, spaCy will assume it's a data directory, read the language and pipeline +settings off the meta.json and initialize the `Language` class. 
The data will be +loaded in via [`Language.from_disk`](/api/language#from_disk). > #### Example > > ```python -> nlp = spacy.load("en") # shortcut link > nlp = spacy.load("en_core_web_sm") # package -> nlp = spacy.load("/path/to/en") # unicode path +> nlp = spacy.load("/path/to/en") # string path > nlp = spacy.load(Path("/path/to/en")) # pathlib Path > > nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"]) @@ -34,7 +32,7 @@ class. The data will be loaded in via | Name | Type | Description | | ----------- | ------------ | --------------------------------------------------------------------------------- | -| `name` | str / `Path` | Model to load, i.e. shortcut link, package name or path. | +| `name` | str / `Path` | Model to load, i.e. package name or path. | | `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | | **RETURNS** | `Language` | A `Language` object with the loaded model. | @@ -98,10 +96,10 @@ meta data as a dictionary instead, you can use the `meta` attribute on your > spacy.info("de", markdown=True) > ``` -| Name | Type | Description | -| ---------- | ---- | ------------------------------------------------------------- | -| `model` | str | A model, i.e. shortcut link, package name or path (optional). | -| `markdown` | bool | Print information as Markdown. | +| Name | Type | Description | +| ---------- | ---- | ------------------------------------------------ | +| `model` | str | A model, i.e. a package name or path (optional). | +| `markdown` | bool | Print information as Markdown. | ### spacy.explain {#spacy.explain tag="function"} @@ -375,12 +373,12 @@ loaded lazily, to avoid expensive setup code associated with the language data. ### util.load_model {#util.load_model tag="function" new="2"} -Load a model from a shortcut link, package or data path. If called with a -shortcut link or package name, spaCy will assume the model is a Python package -and import and call its `load()` method. 
If called with a path, spaCy will -assume it's a data directory, read the language and pipeline settings from the -meta.json and initialize a `Language` class. The model data will then be loaded -in via [`Language.from_disk()`](/api/language#from_disk). +Load a model from a package or data path. If called with a package name, spaCy +will assume the model is a Python package and import and call its `load()` +method. If called with a path, spaCy will assume it's a data directory, read the +language and pipeline settings from the meta.json and initialize a `Language` +class. The model data will then be loaded in via +[`Language.from_disk()`](/api/language#from_disk). > #### Example > @@ -392,7 +390,7 @@ in via [`Language.from_disk()`](/api/language#from_disk). | Name | Type | Description | | ------------- | ---------- | -------------------------------------------------------- | -| `name` | str | Package name, shortcut link or model path. | +| `name` | str | Package name or model path. | | `**overrides` | - | Specific overrides, like pipeline components to disable. | | **RETURNS** | `Language` | `Language` class with the loaded model. | diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index 939cc8655..a893a1fd2 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -124,7 +124,7 @@ Check whether a key has been mapped to a vector entry in the table. Add a key to the table, optionally setting a vector value as well. Keys can be mapped to an existing vector by setting `row`, or a new vector can be added. -When adding unicode keys, keep in mind that the `Vectors` class itself has no +When adding string keys, keep in mind that the `Vectors` class itself has no [`StringStore`](/api/stringstore), so you have to store the hash-to-string mapping separately. If you need to manage the strings, you should use the `Vectors` via the [`Vocab`](/api/vocab) class, e.g. `vocab.vectors`. 
diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index 2bca6c5b1..a77b9f244 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -30,7 +30,7 @@ Create the vocabulary. | `lookups` | `Lookups` | A [`Lookups`](/api/lookups) that stores the `lemma_\*`, `lexeme_norm` and other large lookup tables. Defaults to `None`. | | `lookups_extra` 2.3 | `Lookups` | A [`Lookups`](/api/lookups) that stores the optional `lexeme_cluster`/`lexeme_prob`/`lexeme_sentiment`/`lexeme_settings` lookup tables. Defaults to `None`. | | `oov_prob` | float | The default OOV probability. Defaults to `-20.0`. | -| `vectors_name` 2.2 | unicode | A name to identify the vectors table. | +| `vectors_name` 2.2 | str | A name to identify the vectors table. | | **RETURNS** | `Vocab` | The newly constructed object. | ## Vocab.\_\_len\_\_ {#len tag="method"} @@ -50,8 +50,8 @@ Get the current number of lexemes in the vocabulary. ## Vocab.\_\_getitem\_\_ {#getitem tag="method"} -Retrieve a lexeme, given an int ID or a unicode string. If a previously unseen -unicode string is given, a new lexeme is created and stored. +Retrieve a lexeme, given an int ID or a string. If a previously unseen string is +given, a new lexeme is created and stored. > #### Example > @@ -60,10 +60,10 @@ unicode string is given, a new lexeme is created and stored. > assert nlp.vocab[apple] == nlp.vocab["apple"] > ``` -| Name | Type | Description | -| -------------- | ------------- | ------------------------------------------------ | -| `id_or_string` | int / unicode | The hash value of a word, or its unicode string. | -| **RETURNS** | `Lexeme` | The lexeme indicated by the given ID. | +| Name | Type | Description | +| -------------- | --------- | ---------------------------------------- | +| `id_or_string` | int / str | The hash value of a word, or its string. | +| **RETURNS** | `Lexeme` | The lexeme indicated by the given ID. 
| ## Vocab.\_\_iter\_\_ {#iter tag="method"} @@ -182,7 +182,7 @@ subword features by average over ngrams of `orth` (introduced in spaCy `v2.1`). | Name | Type | Description | | ----------------------------------- | ---------------------------------------- | ---------------------------------------------------------------------------------------------- | -| `orth` | int / unicode | The hash value of a word, or its unicode string. | +| `orth` | int / str | The hash value of a word, or its string. | | `minn` 2.1 | int | Minimum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. | | `maxn` 2.1 | int | Maximum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. | | **RETURNS** | `numpy.ndarray[ndim=1, dtype='float32']` | A word vector. Size and shape are determined by the `Vocab.vectors` instance. | @@ -200,7 +200,7 @@ or hash value. | Name | Type | Description | | -------- | ---------------------------------------- | ------------------------------------------------ | -| `orth` | int / unicode | The hash value of a word, or its unicode string. | +| `orth` | int / str | The hash value of a word, or its string. | | `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | The vector to set. | ## Vocab.has_vector {#has_vector tag="method" new="2"} @@ -215,10 +215,10 @@ Words can be looked up by string or hash value. > vector = nlp.vocab.get_vector("apple") > ``` -| Name | Type | Description | -| ----------- | ------------- | ------------------------------------------------ | -| `orth` | int / unicode | The hash value of a word, or its unicode string. | -| **RETURNS** | bool | Whether the word has a vector. | +| Name | Type | Description | +| ----------- | --------- | ------------------------------------------------ | +| `orth` | int / str | The hash value of a word, or its string. | +| **RETURNS** | bool | Whether the word has a vector. 
| ## Vocab.to_disk {#to_disk tag="method" new="2"} diff --git a/website/docs/images/spacy-streamlit.png b/website/docs/images/spacy-streamlit.png new file mode 100644 index 000000000..8f617d49f Binary files /dev/null and b/website/docs/images/spacy-streamlit.png differ diff --git a/website/docs/usage/adding-languages.md b/website/docs/usage/adding-languages.md deleted file mode 100644 index 29a9a1c27..000000000 --- a/website/docs/usage/adding-languages.md +++ /dev/null @@ -1,675 +0,0 @@ ---- -title: Adding Languages -next: /usage/training -menu: - - ['Language Data', 'language-data'] - - ['Testing', 'testing'] - - ['Training', 'training'] ---- - -Adding full support for a language touches many different parts of the spaCy -library. This guide explains how to fit everything together, and points you to -the specific workflows for each component. - -> #### Working on spaCy's source -> -> To add a new language to spaCy, you'll need to **modify the library's code**. -> The easiest way to do this is to clone the -> [repository](https://github.com/explosion/spaCy/tree/master/) and **build -> spaCy from source**. For more information on this, see the -> [installation guide](/usage). Unlike spaCy's core, which is mostly written in -> Cython, all language data is stored in regular Python files. This means that -> you won't have to rebuild anything in between – you can simply make edits and -> reload spaCy to test them. - - - -
- -Obviously, there are lots of ways you can organize your code when you implement -your own language data. This guide will focus on how it's done within spaCy. For -full language support, you'll need to create a `Language` subclass, define -custom **language data**, like a stop list and tokenizer exceptions and test the -new tokenizer. Once the language is set up, you can **build the vocabulary**, -including word frequencies, Brown clusters and word vectors. Finally, you can -**train the tagger and parser**, and save the model to a directory. - -For some languages, you may also want to develop a solution for lemmatization -and morphological analysis. - -
- - - -- [Language data 101](#101) -- [The Language subclass](#language-subclass) -- [Stop words](#stop-words) -- [Tokenizer exceptions](#tokenizer-exceptions) -- [Norm exceptions](#norm-exceptions) -- [Lexical attributes](#lex-attrs) -- [Syntax iterators](#syntax-iterators) -- [Lemmatizer](#lemmatizer) -- [Tag map](#tag-map) -- [Morph rules](#morph-rules) -- [Testing the language](#testing) -- [Training](#training) - - - -
- -## Language data {#language-data} - -import LanguageData101 from 'usage/101/\_language-data.md' - - - -The individual components **expose variables** that can be imported within a -language module, and added to the language's `Defaults`. Some components, like -the punctuation rules, usually don't need much customization and can be imported -from the global rules. Others, like the tokenizer and norm exceptions, are very -specific and will make a big difference to spaCy's performance on the particular -language and training a language model. - -| Variable | Type | Description | -| ---------------------- | ----- | ---------------------------------------------------------------------------------------------------------- | -| `STOP_WORDS` | set | Individual words. | -| `TOKENIZER_EXCEPTIONS` | dict | Keyed by strings mapped to list of one dict per token with token attributes. | -| `TOKEN_MATCH` | regex | Regexes to match complex tokens, e.g. URLs. | -| `NORM_EXCEPTIONS` | dict | Keyed by strings, mapped to their norms. | -| `TOKENIZER_PREFIXES` | list | Strings or regexes, usually not customized. | -| `TOKENIZER_SUFFIXES` | list | Strings or regexes, usually not customized. | -| `TOKENIZER_INFIXES` | list | Strings or regexes, usually not customized. | -| `LEX_ATTRS` | dict | Attribute ID mapped to function. | -| `SYNTAX_ITERATORS` | dict | Iterator ID mapped to function. Currently only supports `'noun_chunks'`. | -| `TAG_MAP` | dict | Keyed by strings mapped to [Universal Dependencies](http://universaldependencies.org/u/pos/all.html) tags. | -| `MORPH_RULES` | dict | Keyed by strings mapped to a dict of their morphological features. | - -> #### Should I ever update the global data? -> -> Reusable language data is collected as atomic pieces in the root of the -> [`spacy.lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang) -> module. Often, when a new language is added, you'll find a pattern or symbol -> that's missing. 
Even if it isn't common in other languages, it might be best -> to add it to the shared language data, unless it has some conflicting -> interpretation. For instance, we don't expect to see guillemot quotation -> symbols (`»` and `«`) in English text. But if we do see them, we'd probably -> prefer the tokenizer to split them off. - - - -In order for the tokenizer to split suffixes, prefixes and infixes, spaCy needs -to know the language's character set. If the language you're adding uses -non-latin characters, you might need to define the required character classes in -the global -[`char_classes.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/char_classes.py). -For efficiency, spaCy uses hard-coded unicode ranges to define character -classes, the definitions of which can be found on -[Wikipedia](https://en.wikipedia.org/wiki/Unicode_block). If the language -requires very specific punctuation rules, you should consider overwriting the -default regular expressions with your own in the language's `Defaults`. - - - -### Creating a language subclass {#language-subclass} - -Language-specific code and resources should be organized into a sub-package of -spaCy, named according to the language's -[ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). For instance, -code and resources specific to Spanish are placed into a directory -`spacy/lang/es`, which can be imported as `spacy.lang.es`. - -To get started, you can check out the -[existing languages](https://github.com/explosion/spacy/tree/master/spacy/lang). 
-Here's what the class could look like: - -```python -### __init__.py (excerpt) -# import language-specific data -from .stop_words import STOP_WORDS -from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .lex_attrs import LEX_ATTRS - -from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...language import Language -from ...attrs import LANG -from ...util import update_exc - -# Create Defaults class in the module scope (necessary for pickling!) -class XxxxxDefaults(Language.Defaults): - lex_attr_getters = dict(Language.Defaults.lex_attr_getters) - lex_attr_getters[LANG] = lambda text: "xx" # language ISO code - - # Optional: replace flags with custom functions, e.g. like_num() - lex_attr_getters.update(LEX_ATTRS) - - # Merge base exceptions and custom tokenizer exceptions - tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) - stop_words = STOP_WORDS - -# Create actual Language class -class Xxxxx(Language): - lang = "xx" # Language ISO code - Defaults = XxxxxDefaults # Override defaults - -# Set default export – this allows the language class to be lazy-loaded -__all__ = ["Xxxxx"] -``` - - - -Some languages contain large volumes of custom data, like lemmatizer lookup -tables, or complex regular expression that are expensive to compute. As of spaCy -v2.0, `Language` classes are not imported on initialization and are only loaded -when you import them directly, or load a model that requires a language to be -loaded. To lazy-load languages in your application, you can use the -[`util.get_lang_class`](/api/top-level#util.get_lang_class) helper function with -the two-letter language code as its argument. - - - -### Stop words {#stop-words} - -A ["stop list"](https://en.wikipedia.org/wiki/Stop_words) is a classic trick -from the early days of information retrieval when search was largely about -keyword presence and absence. It is still sometimes useful today to filter out -common words from a bag-of-words model. 
To improve readability, `STOP_WORDS` are -separated by spaces and newlines, and added as a multiline string. - -> #### What does spaCy consider a stop word? -> -> There's no particularly principled logic behind what words should be added to -> the stop list. Make a list that you think might be useful to people and is -> likely to be unsurprising. As a rule of thumb, words that are very rare are -> unlikely to be useful stop words. - -```python -### Example -STOP_WORDS = set(""" -a about above across after afterwards again against all almost alone along -already also although always am among amongst amount an and another any anyhow -anyone anything anyway anywhere are around as at - -back be became because become becomes becoming been before beforehand behind -being below beside besides between beyond both bottom but by -""".split()) -``` - - - -When adding stop words from an online source, always **include the link** in a -comment. Make sure to **proofread** and double-check the words carefully. A lot -of the lists available online have been passed around for years and often -contain mistakes, like unicode errors or random words that have once been added -for a specific use case, but don't actually qualify. - - - -### Tokenizer exceptions {#tokenizer-exceptions} - -spaCy's [tokenization algorithm](/usage/linguistic-features#how-tokenizer-works) -lets you deal with whitespace-delimited chunks separately. This makes it easy to -define special-case rules, without worrying about how they interact with the -rest of the tokenizer. Whenever the key string is matched, the special-case rule -is applied, giving the defined sequence of tokens. - -Tokenizer exceptions can be added in the following format: - -```python -### tokenizer_exceptions.py (excerpt) -TOKENIZER_EXCEPTIONS = { - "don't": [ - {ORTH: "do"}, - {ORTH: "n't", NORM: "not"}] -} -``` - - - -If an exception consists of more than one token, the `ORTH` values combined -always need to **match the original string**. 
The way the original string is -split up can be pretty arbitrary sometimes – for example `"gonna"` is split into -`"gon"` (norm "going") and `"na"` (norm "to"). Because of how the tokenizer -works, it's currently not possible to split single-letter strings into multiple -tokens. - - - -> #### Generating tokenizer exceptions -> -> Keep in mind that generating exceptions only makes sense if there's a clearly -> defined and **finite number** of them, like common contractions in English. -> This is not always the case – in Spanish for instance, infinitive or -> imperative reflexive verbs and pronouns are one token (e.g. "vestirme"). In -> cases like this, spaCy shouldn't be generating exceptions for _all verbs_. -> Instead, this will be handled at a later stage after part-of-speech tagging -> and lemmatization. - -When adding the tokenizer exceptions to the `Defaults`, you can use the -[`update_exc`](/api/top-level#util.update_exc) helper function to merge them -with the global base exceptions (including one-letter abbreviations and -emoticons). The function performs a basic check to make sure exceptions are -provided in the correct format. It can take any number of exceptions dicts as -its arguments, and will update and overwrite the exception in this order. For -example, if your language's tokenizer exceptions include a custom tokenization -pattern for "a.", it will overwrite the base exceptions with the language's -custom one. - -```python -### Example -from ...util import update_exc - -BASE_EXCEPTIONS = {"a.": [{ORTH: "a."}], ":)": [{ORTH: ":)"}]} -TOKENIZER_EXCEPTIONS = {"a.": [{ORTH: "a.", NORM: "all"}]} - -tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) -# {"a.": [{ORTH: "a.", NORM: "all"}], ":)": [{ORTH: ":)"}]} -``` - -### Norm exceptions {#norm-exceptions new="2"} - -In addition to `ORTH`, tokenizer exceptions can also set a `NORM` attribute. 
-This is useful to specify a normalized version of the token – for example, the -norm of "n't" is "not". By default, a token's norm equals its lowercase text. If -the lowercase spelling of a word exists, norms should always be in lowercase. - -> #### Norms vs. lemmas -> -> ```python -> doc = nlp("I'm gonna realise") -> norms = [token.norm_ for token in doc] -> lemmas = [token.lemma_ for token in doc] -> assert norms == ["i", "am", "going", "to", "realize"] -> assert lemmas == ["i", "be", "go", "to", "realise"] -> ``` - -spaCy usually tries to normalize words with different spellings to a single, -common spelling. This has no effect on any other token attributes, or -tokenization in general, but it ensures that **equivalent tokens receive similar -representations**. This can improve the model's predictions on words that -weren't common in the training data, but are equivalent to other words – for -example, "realise" and "realize", or "thx" and "thanks". - -Similarly, spaCy also includes -[global base norms](https://github.com/explosion/spaCy/tree/master/spacy/lang/norm_exceptions.py) -for normalizing different styles of quotation marks and currency symbols. Even -though `$` and `€` are very different, spaCy normalizes them both to `$`. This -way, they'll always be seen as similar, no matter how common they were in the -training data. - -As of spaCy v2.3, language-specific norm exceptions are provided as a -JSON dictionary in the package -[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) rather -than in the main library. For a full example, see -[`en_lexeme_norm.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_lexeme_norm.json). 
- -```json -### Example -{ - "cos": "because", - "fav": "favorite", - "accessorise": "accessorize", - "accessorised": "accessorized" -} -``` - -If you're adding tables for a new languages, be sure to add the tables to -[`spacy_lookups_data/__init__.py`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/__init__.py) -and register the entry point under `spacy_lookups` in -[`setup.cfg`](https://github.com/explosion/spacy-lookups-data/blob/master/setup.cfg). - -Alternatively, you can initialize your language [`Vocab`](/api/vocab) with a -[`Lookups`](/api/lookups) object that includes the table `lexeme_norm`. - - - -Previously in spaCy v2.0-v2.2, norm exceptions were provided as a simple python -dictionary. For more examples, see the English -[`norm_exceptions.py`](https://github.com/explosion/spaCy/tree/v2.2.x/spacy/lang/en/norm_exceptions.py). - -```python -### Example -NORM_EXCEPTIONS = { - "cos": "because", - "fav": "favorite", - "accessorise": "accessorize", - "accessorised": "accessorized" -} -``` - -To add the custom norm exceptions lookup table, you can use the `add_lookups()` -helper functions. It takes the default attribute getter function as its first -argument, plus a variable list of dictionaries. If a string's norm is found in -one of the dictionaries, that value is used – otherwise, the default function is -called and the token is assigned its default norm. - -```python -lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], - NORM_EXCEPTIONS, BASE_NORMS) -``` - -The order of the dictionaries is also the lookup order – so if your language's -norm exceptions overwrite any of the global exceptions, they should be added -first. Also note that the tokenizer exceptions will always have priority over -the attribute getters. 
- - - -### Lexical attributes {#lex-attrs new="2"} - -spaCy provides a range of [`Token` attributes](/api/token#attributes) that -return useful information on that token – for example, whether it's uppercase or -lowercase, a left or right punctuation mark, or whether it resembles a number or -email address. Most of these functions, like `is_lower` or `like_url` should be -language-independent. Others, like `like_num` (which includes both digits and -number words), requires some customization. - -> #### Best practices -> -> Keep in mind that those functions are only intended to be an approximation. -> It's always better to prioritize simplicity and performance over covering very -> specific edge cases. -> -> English number words are pretty simple, because even large numbers consist of -> individual tokens, and we can get away with splitting and matching strings -> against a list. In other languages, like German, "two hundred and thirty-four" -> is one word, and thus one token. Here, it's best to match a string against a -> list of number word fragments (instead of a technically almost infinite list -> of possible number words). 
- -Here's an example from the English -[`lex_attrs.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/en/lex_attrs.py): - -```python -### lex_attrs.py -_num_words = ["zero", "one", "two", "three", "four", "five", "six", "seven", - "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", - "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", "twenty", - "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety", - "hundred", "thousand", "million", "billion", "trillion", "quadrillion", - "gajillion", "bazillion"] - -def like_num(text): - text = text.replace(",", "").replace(".", "") - if text.isdigit(): - return True - if text.count("/") == 1: - num, denom = text.split("/") - if num.isdigit() and denom.isdigit(): - return True - if text.lower() in _num_words: - return True - return False - -LEX_ATTRS = { - LIKE_NUM: like_num -} -``` - -By updating the default lexical attributes with a custom `LEX_ATTRS` dictionary -in the language's defaults via `lex_attr_getters.update(LEX_ATTRS)`, only the -new custom functions are overwritten. - -### Syntax iterators {#syntax-iterators} - -Syntax iterators are functions that compute views of a `Doc` object based on its -syntax. At the moment, this data is only used for extracting -[noun chunks](/usage/linguistic-features#noun-chunks), which are available as -the [`Doc.noun_chunks`](/api/doc#noun_chunks) property. Because base noun -phrases work differently across languages, the rules to compute them are part of -the individual language's data. If a language does not include a noun chunks -iterator, the property won't be available. 
For examples, see the existing syntax -iterators: - -> #### Noun chunks example -> -> ```python -> doc = nlp("A phrase with another phrase occurs.") -> chunks = list(doc.noun_chunks) -> assert chunks[0].text == "A phrase" -> assert chunks[1].text == "another phrase" -> ``` - -| Language | Code | Source | -| ---------------- | ---- | ----------------------------------------------------------------------------------------------------------------- | -| English | `en` | [`lang/en/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/en/syntax_iterators.py) | -| German | `de` | [`lang/de/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/de/syntax_iterators.py) | -| French | `fr` | [`lang/fr/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/fr/syntax_iterators.py) | -| Spanish | `es` | [`lang/es/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/es/syntax_iterators.py) | -| Greek | `el` | [`lang/el/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/el/syntax_iterators.py) | -| Norwegian Bokmål | `nb` | [`lang/nb/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/nb/syntax_iterators.py) | -| Swedish | `sv` | [`lang/sv/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/sv/syntax_iterators.py) | -| Indonesian | `id` | [`lang/id/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/id/syntax_iterators.py) | -| Persian | `fa` | [`lang/fa/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/fa/syntax_iterators.py) | - -### Lemmatizer {#lemmatizer new="2"} - -As of v2.0, spaCy supports simple lookup-based lemmatization. This is usually -the quickest and easiest way to get started. The data is stored in a dictionary -mapping a string to its lemma. To determine a token's lemma, spaCy simply looks -it up in the table. 
Here's an example from the Spanish language data: - -```json -### es_lemma_lookup.json (excerpt) -{ - "aba": "abar", - "ababa": "abar", - "ababais": "abar", - "ababan": "abar", - "ababanes": "ababán", - "ababas": "abar", - "ababoles": "ababol", - "ababábites": "ababábite" -} -``` - -#### Adding JSON resources {#lemmatizer-resources new="2.2"} - -As of v2.2, resources for the lemmatizer are stored as JSON and have been moved -to a separate repository and package, -[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). The -package exposes the data files via language-specific -[entry points](/usage/saving-loading#entry-points) that spaCy reads when -constructing the `Vocab` and [`Lookups`](/api/lookups). This allows easier -access to the data, serialization with the models and file compression on disk -(so your spaCy installation is smaller). If you want to use the lookup tables -without a pretrained model, you have to explicitly install spaCy with lookups -via `pip install spacy[lookups]` or by installing -[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) in the -same environment as spaCy. - -### Tag map {#tag-map} - -Most treebanks define a custom part-of-speech tag scheme, striking a balance -between level of detail and ease of prediction. While it's useful to have custom -tagging schemes, it's also useful to have a common scheme, to which the more -specific tags can be related. The tagger can learn a tag scheme with any -arbitrary symbols. However, you need to define how those symbols map down to the -[Universal Dependencies tag set](http://universaldependencies.org/u/pos/all.html). -This is done by providing a tag map. - -The keys of the tag map should be **strings in your tag set**. The values should -be a dictionary. The dictionary must have an entry POS whose value is one of the -[Universal Dependencies](http://universaldependencies.org/u/pos/all.html) tags. 
-Optionally, you can also include morphological features or other token -attributes in the tag map as well. This allows you to do simple -[rule-based morphological analysis](/usage/linguistic-features#rule-based-morphology). - -```python -### Example -from ..symbols import POS, NOUN, VERB, DET - -TAG_MAP = { - "NNS": {POS: NOUN, "Number": "plur"}, - "VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"}, - "DT": {POS: DET} -} -``` - -### Morph rules {#morph-rules} - -The morphology rules let you set token attributes such as lemmas, keyed by the -extended part-of-speech tag and token text. The morphological features and their -possible values are language-specific and based on the -[Universal Dependencies scheme](http://universaldependencies.org). - -```python -### Example -from ..symbols import LEMMA - -MORPH_RULES = { - "VBZ": { - "am": {LEMMA: "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"}, - "are": {LEMMA: "be", "VerbForm": "Fin", "Person": "Two", "Tense": "Pres", "Mood": "Ind"}, - "is": {LEMMA: "be", "VerbForm": "Fin", "Person": "Three", "Tense": "Pres", "Mood": "Ind"}, - "'re": {LEMMA: "be", "VerbForm": "Fin", "Person": "Two", "Tense": "Pres", "Mood": "Ind"}, - "'s": {LEMMA: "be", "VerbForm": "Fin", "Person": "Three", "Tense": "Pres", "Mood": "Ind"} - } -} -``` - -In the example of `"am"`, the attributes look like this: - -| Attribute | Description | -| ------------------- | ------------------------------------------------------------------------------------------------------------------------------ | -| `LEMMA: "be"` | Base form, e.g. "to be". | -| `"VerbForm": "Fin"` | Finite verb. Finite verbs have a subject and can be the root of an independent clause – "I am." is a valid, complete sentence. | -| `"Person": "One"` | First person, i.e. "**I** am". | -| `"Tense": "Pres"` | Present tense, i.e. actions that are happening right now or actions that usually happen. | -| `"Mood": "Ind"` | Indicative, i.e. 
something happens, has happened or will happen (as opposed to imperative or conditional). | - - - -The morphological attributes are currently **not all used by spaCy**. Full -integration is still being developed. In the meantime, it can still be useful to -add them, especially if the language you're adding includes important -distinctions and special cases. This ensures that as soon as full support is -introduced, your language will be able to assign all possible attributes. - - - -## Testing the new language {#testing} - -Before using the new language or submitting a -[pull request](https://github.com/explosion/spaCy/pulls) to spaCy, you should -make sure it works as expected. This is especially important if you've added -custom regular expressions for token matching or punctuation – you don't want to -be causing regressions. - - - -spaCy uses the [pytest framework](https://docs.pytest.org/en/latest/) for -testing. For more details on how the tests are structured and best practices for -writing your own tests, see our -[tests documentation](https://github.com/explosion/spaCy/tree/master/spacy/tests). - - - -### Writing language-specific tests {#testing-custom} - -It's recommended to always add at least some tests with examples specific to the -language. Language tests should be located in -[`tests/lang`](https://github.com/explosion/spaCy/tree/master/spacy/tests/lang) -in a directory named after the language ID. You'll also need to create a fixture -for your tokenizer in the -[`conftest.py`](https://github.com/explosion/spaCy/tree/master/spacy/tests/conftest.py). -Always use the [`get_lang_class`](/api/top-level#util.get_lang_class) helper -function within the fixture, instead of importing the class at the top of the -file. This will load the language data only when it's needed. (Otherwise, _all -data_ would be loaded every time you run a test.) 
- -```python -@pytest.fixture -def en_tokenizer(): - return util.get_lang_class("en").Defaults.create_tokenizer() -``` - -When adding test cases, always -[`parametrize`](https://github.com/explosion/spaCy/tree/master/spacy/tests#parameters) -them – this will make it easier for others to add more test cases without having -to modify the test itself. You can also add parameter tuples, for example, a -test sentence and its expected length, or a list of expected tokens. Here's an -example of an English tokenizer test for combinations of punctuation and -abbreviations: - -```python -### Example test -@pytest.mark.parametrize('text,length', [ - ("The U.S. Army likes Shock and Awe.", 8), - ("U.N. regulations are not a part of their concern.", 10), - ("“Isn't it?”", 6)]) -def test_en_tokenizer_handles_punct_abbrev(en_tokenizer, text, length): - tokens = en_tokenizer(text) - assert len(tokens) == length -``` - -## Training a language model {#training} - -Much of spaCy's functionality requires models to be trained from labeled data. -For instance, in order to use the named entity recognizer, you need to first -train a model on text annotated with examples of the entities you want to -recognize. The parser, part-of-speech tagger and text categorizer all also -require models to be trained from labeled examples. The word vectors, word -probabilities and word clusters also require training, although these can be -trained from unlabeled text, which tends to be much easier to collect. - -### Creating a vocabulary file {#vocab-file} - -spaCy expects that common words will be cached in a [`Vocab`](/api/vocab) -instance. The vocabulary caches lexical features. spaCy loads the vocabulary -from binary data, in order to keep loading efficient. The easiest way to save -out a new binary vocabulary file is to use the `spacy init-model` command, which -expects a JSONL file with words and their lexical attributes. 
See the docs on -the [vocab JSONL format](/api/annotation#vocab-jsonl) for details. - -#### Training the word vectors {#word-vectors} - -[Word2vec](https://en.wikipedia.org/wiki/Word2vec) and related algorithms let -you train useful word similarity models from unlabeled text. This is a key part -of using deep learning for NLP with limited labeled data. The vectors are also -useful by themselves – they power the `.similarity` methods in spaCy. For best -results, you should pre-process the text with spaCy before training the Word2vec -model. This ensures your tokenization will match. You can use our -[word vectors training script](https://github.com/explosion/spacy/tree/master/bin/train_word_vectors.py), -which pre-processes the text with your language-specific tokenizer and trains -the model using [Gensim](https://radimrehurek.com/gensim/). The `vectors.bin` -file should consist of one word and vector per line. - -```python -https://github.com/explosion/spacy/tree/master/bin/train_word_vectors.py -``` - -If you don't have a large sample of text available, you can also convert word -vectors produced by a variety of other tools into spaCy's format. See the docs -on [converting word vectors](/usage/vectors-similarity#converting) for details. - -### Creating or converting a training corpus {#training-corpus} - -The easiest way to train spaCy's tagger, parser, entity recognizer or text -categorizer is to use the [`spacy train`](/api/cli#train) command-line utility. -In order to use this, you'll need training and evaluation data in the -[JSON format](/api/annotation#json-input) spaCy expects for training. - -If your data is in one of the supported formats, the easiest solution might be -to use the [`spacy convert`](/api/cli#convert) command-line utility. 
This -supports several popular formats, including the IOB format for named entity -recognition, the JSONL format produced by our annotation tool -[Prodigy](https://prodi.gy), and the -[CoNLL-U](http://universaldependencies.org/docs/format.html) format used by the -[Universal Dependencies](http://universaldependencies.org/) corpus. - -One thing to keep in mind is that spaCy expects to train its models from **whole -documents**, not just single sentences. If your corpus only contains single -sentences, spaCy's models will never learn to expect multi-sentence documents, -leading to low performance on real text. To mitigate this problem, you can use -the `-n` argument to the `spacy convert` command, to merge some of the sentences -into longer pseudo-documents. - -### Training the tagger and parser {#train-tagger-parser} - -Once you have your training and evaluation data in the format spaCy expects, you -can train your model use the using spaCy's [`train`](/api/cli#train) command. -Note that training statistical models still involves a degree of -trial-and-error. You may need to tune one or more settings, also called -"hyper-parameters", to achieve optimal performance. See the -[usage guide on training](/usage/training#tagger-parser) for more details. diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md index 473ffded8..1e9473a5d 100644 --- a/website/docs/usage/index.md +++ b/website/docs/usage/index.md @@ -15,21 +15,9 @@ spaCy is compatible with **64-bit CPython 3.6+** and runs on **Unix/Linux**, > #### 📖 Looking for the old docs? > -> To help you make the transition from v1.x to v2.0, we've uploaded the old -> website to [**legacy.spacy.io**](https://legacy.spacy.io/docs). Wherever -> possible, the new docs also include notes on features that have changed in -> v2.0, and features that were introduced in the new version. 
- - - -We can't yet ship pre-compiled binary wheels for spaCy that work on Python 3.8, -as we're still waiting for our CI providers and other tooling to support it. -This means that in order to run spaCy on Python 3.8, you'll need -[a compiler installed](#source) and compile the library and its Cython -dependencies locally. If this is causing problems for you, the easiest solution -is to **use Python 3.7** in the meantime. - - +> To help you make the transition from v2.x to v3.0, we've uploaded the old +> website to [**v2.spacy.io**](https://v2.spacy.io/docs). To see what's changed +> and how to migrate, see the [v3.0 guide](/usage/v3). ## Quickstart {hidden="true"} @@ -95,29 +83,29 @@ and pull requests to the recipe and setup are always appreciated. ### Upgrading spaCy {#upgrading} -> #### Upgrading from v1 to v2 +> #### Upgrading from v2 to v3 > > Although we've tried to keep breaking changes to a minimum, upgrading from -> spaCy v1.x to v2.x may still require some changes to your code base. For -> details see the sections on [backwards incompatibilities](/usage/v2#incompat) -> and [migrating](/usage/v2#migrating). Also remember to download the new +> spaCy v2.x to v3.x may still require some changes to your code base. For +> details see the sections on [backwards incompatibilities](/usage/v3#incompat) +> and [migrating](/usage/v3#migrating). Also remember to download the new > models, and retrain your own models. When updating to a newer version of spaCy, it's generally recommended to start with a clean virtual environment. If you're upgrading to a new major version, make sure you have the latest **compatible models** installed, and that there -are no old shortcut links or incompatible model packages left over in your -environment, as this can often lead to unexpected results and errors. If you've -trained your own models, keep in mind that your train and runtime inputs must -match. This means you'll have to **retrain your models** with the new version.
+are no old and incompatible model packages left over in your environment, as +this can often lead to unexpected results and errors. If you've trained your own +models, keep in mind that your train and runtime inputs must match. This means +you'll have to **retrain your models** with the new version. -As of v2.0, spaCy also provides a [`validate`](/api/cli#validate) command, which -lets you verify that all installed models are compatible with your spaCy -version. If incompatible models are found, tips and installation instructions -are printed. The command is also useful to detect out-of-sync model links -resulting from links created in different virtual environments. It's recommended -to run the command with `python -m` to make sure you're executing the correct -version of spaCy. +spaCy also provides a [`validate`](/api/cli#validate) command, which lets you +verify that all installed models are compatible with your spaCy version. If +incompatible models are found, tips and installation instructions are printed. +The command is also useful to detect out-of-sync model links resulting from +links created in different virtual environments. It's recommended to run the +command with `python -m` to make sure you're executing the correct version of +spaCy. ```bash pip install -U spacy @@ -268,24 +256,6 @@ language's `Language` class instead, for example - - -``` -OSError: symbolic link privilege not held -``` - -To create [shortcut links](/usage/models#usage) that let you load models by -name, spaCy creates a symbolic link in the `spacy/data` directory. This means -your user needs permission to do this. The above error mostly occurs when doing -a system-wide installation, which will create the symlinks in a system -directory. 
 Run the `download` or `link` command as administrator (on Windows, -you can either right-click on your terminal or shell and select "Run as -Administrator"), set the `--user` flag when installing a model or use a virtual -environment to install spaCy in a user directory, instead of doing a system-wide -installation. - - - ``` @@ -363,14 +333,12 @@ ImportError: No module named 'en_core_web_sm' ``` As of spaCy v1.7, all models can be installed as Python packages. This means -that they'll become importable modules of your application. When creating -[shortcut links](/usage/models#usage), spaCy will also try to import the model -to load its meta data. If this fails, it's usually a sign that the package is -not installed in the current environment. Run `pip list` or `pip freeze` to -check which model packages you have installed, and install the -[correct models](/models) if necessary. If you're importing a model manually at -the top of a file, make sure to use the name of the package, not the shortcut -link you've created. +that they'll become importable modules of your application. If the import fails, +it's usually a sign that the package is not installed in the current environment. +Run `pip list` or `pip freeze` to check which model packages you have installed, +and install the [correct models](/models) if necessary. If you're importing a +model manually at the top of a file, make sure to use the full name of the +package.
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 2c927555f..e3d83c296 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -2,13 +2,14 @@ title: Linguistic Features next: /usage/rule-based-matching menu: + - ['Tokenization', 'tokenization'] - ['POS Tagging', 'pos-tagging'] - ['Dependency Parse', 'dependency-parse'] - ['Named Entities', 'named-entities'] - ['Entity Linking', 'entity-linking'] - - ['Tokenization', 'tokenization'] - ['Merging & Splitting', 'retokenization'] - ['Sentence Segmentation', 'sbd'] + - ['Language data', 'language-data'] --- Processing raw text intelligently is difficult: most words are rare, and it's @@ -297,8 +298,8 @@ different languages, see the ### Visualizing dependencies {#displacy} The best way to understand spaCy's dependency parser is interactively. To make -this easier, spaCy v2.0+ comes with a visualization module. You can pass a `Doc` -or a list of `Doc` objects to displaCy and run +this easier, spaCy comes with a visualization module. You can pass a `Doc` or a +list of `Doc` objects to displaCy and run [`displacy.serve`](/api/top-level#displacy.serve) to run the web server, or [`displacy.render`](/api/top-level#displacy.render) to generate the raw markup. If you want to know how to write rules that hook into some type of syntactic @@ -339,25 +340,6 @@ nlp = English().from_disk("/model", disable=["parser"]) doc = nlp("I don't want parsed", disable=["parser"]) ``` - - -Since spaCy v2.0 comes with better support for customizing the processing -pipeline components, the `parser` keyword argument has been replaced with -`disable`, which takes a list of -[pipeline component names](/usage/processing-pipelines). This lets you disable -both default and custom components when loading a model, or initializing a -Language class via [`from_disk`](/api/language#from_disk). 
- -```diff -+ nlp = spacy.load("en_core_web_sm", disable=["parser"]) -+ doc = nlp("I don't want parsed", disable=["parser"]) - -- nlp = spacy.load("en_core_web_sm", parser=False) -- doc = nlp("I don't want parsed", parse=False) -``` - - - ## Named Entity Recognition {#named-entities} spaCy features an extremely fast statistical entity recognition system, that @@ -551,8 +533,8 @@ The [displaCy ENT visualizer](https://explosion.ai/demos/displacy-ent) lets you explore an entity recognition model's behavior interactively. If you're training a model, it's very useful to run the visualization yourself. To help -you do that, spaCy v2.0+ comes with a visualization module. You can pass a `Doc` -or a list of `Doc` objects to displaCy and run +you do that, spaCy comes with a visualization module. You can pass a `Doc` or a +list of `Doc` objects to displaCy and run [`displacy.serve`](/api/top-level#displacy.serve) to run the web server, or [`displacy.render`](/api/top-level#displacy.render) to generate the raw markup. @@ -789,8 +771,8 @@ The algorithm can be summarized as follows: token. 3. Check whether we have an explicitly defined special case for this substring. If we do, use it. -4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to - #2, so that the token match and special cases always get priority. +4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2, + so that the token match and special cases always get priority. 5. If we didn't consume a prefix, try to consume a suffix and then go back to #2. 6. If we can't consume a prefix or a suffix, look for a URL match. @@ -843,7 +825,7 @@ domain. There are six things you may need to define: be split, overriding the infix rules. Useful for things like numbers. 6. An optional boolean function `url_match`, which is similar to `token_match` except that prefixes and suffixes are removed before applying the match. 
- + In spaCy v2.2.2-v2.2.4, the `token_match` was equivalent to the `url_match` @@ -1470,13 +1452,8 @@ doc = nlp(text) print("After:", [sent.text for sent in doc.sents]) ``` -## Rule-based matching {#rule-based-matching hidden="true"} +## Language data {#language-data} -
- +import LanguageData101 from 'usage/101/\_language-data.md' -The documentation on rule-based matching -[has moved to its own page](/usage/rule-based-matching). - - -
+ diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index b11e6347a..db8d0ee28 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -17,10 +17,10 @@ your file system. > #### Important note > -> If you're upgrading to spaCy v1.7.x or v2.x, you need to **download the new -> models**. If you've trained statistical models that use spaCy's annotations, -> you should **retrain your models** after updating spaCy. If you don't retrain, -> you may suffer train/test skew, which might decrease your accuracy. +> If you're upgrading to spaCy v3.x, you need to **download the new models**. If +> you've trained statistical models that use spaCy's annotations, you should +> **retrain your models** after updating spaCy. If you don't retrain, you may +> suffer train/test skew, which might decrease your accuracy. ## Quickstart {hidden="true"} @@ -74,10 +74,10 @@ import Languages from 'widgets/languages.js' > nlp = get_lang_class('xx') > ``` -As of v2.0, spaCy supports models trained on more than one language. This is -especially useful for named entity recognition. The language ID used for -multi-language or language-neutral models is `xx`. The language class, a generic -subclass containing only the base language data, can be found in +spaCy also supports models trained on more than one language. This is especially +useful for named entity recognition. The language ID used for multi-language or +language-neutral models is `xx`. The language class, a generic subclass +containing only the base language data, can be found in [`lang/xx`](https://github.com/explosion/spaCy/tree/master/spacy/lang/xx). 
To load your model with the neutral, multi-language class, simply set @@ -134,11 +134,11 @@ $ pip install https://github.com/honnibal/pkuseg-python/archive/master.zip The `meta` argument of the `Chinese` language class supports the following following tokenizer config settings: -| Name | Type | Description | -| ------------------ | ------- | ---------------------------------------------------------------------------------------------------- | -| `pkuseg_model` | unicode | **Required:** Name of a model provided by `pkuseg` or the path to a local model directory. | -| `pkuseg_user_dict` | unicode | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. | -| `require_pkuseg` | bool | Overrides all `jieba` settings (optional but strongly recommended). | +| Name | Type | Description | +| ------------------ | ---- | ---------------------------------------------------------------------------------------------------- | +| `pkuseg_model` | str | **Required:** Name of a model provided by `pkuseg` or the path to a local model directory. | +| `pkuseg_user_dict` | str | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. | +| `require_pkuseg` | bool | Overrides all `jieba` settings (optional but strongly recommended). | ```python ### Examples @@ -209,10 +209,9 @@ nlp = Chinese(meta={"tokenizer": {"config": {"pkuseg_model": "/path/to/pkuseg_mo The Japanese language class uses [SudachiPy](https://github.com/WorksApplications/SudachiPy) for word segmentation and part-of-speech tagging. The default Japanese language class and -the provided Japanese models use SudachiPy split mode `A`. - -The `meta` argument of the `Japanese` language class can be used to configure -the split mode to `A`, `B` or `C`. +the provided Japanese models use SudachiPy split mode `A`. The `meta` argument +of the `Japanese` language class can be used to configure the split mode to `A`, +`B` or `C`. 
@@ -224,34 +223,31 @@ used for training the current [Japanese models](/models/ja). ## Installing and using models {#download} -> #### Downloading models in spaCy < v1.7 -> -> In older versions of spaCy, you can still use the old download commands. This -> will download and install the models into the `spacy/data` directory. -> -> ```bash -> python -m spacy.en.download all -> python -m spacy.de.download all -> python -m spacy.en.download glove -> ``` -> -> The old models are also -> [attached to the v1.6.0 release](https://github.com/explosion/spaCy/tree/v1.6.0). -> To download and install them manually, unpack the archive, drop the contained -> directory into `spacy/data`. - The easiest way to download a model is via spaCy's [`download`](/api/cli#download) command. It takes care of finding the best-matching model compatible with your spaCy installation. +> #### Important note for v3.0 +> +> Note that as of spaCy v3.0, model shortcut links that create (potentially +> brittle) symlinks in your spaCy installation are **deprecated**. To download +> and load an installed model, use its full name: +> +> ```diff +> - python -m spacy download en +> + python -m spacy download en_core_web_sm +> ``` +> +> ```diff +> - nlp = spacy.load("en") +> + nlp = spacy.load("en_core_web_sm") +> ``` + ```bash # Download best-matching version of specific model for your spaCy installation python -m spacy download en_core_web_sm -# Out-of-the-box: download best-matching default model and create shortcut link -python -m spacy download en - -# Download exact model version (doesn't create shortcut link) +# Download exact model version python -m spacy download en_core_web_sm-2.2.0 --direct ``` @@ -269,18 +265,6 @@ nlp = spacy.load("en_core_web_sm") doc = nlp("This is a sentence.") ``` - - -If you're downloading the models using a shortcut like `"en"`, spaCy will create -a symlink within the `spacy/data` directory. This means that your user needs the -**required permissions**. 
If you've installed spaCy to a system directory and -don't have admin privileges, the model linking may fail. The easiest solution is -to re-run the command as admin, set the `--user` flag or use a virtual -environment. For more info on this, see the -[troubleshooting guide](/usage/#symlink-privilege). - - - ### Installation via pip {#download-pip} To download a model directly using [pip](https://pypi.python.org/pypi/pip), @@ -291,15 +275,14 @@ click on the archive link and copy it to your clipboard. ```bash # With external URL -pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz +pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz # With local file -pip install /Users/you/en_core_web_sm-2.2.0.tar.gz +pip install /Users/you/en_core_web_sm-3.0.0.tar.gz ``` By default, this will install the model into your `site-packages` directory. You -can then use `spacy.load()` to load it via its package name, create a -[shortcut link](#usage-link) to assign it a custom name, or +can then use `spacy.load()` to load it via its package name or [import it](#usage-import) explicitly as a module. If you need to download models as part of an automated process, we recommend using pip with a direct link, instead of relying on spaCy's [`download`](/api/cli#download) command. @@ -319,29 +302,38 @@ model data. ```yaml ### Directory structure {highlight="7"} -└── en_core_web_md-2.2.0.tar.gz # downloaded archive +└── en_core_web_md-3.0.0.tar.gz # downloaded archive ├── meta.json # model meta data ├── setup.py # setup file for pip installation └── en_core_web_md # 📦 model package ├── __init__.py # init for pip installation ├── meta.json # model meta data - └── en_core_web_md-2.2.0 # model data + └── en_core_web_md-3.0.0 # model data ``` You can place the **model package directory** anywhere on your local file -system. 
To use it with spaCy, assign it a name by creating a shortcut link for -the data directory. +system. ### Using models with spaCy {#usage} To load a model, use [`spacy.load`](/api/top-level#spacy.load) with the model's -shortcut link, package name or a path to the data directory: +package name or a path to the data directory: + +> #### Important note for v3.0 +> +> Note that as of spaCy v3.0, model shortcut links that create (potentially +> brittle) symlinks in your spaCy installation are **deprecated**. To load an +> installed model, use its full name: +> +> ```diff +> - nlp = spacy.load("en") +> + nlp = spacy.load("en_core_web_sm") +> ``` ```python import spacy nlp = spacy.load("en_core_web_sm") # load model package "en_core_web_sm" nlp = spacy.load("/path/to/en_core_web_sm") # load package from a directory -nlp = spacy.load("en") # load model with shortcut link "en" doc = nlp("This is a sentence.") ``` @@ -356,55 +348,6 @@ will return the model's version. -### Using custom shortcut links {#usage-link} - -While previous versions of spaCy required you to maintain a data directory -containing the models for each installation, you can now choose **how and where -you want to keep your data**. For example, you could download all models -manually and put them into a local directory. Whenever your spaCy projects need -a model, you create a shortcut link to tell spaCy to load it from there. This -means you'll never end up with duplicate data. - -The [`link`](/api/cli#link) command will create a symlink in the `spacy/data` -directory. - -> #### Why does spaCy use symlinks? -> -> Symlinks were originally introduced to maintain backwards compatibility, as -> older versions expected model data to live within `spacy/data`. However, we -> decided to keep using them in v2.0 instead of opting for a config file. -> There'll always be a need for assigning and saving custom model names or IDs. 
-> And your system already comes with a native solution to mapping unicode -> aliases to file paths: symbolic links. - -```bash -$ python -m spacy link [package name or path] [shortcut] [--force] -``` - -The first argument is the **package name** (if the model was installed via pip), -or a local path to the the **model package**. The second argument is the -internal name you want to use for the model. Setting the `--force` flag will -overwrite any existing links. - -```bash -### Examples -# set up shortcut link to load installed package as "en_default" -python -m spacy link en_core_web_md en_default - -# set up shortcut link to load local model as "my_amazing_model" -python -m spacy link /Users/you/model my_amazing_model -``` - - - -In order to create a symlink, your user needs the **required permissions**. If -you've installed spaCy to a system directory and don't have admin privileges, -the `spacy link` command may fail. The easiest solution is to re-run the command -as admin, set the `--user` flag or use a virtual environment. For more info on -this, see the [troubleshooting guide](/usage/#symlink-privilege). - - - ### Importing models as modules {#usage-import} If you've installed a model via spaCy's downloader, or directly via pip, you can @@ -488,10 +431,9 @@ turn it into a loadable package. ### Loading and testing models {#models-loading} -Downloading models directly via pip won't call spaCy's link -[`package`](/api/cli#link) command, which creates symlinks for model shortcuts. 
-This means that you'll have to run this command separately, or use the native -`import` syntax to load the models: +Models are regular Python packages, so you can also import them as a package +using Python's native `import` syntax, and then call the `load` method to load +the model data and return an `nlp` object: ```python import en_core_web_sm diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index e7aca3981..0ead27a49 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -295,25 +295,6 @@ nlp.rename_pipe("ner", "entityrecognizer") nlp.replace_pipe("tagger", my_custom_tagger) ``` - - -Since spaCy v2.0 comes with better support for customizing the processing -pipeline components, the `parser`, `tagger` and `entity` keyword arguments have -been replaced with `disable`, which takes a list of pipeline component names. -This lets you disable pre-defined components when loading a model, or -initializing a Language class via [`from_disk`](/api/language#from_disk). - -```diff -- nlp = spacy.load('en', tagger=False, entity=False) -- doc = nlp("I don't want parsed", parse=False) - -+ nlp = spacy.load("en", disable=["ner"]) -+ nlp.remove_pipe("parser") -+ doc = nlp("I don't want parsed") -``` - - - ## Creating custom pipeline components {#custom-components} A component receives a `Doc` object and can modify it – for example, by using @@ -532,13 +513,13 @@ nlp = spacy.load("your_custom_model", terms=["tree kangaroo"], label="ANIMAL") -When you load a model via its shortcut or package name, like `en_core_web_sm`, -spaCy will import the package and then call its `load()` method. This means that -custom code in the model's `__init__.py` will be executed, too. This is **not -the case** if you're loading a model from a path containing the model data. -Here, spaCy will only read in the `meta.json`. 
If you want to use custom -factories with a model loaded from a path, you need to add them to -`Language.factories` _before_ you load the model. +When you load a model via its package name, like `en_core_web_sm`, spaCy will +import the package and then call its `load()` method. This means that custom +code in the model's `__init__.py` will be executed, too. This is **not the +case** if you're loading a model from a path containing the model data. Here, +spaCy will only read in the `meta.json`. If you want to use custom factories +with a model loaded from a path, you need to add them to `Language.factories` +_before_ you load the model. @@ -719,8 +700,8 @@ class SimilarityModel(object): ## Developing plugins and wrappers {#plugins} We're very excited about all the new possibilities for community extensions and -plugins in spaCy v2.0, and we can't wait to see what you build with it! To get -you started, here are a few tips, tricks and best +plugins in spaCy, and we can't wait to see what you build with it! To get you +started, here are a few tips, tricks and best practices. [See here](/universe/?category=pipeline) for examples of other spaCy extensions. diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md new file mode 100644 index 000000000..2631f1438 --- /dev/null +++ b/website/docs/usage/projects.md @@ -0,0 +1,5 @@ +--- +title: Projects +--- + +TODO: write diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index c0dbfc732..e9ba0de6a 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -13,15 +13,6 @@ import Serialization101 from 'usage/101/\_serialization.md' - - -In spaCy v2.0, the API for saving and loading has changed to only use the four -methods listed above consistently across objects and classes. For an overview of -the changes, see [this table](/usage/v2#incompat) and the notes on -[migrating](/usage/v2#migrating-saving-loading). 
- - - ### Serializing the pipeline {#pipeline} When serializing the pipeline, keep in mind that this will only save out the diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 55d4accba..8d80655e9 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -1,6 +1,6 @@ --- -title: Training spaCy's Statistical Models -next: /usage/adding-languages +title: Training Models +next: /usage/projects menu: - ['Basics', 'basics'] - ['NER', 'ner'] diff --git a/website/docs/usage/v2.md b/website/docs/usage/v2.md index 0ac8bfe75..a2322c3be 100644 --- a/website/docs/usage/v2.md +++ b/website/docs/usage/v2.md @@ -253,11 +253,10 @@ have a `to_bytes()`, `from_bytes()`, `to_disk()` and `from_disk()` method that supports the Pickle protocol. The improved `spacy.load` makes loading models easier and more transparent. You -can load a model by supplying its [shortcut link](/usage/models#usage), the name -of an installed [model package](/models) or a path. The `Language` class to -initialize will be determined based on the model's settings. For a blank -language, you can import the class directly, e.g. -`from spacy.lang.en import English` or use +can load a model by supplying its shortcut link, the name of an installed +[model package](/models) or a path. The `Language` class to initialize will be +determined based on the model's settings. For a blank language, you can import +the class directly, e.g. `from spacy.lang.en import English` or use [`spacy.blank()`](/api/top-level#spacy.blank). 
diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md index 9733e09c2..df4987a62 100644 --- a/website/docs/usage/visualizers.md +++ b/website/docs/usage/visualizers.md @@ -7,35 +7,26 @@ menu: - ['Entities', 'ent'] - ['Jupyter Notebooks', 'jupyter'] - ['Rendering HTML', 'html'] + - ['Web app usage', 'webapp'] --- -As of v2.0, our popular visualizers, +Visualizing a dependency parse or named entities in a text is not only a fun NLP +demo – it can also be incredibly helpful in speeding up development and +debugging your code and training process. That's why our popular visualizers, [displaCy](https://explosion.ai/demos/displacy) and -[displaCy ENT](https://explosion.ai/demos/displacy-ent) are finally -an official part of the library. Visualizing a dependency parse or named -entities in a text is not only a fun NLP demo – it can also be incredibly -helpful in speeding up development and debugging your code and training process. -If you're running a [Jupyter](https://jupyter.org) notebook, displaCy will -detect this and return the markup in a format -[ready to be rendered and exported](#jupyter). - -> #### What about the old visualizers? -> -> Our JavaScript-based visualizers -> [`displacy.js`](https://github.com/explosion/displacy) and -> [`displacy-ent.js`](https://github.com/explosion/displacy-ent) will still be -> available on GitHub. If you're looking to implement web-based visualizations, -> we generally recommend using those instead of spaCy's built-in `displacy` -> module. It'll allow your application to perform all rendering on the client -> and only rely on the server for the text processing. The generated markup is -> also more compatible with modern web standards. +[displaCy ENT](https://explosion.ai/demos/displacy-ent) are also an +official part of the core library. 
If you're running a +[Jupyter](https://jupyter.org) notebook, displaCy will detect this and return +the markup in a format [ready to be rendered and exported](#jupyter). The quickest way to visualize `Doc` is to use [`displacy.serve`](/api/top-level#displacy.serve). This will spin up a simple web server and let you view the result straight from your browser. displaCy can either take a single `Doc` or a list of `Doc` objects as its first argument. This lets you construct them however you like – using any model or modifications -you like. +you like. If you're using [Streamlit](https://streamlit.io), check out the +[`spacy-streamlit`](https://github.com/explosion/spacy-streamlit) package that +helps you integrate spaCy visualizations into your apps! ## Visualizing the dependency parse {#dep} @@ -338,7 +329,7 @@ position. } ``` -### Using displaCy in a web application {#webapp} +## Using displaCy in a web application {#webapp} If you want to use the visualizers as part of a web application, for example to create something like our [online demo](https://explosion.ai/demos/displacy), @@ -359,40 +350,13 @@ JSON-formatted output. > on the client in JavaScript. displaCy.js creates the markup as DOM nodes and > will never insert raw HTML. -The `parse_deps` function takes a `Doc` object and returns a dictionary in a -format that can be rendered by displaCy. + -```python -### Example -import spacy -from spacy import displacy +Alternatively, if you're using [Streamlit](https://streamlit.io), check out the +[`spacy-streamlit`](https://github.com/explosion/spacy-streamlit) package that +helps you integrate spaCy visualizations into your apps. It includes a full +embedded visualizer, as well as individual components. 
-nlp = spacy.load("en_core_web_sm") +![](../images/spacy-streamlit.png) -def displacy_service(text): - doc = nlp(text) - return displacy.parse_deps(doc) -``` - -Using a library like [Flask](http://flask.pocoo.org/) or -[Hug](http://www.hug.rest/), you can easily turn the above code into a simple -REST API that receives a text and returns a JSON-formatted parse. In your -front-end, include [`displacy.js`](https://github.com/explosion/displacy) and -initialize it with the API URL and the ID or query selector of the container to -render the visualization in, e.g. `'#displacy'` for `
`. - -```javascript -/// script.js -var displacy = new displaCy('http://localhost:8080', { - container: '#displacy', -}) - -function parse(text) { - displacy.parse(text) -} -``` - -When you call `parse`, it will make a request to your API, receive the -JSON-formatted parse and render it in your container. To create an interactive -experience, you could trigger this function by a button and read the text from -an `` field. + diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 14803fe39..e509dade4 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -8,6 +8,7 @@ { "text": "Installation", "url": "/usage" }, { "text": "Models & Languages", "url": "/usage/models" }, { "text": "Facts & Figures", "url": "/usage/facts-figures" }, + { "text": "spaCy 101", "url": "/usage/spacy-101" }, { "text": "New in v3.0", "url": "/usage/v3" } ] }, @@ -19,8 +20,8 @@ { "text": "Processing Pipelines", "url": "/usage/processing-pipelines" }, { "text": "Vectors & Similarity", "url": "/usage/vectors-similarity" }, { "text": "Training Models", "url": "/usage/training" }, + { "text": "spaCy Projects", "url": "/usage/projects", "tag": "new" }, { "text": "Saving & Loading", "url": "/usage/saving-loading" }, - { "text": "Adding Languages", "url": "/usage/adding-languages" }, { "text": "Visualizers", "url": "/usage/visualizers" } ] }, diff --git a/website/src/components/sidebar.js b/website/src/components/sidebar.js index 27c71e70f..6417f107d 100644 --- a/website/src/components/sidebar.js +++ b/website/src/components/sidebar.js @@ -4,6 +4,7 @@ import classNames from 'classnames' import { window } from 'browser-monads' import Link from './link' +import Tag from './tag' import Dropdown from './dropdown' import classes from '../styles/sidebar.module.sass' @@ -65,7 +66,7 @@ const Sidebar = ({ items, pageMenu, slug }) => { {items.map((section, i) => (
  • {section.label}
  • - {section.items.map(({ text, url, onClick, menu, isActive }, j) => { + {section.items.map(({ text, url, tag, onClick, menu, isActive }, j) => { const currentMenu = menu || pageMenu || [] const active = isActive || slug === url const itemClassNames = classNames(classes.link, { @@ -82,6 +83,7 @@ const Sidebar = ({ items, pageMenu, slug }) => { hideIcon > {text} + {tag && {tag}} {active && !!currentMenu.length && (
      diff --git a/website/src/components/tag.js b/website/src/components/tag.js index 261ce9000..72e612666 100644 --- a/website/src/components/tag.js +++ b/website/src/components/tag.js @@ -6,6 +6,8 @@ import { isString } from './util' import Icon from './icon' import classes from '../styles/tag.module.sass' +const MIN_VERSION = 3 + const Tag = ({ spaced, variant, tooltip, children }) => { if (variant === 'new') { const isValid = isString(children) && !isNaN(children) @@ -13,8 +15,8 @@ const Tag = ({ spaced, variant, tooltip, children }) => { const tooltipText = `This feature is new and was introduced in spaCy v${version}` // TODO: we probably want to handle this more elegantly, but the idea is // that we can hide tags referring to old versions - // const hideTag = version.startsWith('2') - return ( + const major = isString(version) ? Number(version.split('.')[0]) : version + return major < MIN_VERSION ? null : ( v{version} diff --git a/website/src/fonts/jetbrainsmono-regular.woff b/website/src/fonts/jetbrainsmono-regular.woff new file mode 100755 index 000000000..dc1d85f57 Binary files /dev/null and b/website/src/fonts/jetbrainsmono-regular.woff differ diff --git a/website/src/fonts/jetbrainsmono-regular.woff2 b/website/src/fonts/jetbrainsmono-regular.woff2 new file mode 100755 index 000000000..fdf95dde6 Binary files /dev/null and b/website/src/fonts/jetbrainsmono-regular.woff2 differ diff --git a/website/src/styles/code.module.sass b/website/src/styles/code.module.sass index b268904f5..cadfbe50a 100644 --- a/website/src/styles/code.module.sass +++ b/website/src/styles/code.module.sass @@ -16,7 +16,7 @@ .code, .juniper-input pre, .juniper-output - font: var(--font-size-xs)/var(--line-height-lg) var(--font-code) !important + font: var(--font-size-code)/var(--line-height-code) var(--font-code) !important -webkit-font-smoothing: subpixel-antialiased -moz-osx-font-smoothing: auto diff --git a/website/src/styles/layout.sass b/website/src/styles/layout.sass index 
d70447288..c97013ab2 100644 --- a/website/src/styles/layout.sass +++ b/website/src/styles/layout.sass @@ -12,18 +12,20 @@ // Fonts --font-primary: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol' --font-secondary: 'HK Grotesk', Roboto, Helvetica, Arial, sans-serif - --font-code: Menlo, Monaco, Consolas, 'Liberation Mono', 'Courier New', monospace + --font-code: "Jetbrains Mono", Menlo, Monaco, Consolas, 'Liberation Mono', 'Courier New', monospace // Font Sizes --font-size-xs: 1.1rem --font-size-sm: 1.3rem --font-size-md: 1.35rem --font-size-lg: 1.4rem + --font-size-code: 1.2rem --line-height-xs: 1.25 --line-height-sm: 1.375 --line-height-md: 1.5 --line-height-lg: 1.9 + --line-height-code: 1.8 // Spacing --spacing-xs: 1rem @@ -148,6 +150,13 @@ src: url("../fonts/hkgrotesk-bolditalic.woff2") format("woff2"), url("../fonts/hkgrotesk-bolditalic.woff") format("woff") font-display: swap +@font-face + font-family: "JetBrains Mono" + font-style: normal + font-weight: 500 + font-display: fallback + src: url("../fonts/jetbrainsmono-regular.woff") format("woff"), url("../fonts/jetbrainsmono-regular.woff2") format("woff2") + /* Reset */ *, *:before, *:after diff --git a/website/src/templates/docs.js b/website/src/templates/docs.js index f468964c9..89bf66605 100644 --- a/website/src/templates/docs.js +++ b/website/src/templates/docs.js @@ -154,6 +154,7 @@ const query = graphql` items { text url + tag } } } diff --git a/website/src/widgets/changelog.js b/website/src/widgets/changelog.js index c279e3ff2..73890d320 100644 --- a/website/src/widgets/changelog.js +++ b/website/src/widgets/changelog.js @@ -1,4 +1,4 @@ -import React, { useState, useEffect } from 'react' +import React, { useState, useEffect, Fragment } from 'react' import { window } from 'browser-monads' import Link from '../components/link' @@ -101,12 +101,12 @@ const Changelog = () => {

      - {prereleases.map(({ title, date, url, tag }) => ( - <> + {prereleases.map(({ title, date, url, tag }, i) => ( + {tag} {' '} - + ))}

      diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js index d32062627..dd4e10f01 100644 --- a/website/src/widgets/quickstart-install.js +++ b/website/src/widgets/quickstart-install.js @@ -23,11 +23,6 @@ const DATA = [ { id: 'source', title: 'from source' }, ], }, - { - id: 'python', - title: 'Python version', - options: [{ id: '2', title: '2.x' }, { id: '3', title: '3.x', checked: true }], - }, { id: 'config', title: 'Configuration', @@ -70,15 +65,7 @@ const QuickstartInstall = ({ id, title }) => ( ] return ( - - python -m pip install -U virtualenv - - - virtualenv .env - - - python -m venv .env - + python -m venv .env source .env/bin/activate