From d5110ffbf2474339ffde948fc6d899873484285e Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 16 Jun 2020 15:37:35 +0200 Subject: [PATCH] Documentation updates for v2.3.0 (#5593) * Update website models for v2.3.0 * Add docs for Chinese word segmentation * Tighten up Chinese docs section * Merge branch 'master' into docs/v2.3.0 [ci skip] * Merge branch 'master' into docs/v2.3.0 [ci skip] * Auto-format and update version * Update matcher.md * Update languages and sorting * Typo in landing page * Infobox about token_match behavior * Add meta and basic docs for Japanese * POS -> TAG in models table * Add info about lookups for normalization * Updates to API docs for v2.3 * Update adding norm exceptions for adding languages * Add --omit-extra-lookups to CLI API docs * Add initial draft of "What's New in v2.3" * Add new in v2.3 tags to Chinese and Japanese sections * Add tokenizer to migration section * Add new in v2.3 flags to init-model * Typo * More what's new in v2.3 Co-authored-by: Ines Montani --- README.md | 17 +- website/docs/api/cli.md | 21 ++- website/docs/api/cython-structs.md | 3 - website/docs/api/goldparse.md | 1 + website/docs/api/lexeme.md | 2 +- website/docs/api/matcher.md | 11 +- website/docs/api/sentencizer.md | 2 +- website/docs/api/token.md | 2 +- website/docs/api/vocab.md | 3 + website/docs/usage/adding-languages.md | 34 +++- website/docs/usage/linguistic-features.md | 23 ++- website/docs/usage/models.md | 117 ++++++++++++ website/docs/usage/v2-3.md | 213 ++++++++++++++++++++++ website/meta/languages.json | 149 ++++++++++----- website/meta/sidebars.json | 1 + website/src/templates/models.js | 2 +- website/src/widgets/landing.js | 2 +- website/src/widgets/languages.js | 4 +- 18 files changed, 519 insertions(+), 88 deletions(-) create mode 100644 website/docs/usage/v2-3.md diff --git a/README.md b/README.md index 31dc78d63..4b5f3d0fa 100644 --- a/README.md +++ b/README.md @@ -6,12 +6,12 @@ spaCy is a library for advanced Natural Language 
Processing in Python and Cython. It's built on the very latest research, and was designed from day one to be used in real products. spaCy comes with [pretrained statistical models](https://spacy.io/models) and word vectors, and -currently supports tokenization for **50+ languages**. It features +currently supports tokenization for **60+ languages**. It features state-of-the-art speed, convolutional **neural network models** for tagging, parsing and **named entity recognition** and easy **deep learning** integration. It's commercial open-source software, released under the MIT license. -💫 **Version 2.2 out now!** +💫 **Version 2.3 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases) [![Azure Pipelines]()](https://dev.azure.com/explosion-ai/public/_build?definitionId=8) @@ -32,7 +32,7 @@ It's commercial open-source software, released under the MIT license. | --------------- | -------------------------------------------------------------- | | [spaCy 101] | New to spaCy? Here's everything you need to know! | | [Usage Guides] | How to use spaCy and its features. | -| [New in v2.2] | New features, backwards incompatibilities and migration guide. | +| [New in v2.3] | New features, backwards incompatibilities and migration guide. | | [API Reference] | The detailed reference for spaCy's API. | | [Models] | Download statistical language models for spaCy. | | [Universe] | Libraries, extensions, demos, books and courses. | @@ -40,7 +40,7 @@ It's commercial open-source software, released under the MIT license. | [Contribute] | How to contribute to the spaCy project and code base. | [spacy 101]: https://spacy.io/usage/spacy-101 -[new in v2.2]: https://spacy.io/usage/v2-2 +[new in v2.3]: https://spacy.io/usage/v2-3 [usage guides]: https://spacy.io/usage/ [api reference]: https://spacy.io/api/ [models]: https://spacy.io/models @@ -113,12 +113,13 @@ of `v2.0.13`). 
pip install spacy ``` -To install additional data tables for lemmatization in **spaCy v2.2+** you can -run `pip install spacy[lookups]` or install +To install additional data tables for lemmatization and normalization in +**spaCy v2.2+** you can run `pip install spacy[lookups]` or install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) separately. The lookups package is needed to create blank models with -lemmatization data, and to lemmatize in languages that don't yet come with -pretrained models and aren't powered by third-party libraries. +lemmatization data for v2.2+ plus normalization data for v2.3+, and to +lemmatize in languages that don't yet come with pretrained models and aren't +powered by third-party libraries. When using pip it is generally recommended to install packages in a virtual environment to avoid modifying system state: diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 6f4b8bb73..fe8877c69 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -541,16 +541,17 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] [--prune-vectors] ``` -| Argument | Type | Description | -| ------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | -| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | -| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | -| `--vectors-loc`, `-v` | option | Optional location of vectors. 
Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | -| `--truncate-vectors`, `-t` 2.3 | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | -| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | -| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | -| **CREATES** | model | A spaCy model containing the vocab and vectors. | +| Argument | Type | Description | +| ----------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | +| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | +| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | +| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | +| `--truncate-vectors`, `-t` 2.3 | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | +| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. 
| +| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | +| `--omit-extra-lookups`, `-OEL` 2.3 | flag | Do not include any of the extra lookups tables (`cluster`/`prob`/`sentiment`) from `spacy-lookups-data` in the model. | +| **CREATES** | model | A spaCy model containing the vocab and vectors. | ## Evaluate {#evaluate new="2"} diff --git a/website/docs/api/cython-structs.md b/website/docs/api/cython-structs.md index 935bce25d..8ee1f1b9a 100644 --- a/website/docs/api/cython-structs.md +++ b/website/docs/api/cython-structs.md @@ -171,9 +171,6 @@ struct. | `shape` | `attr_t` | Transform of the lexeme's string, to show orthographic features. | | `prefix` | `attr_t` | Length-N substring from the start of the lexeme. Defaults to `N=1`. | | `suffix` | `attr_t` | Length-N substring from the end of the lexeme. Defaults to `N=3`. | -| `cluster` | `attr_t` | Brown cluster ID. | -| `prob` | `float` | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). | -| `sentiment` | `float` | A scalar value indicating positivity or negativity. | ### Lexeme.get_struct_attr {#lexeme_get_struct_attr tag="staticmethod, nogil" source="spacy/lexeme.pxd"} diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md index 443913311..5df625991 100644 --- a/website/docs/api/goldparse.md +++ b/website/docs/api/goldparse.md @@ -22,6 +22,7 @@ missing – the gradient for those labels will be zero. | `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings, you can specify missing values by setting the tag to None. | | `cats` | dict | Labels for text classification. Each key in the dictionary is a string label for the category and each value is `1.0` (positive) or `0.0` (negative). | | `links` | dict | Labels for entity linking. 
A dict with `(start_char, end_char)` keys, and the values being dicts with `kb_id:value` entries, representing external KB IDs mapped to either `1.0` (positive) or `0.0` (negative). | +| `make_projective` | bool | Whether to projectivize the dependency tree. Defaults to `False`. | | **RETURNS** | `GoldParse` | The newly constructed object. | ## GoldParse.\_\_len\_\_ {#len tag="method"} diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md index feb167a9d..f7f6d654c 100644 --- a/website/docs/api/lexeme.md +++ b/website/docs/api/lexeme.md @@ -156,7 +156,7 @@ The L2 norm of the lexeme's vector representation. | `like_url` | bool | Does the lexeme resemble a URL? | | `like_num` | bool | Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc. | | `like_email` | bool | Does the lexeme resemble an email address? | -| `is_oov` | bool | Is the lexeme out-of-vocabulary? | +| `is_oov` | bool | Is the lexeme out-of-vocabulary (i.e. does it not have a word vector)? | | `is_stop` | bool | Is the lexeme part of a "stop list"? | | `lang` | int | Language of the parent vocabulary. | | `lang_` | unicode | Language of the parent vocabulary. | diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index bfd4fb0ec..ac2f898e0 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -40,7 +40,8 @@ string where an integer is expected) or unexpected property names. ## Matcher.\_\_call\_\_ {#call tag="method"} -Find all token sequences matching the supplied patterns on the `Doc`. +Find all token sequences matching the supplied patterns on the `Doc`. As of +spaCy v2.3, the `Matcher` can also be called on `Span` objects. > #### Example > @@ -54,10 +55,10 @@ Find all token sequences matching the supplied patterns on the `Doc`.
> matches = matcher(doc) > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `doc` | `Doc` | The document to match over. | -| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. | +| Name | Type | Description | +| ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `doclike` | `Doc`/`Span` | The document to match over or a `Span` (as of v2.3). | +| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. | diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md index c9b935f22..5a1ea162a 100644 --- a/website/docs/api/sentencizer.md +++ b/website/docs/api/sentencizer.md @@ -42,7 +42,7 @@ Initialize the sentencizer. | Name | Type | Description | | ------------- | ------------- | ------------------------------------------------------------------------------------------------------ | -| `punct_chars` | list | Optional custom list of punctuation characters that mark sentence ends. Defaults to `[".", "!", "?"].` | +| `punct_chars` | list | Optional custom list of punctuation characters that mark sentence ends.
Defaults to `['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄', '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿', '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶', '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒', '﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀', '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼', '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐', '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂', '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', '。', '。']`. | | **RETURNS** | `Sentencizer` | The newly constructed object. | ## Sentencizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 0fa86b7bc..9f8594c96 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -459,7 +459,7 @@ The L2 norm of the token's vector representation. | `like_url` | bool | Does the token resemble a URL? | | `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | | `like_email` | bool | Does the token resemble an email address? | -| `is_oov` | bool | Is the token out-of-vocabulary? | +| `is_oov` | bool | Does the token have a word vector? | | `is_stop` | bool | Is the token part of a "stop list"? | | `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | | `pos_` | unicode | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index e024ab54a..2be6d67ed 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -27,6 +27,9 @@ Create the vocabulary. | `tag_map` | dict | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. 
| | `lemmatizer` | object | A lemmatizer. Defaults to `None`. | | `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. | +| `lookups` | `Lookups` | A [`Lookups`](/api/lookups) that stores the `lemma_\*`, `lexeme_norm` and other large lookup tables. Defaults to `None`. | +| `lookups_extra` 2.3 | `Lookups` | A [`Lookups`](/api/lookups) that stores the optional `lexeme_cluster`/`lexeme_prob`/`lexeme_sentiment`/`lexeme_settings` lookup tables. Defaults to `None`. | +| `oov_prob` | float | The default OOV probability. Defaults to `-20.0`. | | `vectors_name` 2.2 | unicode | A name to identify the vectors table. | | **RETURNS** | `Vocab` | The newly constructed object. | diff --git a/website/docs/usage/adding-languages.md b/website/docs/usage/adding-languages.md index 29de08266..d42aad705 100644 --- a/website/docs/usage/adding-languages.md +++ b/website/docs/usage/adding-languages.md @@ -297,9 +297,35 @@ though `$` and `€` are very different, spaCy normalizes them both to `$`. This way, they'll always be seen as similar, no matter how common they were in the training data. -Norm exceptions can be provided as a simple dictionary. For more examples, see -the English -[`norm_exceptions.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/en/norm_exceptions.py). +As of spaCy v2.3, language-specific norm exceptions are provided as a +JSON dictionary in the package +[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) rather +than in the main library. For a full example, see +[`en_lexeme_norm.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_lexeme_norm.json). 
+ +```json +### Example +{ + "cos": "because", + "fav": "favorite", + "accessorise": "accessorize", + "accessorised": "accessorized" +} +``` + +If you're adding tables for a new language, be sure to add the tables to +[`spacy_lookups_data/__init__.py`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/__init__.py) +and register the entry point under `spacy_lookups` in +[`setup.cfg`](https://github.com/explosion/spacy-lookups-data/blob/master/setup.cfg). + +Alternatively, you can initialize your language [`Vocab`](/api/vocab) with a +[`Lookups`](/api/lookups) object that includes the table `lexeme_norm`. + + + +Previously in spaCy v2.0-v2.2, norm exceptions were provided as a simple Python +dictionary. For more examples, see the English +[`norm_exceptions.py`](https://github.com/explosion/spaCy/tree/v2.2.x/spacy/lang/en/norm_exceptions.py). ```python ### Example @@ -327,6 +353,8 @@ norm exceptions overwrite any of the global exceptions, they should be added first. Also note that the tokenizer exceptions will always have priority over the attribute getters. + + ### Lexical attributes {#lex-attrs new="2"} spaCy provides a range of [`Token` attributes](/api/token#attributes) that diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index bcc943436..84bb3d71b 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -732,7 +732,7 @@ rather than performance: ```python def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, - infix_finditer, token_match): + infix_finditer, token_match, url_match): tokens = [] for substring in text.split(): suffixes = [] @@ -829,7 +829,7 @@ for t in tok_exp: ### Customizing spaCy's Tokenizer class {#native-tokenizers} Let's imagine you wanted to create a tokenizer for a new language or specific -domain. There are five things you would need to define: +domain.
There are six things you may need to define: 1. A dictionary of **special cases**. This handles things like contractions, units of measurement, emoticons, certain abbreviations, etc. @@ -840,9 +840,22 @@ domain. There are five things you would need to define: 4. A function `infixes_finditer`, to handle non-whitespace separators, such as hyphens etc. 5. An optional boolean function `token_match` matching strings that should never - be split, overriding the infix rules. Useful for things like URLs or numbers. + be split, overriding the infix rules. Useful for things like numbers. 6. An optional boolean function `url_match`, which is similar to `token_match` - except prefixes and suffixes are removed before applying the match. + except that prefixes and suffixes are removed before applying the match. + + + +In spaCy v2.2.2-v2.2.4, the `token_match` was equivalent to the `url_match` +above and there was no match pattern applied before prefixes and suffixes were +analyzed. As of spaCy v2.3.0, the `token_match` has been reverted to its +behavior in v2.2.1 and earlier with precedence over prefixes and suffixes. + +The `url_match` is introduced in v2.3.0 to handle cases like URLs where the +tokenizer should remove prefixes and suffixes (e.g., a comma at the end of a +URL) before applying the match. + + You shouldn't usually need to create a `Tokenizer` subclass. 
Standard usage is to use `re.compile()` to build a regular expression object, and pass its @@ -865,7 +878,7 @@ def custom_tokenizer(nlp): prefix_search=prefix_re.search, suffix_search=suffix_re.search, infix_finditer=infix_re.finditer, - token_match=simple_url_re.match) + url_match=simple_url_re.match) nlp = spacy.load("en_core_web_sm") nlp.tokenizer = custom_tokenizer(nlp) diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index 5fd92f8f3..382193157 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -85,6 +85,123 @@ To load your model with the neutral, multi-language class, simply set `meta.json`. You can also import the class directly, or call [`util.get_lang_class()`](/api/top-level#util.get_lang_class) for lazy-loading. +### Chinese language support {#chinese new=2.3} + +The Chinese language class supports three word segmentation options: + +> ```python +> from spacy.lang.zh import Chinese +> +> # Disable jieba to use character segmentation +> Chinese.Defaults.use_jieba = False +> nlp = Chinese() +> +> # Disable jieba through tokenizer config options +> cfg = {"use_jieba": False} +> nlp = Chinese(meta={"tokenizer": {"config": cfg}}) +> +> # Load with "default" model provided by pkuseg +> cfg = {"pkuseg_model": "default", "require_pkuseg": True} +> nlp = Chinese(meta={"tokenizer": {"config": cfg}}) +> ``` + +1. **Jieba:** `Chinese` uses [Jieba](https://github.com/fxsjy/jieba) for word + segmentation by default. It's enabled when you create a new `Chinese` + language class or call `spacy.blank("zh")`. +2. **Character segmentation:** Character segmentation is supported by disabling + `jieba` and setting `Chinese.Defaults.use_jieba = False` _before_ + initializing the language class. As of spaCy v2.3.0, the `meta` tokenizer + config options can be used to configure `use_jieba`. +3. 
**PKUSeg**: In spaCy v2.3.0, support for + [PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support + better segmentation for Chinese OntoNotes and the new + [Chinese models](/models/zh). + + + +The `meta` argument of the `Chinese` language class supports the following +tokenizer config settings: + +| Name | Type | Description | +| ------------------ | ------- | ---------------------------------------------------------------------------------------------------- | +| `pkuseg_model` | unicode | **Required:** Name of a model provided by `pkuseg` or the path to a local model directory. | +| `pkuseg_user_dict` | unicode | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. | +| `require_pkuseg` | bool | Overrides all `jieba` settings (optional but strongly recommended). | + +```python +### Examples +# Load "default" model +cfg = {"pkuseg_model": "default", "require_pkuseg": True} +nlp = Chinese(meta={"tokenizer": {"config": cfg}}) + +# Load local model +cfg = {"pkuseg_model": "/path/to/pkuseg_model", "require_pkuseg": True} +nlp = Chinese(meta={"tokenizer": {"config": cfg}}) + +# Override the user dictionary +cfg = {"pkuseg_model": "default", "require_pkuseg": True, "pkuseg_user_dict": "/path"} +nlp = Chinese(meta={"tokenizer": {"config": cfg}}) +``` + +You can also modify the user dictionary on-the-fly: + +```python +# Append words to user dict +nlp.tokenizer.pkuseg_update_user_dict(["中国", "ABC"]) + +# Remove all words from user dict and replace with new words +nlp.tokenizer.pkuseg_update_user_dict(["中国"], reset=True) + +# Remove all words from user dict +nlp.tokenizer.pkuseg_update_user_dict([], reset=True) +``` + + + + + +The [Chinese models](/models/zh) provided by spaCy include a custom `pkuseg` +model trained only on +[Chinese OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19), since the +models provided by `pkuseg` include data restricted to research use.
For +research use, `pkuseg` provides models for several different domains +(`"default"`, `"news"`, `"web"`, `"medicine"`, `"tourism"`) and for other uses, +`pkuseg` provides a simple +[training API](https://github.com/lancopku/pkuseg-python/blob/master/readme/readme_english.md#usage): + +```python +import pkuseg +from spacy.lang.zh import Chinese + +# Train pkuseg model +pkuseg.train("train.utf8", "test.utf8", "/path/to/pkuseg_model") +# Load pkuseg model in spaCy Chinese tokenizer +nlp = Chinese(meta={"tokenizer": {"config": {"pkuseg_model": "/path/to/pkuseg_model", "require_pkuseg": True}}}) +``` + + + +### Japanese language support {#japanese new=2.3} + +> ```python +> from spacy.lang.ja import Japanese +> +> # Load SudachiPy with split mode A (default) +> nlp = Japanese() +> +> # Load SudachiPy with split mode B +> cfg = {"split_mode": "B"} +> nlp = Japanese(meta={"tokenizer": {"config": cfg}}) +> ``` + +The Japanese language class uses +[SudachiPy](https://github.com/WorksApplications/SudachiPy) for word +segmentation and part-of-speech tagging. The default Japanese language class +and the provided Japanese models use SudachiPy split mode `A`. + +The `meta` argument of the `Japanese` language class can be used to configure +the split mode to `A`, `B` or `C`.
+ ## Installing and using models {#download} > #### Downloading models in spaCy < v1.7 diff --git a/website/docs/usage/v2-3.md b/website/docs/usage/v2-3.md new file mode 100644 index 000000000..ba75b01ab --- /dev/null +++ b/website/docs/usage/v2-3.md @@ -0,0 +1,213 @@ +--- +title: What's New in v2.3 +teaser: New features, backwards incompatibilities and migration guide +menu: + - ['New Features', 'features'] + - ['Backwards Incompatibilities', 'incompat'] + - ['Migrating from v2.2', 'migrating'] +--- + +## New Features {#features hidden="true"} + +spaCy v2.3 features new pretrained models for five languages, word vectors for +all language models, and decreased model size and loading times for models with +vectors. We've added pretrained models for **Chinese, Danish, Japanese, Polish +and Romanian** and updated the training data and vectors for most languages. +Model packages with vectors are about **2×** smaller on disk and load +**2-4×** faster. For the full changelog, see the [release notes on +GitHub](https://github.com/explosion/spaCy/releases/tag/v2.3.0). For more +details and a behind-the-scenes look at the new release, [see our blog +post](https://explosion.ai/blog/spacy-v2-3). + +### Expanded model families with vectors {#models} + +> #### Example +> +> ```bash +> python -m spacy download da_core_news_sm +> python -m spacy download ja_core_news_sm +> python -m spacy download pl_core_news_sm +> python -m spacy download ro_core_news_sm +> python -m spacy download zh_core_web_sm +> ``` + +With new model families for Chinese, Danish, Japanese, Polish and Romanian plus +`md` and `lg` models with word vectors for all languages, this release provides +a total of 46 model packages. For models trained using [Universal +Dependencies](https://universaldependencies.org) corpora, the training data has +been updated to UD v2.5 (v2.6 for Japanese, v2.3 for Polish) and Dutch has been +extended to include both UD Dutch Alpino and LassySmall.
+ + +**Models:** [Models directory](/models) **Benchmarks:** +[Release notes](https://github.com/explosion/spaCy/releases/tag/v2.3.0) + + + +### Chinese {#chinese} + +> #### Example +> ```python +> from spacy.lang.zh import Chinese +> +> # Load with "default" model provided by pkuseg +> cfg = {"pkuseg_model": "default", "require_pkuseg": True} +> nlp = Chinese(meta={"tokenizer": {"config": cfg}}) +> +> # Append words to user dict +> nlp.tokenizer.pkuseg_update_user_dict(["中国", "ABC"]) + +This release adds support for +[pkuseg](https://github.com/lancopku/pkuseg-python) for word segmentation and +the new Chinese models ship with a custom pkuseg model trained on OntoNotes. +The Chinese tokenizer can be initialized with both `pkuseg` and custom models +and the `pkuseg` user dictionary is easy to customize. + + + +**Chinese:** [Chinese tokenizer usage](/usage/models#chinese) + + + +### Japanese {#japanese} + +The updated Japanese language class switches to +[SudachiPy](https://github.com/WorksApplications/SudachiPy) for word +segmentation and part-of-speech tagging. Using `sudachipy` greatly simplifies +installing spaCy for Japanese, which is now possible with a single command: +`pip install spacy[ja]`. + + + +**Japanese:** [Japanese tokenizer usage](/usage/models#japanese) + + + +### Small CLI updates + +- `spacy debug-data` provides the coverage of the vectors in a base model with + `spacy debug-data lang train dev -b base_model` +- `spacy evaluate` supports `blank:lg` (e.g. `spacy evaluate blank:en + dev.json`) to evaluate the tokenization accuracy without loading a model +- `spacy train` on GPU restricts the CPU timing evaluation to the first + iteration + +## Backwards incompatibilities {#incompat} + + + +If you've been training **your own models**, you'll need to **retrain** them +with the new version. Also don't forget to upgrade all models to the latest +versions. Models for earlier v2 releases (v2.0, v2.1, v2.2) aren't compatible +with models for v2.3.
To check if all of your models are up to date, you can +run the [`spacy validate`](/api/cli#validate) command. + + + +> #### Install with lookups data +> +> ```bash +> $ pip install spacy[lookups] +> ``` +> +> You can also install +> [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) +> directly. + +- If you're training new models, you'll want to install the package + [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data), + which now includes both the lemmatization tables (as in v2.2) and the + normalization tables (new in v2.3). If you're using pretrained models, + **nothing changes**, because the relevant tables are included in the model + packages. +- Due to the updated Universal Dependencies training data, the fine-grained + part-of-speech tags will change for many provided language models. The + coarse-grained part-of-speech tagset remains the same, but the mapping from + particular fine-grained to coarse-grained tags may show minor differences. +- For French, Italian, Portuguese and Spanish, the fine-grained part-of-speech + tagsets contain new merged tags related to contracted forms, such as + `ADP_DET` for French `"au"`, which maps to UPOS `ADP` based on the head + `"à"`. This increases the accuracy of the models by improving the alignment + between spaCy's tokenization and Universal Dependencies multi-word tokens + used for contractions. + +### Migrating from spaCy 2.2 {#migrating} + +#### Tokenizer settings + +In spaCy v2.2.2-v2.2.4, there was a change to the precedence of `token_match` +that gave prefixes and suffixes priority over `token_match`, which caused +problems for many custom tokenizer configurations. This has been reverted in +v2.3 so that `token_match` has priority over prefixes and suffixes as in v2.2.1 +and earlier versions. 
+ +A new tokenizer setting `url_match` has been introduced in v2.3.0 to handle +cases like URLs where the tokenizer should remove prefixes and suffixes (e.g., +a comma at the end of a URL) before applying the match. See the full [tokenizer +documentation](/usage/linguistic-features#tokenization) and try out +[`nlp.tokenizer.explain()`](/usage/linguistic-features#tokenizer-debug) when +debugging your tokenizer configuration. + +#### Warnings configuration + +spaCy's custom warnings have been replaced with native python +[`warnings`](https://docs.python.org/3/library/warnings.html). Instead of +setting `SPACY_WARNING_IGNORE`, use the [warnings +filters](https://docs.python.org/3/library/warnings.html#the-warnings-filter) +to manage warnings. + +#### Normalization tables + +The normalization tables have moved from the language data in +[`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang) to +the package +[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). If +you're adding data for a new language, the normalization table should be added +to `spacy-lookups-data`. See [adding norm +exceptions](/usage/adding-languages#norm-exceptions). + +#### Probability and cluster features + +> #### Load and save extra prob lookups table +> +> ```python +> from spacy.lang.en import English +> nlp = English() +> doc = nlp("the") +> print(doc[0].prob) # lazily loads extra prob table +> nlp.to_disk("/path/to/model") # includes prob table +> ``` + +The `Token.prob` and `Token.cluster` features, which are no longer used by the +core pipeline components as of spaCy v2, are no longer provided in the +pretrained models to reduce the model size. To keep these features available +for users relying on them, the `prob` and `cluster` features for the most +frequent 1M tokens have been moved to +[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) as +`extra` features for the relevant languages (English, German, Greek and +Spanish). 
+ +The extra tables are loaded lazily, so if you have `spacy-lookups-data` +installed and your code accesses `Token.prob`, the full table is loaded into +the model vocab, which will take a few seconds on initial loading. When you +save this model after loading the `prob` table, the full `prob` table will be +saved as part of the model vocab. + +If you'd like to include custom `cluster`, `prob`, or `sentiment` tables as +part of a new model, add the data to +[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) under +the entry point `lg_extra`, e.g. `en_extra` for English. Alternatively, you can +initialize your [`Vocab`](/api/vocab) with the `lookups_extra` argument with a +[`Lookups`](/api/lookups) object that includes the tables `lexeme_cluster`, +`lexeme_prob`, `lexeme_sentiment` or `lexeme_settings`. `lexeme_settings` is +currently only used to provide a custom `oov_prob`. See examples in the [`data` +directory](https://github.com/explosion/spacy-lookups-data/tree/master/spacy_lookups_data/data) +in `spacy-lookups-data`. + +#### Initializing new models without extra lookups tables + +When you initialize a new model with [`spacy init-model`](/api/cli#init-model), +the `prob` table from `spacy-lookups-data` may be loaded as part of the +initialization. If you'd like to omit this extra data as in spaCy's provided +v2.3 models, use the new flag `--omit-extra-lookups`. 
diff --git a/website/meta/languages.json b/website/meta/languages.json index 41c1bce7f..facfc3541 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -1,5 +1,35 @@ { "languages": [ + { + "code": "zh", + "name": "Chinese", + "models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg"], + "dependencies": [ + { + "name": "Jieba", + "url": "https://github.com/fxsjy/jieba" + }, + { + "name": "PKUSeg", + "url": "https://github.com/lancopku/PKUSeg-python" + } + ], + "has_examples": true + }, + { + "code": "da", + "name": "Danish", + "example": "Dette er en sætning.", + "has_examples": true, + "models": ["da_core_news_sm", "da_core_news_md", "da_core_news_lg"] + }, + { + "code": "nl", + "name": "Dutch", + "models": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"], + "example": "Dit is een zin.", + "has_examples": true + }, { "code": "en", "name": "English", @@ -14,68 +44,91 @@ "example": "This is a sentence.", "has_examples": true }, + { + "code": "fr", + "name": "French", + "models": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg"], + "example": "C'est une phrase.", + "has_examples": true + }, { "code": "de", "name": "German", - "models": ["de_core_news_sm", "de_core_news_md"], + "models": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg"], "starters": ["de_trf_bertbasecased_lg"], "example": "Dies ist ein Satz.", "has_examples": true }, { - "code": "fr", - "name": "French", - "models": ["fr_core_news_sm", "fr_core_news_md"], - "example": "C'est une phrase.", - "has_examples": true - }, - { - "code": "es", - "name": "Spanish", - "models": ["es_core_news_sm", "es_core_news_md"], - "example": "Esto es una frase.", - "has_examples": true - }, - { - "code": "pt", - "name": "Portuguese", - "models": ["pt_core_news_sm"], - "example": "Esta é uma frase.", + "code": "el", + "name": "Greek", + "models": ["el_core_news_sm", "el_core_news_md", "el_core_news_lg"], + "example": "Αυτή είναι μια πρόταση.", "has_examples": true }, { 
"code": "it", "name": "Italian", - "models": ["it_core_news_sm"], + "models": ["it_core_news_sm", "it_core_news_md", "it_core_news_lg"], "example": "Questa è una frase.", "has_examples": true }, { - "code": "nl", - "name": "Dutch", - "models": ["nl_core_news_sm"], - "example": "Dit is een zin.", + "code": "ja", + "name": "Japanese", + "models": ["ja_core_news_sm", "ja_core_news_md", "ja_core_news_lg"], + "dependencies": [ + { + "name": "SudachiPy", + "url": "https://github.com/WorksApplications/SudachiPy" + } + ], "has_examples": true }, { - "code": "el", - "name": "Greek", - "models": ["el_core_news_sm", "el_core_news_md"], - "example": "Αυτή είναι μια πρόταση.", - "has_examples": true + "code": "lt", + "name": "Lithuanian", + "has_examples": true, + "models": ["lt_core_news_sm", "lt_core_news_md", "lt_core_news_lg"] }, - { "code": "sv", "name": "Swedish", "has_examples": true }, - { "code": "fi", "name": "Finnish", "has_examples": true }, { "code": "nb", "name": "Norwegian Bokmål", "example": "Dette er en setning.", "has_examples": true, - "models": ["nb_core_news_sm"] + "models": ["nb_core_news_sm", "nb_core_news_md", "nb_core_news_lg"] }, - { "code": "da", "name": "Danish", "example": "Dette er en sætning.", "has_examples": true }, + { + "code": "pl", + "name": "Polish", + "example": "To jest zdanie.", + "has_examples": true, + "models": ["pl_core_news_sm", "pl_core_news_md", "pl_core_news_lg"] + }, + { + "code": "pt", + "name": "Portuguese", + "models": ["pt_core_news_sm", "pt_core_news_md", "pt_core_news_lg"], + "example": "Esta é uma frase.", + "has_examples": true + }, + { + "code": "ro", + "name": "Romanian", + "example": "Aceasta este o propoziție.", + "has_examples": true, + "models": ["ro_core_news_sm", "ro_core_news_md", "ro_core_news_lg"] + }, + { + "code": "es", + "name": "Spanish", + "models": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg"], + "example": "Esto es una frase.", + "has_examples": true + }, + { "code": "sv", "name": "Swedish", 
"has_examples": true }, + { "code": "fi", "name": "Finnish", "has_examples": true }, { "code": "hu", "name": "Hungarian", "example": "Ez egy mondat.", "has_examples": true }, - { "code": "pl", "name": "Polish", "example": "To jest zdanie.", "has_examples": true }, { "code": "ru", "name": "Russian", @@ -88,12 +141,6 @@ "has_examples": true, "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }] }, - { - "code": "ro", - "name": "Romanian", - "example": "Aceasta este o propoziție.", - "has_examples": true - }, { "code": "hr", "name": "Croatian", "has_examples": true }, { "code": "eu", "name": "Basque", "has_examples": true }, { "code": "yo", "name": "Yoruba", "has_examples": true }, @@ -123,7 +170,6 @@ { "code": "bg", "name": "Bulgarian", "example": "Това е изречение", "has_examples": true }, { "code": "cs", "name": "Czech" }, { "code": "is", "name": "Icelandic" }, - { "code": "lt", "name": "Lithuanian", "has_examples": true, "models": ["lt_core_news_sm"] }, { "code": "lv", "name": "Latvian" }, { "code": "sr", "name": "Serbian" }, { "code": "sk", "name": "Slovak" }, @@ -145,12 +191,6 @@ "example": "นี่คือประโยค", "has_examples": true }, - { - "code": "zh", - "name": "Chinese", - "dependencies": [{ "name": "Jieba", "url": "https://github.com/fxsjy/jieba" }], - "has_examples": true - }, { "code": "ja", "name": "Japanese", @@ -187,6 +227,21 @@ "example": "Sta chì a l'é unna fraxe.", "has_examples": true }, + { + "code": "hy", + "name": "Armenian", + "has_examples": true + }, + { + "code": "gu", + "name": "Gujarati", + "has_examples": true + }, + { + "code": "ml", + "name": "Malayalam", + "has_examples": true + }, { "code": "xx", "name": "Multi-language", diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 3fafc52b0..d7129875f 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -9,6 +9,7 @@ { "text": "Models & Languages", "url": "/usage/models" }, { "text": "Facts & Figures", "url": 
"/usage/facts-figures" }, { "text": "spaCy 101", "url": "/usage/spacy-101" }, + { "text": "New in v2.3", "url": "/usage/v2-3" }, { "text": "New in v2.2", "url": "/usage/v2-2" }, { "text": "New in v2.1", "url": "/usage/v2-1" }, { "text": "New in v2.0", "url": "/usage/v2" } diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 845fec65d..5bba1922b 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -83,7 +83,7 @@ function formatVectors(data) { function formatAccuracy(data) { if (!data) return null - const labels = { tags_acc: 'POS', ents_f: 'NER F', ents_p: 'NER P', ents_r: 'NER R' } + const labels = { tags_acc: 'TAG', ents_f: 'NER F', ents_p: 'NER P', ents_r: 'NER R' } const isSyntax = key => ['tags_acc', 'las', 'uas'].includes(key) const isNer = key => key.startsWith('ents_') return Object.keys(data).map(key => ({ diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js index c96905733..1f788877c 100644 --- a/website/src/widgets/landing.js +++ b/website/src/widgets/landing.js @@ -124,7 +124,7 @@ const Landing = ({ data }) => { {counts.modelLangs} languages
  • - pretrained word vectors + Pretrained word vectors
  • State-of-the-art speed
  • diff --git a/website/src/widgets/languages.js b/website/src/widgets/languages.js index 55645f951..bb26e57cd 100644 --- a/website/src/widgets/languages.js +++ b/website/src/widgets/languages.js @@ -38,10 +38,10 @@ const Languages = () => ( const langs = site.siteMetadata.languages const withModels = langs .filter(({ models }) => models && !!models.length) - .sort((a, b) => a.code.localeCompare(b.code)) + .sort((a, b) => a.name.localeCompare(b.name)) const withoutModels = langs .filter(({ models }) => !models || !models.length) - .sort((a, b) => a.code.localeCompare(b.code)) + .sort((a, b) => a.name.localeCompare(b.name)) const withDeps = langs.filter(({ dependencies }) => dependencies && dependencies.length) return ( <>