Merge branch 'master' into spacy.io

This commit is contained in:
Ines Montani 2020-06-16 16:38:55 +02:00
commit ec26180b8f
18 changed files with 543 additions and 111 deletions

View File

@ -6,12 +6,12 @@ spaCy is a library for advanced Natural Language Processing in Python and
Cython. It's built on the very latest research, and was designed from day one to
be used in real products. spaCy comes with
[pretrained statistical models](https://spacy.io/models) and word vectors, and
currently supports tokenization for **50+ languages**. It features
currently supports tokenization for **60+ languages**. It features
state-of-the-art speed, convolutional **neural network models** for tagging,
parsing and **named entity recognition** and easy **deep learning** integration.
It's commercial open-source software, released under the MIT license.
💫 **Version 2.2 out now!**
💫 **Version 2.3 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[![Azure Pipelines](<https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build+(3.x)>)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@ -32,7 +32,7 @@ It's commercial open-source software, released under the MIT license.
| --------------- | -------------------------------------------------------------- |
| [spaCy 101] | New to spaCy? Here's everything you need to know! |
| [Usage Guides] | How to use spaCy and its features. |
| [New in v2.2] | New features, backwards incompatibilities and migration guide. |
| [New in v2.3] | New features, backwards incompatibilities and migration guide. |
| [API Reference] | The detailed reference for spaCy's API. |
| [Models] | Download statistical language models for spaCy. |
| [Universe] | Libraries, extensions, demos, books and courses. |
@ -40,7 +40,7 @@ It's commercial open-source software, released under the MIT license.
| [Contribute] | How to contribute to the spaCy project and code base. |
[spacy 101]: https://spacy.io/usage/spacy-101
[new in v2.2]: https://spacy.io/usage/v2-2
[new in v2.3]: https://spacy.io/usage/v2-3
[usage guides]: https://spacy.io/usage/
[api reference]: https://spacy.io/api/
[models]: https://spacy.io/models
@ -113,12 +113,13 @@ of `v2.0.13`).
pip install spacy
```
To install additional data tables for lemmatization in **spaCy v2.2+** you can
run `pip install spacy[lookups]` or install
To install additional data tables for lemmatization and normalization in
**spaCy v2.2+** you can run `pip install spacy[lookups]` or install
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
separately. The lookups package is needed to create blank models with
lemmatization data, and to lemmatize in languages that don't yet come with
pretrained models and aren't powered by third-party libraries.
lemmatization data for v2.2+ plus normalization data for v2.3+, and to
lemmatize in languages that don't yet come with pretrained models and aren't
powered by third-party libraries.
When using pip it is generally recommended to install packages in a virtual
environment to avoid modifying system state:

View File

@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "2.3.0.dev1"
__version__ = "2.3.0"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@ -541,16 +541,17 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
[--prune-vectors]
```
| Argument | Type | Description |
| ------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. |
| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. |
| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. |
| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
| `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. |
| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. |
| **CREATES** | model | A spaCy model containing the vocab and vectors. |
| Argument | Type | Description |
| ----------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. |
| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. |
| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. |
| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
| `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. |
| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. |
| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. |
| `--omit-extra-lookups`, `-OEL` <Tag variant="new">2.3</Tag> | flag | Do not include any of the extra lookups tables (`cluster`/`prob`/`sentiment`) from `spacy-lookups-data` in the model. |
| **CREATES** | model | A spaCy model containing the vocab and vectors. |
## Evaluate {#evaluate new="2"}

View File

@ -171,9 +171,6 @@ struct.
| `shape` | <Abbr title="uint64_t">`attr_t`</Abbr> | Transform of the lexeme's string, to show orthographic features. |
| `prefix` | <Abbr title="uint64_t">`attr_t`</Abbr> | Length-N substring from the start of the lexeme. Defaults to `N=1`. |
| `suffix` | <Abbr title="uint64_t">`attr_t`</Abbr> | Length-N substring from the end of the lexeme. Defaults to `N=3`. |
| `cluster` | <Abbr title="uint64_t">`attr_t`</Abbr> | Brown cluster ID. |
| `prob` | `float` | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). |
| `sentiment` | `float` | A scalar value indicating positivity or negativity. |
### Lexeme.get_struct_attr {#lexeme_get_struct_attr tag="staticmethod, nogil" source="spacy/lexeme.pxd"}

View File

@ -12,17 +12,18 @@ expects true examples of a label to have the value `1.0`, and negative examples
of a label to have the value `0.0`. Labels not in the dictionary are treated as
missing – the gradient for those labels will be zero.
| Name | Type | Description |
| ----------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doc` | `Doc` | The document the annotations refer to. |
| `words` | iterable | A sequence of unicode word strings. |
| `tags` | iterable | A sequence of strings, representing tag annotations. |
| `heads` | iterable | A sequence of integers, representing syntactic head offsets. |
| `deps` | iterable | A sequence of strings, representing the syntactic relation types. |
| `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings, you can specify missing values by setting the tag to None. |
| `cats` | dict | Labels for text classification. Each key in the dictionary is a string label for the category and each value is `1.0` (positive) or `0.0` (negative). |
| `links` | dict | Labels for entity linking. A dict with `(start_char, end_char)` keys, and the values being dicts with `kb_id:value` entries, representing external KB IDs mapped to either `1.0` (positive) or `0.0` (negative). |
| **RETURNS** | `GoldParse` | The newly constructed object. |
| Name | Type | Description |
| ----------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doc` | `Doc` | The document the annotations refer to. |
| `words` | iterable | A sequence of unicode word strings. |
| `tags` | iterable | A sequence of strings, representing tag annotations. |
| `heads` | iterable | A sequence of integers, representing syntactic head offsets. |
| `deps` | iterable | A sequence of strings, representing the syntactic relation types. |
| `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings, you can specify missing values by setting the tag to None. |
| `cats` | dict | Labels for text classification. Each key in the dictionary is a string label for the category and each value is `1.0` (positive) or `0.0` (negative). |
| `links` | dict | Labels for entity linking. A dict with `(start_char, end_char)` keys, and the values being dicts with `kb_id:value` entries, representing external KB IDs mapped to either `1.0` (positive) or `0.0` (negative). |
| `make_projective` | bool | Whether to projectivize the dependency tree. Defaults to `False`. |
| **RETURNS** | `GoldParse` | The newly constructed object. |
## GoldParse.\_\_len\_\_ {#len tag="method"}
@ -42,17 +43,17 @@ Whether the provided syntactic annotations form a projective dependency tree.
## Attributes {#attributes}
| Name | Type | Description |
| ------------------------------------ | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `words` | list | The words. |
| `tags` | list | The part-of-speech tag annotations. |
| `heads` | list | The syntactic head annotations. |
| `labels` | list | The syntactic relation-type annotations. |
| `ner` | list | The named entity annotations as BILUO tags. |
| `cand_to_gold` | list | The alignment from candidate tokenization to gold tokenization. |
| `gold_to_cand` | list | The alignment from gold tokenization to candidate tokenization. |
| `cats` <Tag variant="new">2</Tag> | dict | Keys in the dictionary are string category labels with values `1.0` or `0.0`. |
| `links` <Tag variant="new">2.2</Tag> | dict | Keys in the dictionary are `(start_char, end_char)` triples, and the values are dictionaries with `kb_id:value` entries. |
| Name | Type | Description |
| ------------------------------------ | ---- | ------------------------------------------------------------------------------------------------------------------------ |
| `words` | list | The words. |
| `tags` | list | The part-of-speech tag annotations. |
| `heads` | list | The syntactic head annotations. |
| `labels` | list | The syntactic relation-type annotations. |
| `ner` | list | The named entity annotations as BILUO tags. |
| `cand_to_gold` | list | The alignment from candidate tokenization to gold tokenization. |
| `gold_to_cand` | list | The alignment from gold tokenization to candidate tokenization. |
| `cats` <Tag variant="new">2</Tag> | dict | Keys in the dictionary are string category labels with values `1.0` or `0.0`. |
| `links` <Tag variant="new">2.2</Tag> | dict | Keys in the dictionary are `(start_char, end_char)` tuples, and the values are dictionaries with `kb_id:value` entries. |
## Utilities {#util}
@ -60,7 +61,8 @@ Whether the provided syntactic annotations form a projective dependency tree.
Convert a list of Doc objects into the
[JSON-serializable format](/api/annotation#json-input) used by the
[`spacy train`](/api/cli#train) command. Each input doc will be treated as a 'paragraph' in the output doc.
[`spacy train`](/api/cli#train) command. Each input doc will be treated as a
'paragraph' in the output doc.
> #### Example
>

View File

@ -156,7 +156,7 @@ The L2 norm of the lexeme's vector representation.
| `like_url` | bool | Does the lexeme resemble a URL? |
| `like_num` | bool | Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc. |
| `like_email` | bool | Does the lexeme resemble an email address? |
| `is_oov` | bool | Is the lexeme out-of-vocabulary? |
| `is_oov` | bool | Is the lexeme out-of-vocabulary (i.e. does it not have a word vector)? |
| `is_stop` | bool | Is the lexeme part of a "stop list"? |
| `lang` | int | Language of the parent vocabulary. |
| `lang_` | unicode | Language of the parent vocabulary. |

View File

@ -40,7 +40,8 @@ string where an integer is expected) or unexpected property names.
## Matcher.\_\_call\_\_ {#call tag="method"}
Find all token sequences matching the supplied patterns on the `Doc`.
Find all token sequences matching the supplied patterns on the `Doc`. As of
spaCy v2.3, the `Matcher` can also be called on `Span` objects.
> #### Example
>
@ -54,10 +55,10 @@ Find all token sequences matching the supplied patterns on the `Doc`.
> matches = matcher(doc)
> ```
| Name | Type | Description |
| ----------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `doc` | `Doc` | The document to match over. |
| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. |
| Name | Type | Description |
| ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `doclike` | `Doc`/`Span` | The document to match over or a `Span` (as of v2.3). |
| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. |
<Infobox title="Important note" variant="warning">

View File

@ -42,7 +42,7 @@ Initialize the sentencizer.
| Name | Type | Description |
| ------------- | ------------- | ------------------------------------------------------------------------------------------------------ |
| `punct_chars` | list | Optional custom list of punctuation characters that mark sentence ends. Defaults to `[".", "!", "?"].` |
| `punct_chars` | list | Optional custom list of punctuation characters that mark sentence ends. Defaults to `['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄', '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿', '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶', '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒', '﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀', '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼', '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐', '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂', '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', '。', '。']`. |
| **RETURNS** | `Sentencizer` | The newly constructed object. |
## Sentencizer.\_\_call\_\_ {#call tag="method"}

View File

@ -459,7 +459,7 @@ The L2 norm of the token's vector representation.
| `like_url` | bool | Does the token resemble a URL? |
| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. |
| `like_email` | bool | Does the token resemble an email address? |
| `is_oov` | bool | Is the token out-of-vocabulary? |
| `is_oov` | bool | Is the token out-of-vocabulary (i.e. does it not have a word vector)? |
| `is_stop` | bool | Is the token part of a "stop list"? |
| `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). |
| `pos_` | unicode | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). |

View File

@ -27,6 +27,9 @@ Create the vocabulary.
| `tag_map` | dict | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. |
| `lemmatizer` | object | A lemmatizer. Defaults to `None`. |
| `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. |
| `lookups` | `Lookups` | A [`Lookups`](/api/lookups) that stores the `lemma_*`, `lexeme_norm` and other large lookup tables. Defaults to `None`. |
| `lookups_extra` <Tag variant="new">2.3</Tag> | `Lookups` | A [`Lookups`](/api/lookups) that stores the optional `lexeme_cluster`/`lexeme_prob`/`lexeme_sentiment`/`lexeme_settings` lookup tables. Defaults to `None`. |
| `oov_prob` | float | The default OOV probability. Defaults to `-20.0`. |
| `vectors_name` <Tag variant="new">2.2</Tag> | unicode | A name to identify the vectors table. |
| **RETURNS** | `Vocab` | The newly constructed object. |

View File

@ -297,9 +297,35 @@ though `$` and `€` are very different, spaCy normalizes them both to `$`. This
way, they'll always be seen as similar, no matter how common they were in the
training data.
Norm exceptions can be provided as a simple dictionary. For more examples, see
the English
[`norm_exceptions.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/en/norm_exceptions.py).
As of spaCy v2.3, language-specific norm exceptions are provided as a
JSON dictionary in the package
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) rather
than in the main library. For a full example, see
[`en_lexeme_norm.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_lexeme_norm.json).
```json
### Example
{
"cos": "because",
"fav": "favorite",
"accessorise": "accessorize",
"accessorised": "accessorized"
}
```
If you're adding tables for a new language, be sure to add the tables to
[`spacy_lookups_data/__init__.py`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/__init__.py)
and register the entry point under `spacy_lookups` in
[`setup.cfg`](https://github.com/explosion/spacy-lookups-data/blob/master/setup.cfg).
Alternatively, you can initialize your language [`Vocab`](/api/vocab) with a
[`Lookups`](/api/lookups) object that includes the table `lexeme_norm`.
<Accordion title="Norm exceptions in spaCy v2.0-v2.2" id="norm-exceptions-v2.2">
Previously in spaCy v2.0-v2.2, norm exceptions were provided as a simple Python
dictionary. For more examples, see the English
[`norm_exceptions.py`](https://github.com/explosion/spaCy/tree/v2.2.x/spacy/lang/en/norm_exceptions.py).
```python
### Example
@ -327,6 +353,8 @@ norm exceptions overwrite any of the global exceptions, they should be added
first. Also note that the tokenizer exceptions will always have priority over
the attribute getters.
</Accordion>
### Lexical attributes {#lex-attrs new="2"}
spaCy provides a range of [`Token` attributes](/api/token#attributes) that

View File

@ -732,7 +732,7 @@ rather than performance:
```python
def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search,
infix_finditer, token_match):
infix_finditer, token_match, url_match):
tokens = []
for substring in text.split():
suffixes = []
@ -829,7 +829,7 @@ for t in tok_exp:
### Customizing spaCy's Tokenizer class {#native-tokenizers}
Let's imagine you wanted to create a tokenizer for a new language or specific
domain. There are five things you would need to define:
domain. There are six things you may need to define:
1. A dictionary of **special cases**. This handles things like contractions,
units of measurement, emoticons, certain abbreviations, etc.
@ -840,9 +840,22 @@ domain. There are five things you would need to define:
4. A function `infixes_finditer`, to handle non-whitespace separators, such as
hyphens etc.
5. An optional boolean function `token_match` matching strings that should never
be split, overriding the infix rules. Useful for things like URLs or numbers.
be split, overriding the infix rules. Useful for things like numbers.
6. An optional boolean function `url_match`, which is similar to `token_match`
except prefixes and suffixes are removed before applying the match.
except that prefixes and suffixes are removed before applying the match.
<Infobox title="Important note: token match in spaCy v2.2" variant="warning">
In spaCy v2.2.2-v2.2.4, the `token_match` was equivalent to the `url_match`
above and there was no match pattern applied before prefixes and suffixes were
analyzed. As of spaCy v2.3.0, the `token_match` has been reverted to its
behavior in v2.2.1 and earlier with precedence over prefixes and suffixes.
The `url_match` is introduced in v2.3.0 to handle cases like URLs where the
tokenizer should remove prefixes and suffixes (e.g., a comma at the end of a
URL) before applying the match.
</Infobox>
You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is
to use `re.compile()` to build a regular expression object, and pass its
@ -865,7 +878,7 @@ def custom_tokenizer(nlp):
prefix_search=prefix_re.search,
suffix_search=suffix_re.search,
infix_finditer=infix_re.finditer,
token_match=simple_url_re.match)
url_match=simple_url_re.match)
nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = custom_tokenizer(nlp)

View File

@ -85,6 +85,123 @@ To load your model with the neutral, multi-language class, simply set
`meta.json`. You can also import the class directly, or call
[`util.get_lang_class()`](/api/top-level#util.get_lang_class) for lazy-loading.
### Chinese language support {#chinese new=2.3}
The Chinese language class supports three word segmentation options:
> ```python
> from spacy.lang.zh import Chinese
>
> # Disable jieba to use character segmentation
> Chinese.Defaults.use_jieba = False
> nlp = Chinese()
>
> # Disable jieba through tokenizer config options
> cfg = {"use_jieba": False}
> nlp = Chinese(meta={"tokenizer": {"config": cfg}})
>
> # Load with "default" model provided by pkuseg
> cfg = {"pkuseg_model": "default", "require_pkuseg": True}
> nlp = Chinese(meta={"tokenizer": {"config": cfg}})
> ```
1. **Jieba:** `Chinese` uses [Jieba](https://github.com/fxsjy/jieba) for word
segmentation by default. It's enabled when you create a new `Chinese`
language class or call `spacy.blank("zh")`.
2. **Character segmentation:** Character segmentation is supported by disabling
`jieba` and setting `Chinese.Defaults.use_jieba = False` _before_
initializing the language class. As of spaCy v2.3.0, the `meta` tokenizer
config options can be used to configure `use_jieba`.
3. **PKUSeg**: In spaCy v2.3.0, support for
[PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support
better segmentation for Chinese OntoNotes and the new
[Chinese models](/models/zh).
<Accordion title="Details on spaCy's PKUSeg API">
The `meta` argument of the `Chinese` language class supports the following
tokenizer config settings:
| Name | Type | Description |
| ------------------ | ------- | ---------------------------------------------------------------------------------------------------- |
| `pkuseg_model` | unicode | **Required:** Name of a model provided by `pkuseg` or the path to a local model directory. |
| `pkuseg_user_dict` | unicode | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. |
| `require_pkuseg` | bool | Overrides all `jieba` settings (optional but strongly recommended). |
```python
### Examples
# Load "default" model
cfg = {"pkuseg_model": "default", "require_pkuseg": True}
nlp = Chinese(meta={"tokenizer": {"config": cfg}})
# Load local model
cfg = {"pkuseg_model": "/path/to/pkuseg_model", "require_pkuseg": True}
nlp = Chinese(meta={"tokenizer": {"config": cfg}})
# Override the user dictionary
cfg = {"pkuseg_model": "default", "require_pkuseg": True, "pkuseg_user_dict": "/path"}
nlp = Chinese(meta={"tokenizer": {"config": cfg}})
```
You can also modify the user dictionary on-the-fly:
```python
# Append words to user dict
nlp.tokenizer.pkuseg_update_user_dict(["中国", "ABC"])
# Remove all words from user dict and replace with new words
nlp.tokenizer.pkuseg_update_user_dict(["中国"], reset=True)
# Remove all words from user dict
nlp.tokenizer.pkuseg_update_user_dict([], reset=True)
```
</Accordion>
<Accordion title="Details on pretrained and custom Chinese models">
The [Chinese models](/models/zh) provided by spaCy include a custom `pkuseg`
model trained only on
[Chinese OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19), since the
models provided by `pkuseg` include data restricted to research use. For
research use, `pkuseg` provides models for several different domains
(`"default"`, `"news"`, `"web"`, `"medicine"`, `"tourism"`) and for other uses,
`pkuseg` provides a simple
[training API](https://github.com/lancopku/pkuseg-python/blob/master/readme/readme_english.md#usage):
```python
import pkuseg
from spacy.lang.zh import Chinese
# Train pkuseg model
pkuseg.train("train.utf8", "test.utf8", "/path/to/pkuseg_model")
# Load pkuseg model in spaCy Chinese tokenizer
nlp = Chinese(meta={"tokenizer": {"config": {"pkuseg_model": "/path/to/pkuseg_model", "require_pkuseg": True}}})
```
</Accordion>
### Japanese language support {#japanese new=2.3}
> ```python
> from spacy.lang.ja import Japanese
>
> # Load SudachiPy with split mode A (default)
> nlp = Japanese()
>
> # Load SudachiPy with split mode B
> cfg = {"split_mode": "B"}
> nlp = Japanese(meta={"tokenizer": {"config": cfg}})
> ```
The Japanese language class uses
[SudachiPy](https://github.com/WorksApplications/SudachiPy) for word
segmentation and part-of-speech tagging. The default Japanese language class
and the provided Japanese models use SudachiPy split mode `A`.
The `meta` argument of the `Japanese` language class can be used to configure
the split mode to `A`, `B` or `C`.
## Installing and using models {#download}
> #### Downloading models in spaCy < v1.7

213
website/docs/usage/v2-3.md Normal file
View File

@ -0,0 +1,213 @@
---
title: What's New in v2.3
teaser: New features, backwards incompatibilities and migration guide
menu:
- ['New Features', 'features']
- ['Backwards Incompatibilities', 'incompat']
- ['Migrating from v2.2', 'migrating']
---
## New Features {#features hidden="true"}
spaCy v2.3 features new pretrained models for five languages, word vectors for
all language models, and decreased model size and loading times for models with
vectors. We've added pretrained models for **Chinese, Danish, Japanese, Polish
and Romanian** and updated the training data and vectors for most languages.
Model packages with vectors are about **2&times;** smaller on disk and load
**2-4&times;** faster. For the full changelog, see the [release notes on
GitHub](https://github.com/explosion/spaCy/releases/tag/v2.3.0). For more
details and a behind-the-scenes look at the new release, [see our blog
post](https://explosion.ai/blog/spacy-v2-3).
### Expanded model families with vectors {#models}
> #### Example
>
> ```bash
> python -m spacy download da_core_news_sm
> python -m spacy download ja_core_news_sm
> python -m spacy download pl_core_news_sm
> python -m spacy download ro_core_news_sm
> python -m spacy download zh_core_web_sm
> ```
With new model families for Chinese, Danish, Japanese, Polish and Romanian plus
`md` and `lg` models with word vectors for all languages, this release provides
a total of 46 model packages. For models trained using [Universal
Dependencies](https://universaldependencies.org) corpora, the training data has
been updated to UD v2.5 (v2.6 for Japanese, v2.3 for Polish) and Dutch has been
extended to include both UD Dutch Alpino and LassySmall.
<Infobox>
**Models:** [Models directory](/models) **Benchmarks:**
[Release notes](https://github.com/explosion/spaCy/releases/tag/v2.3.0)
</Infobox>
### Chinese {#chinese}
> #### Example
> ```python
> from spacy.lang.zh import Chinese
>
> # Load with "default" model provided by pkuseg
> cfg = {"pkuseg_model": "default", "require_pkuseg": True}
> nlp = Chinese(meta={"tokenizer": {"config": cfg}})
>
> # Append words to user dict
> nlp.tokenizer.pkuseg_update_user_dict(["中国", "ABC"])
> ```
This release adds support for
[pkuseg](https://github.com/lancopku/pkuseg-python) for word segmentation and
the new Chinese models ship with a custom pkuseg model trained on OntoNotes.
The Chinese tokenizer can be initialized with both `pkuseg` and custom models
and the `pkuseg` user dictionary is easy to customize.
<Infobox>
**Chinese:** [Chinese tokenizer usage](/usage/models#chinese)
</Infobox>
### Japanese {#japanese}
The updated Japanese language class switches to
[SudachiPy](https://github.com/WorksApplications/SudachiPy) for word
segmentation and part-of-speech tagging. Using `sudachipy` greatly simplifies
installing spaCy for Japanese, which is now possible with a single command:
`pip install spacy[ja]`.
<Infobox>
**Japanese:** [Japanese tokenizer usage](/usage/models#japanese)
</Infobox>
### Small CLI updates
- `spacy debug-data` provides the coverage of the vectors in a base model with
`spacy debug-data lang train dev -b base_model`
- `spacy evaluate` supports `blank:lg` (e.g. `spacy evaluate blank:en
dev.json`) to evaluate the tokenization accuracy without loading a model
- `spacy train` on GPU restricts the CPU timing evaluation to the first
iteration
## Backwards incompatibilities {#incompat}
<Infobox title="Important note on models" variant="warning">
If you've been training **your own models**, you'll need to **retrain** them
with the new version. Also don't forget to upgrade all models to the latest
versions. Models for earlier v2 releases (v2.0, v2.1, v2.2) aren't compatible
with spaCy v2.3. To check if all of your models are up to date, you can
run the [`spacy validate`](/api/cli#validate) command.
</Infobox>
> #### Install with lookups data
>
> ```bash
> $ pip install spacy[lookups]
> ```
>
> You can also install
> [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
> directly.
- If you're training new models, you'll want to install the package
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data),
which now includes both the lemmatization tables (as in v2.2) and the
normalization tables (new in v2.3). If you're using pretrained models,
**nothing changes**, because the relevant tables are included in the model
packages.
- Due to the updated Universal Dependencies training data, the fine-grained
part-of-speech tags will change for many provided language models. The
coarse-grained part-of-speech tagset remains the same, but the mapping from
particular fine-grained to coarse-grained tags may show minor differences.
- For French, Italian, Portuguese and Spanish, the fine-grained part-of-speech
tagsets contain new merged tags related to contracted forms, such as
`ADP_DET` for French `"au"`, which maps to UPOS `ADP` based on the head
`"à"`. This increases the accuracy of the models by improving the alignment
between spaCy's tokenization and Universal Dependencies multi-word tokens
used for contractions.
### Migrating from spaCy 2.2 {#migrating}
#### Tokenizer settings
In spaCy v2.2.2-v2.2.4, there was a change to the precedence of `token_match`
that gave prefixes and suffixes priority over `token_match`, which caused
problems for many custom tokenizer configurations. This has been reverted in
v2.3 so that `token_match` has priority over prefixes and suffixes as in v2.2.1
and earlier versions.
A new tokenizer setting `url_match` has been introduced in v2.3.0 to handle
cases like URLs where the tokenizer should remove prefixes and suffixes (e.g.,
a comma at the end of a URL) before applying the match. See the full [tokenizer
documentation](/usage/linguistic-features#tokenization) and try out
[`nlp.tokenizer.explain()`](/usage/linguistic-features#tokenizer-debug) when
debugging your tokenizer configuration.
#### Warnings configuration
spaCy's custom warnings have been replaced with native Python
[`warnings`](https://docs.python.org/3/library/warnings.html). Instead of
setting `SPACY_WARNING_IGNORE`, use the [warnings
filters](https://docs.python.org/3/library/warnings.html#the-warnings-filter)
to manage warnings.
#### Normalization tables
The normalization tables have moved from the language data in
[`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang) to
the package
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). If
you're adding data for a new language, the normalization table should be added
to `spacy-lookups-data`. See [adding norm
exceptions](/usage/adding-languages#norm-exceptions).
#### Probability and cluster features
> #### Load and save extra prob lookups table
>
> ```python
> from spacy.lang.en import English
> nlp = English()
> doc = nlp("the")
> print(doc[0].prob) # lazily loads extra prob table
> nlp.to_disk("/path/to/model") # includes prob table
> ```
The `Token.prob` and `Token.cluster` features, which are no longer used by the
core pipeline components as of spaCy v2, are no longer provided in the
pretrained models to reduce the model size. To keep these features available
for users relying on them, the `prob` and `cluster` features for the most
frequent 1M tokens have been moved to
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) as
`extra` features for the relevant languages (English, German, Greek and
Spanish).
The extra tables are loaded lazily, so if you have `spacy-lookups-data`
installed and your code accesses `Token.prob`, the full table is loaded into
the model vocab, which will take a few seconds on initial loading. When you
save this model after loading the `prob` table, the full `prob` table will be
saved as part of the model vocab.
If you'd like to include custom `cluster`, `prob`, or `sentiment` tables as
part of a new model, add the data to
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) under
the entry point `lg_extra` (where `lg` is the language code), e.g. `en_extra`
for English. Alternatively, you can
initialize your [`Vocab`](/api/vocab) with the `lookups_extra` argument with a
[`Lookups`](/api/lookups) object that includes the tables `lexeme_cluster`,
`lexeme_prob`, `lexeme_sentiment` or `lexeme_settings`. `lexeme_settings` is
currently only used to provide a custom `oov_prob`. See examples in the [`data`
directory](https://github.com/explosion/spacy-lookups-data/tree/master/spacy_lookups_data/data)
in `spacy-lookups-data`.
#### Initializing new models without extra lookups tables
When you initialize a new model with [`spacy init-model`](/api/cli#init-model),
the `prob` table from `spacy-lookups-data` may be loaded as part of the
initialization. If you'd like to omit this extra data as in spaCy's provided
v2.3 models, use the new flag `--omit-extra-lookups`.

View File

@ -1,5 +1,35 @@
{
"languages": [
{
"code": "zh",
"name": "Chinese",
"models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg"],
"dependencies": [
{
"name": "Jieba",
"url": "https://github.com/fxsjy/jieba"
},
{
"name": "PKUSeg",
"url": "https://github.com/lancopku/PKUSeg-python"
}
],
"has_examples": true
},
{
"code": "da",
"name": "Danish",
"example": "Dette er en sætning.",
"has_examples": true,
"models": ["da_core_news_sm", "da_core_news_md", "da_core_news_lg"]
},
{
"code": "nl",
"name": "Dutch",
"models": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"],
"example": "Dit is een zin.",
"has_examples": true
},
{
"code": "en",
"name": "English",
@ -14,68 +44,91 @@
"example": "This is a sentence.",
"has_examples": true
},
{
"code": "fr",
"name": "French",
"models": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg"],
"example": "C'est une phrase.",
"has_examples": true
},
{
"code": "de",
"name": "German",
"models": ["de_core_news_sm", "de_core_news_md"],
"models": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg"],
"starters": ["de_trf_bertbasecased_lg"],
"example": "Dies ist ein Satz.",
"has_examples": true
},
{
"code": "fr",
"name": "French",
"models": ["fr_core_news_sm", "fr_core_news_md"],
"example": "C'est une phrase.",
"has_examples": true
},
{
"code": "es",
"name": "Spanish",
"models": ["es_core_news_sm", "es_core_news_md"],
"example": "Esto es una frase.",
"has_examples": true
},
{
"code": "pt",
"name": "Portuguese",
"models": ["pt_core_news_sm"],
"example": "Esta é uma frase.",
"code": "el",
"name": "Greek",
"models": ["el_core_news_sm", "el_core_news_md", "el_core_news_lg"],
"example": "Αυτή είναι μια πρόταση.",
"has_examples": true
},
{
"code": "it",
"name": "Italian",
"models": ["it_core_news_sm"],
"models": ["it_core_news_sm", "it_core_news_md", "it_core_news_lg"],
"example": "Questa è una frase.",
"has_examples": true
},
{
"code": "nl",
"name": "Dutch",
"models": ["nl_core_news_sm"],
"example": "Dit is een zin.",
"code": "ja",
"name": "Japanese",
"models": ["ja_core_news_sm", "ja_core_news_md", "ja_core_news_lg"],
"dependencies": [
{
"name": "SudachiPy",
"url": "https://github.com/WorksApplications/SudachiPy"
}
],
"has_examples": true
},
{
"code": "el",
"name": "Greek",
"models": ["el_core_news_sm", "el_core_news_md"],
"example": "Αυτή είναι μια πρόταση.",
"has_examples": true
"code": "lt",
"name": "Lithuanian",
"has_examples": true,
"models": ["lt_core_news_sm", "lt_core_news_md", "lt_core_news_lg"]
},
{ "code": "sv", "name": "Swedish", "has_examples": true },
{ "code": "fi", "name": "Finnish", "has_examples": true },
{
"code": "nb",
"name": "Norwegian Bokmål",
"example": "Dette er en setning.",
"has_examples": true,
"models": ["nb_core_news_sm"]
"models": ["nb_core_news_sm", "nb_core_news_md", "nb_core_news_lg"]
},
{ "code": "da", "name": "Danish", "example": "Dette er en sætning.", "has_examples": true },
{
"code": "pl",
"name": "Polish",
"example": "To jest zdanie.",
"has_examples": true,
"models": ["pl_core_news_sm", "pl_core_news_md", "pl_core_news_lg"]
},
{
"code": "pt",
"name": "Portuguese",
"models": ["pt_core_news_sm", "pt_core_news_md", "pt_core_news_lg"],
"example": "Esta é uma frase.",
"has_examples": true
},
{
"code": "ro",
"name": "Romanian",
"example": "Aceasta este o propoziție.",
"has_examples": true,
"models": ["ro_core_news_sm", "ro_core_news_md", "ro_core_news_lg"]
},
{
"code": "es",
"name": "Spanish",
"models": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg"],
"example": "Esto es una frase.",
"has_examples": true
},
{ "code": "sv", "name": "Swedish", "has_examples": true },
{ "code": "fi", "name": "Finnish", "has_examples": true },
{ "code": "hu", "name": "Hungarian", "example": "Ez egy mondat.", "has_examples": true },
{ "code": "pl", "name": "Polish", "example": "To jest zdanie.", "has_examples": true },
{
"code": "ru",
"name": "Russian",
@ -88,12 +141,6 @@
"has_examples": true,
"dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }]
},
{
"code": "ro",
"name": "Romanian",
"example": "Aceasta este o propoziție.",
"has_examples": true
},
{ "code": "hr", "name": "Croatian", "has_examples": true },
{ "code": "eu", "name": "Basque", "has_examples": true },
{ "code": "yo", "name": "Yoruba", "has_examples": true },
@ -123,7 +170,6 @@
{ "code": "bg", "name": "Bulgarian", "example": "Това е изречение", "has_examples": true },
{ "code": "cs", "name": "Czech" },
{ "code": "is", "name": "Icelandic" },
{ "code": "lt", "name": "Lithuanian", "has_examples": true, "models": ["lt_core_news_sm"] },
{ "code": "lv", "name": "Latvian" },
{ "code": "sr", "name": "Serbian" },
{ "code": "sk", "name": "Slovak" },
@ -145,12 +191,6 @@
"example": "นี่คือประโยค",
"has_examples": true
},
{
"code": "zh",
"name": "Chinese",
"dependencies": [{ "name": "Jieba", "url": "https://github.com/fxsjy/jieba" }],
"has_examples": true
},
{
"code": "ja",
"name": "Japanese",
@ -187,6 +227,21 @@
"example": "Sta chì a l'é unna fraxe.",
"has_examples": true
},
{
"code": "hy",
"name": "Armenian",
"has_examples": true
},
{
"code": "gu",
"name": "Gujarati",
"has_examples": true
},
{
"code": "ml",
"name": "Malayalam",
"has_examples": true
},
{
"code": "xx",
"name": "Multi-language",

View File

@ -9,6 +9,7 @@
{ "text": "Models & Languages", "url": "/usage/models" },
{ "text": "Facts & Figures", "url": "/usage/facts-figures" },
{ "text": "spaCy 101", "url": "/usage/spacy-101" },
{ "text": "New in v2.3", "url": "/usage/v2-3" },
{ "text": "New in v2.2", "url": "/usage/v2-2" },
{ "text": "New in v2.1", "url": "/usage/v2-1" },
{ "text": "New in v2.0", "url": "/usage/v2" }

View File

@ -124,7 +124,7 @@ const Landing = ({ data }) => {
{counts.modelLangs} languages
</Li>
<Li>
pretrained <strong>word vectors</strong>
Pretrained <strong>word vectors</strong>
</Li>
<Li>State-of-the-art speed</Li>
<Li>

View File

@ -38,10 +38,10 @@ const Languages = () => (
const langs = site.siteMetadata.languages
const withModels = langs
.filter(({ models }) => models && !!models.length)
.sort((a, b) => a.code.localeCompare(b.code))
.sort((a, b) => a.name.localeCompare(b.name))
const withoutModels = langs
.filter(({ models }) => !models || !models.length)
.sort((a, b) => a.code.localeCompare(b.code))
.sort((a, b) => a.name.localeCompare(b.name))
const withDeps = langs.filter(({ dependencies }) => dependencies && dependencies.length)
return (
<>