Fix code branch for v2.x site [ci skip]

This commit is contained in:
Ines Montani 2021-02-01 11:48:35 +11:00
parent 6daf2381fa
commit c70e6ee72d
21 changed files with 145 additions and 144 deletions

View File

@ -250,7 +250,7 @@ POS tag set.
<Infobox title="Annotation schemes for other models"> <Infobox title="Annotation schemes for other models">
For the label schemes used by the other models, see the respective `tag_map.py` For the label schemes used by the other models, see the respective `tag_map.py`
in [`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang). in [`spacy/lang`](https://github.com/explosion/spaCy/tree/v2.x/spacy/lang).
</Infobox> </Infobox>
@ -564,7 +564,7 @@ Here's an example of dependencies, part-of-speech tags and named entities, taken
from the English Wall Street Journal portion of the Penn Treebank: from the English Wall Street Journal portion of the Penn Treebank:
```json ```json
https://github.com/explosion/spaCy/tree/master/examples/training/training-data.json https://github.com/explosion/spaCy/tree/v2.x/examples/training/training-data.json
``` ```
### Lexical data for vocabulary {#vocab-jsonl new="2"} ### Lexical data for vocabulary {#vocab-jsonl new="2"}
@ -619,5 +619,5 @@ data.
Here's an example of the 20 most frequent lexemes in the English training data: Here's an example of the 20 most frequent lexemes in the English training data:
```json ```json
https://github.com/explosion/spaCy/tree/master/examples/training/vocab-data.jsonl https://github.com/explosion/spaCy/tree/v2.x/examples/training/vocab-data.jsonl
``` ```

View File

@ -166,13 +166,13 @@ All output files generated by this command are compatible with
### Converter options ### Converter options
| ID | Description | | ID | Description |
| ------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `auto` | Automatically pick converter based on file extension and file content (default). | | `auto` | Automatically pick converter based on file extension and file content (default). |
| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. | | `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. |
| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | | `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/v2.x/examples/training/ner_example_data). |
| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | | `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/v2.x/examples/training/ner_example_data). |
| `jsonl` | NER data formatted as JSONL with one dict per line and a `"text"` and `"spans"` key. This is also the format exported by the [Prodigy](https://prodi.gy) annotation tool. See [sample data](https://raw.githubusercontent.com/explosion/projects/master/ner-fashion-brands/fashion_brands_training.jsonl). | | `jsonl` | NER data formatted as JSONL with one dict per line and a `"text"` and `"spans"` key. This is also the format exported by the [Prodigy](https://prodi.gy) annotation tool. See [sample data](https://raw.githubusercontent.com/explosion/projects/master/ner-fashion-brands/fashion_brands_training.jsonl). |
## Debug data {#debug-data new="2.2"} ## Debug data {#debug-data new="2.2"}
@ -473,7 +473,7 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
| `--use-chars`, `-chr` <Tag variant="new">2.2.2</Tag> | flag | Whether to use character-based embedding. | | `--use-chars`, `-chr` <Tag variant="new">2.2.2</Tag> | flag | Whether to use character-based embedding. |
| `--sa-depth`, `-sa` <Tag variant="new">2.2.2</Tag> | option | Depth of self-attention layers. | | `--sa-depth`, `-sa` <Tag variant="new">2.2.2</Tag> | option | Depth of self-attention layers. |
| `--embed-rows`, `-er` | option | Number of embedding rows. | | `--embed-rows`, `-er` | option | Number of embedding rows. |
| `--loss-func`, `-L` | option | Loss function to use for the objective. Either `"cosine"`, `"L2"` or `"characters"`. | | `--loss-func`, `-L` | option | Loss function to use for the objective. Either `"cosine"`, `"L2"` or `"characters"`. |
| `--dropout`, `-d` | option | Dropout rate. | | `--dropout`, `-d` | option | Dropout rate. |
| `--batch-size`, `-bs` | option | Number of words per training batch. | | `--batch-size`, `-bs` | option | Number of words per training batch. |
| `--max-length`, `-xw` | option | Maximum words per example. Longer examples are discarded. | | `--max-length`, `-xw` | option | Maximum words per example. Longer examples are discarded. |

View File

@ -23,12 +23,12 @@ abruptly.
With Cython there are four ways of declaring complex data types. Unfortunately With Cython there are four ways of declaring complex data types. Unfortunately
we use all four in different places, as they all have different utility: we use all four in different places, as they all have different utility:
| Declaration | Description | Example | | Declaration | Description | Example |
| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------- | | --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------- |
| `class` | A normal Python class. | [`Language`](/api/language) | | `class` | A normal Python class. | [`Language`](/api/language) |
| `cdef class` | A Python extension type. Differs from a normal Python class in that its attributes can be defined on the underlying struct. Can have C-level objects as attributes (notably structs and pointers), and can have methods which have C-level objects as arguments or return types. | [`Lexeme`](/api/cython-classes#lexeme) | | `cdef class` | A Python extension type. Differs from a normal Python class in that its attributes can be defined on the underlying struct. Can have C-level objects as attributes (notably structs and pointers), and can have methods which have C-level objects as arguments or return types. | [`Lexeme`](/api/cython-classes#lexeme) |
| `cdef struct` | A struct is just a collection of variables, sort of like a named tuple, except the memory is contiguous. Structs can't have methods, only attributes. | [`LexemeC`](/api/cython-structs#lexemec) | | `cdef struct` | A struct is just a collection of variables, sort of like a named tuple, except the memory is contiguous. Structs can't have methods, only attributes. | [`LexemeC`](/api/cython-structs#lexemec) |
| `cdef cppclass` | A C++ class. Like a struct, this can be allocated on the stack, but can have methods, a constructor and a destructor. Differs from `cdef class` in that it can be created and destroyed without acquiring the Python global interpreter lock. This style is the most obscure. | [`StateC`](https://github.com/explosion/spaCy/tree/master/spacy/syntax/_state.pxd) | | `cdef cppclass` | A C++ class. Like a struct, this can be allocated on the stack, but can have methods, a constructor and a destructor. Differs from `cdef class` in that it can be created and destroyed without acquiring the Python global interpreter lock. This style is the most obscure. | [`StateC`](https://github.com/explosion/spaCy/tree/v2.x/spacy/syntax/_state.pxd) |
The most important classes in spaCy are defined as `cdef class` objects. The The most important classes in spaCy are defined as `cdef class` objects. The
underlying data for these objects is usually gathered into a struct, which is underlying data for these objects is usually gathered into a struct, which is

View File

@ -14,7 +14,7 @@ Create a `GoldCorpus`. If the input data is an iterable, each item should be a
`(text, paragraphs)` tuple, where each paragraph is a tuple `(text, paragraphs)` tuple, where each paragraph is a tuple
`(sentences, brackets)`, and each sentence is a tuple `(sentences, brackets)`, and each sentence is a tuple
`(ids, words, tags, heads, ner)`. See the implementation of `(ids, words, tags, heads, ner)`. See the implementation of
[`gold.read_json_file`](https://github.com/explosion/spaCy/tree/master/spacy/gold.pyx) [`gold.read_json_file`](https://github.com/explosion/spaCy/tree/v2.x/spacy/gold.pyx)
for further details. for further details.
| Name | Type | Description | | Name | Type | Description |

View File

@ -107,7 +107,7 @@ meta data as a dictionary instead, you can use the `meta` attribute on your
Get a description for a given POS tag, dependency label or entity type. For a Get a description for a given POS tag, dependency label or entity type. For a
list of available terms, see list of available terms, see
[`glossary.py`](https://github.com/explosion/spaCy/tree/master/spacy/glossary.py). [`glossary.py`](https://github.com/explosion/spaCy/tree/v2.x/spacy/glossary.py).
> #### Example > #### Example
> >
@ -279,7 +279,7 @@ to add custom labels and their colors automatically.
## Utility functions {#util source="spacy/util.py"} ## Utility functions {#util source="spacy/util.py"}
spaCy comes with a small collection of utility functions located in spaCy comes with a small collection of utility functions located in
[`spacy/util.py`](https://github.com/explosion/spaCy/tree/master/spacy/util.py). [`spacy/util.py`](https://github.com/explosion/spaCy/tree/v2.x/spacy/util.py).
Because utility functions are mostly intended for **internal use within spaCy**, Because utility functions are mostly intended for **internal use within spaCy**,
their behavior may change with future releases. The functions documented on this their behavior may change with future releases. The functions documented on this
page should be safe to use and we'll try to ensure backwards compatibility. page should be safe to use and we'll try to ensure backwards compatibility.
@ -538,10 +538,10 @@ Compile a sequence of prefix rules into a regex object.
> nlp.tokenizer.prefix_search = prefix_regex.search > nlp.tokenizer.prefix_search = prefix_regex.search
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | tuple | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). | | `entries` | tuple | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](https://github.com/explosion/spaCy/tree/v2.x/spacy/lang/punctuation.py). |
| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). | | **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). |
### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"} ### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"}
@ -555,10 +555,10 @@ Compile a sequence of suffix rules into a regex object.
> nlp.tokenizer.suffix_search = suffix_regex.search > nlp.tokenizer.suffix_search = suffix_regex.search
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | tuple | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). | | `entries` | tuple | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](https://github.com/explosion/spaCy/tree/v2.x/spacy/lang/punctuation.py). |
| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). | | **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). |
### util.compile_infix_regex {#util.compile_infix_regex tag="function"} ### util.compile_infix_regex {#util.compile_infix_regex tag="function"}
@ -572,10 +572,10 @@ Compile a sequence of infix rules into a regex object.
> nlp.tokenizer.infix_finditer = infix_regex.finditer > nlp.tokenizer.infix_finditer = infix_regex.finditer
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | tuple | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). | | `entries` | tuple | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](https://github.com/explosion/spaCy/tree/v2.x/spacy/lang/punctuation.py). |
| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). | | **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). |
### util.minibatch {#util.minibatch tag="function" new="2"} ### util.minibatch {#util.minibatch tag="function" new="2"}

View File

@ -2,7 +2,7 @@ Every language is different and usually full of **exceptions and special
cases**, especially amongst the most common words. Some of these exceptions are cases**, especially amongst the most common words. Some of these exceptions are
shared across languages, while others are **entirely specific** – usually so shared across languages, while others are **entirely specific** – usually so
specific that they need to be hard-coded. The specific that they need to be hard-coded. The
[`lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang) module [`lang`](https://github.com/explosion/spacy/tree/v2.x/spacy/lang) module
contains all language-specific data, organized in simple Python files. This contains all language-specific data, organized in simple Python files. This
makes the data easy to update and extend. makes the data easy to update and extend.
@ -39,21 +39,21 @@ together all components and creating the `Language` subclass – for example,
| **Lemmatizer**<br />[`spacy-lookups-data`][spacy-lookups-data] | Lemmatization rules or a lookup-based lemmatization table to assign base forms, for example "be" for "was". | | **Lemmatizer**<br />[`spacy-lookups-data`][spacy-lookups-data] | Lemmatization rules or a lookup-based lemmatization table to assign base forms, for example "be" for "was". |
[stop_words.py]: [stop_words.py]:
https://github.com/explosion/spaCy/tree/master/spacy/lang/en/stop_words.py https://github.com/explosion/spacy/tree/v2.x/spacy/lang/en/stop_words.py
[tokenizer_exceptions.py]: [tokenizer_exceptions.py]:
https://github.com/explosion/spaCy/tree/master/spacy/lang/de/tokenizer_exceptions.py https://github.com/explosion/spacy/tree/v2.x/spacy/lang/de/tokenizer_exceptions.py
[norm_exceptions.py]: [norm_exceptions.py]:
https://github.com/explosion/spaCy/tree/master/spacy/lang/norm_exceptions.py https://github.com/explosion/spacy/tree/v2.x/spacy/lang/norm_exceptions.py
[punctuation.py]: [punctuation.py]:
https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py https://github.com/explosion/spacy/tree/v2.x/spacy/lang/punctuation.py
[char_classes.py]: [char_classes.py]:
https://github.com/explosion/spaCy/tree/master/spacy/lang/char_classes.py https://github.com/explosion/spacy/tree/v2.x/spacy/lang/char_classes.py
[lex_attrs.py]: [lex_attrs.py]:
https://github.com/explosion/spaCy/tree/master/spacy/lang/en/lex_attrs.py https://github.com/explosion/spacy/tree/v2.x/spacy/lang/en/lex_attrs.py
[syntax_iterators.py]: [syntax_iterators.py]:
https://github.com/explosion/spaCy/tree/master/spacy/lang/en/syntax_iterators.py https://github.com/explosion/spacy/tree/v2.x/spacy/lang/en/syntax_iterators.py
[tag_map.py]: [tag_map.py]:
https://github.com/explosion/spaCy/tree/master/spacy/lang/en/tag_map.py https://github.com/explosion/spacy/tree/v2.x/spacy/lang/en/tag_map.py
[morph_rules.py]: [morph_rules.py]:
https://github.com/explosion/spaCy/tree/master/spacy/lang/en/morph_rules.py https://github.com/explosion/spacy/tree/v2.x/spacy/lang/en/morph_rules.py
[spacy-lookups-data]: https://github.com/explosion/spacy-lookups-data [spacy-lookups-data]: https://github.com/explosion/spacy-lookups-data

View File

@ -15,8 +15,8 @@ the specific workflows for each component.
> >
> To add a new language to spaCy, you'll need to **modify the library's code**. > To add a new language to spaCy, you'll need to **modify the library's code**.
> The easiest way to do this is to clone the > The easiest way to do this is to clone the
> [repository](https://github.com/explosion/spaCy/tree/master/) and **build > [repository](https://github.com/explosion/spacy/tree/v2.x/) and **build spaCy
> spaCy from source**. For more information on this, see the > from source**. For more information on this, see the
> [installation guide](/usage). Unlike spaCy's core, which is mostly written in > [installation guide](/usage). Unlike spaCy's core, which is mostly written in
> Cython, all language data is stored in regular Python files. This means that > Cython, all language data is stored in regular Python files. This means that
> you won't have to rebuild anything in between – you can simply make edits and > you won't have to rebuild anything in between – you can simply make edits and
@ -88,7 +88,7 @@ language and training a language model.
> #### Should I ever update the global data? > #### Should I ever update the global data?
> >
> Reusable language data is collected as atomic pieces in the root of the > Reusable language data is collected as atomic pieces in the root of the
> [`spacy.lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang) > [`spacy.lang`](https://github.com/explosion/spacy/tree/v2.x/spacy/lang)
> module. Often, when a new language is added, you'll find a pattern or symbol > module. Often, when a new language is added, you'll find a pattern or symbol
> that's missing. Even if it isn't common in other languages, it might be best > that's missing. Even if it isn't common in other languages, it might be best
> to add it to the shared language data, unless it has some conflicting > to add it to the shared language data, unless it has some conflicting
@ -102,7 +102,7 @@ In order for the tokenizer to split suffixes, prefixes and infixes, spaCy needs
to know the language's character set. If the language you're adding uses to know the language's character set. If the language you're adding uses
non-latin characters, you might need to define the required character classes in non-latin characters, you might need to define the required character classes in
the global the global
[`char_classes.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/char_classes.py). [`char_classes.py`](https://github.com/explosion/spacy/tree/v2.x/spacy/lang/char_classes.py).
For efficiency, spaCy uses hard-coded unicode ranges to define character For efficiency, spaCy uses hard-coded unicode ranges to define character
classes, the definitions of which can be found on classes, the definitions of which can be found on
[Wikipedia](https://en.wikipedia.org/wiki/Unicode_block). If the language [Wikipedia](https://en.wikipedia.org/wiki/Unicode_block). If the language
@ -120,7 +120,7 @@ code and resources specific to Spanish are placed into a directory
`spacy/lang/es`, which can be imported as `spacy.lang.es`. `spacy/lang/es`, which can be imported as `spacy.lang.es`.
To get started, you can check out the To get started, you can check out the
[existing languages](https://github.com/explosion/spacy/tree/master/spacy/lang). [existing languages](https://github.com/explosion/spacy/tree/v2.x/spacy/lang).
Here's what the class could look like: Here's what the class could look like:
```python ```python
@ -291,14 +291,14 @@ weren't common in the training data, but are equivalent to other words – for
example, "realise" and "realize", or "thx" and "thanks". example, "realise" and "realize", or "thx" and "thanks".
Similarly, spaCy also includes Similarly, spaCy also includes
[global base norms](https://github.com/explosion/spaCy/tree/master/spacy/lang/norm_exceptions.py) [global base norms](https://github.com/explosion/spacy/tree/v2.x/spacy/lang/norm_exceptions.py)
for normalizing different styles of quotation marks and currency symbols. Even for normalizing different styles of quotation marks and currency symbols. Even
though `$` and `€` are very different, spaCy normalizes them both to `$`. This though `$` and `€` are very different, spaCy normalizes them both to `$`. This
way, they'll always be seen as similar, no matter how common they were in the way, they'll always be seen as similar, no matter how common they were in the
training data. training data.
As of spaCy v2.3, language-specific norm exceptions are provided as a As of spaCy v2.3, language-specific norm exceptions are provided as a JSON
JSON dictionary in the package dictionary in the package
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) rather [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) rather
than in the main library. For a full example, see than in the main library. For a full example, see
[`en_lexeme_norm.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_lexeme_norm.json). [`en_lexeme_norm.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_lexeme_norm.json).
@ -378,7 +378,7 @@ number words), requires some customization.
> of possible number words). > of possible number words).
Here's an example from the English Here's an example from the English
[`lex_attrs.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/en/lex_attrs.py): [`lex_attrs.py`](https://github.com/explosion/spacy/tree/v2.x/spacy/lang/en/lex_attrs.py):
```python ```python
### lex_attrs.py ### lex_attrs.py
@ -430,17 +430,17 @@ iterators:
> assert chunks[1].text == "another phrase" > assert chunks[1].text == "another phrase"
> ``` > ```
| Language | Code | Source | | Language | Code | Source |
| ---------------- | ---- | ----------------------------------------------------------------------------------------------------------------- | | ---------------- | ---- | --------------------------------------------------------------------------------------------------------------- |
| English | `en` | [`lang/en/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/en/syntax_iterators.py) | | English | `en` | [`lang/en/syntax_iterators.py`](https://github.com/explosion/spacy/tree/v2.x/spacy/lang/en/syntax_iterators.py) |
| German | `de` | [`lang/de/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/de/syntax_iterators.py) | | German | `de` | [`lang/de/syntax_iterators.py`](https://github.com/explosion/spacy/tree/v2.x/spacy/lang/de/syntax_iterators.py) |
| French | `fr` | [`lang/fr/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/fr/syntax_iterators.py) | | French | `fr` | [`lang/fr/syntax_iterators.py`](https://github.com/explosion/spacy/tree/v2.x/spacy/lang/fr/syntax_iterators.py) |
| Spanish | `es` | [`lang/es/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/es/syntax_iterators.py) | | Spanish | `es` | [`lang/es/syntax_iterators.py`](https://github.com/explosion/spacy/tree/v2.x/spacy/lang/es/syntax_iterators.py) |
| Greek | `el` | [`lang/el/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/el/syntax_iterators.py) | | Greek | `el` | [`lang/el/syntax_iterators.py`](https://github.com/explosion/spacy/tree/v2.x/spacy/lang/el/syntax_iterators.py) |
| Norwegian Bokmål | `nb` | [`lang/nb/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/nb/syntax_iterators.py) | | Norwegian Bokmål | `nb` | [`lang/nb/syntax_iterators.py`](https://github.com/explosion/spacy/tree/v2.x/spacy/lang/nb/syntax_iterators.py) |
| Swedish | `sv` | [`lang/sv/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/sv/syntax_iterators.py) | | Swedish | `sv` | [`lang/sv/syntax_iterators.py`](https://github.com/explosion/spacy/tree/v2.x/spacy/lang/sv/syntax_iterators.py) |
| Indonesian | `id` | [`lang/id/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/id/syntax_iterators.py) | | Indonesian | `id` | [`lang/id/syntax_iterators.py`](https://github.com/explosion/spacy/tree/v2.x/spacy/lang/id/syntax_iterators.py) |
| Persian | `fa` | [`lang/fa/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/fa/syntax_iterators.py) | | Persian | `fa` | [`lang/fa/syntax_iterators.py`](https://github.com/explosion/spacy/tree/v2.x/spacy/lang/fa/syntax_iterators.py) |
### Lemmatizer {#lemmatizer new="2"} ### Lemmatizer {#lemmatizer new="2"}
@ -561,7 +561,7 @@ be causing regressions.
spaCy uses the [pytest framework](https://docs.pytest.org/en/latest/) for spaCy uses the [pytest framework](https://docs.pytest.org/en/latest/) for
testing. For more details on how the tests are structured and best practices for testing. For more details on how the tests are structured and best practices for
writing your own tests, see our writing your own tests, see our
[tests documentation](https://github.com/explosion/spaCy/tree/master/spacy/tests). [tests documentation](https://github.com/explosion/spacy/tree/v2.x/spacy/tests).
</Infobox> </Infobox>
@ -569,10 +569,10 @@ writing your own tests, see our
It's recommended to always add at least some tests with examples specific to the It's recommended to always add at least some tests with examples specific to the
language. Language tests should be located in language. Language tests should be located in
[`tests/lang`](https://github.com/explosion/spaCy/tree/master/spacy/tests/lang) [`tests/lang`](https://github.com/explosion/spacy/tree/v2.x/spacy/tests/lang) in
in a directory named after the language ID. You'll also need to create a fixture a directory named after the language ID. You'll also need to create a fixture
for your tokenizer in the for your tokenizer in the
[`conftest.py`](https://github.com/explosion/spaCy/tree/master/spacy/tests/conftest.py). [`conftest.py`](https://github.com/explosion/spacy/tree/v2.x/spacy/tests/conftest.py).
Always use the [`get_lang_class`](/api/top-level#util.get_lang_class) helper Always use the [`get_lang_class`](/api/top-level#util.get_lang_class) helper
function within the fixture, instead of importing the class at the top of the function within the fixture, instead of importing the class at the top of the
file. This will load the language data only when it's needed. (Otherwise, _all file. This will load the language data only when it's needed. (Otherwise, _all
@ -585,7 +585,7 @@ def en_tokenizer():
``` ```
When adding test cases, always When adding test cases, always
[`parametrize`](https://github.com/explosion/spaCy/tree/master/spacy/tests#parameters) [`parametrize`](https://github.com/explosion/spacy/tree/v2.x/spacy/tests#parameters)
them – this will make it easier for others to add more test cases without having them – this will make it easier for others to add more test cases without having
to modify the test itself. You can also add parameter tuples, for example, a to modify the test itself. You can also add parameter tuples, for example, a
test sentence and its expected length, or a list of expected tokens. Here's an test sentence and its expected length, or a list of expected tokens. Here's an
@ -630,13 +630,13 @@ of using deep learning for NLP with limited labeled data. The vectors are also
useful by themselves – they power the `.similarity` methods in spaCy. For best useful by themselves – they power the `.similarity` methods in spaCy. For best
results, you should pre-process the text with spaCy before training the Word2vec results, you should pre-process the text with spaCy before training the Word2vec
model. This ensures your tokenization will match. You can use our model. This ensures your tokenization will match. You can use our
[word vectors training script](https://github.com/explosion/spacy/tree/master/bin/train_word_vectors.py), [word vectors training script](https://github.com/explosion/spacy/tree/v2.x/bin/train_word_vectors.py),
which pre-processes the text with your language-specific tokenizer and trains which pre-processes the text with your language-specific tokenizer and trains
the model using [Gensim](https://radimrehurek.com/gensim/). The `vectors.bin` the model using [Gensim](https://radimrehurek.com/gensim/). The `vectors.bin`
file should consist of one word and vector per line. file should consist of one word and vector per line.
```python ```python
https://github.com/explosion/spacy/tree/master/bin/train_word_vectors.py https://github.com/explosion/spacy/tree/v2.x/bin/train_word_vectors.py
``` ```
If you don't have a large sample of text available, you can also convert word If you don't have a large sample of text available, you can also convert word

View File

@ -17,7 +17,7 @@ This example shows how to use the new [`PhraseMatcher`](/api/phrasematcher) to
efficiently find entities from a large terminology list. efficiently find entities from a large terminology list.
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/information_extraction/phrase_matcher.py https://github.com/explosion/spacy/tree/v2.x/examples/information_extraction/phrase_matcher.py
``` ```
### Extracting entity relations {#entity-relations} ### Extracting entity relations {#entity-relations}
@ -29,7 +29,7 @@ tree to find the noun phrase they are referring to – for example:
`"$9.4 million"` → `"Net income"`. `"$9.4 million"` → `"Net income"`.
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/information_extraction/entity_relations.py https://github.com/explosion/spacy/tree/v2.x/examples/information_extraction/entity_relations.py
``` ```
### Navigating the parse tree and subtrees {#subtrees} ### Navigating the parse tree and subtrees {#subtrees}
@ -38,7 +38,7 @@ This example shows how to navigate the parse tree including subtrees attached to
a word. a word.
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/information_extraction/parse_subtrees.py https://github.com/explosion/spacy/tree/v2.x/examples/information_extraction/parse_subtrees.py
``` ```
## Pipeline {#pipeline hidden="true"} ## Pipeline {#pipeline hidden="true"}
@ -51,7 +51,7 @@ entities into one token and sets custom attributes on the `Doc`, `Span` and
`Token`. `Token`.
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/pipeline/custom_component_entities.py https://github.com/explosion/spacy/tree/v2.x/examples/pipeline/custom_component_entities.py
``` ```
### Custom pipeline components and attribute extensions via a REST API {#custom-components-api new="2"} ### Custom pipeline components and attribute extensions via a REST API {#custom-components-api new="2"}
@ -63,7 +63,7 @@ attributes on the `Doc`, `Span` and `Token` – for example, the capital,
latitude/longitude coordinates and the country flag. latitude/longitude coordinates and the country flag.
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/pipeline/custom_component_countries_api.py https://github.com/explosion/spacy/tree/v2.x/examples/pipeline/custom_component_countries_api.py
``` ```
### Custom method extensions {#custom-components-attr-methods new="2"} ### Custom method extensions {#custom-components-attr-methods new="2"}
@ -72,7 +72,7 @@ A collection of snippets showing examples of extensions adding custom methods to
the `Doc`, `Token` and `Span`. the `Doc`, `Token` and `Span`.
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/pipeline/custom_attr_methods.py https://github.com/explosion/spacy/tree/v2.x/examples/pipeline/custom_attr_methods.py
``` ```
### Multi-processing with Joblib {#multi-processing} ### Multi-processing with Joblib {#multi-processing}
@ -85,7 +85,7 @@ IMDB movie reviews dataset and will be loaded automatically via Thinc's built-in
dataset loader. dataset loader.
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/pipeline/multi_processing.py https://github.com/explosion/spacy/tree/v2.x/examples/pipeline/multi_processing.py
``` ```
## Training {#training hidden="true"} ## Training {#training hidden="true"}
@ -93,11 +93,11 @@ https://github.com/explosion/spaCy/tree/master/examples/pipeline/multi_processin
### Training spaCy's Named Entity Recognizer {#training-ner} ### Training spaCy's Named Entity Recognizer {#training-ner}
This example shows how to update spaCy's entity recognizer with your own This example shows how to update spaCy's entity recognizer with your own
examples, starting off with an existing, pretrained model, or from scratch examples, starting off with an existing, pretrained model, or from scratch using
using a blank `Language` class. a blank `Language` class.
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/training/train_ner.py https://github.com/explosion/spacy/tree/v2.x/examples/training/train_ner.py
``` ```
### Training an additional entity type {#new-entity-type} ### Training an additional entity type {#new-entity-type}
@ -108,28 +108,28 @@ examples. In practice, you'll need many more — a few hundred would be a good
start. start.
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/training/train_new_entity_type.py https://github.com/explosion/spacy/tree/v2.x/examples/training/train_new_entity_type.py
``` ```
### Creating a Knowledge Base for Named Entity Linking {#kb} ### Creating a Knowledge Base for Named Entity Linking {#kb}
This example shows how to create a knowledge base in spaCy, This example shows how to create a knowledge base in spaCy, which is needed to
which is needed to implement entity linking functionality. implement entity linking functionality. It requires as input a spaCy model with
It requires as input a spaCy model with pretrained word vectors, pretrained word vectors, and it stores the KB to file (if an `output_dir` is
and it stores the KB to file (if an `output_dir` is provided). provided).
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/training/create_kb.py https://github.com/explosion/spacy/tree/v2.x/examples/training/create_kb.py
``` ```
### Training spaCy's Named Entity Linker {#nel} ### Training spaCy's Named Entity Linker {#nel}
This example shows how to train spaCy's entity linker with your own custom This example shows how to train spaCy's entity linker with your own custom
examples, starting off with a predefined knowledge base and its vocab, examples, starting off with a predefined knowledge base and its vocab, and using
and using a blank `English` class. a blank `English` class.
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/training/train_entity_linker.py https://github.com/explosion/spacy/tree/v2.x/examples/training/train_entity_linker.py
``` ```
### Training spaCy's Dependency Parser {#parser} ### Training spaCy's Dependency Parser {#parser}
@ -138,7 +138,7 @@ This example shows how to update spaCy's dependency parser, starting off with an
existing, pretrained model, or from scratch using a blank `Language` class. existing, pretrained model, or from scratch using a blank `Language` class.
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/training/train_parser.py https://github.com/explosion/spacy/tree/v2.x/examples/training/train_parser.py
``` ```
### Training spaCy's Part-of-speech Tagger {#tagger} ### Training spaCy's Part-of-speech Tagger {#tagger}
@ -148,7 +148,7 @@ map, mapping our own tags to the mapping those tags to the
[Universal Dependencies scheme](http://universaldependencies.github.io/docs/u/pos/index.html). [Universal Dependencies scheme](http://universaldependencies.github.io/docs/u/pos/index.html).
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/training/train_tagger.py https://github.com/explosion/spacy/tree/v2.x/examples/training/train_tagger.py
``` ```
### Training a custom parser for chat intent semantics {#intent-parser} ### Training a custom parser for chat intent semantics {#intent-parser}
@ -162,7 +162,7 @@ following types of relations: `ROOT`, `PLACE`, `QUALITY`, `ATTRIBUTE`, `TIME`
and `LOCATION`. and `LOCATION`.
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/training/train_intent_parser.py https://github.com/explosion/spacy/tree/v2.x/examples/training/train_intent_parser.py
``` ```
### Training spaCy's text classifier {#textcat new="2"} ### Training spaCy's text classifier {#textcat new="2"}
@ -174,7 +174,7 @@ automatically via Thinc's built-in dataset loader. Predictions are available via
[`Doc.cats`](/api/doc#attributes). [`Doc.cats`](/api/doc#attributes).
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/training/train_textcat.py https://github.com/explosion/spacy/tree/v2.x/examples/training/train_textcat.py
``` ```
## Vectors {#vectors hidden="true"} ## Vectors {#vectors hidden="true"}
@ -186,7 +186,7 @@ This script lets you load any spaCy model containing word vectors into
[embedding visualization](https://github.com/tensorflow/tensorboard/blob/master/docs/tensorboard_projector_plugin.ipynb). [embedding visualization](https://github.com/tensorflow/tensorboard/blob/master/docs/tensorboard_projector_plugin.ipynb).
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/vectors_tensorboard.py https://github.com/explosion/spacy/tree/v2.x/examples/vectors_tensorboard.py
``` ```
## Deep Learning {#deep-learning hidden="true"} ## Deep Learning {#deep-learning hidden="true"}
@ -203,5 +203,5 @@ documents so that they're a fixed size. This hurts review accuracy a lot,
because people often summarize their rating in the final sentence. because people often summarize their rating in the final sentence.
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/deep_learning_keras.py https://github.com/explosion/spacy/tree/v2.x/examples/deep_learning_keras.py
``` ```

View File

@ -176,7 +176,7 @@ pip install -r requirements.txt
``` ```
Compared to regular install via pip, the Compared to regular install via pip, the
[`requirements.txt`](https://github.com/explosion/spaCy/tree/master/requirements.txt) [`requirements.txt`](https://github.com/explosion/spacy/tree/v2.x/requirements.txt)
additionally installs developer dependencies such as Cython. See the additionally installs developer dependencies such as Cython. See the
[quickstart widget](#quickstart) to get the right commands for your platform and [quickstart widget](#quickstart) to get the right commands for your platform and
Python version. Python version.
@ -243,14 +243,14 @@ source code and recompiling frequently.
### Run tests {#run-tests} ### Run tests {#run-tests}
spaCy comes with an spaCy comes with an
[extensive test suite](https://github.com/explosion/spaCy/tree/master/spacy/tests). [extensive test suite](https://github.com/explosion/spacy/tree/v2.x/spacy/tests).
In order to run the tests, you'll usually want to clone the In order to run the tests, you'll usually want to clone the
[repository](https://github.com/explosion/spaCy/tree/master/) and [repository](https://github.com/explosion/spacy/tree/v2.x/) and
[build spaCy from source](#source). This will also install the required [build spaCy from source](#source). This will also install the required
development dependencies and test utilities defined in the `requirements.txt`. development dependencies and test utilities defined in the `requirements.txt`.
Alternatively, you can run `pytest` on the tests packaged with the installed Alternatively, you can run `pytest` on the tests packaged with the installed
`spacy` package. Don't forget to also install the test utilities via spaCy's [`requirements.txt`](https://github.com/explosion/spaCy/tree/master/requirements.txt): `spacy` package. Don't forget to also install the test utilities via spaCy's [`requirements.txt`](https://github.com/explosion/spacy/tree/v2.x/requirements.txt):
```bash ```bash
pip install -r requirements.txt pip install -r requirements.txt

View File

@ -540,7 +540,7 @@ gold = GoldParse(doc, entities=["U-ANIMAL", "O", "O", "O"])
For more details on **training and updating** the named entity recognizer, see For more details on **training and updating** the named entity recognizer, see
the usage guides on [training](/usage/training) or check out the runnable the usage guides on [training](/usage/training) or check out the runnable
[training script](https://github.com/explosion/spaCy/tree/master/examples/training/train_ner.py) [training script](https://github.com/explosion/spacy/tree/v2.x/examples/training/train_ner.py)
on GitHub. on GitHub.
</Infobox> </Infobox>
@ -646,7 +646,7 @@ import Tokenization101 from 'usage/101/\_tokenization.md'
**Global** and **language-specific** tokenizer data is supplied via the language **Global** and **language-specific** tokenizer data is supplied via the language
data in data in
[`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang). The [`spacy/lang`](https://github.com/explosion/spacy/tree/v2.x/spacy/lang). The
tokenizer exceptions define special cases like "don't" in English, which needs tokenizer exceptions define special cases like "don't" in English, which needs
to be split into two tokens: `{ORTH: "do"}` and `{ORTH: "n't", NORM: "not"}`. to be split into two tokens: `{ORTH: "do"}` and `{ORTH: "n't", NORM: "not"}`.
The prefixes, suffixes and infixes mostly define punctuation rules for The prefixes, suffixes and infixes mostly define punctuation rules for
@ -666,7 +666,7 @@ For more details on the language-specific data, see the usage guide on
Tokenization rules that are specific to one language, but can be **generalized Tokenization rules that are specific to one language, but can be **generalized
across that language** should ideally live in the language data in across that language** should ideally live in the language data in
[`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang) – we [`spacy/lang`](https://github.com/explosion/spacy/tree/v2.x/spacy/lang) – we
always appreciate pull requests! Anything that's specific to a domain or text always appreciate pull requests! Anything that's specific to a domain or text
type – like financial trading abbreviations, or Bavarian youth slang – should be type – like financial trading abbreviations, or Bavarian youth slang – should be
added as a special case rule to your tokenizer instance. If you're dealing with added as a special case rule to your tokenizer instance. If you're dealing with
@ -843,7 +843,7 @@ domain. There are six things you may need to define:
be split, overriding the infix rules. Useful for things like numbers. be split, overriding the infix rules. Useful for things like numbers.
6. An optional boolean function `url_match`, which is similar to `token_match` 6. An optional boolean function `url_match`, which is similar to `token_match`
except that prefixes and suffixes are removed before applying the match. except that prefixes and suffixes are removed before applying the match.
<Infobox title="Important note: token match in spaCy v2.2" variant="warning"> <Infobox title="Important note: token match in spaCy v2.2" variant="warning">
In spaCy v2.2.2-v2.2.4, the `token_match` was equivalent to the `url_match` In spaCy v2.2.2-v2.2.4, the `token_match` was equivalent to the `url_match`

View File

@ -78,7 +78,7 @@ As of v2.0, spaCy supports models trained on more than one language. This is
especially useful for named entity recognition. The language ID used for especially useful for named entity recognition. The language ID used for
multi-language or language-neutral models is `xx`. The language class, a generic multi-language or language-neutral models is `xx`. The language class, a generic
subclass containing only the base language data, can be found in subclass containing only the base language data, can be found in
[`lang/xx`](https://github.com/explosion/spaCy/tree/master/spacy/lang/xx). [`lang/xx`](https://github.com/explosion/spacy/tree/v2.x/spacy/lang/xx).
To load your model with the neutral, multi-language class, simply set To load your model with the neutral, multi-language class, simply set
`"language": "xx"` in your [model package](/usage/training#models-generating)'s `"language": "xx"` in your [model package](/usage/training#models-generating)'s

View File

@ -489,7 +489,7 @@ When you call `nlp` on a text, the custom pipeline component is applied to the
`Doc`. `Doc`.
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/pipeline/custom_component_entities.py https://github.com/explosion/spacy/tree/v2.x/examples/pipeline/custom_component_entities.py
``` ```
Wrapping this functionality in a pipeline component allows you to reuse the Wrapping this functionality in a pipeline component allows you to reuse the
@ -650,7 +650,7 @@ attributes on the `Doc`, `Span` and `Token` – for example, the capital,
latitude/longitude coordinates and even the country flag. latitude/longitude coordinates and even the country flag.
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/pipeline/custom_component_countries_api.py https://github.com/explosion/spacy/tree/v2.x/examples/pipeline/custom_component_countries_api.py
``` ```
In this case, all data can be fetched on initialization in one request. However, In this case, all data can be fetched on initialization in one request. However,

View File

@ -193,7 +193,7 @@ computed properties can't be accessed.
The uppercase attribute names like `LOWER` or `IS_PUNCT` refer to symbols from The uppercase attribute names like `LOWER` or `IS_PUNCT` refer to symbols from
the the
[`spacy.attrs`](https://github.com/explosion/spaCy/tree/master/spacy/attrs.pyx) [`spacy.attrs`](https://github.com/explosion/spacy/tree/v2.x/spacy/attrs.pyx)
enum table. They're passed into a function that essentially is a big case/switch enum table. They're passed into a function that essentially is a big case/switch
statement, to figure out which struct field to return. The same attribute statement, to figure out which struct field to return. The same attribute
identifiers are used in [`Doc.to_array`](/api/doc#to_array), and a few other identifiers are used in [`Doc.to_array`](/api/doc#to_array), and a few other

View File

@ -194,7 +194,7 @@ add to that data and saves and loads the data to and from a JSON file.
> >
> To see custom serialization methods in action, check out the new > To see custom serialization methods in action, check out the new
> [`EntityRuler`](/api/entityruler) component and its > [`EntityRuler`](/api/entityruler) component and its
> [source](https://github.com/explosion/spaCy/tree/master/spacy/pipeline/entityruler.py). > [source](https://github.com/explosion/spacy/tree/v2.x/spacy/pipeline/entityruler.py).
> Patterns added to the component will be saved to a `.jsonl` file if the > Patterns added to the component will be saved to a `.jsonl` file if the
> pipeline is serialized to disk, and to a bytestring if the pipeline is > pipeline is serialized to disk, and to a bytestring if the pipeline is
> serialized to bytes. This allows saving out a model with a rule-based entity > serialized to bytes. This allows saving out a model with a rule-based entity

View File

@ -915,9 +915,9 @@ via the following platforms:
questions** and everything related to problems with your specific code. The questions** and everything related to problems with your specific code. The
Stack Overflow community is much larger than ours, so if your problem can be Stack Overflow community is much larger than ours, so if your problem can be
solved by others, you'll receive help much quicker. solved by others, you'll receive help much quicker.
- [GitHub discussions](https://github.com/explosion/spaCy/discussions): **General - [GitHub discussions](https://github.com/explosion/spaCy/discussions): **General
discussion**, **project ideas** and **usage questions**. Meet other community discussion**, **project ideas** and **usage questions**. Meet other community
members to get help with a specific code implementation, discuss ideas for new members to get help with a specific code implementation, discuss ideas for new
projects/plugins, support more languages, and share best practices. projects/plugins, support more languages, and share best practices.
- [GitHub issue tracker](https://github.com/explosion/spaCy/issues): **Bug - [GitHub issue tracker](https://github.com/explosion/spaCy/issues): **Bug
reports** and **improvement suggestions**, i.e. everything that's likely reports** and **improvement suggestions**, i.e. everything that's likely
@ -959,7 +959,7 @@ regressions to the parts of the library that you care about the most.
**For more details on the types of contributions we're looking for, the code **For more details on the types of contributions we're looking for, the code
conventions and other useful tips, make sure to check out the conventions and other useful tips, make sure to check out the
[contributing guidelines](https://github.com/explosion/spaCy/tree/master/CONTRIBUTING.md).** [contributing guidelines](https://github.com/explosion/spacy/tree/v2.x/CONTRIBUTING.md).**
<Infobox title="Code of Conduct" variant="warning"> <Infobox title="Code of Conduct" variant="warning">

View File

@ -352,7 +352,7 @@ a blank `Language` class. To do this, you'll need **example texts** and the
**character offsets** and **labels** of each entity contained in the texts. **character offsets** and **labels** of each entity contained in the texts.
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/training/train_ner.py https://github.com/explosion/spacy/tree/v2.x/examples/training/train_ner.py
``` ```
#### Step by step guide {#step-by-step-ner} #### Step by step guide {#step-by-step-ner}
@ -384,7 +384,7 @@ entity recognizer over unlabelled sentences, and adding their annotations to the
training set. training set.
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/training/train_new_entity_type.py https://github.com/explosion/spacy/tree/v2.x/examples/training/train_new_entity_type.py
``` ```
<Infobox title="Important note" variant="warning"> <Infobox title="Important note" variant="warning">
@ -426,7 +426,7 @@ the respective **heads** and **dependency label** for each token of the example
texts. texts.
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/training/train_parser.py https://github.com/explosion/spacy/tree/v2.x/examples/training/train_parser.py
``` ```
#### Step by step guide {#step-by-step-parser} #### Step by step guide {#step-by-step-parser}
@ -460,7 +460,7 @@ those tags to the
[Universal Dependencies scheme](http://universaldependencies.github.io/docs/u/pos/index.html). [Universal Dependencies scheme](http://universaldependencies.github.io/docs/u/pos/index.html).
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/training/train_tagger.py https://github.com/explosion/spacy/tree/v2.x/examples/training/train_tagger.py
``` ```
#### Step by step guide {#step-by-step-tagger} #### Step by step guide {#step-by-step-tagger}
@ -528,7 +528,7 @@ message semantics will have the following types of relations: `ROOT`, `PLACE`,
`QUALITY`, `ATTRIBUTE`, `TIME` and `LOCATION`. `QUALITY`, `ATTRIBUTE`, `TIME` and `LOCATION`.
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/training/train_intent_parser.py https://github.com/explosion/spacy/tree/v2.x/examples/training/train_intent_parser.py
``` ```
#### Step by step guide {#step-by-step-parser-custom} #### Step by step guide {#step-by-step-parser-custom}
@ -567,7 +567,7 @@ automatically via Thinc's built-in dataset loader. Predictions are available via
[`Doc.cats`](/api/doc#attributes). [`Doc.cats`](/api/doc#attributes).
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/training/train_textcat.py https://github.com/explosion/spacy/tree/v2.x/examples/training/train_textcat.py
``` ```
#### Step by step guide {#step-by-step-textcat} #### Step by step guide {#step-by-step-textcat}
@ -614,7 +614,7 @@ pretrained word vectors to obtain an encoding of an entity's description as its
vector. vector.
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/training/create_kb.py https://github.com/explosion/spacy/tree/v2.x/examples/training/create_kb.py
``` ```
#### Step by step guide {#step-by-step-kb} #### Step by step guide {#step-by-step-kb}
@ -639,7 +639,7 @@ offsets** and **knowledge base identifiers** of each entity contained in the
texts. texts.
```python ```python
https://github.com/explosion/spaCy/tree/master/examples/training/train_entity_linker.py https://github.com/explosion/spacy/tree/v2.x/examples/training/train_entity_linker.py
``` ```
#### Step by step guide {#step-by-step-entity-linker} #### Step by step guide {#step-by-step-entity-linker}

View File

@ -180,7 +180,7 @@ entirely **in Markdown**, without having to compromise on easy-to-use custom UI
components. We're hoping that the Markdown source will make it even easier to components. We're hoping that the Markdown source will make it even easier to
contribute to the documentation. For more details, check out the contribute to the documentation. For more details, check out the
[styleguide](/styleguide) and [styleguide](/styleguide) and
[source](https://github.com/explosion/spaCy/tree/master/website). While [source](https://github.com/explosion/spacy/tree/v2.x/website). While
converting the pages to Markdown, we've also fixed a bunch of typos, improved converting the pages to Markdown, we've also fixed a bunch of typos, improved
the existing pages and added some new content: the existing pages and added some new content:

View File

@ -161,8 +161,8 @@ debugging your tokenizer configuration.
spaCy's custom warnings have been replaced with native Python spaCy's custom warnings have been replaced with native Python
[`warnings`](https://docs.python.org/3/library/warnings.html). Instead of [`warnings`](https://docs.python.org/3/library/warnings.html). Instead of
setting `SPACY_WARNING_IGNORE`, use the [`warnings` setting `SPACY_WARNING_IGNORE`, use the
filters](https://docs.python.org/3/library/warnings.html#the-warnings-filter) [`warnings` filters](https://docs.python.org/3/library/warnings.html#the-warnings-filter)
to manage warnings. to manage warnings.
```diff ```diff
@ -176,7 +176,7 @@ import spacy
#### Normalization tables #### Normalization tables
The normalization tables have moved from the language data in The normalization tables have moved from the language data in
[`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang) to the [`spacy/lang`](https://github.com/explosion/spacy/tree/v2.x/spacy/lang) to the
package [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). package [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data).
If you're adding data for a new language, the normalization table should be If you're adding data for a new language, the normalization table should be
added to `spacy-lookups-data`. See added to `spacy-lookups-data`. See
@ -190,8 +190,8 @@ lexemes will be added to the vocab automatically, just as in small models
without vectors. without vectors.
To see the number of unique vectors and number of words with vectors, see To see the number of unique vectors and number of words with vectors, see
`nlp.meta['vectors']`, for example for `en_core_web_md` there are `20000` `nlp.meta['vectors']`, for example for `en_core_web_md` there are `20000` unique
unique vectors and `684830` words with vectors: vectors and `684830` words with vectors:
```python ```python
{ {
@ -210,8 +210,8 @@ for orth in nlp.vocab.vectors:
_ = nlp.vocab[orth] _ = nlp.vocab[orth]
``` ```
If your workflow previously iterated over `nlp.vocab`, a similar alternative If your workflow previously iterated over `nlp.vocab`, a similar alternative is
is to iterate over words with vectors instead: to iterate over words with vectors instead:
```diff ```diff
- lexemes = [w for w in nlp.vocab] - lexemes = [w for w in nlp.vocab]
@ -220,9 +220,9 @@ is to iterate over words with vectors instead:
Be aware that the set of preloaded lexemes in a v2.2 model is not equivalent to Be aware that the set of preloaded lexemes in a v2.2 model is not equivalent to
the set of words with vectors. For English, v2.2 `md/lg` models have 1.3M the set of words with vectors. For English, v2.2 `md/lg` models have 1.3M
provided lexemes but only 685K words with vectors. The vectors have been provided lexemes but only 685K words with vectors. The vectors have been updated
updated for most languages in v2.2, but the English models contain the same for most languages in v2.2, but the English models contain the same vectors for
vectors for both v2.2 and v2.3. both v2.2 and v2.3.
#### Lexeme.is_oov and Token.is_oov #### Lexeme.is_oov and Token.is_oov
@ -234,8 +234,7 @@ fixed in the next patch release v2.3.1.
</Infobox> </Infobox>
In v2.3, `Lexeme.is_oov` and `Token.is_oov` are `True` if the lexeme does not In v2.3, `Lexeme.is_oov` and `Token.is_oov` are `True` if the lexeme does not
have a word vector. This is equivalent to `token.orth not in have a word vector. This is equivalent to `token.orth not in nlp.vocab.vectors`.
nlp.vocab.vectors`.
Previously in v2.2, `is_oov` corresponded to whether a lexeme had stored Previously in v2.2, `is_oov` corresponded to whether a lexeme had stored
probability and cluster features. The probability and cluster features are no probability and cluster features. The probability and cluster features are no
@ -270,8 +269,8 @@ as part of the model vocab.
To load the probability table into a provided model, first make sure you have To load the probability table into a provided model, first make sure you have
`spacy-lookups-data` installed. To load the table, remove the empty provided `spacy-lookups-data` installed. To load the table, remove the empty provided
`lexeme_prob` table and then access `Lexeme.prob` for any word to load the `lexeme_prob` table and then access `Lexeme.prob` for any word to load the table
table from `spacy-lookups-data`: from `spacy-lookups-data`:
```diff ```diff
+ # prerequisite: pip install spacy-lookups-data + # prerequisite: pip install spacy-lookups-data
@ -321,9 +320,9 @@ the [train CLI](/api/cli#train), you can use the new `--tag-map-path` option to
provide in the tag map as a JSON dict. provide in the tag map as a JSON dict.
If you want to export a tag map from a provided model for use with the train If you want to export a tag map from a provided model for use with the train
CLI, you can save it as a JSON dict. To only use string keys as required by CLI, you can save it as a JSON dict. To only use string keys as required by JSON
JSON and to make it easier to read and edit, any internal integer IDs need to and to make it easier to read and edit, any internal integer IDs need to be
be converted back to strings: converted back to strings:
```python ```python
import spacy import spacy

View File

@ -306,7 +306,7 @@ lookup-based lemmatization and **many new languages**!
<Infobox> <Infobox>
**API:** [`Language`](/api/language) **Code:** **API:** [`Language`](/api/language) **Code:**
[`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang) [`spacy/lang`](https://github.com/explosion/spacy/tree/v2.x/spacy/lang)
**Usage:** [Adding languages](/usage/adding-languages) **Usage:** [Adding languages](/usage/adding-languages)
</Infobox> </Infobox>

View File

@ -4,7 +4,8 @@
"slogan": "Industrial-strength Natural Language Processing in Python", "slogan": "Industrial-strength Natural Language Processing in Python",
"siteUrl": "https://v2.spacy.io", "siteUrl": "https://v2.spacy.io",
"domain": "v2.spacy.io", "domain": "v2.spacy.io",
"legacy": true, "legacy": false,
"codeBranch": "v2.x",
"email": "contact@explosion.ai", "email": "contact@explosion.ai",
"company": "Explosion AI", "company": "Explosion AI",
"companyUrl": "https://explosion.ai", "companyUrl": "https://explosion.ai",

View File

@ -6,6 +6,7 @@ import siteMetadata from '../../meta/site.json'
const htmlToReactParser = new HtmlToReactParser() const htmlToReactParser = new HtmlToReactParser()
export const defaultBranch = siteMetadata.codeBranch
export const repo = siteMetadata.repo export const repo = siteMetadata.repo
export const modelsRepo = siteMetadata.modelsRepo export const modelsRepo = siteMetadata.modelsRepo
@ -18,11 +19,11 @@ export const headingTextClassName = 'heading-text'
/** /**
* Create a link to the spaCy repository on GitHub * Create a link to the spaCy repository on GitHub
* @param {string} filepath - The file path relative to the root of the repo. * @param {string} filepath - The file path relative to the root of the repo.
* @param {string} [branch] - Optional branch. Defaults to master. * @param {string} [branch] - Optional branch.
* @returns {string} - URL to the file on GitHub. * @returns {string} - URL to the file on GitHub.
*/ */
export function github(filepath, branch = 'master') { export function github(filepath, branch = defaultBranch) {
const path = filepath ? '/tree/' + (branch || 'master') + '/' + filepath : '' const path = filepath ? '/tree/' + (branch || defaultBranch) + '/' + filepath : ''
return `https://github.com/${repo}${path}` return `https://github.com/${repo}${path}`
} }
@ -30,9 +31,9 @@ export function github(filepath, branch = 'master') {
* Get the source of a file in the documentation based on its slug * Get the source of a file in the documentation based on its slug
* @param {string} slug - The slug, e.g. /api/doc. * @param {string} slug - The slug, e.g. /api/doc.
* @param {boolean} [isIndex] - Whether the page is an index, e.g. /api/index.md * @param {boolean} [isIndex] - Whether the page is an index, e.g. /api/index.md
* @param {string} [branch] - Optional branch on GitHub. Defaults to master. * @param {string} [branch] - Optional branch on GitHub.
*/ */
export function getCurrentSource(slug, isIndex = false, branch = 'master') { export function getCurrentSource(slug, isIndex = false, branch = defaultBranch) {
const ext = isIndex ? '/index.md' : '.md' const ext = isIndex ? '/index.md' : '.md'
return github(`website/docs${slug}${ext}`, branch) return github(`website/docs${slug}${ext}`, branch)
} }