Remove u-strings and fix formatting [ci skip]

Ines Montani 2019-09-12 16:11:15 +02:00
parent 7e3ac2cd41
commit 82c16b7943
44 changed files with 644 additions and 658 deletions


@@ -309,7 +309,7 @@ indented block as plain text and preserve whitespace.
### Using spaCy
import spacy
nlp = spacy.load("en_core_web_sm")
- doc = nlp(u"This is a sentence.")
+ doc = nlp("This is a sentence.")
for token in doc:
    print(token.text, token.pos_)
```
@@ -335,9 +335,9 @@ from spacy.matcher import Matcher
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
- pattern = [{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}]
- matcher.add('HelloWorld', None, pattern)
- doc = nlp(u'Hello, world! Hello world!')
+ pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
+ matcher.add("HelloWorld", None, pattern)
+ doc = nlp("Hello, world! Hello world!")
matches = matcher(doc)
```
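For context on the `Matcher` example above, here is a minimal sketch of how the returned `matches` are typically unpacked in spaCy v2; this is not part of the diff, and it assumes the `nlp`, `doc` and `matches` objects defined in the snippet.

```python
# Each match is a (match_id, start, end) triple over token indices.
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # resolves the hash back to "HelloWorld"
    span = doc[start:end]                    # the matched tokens as a Span
    print(string_id, start, end, span.text)
```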
@@ -360,7 +360,7 @@ interactive widget defaults to a regular code block.
### {executable="true"}
import spacy
nlp = spacy.load("en_core_web_sm")
- doc = nlp(u"This is a sentence.")
+ doc = nlp("This is a sentence.")
for token in doc:
    print(token.text, token.pos_)
```
@@ -457,7 +457,8 @@ sit amet dignissim justo congue.
## Setup and installation {#setup}
Before running the setup, make sure your versions of
[Node](https://nodejs.org/en/) and [npm](https://www.npmjs.com/) are up to date.
Node v10.15 or later is required.
```bash
# Clone the repository


@@ -16,7 +16,7 @@ menu:
> ```python
> from spacy.lang.en import English
> nlp = English()
- > tokens = nlp(u"Some\\nspaces and\\ttab characters")
+ > tokens = nlp("Some\\nspaces and\\ttab characters")
> tokens_text = [t.text for t in tokens]
> assert tokens_text == ["Some", "\\n", "spaces", " ", "and", "\\t", "tab", "characters"]
> ```
@@ -186,63 +186,63 @@ The German part-of-speech tagger uses the
annotation scheme. We also map the tags to the simpler Google Universal POS tag
set.
| Tag | POS | Morphology | Description |
| --------- | ------- | ---------------------------------------- | ------------------------------------------------- |
| `$(` | `PUNCT` | `PunctType=brck` | other sentence-internal punctuation mark |
| `$,` | `PUNCT` | `PunctType=comm` | comma |
| `$.` | `PUNCT` | `PunctType=peri` | sentence-final punctuation mark |
| `ADJA` | `ADJ` | | adjective, attributive |
| `ADJD` | `ADJ` | `Variant=short` | adjective, adverbial or predicative |
| `ADV` | `ADV` | | adverb |
| `APPO` | `ADP` | `AdpType=post` | postposition |
| `APPR` | `ADP` | `AdpType=prep` | preposition; circumposition left |
| `APPRART` | `ADP` | `AdpType=prep PronType=art` | preposition with article |
| `APZR` | `ADP` | `AdpType=circ` | circumposition right |
| `ART` | `DET` | `PronType=art` | definite or indefinite article |
| `CARD` | `NUM` | `NumType=card` | cardinal number |
| `FM` | `X` | `Foreign=yes` | foreign language material |
| `ITJ` | `INTJ` | | interjection |
| `KOKOM` | `CONJ` | `ConjType=comp` | comparative conjunction |
| `KON` | `CONJ` | | coordinate conjunction |
| `KOUI` | `SCONJ` | | subordinate conjunction with "zu" and infinitive |
| `KOUS` | `SCONJ` | | subordinate conjunction with sentence |
| `NE` | `PROPN` | | proper noun |
| `NNE` | `PROPN` | | proper noun |
| `NN` | `NOUN` | | noun, singular or mass |
| `PROAV` | `ADV` | `PronType=dem` | pronominal adverb |
| `PDAT` | `DET` | `PronType=dem` | attributive demonstrative pronoun |
| `PDS` | `PRON` | `PronType=dem` | substituting demonstrative pronoun |
| `PIAT` | `DET` | `PronType=ind\|neg\|tot` | attributive indefinite pronoun without determiner |
| `PIS` | `PRON` | `PronType=ind\|neg\|tot` | substituting indefinite pronoun |
| `PPER` | `PRON` | `PronType=prs` | non-reflexive personal pronoun |
| `PPOSAT` | `DET` | `Poss=yes PronType=prs` | attributive possessive pronoun |
| `PPOSS` | `PRON` | `PronType=rel` | substituting possessive pronoun |
| `PRELAT` | `DET` | `PronType=rel` | attributive relative pronoun |
| `PRELS` | `PRON` | `PronType=rel` | substituting relative pronoun |
| `PRF` | `PRON` | `PronType=prs Reflex=yes` | reflexive personal pronoun |
| `PTKA` | `PART` | | particle with adjective or adverb |
| `PTKANT` | `PART` | `PartType=res` | answer particle |
| `PTKNEG` | `PART` | `Negative=yes` | negative particle |
| `PTKVZ` | `PART` | `PartType=vbp` | separable verbal particle |
| `PTKZU` | `PART` | `PartType=inf` | "zu" before infinitive |
| `PWAT` | `DET` | `PronType=int` | attributive interrogative pronoun |
| `PWAV` | `ADV` | `PronType=int` | adverbial interrogative or relative pronoun |
| `PWS` | `PRON` | `PronType=int` | substituting interrogative pronoun |
| `TRUNC` | `X` | `Hyph=yes` | word remnant |
| `VAFIN` | `AUX` | `Mood=ind VerbForm=fin` | finite verb, auxiliary |
| `VAIMP` | `AUX` | `Mood=imp VerbForm=fin` | imperative, auxiliary |
| `VAINF` | `AUX` | `VerbForm=inf` | infinitive, auxiliary |
| `VAPP` | `AUX` | `Aspect=perf VerbForm=fin` | perfect participle, auxiliary |
| `VMFIN` | `VERB` | `Mood=ind VerbForm=fin VerbType=mod` | finite verb, modal |
| `VMINF` | `VERB` | `VerbForm=fin VerbType=mod` | infinitive, modal |
| `VMPP` | `VERB` | `Aspect=perf VerbForm=part VerbType=mod` | perfect participle, modal |
| `VVFIN` | `VERB` | `Mood=ind VerbForm=fin` | finite verb, full |
| `VVIMP` | `VERB` | `Mood=imp VerbForm=fin` | imperative, full |
| `VVINF` | `VERB` | `VerbForm=inf` | infinitive, full |
| `VVIZU` | `VERB` | `VerbForm=inf` | infinitive with "zu", full |
| `VVPP` | `VERB` | `Aspect=perf VerbForm=part` | perfect participle, full |
| `XY` | `X` | | non-word containing non-letter |
| `SP` | `SPACE` | | space |
</Accordion>
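To make the TIGER-to-Universal mapping described above concrete, here is a hedged sketch of reading both tag sets off a parsed `Doc` (not part of the diff; assumes spaCy v2 with the German model `de_core_news_sm` installed):

```python
import spacy

nlp = spacy.load("de_core_news_sm")
doc = nlp("Die Katze schläft auf dem Sofa.")
for token in doc:
    # token.tag_ is the fine-grained TIGER tag, token.pos_ the mapped Universal POS
    print(token.text, token.tag_, token.pos_)
```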
@@ -379,51 +379,51 @@ The German dependency labels use the
[TIGER Treebank](http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/index.html)
annotation scheme.
| Label | Description |
| ------- | ------------------------------- |
| `ac` | adpositional case marker |
| `adc` | adjective component |
| `ag` | genitive attribute |
| `ams` | measure argument of adjective |
| `app` | apposition |
| `avc` | adverbial phrase component |
| `cc` | comparative complement |
| `cd` | coordinating conjunction |
| `cj` | conjunct |
| `cm` | comparative conjunction |
| `cp` | complementizer |
| `cvc` | collocational verb construction |
| `da` | dative |
| `dm` | discourse marker |
| `ep` | expletive es |
| `ju` | junctor |
| `mnr` | postnominal modifier |
| `mo` | modifier |
| `ng` | negation |
| `nk` | noun kernel element |
| `nmc` | numerical component |
| `oa` | accusative object |
| `oa2` | second accusative object |
| `oc` | clausal object |
| `og` | genitive object |
| `op` | prepositional object |
| `par` | parenthetical element |
| `pd` | predicate |
| `pg` | phrasal genitive |
| `ph` | placeholder |
| `pm` | morphological particle |
| `pnc` | proper noun component |
| `punct` | punctuation |
| `rc` | relative clause |
| `re` | repeated element |
| `rs` | reported speech |
| `sb` | subject |
| `sbp` | passivized subject (PP) |
| `sp` | subject or predicate |
| `svp` | separable verb prefix |
| `uc` | unit component |
| `vo` | vocative |
| `ROOT` | root |
</Accordion>
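Similarly, the TIGER dependency labels listed above surface on `token.dep_`; a minimal sketch under the same `de_core_news_sm` assumption (not part of the diff):

```python
import spacy

nlp = spacy.load("de_core_news_sm")
doc = nlp("Der Hund jagt die Katze.")
for token in doc:
    # token.dep_ holds labels from the table above, e.g. "sb" (subject) or "oa" (accusative object)
    print(token.text, token.dep_, token.head.text)
```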


@@ -45,9 +45,9 @@ Append a token to the `Doc`. The token can be provided as a
> from spacy.vocab cimport Vocab
>
> doc = Doc(Vocab())
- > lexeme = doc.vocab.get(u'hello')
+ > lexeme = doc.vocab.get("hello")
> doc.push_back(lexeme, True)
- > assert doc.text == u'hello '
+ > assert doc.text == "hello "
> ```
| Name | Type | Description |
@@ -164,7 +164,7 @@ vocabulary.
> #### Example
>
> ```python
- > lexeme = vocab.get(vocab.mem, u'hello')
+ > lexeme = vocab.get(vocab.mem, "hello")
> ```
| Name | Type | Description |


@@ -88,7 +88,7 @@ Find a token in a `TokenC*` array by the offset of its first character.
> from spacy.tokens.doc cimport Doc, token_by_start
> from spacy.vocab cimport Vocab
>
- > doc = Doc(Vocab(), words=[u'hello', u'world'])
+ > doc = Doc(Vocab(), words=["hello", "world"])
> assert token_by_start(doc.c, doc.length, 6) == 1
> assert token_by_start(doc.c, doc.length, 4) == -1
> ```
@@ -110,7 +110,7 @@ Find a token in a `TokenC*` array by the offset of its final character.
> from spacy.tokens.doc cimport Doc, token_by_end
> from spacy.vocab cimport Vocab
>
- > doc = Doc(Vocab(), words=[u'hello', u'world'])
+ > doc = Doc(Vocab(), words=["hello", "world"])
> assert token_by_end(doc.c, doc.length, 5) == 0
> assert token_by_end(doc.c, doc.length, 1) == -1
> ```
@@ -134,7 +134,7 @@ attribute, in order to make the parse tree navigation consistent.
> from spacy.tokens.doc cimport Doc, set_children_from_heads
> from spacy.vocab cimport Vocab
>
- > doc = Doc(Vocab(), words=[u'Baileys', u'from', u'a', u'shoe'])
+ > doc = Doc(Vocab(), words=["Baileys", "from", "a", "shoe"])
> doc.c[0].head = 0
> doc.c[1].head = 0
> doc.c[2].head = 3


@@ -58,7 +58,7 @@ and all pipeline components are applied to the `Doc` in order. Both
>
> ```python
> parser = DependencyParser(nlp.vocab)
- > doc = nlp(u"This is a sentence.")
+ > doc = nlp("This is a sentence.")
> # This usually happens under the hood
> processed = parser(doc)
> ```


@@ -20,11 +20,11 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
>
> ```python
> # Construction 1
- > doc = nlp(u"Some text")
+ > doc = nlp("Some text")
>
> # Construction 2
> from spacy.tokens import Doc
- > words = [u"hello", u"world", u"!"]
+ > words = ["hello", "world", "!"]
> spaces = [True, False, False]
> doc = Doc(nlp.vocab, words=words, spaces=spaces)
> ```
@@ -45,7 +45,7 @@ Negative indexing is supported, and follows the usual Python semantics, i.e.
> #### Example
>
> ```python
- > doc = nlp(u"Give it back! He pleaded.")
+ > doc = nlp("Give it back! He pleaded.")
> assert doc[0].text == "Give"
> assert doc[-1].text == "."
> span = doc[1:3]
@@ -76,8 +76,8 @@ Iterate over `Token` objects, from which the annotations can be easily accessed.
> #### Example
>
> ```python
- > doc = nlp(u'Give it back')
- > assert [t.text for t in doc] == [u'Give', u'it', u'back']
+ > doc = nlp("Give it back")
+ > assert [t.text for t in doc] == ["Give", "it", "back"]
> ```
This is the main way of accessing [`Token`](/api/token) objects, which are the
@@ -96,7 +96,7 @@ Get the number of tokens in the document.
> #### Example
>
> ```python
- > doc = nlp(u"Give it back! He pleaded.")
+ > doc = nlp("Give it back! He pleaded.")
> assert len(doc) == 7
> ```
@@ -114,9 +114,9 @@ details, see the documentation on
>
> ```python
> from spacy.tokens import Doc
- > city_getter = lambda doc: any(city in doc.text for city in ('New York', 'Paris', 'Berlin'))
- > Doc.set_extension('has_city', getter=city_getter)
- > doc = nlp(u'I like New York')
+ > city_getter = lambda doc: any(city in doc.text for city in ("New York", "Paris", "Berlin"))
+ > Doc.set_extension("has_city", getter=city_getter)
+ > doc = nlp("I like New York")
> assert doc._.has_city
> ```
@@ -192,8 +192,8 @@ the character indices don't map to a valid span.
> #### Example
>
> ```python
- > doc = nlp(u"I like New York")
- > span = doc.char_span(7, 15, label=u"GPE")
+ > doc = nlp("I like New York")
+ > span = doc.char_span(7, 15, label="GPE")
> assert span.text == "New York"
> ```
@@ -213,8 +213,8 @@ using an average of word vectors.
> #### Example
>
> ```python
- > apples = nlp(u"I like apples")
- > oranges = nlp(u"I like oranges")
+ > apples = nlp("I like apples")
+ > oranges = nlp("I like oranges")
> apples_oranges = apples.similarity(oranges)
> oranges_apples = oranges.similarity(apples)
> assert apples_oranges == oranges_apples
@@ -235,7 +235,7 @@ attribute ID.
>
> ```python
> from spacy.attrs import ORTH
- > doc = nlp(u"apple apple orange banana")
+ > doc = nlp("apple apple orange banana")
> assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2}
> doc.to_array([ORTH])
> # array([[11880], [11880], [7561], [12800]])
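One detail worth spelling out for the `count_by` example above: the returned dictionary is keyed by hash values, which can be mapped back to strings via the vocab. A minimal sketch (not part of the diff; assumes an `nlp` object as in the surrounding examples):

```python
from spacy.attrs import ORTH

doc = nlp("apple apple orange banana")
counts = doc.count_by(ORTH)
for orth_hash, count in counts.items():
    # doc.vocab.strings resolves the hash back to the original token text
    print(doc.vocab.strings[orth_hash], count)
```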
@@ -255,7 +255,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
> #### Example
>
> ```python
- > doc = nlp(u"This is a test")
+ > doc = nlp("This is a test")
> matrix = doc.get_lca_matrix()
> # array([[0, 1, 1, 1], [1, 1, 1, 1], [1, 1, 2, 3], [1, 1, 3, 3]], dtype=int32)
> ```
@@ -274,7 +274,7 @@ They'll be added to an `"_"` key in the data, e.g. `"_": {"foo": "bar"}`.
> #### Example
>
> ```python
- > doc = nlp(u"Hello")
+ > doc = nlp("Hello")
> json_doc = doc.to_json()
> ```
>
@@ -342,7 +342,7 @@ array of attributes.
> ```python
> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
> from spacy.tokens import Doc
- > doc = nlp(u"Hello world!")
+ > doc = nlp("Hello world!")
> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
> doc2 = Doc(doc.vocab, words=[t.text for t in doc])
> doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)
@@ -396,7 +396,7 @@ Serialize, i.e. export the document contents to a binary string.
> #### Example
>
> ```python
- > doc = nlp(u"Give it back! He pleaded.")
+ > doc = nlp("Give it back! He pleaded.")
> doc_bytes = doc.to_bytes()
> ```
@@ -413,10 +413,9 @@ Deserialize, i.e. import the document contents from a binary string.
>
> ```python
> from spacy.tokens import Doc
- > text = u"Give it back! He pleaded."
- > doc = nlp(text)
- > bytes = doc.to_bytes()
- > doc2 = Doc(doc.vocab).from_bytes(bytes)
+ > doc = nlp("Give it back! He pleaded.")
+ > doc_bytes = doc.to_bytes()
+ > doc2 = Doc(doc.vocab).from_bytes(doc_bytes)
> assert doc.text == doc2.text
> ```
@@ -457,9 +456,9 @@ dictionary mapping attribute names to values as the `"_"` key.
> #### Example
>
> ```python
- > doc = nlp(u"I like David Bowie")
+ > doc = nlp("I like David Bowie")
> with doc.retokenize() as retokenizer:
- >     attrs = {"LEMMA": u"David Bowie"}
+ >     attrs = {"LEMMA": "David Bowie"}
>     retokenizer.merge(doc[2:4], attrs=attrs)
> ```
@@ -489,7 +488,7 @@ underlying lexeme (if they're context-independent lexical attributes like
> #### Example
>
> ```python
- > doc = nlp(u"I live in NewYork")
+ > doc = nlp("I live in NewYork")
> with doc.retokenize() as retokenizer:
>     heads = [(doc[3], 1), doc[2]]
>     attrs = {"POS": ["PROPN", "PROPN"],
@@ -521,9 +520,9 @@ and end token boundaries, the document remains unchanged.
> #### Example
>
> ```python
- > doc = nlp(u"Los Angeles start.")
+ > doc = nlp("Los Angeles start.")
> doc.merge(0, len("Los Angeles"), "NNP", "Los Angeles", "GPE")
- > assert [t.text for t in doc] == [u"Los Angeles", u"start", u"."]
+ > assert [t.text for t in doc] == ["Los Angeles", "start", "."]
> ```
| Name | Type | Description |
@@ -541,11 +540,11 @@ objects, if the entity recognizer has been applied.
> #### Example
>
> ```python
- > doc = nlp(u"Mr. Best flew to New York on Saturday morning.")
+ > doc = nlp("Mr. Best flew to New York on Saturday morning.")
> ents = list(doc.ents)
> assert ents[0].label == 346
- > assert ents[0].label_ == u"PERSON"
- > assert ents[0].text == u"Mr. Best"
+ > assert ents[0].label_ == "PERSON"
+ > assert ents[0].text == "Mr. Best"
> ```
| Name | Type | Description |
@@ -563,10 +562,10 @@ relative clauses.
> #### Example
>
> ```python
- > doc = nlp(u"A phrase with another phrase occurs.")
+ > doc = nlp("A phrase with another phrase occurs.")
> chunks = list(doc.noun_chunks)
- > assert chunks[0].text == u"A phrase"
- > assert chunks[1].text == u"another phrase"
+ > assert chunks[0].text == "A phrase"
+ > assert chunks[1].text == "another phrase"
> ```
| Name | Type | Description |
@@ -583,10 +582,10 @@ will be unavailable.
> #### Example
>
> ```python
- > doc = nlp(u"This is a sentence. Here's another...")
+ > doc = nlp("This is a sentence. Here's another...")
> sents = list(doc.sents)
> assert len(sents) == 2
- > assert [s.root.text for s in sents] == [u"is", u"'s"]
+ > assert [s.root.text for s in sents] == ["is", "'s"]
> ```
| Name | Type | Description |
@@ -600,7 +599,7 @@ A boolean value indicating whether a word vector is associated with the object.
> #### Example
>
> ```python
- > doc = nlp(u"I like apples")
+ > doc = nlp("I like apples")
> assert doc.has_vector
> ```
@@ -616,8 +615,8 @@ vectors.
> #### Example
>
> ```python
- > doc = nlp(u"I like apples")
- > assert doc.vector.dtype == 'float32'
+ > doc = nlp("I like apples")
+ > assert doc.vector.dtype == "float32"
> assert doc.vector.shape == (300,)
> ```
@@ -632,8 +631,8 @@ The L2 norm of the document's vector representation.
> #### Example
>
> ```python
- > doc1 = nlp(u"I like apples")
- > doc2 = nlp(u"I like oranges")
+ > doc1 = nlp("I like apples")
+ > doc2 = nlp("I like oranges")
> doc1.vector_norm # 4.54232424414368
> doc2.vector_norm # 3.304373298575751
> assert doc1.vector_norm != doc2.vector_norm


@@ -1,6 +1,8 @@
---
title: EntityLinker
teaser:
  Functionality to disambiguate a named entity in text to a unique knowledge
  base identifier.
tag: class
source: spacy/pipeline/pipes.pyx
new: 2.2
@@ -13,9 +15,9 @@ via the ID `"entity_linker"`.
## EntityLinker.Model {#model tag="classmethod"}
Initialize a model for the pipe. The model should implement the
`thinc.neural.Model` API, and should contain a field `tok2vec` that contains the
context encoder. Wrappers are under development for most major machine learning
libraries.
| Name | Type | Description |
| ----------- | ------ | ------------------------------------- |
@@ -40,30 +42,29 @@ shortcut for this and instantiate the component using its string name and
> entity_linker.from_disk("/path/to/model")
> ```
| Name | Type | Description |
| -------------- | ----------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
| `hidden_width` | int | Width of the hidden layer of the entity linking model, defaults to 128. |
| `incl_prior` | bool | Whether or not to include prior probabilities in the model. Defaults to True. |
| `incl_context` | bool | Whether or not to include the local context in the model (if not: only prior probabilites are used). Defaults to True. |
| **RETURNS** | `EntityLinker` | The newly constructed object. |
## EntityLinker.\_\_call\_\_ {#call tag="method"}
Apply the pipe to one document. The document is modified in place, and returned.
This usually happens under the hood when the `nlp` object is called on a text
and all pipeline components are applied to the `Doc` in order. Both
[`__call__`](/api/entitylinker#call) and [`pipe`](/api/entitylinker#pipe)
delegate to the [`predict`](/api/entitylinker#predict) and
[`set_annotations`](/api/entitylinker#set_annotations) methods.
> #### Example
>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
- > doc = nlp(u"This is a sentence.")
+ > doc = nlp("This is a sentence.")
> # This usually happens under the hood
> processed = entity_linker(doc)
> ```
@@ -107,14 +108,15 @@ Apply the pipeline's model to a batch of docs, without modifying them.
> kb_ids, tensors = entity_linker.predict([doc1, doc2])
> ```
| Name | Type | Description |
| ----------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | iterable | The documents to predict. |
| **RETURNS** | tuple | A `(kb_ids, tensors)` tuple where `kb_ids` are the model's predicted KB identifiers for the entities in the `docs`, and `tensors` are the token representations used to predict these identifiers. |
## EntityLinker.set_annotations {#set_annotations tag="method"}
Modify a batch of documents, using pre-computed entity IDs for a list of named
entities.
> #### Example
>
@@ -124,16 +126,17 @@ Modify a batch of documents, using pre-computed entity IDs for a list of named e
> entity_linker.set_annotations([doc1, doc2], kb_ids, tensors)
> ```
| Name | Type | Description |
| --------- | -------- | ------------------------------------------------------------------------------------------------- |
| `docs` | iterable | The documents to modify. |
| `kb_ids` | iterable | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. |
| `tensors` | iterable | The token representations used to predict the identifiers. |
## EntityLinker.update {#update tag="method"}
Learn from a batch of documents and gold-standard information, updating both the
pipe's entity linking model and context encoder. Delegates to
[`predict`](/api/entitylinker#predict) and
[`get_loss`](/api/entitylinker#get_loss).
> #### Example
@@ -145,18 +148,18 @@ pipe's entity linking model and context encoder. Delegates to [`predict`](/api/e
> entity_linker.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer)
> ```
| Name | Type | Description |
| -------- | -------- | --------------------------------------------------------------------------------------------------------- |
| `docs` | iterable | A batch of documents to learn from. |
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
| `drop` | float | The dropout rate, used both for the EL model and the context encoder. |
| `sgd` | callable | The optimizer for the EL model. Should take two arguments `weights` and `gradient`, and an optional ID. |
| `losses` | dict | Optional record of the loss during training. The value keyed by the model's name is updated. |
## EntityLinker.get_loss {#get_loss tag="method"}
Find the loss and gradient of loss for the entities in a batch of documents and
their predicted scores.
> #### Example
>
@@ -166,17 +169,18 @@ predicted scores.
> loss, d_loss = entity_linker.get_loss(docs, [gold1, gold2], kb_ids, tensors)
> ```
| Name | Type | Description |
| ----------- | -------- | ------------------------------------------------------------ |
| `docs` | iterable | The batch of documents. |
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
| `kb_ids` | iterable | KB identifiers representing the model's predictions. |
| `tensors` | iterable | The token representations used to predict the identifiers |
| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
## EntityLinker.set_kb {#set_kb tag="method"}
Define the knowledge base (KB) used for disambiguating named entities to KB
identifiers.
> #### Example
>
@@ -185,15 +189,16 @@ Define the knowledge base (KB) used for disambiguating named entities to KB iden
> entity_linker.set_kb(kb)
> ```
| Name | Type | Description |
| ---- | --------------- | ------------------------------- |
| `kb` | `KnowledgeBase` | The [`KnowledgeBase`](/api/kb). |
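For orientation, a hedged sketch of building a small knowledge base to pass to `set_kb`, based on the spaCy v2.2 `KnowledgeBase` API (`add_entity`/`add_alias`); this is not part of the diff, and the entity IDs, frequencies and vectors are illustrative only.

```python
from spacy.kb import KnowledgeBase

# Assumes an existing nlp pipeline; entity_vector_length must match your entity vectors.
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 0.0, 0.5])
kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.9])
entity_linker.set_kb(kb)
```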
## EntityLinker.begin_training {#begin_training tag="method"}
Initialize the pipe for training, using data examples if available. If no model
has been initialized yet, the model is added. Before calling this method, a
knowledge base should have been defined with
[`set_kb`](/api/entitylinker#set_kb).
> #### Example
>
@@ -204,12 +209,12 @@ Before calling this method, a knowledge base should have been defined with [`set
> optimizer = entity_linker.begin_training(pipeline=nlp.pipeline)
> ```
| Name | Type | Description |
| ------------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects. |
| `pipeline` | list | Optional list of pipeline components that this component is part of. |
| `sgd` | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`EntityLinker`](/api/entitylinker#create_optimizer) if not set. |
| **RETURNS** | callable | An optimizer. |
## EntityLinker.create_optimizer {#create_optimizer tag="method"}
@@ -242,7 +247,6 @@ Modify the pipe's EL model, to use the given parameter values.
| -------- | ---- | ---------------------------------------------------------------------------------------------------------- |
| `params` | dict | The parameter values to use in the model. At the end of the context, the original parameters are restored. |
## EntityLinker.to_disk {#to_disk tag="method"}
Serialize the pipe to disk.
@@ -270,11 +274,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
> entity_linker.from_disk("/path/to/entity_linker")
> ```
| Name | Type | Description |
| ----------- | ---------------- | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. |
## Serialization fields {#serialization-fields}
@@ -294,4 +298,3 @@ serialization by passing in the string names via the `exclude` argument.
| `cfg` | The config file. You usually don't want to exclude this. |
| `model` | The binary model data. You usually don't want to exclude this. |
| `kb` | The knowledge base. You usually don't want to exclude this. |


@@ -58,7 +58,7 @@ and all pipeline components are applied to the `Doc` in order. Both
>
> ```python
> ner = EntityRecognizer(nlp.vocab)
- > doc = nlp(u"This is a sentence.")
+ > doc = nlp("This is a sentence.")
> # This usually happens under the hood
> processed = ner(doc)
> ```
@@ -119,11 +119,11 @@ Modify a batch of documents, using pre-computed scores.
> ner.set_annotations([doc1, doc2], scores, tensors)
> ```
| Name | Type | Description |
| --------- | -------- | ---------------------------------------------------------- |
| `docs` | iterable | The documents to modify. |
| `scores` | - | The scores to set, produced by `EntityRecognizer.predict`. |
| `tensors` | iterable | The token representations used to predict the scores. |
## EntityRecognizer.update {#update tag="method"}


@@ -23,7 +23,7 @@ gradient for those labels will be zero.
| `deps` | iterable | A sequence of strings, representing the syntactic relation types. |
| `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings, you can specify missing values by setting the tag to None. |
| `cats` | dict | Labels for text classification. Each key in the dictionary may be a string or an int, or a `(start_char, end_char, label)` tuple, indicating that the label is applied to only part of the document (usually a sentence). |
| `links` | dict | Labels for entity linking. A dict with `(start_char, end_char)` keys, and the values being dicts with `kb_id:value` entries, representing external KB IDs mapped to either 1.0 (positive) or 0.0 (negative). |
| **RETURNS** | `GoldParse` | The newly constructed object. |
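To make the `entities` formats in the table above concrete, a minimal sketch of constructing a `GoldParse` (not part of the diff; assumes an `nlp` object as in the other examples). Per the table, character offsets and BILUO tags are interchangeable:

```python
from spacy.gold import GoldParse

doc = nlp("I like London")
# Entities as (start_char, end_char, label) tuples ...
gold = GoldParse(doc, entities=[(7, 13, "LOC")])
# ... or as per-token BILUO tag strings
gold_biluo = GoldParse(doc, entities=["O", "O", "U-LOC"])
```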
## GoldParse.\_\_len\_\_ {#len tag="method"}
@@ -69,7 +69,7 @@ Convert a list of Doc objects into the
> ```python
> from spacy.gold import docs_to_json
>
- > doc = nlp(u"I like London")
+ > doc = nlp("I like London")
> json_data = docs_to_json([doc])
> ```
@@ -150,7 +150,7 @@ single-token entity.
> ```python
> from spacy.gold import biluo_tags_from_offsets
>
- > doc = nlp(u"I like London.")
+ > doc = nlp("I like London.")
> entities = [(7, 13, "LOC")]
> tags = biluo_tags_from_offsets(doc, entities)
> assert tags == ["O", "O", "U-LOC", "O"]
@@ -172,7 +172,7 @@ entity offsets.
> ```python
> from spacy.gold import offsets_from_biluo_tags
>
- > doc = nlp(u"I like London.")
+ > doc = nlp("I like London.")
> tags = ["O", "O", "U-LOC", "O"]
> entities = offsets_from_biluo_tags(doc, tags)
> assert entities == [(7, 13, "LOC")]
@@ -195,7 +195,7 @@ token-based tags, e.g. to overwrite the `doc.ents`.
> ```python
> from spacy.gold import spans_from_biluo_tags
>
- > doc = nlp(u"I like London.")
+ > doc = nlp("I like London.")
> tags = ["O", "O", "U-LOC", "O"]
> doc.ents = spans_from_biluo_tags(doc, tags)
> ```


@@ -45,7 +45,7 @@ contain arbitrary whitespace. Alignment into the original string is preserved.
> #### Example
>
> ```python
- > doc = nlp(u"An example sentence. Another sentence.")
+ > doc = nlp("An example sentence. Another sentence.")
> assert (doc[0].text, doc[0].head.tag_) == ("An", "NN")
> ```
@@ -61,8 +61,8 @@ Pipeline components to prevent from being loaded can now be added as a list to
`disable`, instead of specifying one keyword argument per component.
```diff
- - doc = nlp(u"I don't want parsed", parse=False)
- + doc = nlp(u"I don't want parsed", disable=["parser"])
+ - doc = nlp("I don't want parsed", parse=False)
+ + doc = nlp("I don't want parsed", disable=["parser"])
```
</Infobox>
@@ -86,7 +86,7 @@ multiprocessing.
> #### Example
>
> ```python
- > texts = [u"One document.", u"...", u"Lots of documents"]
+ > texts = ["One document.", "...", "Lots of documents"]
> for doc in nlp.pipe(texts, batch_size=50):
>     assert doc.is_parsed
> ```
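Related to the `nlp.pipe` example above, a small sketch of the `as_tuples` option for carrying metadata through the stream (not part of the diff; spaCy v2 API, with the context dicts purely illustrative):

```python
data = [("One document.", {"id": 1}), ("Lots of documents", {"id": 2})]
for doc, context in nlp.pipe(data, as_tuples=True, batch_size=50):
    # Each text is processed as usual; the context object is passed through unchanged
    print(context["id"], doc.is_parsed)
```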


@@ -37,8 +37,8 @@ Lemmatize a string.
> from spacy.lemmatizer import Lemmatizer
> from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
> lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
- > lemmas = lemmatizer(u"ducks", u"NOUN")
- > assert lemmas == [u"duck"]
+ > lemmas = lemmatizer("ducks", "NOUN")
+ > assert lemmas == ["duck"]
> ```
| Name | Type | Description |
@@ -58,9 +58,9 @@ variable, set on the individual `Language` class.
> #### Example
>
> ```python
- > lookup = {u"going": u"go"}
+ > lookup = {"going": "go"}
> lemmatizer = Lemmatizer(lookup=lookup)
- > assert lemmatizer.lookup(u"going") == u"go"
+ > assert lemmatizer.lookup("going") == "go"
> ```
| Name | Type | Description |


@@ -27,7 +27,7 @@ Change the value of a boolean flag.
>
> ```python
> COOL_FLAG = nlp.vocab.add_flag(lambda text: False)
- > nlp.vocab[u'spaCy'].set_flag(COOL_FLAG, True)
+ > nlp.vocab["spaCy"].set_flag(COOL_FLAG, True)
> ```
| Name | Type | Description |
@@ -42,9 +42,9 @@ Check the value of a boolean flag.
> #### Example
>
> ```python
- > is_my_library = lambda text: text in [u"spaCy", u"Thinc"]
+ > is_my_library = lambda text: text in ["spaCy", "Thinc"]
> MY_LIBRARY = nlp.vocab.add_flag(is_my_library)
- > assert nlp.vocab[u"spaCy"].check_flag(MY_LIBRARY) == True
+ > assert nlp.vocab["spaCy"].check_flag(MY_LIBRARY) == True
> ```
| Name | Type | Description |
@@ -59,8 +59,8 @@ Compute a semantic similarity estimate. Defaults to cosine over vectors.
> #### Example
>
> ```python
- > apple = nlp.vocab[u"apple"]
- > orange = nlp.vocab[u"orange"]
+ > apple = nlp.vocab["apple"]
+ > orange = nlp.vocab["orange"]
> apple_orange = apple.similarity(orange)
> orange_apple = orange.similarity(apple)
> assert apple_orange == orange_apple
@@ -78,7 +78,7 @@ A boolean value indicating whether a word vector is associated with the lexeme.
> #### Example
>
> ```python
- > apple = nlp.vocab[u"apple"]
+ > apple = nlp.vocab["apple"]
> assert apple.has_vector
> ```
@@ -93,7 +93,7 @@ A real-valued meaning representation.
> #### Example
>
> ```python
- > apple = nlp.vocab[u"apple"]
+ > apple = nlp.vocab["apple"]
> assert apple.vector.dtype == "float32"
> assert apple.vector.shape == (300,)
> ```
@@ -109,8 +109,8 @@ The L2 norm of the lexeme's vector representation.
> #### Example
>
> ```python
- > apple = nlp.vocab[u"apple"]
- > pasta = nlp.vocab[u"pasta"]
+ > apple = nlp.vocab["apple"]
+ > pasta = nlp.vocab["pasta"]
> apple.vector_norm # 7.1346845626831055
> pasta.vector_norm # 7.759851932525635
> assert apple.vector_norm != pasta.vector_norm


@@ -50,7 +50,7 @@ Find all token sequences matching the supplied patterns on the `Doc`.
> matcher = Matcher(nlp.vocab)
> pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
> matcher.add("HelloWorld", None, pattern)
- > doc = nlp(u'hello world!')
+ > doc = nlp("hello world!")
> matches = matcher(doc)
> ```
@@ -147,7 +147,7 @@ overwritten.
> matcher = Matcher(nlp.vocab)
> matcher.add("HelloWorld", on_match, [{"LOWER": "hello"}, {"LOWER": "world"}])
> matcher.add("GoogleMaps", on_match, [{"ORTH": "Google"}, {"ORTH": "Maps"}])
- > doc = nlp(u"HELLO WORLD on Google Maps.")
+ > doc = nlp("HELLO WORLD on Google Maps.")
> matches = matcher(doc)
> ```
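Since the second example above passes an `on_match` callback, here is a minimal sketch of such a callback with the documented `(matcher, doc, i, matches)` signature (not part of the diff; the body is illustrative):

```python
def on_match(matcher, doc, i, matches):
    # Called once per match; matches[i] is the (match_id, start, end) triple that fired
    match_id, start, end = matches[i]
    print("Matched:", doc[start:end].text)
```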


@@ -59,8 +59,8 @@ Find all token sequences matching the supplied patterns on the `Doc`.
> from spacy.matcher import PhraseMatcher
>
> matcher = PhraseMatcher(nlp.vocab)
- > matcher.add("OBAMA", None, nlp(u"Barack Obama"))
- > doc = nlp(u"Barack Obama lifts America one last time in emotional farewell")
+ > matcher.add("OBAMA", None, nlp("Barack Obama"))
+ > doc = nlp("Barack Obama lifts America one last time in emotional farewell")
> matches = matcher(doc)
> ```
@@ -99,7 +99,7 @@ patterns.
> ```python
> matcher = PhraseMatcher(nlp.vocab)
> assert len(matcher) == 0
- > matcher.add("OBAMA", None, nlp(u"Barack Obama"))
+ > matcher.add("OBAMA", None, nlp("Barack Obama"))
> assert len(matcher) == 1
> ```
@@ -116,7 +116,7 @@ Check whether the matcher contains rules for a match ID.
> ```python
> matcher = PhraseMatcher(nlp.vocab)
> assert "OBAMA" not in matcher
- > matcher.add("OBAMA", None, nlp(u"Barack Obama"))
+ > matcher.add("OBAMA", None, nlp("Barack Obama"))
> assert "OBAMA" in matcher
> ```
@@ -140,10 +140,10 @@ overwritten.
>     print('Matched!', matches)
>
> matcher = PhraseMatcher(nlp.vocab)
- > matcher.add("OBAMA", on_match, nlp(u"Barack Obama"))
- > matcher.add("HEALTH", on_match, nlp(u"health care reform"),
- >             nlp(u"healthcare reform"))
- > doc = nlp(u"Barack Obama urges Congress to find courage to defend his healthcare reforms")
+ > matcher.add("OBAMA", on_match, nlp("Barack Obama"))
+ > matcher.add("HEALTH", on_match, nlp("health care reform"),
+ >             nlp("healthcare reform"))
+ > doc = nlp("Barack Obama urges Congress to find courage to defend his healthcare reforms")
> matches = matcher(doc)
> ```
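As an aside to the examples above: since spaCy v2.1 the `PhraseMatcher` can also match on other token attributes via the `attr` keyword; a hedged sketch (not part of the diff; the phrases are purely illustrative):

```python
from spacy.matcher import PhraseMatcher

# Matching on LOWER makes the phrase patterns effectively case-insensitive
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
matcher.add("OBAMA", None, nlp("barack obama"))
matches = matcher(nlp("BARACK OBAMA was here"))
```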


@@ -17,13 +17,13 @@ the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
> #### Example
>
> ```python
- > texts = [t.text for t in nlp(u"I have a blue car")]
+ > texts = [t.text for t in nlp("I have a blue car")]
> assert texts == ["I", "have", "a", "blue", "car"]
>
> merge_nps = nlp.create_pipe("merge_noun_chunks")
> nlp.add_pipe(merge_nps)
>
- > texts = [t.text for t in nlp(u"I have a blue car")]
+ > texts = [t.text for t in nlp("I have a blue car")]
> assert texts == ["I", "have", "a blue car"]
> ```
@@ -50,13 +50,13 @@ the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
> #### Example
>
> ```python
- > texts = [t.text for t in nlp(u"I like David Bowie")]
+ > texts = [t.text for t in nlp("I like David Bowie")]
> assert texts == ["I", "like", "David", "Bowie"]
>
> merge_ents = nlp.create_pipe("merge_entities")
> nlp.add_pipe(merge_ents)
>
- > texts = [t.text for t in nlp(u"I like David Bowie")]
+ > texts = [t.text for t in nlp("I like David Bowie")]
> assert texts == ["I", "like", "David Bowie"]
> ```


@ -59,7 +59,7 @@ the component has been added to the pipeline using
> nlp = English() > nlp = English()
> sentencizer = nlp.create_pipe("sentencizer") > sentencizer = nlp.create_pipe("sentencizer")
> nlp.add_pipe(sentencizer) > nlp.add_pipe(sentencizer)
> doc = nlp(u"This is a sentence. This is another sentence.") > doc = nlp("This is a sentence. This is another sentence.")
> assert len(list(doc.sents)) == 2 > assert len(list(doc.sents)) == 2
> ``` > ```

View File

@ -13,13 +13,13 @@ Create a Span object from the slice `doc[start : end]`.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"Give it back! He pleaded.") > doc = nlp("Give it back! He pleaded.")
> span = doc[1:4] > span = doc[1:4]
> assert [t.text for t in span] == [u"it", u"back", u"!"] > assert [t.text for t in span] == ["it", "back", "!"]
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------| | ----------- | ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------- |
| `doc` | `Doc` | The parent document. | | `doc` | `Doc` | The parent document. |
| `start` | int | The index of the first token of the span. | | `start` | int | The index of the first token of the span. |
| `end` | int | The index of the first token after the span. | | `end` | int | The index of the first token after the span. |
@ -35,7 +35,7 @@ Get a `Token` object.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"Give it back! He pleaded.") > doc = nlp("Give it back! He pleaded.")
> span = doc[1:4] > span = doc[1:4]
> assert span[1].text == "back" > assert span[1].text == "back"
> ``` > ```
@ -50,9 +50,9 @@ Get a `Span` object.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"Give it back! He pleaded.") > doc = nlp("Give it back! He pleaded.")
> span = doc[1:4] > span = doc[1:4]
> assert span[1:3].text == u"back!" > assert span[1:3].text == "back!"
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -67,9 +67,9 @@ Iterate over `Token` objects.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"Give it back! He pleaded.") > doc = nlp("Give it back! He pleaded.")
> span = doc[1:4] > span = doc[1:4]
> assert [t.text for t in span] == [u"it", u"back", u"!"] > assert [t.text for t in span] == ["it", "back", "!"]
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -83,7 +83,7 @@ Get the number of tokens in the span.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"Give it back! He pleaded.") > doc = nlp("Give it back! He pleaded.")
> span = doc[1:4] > span = doc[1:4]
> assert len(span) == 3 > assert len(span) == 3
> ``` > ```
@ -102,9 +102,9 @@ For details, see the documentation on
> >
> ```python > ```python
> from spacy.tokens import Span > from spacy.tokens import Span
> city_getter = lambda span: any(city in span.text for city in (u"New York", u"Paris", u"Berlin")) > city_getter = lambda span: any(city in span.text for city in ("New York", "Paris", "Berlin"))
> Span.set_extension("has_city", getter=city_getter) > Span.set_extension("has_city", getter=city_getter)
> doc = nlp(u"I like New York in Autumn") > doc = nlp("I like New York in Autumn")
> assert doc[1:4]._.has_city > assert doc[1:4]._.has_city
> ``` > ```
@ -180,7 +180,7 @@ using an average of word vectors.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"green apples and red oranges") > doc = nlp("green apples and red oranges")
> green_apples = doc[:2] > green_apples = doc[:2]
> red_oranges = doc[3:] > red_oranges = doc[3:]
> apples_oranges = green_apples.similarity(red_oranges) > apples_oranges = green_apples.similarity(red_oranges)
@ -202,7 +202,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like New York in Autumn") > doc = nlp("I like New York in Autumn")
> span = doc[1:4] > span = doc[1:4]
> matrix = span.get_lca_matrix() > matrix = span.get_lca_matrix()
> # array([[0, 0, 0], [0, 1, 2], [0, 2, 2]], dtype=int32) > # array([[0, 0, 0], [0, 1, 2], [0, 2, 2]], dtype=int32)
@ -222,7 +222,7 @@ shape `(N, M)`, where `N` is the length of the document. The values will be
> >
> ```python > ```python
> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA > from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
> doc = nlp(u"I like New York in Autumn.") > doc = nlp("I like New York in Autumn.")
> span = doc[2:3] > span = doc[2:3]
> # All strings mapped to integers, for easy export to numpy > # All strings mapped to integers, for easy export to numpy
> np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) > np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
@ -248,11 +248,11 @@ Retokenize the document, such that the span is merged into a single token.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like New York in Autumn.") > doc = nlp("I like New York in Autumn.")
> span = doc[2:4] > span = doc[2:4]
> span.merge() > span.merge()
> assert len(doc) == 6 > assert len(doc) == 6
> assert doc[2].text == u"New York" > assert doc[2].text == "New York"
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -268,12 +268,12 @@ if the entity recognizer has been applied.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"Mr. Best flew to New York on Saturday morning.") > doc = nlp("Mr. Best flew to New York on Saturday morning.")
> span = doc[0:6] > span = doc[0:6]
> ents = list(span.ents) > ents = list(span.ents)
> assert ents[0].label == 346 > assert ents[0].label == 346
> assert ents[0].label_ == "PERSON" > assert ents[0].label_ == "PERSON"
> assert ents[0].text == u"Mr. Best" > assert ents[0].text == "Mr. Best"
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -287,10 +287,10 @@ Create a new `Doc` object corresponding to the `Span`, with a copy of the data.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like New York in Autumn.") > doc = nlp("I like New York in Autumn.")
> span = doc[2:4] > span = doc[2:4]
> doc2 = span.as_doc() > doc2 = span.as_doc()
> assert doc2.text == u"New York" > assert doc2.text == "New York"
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -306,12 +306,12 @@ taken.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like New York in Autumn.") > doc = nlp("I like New York in Autumn.")
> i, like, new, york, in_, autumn, dot = range(len(doc)) > i, like, new, york, in_, autumn, dot = range(len(doc))
> assert doc[new].head.text == u"York" > assert doc[new].head.text == "York"
> assert doc[york].head.text == u"like" > assert doc[york].head.text == "like"
> new_york = doc[new:york+1] > new_york = doc[new:york+1]
> assert new_york.root.text == u"York" > assert new_york.root.text == "York"
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -325,9 +325,9 @@ A tuple of tokens coordinated to `span.root`.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like apples and oranges") > doc = nlp("I like apples and oranges")
> apples_conjuncts = doc[2:3].conjuncts > apples_conjuncts = doc[2:3].conjuncts
> assert [t.text for t in apples_conjuncts] == [u"oranges"] > assert [t.text for t in apples_conjuncts] == ["oranges"]
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -341,9 +341,9 @@ Tokens that are to the left of the span, whose heads are within the span.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like New York in Autumn.") > doc = nlp("I like New York in Autumn.")
> lefts = [t.text for t in doc[3:7].lefts] > lefts = [t.text for t in doc[3:7].lefts]
> assert lefts == [u"New"] > assert lefts == ["New"]
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -357,9 +357,9 @@ Tokens that are to the right of the span, whose heads are within the span.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like New York in Autumn.") > doc = nlp("I like New York in Autumn.")
> rights = [t.text for t in doc[2:4].rights] > rights = [t.text for t in doc[2:4].rights]
> assert rights == [u"in"] > assert rights == ["in"]
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -374,7 +374,7 @@ the span.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like New York in Autumn.") > doc = nlp("I like New York in Autumn.")
> assert doc[3:7].n_lefts == 1 > assert doc[3:7].n_lefts == 1
> ``` > ```
@ -390,7 +390,7 @@ the span.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like New York in Autumn.") > doc = nlp("I like New York in Autumn.")
> assert doc[2:4].n_rights == 1 > assert doc[2:4].n_rights == 1
> ``` > ```
@ -405,9 +405,9 @@ Tokens within the span and tokens which descend from them.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"Give it back! He pleaded.") > doc = nlp("Give it back! He pleaded.")
> subtree = [t.text for t in doc[:3].subtree] > subtree = [t.text for t in doc[:3].subtree]
> assert subtree == [u"Give", u"it", u"back", u"!"] > assert subtree == ["Give", "it", "back", "!"]
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -421,7 +421,7 @@ A boolean value indicating whether a word vector is associated with the object.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like apples") > doc = nlp("I like apples")
> assert doc[1:].has_vector > assert doc[1:].has_vector
> ``` > ```
@ -437,7 +437,7 @@ vectors.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like apples") > doc = nlp("I like apples")
> assert doc[1:].vector.dtype == "float32" > assert doc[1:].vector.dtype == "float32"
> assert doc[1:].vector.shape == (300,) > assert doc[1:].vector.shape == (300,)
> ``` > ```
@ -453,7 +453,7 @@ The L2 norm of the span's vector representation.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like apples") > doc = nlp("I like apples")
> doc[1:].vector_norm # 4.800883928527915 > doc[1:].vector_norm # 4.800883928527915
> doc[2:].vector_norm # 6.895897646384268 > doc[2:].vector_norm # 6.895897646384268
> assert doc[1:].vector_norm != doc[2:].vector_norm > assert doc[1:].vector_norm != doc[2:].vector_norm

View File

@ -16,7 +16,7 @@ Create the `StringStore`.
> >
> ```python > ```python
> from spacy.strings import StringStore > from spacy.strings import StringStore
> stringstore = StringStore([u"apple", u"orange"]) > stringstore = StringStore(["apple", "orange"])
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -31,7 +31,7 @@ Get the number of strings in the store.
> #### Example > #### Example
> >
> ```python > ```python
> stringstore = StringStore([u"apple", u"orange"]) > stringstore = StringStore(["apple", "orange"])
> assert len(stringstore) == 2 > assert len(stringstore) == 2
> ``` > ```
@ -46,10 +46,10 @@ Retrieve a string from a given hash, or vice versa.
> #### Example > #### Example
> >
> ```python > ```python
> stringstore = StringStore([u"apple", u"orange"]) > stringstore = StringStore(["apple", "orange"])
> apple_hash = stringstore[u"apple"] > apple_hash = stringstore["apple"]
> assert apple_hash == 8566208034543834098 > assert apple_hash == 8566208034543834098
> assert stringstore[apple_hash] == u"apple" > assert stringstore[apple_hash] == "apple"
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -64,9 +64,9 @@ Check whether a string is in the store.
> #### Example > #### Example
> >
> ```python > ```python
> stringstore = StringStore([u"apple", u"orange"]) > stringstore = StringStore(["apple", "orange"])
> assert u"apple" in stringstore > assert "apple" in stringstore
> assert not u"cherry" in stringstore > assert not "cherry" in stringstore
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -82,9 +82,9 @@ store will always include an empty string `''` at position `0`.
> #### Example > #### Example
> >
> ```python > ```python
> stringstore = StringStore([u"apple", u"orange"]) > stringstore = StringStore(["apple", "orange"])
> all_strings = [s for s in stringstore] > all_strings = [s for s in stringstore]
> assert all_strings == [u"apple", u"orange"] > assert all_strings == ["apple", "orange"]
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -98,12 +98,12 @@ Add a string to the `StringStore`.
> #### Example > #### Example
> >
> ```python > ```python
> stringstore = StringStore([u"apple", u"orange"]) > stringstore = StringStore(["apple", "orange"])
> banana_hash = stringstore.add(u"banana") > banana_hash = stringstore.add("banana")
> assert len(stringstore) == 3 > assert len(stringstore) == 3
> assert banana_hash == 2525716904149915114 > assert banana_hash == 2525716904149915114
> assert stringstore[banana_hash] == u"banana" > assert stringstore[banana_hash] == "banana"
> assert stringstore[u"banana"] == banana_hash > assert stringstore["banana"] == banana_hash
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -182,7 +182,7 @@ Get a 64-bit hash for a given string.
> >
> ```python > ```python
> from spacy.strings import hash_string > from spacy.strings import hash_string
> assert hash_string(u"apple") == 8566208034543834098 > assert hash_string("apple") == 8566208034543834098
> ``` > ```
| Name | Type | Description | | Name | Type | Description |

View File

@ -57,7 +57,7 @@ and all pipeline components are applied to the `Doc` in order. Both
> >
> ```python > ```python
> tagger = Tagger(nlp.vocab) > tagger = Tagger(nlp.vocab)
> doc = nlp(u"This is a sentence.") > doc = nlp("This is a sentence.")
> # This usually happens under the hood > # This usually happens under the hood
> processed = tagger(doc) > processed = tagger(doc)
> ``` > ```
@ -117,12 +117,11 @@ Modify a batch of documents, using pre-computed scores.
> tagger.set_annotations([doc1, doc2], scores, tensors) > tagger.set_annotations([doc1, doc2], scores, tensors)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| -------- | -------- | ----------------------------------------------------- | | --------- | -------- | ----------------------------------------------------- |
| `docs` | iterable | The documents to modify. | | `docs` | iterable | The documents to modify. |
| `scores` | - | The scores to set, produced by `Tagger.predict`. | | `scores` | - | The scores to set, produced by `Tagger.predict`. |
| `tensors`| iterable | The token representations used to predict the scores. | | `tensors` | iterable | The token representations used to predict the scores. |
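The `scores` and `tensors` in the example above are assumed to come from `Tagger.predict`; a minimal sketch of the full round trip, with `doc1` and `doc2` standing in for existing `Doc` objects:

```python
# Sketch: predict scores for a batch of docs, then write the annotations back
scores, tensors = tagger.predict([doc1, doc2])
tagger.set_annotations([doc1, doc2], scores, tensors)
```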
## Tagger.update {#update tag="method"} ## Tagger.update {#update tag="method"}

View File

@ -75,7 +75,7 @@ delegate to the [`predict`](/api/textcategorizer#predict) and
> >
> ```python > ```python
> textcat = TextCategorizer(nlp.vocab) > textcat = TextCategorizer(nlp.vocab)
> doc = nlp(u"This is a sentence.") > doc = nlp("This is a sentence.")
> # This usually happens under the hood > # This usually happens under the hood
> processed = textcat(doc) > processed = textcat(doc)
> ``` > ```
@ -136,11 +136,11 @@ Modify a batch of documents, using pre-computed scores.
> textcat.set_annotations([doc1, doc2], scores, tensors) > textcat.set_annotations([doc1, doc2], scores, tensors)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| -------- | -------- | --------------------------------------------------------- | | --------- | -------- | --------------------------------------------------------- |
| `docs` | iterable | The documents to modify. | | `docs` | iterable | The documents to modify. |
| `scores` | - | The scores to set, produced by `TextCategorizer.predict`. | | `scores` | - | The scores to set, produced by `TextCategorizer.predict`. |
| `tensors`| iterable | The token representations used to predict the scores. | | `tensors` | iterable | The token representations used to predict the scores. |
## TextCategorizer.update {#update tag="method"} ## TextCategorizer.update {#update tag="method"}

View File

@ -12,9 +12,9 @@ Construct a `Token` object.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"Give it back! He pleaded.") > doc = nlp("Give it back! He pleaded.")
> token = doc[0] > token = doc[0]
> assert token.text == u"Give" > assert token.text == "Give"
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -31,7 +31,7 @@ The number of unicode characters in the token, i.e. `token.text`.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"Give it back! He pleaded.") > doc = nlp("Give it back! He pleaded.")
> token = doc[0] > token = doc[0]
> assert len(token) == 4 > assert len(token) == 4
> ``` > ```
@ -50,9 +50,9 @@ For details, see the documentation on
> >
> ```python > ```python
> from spacy.tokens import Token > from spacy.tokens import Token
> fruit_getter = lambda token: token.text in (u"apple", u"pear", u"banana") > fruit_getter = lambda token: token.text in ("apple", "pear", "banana")
> Token.set_extension("is_fruit", getter=fruit_getter) > Token.set_extension("is_fruit", getter=fruit_getter)
> doc = nlp(u"I have an apple") > doc = nlp("I have an apple")
> assert doc[3]._.is_fruit > assert doc[3]._.is_fruit
> ``` > ```
@ -128,7 +128,7 @@ Check the value of a boolean flag.
> >
> ```python > ```python
> from spacy.attrs import IS_TITLE > from spacy.attrs import IS_TITLE
> doc = nlp(u"Give it back! He pleaded.") > doc = nlp("Give it back! He pleaded.")
> token = doc[0] > token = doc[0]
> assert token.check_flag(IS_TITLE) == True > assert token.check_flag(IS_TITLE) == True
> ``` > ```
@ -145,7 +145,7 @@ Compute a semantic similarity estimate. Defaults to cosine over vectors.
> #### Example > #### Example
> >
> ```python > ```python
> apples, _, oranges = nlp(u"apples and oranges") > apples, _, oranges = nlp("apples and oranges")
> apples_oranges = apples.similarity(oranges) > apples_oranges = apples.similarity(oranges)
> oranges_apples = oranges.similarity(apples) > oranges_apples = oranges.similarity(apples)
> assert apples_oranges == oranges_apples > assert apples_oranges == oranges_apples
@ -163,9 +163,9 @@ Get a neighboring token.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"Give it back! He pleaded.") > doc = nlp("Give it back! He pleaded.")
> give_nbor = doc[0].nbor() > give_nbor = doc[0].nbor()
> assert give_nbor.text == u"it" > assert give_nbor.text == "it"
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -181,7 +181,7 @@ dependency tree.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"Give it back! He pleaded.") > doc = nlp("Give it back! He pleaded.")
> give = doc[0] > give = doc[0]
> it = doc[1] > it = doc[1]
> assert give.is_ancestor(it) > assert give.is_ancestor(it)
@ -199,11 +199,11 @@ The rightmost token of this token's syntactic descendants.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"Give it back! He pleaded.") > doc = nlp("Give it back! He pleaded.")
> it_ancestors = doc[1].ancestors > it_ancestors = doc[1].ancestors
> assert [t.text for t in it_ancestors] == [u"Give"] > assert [t.text for t in it_ancestors] == ["Give"]
> he_ancestors = doc[4].ancestors > he_ancestors = doc[4].ancestors
> assert [t.text for t in he_ancestors] == [u"pleaded"] > assert [t.text for t in he_ancestors] == ["pleaded"]
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -217,9 +217,9 @@ A tuple of coordinated tokens, not including the token itself.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like apples and oranges") > doc = nlp("I like apples and oranges")
> apples_conjuncts = doc[2].conjuncts > apples_conjuncts = doc[2].conjuncts
> assert [t.text for t in apples_conjuncts] == [u"oranges"] > assert [t.text for t in apples_conjuncts] == ["oranges"]
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -233,9 +233,9 @@ A sequence of the token's immediate syntactic children.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"Give it back! He pleaded.") > doc = nlp("Give it back! He pleaded.")
> give_children = doc[0].children > give_children = doc[0].children
> assert [t.text for t in give_children] == [u"it", u"back", u"!"] > assert [t.text for t in give_children] == ["it", "back", "!"]
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -249,9 +249,9 @@ The leftward immediate children of the word, in the syntactic dependency parse.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like New York in Autumn.") > doc = nlp("I like New York in Autumn.")
> lefts = [t.text for t in doc[3].lefts] > lefts = [t.text for t in doc[3].lefts]
> assert lefts == [u'New'] > assert lefts == ["New"]
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -265,9 +265,9 @@ The rightward immediate children of the word, in the syntactic dependency parse.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like New York in Autumn.") > doc = nlp("I like New York in Autumn.")
> rights = [t.text for t in doc[3].rights] > rights = [t.text for t in doc[3].rights]
> assert rights == [u"in"] > assert rights == ["in"]
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -282,7 +282,7 @@ dependency parse.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like New York in Autumn.") > doc = nlp("I like New York in Autumn.")
> assert doc[3].n_lefts == 1 > assert doc[3].n_lefts == 1
> ``` > ```
@ -298,7 +298,7 @@ dependency parse.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like New York in Autumn.") > doc = nlp("I like New York in Autumn.")
> assert doc[3].n_rights == 1 > assert doc[3].n_rights == 1
> ``` > ```
@ -313,9 +313,9 @@ A sequence containing the token and all the token's syntactic descendants.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"Give it back! He pleaded.") > doc = nlp("Give it back! He pleaded.")
> give_subtree = doc[0].subtree > give_subtree = doc[0].subtree
> assert [t.text for t in give_subtree] == [u"Give", u"it", u"back", u"!"] > assert [t.text for t in give_subtree] == ["Give", "it", "back", "!"]
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -330,7 +330,7 @@ unknown. Defaults to `True` for the first token in the `Doc`.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"Give it back! He pleaded.") > doc = nlp("Give it back! He pleaded.")
> assert doc[4].is_sent_start > assert doc[4].is_sent_start
> assert not doc[5].is_sent_start > assert not doc[5].is_sent_start
> ``` > ```
@ -361,7 +361,7 @@ A boolean value indicating whether a word vector is associated with the token.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like apples") > doc = nlp("I like apples")
> apples = doc[2] > apples = doc[2]
> assert apples.has_vector > assert apples.has_vector
> ``` > ```
@ -377,7 +377,7 @@ A real-valued meaning representation.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like apples") > doc = nlp("I like apples")
> apples = doc[2] > apples = doc[2]
> assert apples.vector.dtype == "float32" > assert apples.vector.dtype == "float32"
> assert apples.vector.shape == (300,) > assert apples.vector.shape == (300,)
@ -394,7 +394,7 @@ The L2 norm of the token's vector representation.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like apples and pasta") > doc = nlp("I like apples and pasta")
> apples = doc[2] > apples = doc[2]
> pasta = doc[4] > pasta = doc[4]
> apples.vector_norm # 6.89589786529541 > apples.vector_norm # 6.89589786529541
@ -425,7 +425,7 @@ The L2 norm of the token's vector representation.
| `i` | int | The index of the token within the parent document. | | `i` | int | The index of the token within the parent document. |
| `ent_type` | int | Named entity type. | | `ent_type` | int | Named entity type. |
| `ent_type_` | unicode | Named entity type. | | `ent_type_` | unicode | Named entity type. |
| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | | `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. |
| `ent_iob_` | unicode | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. | | `ent_iob_` | unicode | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. |
| `ent_kb_id` <Tag variant="new">2.2</Tag> | int | Knowledge base ID that refers to the named entity this token is a part of, if any. | | `ent_kb_id` <Tag variant="new">2.2</Tag> | int | Knowledge base ID that refers to the named entity this token is a part of, if any. |
| `ent_kb_id_` <Tag variant="new">2.2</Tag> | unicode | Knowledge base ID that refers to the named entity this token is a part of, if any. | | `ent_kb_id_` <Tag variant="new">2.2</Tag> | unicode | Knowledge base ID that refers to the named entity this token is a part of, if any. |

View File

@ -5,7 +5,9 @@ tag: class
source: spacy/tokenizer.pyx source: spacy/tokenizer.pyx
--- ---
Segment text, and create `Doc` objects with the discovered segment boundaries. For a deeper understanding, see the docs on [how spaCy's tokenizer works](/usage/linguistic-features#how-tokenizer-works). Segment text, and create `Doc` objects with the discovered segment boundaries.
For a deeper understanding, see the docs on
[how spaCy's tokenizer works](/usage/linguistic-features#how-tokenizer-works).
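For illustration, a minimal sketch of the class in isolation: a `Tokenizer` built from a blank English vocab has no prefix, suffix or infix rules, so it only splits on whitespace.

```python
# Minimal sketch: a Tokenizer with no affix rules splits on whitespace only
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

nlp = English()
tokenizer = Tokenizer(nlp.vocab)
doc = tokenizer("This is a sentence")
assert [t.text for t in doc] == ["This", "is", "a", "sentence"]
```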
## Tokenizer.\_\_init\_\_ {#init tag="method"} ## Tokenizer.\_\_init\_\_ {#init tag="method"}
@ -49,7 +51,7 @@ Tokenize a string.
> #### Example > #### Example
> >
> ```python > ```python
> tokens = tokenizer(u"This is a sentence") > tokens = tokenizer("This is a sentence")
> assert len(tokens) == 4 > assert len(tokens) == 4
> ``` > ```
@ -65,7 +67,7 @@ Tokenize a stream of texts.
> #### Example > #### Example
> >
> ```python > ```python
> texts = [u"One document.", u"...", u"Lots of documents"] > texts = ["One document.", "...", "Lots of documents"]
> for doc in tokenizer.pipe(texts, batch_size=50): > for doc in tokenizer.pipe(texts, batch_size=50):
> pass > pass
> ``` > ```
@ -109,8 +111,9 @@ if no suffix rules match.
Add a special-case tokenization rule. This mechanism is also used to add custom Add a special-case tokenization rule. This mechanism is also used to add custom
tokenizer exceptions to the language data. See the usage guide on tokenizer exceptions to the language data. See the usage guide on
[adding languages](/usage/adding-languages#tokenizer-exceptions) and [linguistic features](/usage/linguistic-features#special-cases) for more [adding languages](/usage/adding-languages#tokenizer-exceptions) and
details and examples. [linguistic features](/usage/linguistic-features#special-cases) for more details
and examples.
> #### Example > #### Example
> >
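The body of this example is cut off in this hunk; a minimal sketch along the lines of the `gimme` special case shown in the usage docs later in this commit, using a blank English pipeline as a stand-in:

```python
# Sketch: split "gimme" into "gim" + "me" via a special-case rule
from spacy.lang.en import English
from spacy.symbols import ORTH

nlp = English()
nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])
assert [t.text for t in nlp("gimme that")] == ["gim", "me", "that"]
```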

View File

@ -112,10 +112,10 @@ list of available terms, see
> #### Example > #### Example
> >
> ```python > ```python
> spacy.explain(u"NORP") > spacy.explain("NORP")
> # Nationalities or religious or political groups > # Nationalities or religious or political groups
> >
> doc = nlp(u"Hello world") > doc = nlp("Hello world")
> for word in doc: > for word in doc:
> print(word.text, word.tag_, spacy.explain(word.tag_)) > print(word.text, word.tag_, spacy.explain(word.tag_))
> # Hello UH interjection > # Hello UH interjection
@ -181,8 +181,8 @@ browser. Will run a simple web server.
> import spacy > import spacy
> from spacy import displacy > from spacy import displacy
> nlp = spacy.load("en_core_web_sm") > nlp = spacy.load("en_core_web_sm")
> doc1 = nlp(u"This is a sentence.") > doc1 = nlp("This is a sentence.")
> doc2 = nlp(u"This is another sentence.") > doc2 = nlp("This is another sentence.")
> displacy.serve([doc1, doc2], style="dep") > displacy.serve([doc1, doc2], style="dep")
> ``` > ```
@ -192,7 +192,7 @@ browser. Will run a simple web server.
| `style` | unicode | Visualization style, `'dep'` or `'ent'`. | `'dep'` | | `style` | unicode | Visualization style, `'dep'` or `'ent'`. | `'dep'` |
| `page` | bool | Render markup as full HTML page. | `True` | | `page` | bool | Render markup as full HTML page. | `True` |
| `minify` | bool | Minify HTML markup. | `False` | | `minify` | bool | Minify HTML markup. | `False` |
| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` | | `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
| `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` | | `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` |
| `port` | int | Port to serve visualization. | `5000` | | `port` | int | Port to serve visualization. | `5000` |
| `host` | unicode | Host to serve visualization. | `'0.0.0.0'` | | `host` | unicode | Host to serve visualization. | `'0.0.0.0'` |
@ -207,7 +207,7 @@ Render a dependency parse tree or named entity visualization.
> import spacy > import spacy
> from spacy import displacy > from spacy import displacy
> nlp = spacy.load("en_core_web_sm") > nlp = spacy.load("en_core_web_sm")
> doc = nlp(u"This is a sentence.") > doc = nlp("This is a sentence.")
> html = displacy.render(doc, style="dep") > html = displacy.render(doc, style="dep")
> ``` > ```
@ -218,7 +218,7 @@ Render a dependency parse tree or named entity visualization.
| `page` | bool | Render markup as full HTML page. | `False` | | `page` | bool | Render markup as full HTML page. | `False` |
| `minify` | bool | Minify HTML markup. | `False` | | `minify` | bool | Minify HTML markup. | `False` |
| `jupyter` | bool | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None`. | `None` | | `jupyter` | bool | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None`. | `None` |
| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` | | `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
| `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` | | `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` |
| **RETURNS** | unicode | Rendered HTML markup. | | **RETURNS** | unicode | Rendered HTML markup. |
@ -262,16 +262,18 @@ If a setting is not present in the options, the default value will be used.
> displacy.serve(doc, style="ent", options=options) > displacy.serve(doc, style="ent", options=options)
> ``` > ```
| Name | Type | Description | Default | | Name | Type | Description | Default |
| -------- | ---- | ------------------------------------------------------------------------------------- | ------- | | --------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ |
| `ents` | list | Entity types to highlight (`None` for all types). | `None` | | `ents` | list | Entity types to highlight (`None` for all types). | `None` |
| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` | | `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` |
| `template` <Tag variant="new">2.2</Tag> | unicode | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) | | `template` <Tag variant="new">2.2</Tag> | unicode | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) |
By default, displaCy comes with colors for all By default, displaCy comes with colors for all
[entity types supported by spaCy](/api/annotation#named-entities). If you're [entity types supported by spaCy](/api/annotation#named-entities). If you're
using custom entity types, you can use the `colors` setting to add your own using custom entity types, you can use the `colors` setting to add your own
colors for them. Your application or model package can also expose a [`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy) to add custom labels and their colors automatically. colors for them. Your application or model package can also expose a
[`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy)
to add custom labels and their colors automatically.
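For instance, a minimal sketch of the `ents` and `colors` settings described above; the gradient value is just a placeholder:

```python
# Sketch: restrict highlighting to ORG entities and give them a custom color
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
options = {"ents": ["ORG"], "colors": {"ORG": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}}
html = displacy.render(doc, style="ent", options=options)
```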
## Utility functions {#util source="spacy/util.py"} ## Utility functions {#util source="spacy/util.py"}
@ -649,11 +651,11 @@ for batching. Larger `bufsize` means less bias.
> shuffled = itershuffle(values) > shuffled = itershuffle(values)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ---------- | -------- | ------------------------------------- | | ---------- | -------- | ----------------------------------- |
| `iterable` | iterable | Iterator to shuffle. | | `iterable` | iterable | Iterator to shuffle. |
| `bufsize` | int | Items to hold back (default: 1000). | | `bufsize` | int | Items to hold back (default: 1000). |
| **YIELDS** | iterable | The shuffled iterator. | | **YIELDS** | iterable | The shuffled iterator. |
### util.filter_spans {#util.filter_spans tag="function" new="2.1.4"} ### util.filter_spans {#util.filter_spans tag="function" new="2.1.4"}

View File

@ -26,7 +26,7 @@ you can add vectors to later.
> empty_vectors = Vectors(shape=(10000, 300)) > empty_vectors = Vectors(shape=(10000, 300))
> >
> data = numpy.zeros((3, 300), dtype='f') > data = numpy.zeros((3, 300), dtype='f')
> keys = [u"cat", u"dog", u"rat"] > keys = ["cat", "dog", "rat"]
> vectors = Vectors(data=data, keys=keys) > vectors = Vectors(data=data, keys=keys)
> ``` > ```
@ -45,9 +45,9 @@ raised.
> #### Example > #### Example
> >
> ```python > ```python
> cat_id = nlp.vocab.strings[u"cat"] > cat_id = nlp.vocab.strings["cat"]
> cat_vector = nlp.vocab.vectors[cat_id] > cat_vector = nlp.vocab.vectors[cat_id]
> assert cat_vector == nlp.vocab[u"cat"].vector > assert cat_vector == nlp.vocab["cat"].vector
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -62,7 +62,7 @@ Set a vector for the given key.
> #### Example > #### Example
> >
> ```python > ```python
> cat_id = nlp.vocab.strings[u"cat"] > cat_id = nlp.vocab.strings["cat"]
> vector = numpy.random.uniform(-1, 1, (300,)) > vector = numpy.random.uniform(-1, 1, (300,))
> nlp.vocab.vectors[cat_id] = vector > nlp.vocab.vectors[cat_id] = vector
> ``` > ```
@ -109,7 +109,7 @@ Check whether a key has been mapped to a vector entry in the table.
> #### Example > #### Example
> >
> ```python > ```python
> cat_id = nlp.vocab.strings[u"cat"] > cat_id = nlp.vocab.strings["cat"]
> nlp.vocab.vectors.add(cat_id, numpy.random.uniform(-1, 1, (300,))) > nlp.vocab.vectors.add(cat_id, numpy.random.uniform(-1, 1, (300,)))
> assert cat_id in nlp.vocab.vectors > assert cat_id in nlp.vocab.vectors
> ``` > ```
@ -132,9 +132,9 @@ mapping separately. If you need to manage the strings, you should use the
> >
> ```python > ```python
> vector = numpy.random.uniform(-1, 1, (300,)) > vector = numpy.random.uniform(-1, 1, (300,))
> cat_id = nlp.vocab.strings[u"cat"] > cat_id = nlp.vocab.strings["cat"]
> nlp.vocab.vectors.add(cat_id, vector=vector) > nlp.vocab.vectors.add(cat_id, vector=vector)
> nlp.vocab.vectors.add(u"dog", row=0) > nlp.vocab.vectors.add("dog", row=0)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -218,8 +218,8 @@ Look up one or more keys by row, or vice versa.
> #### Example > #### Example
> >
> ```python > ```python
> row = nlp.vocab.vectors.find(key=u"cat") > row = nlp.vocab.vectors.find(key="cat")
> rows = nlp.vocab.vectors.find(keys=[u"cat", u"dog"]) > rows = nlp.vocab.vectors.find(keys=["cat", "dog"])
> key = nlp.vocab.vectors.find(row=256) > key = nlp.vocab.vectors.find(row=256)
> keys = nlp.vocab.vectors.find(rows=[18, 256, 985]) > keys = nlp.vocab.vectors.find(rows=[18, 256, 985])
> ``` > ```
@ -241,7 +241,7 @@ vector table.
> >
> ```python > ```python
> vectors = Vectors(shape=(1, 300)) > vectors = Vectors(shape=(1, 300))
> vectors.add(u"cat", numpy.random.uniform(-1, 1, (300,))) > vectors.add("cat", numpy.random.uniform(-1, 1, (300,)))
> rows, dims = vectors.shape > rows, dims = vectors.shape
> assert rows == 1 > assert rows == 1
> assert dims == 300 > assert dims == 300
@ -276,7 +276,7 @@ If a table is full, it can be resized using
> >
> ```python > ```python
> vectors = Vectors(shape=(1, 300)) > vectors = Vectors(shape=(1, 300))
> vectors.add(u"cat", numpy.random.uniform(-1, 1, (300,))) > vectors.add("cat", numpy.random.uniform(-1, 1, (300,)))
> assert vectors.is_full > assert vectors.is_full
> ``` > ```

View File

@ -18,7 +18,7 @@ Create the vocabulary.
> >
> ```python > ```python
> from spacy.vocab import Vocab > from spacy.vocab import Vocab
> vocab = Vocab(strings=[u"hello", u"world"]) > vocab = Vocab(strings=["hello", "world"])
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -36,7 +36,7 @@ Get the current number of lexemes in the vocabulary.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"This is a sentence.") > doc = nlp("This is a sentence.")
> assert len(nlp.vocab) > 0 > assert len(nlp.vocab) > 0
> ``` > ```
@ -52,8 +52,8 @@ unicode string is given, a new lexeme is created and stored.
> #### Example > #### Example
> >
> ```python > ```python
> apple = nlp.vocab.strings[u"apple"] > apple = nlp.vocab.strings["apple"]
> assert nlp.vocab[apple] == nlp.vocab[u"apple"] > assert nlp.vocab[apple] == nlp.vocab["apple"]
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -84,8 +84,8 @@ given string, you need to look it up in
> #### Example > #### Example
> >
> ```python > ```python
> apple = nlp.vocab.strings[u"apple"] > apple = nlp.vocab.strings["apple"]
> oov = nlp.vocab.strings[u"dskfodkfos"] > oov = nlp.vocab.strings["dskfodkfos"]
> assert apple in nlp.vocab > assert apple in nlp.vocab
> assert oov not in nlp.vocab > assert oov not in nlp.vocab
> ``` > ```
@ -106,11 +106,11 @@ using `token.check_flag(flag_id)`.
> >
> ```python > ```python
> def is_my_product(text): > def is_my_product(text):
> products = [u"spaCy", u"Thinc", u"displaCy"] > products = ["spaCy", "Thinc", "displaCy"]
> return text in products > return text in products
> >
> MY_PRODUCT = nlp.vocab.add_flag(is_my_product) > MY_PRODUCT = nlp.vocab.add_flag(is_my_product)
> doc = nlp(u"I like spaCy") > doc = nlp("I like spaCy")
> assert doc[2].check_flag(MY_PRODUCT) == True > assert doc[2].check_flag(MY_PRODUCT) == True
> ``` > ```
@ -170,7 +170,7 @@ or hash value. If no vectors data is loaded, a `ValueError` is raised.
> #### Example > #### Example
> >
> ```python > ```python
> nlp.vocab.get_vector(u"apple") > nlp.vocab.get_vector("apple")
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -186,7 +186,7 @@ or hash value.
> #### Example > #### Example
> >
> ```python > ```python
> nlp.vocab.set_vector(u"apple", array([...])) > nlp.vocab.set_vector("apple", array([...]))
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -202,8 +202,8 @@ Words can be looked up by string or hash value.
> #### Example > #### Example
> >
> ```python > ```python
> if nlp.vocab.has_vector(u"apple"): > if nlp.vocab.has_vector("apple"):
> vector = nlp.vocab.get_vector(u"apple") > vector = nlp.vocab.get_vector("apple")
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -282,9 +282,9 @@ Load state from a binary string.
> #### Example > #### Example
> >
> ```python > ```python
> apple_id = nlp.vocab.strings[u"apple"] > apple_id = nlp.vocab.strings["apple"]
> assert type(apple_id) == int > assert type(apple_id) == int
> PERSON = nlp.vocab.strings[u"PERSON"] > PERSON = nlp.vocab.strings["PERSON"]
> assert type(PERSON) == int > assert type(PERSON) == int
> ``` > ```

View File

@ -1,5 +1,5 @@
A named entity is a "real-world object" that's assigned a name – for example, a A named entity is a "real-world object" that's assigned a name – for example, a
person, a country, a product or a book title. spaCy can **recognize** person, a country, a product or a book title. spaCy can **recognize**
[various types](/api/annotation#named-entities) of named entities in a document, [various types](/api/annotation#named-entities) of named entities in a document,
by asking the model for a **prediction**. Because models are statistical and by asking the model for a **prediction**. Because models are statistical and
strongly depend on the examples they were trained on, this doesn't always work strongly depend on the examples they were trained on, this doesn't always work
@ -12,7 +12,7 @@ Named entities are available as the `ents` property of a `Doc`:
import spacy import spacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion") doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for ent in doc.ents: for ent in doc.ents:
print(ent.text, ent.start_char, ent.end_char, ent.label_) print(ent.text, ent.start_char, ent.end_char, ent.label_)
@ -23,10 +23,10 @@ for ent in doc.ents:
> - **End:** Index of end of entity in the `Doc`. > - **End:** Index of end of entity in the `Doc`.
> - **Label:** Entity label, i.e. type. > - **Label:** Entity label, i.e. type.
| Text | Start | End | Label | Description | | Text | Start | End | Label | Description |
| ----------- | :---: | :-: | ------- | ---------------------------------------------------- | | ----------- | :---: | :-: | ------- | ---------------------------------------------------- |
| Apple | 0 | 5 | `ORG` | Companies, agencies, institutions. | | Apple | 0 | 5 | `ORG` | Companies, agencies, institutions. |
| U.K. | 27 | 31 | `GPE` | Geopolitical entity, i.e. countries, cities, states. | | U.K. | 27 | 31 | `GPE` | Geopolitical entity, i.e. countries, cities, states. |
| \$1 billion | 44 | 54 | `MONEY` | Monetary values, including unit. | | \$1 billion | 44 | 54 | `MONEY` | Monetary values, including unit. |
Using spaCy's built-in [displaCy visualizer](/usage/visualizers), here's what Using spaCy's built-in [displaCy visualizer](/usage/visualizers), here's what

View File

@ -15,8 +15,8 @@ need to add an underscore `_` to its name:
### {executable="true"} ### {executable="true"}
import spacy import spacy
nlp = spacy.load('en_core_web_sm') nlp = spacy.load("en_core_web_sm")
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion') doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc: for token in doc:
print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,

View File

@ -9,7 +9,7 @@ tokens, and we can iterate over them:
import spacy import spacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion") doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc: for token in doc:
print(token.text) print(token.text)
``` ```

View File

@ -48,8 +48,8 @@ norm, which can be used to normalize vectors.
### {executable="true"} ### {executable="true"}
import spacy import spacy
nlp = spacy.load('en_core_web_md') nlp = spacy.load("en_core_web_md")
tokens = nlp(u'dog cat banana afskfsd') tokens = nlp("dog cat banana afskfsd")
for token in tokens: for token in tokens:
print(token.text, token.has_vector, token.vector_norm, token.is_oov) print(token.text, token.has_vector, token.vector_norm, token.is_oov)
@ -88,8 +88,8 @@ definition of similarity.
### {executable="true"} ### {executable="true"}
import spacy import spacy
nlp = spacy.load('en_core_web_md') # make sure to use larger model! nlp = spacy.load("en_core_web_md") # make sure to use larger model!
tokens = nlp(u'dog cat banana') tokens = nlp("dog cat banana")
for token1 in tokens: for token1 in tokens:
for token2 in tokens: for token2 in tokens:

View File

@ -276,7 +276,7 @@ the lowercase spelling of a word exists, norms should always be in lowercase.
> #### Norms vs. lemmas > #### Norms vs. lemmas
> >
> ```python > ```python
> doc = nlp(u"I'm gonna realise") > doc = nlp("I'm gonna realise")
> norms = [token.norm_ for token in doc] > norms = [token.norm_ for token in doc]
> lemmas = [token.lemma_ for token in doc] > lemmas = [token.lemma_ for token in doc]
> assert norms == ["i", "am", "going", "to", "realize"] > assert norms == ["i", "am", "going", "to", "realize"]
@ -396,10 +396,10 @@ iterators:
> #### Noun chunks example > #### Noun chunks example
> >
> ```python > ```python
> doc = nlp(u"A phrase with another phrase occurs.") > doc = nlp("A phrase with another phrase occurs.")
> chunks = list(doc.noun_chunks) > chunks = list(doc.noun_chunks)
> assert chunks[0].text == u"A phrase" > assert chunks[0].text == "A phrase"
> assert chunks[1].text == u"another phrase" > assert chunks[1].text == "another phrase"
> ``` > ```
| Language | Code | Source | | Language | Code | Source |

View File

@ -392,7 +392,7 @@ from is called `spacy`. So, when using spaCy, never call anything else `spacy`.
<Accordion title="Pronoun lemma is returned as -PRON-" id="pron-lemma"> <Accordion title="Pronoun lemma is returned as -PRON-" id="pron-lemma">
```python ```python
doc = nlp(u"They are") doc = nlp("They are")
print(doc[0].lemma_) print(doc[0].lemma_)
# -PRON- # -PRON-
``` ```

View File

@ -69,7 +69,6 @@ of the two. The system works as follows:
morphological information, without consulting the context of the token. The morphological information, without consulting the context of the token. The
lemmatizer also accepts list-based exception files, acquired from lemmatizer also accepts list-based exception files, acquired from
[WordNet](https://wordnet.princeton.edu/). [WordNet](https://wordnet.princeton.edu/).
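A small sketch of the behaviour this describes: the lemma assigned to an ambiguous form depends on the part-of-speech tag predicted in context (the exact tags and lemmas below depend on the model):

```python
# Sketch: the rule-based lemmatizer picks a lemma based on the predicted POS
import spacy

nlp = spacy.load("en_core_web_sm")
for token in nlp("She was reading the paper"):
    print(token.text, token.pos_, token.lemma_)
# e.g. "was" -> "be", "reading" -> "read", "paper" -> "paper"
```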
## Dependency Parsing {#dependency-parse model="parser"} ## Dependency Parsing {#dependency-parse model="parser"}
@ -93,7 +92,7 @@ get the noun chunks in a document, simply iterate over
import spacy import spacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers") doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
for chunk in doc.noun_chunks: for chunk in doc.noun_chunks:
print(chunk.text, chunk.root.text, chunk.root.dep_, print(chunk.text, chunk.root.text, chunk.root.dep_,
chunk.root.head.text) chunk.root.head.text)
@ -124,7 +123,7 @@ get the string value with `.dep_`.
import spacy import spacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers") doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
for token in doc: for token in doc:
print(token.text, token.dep_, token.head.text, token.head.pos_, print(token.text, token.dep_, token.head.text, token.head.pos_,
[child for child in token.children]) [child for child in token.children])
@ -161,7 +160,7 @@ import spacy
from spacy.symbols import nsubj, VERB from spacy.symbols import nsubj, VERB
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers") doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
# Finding a verb with a subject from below — good # Finding a verb with a subject from below — good
verbs = set() verbs = set()
@ -204,7 +203,7 @@ children.
import spacy import spacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"bright red apples on the tree") doc = nlp("bright red apples on the tree")
print([token.text for token in doc[2].lefts]) # ['bright', 'red'] print([token.text for token in doc[2].lefts]) # ['bright', 'red']
print([token.text for token in doc[2].rights]) # ['on'] print([token.text for token in doc[2].rights]) # ['on']
print(doc[2].n_lefts) # 2 print(doc[2].n_lefts) # 2
@ -216,7 +215,7 @@ print(doc[2].n_rights) # 1
import spacy import spacy
nlp = spacy.load("de_core_news_sm") nlp = spacy.load("de_core_news_sm")
doc = nlp(u"schöne rote Äpfel auf dem Baum") doc = nlp("schöne rote Äpfel auf dem Baum")
print([token.text for token in doc[2].lefts]) # ['schöne', 'rote'] print([token.text for token in doc[2].lefts]) # ['schöne', 'rote']
print([token.text for token in doc[2].rights]) # ['auf'] print([token.text for token in doc[2].rights]) # ['auf']
``` ```
@ -240,7 +239,7 @@ sequence of tokens. You can walk up the tree with the
import spacy import spacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Credit and mortgage account holders must submit their requests") doc = nlp("Credit and mortgage account holders must submit their requests")
root = [token for token in doc if token.head == token][0] root = [token for token in doc if token.head == token][0]
subject = list(root.lefts)[0] subject = list(root.lefts)[0]
@ -270,7 +269,7 @@ end-point of a range, don't forget to `+1`!
import spacy import spacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Credit and mortgage account holders must submit their requests") doc = nlp("Credit and mortgage account holders must submit their requests")
span = doc[doc[4].left_edge.i : doc[4].right_edge.i+1] span = doc[doc[4].left_edge.i : doc[4].right_edge.i+1]
with doc.retokenize() as retokenizer: with doc.retokenize() as retokenizer:
retokenizer.merge(span) retokenizer.merge(span)
@ -311,7 +310,7 @@ import spacy
from spacy import displacy from spacy import displacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers") doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
# Since this is an interactive Jupyter environment, we can use displacy.render here # Since this is an interactive Jupyter environment, we can use displacy.render here
displacy.render(doc, style='dep') displacy.render(doc, style='dep')
``` ```
@ -336,7 +335,7 @@ the `nlp` object.
```python ```python
nlp = spacy.load("en_core_web_sm", disable=["parser"]) nlp = spacy.load("en_core_web_sm", disable=["parser"])
nlp = English().from_disk("/model", disable=["parser"]) nlp = English().from_disk("/model", disable=["parser"])
doc = nlp(u"I don't want parsed", disable=["parser"]) doc = nlp("I don't want parsed", disable=["parser"])
``` ```
<Infobox title="Important note: disabling pipeline components" variant="warning"> <Infobox title="Important note: disabling pipeline components" variant="warning">
@ -350,10 +349,10 @@ Language class via [`from_disk`](/api/language#from_disk).
```diff ```diff
+ nlp = spacy.load("en_core_web_sm", disable=["parser"]) + nlp = spacy.load("en_core_web_sm", disable=["parser"])
+ doc = nlp(u"I don't want parsed", disable=["parser"]) + doc = nlp("I don't want parsed", disable=["parser"])
- nlp = spacy.load("en_core_web_sm", parser=False) - nlp = spacy.load("en_core_web_sm", parser=False)
- doc = nlp(u"I don't want parsed", parse=False) - doc = nlp("I don't want parsed", parse=False)
``` ```
</Infobox> </Infobox>
@ -398,7 +397,7 @@ on a token, it will return an empty string.
import spacy import spacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"San Francisco considers banning sidewalk delivery robots") doc = nlp("San Francisco considers banning sidewalk delivery robots")
# document level # document level
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
@ -407,8 +406,8 @@ print(ents)
# token level # token level
ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_] ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_]
ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_] ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_]
print(ent_san) # [u'San', u'B', u'GPE'] print(ent_san) # ['San', 'B', 'GPE']
print(ent_francisco) # [u'Francisco', u'I', u'GPE'] print(ent_francisco) # ['Francisco', 'I', 'GPE']
``` ```
| Text | ent_iob | ent_iob\_ | ent_type\_ | Description | | Text | ent_iob | ent_iob\_ | ent_type\_ | Description |
@ -435,18 +434,17 @@ import spacy
from spacy.tokens import Span from spacy.tokens import Span
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"FB is hiring a new Vice President of global policy") doc = nlp("FB is hiring a new Vice President of global policy")
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print('Before', ents) print('Before', ents)
# the model didn't recognise "FB" as an entity :( # the model didn't recognise "FB" as an entity :(
ORG = doc.vocab.strings[u"ORG"] # get hash value of entity label fb_ent = Span(doc, 0, 1, label="ORG") # create a Span for the new entity
fb_ent = Span(doc, 0, 1, label=ORG) # create a Span for the new entity
doc.ents = list(doc.ents) + [fb_ent] doc.ents = list(doc.ents) + [fb_ent]
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print('After', ents) print('After', ents)
# [(u'FB', 0, 2, 'ORG')] 🎉 # [('FB', 0, 2, 'ORG')] 🎉
``` ```
Keep in mind that you need to create a `Span` with the start and end index of Keep in mind that you need to create a `Span` with the start and end index of
@ -468,13 +466,13 @@ import spacy
from spacy.attrs import ENT_IOB, ENT_TYPE from spacy.attrs import ENT_IOB, ENT_TYPE
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp.make_doc(u"London is a big city in the United Kingdom.") doc = nlp.make_doc("London is a big city in the United Kingdom.")
print("Before", doc.ents) # [] print("Before", doc.ents) # []
header = [ENT_IOB, ENT_TYPE] header = [ENT_IOB, ENT_TYPE]
attr_array = numpy.zeros((len(doc), len(header))) attr_array = numpy.zeros((len(doc), len(header)))
attr_array[0, 0] = 3 # B attr_array[0, 0] = 3 # B
attr_array[0, 1] = doc.vocab.strings[u"GPE"] attr_array[0, 1] = doc.vocab.strings["GPE"]
doc.from_array(header, attr_array) doc.from_array(header, attr_array)
print("After", doc.ents) # [London] print("After", doc.ents) # [London]
``` ```
@ -533,8 +531,8 @@ train_data = [
``` ```
```python ```python
doc = Doc(nlp.vocab, [u"rats", u"make", u"good", u"pets"]) doc = Doc(nlp.vocab, ["rats", "make", "good", "pets"])
gold = GoldParse(doc, entities=[u"U-ANIMAL", u"O", u"O", u"O"]) gold = GoldParse(doc, entities=["U-ANIMAL", "O", "O", "O"])
``` ```
<Infobox> <Infobox>
@ -565,7 +563,7 @@ For more details and examples, see the
import spacy import spacy
from spacy import displacy from spacy import displacy
text = u"When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously." text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(text) doc = nlp(text)
@ -578,29 +576,27 @@ import DisplacyEntHtml from 'images/displacy-ent2.html'
## Entity Linking {#entity-linking} ## Entity Linking {#entity-linking}
To ground the named entities into the "real-world", To ground the named entities into the "real-world", spaCy provides functionality
spaCy provides functionality to perform entity linking, which resolves a textual entity to perform entity linking, which resolves a textual entity to a unique
to a unique identifier from a knowledge base (KB). identifier from a knowledge base (KB).
The default model assigns WikiData identifiers, but you can create your own
[`KnowledgeBase`](/api/kb) and [train a new Entity Linking model](/usage/training#entity-linker) using
that custom-made KB.
The default model assigns WikiData identifiers, but you can create your own
[`KnowledgeBase`](/api/kb) and
[train a new Entity Linking model](/usage/training#entity-linker) using that
custom-made KB.
### Accessing entity identifiers {#accessing} ### Accessing entity identifiers {#entity-linking-accessing}
The annotated KB identifier is accessible as either a hash value
or as a string, using the attributes
`ent.kb_id` and `ent.kb_id_` of a [`Span`](/api/span) object,
or the `ent_kb_id` and `ent_kb_id_` attributes of a [`Token`](/api/token) object.
The annotated KB identifier is accessible as either a hash value or as a string,
using the attributes `ent.kb_id` and `ent.kb_id_` of a [`Span`](/api/span)
object, or the `ent_kb_id` and `ent_kb_id_` attributes of a
[`Token`](/api/token) object.
```python ```python
### {executable="true"}
import spacy import spacy
nlp = spacy.load("my_custom_el_model") nlp = spacy.load("my_custom_el_model")
doc = nlp(u"Ada Lovelace was born in London") doc = nlp("Ada Lovelace was born in London")
# document level # document level
ents = [(e.text, e.label_, e.kb_id_) for e in doc.ents] ents = [(e.text, e.label_, e.kb_id_) for e in doc.ents]
@ -615,14 +611,14 @@ print(ent_ada_1) # ['Lovelace', 'PERSON', 'Q7259']
print(ent_london_5) # ['London', 'GPE', 'Q84'] print(ent_london_5) # ['London', 'GPE', 'Q84']
``` ```
| Text | ent_type\_ | ent_kb_id\_ | | Text | ent_type\_ | ent_kb_id\_ |
| --------- | ---------- | ------------ | | -------- | ---------- | ----------- |
| Ada | `"PERSON"` | `"Q7259"` | | Ada | `"PERSON"` | `"Q7259"` |
| Lovelace | `"PERSON"` | `"Q7259"` | | Lovelace | `"PERSON"` | `"Q7259"` |
| was | `""` | `""` | | was | - | - |
| born | `""` | `""` | | born | - | - |
| in | `""` | `""` | | in | - | - |
| London | `"GPE"` | `"Q84"` | | London | `"GPE"` | `"Q84"` |
## Tokenization {#tokenization} ## Tokenization {#tokenization}
@ -692,53 +688,36 @@ this specific field. Here's how to add a special case rule to an existing
```python ```python
### {executable="true"} ### {executable="true"}
import spacy import spacy
from spacy.symbols import ORTH, LEMMA, POS, TAG from spacy.symbols import ORTH
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"gimme that") # phrase to tokenize doc = nlp("gimme that") # phrase to tokenize
print([w.text for w in doc]) # ['gimme', 'that'] print([w.text for w in doc]) # ['gimme', 'that']
# add special case rule # Add special case rule
special_case = [{ORTH: u"gim", LEMMA: u"give", POS: u"VERB"}, {ORTH: u"me"}] special_case = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case(u"gimme", special_case) nlp.tokenizer.add_special_case("gimme", special_case)
# check new tokenization # Check new tokenization
print([w.text for w in nlp(u"gimme that")]) # ['gim', 'me', 'that'] print([w.text for w in nlp("gimme that")]) # ['gim', 'me', 'that']
# Pronoun lemma is returned as -PRON-!
print([w.lemma_ for w in nlp(u"gimme that")]) # ['give', '-PRON-', 'that']
``` ```
<Infobox title="Why -PRON-?" variant="warning">
For details on spaCy's custom pronoun lemma `-PRON-`,
[see here](/usage/#pron-lemma).
</Infobox>
The special case doesn't have to match an entire whitespace-delimited substring. The special case doesn't have to match an entire whitespace-delimited substring.
The tokenizer will incrementally split off punctuation, and keep looking up the The tokenizer will incrementally split off punctuation, and keep looking up the
remaining substring: remaining substring:
```python ```python
assert "gimme" not in [w.text for w in nlp(u"gimme!")] assert "gimme" not in [w.text for w in nlp("gimme!")]
assert "gimme" not in [w.text for w in nlp(u'("...gimme...?")')] assert "gimme" not in [w.text for w in nlp('("...gimme...?")')]
``` ```
The special case rules have precedence over the punctuation splitting: The special case rules have precedence over the punctuation splitting:
```python ```python
special_case = [{ORTH: u"...gimme...?", LEMMA: u"give", TAG: u"VB"}] nlp.tokenizer.add_special_case("...gimme...?", [{ORTH: "...gimme...?"}])
nlp.tokenizer.add_special_case(u"...gimme...?", special_case) assert len(nlp("...gimme...?")) == 1
assert len(nlp(u"...gimme...?")) == 1
``` ```
Because the special-case rules allow you to set arbitrary token attributes, such
as the part-of-speech, lemma, etc, they make a good mechanism for arbitrary
fix-up rules. Having this logic live in the tokenizer isn't very satisfying from
a design perspective, however, so the API may eventually be exposed on the
[`Language`](/api/language) class itself.
### How spaCy's tokenizer works {#how-tokenizer-works} ### How spaCy's tokenizer works {#how-tokenizer-works}
spaCy introduces a novel tokenization algorithm that gives a better balance spaCy introduces a novel tokenization algorithm that gives a better balance
@ -838,7 +817,7 @@ def custom_tokenizer(nlp):
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = custom_tokenizer(nlp) nlp.tokenizer = custom_tokenizer(nlp)
doc = nlp(u"hello-world.") doc = nlp("hello-world.")
print([t.text for t in doc]) print([t.text for t in doc])
``` ```
@ -955,7 +934,7 @@ class WhitespaceTokenizer(object):
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab) nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp(u"What's happened to me? he thought. It wasn't a dream.") doc = nlp("What's happened to me? he thought. It wasn't a dream.")
print([t.text for t in doc]) print([t.text for t in doc])
``` ```
@ -980,7 +959,7 @@ from spacy.tokens import Doc
from spacy.lang.en import English from spacy.lang.en import English
nlp = English() nlp = English()
doc = Doc(nlp.vocab, words=[u"Hello", u",", u"world", u"!"], doc = Doc(nlp.vocab, words=["Hello", ",", "world", "!"],
spaces=[False, True, False, False]) spaces=[False, True, False, False])
print([(t.text, t.text_with_ws, t.whitespace_) for t in doc]) print([(t.text, t.text_with_ws, t.whitespace_) for t in doc])
``` ```
@ -997,8 +976,8 @@ from spacy.tokens import Doc
from spacy.lang.en import English from spacy.lang.en import English
nlp = English() nlp = English()
bad_spaces = Doc(nlp.vocab, words=[u"Hello", u",", u"world", u"!"]) bad_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"])
good_spaces = Doc(nlp.vocab, words=[u"Hello", u",", u"world", u"!"], good_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"],
spaces=[False, True, False, False]) spaces=[False, True, False, False])
print(bad_spaces.text) # 'Hello , world !' print(bad_spaces.text) # 'Hello , world !'
@ -1280,7 +1259,7 @@ that yields [`Span`](/api/span) objects.
import spacy import spacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"This is a sentence. This is another sentence.") doc = nlp("This is a sentence. This is another sentence.")
for sent in doc.sents: for sent in doc.sents:
print(sent.text) print(sent.text)
``` ```
@ -1300,7 +1279,7 @@ from spacy.lang.en import English
nlp = English() # just the language with no model nlp = English() # just the language with no model
sentencizer = nlp.create_pipe("sentencizer") sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer) nlp.add_pipe(sentencizer)
doc = nlp(u"This is a sentence. This is another sentence.") doc = nlp("This is a sentence. This is another sentence.")
for sent in doc.sents: for sent in doc.sents:
print(sent.text) print(sent.text)
``` ```
@ -1336,7 +1315,7 @@ take advantage of dependency-based sentence segmentation.
### {executable="true"} ### {executable="true"}
import spacy import spacy
text = u"this is a sentence...hello...and another sentence." text = "this is a sentence...hello...and another sentence."
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(text) doc = nlp(text)


@ -120,7 +120,7 @@ python -m spacy download en_core_web_sm
```python ```python
import spacy import spacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"This is a sentence.") doc = nlp("This is a sentence.")
``` ```
<Infobox title="Important note" variant="warning"> <Infobox title="Important note" variant="warning">
@ -197,7 +197,7 @@ nlp = spacy.load("en_core_web_sm") # load model package "en_core_web_s
nlp = spacy.load("/path/to/en_core_web_sm") # load package from a directory nlp = spacy.load("/path/to/en_core_web_sm") # load package from a directory
nlp = spacy.load("en") # load model with shortcut link "en" nlp = spacy.load("en") # load model with shortcut link "en"
doc = nlp(u"This is a sentence.") doc = nlp("This is a sentence.")
``` ```
<Infobox title="Tip: Preview model info"> <Infobox title="Tip: Preview model info">
@ -269,7 +269,7 @@ also `import` it and then call its `load()` method with no arguments:
import en_core_web_sm import en_core_web_sm
nlp = en_core_web_sm.load() nlp = en_core_web_sm.load()
doc = nlp(u"This is a sentence.") doc = nlp("This is a sentence.")
``` ```
How you choose to load your models ultimately depends on personal preference. How you choose to load your models ultimately depends on personal preference.


@ -20,7 +20,7 @@ component** on the `Doc`, in order. It then returns the processed `Doc` that you
can work with. can work with.
```python ```python
doc = nlp(u"This is a text") doc = nlp("This is a text")
``` ```
When processing large volumes of text, the statistical models are usually more When processing large volumes of text, the statistical models are usually more
@ -29,7 +29,7 @@ efficient if you let them work on batches of texts. spaCy's
processed `Doc` objects. The batching is done internally. processed `Doc` objects. The batching is done internally.
```diff ```diff
texts = [u"This is a text", u"These are lots of texts", u"..."] texts = ["This is a text", "These are lots of texts", "..."]
- docs = [nlp(text) for text in texts] - docs = [nlp(text) for text in texts]
+ docs = list(nlp.pipe(texts)) + docs = list(nlp.pipe(texts))
``` ```
@ -172,7 +172,7 @@ which is then processed by the component next in the pipeline.
```python ```python
### The pipeline under the hood ### The pipeline under the hood
doc = nlp.make_doc(u"This is a sentence") # create a Doc from raw text doc = nlp.make_doc("This is a sentence") # create a Doc from raw text
for name, proc in nlp.pipeline: # iterate over components in order for name, proc in nlp.pipeline: # iterate over components in order
doc = proc(doc) # apply each component doc = proc(doc) # apply each component
``` ```
@ -263,12 +263,12 @@ blocks.
### Disable for block ### Disable for block
# 1. Use as a contextmanager # 1. Use as a contextmanager
with nlp.disable_pipes("tagger", "parser"): with nlp.disable_pipes("tagger", "parser"):
doc = nlp(u"I won't be tagged and parsed") doc = nlp("I won't be tagged and parsed")
doc = nlp(u"I will be tagged and parsed") doc = nlp("I will be tagged and parsed")
# 2. Restore manually # 2. Restore manually
disabled = nlp.disable_pipes("ner") disabled = nlp.disable_pipes("ner")
doc = nlp(u"I won't have named entities") doc = nlp("I won't have named entities")
disabled.restore() disabled.restore()
``` ```
@ -295,11 +295,11 @@ initializing a Language class via [`from_disk`](/api/language#from_disk).
```diff ```diff
- nlp = spacy.load('en', tagger=False, entity=False) - nlp = spacy.load('en', tagger=False, entity=False)
- doc = nlp(u"I don't want parsed", parse=False) - doc = nlp("I don't want parsed", parse=False)
+ nlp = spacy.load("en", disable=["ner"]) + nlp = spacy.load("en", disable=["ner"])
+ nlp.remove_pipe("parser") + nlp.remove_pipe("parser")
+ doc = nlp(u"I don't want parsed") + doc = nlp("I don't want parsed")
``` ```
</Infobox> </Infobox>
@ -376,7 +376,7 @@ def my_component(doc):
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(my_component, name="print_info", last=True) nlp.add_pipe(my_component, name="print_info", last=True)
print(nlp.pipe_names) # ['tagger', 'parser', 'ner', 'print_info'] print(nlp.pipe_names) # ['tagger', 'parser', 'ner', 'print_info']
doc = nlp(u"This is a sentence.") doc = nlp("This is a sentence.")
``` ```
@ -426,14 +426,14 @@ class EntityMatcher(object):
return doc return doc
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
terms = (u"cat", u"dog", u"tree kangaroo", u"giant sea spider") terms = ("cat", "dog", "tree kangaroo", "giant sea spider")
entity_matcher = EntityMatcher(nlp, terms, "ANIMAL") entity_matcher = EntityMatcher(nlp, terms, "ANIMAL")
nlp.add_pipe(entity_matcher, after="ner") nlp.add_pipe(entity_matcher, after="ner")
print(nlp.pipe_names) # The components in the pipeline print(nlp.pipe_names) # The components in the pipeline
doc = nlp(u"This is a text about Barack Obama and a tree kangaroo") doc = nlp("This is a text about Barack Obama and a tree kangaroo")
print([(ent.text, ent.label_) for ent in doc.ents]) print([(ent.text, ent.label_) for ent in doc.ents])
``` ```
@ -471,7 +471,7 @@ def custom_sentencizer(doc):
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(custom_sentencizer, before="parser") # Insert before the parser nlp.add_pipe(custom_sentencizer, before="parser") # Insert before the parser
doc = nlp(u"This is. A sentence. | This is. Another sentence.") doc = nlp("This is. A sentence. | This is. Another sentence.")
for sent in doc.sents: for sent in doc.sents:
print(sent.text) print(sent.text)
``` ```
@ -517,7 +517,7 @@ config parameters are passed all the way down from
components with custom settings: components with custom settings:
```python ```python
nlp = spacy.load("your_custom_model", terms=(u"tree kangaroo"), label="ANIMAL") nlp = spacy.load("your_custom_model", terms=["tree kangaroo"], label="ANIMAL")
``` ```
<Infobox title="Important note" variant="warning"> <Infobox title="Important note" variant="warning">
@ -617,7 +617,7 @@ raise an `AttributeError`.
### Example ### Example
from spacy.tokens import Doc, Span, Token from spacy.tokens import Doc, Span, Token
fruits = [u"apple", u"pear", u"banana", u"orange", u"strawberry"] fruits = ["apple", "pear", "banana", "orange", "strawberry"]
is_fruit_getter = lambda token: token.text in fruits is_fruit_getter = lambda token: token.text in fruits
has_fruit_getter = lambda obj: any([t.text in fruits for t in obj]) has_fruit_getter = lambda obj: any([t.text in fruits for t in obj])
@ -629,7 +629,7 @@ Span.set_extension("has_fruit", getter=has_fruit_getter)
> #### Usage example > #### Usage example
> >
> ```python > ```python
> doc = nlp(u"I have an apple and a melon") > doc = nlp("I have an apple and a melon")
> assert doc[3]._.is_fruit # get Token attributes > assert doc[3]._.is_fruit # get Token attributes
> assert not doc[0]._.is_fruit > assert not doc[0]._.is_fruit
> assert doc._.has_fruit # get Doc attributes > assert doc._.has_fruit # get Doc attributes


@ -90,7 +90,7 @@ the pattern is not going to produce any results. When developing complex
patterns, make sure to check examples against spaCy's tokenization: patterns, make sure to check examples against spaCy's tokenization:
```python ```python
doc = nlp(u"A complex-example,!") doc = nlp("A complex-example,!")
print([token.text for token in doc]) print([token.text for token in doc])
``` ```
@ -113,7 +113,7 @@ matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}] pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", None, pattern) matcher.add("HelloWorld", None, pattern)
doc = nlp(u"Hello, world! Hello world!") doc = nlp("Hello, world! Hello world!")
matches = matcher(doc) matches = matcher(doc)
for match_id, start, end in matches: for match_id, start, end in matches:
string_id = nlp.vocab.strings[match_id] # Get string representation string_id = nlp.vocab.strings[match_id] # Get string representation
@ -447,7 +447,7 @@ def add_event_ent(matcher, doc, i, matches):
pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}] pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
matcher.add("GoogleIO", add_event_ent, pattern) matcher.add("GoogleIO", add_event_ent, pattern)
doc = nlp(u"This is a text about Google I/O") doc = nlp("This is a text about Google I/O")
matches = matcher(doc) matches = matcher(doc)
``` ```
@ -539,7 +539,7 @@ class BadHTMLMerger(object):
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
html_merger = BadHTMLMerger(nlp) html_merger = BadHTMLMerger(nlp)
nlp.add_pipe(html_merger, last=True) # Add component to the pipeline nlp.add_pipe(html_merger, last=True) # Add component to the pipeline
doc = nlp(u"Hello<br>world! <br/> This is a test.") doc = nlp("Hello<br>world! <br/> This is a test.")
for token in doc: for token in doc:
print(token.text, token._.bad_html) print(token.text, token._.bad_html)
@ -617,7 +617,7 @@ def collect_sents(matcher, doc, i, matches):
pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"}, pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"},
{"POS": "ADJ"}] {"POS": "ADJ"}]
matcher.add("FacebookIs", collect_sents, pattern) # add pattern matcher.add("FacebookIs", collect_sents, pattern) # add pattern
doc = nlp(u"I'd say that Facebook is evil. Facebook is pretty cool, right?") doc = nlp("I'd say that Facebook is evil. Facebook is pretty cool, right?")
matches = matcher(doc) matches = matcher(doc)
# Serve visualization of sentences containing match with displaCy # Serve visualization of sentences containing match with displaCy
@ -673,7 +673,7 @@ pattern = [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"},
{"ORTH": "-", "OP": "?"}, {"SHAPE": "ddd"}] {"ORTH": "-", "OP": "?"}, {"SHAPE": "ddd"}]
matcher.add("PHONE_NUMBER", None, pattern) matcher.add("PHONE_NUMBER", None, pattern)
doc = nlp(u"Call me at (123) 456 789 or (123) 456 789!") doc = nlp("Call me at (123) 456 789 or (123) 456 789!")
print([t.text for t in doc]) print([t.text for t in doc])
matches = matcher(doc) matches = matcher(doc)
for match_id, start, end in matches: for match_id, start, end in matches:
@ -719,8 +719,8 @@ from spacy.matcher import Matcher
nlp = English() # We only want the tokenizer, so no need to load a model nlp = English() # We only want the tokenizer, so no need to load a model
matcher = Matcher(nlp.vocab) matcher = Matcher(nlp.vocab)
pos_emoji = [u"😀", u"😃", u"😂", u"🤣", u"😊", u"😍"] # Positive emoji pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"] # Positive emoji
neg_emoji = [u"😞", u"😠", u"😩", u"😢", u"😭", u"😒"] # Negative emoji neg_emoji = ["😞", "😠", "😩", "😢", "😭", "😒"] # Negative emoji
# Add patterns to match one or more emoji tokens # Add patterns to match one or more emoji tokens
pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji] pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji]
@ -740,7 +740,7 @@ matcher.add("SAD", label_sentiment, *neg_patterns) # Add negative pattern
# Add pattern for valid hashtag, i.e. '#' plus any ASCII token # Add pattern for valid hashtag, i.e. '#' plus any ASCII token
matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}]) matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}])
doc = nlp(u"Hello world 😀 #MondayMotivation") doc = nlp("Hello world 😀 #MondayMotivation")
matches = matcher(doc) matches = matcher(doc)
for match_id, start, end in matches: for match_id, start, end in matches:
string_id = doc.vocab.strings[match_id] # Look up string ID string_id = doc.vocab.strings[match_id] # Look up string ID
@ -797,7 +797,7 @@ matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}])
# Register token extension # Register token extension
Token.set_extension("is_hashtag", default=False) Token.set_extension("is_hashtag", default=False)
doc = nlp(u"Hello world 😀 #MondayMotivation") doc = nlp("Hello world 😀 #MondayMotivation")
matches = matcher(doc) matches = matcher(doc)
hashtags = [] hashtags = []
for match_id, start, end in matches: for match_id, start, end in matches:
@ -838,13 +838,13 @@ from spacy.matcher import PhraseMatcher
nlp = spacy.load('en_core_web_sm') nlp = spacy.load('en_core_web_sm')
matcher = PhraseMatcher(nlp.vocab) matcher = PhraseMatcher(nlp.vocab)
terms = [u"Barack Obama", u"Angela Merkel", u"Washington, D.C."] terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]
# Only run nlp.make_doc to speed things up # Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in terms] patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", None, *patterns) matcher.add("TerminologyList", None, *patterns)
doc = nlp(u"German Chancellor Angela Merkel and US President Barack Obama " doc = nlp("German Chancellor Angela Merkel and US President Barack Obama "
u"converse in the Oval Office inside the White House in Washington, D.C.") "converse in the Oval Office inside the White House in Washington, D.C.")
matches = matcher(doc) matches = matcher(doc)
for match_id, start, end in matches: for match_id, start, end in matches:
span = doc[start:end] span = doc[start:end]
@ -853,8 +853,8 @@ for match_id, start, end in matches:
Since spaCy is used for processing both the patterns and the text to be matched, Since spaCy is used for processing both the patterns and the text to be matched,
you won't have to worry about specific tokenization; for example, you can you won't have to worry about specific tokenization; for example, you can
simply pass in `nlp(u"Washington, D.C.")` and won't have to write a complex simply pass in `nlp("Washington, D.C.")` and won't have to write a complex token
token pattern covering the exact tokenization of the term. pattern covering the exact tokenization of the term.
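
To make the difference concrete, here's a small sketch that matches the same
term once as a phrase pattern and once as the token pattern you'd otherwise
have to write by hand. It assumes `en_core_web_sm` is installed and that the
term is split into `["Washington", ",", "D.C."]`.

```python
import spacy
from spacy.matcher import Matcher, PhraseMatcher

nlp = spacy.load("en_core_web_sm")

# Phrase pattern: spaCy tokenizes the term for you
phrase_matcher = PhraseMatcher(nlp.vocab)
phrase_matcher.add("GPE", None, nlp("Washington, D.C."))

# Hand-written token pattern that has to mirror the exact tokenization
token_matcher = Matcher(nlp.vocab)
token_matcher.add("GPE", None, [{"ORTH": "Washington"}, {"ORTH": ","}, {"ORTH": "D.C."}])

doc = nlp("She moved to Washington, D.C. last year.")
print([doc[start:end].text for _, start, end in phrase_matcher(doc)])
print([doc[start:end].text for _, start, end in token_matcher(doc)])
```
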
<Infobox title="Important note on creating patterns" variant="warning"> <Infobox title="Important note on creating patterns" variant="warning">
@ -889,10 +889,10 @@ from spacy.matcher import PhraseMatcher
nlp = English() nlp = English()
matcher = PhraseMatcher(nlp.vocab, attr="LOWER") matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = [nlp.make_doc(name) for name in [u"Angela Merkel", u"Barack Obama"]] patterns = [nlp.make_doc(name) for name in ["Angela Merkel", "Barack Obama"]]
matcher.add("Names", None, *patterns) matcher.add("Names", None, *patterns)
doc = nlp(u"angela merkel and us president barack Obama") doc = nlp("angela merkel and us president barack Obama")
for match_id, start, end in matcher(doc): for match_id, start, end in matcher(doc):
print("Matched based on lowercase token text:", doc[start:end]) print("Matched based on lowercase token text:", doc[start:end])
``` ```
@ -924,9 +924,9 @@ from spacy.matcher import PhraseMatcher
nlp = English() nlp = English()
matcher = PhraseMatcher(nlp.vocab, attr="SHAPE") matcher = PhraseMatcher(nlp.vocab, attr="SHAPE")
matcher.add("IP", None, nlp(u"127.0.0.1"), nlp(u"127.127.0.0")) matcher.add("IP", None, nlp("127.0.0.1"), nlp("127.127.0.0"))
doc = nlp(u"Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.") doc = nlp("Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.")
for match_id, start, end in matcher(doc): for match_id, start, end in matcher(doc):
print("Matched based on token shape:", doc[start:end]) print("Matched based on token shape:", doc[start:end])
``` ```
@ -982,7 +982,7 @@ patterns = [{"label": "ORG", "pattern": "Apple"},
ruler.add_patterns(patterns) ruler.add_patterns(patterns)
nlp.add_pipe(ruler) nlp.add_pipe(ruler)
doc = nlp(u"Apple is opening its first big office in San Francisco.") doc = nlp("Apple is opening its first big office in San Francisco.")
print([(ent.text, ent.label_) for ent in doc.ents]) print([(ent.text, ent.label_) for ent in doc.ents])
``` ```
@ -1006,7 +1006,7 @@ patterns = [{"label": "ORG", "pattern": "MyCorp Inc."}]
ruler.add_patterns(patterns) ruler.add_patterns(patterns)
nlp.add_pipe(ruler) nlp.add_pipe(ruler)
doc = nlp(u"MyCorp Inc. is a company in the U.S.") doc = nlp("MyCorp Inc. is a company in the U.S.")
print([(ent.text, ent.label_) for ent in doc.ents]) print([(ent.text, ent.label_) for ent in doc.ents])
``` ```


@ -64,7 +64,7 @@ _then_ loads in the binary data. You can read more about this process
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"This is a text.") > doc = nlp("This is a text.")
> data = pickle.dumps(doc) > data = pickle.dumps(doc)
> ``` > ```
@ -84,8 +84,8 @@ the _same_ `Vocab` object, it will only be included once.
```python ```python
### Pickling objects with shared data {highlight="8-9"} ### Pickling objects with shared data {highlight="8-9"}
doc1 = nlp(u"Hello world") doc1 = nlp("Hello world")
doc2 = nlp(u"This is a test") doc2 = nlp("This is a test")
doc1_data = pickle.dumps(doc1) doc1_data = pickle.dumps(doc1)
doc2_data = pickle.dumps(doc2) doc2_data = pickle.dumps(doc2)
@ -347,7 +347,7 @@ spaCy is now able to create the pipeline component `'snek'`:
>>> nlp = English() >>> nlp = English()
>>> snek = nlp.create_pipe("snek") # this now works! 🐍🎉 >>> snek = nlp.create_pipe("snek") # this now works! 🐍🎉
>>> nlp.add_pipe(snek) >>> nlp.add_pipe(snek)
>>> doc = nlp(u"I am snek") >>> doc = nlp("I am snek")
--..,_ _,.--. --..,_ _,.--.
`'.'. .'`__ o `;__. `'.'. .'`__ o `;__.
'.'. .'.'` '---'` ` '.'. .'.'` '---'` `
@ -497,8 +497,8 @@ If you're training a named entity recognition model for a custom domain, you may
end up training different labels that don't have pre-defined colors in the end up training different labels that don't have pre-defined colors in the
[`displacy` visualizer](/usage/visualizers#ent). The `spacy_displacy_colors` [`displacy` visualizer](/usage/visualizers#ent). The `spacy_displacy_colors`
entry point lets you define a dictionary of entity labels mapped to their color entry point lets you define a dictionary of entity labels mapped to their color
values. It's added to the pre-defined colors and can also overwrite values. It's added to the pre-defined colors and can also overwrite existing
existing values. values.
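
As a rough sketch of how the pieces fit together (the package and module names
below are hypothetical), the colors live in a plain dictionary that the package
exposes via the `spacy_displacy_colors` entry point group in its `setup.py`:

```python
### Hypothetical package, both files shown in one snippet for brevity
# snek_colors/__init__.py
displacy_colors = {"SNEK": "#3dff74"}

# setup.py
from setuptools import setup

setup(
    name="snek_colors",
    entry_points={
        "spacy_displacy_colors": ["snek_colors = snek_colors:displacy_colors"]
    },
)
```
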
> #### Domain-specific NER labels > #### Domain-specific NER labels
> >
@ -528,8 +528,8 @@ setup(
``` ```
After installing the package, the custom colors will be used when After installing the package, the custom colors will be used when
visualizing text with `displacy`. Whenever the label `SNEK` is assigned, it visualizing text with `displacy`. Whenever the label `SNEK` is assigned, it will
will be displayed in `#3dff74`. be displayed in `#3dff74`.
import DisplaCyEntSnekHtml from 'images/displacy-ent-snek.html' import DisplaCyEntSnekHtml from 'images/displacy-ent-snek.html'


@ -179,7 +179,7 @@ processed `Doc`:
import spacy import spacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion") doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc: for token in doc:
print(token.text, token.pos_, token.dep_) print(token.text, token.pos_, token.dep_)
``` ```
@ -240,8 +240,8 @@ of a model, see the usage guides on
<Infobox title="📖 Entity Linking"> <Infobox title="📖 Entity Linking">
To learn more about entity linking in spaCy, and how to **train and update** To learn more about entity linking in spaCy, and how to **train and update** the
the entity linker predictions, see the usage guides on entity linker predictions, see the usage guides on
[entity linking](/usage/linguistic-features#entity-linking) and [entity linking](/usage/linguistic-features#entity-linking) and
[training the entity linker](/usage/training#entity-linker). [training the entity linker](/usage/training#entity-linker).
@ -307,8 +307,8 @@ its hash, or a hash to get its string:
import spacy import spacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"I love coffee") doc = nlp("I love coffee")
print(doc.vocab.strings[u"coffee"]) # 3197928453018144401 print(doc.vocab.strings["coffee"]) # 3197928453018144401
print(doc.vocab.strings[3197928453018144401]) # 'coffee' print(doc.vocab.strings[3197928453018144401]) # 'coffee'
``` ```
@ -331,7 +331,7 @@ ever change. Its hash value will also always be the same.
import spacy import spacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"I love coffee") doc = nlp("I love coffee")
for word in doc: for word in doc:
lexeme = doc.vocab[word.text] lexeme = doc.vocab[word.text]
print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_, print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
@ -372,14 +372,14 @@ from spacy.tokens import Doc
from spacy.vocab import Vocab from spacy.vocab import Vocab
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"I love coffee") # Original Doc doc = nlp("I love coffee") # Original Doc
print(doc.vocab.strings[u"coffee"]) # 3197928453018144401 print(doc.vocab.strings["coffee"]) # 3197928453018144401
print(doc.vocab.strings[3197928453018144401]) # 'coffee' 👍 print(doc.vocab.strings[3197928453018144401]) # 'coffee' 👍
empty_doc = Doc(Vocab()) # New Doc with empty Vocab empty_doc = Doc(Vocab()) # New Doc with empty Vocab
# empty_doc.vocab.strings[3197928453018144401] will raise an error :( # empty_doc.vocab.strings[3197928453018144401] will raise an error :(
empty_doc.vocab.strings.add(u"coffee") # Add "coffee" and generate hash empty_doc.vocab.strings.add("coffee") # Add "coffee" and generate hash
print(empty_doc.vocab.strings[3197928453018144401]) # 'coffee' 👍 print(empty_doc.vocab.strings[3197928453018144401]) # 'coffee' 👍
new_doc = Doc(doc.vocab) # Create new doc with first doc's vocab new_doc = Doc(doc.vocab) # Create new doc with first doc's vocab
@ -396,20 +396,24 @@ it.
## Knowledge Base {#kb} ## Knowledge Base {#kb}
To support the entity linking task, spaCy stores external knowledge in a To support the entity linking task, spaCy stores external knowledge in a
[`KnowledgeBase`](/api/kb). The knowledge base (KB) uses the `Vocab` to store its [`KnowledgeBase`](/api/kb). The knowledge base (KB) uses the `Vocab` to store
data efficiently. its data efficiently.
> - **Mention**: A textual occurrence of a named entity, e.g. 'Miss Lovelace'. > - **Mention**: A textual occurrence of a named entity, e.g. 'Miss Lovelace'.
> - **KB ID**: A unique identifier referring to a particular real-world concept, e.g. 'Q7259'. > - **KB ID**: A unique identifier referring to a particular real-world concept,
> - **Alias**: A plausible synonym or description for a certain KB ID, e.g. 'Ada Lovelace'. > e.g. 'Q7259'.
> - **Prior probability**: The probability of a certain mention resolving to a certain KB ID, > - **Alias**: A plausible synonym or description for a certain KB ID, e.g. 'Ada
prior to knowing anything about the context in which the mention is used. > Lovelace'.
> - **Entity vector**: A pretrained word vector capturing the entity description. > - **Prior probability**: The probability of a certain mention resolving to a
> certain KB ID, prior to knowing anything about the context in which the
A knowledge base is created by first adding all entities to it. Next, for each > mention is used.
potential mention or alias, a list of relevant KB IDs and their prior probabilities > - **Entity vector**: A pretrained word vector capturing the entity
is added. The sum of these prior probabilities should never exceed 1 for any given alias. > description.
A knowledge base is created by first adding all entities to it. Next, for each
potential mention or alias, a list of relevant KB IDs and their prior
probabilities is added. The sum of these prior probabilities should never exceed
1 for any given alias.
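
To make this concrete, here's a minimal sketch of that process, assuming the
v2.x `KnowledgeBase` API (`add_entity`, `add_alias`) and using illustrative
identifiers, frequencies and tiny 3-dimensional entity vectors:

```python
from spacy.lang.en import English
from spacy.kb import KnowledgeBase

nlp = English()
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)

# 1. Add the entities, each with a frequency and an entity vector
kb.add_entity(entity="Q1004791", freq=6, entity_vector=[0, 3, 5])
kb.add_entity(entity="Q42", freq=342, entity_vector=[1, 9, -3])
kb.add_entity(entity="Q5301561", freq=12, entity_vector=[-2, 4, 2])

# 2. Add an alias with prior probabilities (their sum must not exceed 1)
kb.add_alias(alias="Douglas", entities=["Q1004791", "Q42", "Q5301561"],
             probabilities=[0.6, 0.1, 0.2])

print("Entities in KB:", kb.get_size_entities())  # 3
print("Aliases in KB:", kb.get_size_aliases())  # 1
```
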
```python ```python
### {executable="true"} ### {executable="true"}
@ -436,10 +440,10 @@ print("Number of aliases in KB:", kb.get_size_aliases()) # 2
### Candidate generation ### Candidate generation
Given a textual entity, the Knowledge Base can provide a list of plausible candidates or Given a textual entity, the Knowledge Base can provide a list of plausible
entity identifiers. The [`EntityLinker`](/api/entitylinker) will take this list of candidates candidates or entity identifiers. The [`EntityLinker`](/api/entitylinker) will
as input, and disambiguate the mention to the most probable identifier, given the take this list of candidates as input, and disambiguate the mention to the most
document context. probable identifier, given the document context.
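
As a minimal, self-contained sketch (same assumption about the v2.x
`KnowledgeBase` API, with illustrative IDs and probabilities), candidate
generation is a simple lookup on the alias:

```python
from spacy.lang.en import English
from spacy.kb import KnowledgeBase

nlp = English()
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=342, entity_vector=[1, 9, -3])
kb.add_entity(entity="Q5301561", freq=12, entity_vector=[-2, 4, 2])
kb.add_alias(alias="Douglas", entities=["Q42", "Q5301561"], probabilities=[0.5, 0.3])

# Each candidate pairs a KB identifier with the prior probability of
# "Douglas" referring to it, plus the entity's frequency
for candidate in kb.get_candidates("Douglas"):
    print(candidate.entity_, candidate.prior_prob, candidate.entity_freq)
```
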
```python ```python
### {executable="true"} ### {executable="true"}
@ -520,11 +524,11 @@ python -m spacy download de_core_news_sm
import spacy import spacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Hello, world. Here are two sentences.") doc = nlp("Hello, world. Here are two sentences.")
print([t.text for t in doc]) print([t.text for t in doc])
nlp_de = spacy.load("de_core_news_sm") nlp_de = spacy.load("de_core_news_sm")
doc_de = nlp_de(u"Ich bin ein Berliner.") doc_de = nlp_de("Ich bin ein Berliner.")
print([t.text for t in doc_de]) print([t.text for t in doc_de])
``` ```
@ -543,8 +547,8 @@ print([t.text for t in doc_de])
import spacy import spacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Peach emoji is where it has always been. Peach is the superior " doc = nlp("Peach emoji is where it has always been. Peach is the superior "
u"emoji. It's outranking eggplant 🍑 ") "emoji. It's outranking eggplant 🍑 ")
print(doc[0].text) # 'Peach' print(doc[0].text) # 'Peach'
print(doc[1].text) # 'emoji' print(doc[1].text) # 'emoji'
print(doc[-1].text) # '🍑' print(doc[-1].text) # '🍑'
@ -572,7 +576,7 @@ print(sentences[1].text) # 'Peach is the superior emoji.'
import spacy import spacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion") doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
apple = doc[0] apple = doc[0]
print("Fine-grained POS tag", apple.pos_, apple.pos) print("Fine-grained POS tag", apple.pos_, apple.pos)
print("Coarse-grained POS tag", apple.tag_, apple.tag) print("Coarse-grained POS tag", apple.tag_, apple.tag)
@ -600,20 +604,20 @@ print("Like an email address?", billion.like_email)
import spacy import spacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"I love coffee") doc = nlp("I love coffee")
coffee_hash = nlp.vocab.strings[u"coffee"] # 3197928453018144401 coffee_hash = nlp.vocab.strings["coffee"] # 3197928453018144401
coffee_text = nlp.vocab.strings[coffee_hash] # 'coffee' coffee_text = nlp.vocab.strings[coffee_hash] # 'coffee'
print(coffee_hash, coffee_text) print(coffee_hash, coffee_text)
print(doc[2].orth, coffee_hash) # 3197928453018144401 print(doc[2].orth, coffee_hash) # 3197928453018144401
print(doc[2].text, coffee_text) # 'coffee' print(doc[2].text, coffee_text) # 'coffee'
beer_hash = doc.vocab.strings.add(u"beer") # 3073001599257881079 beer_hash = doc.vocab.strings.add("beer") # 3073001599257881079
beer_text = doc.vocab.strings[beer_hash] # 'beer' beer_text = doc.vocab.strings[beer_hash] # 'beer'
print(beer_hash, beer_text) print(beer_hash, beer_text)
unicorn_hash = doc.vocab.strings.add(u"🦄 ") # 18234233413267120783 unicorn_hash = doc.vocab.strings.add("🦄") # 18234233413267120783
unicorn_text = doc.vocab.strings[unicorn_hash] # '🦄 ' unicorn_text = doc.vocab.strings[unicorn_hash] # '🦄'
print(unicorn_hash, unicorn_text) print(unicorn_hash, unicorn_text)
``` ```
@ -629,19 +633,17 @@ print(unicorn_hash, unicorn_text)
```python ```python
### {executable="true"} ### {executable="true"}
import spacy import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"San Francisco considers banning sidewalk delivery robots")
for ent in doc.ents:
print(ent.text, ent.start_char, ent.end_char, ent.label_)
from spacy.tokens import Span from spacy.tokens import Span
doc = nlp(u"FB is hiring a new VP of global policy") nlp = spacy.load("en_core_web_sm")
doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u"ORG"])] doc = nlp("San Francisco considers banning sidewalk delivery robots")
for ent in doc.ents: for ent in doc.ents:
print(ent.text, ent.start_char, ent.end_char, ent.label_) print(ent.text, ent.start_char, ent.end_char, ent.label_)
doc = nlp("FB is hiring a new VP of global policy")
doc.ents = [Span(doc, 0, 1, label="ORG")]
for ent in doc.ents:
print(ent.text, ent.start_char, ent.end_char, ent.label_)
``` ```
<Infobox> <Infobox>
@ -657,7 +659,7 @@ import spacy
import random import random
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
train_data = [(u"Uber blew through $1 million", {"entities": [(0, 4, "ORG")]})] train_data = [("Uber blew through $1 million", {"entities": [(0, 4, "ORG")]})]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"] other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes): with nlp.disable_pipes(*other_pipes):
@ -685,11 +687,11 @@ nlp.to_disk("/model")
```python ```python
from spacy import displacy from spacy import displacy
doc_dep = nlp(u"This is a sentence.") doc_dep = nlp("This is a sentence.")
displacy.serve(doc_dep, style="dep") displacy.serve(doc_dep, style="dep")
doc_ent = nlp(u"When Sebastian Thrun started working on self-driving cars at Google " doc_ent = nlp("When Sebastian Thrun started working on self-driving cars at Google "
u"in 2007, few people outside of the company took him seriously.") "in 2007, few people outside of the company took him seriously.")
displacy.serve(doc_ent, style="ent") displacy.serve(doc_ent, style="ent")
``` ```
@ -707,7 +709,7 @@ displacy.serve(doc_ent, style="ent")
import spacy import spacy
nlp = spacy.load("en_core_web_md") nlp = spacy.load("en_core_web_md")
doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.") doc = nlp("Apple and banana are similar. Pasta and hippo aren't.")
apple = doc[0] apple = doc[0]
banana = doc[2] banana = doc[2]
@ -769,7 +771,7 @@ pattern2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "
matcher.add("GoogleIO", None, pattern1) # Match "Google I/O" or "Google i/o" matcher.add("GoogleIO", None, pattern1) # Match "Google I/O" or "Google i/o"
matcher.add("HAPPY", set_sentiment, *pattern2) # Match one or more happy emoji matcher.add("HAPPY", set_sentiment, *pattern2) # Match one or more happy emoji
doc = nlp(u"A text about Google I/O 😀😀") doc = nlp("A text about Google I/O 😀😀")
matches = matcher(doc) matches = matcher(doc)
for match_id, start, end in matches: for match_id, start, end in matches:
@ -789,7 +791,7 @@ print("Sentiment", doc.sentiment)
### Minibatched stream processing {#lightning-tour-minibatched} ### Minibatched stream processing {#lightning-tour-minibatched}
```python ```python
texts = [u"One document.", u"...", u"Lots of documents"] texts = ["One document.", "...", "Lots of documents"]
# .pipe streams input, and produces streaming output # .pipe streams input, and produces streaming output
iter_texts = (texts[i % 3] for i in range(100000000)) iter_texts = (texts[i % 3] for i in range(100000000))
for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50)): for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50)):
@ -805,8 +807,8 @@ for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50)):
import spacy import spacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"When Sebastian Thrun started working on self-driving cars at Google " doc = nlp("When Sebastian Thrun started working on self-driving cars at Google "
u"in 2007, few people outside of the company took him seriously.") "in 2007, few people outside of the company took him seriously.")
dep_labels = [] dep_labels = []
for token in doc: for token in doc:
@ -831,7 +833,7 @@ import spacy
from spacy.attrs import ORTH, LIKE_URL from spacy.attrs import ORTH, LIKE_URL
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Check out https://spacy.io") doc = nlp("Check out https://spacy.io")
for token in doc: for token in doc:
print(token.text, token.orth, token.like_url) print(token.text, token.orth, token.like_url)
@ -877,7 +879,7 @@ def put_spans_around_tokens(doc):
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"This is a test.\\n\\nHello world.") doc = nlp("This is a test.\\n\\nHello world.")
html = put_spans_around_tokens(doc) html = put_spans_around_tokens(doc)
print(html) print(html)
``` ```


@ -298,10 +298,10 @@ imports. It also makes it easier to structure and load your training data.
```python ```python
### Simple training loop ### Simple training loop
TRAIN_DATA = [ TRAIN_DATA = [
(u"Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}), ("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}),
(u"Google rebrands its business apps", {"entities": [(0, 6, "ORG")]})] ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]})]
nlp = spacy.blank('en') nlp = spacy.blank("en")
optimizer = nlp.begin_training() optimizer = nlp.begin_training()
for i in range(20): for i in range(20):
random.shuffle(TRAIN_DATA) random.shuffle(TRAIN_DATA)
@ -498,7 +498,7 @@ like this:
![Custom dependencies](../images/displacy-custom-parser.svg) ![Custom dependencies](../images/displacy-custom-parser.svg)
```python ```python
doc = nlp(u"find a hotel with good wifi") doc = nlp("find a hotel with good wifi")
print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != '-']) print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != '-'])
# [('find', 'ROOT', 'find'), ('hotel', 'PLACE', 'find'), # [('find', 'ROOT', 'find'), ('hotel', 'PLACE', 'find'),
# ('good', 'QUALITY', 'wifi'), ('wifi', 'ATTRIBUTE', 'hotel')] # ('good', 'QUALITY', 'wifi'), ('wifi', 'ATTRIBUTE', 'hotel')]


@ -99,8 +99,8 @@ flexibility.
> >
> ```python > ```python
> matcher = PhraseMatcher(nlp.vocab, attr="POS") > matcher = PhraseMatcher(nlp.vocab, attr="POS")
> matcher.add("PATTERN", None, nlp(u"I love cats")) > matcher.add("PATTERN", None, nlp("I love cats"))
> doc = nlp(u"You like dogs") > doc = nlp("You like dogs")
> matches = matcher(doc) > matches = matcher(doc)
> ``` > ```
@ -122,9 +122,9 @@ or `POS` for finding sequences of the same part-of-speech tags.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like David Bowie") > doc = nlp("I like David Bowie")
> with doc.retokenize() as retokenizer: > with doc.retokenize() as retokenizer:
> attrs = {"LEMMA": u"David Bowie"} > attrs = {"LEMMA": "David Bowie"}
> retokenizer.merge(doc[2:4], attrs=attrs) > retokenizer.merge(doc[2:4], attrs=attrs)
> ``` > ```


@ -156,7 +156,7 @@ spaCy or plug in your own machine learning models.
> for itn in range(100): > for itn in range(100):
> for doc, gold in train_data: > for doc, gold in train_data:
> nlp.update([doc], [gold]) > nlp.update([doc], [gold])
> doc = nlp(u"This is a text.") > doc = nlp("This is a text.")
> print(doc.cats) > print(doc.cats)
> ``` > ```
@ -179,13 +179,13 @@ network to assign position-sensitive vectors to each word in the document.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I love coffee") > doc = nlp("I love coffee")
> assert doc.vocab.strings[u"coffee"] == 3197928453018144401 > assert doc.vocab.strings["coffee"] == 3197928453018144401
> assert doc.vocab.strings[3197928453018144401] == u"coffee" > assert doc.vocab.strings[3197928453018144401] == "coffee"
> >
> beer_hash = doc.vocab.strings.add(u"beer") > beer_hash = doc.vocab.strings.add("beer")
> assert doc.vocab.strings[u"beer"] == beer_hash > assert doc.vocab.strings["beer"] == beer_hash
> assert doc.vocab.strings[beer_hash] == u"beer" > assert doc.vocab.strings[beer_hash] == "beer"
> ``` > ```
The [`StringStore`](/api/stringstore) now resolves all strings to hash values The [`StringStore`](/api/stringstore) now resolves all strings to hash values
@ -275,7 +275,7 @@ language, you can import the class directly, e.g.
> >
> ```python > ```python
> from spacy import displacy > from spacy import displacy
> doc = nlp(u"This is a sentence about Facebook.") > doc = nlp("This is a sentence about Facebook.")
> displacy.serve(doc, style="dep") # run the web server > displacy.serve(doc, style="dep") # run the web server
> html = displacy.render(doc, style="ent") # generate HTML > html = displacy.render(doc, style="ent") # generate HTML
> ``` > ```
@ -322,7 +322,7 @@ lookup-based lemmatization and **many new languages**!
> matcher.add('HEARTS', None, [{"ORTH": "❤️", "OP": '+'}]) > matcher.add('HEARTS', None, [{"ORTH": "❤️", "OP": '+'}])
> >
> phrasematcher = PhraseMatcher(nlp.vocab) > phrasematcher = PhraseMatcher(nlp.vocab)
> phrasematcher.add("OBAMA", None, nlp(u"Barack Obama")) > phrasematcher.add("OBAMA", None, nlp("Barack Obama"))
> ``` > ```
Patterns can now be added to the matcher by calling Patterns can now be added to the matcher by calling
@ -477,12 +477,12 @@ to the `disable` keyword argument on load, or by using
[`disable_pipes`](/api/language#disable_pipes) as a method or context manager: [`disable_pipes`](/api/language#disable_pipes) as a method or context manager:
```diff ```diff
- nlp = spacy.load("en", tagger=False, entity=False) - nlp = spacy.load("en_core_web_sm", tagger=False, entity=False)
- doc = nlp(u"I don't want parsed", parse=False) - doc = nlp("I don't want parsed", parse=False)
+ nlp = spacy.load("en", disable=["tagger", "ner"]) + nlp = spacy.load("en_core_web_sm", disable=["tagger", "ner"])
+ with nlp.disable_pipes("parser"): + with nlp.disable_pipes("parser"):
+ doc = nlp(u"I don't want parsed") + doc = nlp("I don't want parsed")
``` ```
To add spaCy's built-in pipeline components to your pipeline, you can still To add spaCy's built-in pipeline components to your pipeline, you can still
@ -539,7 +539,7 @@ This means that your application can and should only pass around `Doc`
objects and refer to them as the single source of truth. objects and refer to them as the single source of truth.
```diff ```diff
- doc = nlp(u"This is a regular doc") - doc = nlp("This is a regular doc")
- doc_array = doc.to_array(["ORTH", "POS"]) - doc_array = doc.to_array(["ORTH", "POS"])
- doc_with_meta = {"doc_array": doc_array, "meta": get_doc_meta(doc_array)} - doc_with_meta = {"doc_array": doc_array, "meta": get_doc_meta(doc_array)}
@ -556,11 +556,11 @@ utilities that interact with the pipeline, consider moving this logic into its
own extension module. own extension module.
```diff ```diff
- doc = nlp(u"Doc with a standard pipeline") - doc = nlp("Doc with a standard pipeline")
- meta = get_meta(doc) - meta = get_meta(doc)
+ nlp.add_pipe(meta_component) + nlp.add_pipe(meta_component)
+ doc = nlp(u"Doc with a custom pipeline that assigns meta") + doc = nlp("Doc with a custom pipeline that assigns meta")
+ meta = doc._.meta + meta = doc._.meta
``` ```
@ -572,12 +572,12 @@ to call [`StringStore.add`](/api/stringstore#add) explicitly. You can also now
be sure that the string-to-hash mapping will always match across vocabularies. be sure that the string-to-hash mapping will always match across vocabularies.
```diff ```diff
- nlp.vocab.strings[u"coffee"] # 3672 - nlp.vocab.strings["coffee"] # 3672
- other_nlp.vocab.strings[u"coffee"] # 40259 - other_nlp.vocab.strings["coffee"] # 40259
+ nlp.vocab.strings.add(u"coffee") + nlp.vocab.strings.add("coffee")
+ nlp.vocab.strings[u"coffee"] # 3197928453018144401 + nlp.vocab.strings["coffee"] # 3197928453018144401
+ other_nlp.vocab.strings[u"coffee"] # 3197928453018144401 + other_nlp.vocab.strings["coffee"] # 3197928453018144401
``` ```
### Adding patterns and callbacks to the matcher {#migrating-matcher} ### Adding patterns and callbacks to the matcher {#migrating-matcher}


@ -74,8 +74,8 @@ path to [`spacy.load()`](/api/top-level#spacy.load).
```python ```python
nlp_latin = spacy.load("/tmp/la_vectors_wiki_lg") nlp_latin = spacy.load("/tmp/la_vectors_wiki_lg")
doc1 = nlp_latin(u"Caecilius est in horto") doc1 = nlp_latin("Caecilius est in horto")
doc2 = nlp_latin(u"servus est in atrio") doc2 = nlp_latin("servus est in atrio")
doc1.similarity(doc2) doc1.similarity(doc2)
``` ```
@ -168,10 +168,9 @@ vectors to the vocabulary, you can use the
### Adding vectors ### Adding vectors
from spacy.vocab import Vocab from spacy.vocab import Vocab
vector_data = {u"dog": numpy.random.uniform(-1, 1, (300,)), vector_data = {"dog": numpy.random.uniform(-1, 1, (300,)),
u"cat": numpy.random.uniform(-1, 1, (300,)), "cat": numpy.random.uniform(-1, 1, (300,)),
u"orange": numpy.random.uniform(-1, 1, (300,))} "orange": numpy.random.uniform(-1, 1, (300,))}
vocab = Vocab() vocab = Vocab()
for word, vector in vector_data.items(): for word, vector in vector_data.items():
vocab.set_vector(word, vector) vocab.set_vector(word, vector)
@ -241,7 +240,7 @@ import cupy.cuda
from spacy.vectors import Vectors from spacy.vectors import Vectors
vector_table = numpy.zeros((3, 300), dtype="f") vector_table = numpy.zeros((3, 300), dtype="f")
vectors = Vectors([u"dog", u"cat", u"orange"], vector_table) vectors = Vectors(["dog", "cat", "orange"], vector_table)
with cupy.cuda.Device(0): with cupy.cuda.Device(0):
vectors.data = cupy.asarray(vectors.data) vectors.data = cupy.asarray(vectors.data)
``` ```
@ -252,6 +251,6 @@ import torch
from spacy.vectors import Vectors from spacy.vectors import Vectors
vector_table = numpy.zeros((3, 300), dtype="f") vector_table = numpy.zeros((3, 300), dtype="f")
vectors = Vectors([u"dog", u"cat", u"orange"], vector_table) vectors = Vectors(["dog", "cat", "orange"], vector_table)
vectors.data = torch.Tensor(vectors.data).cuda(0) vectors.data = torch.Tensor(vectors.data).cuda(0)
``` ```


@ -48,7 +48,7 @@ import spacy
from spacy import displacy from spacy import displacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"This is a sentence.") doc = nlp("This is a sentence.")
displacy.serve(doc, style="dep") displacy.serve(doc, style="dep")
``` ```
@ -101,7 +101,7 @@ import spacy
from spacy import displacy from spacy import displacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
text = u"""In ancient Rome, some neighbors live in three adjacent houses. In the center is the house of Senex, who lives there with wife Domina, son Hero, and several slaves, including head slave Hysterium and the musical's main character Pseudolus. A slave belonging to Hero, Pseudolus wishes to buy, win, or steal his freedom. One of the neighboring houses is owned by Marcus Lycus, who is a buyer and seller of beautiful women; the other belongs to the ancient Erronius, who is abroad searching for his long-lost children (stolen in infancy by pirates). One day, Senex and Domina go on a trip and leave Pseudolus in charge of Hero. Hero confides in Pseudolus that he is in love with the lovely Philia, one of the courtesans in the House of Lycus (albeit still a virgin).""" text = """In ancient Rome, some neighbors live in three adjacent houses. In the center is the house of Senex, who lives there with wife Domina, son Hero, and several slaves, including head slave Hysterium and the musical's main character Pseudolus. A slave belonging to Hero, Pseudolus wishes to buy, win, or steal his freedom. One of the neighboring houses is owned by Marcus Lycus, who is a buyer and seller of beautiful women; the other belongs to the ancient Erronius, who is abroad searching for his long-lost children (stolen in infancy by pirates). One day, Senex and Domina go on a trip and leave Pseudolus in charge of Hero. Hero confides in Pseudolus that he is in love with the lovely Philia, one of the courtesans in the House of Lycus (albeit still a virgin)."""
doc = nlp(text) doc = nlp(text)
sentence_spans = list(doc.sents) sentence_spans = list(doc.sents)
displacy.serve(sentence_spans, style="dep") displacy.serve(sentence_spans, style="dep")
@ -117,7 +117,7 @@ text.
import spacy import spacy
from spacy import displacy from spacy import displacy
text = u"When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously." text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(text) doc = nlp(text)
@ -168,7 +168,7 @@ add a headline to each visualization, you can add a `title` to its `user_data`.
User data is never touched or modified by spaCy. User data is never touched or modified by spaCy.
```python ```python
doc = nlp(u"This is a sentence about Google.") doc = nlp("This is a sentence about Google.")
doc.user_data["title"] = "This is a title" doc.user_data["title"] = "This is a title"
displacy.serve(doc, style="ent") displacy.serve(doc, style="ent")
``` ```
@ -193,7 +193,7 @@ import spacy
from spacy import displacy from spacy import displacy
# In[2]: # In[2]:
doc = nlp(u"Rats are various medium-sized, long-tailed rodents.") doc = nlp("Rats are various medium-sized, long-tailed rodents.")
displacy.render(doc, style="dep") displacy.render(doc, style="dep")
# In[3]: # In[3]:
@ -209,7 +209,6 @@ rendering if auto-detection fails.
</Infobox> </Infobox>
![displaCy visualizer in a Jupyter notebook](../images/displacy_jupyter.jpg) ![displaCy visualizer in a Jupyter notebook](../images/displacy_jupyter.jpg)
Internally, displaCy imports `display` and `HTML` from `IPython.core.display` Internally, displaCy imports `display` and `HTML` from `IPython.core.display`
@ -236,8 +235,8 @@ import spacy
from spacy import displacy from spacy import displacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc1 = nlp(u"This is a sentence.") doc1 = nlp("This is a sentence.")
doc2 = nlp(u"This is another sentence.") doc2 = nlp("This is another sentence.")
html = displacy.render([doc1, doc2], style="dep", page=True) html = displacy.render([doc1, doc2], style="dep", page=True)
``` ```
@ -281,7 +280,7 @@ from spacy import displacy
from pathlib import Path from pathlib import Path
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
sentences = [u"This is an example.", u"This is another one."] sentences = ["This is an example.", "This is another one."]
for sent in sentences: for sent in sentences:
doc = nlp(sent) doc = nlp(sent)
svg = displacy.render(doc, style="dep", jupyter=False) svg = displacy.render(doc, style="dep", jupyter=False)
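    # Illustrative continuation (assumed, not shown in this hunk): write each
    # rendered SVG to its own file. The naming scheme here is just an example.
    file_name = "-".join([w.text for w in doc if not w.is_punct]) + ".svg"
    output_path = Path(file_name)
    output_path.open("w", encoding="utf-8").write(svg)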


@ -119,14 +119,14 @@
"emoji = Emoji(nlp)", "emoji = Emoji(nlp)",
"nlp.add_pipe(emoji, first=True)", "nlp.add_pipe(emoji, first=True)",
"", "",
"doc = nlp(u'This is a test 😻 👍🏿')", "doc = nlp('This is a test 😻 👍🏿')",
"assert doc._.has_emoji == True", "assert doc._.has_emoji == True",
"assert doc[2:5]._.has_emoji == True", "assert doc[2:5]._.has_emoji == True",
"assert doc[0]._.is_emoji == False", "assert doc[0]._.is_emoji == False",
"assert doc[4]._.is_emoji == True", "assert doc[4]._.is_emoji == True",
"assert doc[5]._.emoji_desc == u'thumbs up dark skin tone'", "assert doc[5]._.emoji_desc == 'thumbs up dark skin tone'",
"assert len(doc._.emoji) == 2", "assert len(doc._.emoji) == 2",
"assert doc._.emoji[1] == (u'👍🏿', 5, u'thumbs up dark skin tone')" "assert doc._.emoji[1] == ('👍🏿', 5, 'thumbs up dark skin tone')"
], ],
"author": "Ines Montani", "author": "Ines Montani",
"author_links": { "author_links": {
@ -747,8 +747,8 @@
"s2v = Sense2VecComponent('/path/to/reddit_vectors-1.1.0')", "s2v = Sense2VecComponent('/path/to/reddit_vectors-1.1.0')",
"nlp.add_pipe(s2v)", "nlp.add_pipe(s2v)",
"", "",
"doc = nlp(u\"A sentence about natural language processing.\")", "doc = nlp(\"A sentence about natural language processing.\")",
"assert doc[3].text == u'natural language processing'", "assert doc[3].text == 'natural language processing'",
"freq = doc[3]._.s2v_freq", "freq = doc[3]._.s2v_freq",
"vector = doc[3]._.s2v_vec", "vector = doc[3]._.s2v_vec",
"most_similar = doc[3]._.s2v_most_similar(3)", "most_similar = doc[3]._.s2v_most_similar(3)",
@ -1297,7 +1297,7 @@
"", "",
"nlp = spacy.load('en')", "nlp = spacy.load('en')",
"nlp.add_pipe(BeneparComponent('benepar_en'))", "nlp.add_pipe(BeneparComponent('benepar_en'))",
"doc = nlp(u'The time for action is now. It's never too late to do something.')", "doc = nlp('The time for action is now. It's never too late to do something.')",
"sent = list(doc.sents)[0]", "sent = list(doc.sents)[0]",
"print(sent._.parse_string)", "print(sent._.parse_string)",
"# (S (NP (NP (DT The) (NN time)) (PP (IN for) (NP (NN action)))) (VP (VBZ is) (ADVP (RB now))) (. .))", "# (S (NP (NP (DT The) (NN time)) (PP (IN for) (NP (NN action)))) (VP (VBZ is) (ADVP (RB now))) (. .))",


@ -65,7 +65,7 @@ const QuickstartInstall = ({ id, title, description, defaultLang, children }) =>
nlp = {pkg}.load() nlp = {pkg}.load()
</QS> </QS>
<QS lang={code} config="example" prompt="python"> <QS lang={code} config="example" prompt="python">
doc = nlp(u"{exampleText}") doc = nlp("{exampleText}")
</QS> </QS>
<QS lang={code} config="example" prompt="python"> <QS lang={code} config="example" prompt="python">
print([ print([