Remove u-strings and fix formatting [ci skip]

Ines Montani 2019-09-12 16:11:15 +02:00
parent 7e3ac2cd41
commit 82c16b7943
44 changed files with 644 additions and 658 deletions

View File

@ -309,7 +309,7 @@ indented block as plain text and preserve whitespace.
### Using spaCy
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"This is a sentence.")
doc = nlp("This is a sentence.")
for token in doc:
print(token.text, token.pos_)
```
@ -335,9 +335,9 @@ from spacy.matcher import Matcher
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
pattern = [{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}]
matcher.add('HelloWorld', None, pattern)
doc = nlp(u'Hello, world! Hello world!')
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", None, pattern)
doc = nlp("Hello, world! Hello world!")
matches = matcher(doc)
```
@ -360,7 +360,7 @@ interactive widget defaults to a regular code block.
### {executable="true"}
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"This is a sentence.")
doc = nlp("This is a sentence.")
for token in doc:
print(token.text, token.pos_)
```
@ -457,7 +457,8 @@ sit amet dignissim justo congue.
## Setup and installation {#setup}
Before running the setup, make sure your versions of
[Node](https://nodejs.org/en/) and [npm](https://www.npmjs.com/) are up to date. Node v10.15 or later is required.
[Node](https://nodejs.org/en/) and [npm](https://www.npmjs.com/) are up to date.
Node v10.15 or later is required.
```bash
# Clone the repository
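# A rough sketch of the remaining steps (assumed commands; the exact
# repository URL and npm scripts may differ from the actual instructions):
git clone https://github.com/explosion/spaCy
cd spaCy/website

# Install the dependencies and start the local development server
npm install
npm run dev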

View File

@ -16,7 +16,7 @@ menu:
> ```python
> from spacy.lang.en import English
> nlp = English()
> tokens = nlp(u"Some\\nspaces and\\ttab characters")
> tokens = nlp("Some\\nspaces and\\ttab characters")
> tokens_text = [t.text for t in tokens]
> assert tokens_text == ["Some", "\\n", "spaces", " ", "and", "\\t", "tab", "characters"]
> ```
@ -187,7 +187,7 @@ annotation scheme. We also map the tags to the simpler Google Universal POS tag
set.
| Tag |  POS | Morphology | Description |
| --------- | ------- | ------------------------------------------- | ------------------------------------------------- |
| --------- | ------- | ---------------------------------------- | ------------------------------------------------- |
| `$(` | `PUNCT` | `PunctType=brck` | other sentence-internal punctuation mark |
| `$,` | `PUNCT` | `PunctType=comm` | comma |
| `$.` | `PUNCT` | `PunctType=peri` | sentence-final punctuation mark |
@ -380,7 +380,7 @@ The German dependency labels use the
annotation scheme.
| Label | Description |
| ------ | ------------------------------- |
| ------- | ------------------------------- |
| `ac` | adpositional case marker |
| `adc` | adjective component |
| `ag` | genitive attribute |

View File

@ -45,9 +45,9 @@ Append a token to the `Doc`. The token can be provided as a
> from spacy.vocab cimport Vocab
>
> doc = Doc(Vocab())
> lexeme = doc.vocab.get(u'hello')
> lexeme = doc.vocab.get("hello")
> doc.push_back(lexeme, True)
> assert doc.text == u'hello '
> assert doc.text == "hello "
> ```
| Name | Type | Description |
@ -164,7 +164,7 @@ vocabulary.
> #### Example
>
> ```python
> lexeme = vocab.get(vocab.mem, u'hello')
> lexeme = vocab.get(vocab.mem, "hello")
> ```
| Name | Type | Description |

View File

@ -88,7 +88,7 @@ Find a token in a `TokenC*` array by the offset of its first character.
> from spacy.tokens.doc cimport Doc, token_by_start
> from spacy.vocab cimport Vocab
>
> doc = Doc(Vocab(), words=[u'hello', u'world'])
> doc = Doc(Vocab(), words=["hello", "world"])
> assert token_by_start(doc.c, doc.length, 6) == 1
> assert token_by_start(doc.c, doc.length, 4) == -1
> ```
@ -110,7 +110,7 @@ Find a token in a `TokenC*` array by the offset of its final character.
> from spacy.tokens.doc cimport Doc, token_by_end
> from spacy.vocab cimport Vocab
>
> doc = Doc(Vocab(), words=[u'hello', u'world'])
> doc = Doc(Vocab(), words=["hello", "world"])
> assert token_by_end(doc.c, doc.length, 5) == 0
> assert token_by_end(doc.c, doc.length, 1) == -1
> ```
@ -134,7 +134,7 @@ attribute, in order to make the parse tree navigation consistent.
> from spacy.tokens.doc cimport Doc, set_children_from_heads
> from spacy.vocab cimport Vocab
>
> doc = Doc(Vocab(), words=[u'Baileys', u'from', u'a', u'shoe'])
> doc = Doc(Vocab(), words=["Baileys", "from", "a", "shoe"])
> doc.c[0].head = 0
> doc.c[1].head = 0
> doc.c[2].head = 3

View File

@ -58,7 +58,7 @@ and all pipeline components are applied to the `Doc` in order. Both
>
> ```python
> parser = DependencyParser(nlp.vocab)
> doc = nlp(u"This is a sentence.")
> doc = nlp("This is a sentence.")
> # This usually happens under the hood
> processed = parser(doc)
> ```

View File

@ -20,11 +20,11 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
>
> ```python
> # Construction 1
> doc = nlp(u"Some text")
> doc = nlp("Some text")
>
> # Construction 2
> from spacy.tokens import Doc
> words = [u"hello", u"world", u"!"]
> words = ["hello", "world", "!"]
> spaces = [True, False, False]
> doc = Doc(nlp.vocab, words=words, spaces=spaces)
> ```
@ -45,7 +45,7 @@ Negative indexing is supported, and follows the usual Python semantics, i.e.
> #### Example
>
> ```python
> doc = nlp(u"Give it back! He pleaded.")
> doc = nlp("Give it back! He pleaded.")
> assert doc[0].text == "Give"
> assert doc[-1].text == "."
> span = doc[1:3]
@ -76,8 +76,8 @@ Iterate over `Token` objects, from which the annotations can be easily accessed.
> #### Example
>
> ```python
> doc = nlp(u'Give it back')
> assert [t.text for t in doc] == [u'Give', u'it', u'back']
> doc = nlp("Give it back")
> assert [t.text for t in doc] == ["Give", "it", "back"]
> ```
This is the main way of accessing [`Token`](/api/token) objects, which are the
@ -96,7 +96,7 @@ Get the number of tokens in the document.
> #### Example
>
> ```python
> doc = nlp(u"Give it back! He pleaded.")
> doc = nlp("Give it back! He pleaded.")
> assert len(doc) == 7
> ```
@ -114,9 +114,9 @@ details, see the documentation on
>
> ```python
> from spacy.tokens import Doc
> city_getter = lambda doc: any(city in doc.text for city in ('New York', 'Paris', 'Berlin'))
> Doc.set_extension('has_city', getter=city_getter)
> doc = nlp(u'I like New York')
> city_getter = lambda doc: any(city in doc.text for city in ("New York", "Paris", "Berlin"))
> Doc.set_extension("has_city", getter=city_getter)
> doc = nlp("I like New York")
> assert doc._.has_city
> ```
@ -192,8 +192,8 @@ the character indices don't map to a valid span.
> #### Example
>
> ```python
> doc = nlp(u"I like New York")
> span = doc.char_span(7, 15, label=u"GPE")
> doc = nlp("I like New York")
> span = doc.char_span(7, 15, label="GPE")
> assert span.text == "New York"
> ```
@ -213,8 +213,8 @@ using an average of word vectors.
> #### Example
>
> ```python
> apples = nlp(u"I like apples")
> oranges = nlp(u"I like oranges")
> apples = nlp("I like apples")
> oranges = nlp("I like oranges")
> apples_oranges = apples.similarity(oranges)
> oranges_apples = oranges.similarity(apples)
> assert apples_oranges == oranges_apples
@ -235,7 +235,7 @@ attribute ID.
>
> ```python
> from spacy.attrs import ORTH
> doc = nlp(u"apple apple orange banana")
> doc = nlp("apple apple orange banana")
> assert doc.count_by(ORTH) == {7024: 1, 119552: 1, 2087: 2}
> doc.to_array([ORTH])
> # array([[11880], [11880], [7561], [12800]])
@ -255,7 +255,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
> #### Example
>
> ```python
> doc = nlp(u"This is a test")
> doc = nlp("This is a test")
> matrix = doc.get_lca_matrix()
> # array([[0, 1, 1, 1], [1, 1, 1, 1], [1, 1, 2, 3], [1, 1, 3, 3]], dtype=int32)
> ```
@ -274,7 +274,7 @@ They'll be added to an `"_"` key in the data, e.g. `"_": {"foo": "bar"}`.
> #### Example
>
> ```python
> doc = nlp(u"Hello")
> doc = nlp("Hello")
> json_doc = doc.to_json()
> ```
>
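
As a quick illustration of the `"_"` key described above, here's a minimal sketch. The extension name `foo` and the blank English pipeline are assumptions made for the example, not part of the API.

```python
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()
Doc.set_extension("foo", default="bar")  # hypothetical custom attribute
doc = nlp("Hello")

# Custom extension attributes to serialize are passed in via `underscore`
json_doc = doc.to_json(underscore=["foo"])
assert json_doc["_"] == {"foo": "bar"}
```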
@ -342,7 +342,7 @@ array of attributes.
> ```python
> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
> from spacy.tokens import Doc
> doc = nlp(u"Hello world!")
> doc = nlp("Hello world!")
> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
> doc2 = Doc(doc.vocab, words=[t.text for t in doc])
> doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)
@ -396,7 +396,7 @@ Serialize, i.e. export the document contents to a binary string.
> #### Example
>
> ```python
> doc = nlp(u"Give it back! He pleaded.")
> doc = nlp("Give it back! He pleaded.")
> doc_bytes = doc.to_bytes()
> ```
@ -413,10 +413,9 @@ Deserialize, i.e. import the document contents from a binary string.
>
> ```python
> from spacy.tokens import Doc
> text = u"Give it back! He pleaded."
> doc = nlp(text)
> bytes = doc.to_bytes()
> doc2 = Doc(doc.vocab).from_bytes(bytes)
> doc = nlp("Give it back! He pleaded.")
> doc_bytes = doc.to_bytes()
> doc2 = Doc(doc.vocab).from_bytes(doc_bytes)
> assert doc.text == doc2.text
> ```
@ -457,9 +456,9 @@ dictionary mapping attribute names to values as the `"_"` key.
> #### Example
>
> ```python
> doc = nlp(u"I like David Bowie")
> doc = nlp("I like David Bowie")
> with doc.retokenize() as retokenizer:
> attrs = {"LEMMA": u"David Bowie"}
> attrs = {"LEMMA": "David Bowie"}
> retokenizer.merge(doc[2:4], attrs=attrs)
> ```
@ -489,7 +488,7 @@ underlying lexeme (if they're context-independent lexical attributes like
> #### Example
>
> ```python
> doc = nlp(u"I live in NewYork")
> doc = nlp("I live in NewYork")
> with doc.retokenize() as retokenizer:
> heads = [(doc[3], 1), doc[2]]
> attrs = {"POS": ["PROPN", "PROPN"],
@ -521,9 +520,9 @@ and end token boundaries, the document remains unchanged.
> #### Example
>
> ```python
> doc = nlp(u"Los Angeles start.")
> doc = nlp("Los Angeles start.")
> doc.merge(0, len("Los Angeles"), "NNP", "Los Angeles", "GPE")
> assert [t.text for t in doc] == [u"Los Angeles", u"start", u"."]
> assert [t.text for t in doc] == ["Los Angeles", "start", "."]
> ```
| Name | Type | Description |
@ -541,11 +540,11 @@ objects, if the entity recognizer has been applied.
> #### Example
>
> ```python
> doc = nlp(u"Mr. Best flew to New York on Saturday morning.")
> doc = nlp("Mr. Best flew to New York on Saturday morning.")
> ents = list(doc.ents)
> assert ents[0].label == 346
> assert ents[0].label_ == u"PERSON"
> assert ents[0].text == u"Mr. Best"
> assert ents[0].label_ == "PERSON"
> assert ents[0].text == "Mr. Best"
> ```
| Name | Type | Description |
@ -563,10 +562,10 @@ relative clauses.
> #### Example
>
> ```python
> doc = nlp(u"A phrase with another phrase occurs.")
> doc = nlp("A phrase with another phrase occurs.")
> chunks = list(doc.noun_chunks)
> assert chunks[0].text == u"A phrase"
> assert chunks[1].text == u"another phrase"
> assert chunks[0].text == "A phrase"
> assert chunks[1].text == "another phrase"
> ```
| Name | Type | Description |
@ -583,10 +582,10 @@ will be unavailable.
> #### Example
>
> ```python
> doc = nlp(u"This is a sentence. Here's another...")
> doc = nlp("This is a sentence. Here's another...")
> sents = list(doc.sents)
> assert len(sents) == 2
> assert [s.root.text for s in sents] == [u"is", u"'s"]
> assert [s.root.text for s in sents] == ["is", "'s"]
> ```
| Name | Type | Description |
@ -600,7 +599,7 @@ A boolean value indicating whether a word vector is associated with the object.
> #### Example
>
> ```python
> doc = nlp(u"I like apples")
> doc = nlp("I like apples")
> assert doc.has_vector
> ```
@ -616,8 +615,8 @@ vectors.
> #### Example
>
> ```python
> doc = nlp(u"I like apples")
> assert doc.vector.dtype == 'float32'
> doc = nlp("I like apples")
> assert doc.vector.dtype == "float32"
> assert doc.vector.shape == (300,)
> ```
@ -632,8 +631,8 @@ The L2 norm of the document's vector representation.
> #### Example
>
> ```python
> doc1 = nlp(u"I like apples")
> doc2 = nlp(u"I like oranges")
> doc1 = nlp("I like apples")
> doc2 = nlp("I like oranges")
> doc1.vector_norm # 4.54232424414368
> doc2.vector_norm # 3.304373298575751
> assert doc1.vector_norm != doc2.vector_norm

View File

@ -1,6 +1,8 @@
---
title: EntityLinker
teaser: Functionality to disambiguate a named entity in text to a unique knowledge base identifier.
teaser:
Functionality to disambiguate a named entity in text to a unique knowledge
base identifier.
tag: class
source: spacy/pipeline/pipes.pyx
new: 2.2
@ -13,9 +15,9 @@ via the ID `"entity_linker"`.
## EntityLinker.Model {#model tag="classmethod"}
Initialize a model for the pipe. The model should implement the
`thinc.neural.Model` API, and should contain a field `tok2vec` that contains
the context encoder. Wrappers are under development for most major machine
learning libraries.
`thinc.neural.Model` API, and should contain a field `tok2vec` that contains the
context encoder. Wrappers are under development for most major machine learning
libraries.
| Name | Type | Description |
| ----------- | ------ | ------------------------------------- |
@ -41,7 +43,7 @@ shortcut for this and instantiate the component using its string name and
> ```
| Name | Type | Description |
| --------------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
| -------------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
| `hidden_width` | int | Width of the hidden layer of the entity linking model, defaults to 128. |
@ -54,16 +56,15 @@ shortcut for this and instantiate the component using its string name and
Apply the pipe to one document. The document is modified in place, and returned.
This usually happens under the hood when the `nlp` object is called on a text
and all pipeline components are applied to the `Doc` in order. Both
[`__call__`](/api/entitylinker#call) and
[`pipe`](/api/entitylinker#pipe) delegate to the
[`predict`](/api/entitylinker#predict) and
[`__call__`](/api/entitylinker#call) and [`pipe`](/api/entitylinker#pipe)
delegate to the [`predict`](/api/entitylinker#predict) and
[`set_annotations`](/api/entitylinker#set_annotations) methods.
> #### Example
>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> doc = nlp(u"This is a sentence.")
> doc = nlp("This is a sentence.")
> # This usually happens under the hood
> processed = entity_linker(doc)
> ```
@ -108,13 +109,14 @@ Apply the pipeline's model to a batch of docs, without modifying them.
> ```
| Name | Type | Description |
| ----------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| ----------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | iterable | The documents to predict. |
| **RETURNS** | tuple | A `(kb_ids, tensors)` tuple where `kb_ids` are the model's predicted KB identifiers for the entities in the `docs`, and `tensors` are the token representations used to predict these identifiers. |
## EntityLinker.set_annotations {#set_annotations tag="method"}
Modify a batch of documents, using pre-computed entity IDs for a list of named entities.
Modify a batch of documents, using pre-computed entity IDs for a list of named
entities.
> #### Example
>
@ -125,7 +127,7 @@ Modify a batch of documents, using pre-computed entity IDs for a list of named e
> ```
| Name | Type | Description |
| ---------- | -------- | --------------------------------------------------------------------------------------------------- |
| --------- | -------- | ------------------------------------------------------------------------------------------------- |
| `docs` | iterable | The documents to modify. |
| `kb_ids` | iterable | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. |
| `tensors` | iterable | The token representations used to predict the identifiers. |
@ -133,7 +135,8 @@ Modify a batch of documents, using pre-computed entity IDs for a list of named e
## EntityLinker.update {#update tag="method"}
Learn from a batch of documents and gold-standard information, updating both the
pipe's entity linking model and context encoder. Delegates to [`predict`](/api/entitylinker#predict) and
pipe's entity linking model and context encoder. Delegates to
[`predict`](/api/entitylinker#predict) and
[`get_loss`](/api/entitylinker#get_loss).
> #### Example
@ -146,7 +149,7 @@ pipe's entity linking model and context encoder. Delegates to [`predict`](/api/e
> ```
| Name | Type | Description |
| -------- | -------- | ------------------------------------------------------------------------------------------------------------- |
| -------- | -------- | ------------------------------------------------------------------------------------------------------- |
| `docs` | iterable | A batch of documents to learn from. |
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
| `drop` | float | The dropout rate, used both for the EL model and the context encoder. |
@ -155,8 +158,8 @@ pipe's entity linking model and context encoder. Delegates to [`predict`](/api/e
## EntityLinker.get_loss {#get_loss tag="method"}
Find the loss and gradient of loss for the entities in a batch of documents and their
predicted scores.
Find the loss and gradient of loss for the entities in a batch of documents and
their predicted scores.
> #### Example
>
@ -167,7 +170,7 @@ predicted scores.
> ```
| Name | Type | Description |
| --------------- | -------- | ------------------------------------------------------------ |
| ----------- | -------- | ------------------------------------------------------------ |
| `docs` | iterable | The batch of documents. |
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
| `kb_ids` | iterable | KB identifiers representing the model's predictions. |
@ -176,7 +179,8 @@ predicted scores.
## EntityLinker.set_kb {#set_kb tag="method"}
Define the knowledge base (KB) used for disambiguating named entities to KB identifiers.
Define the knowledge base (KB) used for disambiguating named entities to KB
identifiers.
> #### Example
>
@ -186,14 +190,15 @@ Define the knowledge base (KB) used for disambiguating named entities to KB iden
> ```
| Name | Type | Description |
| --------------- | --------------- | ------------------------------------------------------------ |
| ---- | --------------- | ------------------------------- |
| `kb` | `KnowledgeBase` | The [`KnowledgeBase`](/api/kb). |
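
As a rough sketch of how `set_kb` fits into a pipeline (the entity `Q42`, the alias, the vector length and the blank English pipeline are illustrative assumptions):

```python
import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.blank("en")
entity_linker = nlp.create_pipe("entity_linker")

# Build a toy knowledge base; 64 is an arbitrary entity vector length
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=64)
kb.add_entity(entity="Q42", freq=12, entity_vector=[0.0] * 64)
kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.9])

# The KB has to be defined before begin_training is called
entity_linker.set_kb(kb)
nlp.add_pipe(entity_linker)
```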
## EntityLinker.begin_training {#begin_training tag="method"}
Initialize the pipe for training, using data examples if available. If no model
has been initialized yet, the model is added.
Before calling this method, a knowledge base should have been defined with [`set_kb`](/api/entitylinker#set_kb).
has been initialized yet, the model is added. Before calling this method, a
knowledge base should have been defined with
[`set_kb`](/api/entitylinker#set_kb).
> #### Example
>
@ -205,7 +210,7 @@ Before calling this method, a knowledge base should have been defined with [`set
> ```
| Name | Type | Description |
| ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| ------------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects. |
| `pipeline` | list | Optional list of pipeline components that this component is part of. |
| `sgd` | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`EntityLinker`](/api/entitylinker#create_optimizer) if not set. |
@ -242,7 +247,6 @@ Modify the pipe's EL model, to use the given parameter values.
| -------- | ---- | ---------------------------------------------------------------------------------------------------------- |
| `params` | dict | The parameter values to use in the model. At the end of the context, the original parameters are restored. |
## EntityLinker.to_disk {#to_disk tag="method"}
Serialize the pipe to disk.
@ -271,7 +275,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
> ```
| Name | Type | Description |
| ----------- | ------------------ | -------------------------------------------------------------------------- |
| ----------- | ---------------- | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. |
@ -294,4 +298,3 @@ serialization by passing in the string names via the `exclude` argument.
| `cfg` | The config file. You usually don't want to exclude this. |
| `model` | The binary model data. You usually don't want to exclude this. |
| `kb` | The knowledge base. You usually don't want to exclude this. |

View File

@ -58,7 +58,7 @@ and all pipeline components are applied to the `Doc` in order. Both
>
> ```python
> ner = EntityRecognizer(nlp.vocab)
> doc = nlp(u"This is a sentence.")
> doc = nlp("This is a sentence.")
> # This usually happens under the hood
> processed = ner(doc)
> ```
@ -120,10 +120,10 @@ Modify a batch of documents, using pre-computed scores.
> ```
| Name | Type | Description |
| -------- | -------- | ---------------------------------------------------------- |
| --------- | -------- | ---------------------------------------------------------- |
| `docs` | iterable | The documents to modify. |
| `scores` | - | The scores to set, produced by `EntityRecognizer.predict`. |
| `tensors`| iterable | The token representations used to predict the scores. |
| `tensors` | iterable | The token representations used to predict the scores. |
## EntityRecognizer.update {#update tag="method"}

View File

@ -69,7 +69,7 @@ Convert a list of Doc objects into the
> ```python
> from spacy.gold import docs_to_json
>
> doc = nlp(u"I like London")
> doc = nlp("I like London")
> json_data = docs_to_json([doc])
> ```
@ -150,7 +150,7 @@ single-token entity.
> ```python
> from spacy.gold import biluo_tags_from_offsets
>
> doc = nlp(u"I like London.")
> doc = nlp("I like London.")
> entities = [(7, 13, "LOC")]
> tags = biluo_tags_from_offsets(doc, entities)
> assert tags == ["O", "O", "U-LOC", "O"]
@ -172,7 +172,7 @@ entity offsets.
> ```python
> from spacy.gold import offsets_from_biluo_tags
>
> doc = nlp(u"I like London.")
> doc = nlp("I like London.")
> tags = ["O", "O", "U-LOC", "O"]
> entities = offsets_from_biluo_tags(doc, tags)
> assert entities == [(7, 13, "LOC")]
@ -195,7 +195,7 @@ token-based tags, e.g. to overwrite the `doc.ents`.
> ```python
> from spacy.gold import spans_from_biluo_tags
>
> doc = nlp(u"I like London.")
> doc = nlp("I like London.")
> tags = ["O", "O", "U-LOC", "O"]
> doc.ents = spans_from_biluo_tags(doc, tags)
> ```

View File

@ -45,7 +45,7 @@ contain arbitrary whitespace. Alignment into the original string is preserved.
> #### Example
>
> ```python
> doc = nlp(u"An example sentence. Another sentence.")
> doc = nlp("An example sentence. Another sentence.")
> assert (doc[0].text, doc[0].head.tag_) == ("An", "NN")
> ```
@ -61,8 +61,8 @@ Pipeline components to prevent from being loaded can now be added as a list to
`disable`, instead of specifying one keyword argument per component.
```diff
- doc = nlp(u"I don't want parsed", parse=False)
+ doc = nlp(u"I don't want parsed", disable=["parser"])
- doc = nlp("I don't want parsed", parse=False)
+ doc = nlp("I don't want parsed", disable=["parser"])
```
</Infobox>
@ -86,7 +86,7 @@ multiprocessing.
> #### Example
>
> ```python
> texts = [u"One document.", u"...", u"Lots of documents"]
> texts = ["One document.", "...", "Lots of documents"]
> for doc in nlp.pipe(texts, batch_size=50):
> assert doc.is_parsed
> ```

View File

@ -37,8 +37,8 @@ Lemmatize a string.
> from spacy.lemmatizer import Lemmatizer
> from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
> lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
> lemmas = lemmatizer(u"ducks", u"NOUN")
> assert lemmas == [u"duck"]
> lemmas = lemmatizer("ducks", "NOUN")
> assert lemmas == ["duck"]
> ```
| Name | Type | Description |
@ -58,9 +58,9 @@ variable, set on the individual `Language` class.
> #### Example
>
> ```python
> lookup = {u"going": u"go"}
> lookup = {"going": "go"}
> lemmatizer = Lemmatizer(lookup=lookup)
> assert lemmatizer.lookup(u"going") == u"go"
> assert lemmatizer.lookup("going") == "go"
> ```
| Name | Type | Description |

View File

@ -27,7 +27,7 @@ Change the value of a boolean flag.
>
> ```python
> COOL_FLAG = nlp.vocab.add_flag(lambda text: False)
> nlp.vocab[u'spaCy'].set_flag(COOL_FLAG, True)
> nlp.vocab["spaCy"].set_flag(COOL_FLAG, True)
> ```
| Name | Type | Description |
@ -42,9 +42,9 @@ Check the value of a boolean flag.
> #### Example
>
> ```python
> is_my_library = lambda text: text in [u"spaCy", u"Thinc"]
> is_my_library = lambda text: text in ["spaCy", "Thinc"]
> MY_LIBRARY = nlp.vocab.add_flag(is_my_library)
> assert nlp.vocab[u"spaCy"].check_flag(MY_LIBRARY) == True
> assert nlp.vocab["spaCy"].check_flag(MY_LIBRARY) == True
> ```
| Name | Type | Description |
@ -59,8 +59,8 @@ Compute a semantic similarity estimate. Defaults to cosine over vectors.
> #### Example
>
> ```python
> apple = nlp.vocab[u"apple"]
> orange = nlp.vocab[u"orange"]
> apple = nlp.vocab["apple"]
> orange = nlp.vocab["orange"]
> apple_orange = apple.similarity(orange)
> orange_apple = orange.similarity(apple)
> assert apple_orange == orange_apple
@ -78,7 +78,7 @@ A boolean value indicating whether a word vector is associated with the lexeme.
> #### Example
>
> ```python
> apple = nlp.vocab[u"apple"]
> apple = nlp.vocab["apple"]
> assert apple.has_vector
> ```
@ -93,7 +93,7 @@ A real-valued meaning representation.
> #### Example
>
> ```python
> apple = nlp.vocab[u"apple"]
> apple = nlp.vocab["apple"]
> assert apple.vector.dtype == "float32"
> assert apple.vector.shape == (300,)
> ```
@ -109,8 +109,8 @@ The L2 norm of the lexeme's vector representation.
> #### Example
>
> ```python
> apple = nlp.vocab[u"apple"]
> pasta = nlp.vocab[u"pasta"]
> apple = nlp.vocab["apple"]
> pasta = nlp.vocab["pasta"]
> apple.vector_norm # 7.1346845626831055
> pasta.vector_norm # 7.759851932525635
> assert apple.vector_norm != pasta.vector_norm

View File

@ -50,7 +50,7 @@ Find all token sequences matching the supplied patterns on the `Doc`.
> matcher = Matcher(nlp.vocab)
> pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
> matcher.add("HelloWorld", None, pattern)
> doc = nlp(u'hello world!')
> doc = nlp("hello world!")
> matches = matcher(doc)
> ```
@ -147,7 +147,7 @@ overwritten.
> matcher = Matcher(nlp.vocab)
> matcher.add("HelloWorld", on_match, [{"LOWER": "hello"}, {"LOWER": "world"}])
> matcher.add("GoogleMaps", on_match, [{"ORTH": "Google"}, {"ORTH": "Maps"}])
> doc = nlp(u"HELLO WORLD on Google Maps.")
> doc = nlp("HELLO WORLD on Google Maps.")
> matches = matcher(doc)
> ```

View File

@ -59,8 +59,8 @@ Find all token sequences matching the supplied patterns on the `Doc`.
> from spacy.matcher import PhraseMatcher
>
> matcher = PhraseMatcher(nlp.vocab)
> matcher.add("OBAMA", None, nlp(u"Barack Obama"))
> doc = nlp(u"Barack Obama lifts America one last time in emotional farewell")
> matcher.add("OBAMA", None, nlp("Barack Obama"))
> doc = nlp("Barack Obama lifts America one last time in emotional farewell")
> matches = matcher(doc)
> ```
@ -99,7 +99,7 @@ patterns.
> ```python
> matcher = PhraseMatcher(nlp.vocab)
> assert len(matcher) == 0
> matcher.add("OBAMA", None, nlp(u"Barack Obama"))
> matcher.add("OBAMA", None, nlp("Barack Obama"))
> assert len(matcher) == 1
> ```
@ -116,7 +116,7 @@ Check whether the matcher contains rules for a match ID.
> ```python
> matcher = PhraseMatcher(nlp.vocab)
> assert "OBAMA" not in matcher
> matcher.add("OBAMA", None, nlp(u"Barack Obama"))
> matcher.add("OBAMA", None, nlp("Barack Obama"))
> assert "OBAMA" in matcher
> ```
@ -140,10 +140,10 @@ overwritten.
> print('Matched!', matches)
>
> matcher = PhraseMatcher(nlp.vocab)
> matcher.add("OBAMA", on_match, nlp(u"Barack Obama"))
> matcher.add("HEALTH", on_match, nlp(u"health care reform"),
> nlp(u"healthcare reform"))
> doc = nlp(u"Barack Obama urges Congress to find courage to defend his healthcare reforms")
> matcher.add("OBAMA", on_match, nlp("Barack Obama"))
> matcher.add("HEALTH", on_match, nlp("health care reform"),
> nlp("healthcare reform"))
> doc = nlp("Barack Obama urges Congress to find courage to defend his healthcare reforms")
> matches = matcher(doc)
> ```

View File

@ -17,13 +17,13 @@ the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
> #### Example
>
> ```python
> texts = [t.text for t in nlp(u"I have a blue car")]
> texts = [t.text for t in nlp("I have a blue car")]
> assert texts == ["I", "have", "a", "blue", "car"]
>
> merge_nps = nlp.create_pipe("merge_noun_chunks")
> nlp.add_pipe(merge_nps)
>
> texts = [t.text for t in nlp(u"I have a blue car")]
> texts = [t.text for t in nlp("I have a blue car")]
> assert texts == ["I", "have", "a blue car"]
> ```
@ -50,13 +50,13 @@ the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
> #### Example
>
> ```python
> texts = [t.text for t in nlp(u"I like David Bowie")]
> texts = [t.text for t in nlp("I like David Bowie")]
> assert texts == ["I", "like", "David", "Bowie"]
>
> merge_ents = nlp.create_pipe("merge_entities")
> nlp.add_pipe(merge_ents)
>
> texts = [t.text for t in nlp(u"I like David Bowie")]
> texts = [t.text for t in nlp("I like David Bowie")]
> assert texts == ["I", "like", "David Bowie"]
> ```

View File

@ -59,7 +59,7 @@ the component has been added to the pipeline using
> nlp = English()
> sentencizer = nlp.create_pipe("sentencizer")
> nlp.add_pipe(sentencizer)
> doc = nlp(u"This is a sentence. This is another sentence.")
> doc = nlp("This is a sentence. This is another sentence.")
> assert len(list(doc.sents)) == 2
> ```

View File

@ -13,13 +13,13 @@ Create a Span object from the slice `doc[start : end]`.
> #### Example
>
> ```python
> doc = nlp(u"Give it back! He pleaded.")
> doc = nlp("Give it back! He pleaded.")
> span = doc[1:4]
> assert [t.text for t in span] == [u"it", u"back", u"!"]
> assert [t.text for t in span] == ["it", "back", "!"]
> ```
| Name | Type | Description |
| ----------- | ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------|
| ----------- | ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------- |
| `doc` | `Doc` | The parent document. |
| `start` | int | The index of the first token of the span. |
| `end` | int | The index of the first token after the span. |
@ -35,7 +35,7 @@ Get a `Token` object.
> #### Example
>
> ```python
> doc = nlp(u"Give it back! He pleaded.")
> doc = nlp("Give it back! He pleaded.")
> span = doc[1:4]
> assert span[1].text == "back"
> ```
@ -50,9 +50,9 @@ Get a `Span` object.
> #### Example
>
> ```python
> doc = nlp(u"Give it back! He pleaded.")
> doc = nlp("Give it back! He pleaded.")
> span = doc[1:4]
> assert span[1:3].text == u"back!"
> assert span[1:3].text == "back!"
> ```
| Name | Type | Description |
@ -67,9 +67,9 @@ Iterate over `Token` objects.
> #### Example
>
> ```python
> doc = nlp(u"Give it back! He pleaded.")
> doc = nlp("Give it back! He pleaded.")
> span = doc[1:4]
> assert [t.text for t in span] == [u"it", u"back", u"!"]
> assert [t.text for t in span] == ["it", "back", "!"]
> ```
| Name | Type | Description |
@ -83,7 +83,7 @@ Get the number of tokens in the span.
> #### Example
>
> ```python
> doc = nlp(u"Give it back! He pleaded.")
> doc = nlp("Give it back! He pleaded.")
> span = doc[1:4]
> assert len(span) == 3
> ```
@ -102,9 +102,9 @@ For details, see the documentation on
>
> ```python
> from spacy.tokens import Span
> city_getter = lambda span: any(city in span.text for city in (u"New York", u"Paris", u"Berlin"))
> city_getter = lambda span: any(city in span.text for city in ("New York", "Paris", "Berlin"))
> Span.set_extension("has_city", getter=city_getter)
> doc = nlp(u"I like New York in Autumn")
> doc = nlp("I like New York in Autumn")
> assert doc[1:4]._.has_city
> ```
@ -180,7 +180,7 @@ using an average of word vectors.
> #### Example
>
> ```python
> doc = nlp(u"green apples and red oranges")
> doc = nlp("green apples and red oranges")
> green_apples = doc[:2]
> red_oranges = doc[3:]
> apples_oranges = green_apples.similarity(red_oranges)
@ -202,7 +202,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
> #### Example
>
> ```python
> doc = nlp(u"I like New York in Autumn")
> doc = nlp("I like New York in Autumn")
> span = doc[1:4]
> matrix = span.get_lca_matrix()
> # array([[0, 0, 0], [0, 1, 2], [0, 2, 2]], dtype=int32)
@ -222,7 +222,7 @@ shape `(N, M)`, where `N` is the length of the document. The values will be
>
> ```python
> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
> doc = nlp(u"I like New York in Autumn.")
> doc = nlp("I like New York in Autumn.")
> span = doc[2:3]
> # All strings mapped to integers, for easy export to numpy
> np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
@ -248,11 +248,11 @@ Retokenize the document, such that the span is merged into a single token.
> #### Example
>
> ```python
> doc = nlp(u"I like New York in Autumn.")
> doc = nlp("I like New York in Autumn.")
> span = doc[2:4]
> span.merge()
> assert len(doc) == 6
> assert doc[2].text == u"New York"
> assert doc[2].text == "New York"
> ```
| Name | Type | Description |
@ -268,12 +268,12 @@ if the entity recognizer has been applied.
> #### Example
>
> ```python
> doc = nlp(u"Mr. Best flew to New York on Saturday morning.")
> doc = nlp("Mr. Best flew to New York on Saturday morning.")
> span = doc[0:6]
> ents = list(span.ents)
> assert ents[0].label == 346
> assert ents[0].label_ == "PERSON"
> assert ents[0].text == u"Mr. Best"
> assert ents[0].text == "Mr. Best"
> ```
| Name | Type | Description |
@ -287,10 +287,10 @@ Create a new `Doc` object corresponding to the `Span`, with a copy of the data.
> #### Example
>
> ```python
> doc = nlp(u"I like New York in Autumn.")
> doc = nlp("I like New York in Autumn.")
> span = doc[2:4]
> doc2 = span.as_doc()
> assert doc2.text == u"New York"
> assert doc2.text == "New York"
> ```
| Name | Type | Description |
@ -306,12 +306,12 @@ taken.
> #### Example
>
> ```python
> doc = nlp(u"I like New York in Autumn.")
> doc = nlp("I like New York in Autumn.")
> i, like, new, york, in_, autumn, dot = range(len(doc))
> assert doc[new].head.text == u"York"
> assert doc[york].head.text == u"like"
> assert doc[new].head.text == "York"
> assert doc[york].head.text == "like"
> new_york = doc[new:york+1]
> assert new_york.root.text == u"York"
> assert new_york.root.text == "York"
> ```
| Name | Type | Description |
@ -325,9 +325,9 @@ A tuple of tokens coordinated to `span.root`.
> #### Example
>
> ```python
> doc = nlp(u"I like apples and oranges")
> doc = nlp("I like apples and oranges")
> apples_conjuncts = doc[2:3].conjuncts
> assert [t.text for t in apples_conjuncts] == [u"oranges"]
> assert [t.text for t in apples_conjuncts] == ["oranges"]
> ```
| Name | Type | Description |
@ -341,9 +341,9 @@ Tokens that are to the left of the span, whose heads are within the span.
> #### Example
>
> ```python
> doc = nlp(u"I like New York in Autumn.")
> doc = nlp("I like New York in Autumn.")
> lefts = [t.text for t in doc[3:7].lefts]
> assert lefts == [u"New"]
> assert lefts == ["New"]
> ```
| Name | Type | Description |
@ -357,9 +357,9 @@ Tokens that are to the right of the span, whose heads are within the span.
> #### Example
>
> ```python
> doc = nlp(u"I like New York in Autumn.")
> doc = nlp("I like New York in Autumn.")
> rights = [t.text for t in doc[2:4].rights]
> assert rights == [u"in"]
> assert rights == ["in"]
> ```
| Name | Type | Description |
@ -374,7 +374,7 @@ the span.
> #### Example
>
> ```python
> doc = nlp(u"I like New York in Autumn.")
> doc = nlp("I like New York in Autumn.")
> assert doc[3:7].n_lefts == 1
> ```
@ -390,7 +390,7 @@ the span.
> #### Example
>
> ```python
> doc = nlp(u"I like New York in Autumn.")
> doc = nlp("I like New York in Autumn.")
> assert doc[2:4].n_rights == 1
> ```
@ -405,9 +405,9 @@ Tokens within the span and tokens which descend from them.
> #### Example
>
> ```python
> doc = nlp(u"Give it back! He pleaded.")
> doc = nlp("Give it back! He pleaded.")
> subtree = [t.text for t in doc[:3].subtree]
> assert subtree == [u"Give", u"it", u"back", u"!"]
> assert subtree == ["Give", "it", "back", "!"]
> ```
| Name | Type | Description |
@ -421,7 +421,7 @@ A boolean value indicating whether a word vector is associated with the object.
> #### Example
>
> ```python
> doc = nlp(u"I like apples")
> doc = nlp("I like apples")
> assert doc[1:].has_vector
> ```
@ -437,7 +437,7 @@ vectors.
> #### Example
>
> ```python
> doc = nlp(u"I like apples")
> doc = nlp("I like apples")
> assert doc[1:].vector.dtype == "float32"
> assert doc[1:].vector.shape == (300,)
> ```
@ -453,7 +453,7 @@ The L2 norm of the span's vector representation.
> #### Example
>
> ```python
> doc = nlp(u"I like apples")
> doc = nlp("I like apples")
> doc[1:].vector_norm # 4.800883928527915
> doc[2:].vector_norm # 6.895897646384268
> assert doc[1:].vector_norm != doc[2:].vector_norm

View File

@ -16,7 +16,7 @@ Create the `StringStore`.
>
> ```python
> from spacy.strings import StringStore
> stringstore = StringStore([u"apple", u"orange"])
> stringstore = StringStore(["apple", "orange"])
> ```
| Name | Type | Description |
@ -31,7 +31,7 @@ Get the number of strings in the store.
> #### Example
>
> ```python
> stringstore = StringStore([u"apple", u"orange"])
> stringstore = StringStore(["apple", "orange"])
> assert len(stringstore) == 2
> ```
@ -46,10 +46,10 @@ Retrieve a string from a given hash, or vice versa.
> #### Example
>
> ```python
> stringstore = StringStore([u"apple", u"orange"])
> apple_hash = stringstore[u"apple"]
> stringstore = StringStore(["apple", "orange"])
> apple_hash = stringstore["apple"]
> assert apple_hash == 8566208034543834098
> assert stringstore[apple_hash] == u"apple"
> assert stringstore[apple_hash] == "apple"
> ```
| Name | Type | Description |
@ -64,9 +64,9 @@ Check whether a string is in the store.
> #### Example
>
> ```python
> stringstore = StringStore([u"apple", u"orange"])
> assert u"apple" in stringstore
> assert not u"cherry" in stringstore
> stringstore = StringStore(["apple", "orange"])
> assert "apple" in stringstore
> assert not "cherry" in stringstore
> ```
| Name | Type | Description |
@ -82,9 +82,9 @@ store will always include an empty string `''` at position `0`.
> #### Example
>
> ```python
> stringstore = StringStore([u"apple", u"orange"])
> stringstore = StringStore(["apple", "orange"])
> all_strings = [s for s in stringstore]
> assert all_strings == [u"apple", u"orange"]
> assert all_strings == ["apple", "orange"]
> ```
| Name | Type | Description |
@ -98,12 +98,12 @@ Add a string to the `StringStore`.
> #### Example
>
> ```python
> stringstore = StringStore([u"apple", u"orange"])
> banana_hash = stringstore.add(u"banana")
> stringstore = StringStore(["apple", "orange"])
> banana_hash = stringstore.add("banana")
> assert len(stringstore) == 3
> assert banana_hash == 2525716904149915114
> assert stringstore[banana_hash] == u"banana"
> assert stringstore[u"banana"] == banana_hash
> assert stringstore[banana_hash] == "banana"
> assert stringstore["banana"] == banana_hash
> ```
| Name | Type | Description |
@ -182,7 +182,7 @@ Get a 64-bit hash for a given string.
>
> ```python
> from spacy.strings import hash_string
> assert hash_string(u"apple") == 8566208034543834098
> assert hash_string("apple") == 8566208034543834098
> ```
| Name | Type | Description |

View File

@ -57,7 +57,7 @@ and all pipeline components are applied to the `Doc` in order. Both
>
> ```python
> tagger = Tagger(nlp.vocab)
> doc = nlp(u"This is a sentence.")
> doc = nlp("This is a sentence.")
> # This usually happens under the hood
> processed = tagger(doc)
> ```
@ -118,11 +118,10 @@ Modify a batch of documents, using pre-computed scores.
> ```
| Name | Type | Description |
| -------- | -------- | ----------------------------------------------------- |
| --------- | -------- | ----------------------------------------------------- |
| `docs` | iterable | The documents to modify. |
| `scores` | - | The scores to set, produced by `Tagger.predict`. |
| `tensors`| iterable | The token representations used to predict the scores. |
| `tensors` | iterable | The token representations used to predict the scores. |
## Tagger.update {#update tag="method"}

View File

@ -75,7 +75,7 @@ delegate to the [`predict`](/api/textcategorizer#predict) and
>
> ```python
> textcat = TextCategorizer(nlp.vocab)
> doc = nlp(u"This is a sentence.")
> doc = nlp("This is a sentence.")
> # This usually happens under the hood
> processed = textcat(doc)
> ```
@ -137,10 +137,10 @@ Modify a batch of documents, using pre-computed scores.
> ```
| Name | Type | Description |
| -------- | -------- | --------------------------------------------------------- |
| --------- | -------- | --------------------------------------------------------- |
| `docs` | iterable | The documents to modify. |
| `scores` | - | The scores to set, produced by `TextCategorizer.predict`. |
| `tensors`| iterable | The token representations used to predict the scores. |
| `tensors` | iterable | The token representations used to predict the scores. |
## TextCategorizer.update {#update tag="method"}

View File

@ -12,9 +12,9 @@ Construct a `Token` object.
> #### Example
>
> ```python
> doc = nlp(u"Give it back! He pleaded.")
> doc = nlp("Give it back! He pleaded.")
> token = doc[0]
> assert token.text == u"Give"
> assert token.text == "Give"
> ```
| Name | Type | Description |
@ -31,7 +31,7 @@ The number of unicode characters in the token, i.e. `token.text`.
> #### Example
>
> ```python
> doc = nlp(u"Give it back! He pleaded.")
> doc = nlp("Give it back! He pleaded.")
> token = doc[0]
> assert len(token) == 4
> ```
@ -50,9 +50,9 @@ For details, see the documentation on
>
> ```python
> from spacy.tokens import Token
> fruit_getter = lambda token: token.text in (u"apple", u"pear", u"banana")
> fruit_getter = lambda token: token.text in ("apple", "pear", "banana")
> Token.set_extension("is_fruit", getter=fruit_getter)
> doc = nlp(u"I have an apple")
> doc = nlp("I have an apple")
> assert doc[3]._.is_fruit
> ```
@ -128,7 +128,7 @@ Check the value of a boolean flag.
>
> ```python
> from spacy.attrs import IS_TITLE
> doc = nlp(u"Give it back! He pleaded.")
> doc = nlp("Give it back! He pleaded.")
> token = doc[0]
> assert token.check_flag(IS_TITLE) == True
> ```
@ -145,7 +145,7 @@ Compute a semantic similarity estimate. Defaults to cosine over vectors.
> #### Example
>
> ```python
> apples, _, oranges = nlp(u"apples and oranges")
> apples, _, oranges = nlp("apples and oranges")
> apples_oranges = apples.similarity(oranges)
> oranges_apples = oranges.similarity(apples)
> assert apples_oranges == oranges_apples
@ -163,9 +163,9 @@ Get a neighboring token.
> #### Example
>
> ```python
> doc = nlp(u"Give it back! He pleaded.")
> doc = nlp("Give it back! He pleaded.")
> give_nbor = doc[0].nbor()
> assert give_nbor.text == u"it"
> assert give_nbor.text == "it"
> ```
| Name | Type | Description |
@ -181,7 +181,7 @@ dependency tree.
> #### Example
>
> ```python
> doc = nlp(u"Give it back! He pleaded.")
> doc = nlp("Give it back! He pleaded.")
> give = doc[0]
> it = doc[1]
> assert give.is_ancestor(it)
@ -199,11 +199,11 @@ The rightmost token of this token's syntactic descendants.
> #### Example
>
> ```python
> doc = nlp(u"Give it back! He pleaded.")
> doc = nlp("Give it back! He pleaded.")
> it_ancestors = doc[1].ancestors
> assert [t.text for t in it_ancestors] == [u"Give"]
> assert [t.text for t in it_ancestors] == ["Give"]
> he_ancestors = doc[4].ancestors
> assert [t.text for t in he_ancestors] == [u"pleaded"]
> assert [t.text for t in he_ancestors] == ["pleaded"]
> ```
| Name | Type | Description |
@ -217,9 +217,9 @@ A tuple of coordinated tokens, not including the token itself.
> #### Example
>
> ```python
> doc = nlp(u"I like apples and oranges")
> doc = nlp("I like apples and oranges")
> apples_conjuncts = doc[2].conjuncts
> assert [t.text for t in apples_conjuncts] == [u"oranges"]
> assert [t.text for t in apples_conjuncts] == ["oranges"]
> ```
| Name | Type | Description |
@ -233,9 +233,9 @@ A sequence of the token's immediate syntactic children.
> #### Example
>
> ```python
> doc = nlp(u"Give it back! He pleaded.")
> doc = nlp("Give it back! He pleaded.")
> give_children = doc[0].children
> assert [t.text for t in give_children] == [u"it", u"back", u"!"]
> assert [t.text for t in give_children] == ["it", "back", "!"]
> ```
| Name | Type | Description |
@ -249,9 +249,9 @@ The leftward immediate children of the word, in the syntactic dependency parse.
> #### Example
>
> ```python
> doc = nlp(u"I like New York in Autumn.")
> doc = nlp("I like New York in Autumn.")
> lefts = [t.text for t in doc[3].lefts]
> assert lefts == [u'New']
> assert lefts == ["New"]
> ```
| Name | Type | Description |
@ -265,9 +265,9 @@ The rightward immediate children of the word, in the syntactic dependency parse.
> #### Example
>
> ```python
> doc = nlp(u"I like New York in Autumn.")
> doc = nlp("I like New York in Autumn.")
> rights = [t.text for t in doc[3].rights]
> assert rights == [u"in"]
> assert rights == ["in"]
> ```
| Name | Type | Description |
@ -282,7 +282,7 @@ dependency parse.
> #### Example
>
> ```python
> doc = nlp(u"I like New York in Autumn.")
> doc = nlp("I like New York in Autumn.")
> assert doc[3].n_lefts == 1
> ```
@ -298,7 +298,7 @@ dependency parse.
> #### Example
>
> ```python
> doc = nlp(u"I like New York in Autumn.")
> doc = nlp("I like New York in Autumn.")
> assert doc[3].n_rights == 1
> ```
@ -313,9 +313,9 @@ A sequence containing the token and all the token's syntactic descendants.
> #### Example
>
> ```python
> doc = nlp(u"Give it back! He pleaded.")
> doc = nlp("Give it back! He pleaded.")
> give_subtree = doc[0].subtree
> assert [t.text for t in give_subtree] == [u"Give", u"it", u"back", u"!"]
> assert [t.text for t in give_subtree] == ["Give", "it", "back", "!"]
> ```
| Name | Type | Description |
@ -330,7 +330,7 @@ unknown. Defaults to `True` for the first token in the `Doc`.
> #### Example
>
> ```python
> doc = nlp(u"Give it back! He pleaded.")
> doc = nlp("Give it back! He pleaded.")
> assert doc[4].is_sent_start
> assert not doc[5].is_sent_start
> ```
@ -361,7 +361,7 @@ A boolean value indicating whether a word vector is associated with the token.
> #### Example
>
> ```python
> doc = nlp(u"I like apples")
> doc = nlp("I like apples")
> apples = doc[2]
> assert apples.has_vector
> ```
@ -377,7 +377,7 @@ A real-valued meaning representation.
> #### Example
>
> ```python
> doc = nlp(u"I like apples")
> doc = nlp("I like apples")
> apples = doc[2]
> assert apples.vector.dtype == "float32"
> assert apples.vector.shape == (300,)
@ -394,7 +394,7 @@ The L2 norm of the token's vector representation.
> #### Example
>
> ```python
> doc = nlp(u"I like apples and pasta")
> doc = nlp("I like apples and pasta")
> apples = doc[2]
> pasta = doc[4]
> apples.vector_norm # 6.89589786529541

View File

@ -5,7 +5,9 @@ tag: class
source: spacy/tokenizer.pyx
---
Segment text, and create `Doc` objects with the discovered segment boundaries. For a deeper understanding, see the docs on [how spaCy's tokenizer works](/usage/linguistic-features#how-tokenizer-works).
Segment text, and create `Doc` objects with the discovered segment boundaries.
For a deeper understanding, see the docs on
[how spaCy's tokenizer works](/usage/linguistic-features#how-tokenizer-works).
## Tokenizer.\_\_init\_\_ {#init tag="method"}
@ -49,7 +51,7 @@ Tokenize a string.
> #### Example
>
> ```python
> tokens = tokenizer(u"This is a sentence")
> tokens = tokenizer("This is a sentence")
> assert len(tokens) == 4
> ```
@ -65,7 +67,7 @@ Tokenize a stream of texts.
> #### Example
>
> ```python
> texts = [u"One document.", u"...", u"Lots of documents"]
> texts = ["One document.", "...", "Lots of documents"]
> for doc in tokenizer.pipe(texts, batch_size=50):
> pass
> ```
@ -109,8 +111,9 @@ if no suffix rules match.
Add a special-case tokenization rule. This mechanism is also used to add custom
tokenizer exceptions to the language data. See the usage guide on
[adding languages](/usage/adding-languages#tokenizer-exceptions) and [linguistic features](/usage/linguistic-features#special-cases) for more
details and examples.
[adding languages](/usage/adding-languages#tokenizer-exceptions) and
[linguistic features](/usage/linguistic-features#special-cases) for more details
and examples.
> #### Example
>
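
For instance, a minimal sketch of a special case rule (the token `"gimme"` and its split are only an illustration):

```python
import spacy
from spacy.attrs import ORTH

nlp = spacy.blank("en")
# Always tokenize "gimme" as two tokens, "gim" and "me"
special_case = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", special_case)

doc = nlp("gimme that")
assert [t.text for t in doc] == ["gim", "me", "that"]
```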

View File

@ -112,10 +112,10 @@ list of available terms, see
> #### Example
>
> ```python
> spacy.explain(u"NORP")
> spacy.explain("NORP")
> # Nationalities or religious or political groups
>
> doc = nlp(u"Hello world")
> doc = nlp("Hello world")
> for word in doc:
> print(word.text, word.tag_, spacy.explain(word.tag_))
> # Hello UH interjection
@ -181,8 +181,8 @@ browser. Will run a simple web server.
> import spacy
> from spacy import displacy
> nlp = spacy.load("en_core_web_sm")
> doc1 = nlp(u"This is a sentence.")
> doc2 = nlp(u"This is another sentence.")
> doc1 = nlp("This is a sentence.")
> doc2 = nlp("This is another sentence.")
> displacy.serve([doc1, doc2], style="dep")
> ```
@ -207,7 +207,7 @@ Render a dependency parse tree or named entity visualization.
> import spacy
> from spacy import displacy
> nlp = spacy.load("en_core_web_sm")
> doc = nlp(u"This is a sentence.")
> doc = nlp("This is a sentence.")
> html = displacy.render(doc, style="dep")
> ```
@ -263,7 +263,7 @@ If a setting is not present in the options, the default value will be used.
> ```
| Name | Type | Description | Default |
| -------- | ---- | ------------------------------------------------------------------------------------- | ------- |
| --------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ |
| `ents` | list | Entity types to highlight (`None` for all types). | `None` |
| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` |
| `template` <Tag variant="new">2.2</Tag> | unicode | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) |
@ -271,7 +271,9 @@ If a setting is not present in the options, the default value will be used.
By default, displaCy comes with colors for all
[entity types supported by spaCy](/api/annotation#named-entities). If you're
using custom entity types, you can use the `colors` setting to add your own
colors for them. Your application or model package can also expose a [`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy) to add custom labels and their colors automatically.
colors for them. Your application or model package can also expose a
[`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy)
to add custom labels and their colors automatically.
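
For example, a minimal sketch with a made-up entity type (the label `FRUIT` and its color are placeholders, not built-in values):

```python
import spacy
from spacy import displacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("I had a kumquat for breakfast")
# Manually add an entity with a custom label for the sake of the example
doc.ents = [Span(doc, 3, 4, label="FRUIT")]

options = {"ents": ["FRUIT"], "colors": {"FRUIT": "#85C1E9"}}
html = displacy.render(doc, style="ent", options=options)
```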
## Utility functions {#util source="spacy/util.py"}
@ -650,7 +652,7 @@ for batching. Larger `bufsize` means less bias.
> ```
| Name | Type | Description |
| ---------- | -------- | ------------------------------------- |
| ---------- | -------- | ----------------------------------- |
| `iterable` | iterable | Iterator to shuffle. |
| `bufsize` | int | Items to hold back (default: 1000). |
| **YIELDS** | iterable | The shuffled iterator. |
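
Assuming this table documents `util.itershuffle`, a quick sketch of typical usage (the numbers are arbitrary):

```python
from spacy import util

items = list(range(10))
# Shuffle lazily with a small buffer; a larger bufsize approaches a full shuffle
shuffled = list(util.itershuffle(items, bufsize=5))
assert sorted(shuffled) == items
```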

View File

@ -26,7 +26,7 @@ you can add vectors to later.
> empty_vectors = Vectors(shape=(10000, 300))
>
> data = numpy.zeros((3, 300), dtype='f')
> keys = [u"cat", u"dog", u"rat"]
> keys = ["cat", "dog", "rat"]
> vectors = Vectors(data=data, keys=keys)
> ```
@ -45,9 +45,9 @@ raised.
> #### Example
>
> ```python
> cat_id = nlp.vocab.strings[u"cat"]
> cat_id = nlp.vocab.strings["cat"]
> cat_vector = nlp.vocab.vectors[cat_id]
> assert cat_vector == nlp.vocab[u"cat"].vector
> assert cat_vector == nlp.vocab["cat"].vector
> ```
| Name | Type | Description |
@ -62,7 +62,7 @@ Set a vector for the given key.
> #### Example
>
> ```python
> cat_id = nlp.vocab.strings[u"cat"]
> cat_id = nlp.vocab.strings["cat"]
> vector = numpy.random.uniform(-1, 1, (300,))
> nlp.vocab.vectors[cat_id] = vector
> ```
@ -109,7 +109,7 @@ Check whether a key has been mapped to a vector entry in the table.
> #### Example
>
> ```python
> cat_id = nlp.vocab.strings[u"cat"]
> cat_id = nlp.vocab.strings["cat"]
> nlp.vocab.vectors.add(cat_id, numpy.random.uniform(-1, 1, (300,)))
> assert cat_id in nlp.vocab.vectors
> ```
@ -132,9 +132,9 @@ mapping separately. If you need to manage the strings, you should use the
>
> ```python
> vector = numpy.random.uniform(-1, 1, (300,))
> cat_id = nlp.vocab.strings[u"cat"]
> cat_id = nlp.vocab.strings["cat"]
> nlp.vocab.vectors.add(cat_id, vector=vector)
> nlp.vocab.vectors.add(u"dog", row=0)
> nlp.vocab.vectors.add("dog", row=0)
> ```
| Name | Type | Description |
@ -218,8 +218,8 @@ Look up one or more keys by row, or vice versa.
> #### Example
>
> ```python
> row = nlp.vocab.vectors.find(key=u"cat")
> rows = nlp.vocab.vectors.find(keys=[u"cat", u"dog"])
> row = nlp.vocab.vectors.find(key="cat")
> rows = nlp.vocab.vectors.find(keys=["cat", "dog"])
> key = nlp.vocab.vectors.find(row=256)
> keys = nlp.vocab.vectors.find(rows=[18, 256, 985])
> ```
@ -241,7 +241,7 @@ vector table.
>
> ```python
> vectors = Vectors(shape=(1, 300))
> vectors.add(u"cat", numpy.random.uniform(-1, 1, (300,)))
> vectors.add("cat", numpy.random.uniform(-1, 1, (300,)))
> rows, dims = vectors.shape
> assert rows == 1
> assert dims == 300
@ -276,7 +276,7 @@ If a table is full, it can be resized using
>
> ```python
> vectors = Vectors(shape=(1, 300))
> vectors.add(u"cat", numpy.random.uniform(-1, 1, (300,)))
> vectors.add("cat", numpy.random.uniform(-1, 1, (300,)))
> assert vectors.is_full
> ```

View File

@ -18,7 +18,7 @@ Create the vocabulary.
>
> ```python
> from spacy.vocab import Vocab
> vocab = Vocab(strings=[u"hello", u"world"])
> vocab = Vocab(strings=["hello", "world"])
> ```
| Name | Type | Description |
@ -36,7 +36,7 @@ Get the current number of lexemes in the vocabulary.
> #### Example
>
> ```python
> doc = nlp(u"This is a sentence.")
> doc = nlp("This is a sentence.")
> assert len(nlp.vocab) > 0
> ```
@ -52,8 +52,8 @@ unicode string is given, a new lexeme is created and stored.
> #### Example
>
> ```python
> apple = nlp.vocab.strings[u"apple"]
> assert nlp.vocab[apple] == nlp.vocab[u"apple"]
> apple = nlp.vocab.strings["apple"]
> assert nlp.vocab[apple] == nlp.vocab["apple"]
> ```
| Name | Type | Description |
@ -84,8 +84,8 @@ given string, you need to look it up in
> #### Example
>
> ```python
> apple = nlp.vocab.strings[u"apple"]
> oov = nlp.vocab.strings[u"dskfodkfos"]
> apple = nlp.vocab.strings["apple"]
> oov = nlp.vocab.strings["dskfodkfos"]
> assert apple in nlp.vocab
> assert oov not in nlp.vocab
> ```
@ -106,11 +106,11 @@ using `token.check_flag(flag_id)`.
>
> ```python
> def is_my_product(text):
> products = [u"spaCy", u"Thinc", u"displaCy"]
> products = ["spaCy", "Thinc", "displaCy"]
> return text in products
>
> MY_PRODUCT = nlp.vocab.add_flag(is_my_product)
> doc = nlp(u"I like spaCy")
> doc = nlp("I like spaCy")
> assert doc[2].check_flag(MY_PRODUCT) == True
> ```
@ -170,7 +170,7 @@ or hash value. If no vectors data is loaded, a `ValueError` is raised.
> #### Example
>
> ```python
> nlp.vocab.get_vector(u"apple")
> nlp.vocab.get_vector("apple")
> ```
| Name | Type | Description |
@ -186,7 +186,7 @@ or hash value.
> #### Example
>
> ```python
> nlp.vocab.set_vector(u"apple", array([...]))
> nlp.vocab.set_vector("apple", array([...]))
> ```
| Name | Type | Description |
@ -202,8 +202,8 @@ Words can be looked up by string or hash value.
> #### Example
>
> ```python
> if nlp.vocab.has_vector(u"apple"):
> vector = nlp.vocab.get_vector(u"apple")
> if nlp.vocab.has_vector("apple"):
> vector = nlp.vocab.get_vector("apple")
> ```
| Name | Type | Description |
@ -282,9 +282,9 @@ Load state from a binary string.
> #### Example
>
> ```python
> apple_id = nlp.vocab.strings[u"apple"]
> apple_id = nlp.vocab.strings["apple"]
> assert type(apple_id) == int
> PERSON = nlp.vocab.strings[u"PERSON"]
> PERSON = nlp.vocab.strings["PERSON"]
> assert type(PERSON) == int
> ```

View File

@ -12,7 +12,7 @@ Named entities are available as the `ents` property of a `Doc`:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for ent in doc.ents:
print(ent.text, ent.start_char, ent.end_char, ent.label_)

View File

@ -15,8 +15,8 @@ need to add an underscore `_` to its name:
### {executable="true"}
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,

View File

@ -9,7 +9,7 @@ tokens, and we can iterate over them:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
print(token.text)
```

View File

@ -48,8 +48,8 @@ norm, which can be used to normalize vectors.
### {executable="true"}
import spacy
nlp = spacy.load('en_core_web_md')
tokens = nlp(u'dog cat banana afskfsd')
nlp = spacy.load("en_core_web_md")
tokens = nlp("dog cat banana afskfsd")
for token in tokens:
print(token.text, token.has_vector, token.vector_norm, token.is_oov)
@ -88,8 +88,8 @@ definition of similarity.
### {executable="true"}
import spacy
nlp = spacy.load('en_core_web_md') # make sure to use larger model!
tokens = nlp(u'dog cat banana')
nlp = spacy.load("en_core_web_md") # make sure to use larger model!
tokens = nlp("dog cat banana")
for token1 in tokens:
for token2 in tokens:

View File

@ -276,7 +276,7 @@ the lowercase spelling of a word exists, norms should always be in lowercase.
> #### Norms vs. lemmas
>
> ```python
> doc = nlp(u"I'm gonna realise")
> doc = nlp("I'm gonna realise")
> norms = [token.norm_ for token in doc]
> lemmas = [token.lemma_ for token in doc]
> assert norms == ["i", "am", "going", "to", "realize"]
@ -396,10 +396,10 @@ iterators:
> #### Noun chunks example
>
> ```python
> doc = nlp(u"A phrase with another phrase occurs.")
> doc = nlp("A phrase with another phrase occurs.")
> chunks = list(doc.noun_chunks)
> assert chunks[0].text == u"A phrase"
> assert chunks[1].text == u"another phrase"
> assert chunks[0].text == "A phrase"
> assert chunks[1].text == "another phrase"
> ```
| Language | Code | Source |

View File

@ -392,7 +392,7 @@ from is called `spacy`. So, when using spaCy, never call anything else `spacy`.
<Accordion title="Pronoun lemma is returned as -PRON-" id="pron-lemma">
```python
doc = nlp(u"They are")
doc = nlp("They are")
print(doc[0].lemma_)
# -PRON-
```

View File

@ -70,7 +70,6 @@ of the two. The system works as follows:
lemmatizer also accepts list-based exception files, acquired from
[WordNet](https://wordnet.princeton.edu/).
## Dependency Parsing {#dependency-parse model="parser"}
spaCy features a fast and accurate syntactic dependency parser, and has a rich
@ -93,7 +92,7 @@ get the noun chunks in a document, simply iterate over
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
for chunk in doc.noun_chunks:
print(chunk.text, chunk.root.text, chunk.root.dep_,
chunk.root.head.text)
@ -124,7 +123,7 @@ get the string value with `.dep_`.
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
for token in doc:
print(token.text, token.dep_, token.head.text, token.head.pos_,
[child for child in token.children])
@ -161,7 +160,7 @@ import spacy
from spacy.symbols import nsubj, VERB
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
# Finding a verb with a subject from below — good
verbs = set()
@ -204,7 +203,7 @@ children.
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"bright red apples on the tree")
doc = nlp("bright red apples on the tree")
print([token.text for token in doc[2].lefts]) # ['bright', 'red']
print([token.text for token in doc[2].rights]) # ['on']
print(doc[2].n_lefts) # 2
@ -216,7 +215,7 @@ print(doc[2].n_rights) # 1
import spacy
nlp = spacy.load("de_core_news_sm")
doc = nlp(u"schöne rote Äpfel auf dem Baum")
doc = nlp("schöne rote Äpfel auf dem Baum")
print([token.text for token in doc[2].lefts]) # ['schöne', 'rote']
print([token.text for token in doc[2].rights]) # ['auf']
```
@ -240,7 +239,7 @@ sequence of tokens. You can walk up the tree with the
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Credit and mortgage account holders must submit their requests")
doc = nlp("Credit and mortgage account holders must submit their requests")
root = [token for token in doc if token.head == token][0]
subject = list(root.lefts)[0]
@ -270,7 +269,7 @@ end-point of a range, don't forget to `+1`!
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Credit and mortgage account holders must submit their requests")
doc = nlp("Credit and mortgage account holders must submit their requests")
span = doc[doc[4].left_edge.i : doc[4].right_edge.i+1]
with doc.retokenize() as retokenizer:
retokenizer.merge(span)
@ -311,7 +310,7 @@ import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
# Since this is an interactive Jupyter environment, we can use displacy.render here
displacy.render(doc, style='dep')
```
@ -336,7 +335,7 @@ the `nlp` object.
```python
nlp = spacy.load("en_core_web_sm", disable=["parser"])
nlp = English().from_disk("/model", disable=["parser"])
doc = nlp(u"I don't want parsed", disable=["parser"])
doc = nlp("I don't want parsed", disable=["parser"])
```
<Infobox title="Important note: disabling pipeline components" variant="warning">
@ -350,10 +349,10 @@ Language class via [`from_disk`](/api/language#from_disk).
```diff
+ nlp = spacy.load("en_core_web_sm", disable=["parser"])
+ doc = nlp(u"I don't want parsed", disable=["parser"])
+ doc = nlp("I don't want parsed", disable=["parser"])
- nlp = spacy.load("en_core_web_sm", parser=False)
- doc = nlp(u"I don't want parsed", parse=False)
- doc = nlp("I don't want parsed", parse=False)
```
</Infobox>
@ -398,7 +397,7 @@ on a token, it will return an empty string.
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"San Francisco considers banning sidewalk delivery robots")
doc = nlp("San Francisco considers banning sidewalk delivery robots")
# document level
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
@ -407,8 +406,8 @@ print(ents)
# token level
ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_]
ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_]
print(ent_san) # [u'San', u'B', u'GPE']
print(ent_francisco) # [u'Francisco', u'I', u'GPE']
print(ent_san) # ['San', 'B', 'GPE']
print(ent_francisco) # ['Francisco', 'I', 'GPE']
```
| Text | ent_iob | ent_iob\_ | ent_type\_ | Description |
@ -435,18 +434,17 @@ import spacy
from spacy.tokens import Span
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"FB is hiring a new Vice President of global policy")
doc = nlp("FB is hiring a new Vice President of global policy")
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print('Before', ents)
# the model didn't recognise "FB" as an entity :(
ORG = doc.vocab.strings[u"ORG"] # get hash value of entity label
fb_ent = Span(doc, 0, 1, label=ORG) # create a Span for the new entity
fb_ent = Span(doc, 0, 1, label="ORG") # create a Span for the new entity
doc.ents = list(doc.ents) + [fb_ent]
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print('After', ents)
# [(u'FB', 0, 2, 'ORG')] 🎉
# [('FB', 0, 2, 'ORG')] 🎉
```
Keep in mind that you need to create a `Span` with the start and end index of
@ -468,13 +466,13 @@ import spacy
from spacy.attrs import ENT_IOB, ENT_TYPE
nlp = spacy.load("en_core_web_sm")
doc = nlp.make_doc(u"London is a big city in the United Kingdom.")
doc = nlp.make_doc("London is a big city in the United Kingdom.")
print("Before", doc.ents) # []
header = [ENT_IOB, ENT_TYPE]
attr_array = numpy.zeros((len(doc), len(header)))
attr_array[0, 0] = 3 # B
attr_array[0, 1] = doc.vocab.strings[u"GPE"]
attr_array[0, 1] = doc.vocab.strings["GPE"]
doc.from_array(header, attr_array)
print("After", doc.ents) # [London]
```
@ -533,8 +531,8 @@ train_data = [
```
```python
doc = Doc(nlp.vocab, [u"rats", u"make", u"good", u"pets"])
gold = GoldParse(doc, entities=[u"U-ANIMAL", u"O", u"O", u"O"])
doc = Doc(nlp.vocab, ["rats", "make", "good", "pets"])
gold = GoldParse(doc, entities=["U-ANIMAL", "O", "O", "O"])
```
<Infobox>
@ -565,7 +563,7 @@ For more details and examples, see the
import spacy
from spacy import displacy
text = u"When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
@ -578,29 +576,27 @@ import DisplacyEntHtml from 'images/displacy-ent2.html'
## Entity Linking {#entity-linking}
To ground the named entities into the "real-world",
spaCy provides functionality to perform entity linking, which resolves a textual entity
to a unique identifier from a knowledge base (KB).
To ground the named entities into the "real-world", spaCy provides functionality
to perform entity linking, which resolves a textual entity to a unique
identifier from a knowledge base (KB).
The default model assigns WikiData identifiers, but you can create your own
[`KnowledgeBase`](/api/kb) and [train a new Entity Linking model](/usage/training#entity-linker) using
that custom-made KB.
[`KnowledgeBase`](/api/kb) and
[train a new Entity Linking model](/usage/training#entity-linker) using that
custom-made KB.
### Accessing entity identifiers {#entity-linking-accessing}
### Accessing entity identifiers {#accessing}
The annotated KB identifier is accessible as either a hash value
or as a string, using the attributes
`ent.kb_id` and `ent.kb_id_` of a [`Span`](/api/span) object,
or the `ent_kb_id` and `ent_kb_id_` attributes of a [`Token`](/api/token) object.
The annotated KB identifier is accessible as either a hash value or as a string,
using the attributes `ent.kb_id` and `ent.kb_id_` of a [`Span`](/api/span)
object, or the `ent_kb_id` and `ent_kb_id_` attributes of a
[`Token`](/api/token) object.
```python
### {executable="true"}
import spacy
nlp = spacy.load("my_custom_el_model")
doc = nlp(u"Ada Lovelace was born in London")
doc = nlp("Ada Lovelace was born in London")
# document level
ents = [(e.text, e.label_, e.kb_id_) for e in doc.ents]
@ -616,12 +612,12 @@ print(ent_london_5) # ['London', 'GPE', 'Q84']
```
| Text | ent_type\_ | ent_kb_id\_ |
| --------- | ---------- | ------------ |
| -------- | ---------- | ----------- |
| Ada | `"PERSON"` | `"Q7259"` |
| Lovelace | `"PERSON"` | `"Q7259"` |
| was | `""` | `""` |
| born | `""` | `""` |
| in | `""` | `""` |
| was | - | - |
| born | - | - |
| in | - | - |
| London | `"GPE"` | `"Q84"` |
## Tokenization {#tokenization}
@ -692,53 +688,36 @@ this specific field. Here's how to add a special case rule to an existing
```python
### {executable="true"}
import spacy
from spacy.symbols import ORTH, LEMMA, POS, TAG
from spacy.symbols import ORTH
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"gimme that") # phrase to tokenize
doc = nlp("gimme that") # phrase to tokenize
print([w.text for w in doc]) # ['gimme', 'that']
# add special case rule
special_case = [{ORTH: u"gim", LEMMA: u"give", POS: u"VERB"}, {ORTH: u"me"}]
nlp.tokenizer.add_special_case(u"gimme", special_case)
# Add special case rule
special_case = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", special_case)
# check new tokenization
print([w.text for w in nlp(u"gimme that")]) # ['gim', 'me', 'that']
# Pronoun lemma is returned as -PRON-!
print([w.lemma_ for w in nlp(u"gimme that")]) # ['give', '-PRON-', 'that']
# Check new tokenization
print([w.text for w in nlp("gimme that")]) # ['gim', 'me', 'that']
```
<Infobox title="Why -PRON-?" variant="warning">
For details on spaCy's custom pronoun lemma `-PRON-`,
[see here](/usage/#pron-lemma).
</Infobox>
The special case doesn't have to match an entire whitespace-delimited substring.
The tokenizer will incrementally split off punctuation, and keep looking up the
remaining substring:
```python
assert "gimme" not in [w.text for w in nlp(u"gimme!")]
assert "gimme" not in [w.text for w in nlp(u'("...gimme...?")')]
assert "gimme" not in [w.text for w in nlp("gimme!")]
assert "gimme" not in [w.text for w in nlp('("...gimme...?")')]
```
The special case rules have precedence over the punctuation splitting:
```python
special_case = [{ORTH: u"...gimme...?", LEMMA: u"give", TAG: u"VB"}]
nlp.tokenizer.add_special_case(u"...gimme...?", special_case)
assert len(nlp(u"...gimme...?")) == 1
nlp.tokenizer.add_special_case("...gimme...?", [{ORTH: "...gimme...?"}])
assert len(nlp("...gimme...?")) == 1
```
Because the special-case rules allow you to set arbitrary token attributes, such
as the part-of-speech, lemma, etc, they make a good mechanism for arbitrary
fix-up rules. Having this logic live in the tokenizer isn't very satisfying from
a design perspective, however, so the API may eventually be exposed on the
[`Language`](/api/language) class itself.
### How spaCy's tokenizer works {#how-tokenizer-works}
spaCy introduces a novel tokenization algorithm that gives a better balance
@ -838,7 +817,7 @@ def custom_tokenizer(nlp):
nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = custom_tokenizer(nlp)
doc = nlp(u"hello-world.")
doc = nlp("hello-world.")
print([t.text for t in doc])
```
@ -955,7 +934,7 @@ class WhitespaceTokenizer(object):
nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp(u"What's happened to me? he thought. It wasn't a dream.")
doc = nlp("What's happened to me? he thought. It wasn't a dream.")
print([t.text for t in doc])
```
@ -980,7 +959,7 @@ from spacy.tokens import Doc
from spacy.lang.en import English
nlp = English()
doc = Doc(nlp.vocab, words=[u"Hello", u",", u"world", u"!"],
doc = Doc(nlp.vocab, words=["Hello", ",", "world", "!"],
spaces=[False, True, False, False])
print([(t.text, t.text_with_ws, t.whitespace_) for t in doc])
```
@ -997,8 +976,8 @@ from spacy.tokens import Doc
from spacy.lang.en import English
nlp = English()
bad_spaces = Doc(nlp.vocab, words=[u"Hello", u",", u"world", u"!"])
good_spaces = Doc(nlp.vocab, words=[u"Hello", u",", u"world", u"!"],
bad_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"])
good_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"],
spaces=[False, True, False, False])
print(bad_spaces.text) # 'Hello , world !'
@ -1280,7 +1259,7 @@ that yields [`Span`](/api/span) objects.
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"This is a sentence. This is another sentence.")
doc = nlp("This is a sentence. This is another sentence.")
for sent in doc.sents:
print(sent.text)
```
@ -1300,7 +1279,7 @@ from spacy.lang.en import English
nlp = English() # just the language with no model
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)
doc = nlp(u"This is a sentence. This is another sentence.")
doc = nlp("This is a sentence. This is another sentence.")
for sent in doc.sents:
print(sent.text)
```
@ -1336,7 +1315,7 @@ take advantage of dependency-based sentence segmentation.
### {executable="true"}
import spacy
text = u"this is a sentence...hello...and another sentence."
text = "this is a sentence...hello...and another sentence."
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

View File

@ -120,7 +120,7 @@ python -m spacy download en_core_web_sm
```python
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"This is a sentence.")
doc = nlp("This is a sentence.")
```
<Infobox title="Important note" variant="warning">
@ -197,7 +197,7 @@ nlp = spacy.load("en_core_web_sm") # load model package "en_core_web_s
nlp = spacy.load("/path/to/en_core_web_sm") # load package from a directory
nlp = spacy.load("en") # load model with shortcut link "en"
doc = nlp(u"This is a sentence.")
doc = nlp("This is a sentence.")
```
<Infobox title="Tip: Preview model info">
@ -269,7 +269,7 @@ also `import` it and then call its `load()` method with no arguments:
import en_core_web_sm
nlp = en_core_web_sm.load()
doc = nlp(u"This is a sentence.")
doc = nlp("This is a sentence.")
```
How you choose to load your models ultimately depends on personal preference.

View File

@ -20,7 +20,7 @@ component** on the `Doc`, in order. It then returns the processed `Doc` that you
can work with.
```python
doc = nlp(u"This is a text")
doc = nlp("This is a text")
```
When processing large volumes of text, the statistical models are usually more
@ -29,7 +29,7 @@ efficient if you let them work on batches of texts. spaCy's
processed `Doc` objects. The batching is done internally.
```diff
texts = [u"This is a text", u"These are lots of texts", u"..."]
texts = ["This is a text", "These are lots of texts", "..."]
- docs = [nlp(text) for text in texts]
+ docs = list(nlp.pipe(texts))
```
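
A minimal, runnable sketch of the batched form described above (the model name and the `batch_size` value are just illustrative):

```python
### Streaming texts with nlp.pipe (sketch)
import spacy

nlp = spacy.load("en_core_web_sm")
texts = ["This is a text", "These are lots of texts", "..."]

# nlp.pipe streams the input and batches it internally,
# yielding processed Doc objects in order
for doc in nlp.pipe(texts, batch_size=50):
    print([token.text for token in doc])
```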
@ -172,7 +172,7 @@ which is then processed by the component next in the pipeline.
```python
### The pipeline under the hood
doc = nlp.make_doc(u"This is a sentence") # create a Doc from raw text
doc = nlp.make_doc("This is a sentence") # create a Doc from raw text
for name, proc in nlp.pipeline: # iterate over components in order
doc = proc(doc) # apply each component
```
@ -263,12 +263,12 @@ blocks.
### Disable for block
# 1. Use as a contextmanager
with nlp.disable_pipes("tagger", "parser"):
doc = nlp(u"I won't be tagged and parsed")
doc = nlp(u"I will be tagged and parsed")
doc = nlp("I won't be tagged and parsed")
doc = nlp("I will be tagged and parsed")
# 2. Restore manually
disabled = nlp.disable_pipes("ner")
doc = nlp(u"I won't have named entities")
doc = nlp("I won't have named entities")
disabled.restore()
```
@ -295,11 +295,11 @@ initializing a Language class via [`from_disk`](/api/language#from_disk).
```diff
- nlp = spacy.load('en', tagger=False, entity=False)
- doc = nlp(u"I don't want parsed", parse=False)
- doc = nlp("I don't want parsed", parse=False)
+ nlp = spacy.load("en", disable=["ner"])
+ nlp.remove_pipe("parser")
+ doc = nlp(u"I don't want parsed")
+ doc = nlp("I don't want parsed")
```
</Infobox>
@ -376,7 +376,7 @@ def my_component(doc):
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(my_component, name="print_info", last=True)
print(nlp.pipe_names) # ['tagger', 'parser', 'ner', 'print_info']
doc = nlp(u"This is a sentence.")
doc = nlp("This is a sentence.")
```
@ -426,14 +426,14 @@ class EntityMatcher(object):
return doc
nlp = spacy.load("en_core_web_sm")
terms = (u"cat", u"dog", u"tree kangaroo", u"giant sea spider")
terms = ("cat", "dog", "tree kangaroo", "giant sea spider")
entity_matcher = EntityMatcher(nlp, terms, "ANIMAL")
nlp.add_pipe(entity_matcher, after="ner")
print(nlp.pipe_names) # The components in the pipeline
doc = nlp(u"This is a text about Barack Obama and a tree kangaroo")
doc = nlp("This is a text about Barack Obama and a tree kangaroo")
print([(ent.text, ent.label_) for ent in doc.ents])
```
@ -471,7 +471,7 @@ def custom_sentencizer(doc):
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(custom_sentencizer, before="parser") # Insert before the parser
doc = nlp(u"This is. A sentence. | This is. Another sentence.")
doc = nlp("This is. A sentence. | This is. Another sentence.")
for sent in doc.sents:
print(sent.text)
```
@ -517,7 +517,7 @@ config parameters are passed all the way down from
components with custom settings:
```python
nlp = spacy.load("your_custom_model", terms=(u"tree kangaroo"), label="ANIMAL")
nlp = spacy.load("your_custom_model", terms=["tree kangaroo"], label="ANIMAL")
```
<Infobox title="Important note" variant="warning">
@ -617,7 +617,7 @@ raise an `AttributeError`.
### Example
from spacy.tokens import Doc, Span, Token
fruits = [u"apple", u"pear", u"banana", u"orange", u"strawberry"]
fruits = ["apple", "pear", "banana", "orange", "strawberry"]
is_fruit_getter = lambda token: token.text in fruits
has_fruit_getter = lambda obj: any([t.text in fruits for t in obj])
@ -629,7 +629,7 @@ Span.set_extension("has_fruit", getter=has_fruit_getter)
> #### Usage example
>
> ```python
> doc = nlp(u"I have an apple and a melon")
> doc = nlp("I have an apple and a melon")
> assert doc[3]._.is_fruit # get Token attributes
> assert not doc[0]._.is_fruit
> assert doc._.has_fruit # get Doc attributes

View File

@ -90,7 +90,7 @@ the pattern is not going to produce any results. When developing complex
patterns, make sure to check examples against spaCy's tokenization:
```python
doc = nlp(u"A complex-example,!")
doc = nlp("A complex-example,!")
print([token.text for token in doc])
```
@ -113,7 +113,7 @@ matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", None, pattern)
doc = nlp(u"Hello, world! Hello world!")
doc = nlp("Hello, world! Hello world!")
matches = matcher(doc)
for match_id, start, end in matches:
string_id = nlp.vocab.strings[match_id] # Get string representation
@ -447,7 +447,7 @@ def add_event_ent(matcher, doc, i, matches):
pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
matcher.add("GoogleIO", add_event_ent, pattern)
doc = nlp(u"This is a text about Google I/O")
doc = nlp("This is a text about Google I/O")
matches = matcher(doc)
```
@ -539,7 +539,7 @@ class BadHTMLMerger(object):
nlp = spacy.load("en_core_web_sm")
html_merger = BadHTMLMerger(nlp)
nlp.add_pipe(html_merger, last=True) # Add component to the pipeline
doc = nlp(u"Hello<br>world! <br/> This is a test.")
doc = nlp("Hello<br>world! <br/> This is a test.")
for token in doc:
print(token.text, token._.bad_html)
@ -617,7 +617,7 @@ def collect_sents(matcher, doc, i, matches):
pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"},
{"POS": "ADJ"}]
matcher.add("FacebookIs", collect_sents, pattern) # add pattern
doc = nlp(u"I'd say that Facebook is evil. Facebook is pretty cool, right?")
doc = nlp("I'd say that Facebook is evil. Facebook is pretty cool, right?")
matches = matcher(doc)
# Serve visualization of sentences containing match with displaCy
@ -673,7 +673,7 @@ pattern = [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"},
{"ORTH": "-", "OP": "?"}, {"SHAPE": "ddd"}]
matcher.add("PHONE_NUMBER", None, pattern)
doc = nlp(u"Call me at (123) 456 789 or (123) 456 789!")
doc = nlp("Call me at (123) 456 789 or (123) 456 789!")
print([t.text for t in doc])
matches = matcher(doc)
for match_id, start, end in matches:
@ -719,8 +719,8 @@ from spacy.matcher import Matcher
nlp = English() # We only want the tokenizer, so no need to load a model
matcher = Matcher(nlp.vocab)
pos_emoji = [u"😀", u"😃", u"😂", u"🤣", u"😊", u"😍"] # Positive emoji
neg_emoji = [u"😞", u"😠", u"😩", u"😢", u"😭", u"😒"] # Negative emoji
pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"] # Positive emoji
neg_emoji = ["😞", "😠", "😩", "😢", "😭", "😒"] # Negative emoji
# Add patterns to match one or more emoji tokens
pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji]
@ -740,7 +740,7 @@ matcher.add("SAD", label_sentiment, *neg_patterns) # Add negative pattern
# Add pattern for valid hashtag, i.e. '#' plus any ASCII token
matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}])
doc = nlp(u"Hello world 😀 #MondayMotivation")
doc = nlp("Hello world 😀 #MondayMotivation")
matches = matcher(doc)
for match_id, start, end in matches:
string_id = doc.vocab.strings[match_id] # Look up string ID
@ -797,7 +797,7 @@ matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}])
# Register token extension
Token.set_extension("is_hashtag", default=False)
doc = nlp(u"Hello world 😀 #MondayMotivation")
doc = nlp("Hello world 😀 #MondayMotivation")
matches = matcher(doc)
hashtags = []
for match_id, start, end in matches:
@ -838,13 +838,13 @@ from spacy.matcher import PhraseMatcher
nlp = spacy.load('en_core_web_sm')
matcher = PhraseMatcher(nlp.vocab)
terms = [u"Barack Obama", u"Angela Merkel", u"Washington, D.C."]
terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]
# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", None, *patterns)
doc = nlp(u"German Chancellor Angela Merkel and US President Barack Obama "
u"converse in the Oval Office inside the White House in Washington, D.C.")
doc = nlp("German Chancellor Angela Merkel and US President Barack Obama "
"converse in the Oval Office inside the White House in Washington, D.C.")
matches = matcher(doc)
for match_id, start, end in matches:
span = doc[start:end]
@ -853,8 +853,8 @@ for match_id, start, end in matches:
Since spaCy is used for processing both the patterns and the text to be matched,
you won't have to worry about specific tokenization; for example, you can
simply pass in `nlp(u"Washington, D.C.")` and won't have to write a complex
token pattern covering the exact tokenization of the term.
simply pass in `nlp("Washington, D.C.")` and won't have to write a complex token
pattern covering the exact tokenization of the term.
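
As a minimal sketch of the point above (the match key `"PLACES"` and the surrounding sentence are made up for illustration):

```python
### Matching on the tokenizer's own output (sketch)
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
# The pattern Doc already carries spaCy's tokenization of "Washington, D.C.",
# so no hand-written token pattern is needed
matcher.add("PLACES", None, nlp.make_doc("Washington, D.C."))

doc = nlp("She moved from Washington, D.C. to Berlin.")
print([doc[start:end].text for match_id, start, end in matcher(doc)])
```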
<Infobox title="Important note on creating patterns" variant="warning">
@ -889,10 +889,10 @@ from spacy.matcher import PhraseMatcher
nlp = English()
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = [nlp.make_doc(name) for name in [u"Angela Merkel", u"Barack Obama"]]
patterns = [nlp.make_doc(name) for name in ["Angela Merkel", "Barack Obama"]]
matcher.add("Names", None, *patterns)
doc = nlp(u"angela merkel and us president barack Obama")
doc = nlp("angela merkel and us president barack Obama")
for match_id, start, end in matcher(doc):
print("Matched based on lowercase token text:", doc[start:end])
```
@ -924,9 +924,9 @@ from spacy.matcher import PhraseMatcher
nlp = English()
matcher = PhraseMatcher(nlp.vocab, attr="SHAPE")
matcher.add("IP", None, nlp(u"127.0.0.1"), nlp(u"127.127.0.0"))
matcher.add("IP", None, nlp("127.0.0.1"), nlp("127.127.0.0"))
doc = nlp(u"Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.")
doc = nlp("Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.")
for match_id, start, end in matcher(doc):
print("Matched based on token shape:", doc[start:end])
```
@ -982,7 +982,7 @@ patterns = [{"label": "ORG", "pattern": "Apple"},
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc = nlp(u"Apple is opening its first big office in San Francisco.")
doc = nlp("Apple is opening its first big office in San Francisco.")
print([(ent.text, ent.label_) for ent in doc.ents])
```
@ -1006,7 +1006,7 @@ patterns = [{"label": "ORG", "pattern": "MyCorp Inc."}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
doc = nlp(u"MyCorp Inc. is a company in the U.S.")
doc = nlp("MyCorp Inc. is a company in the U.S.")
print([(ent.text, ent.label_) for ent in doc.ents])
```

View File

@ -64,7 +64,7 @@ _then_ loads in the binary data. You can read more about this process
> #### Example
>
> ```python
> doc = nlp(u"This is a text.")
> doc = nlp("This is a text.")
> data = pickle.dumps(doc)
> ```
@ -84,8 +84,8 @@ the _same_ `Vocab` object, it will only be included once.
```python
### Pickling objects with shared data {highlight="8-9"}
doc1 = nlp(u"Hello world")
doc2 = nlp(u"This is a test")
doc1 = nlp("Hello world")
doc2 = nlp("This is a test")
doc1_data = pickle.dumps(doc1)
doc2_data = pickle.dumps(doc2)
@ -347,7 +347,7 @@ spaCy is now able to create the pipeline component `'snek'`:
>>> nlp = English()
>>> snek = nlp.create_pipe("snek") # this now works! 🐍🎉
>>> nlp.add_pipe(snek)
>>> doc = nlp(u"I am snek")
>>> doc = nlp("I am snek")
--..,_ _,.--.
`'.'. .'`__ o `;__.
'.'. .'.'` '---'` `
@ -497,8 +497,8 @@ If you're training a named entity recognition model for a custom domain, you may
end up training different labels that don't have pre-defined colors in the
[`displacy` visualizer](/usage/visualizers#ent). The `spacy_displacy_colors`
entry point lets you define a dictionary of entity labels mapped to their color
values. It's added to the pre-defined colors and can also overwrite
existing values.
values. It's added to the pre-defined colors and can also overwrite existing
values.
> #### Domain-specific NER labels
>
@ -528,8 +528,8 @@ setup(
```
After installing the package, the custom colors will be used when
visualizing text with `displacy`. Whenever the label `SNEK` is assigned, it
will be displayed in `#3dff74`.
visualizing text with `displacy`. Whenever the label `SNEK` is assigned, it will
be displayed in `#3dff74`.
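
For example, a minimal sketch of what that looks like in practice (the blank pipeline and the example sentence are only for illustration; the entity is set manually since no model predicts `SNEK`):

```python
### Rendering a custom SNEK entity (sketch)
import spacy
from spacy import displacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("I am snek")
# Manually assign a SNEK entity so the custom color from the entry point is used
doc.ents = [Span(doc, 2, 3, label="SNEK")]
html = displacy.render(doc, style="ent")  # "snek" is highlighted in #3dff74
```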
import DisplaCyEntSnekHtml from 'images/displacy-ent-snek.html'

View File

@ -179,7 +179,7 @@ processed `Doc`:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
print(token.text, token.pos_, token.dep_)
```
@ -240,8 +240,8 @@ of a model, see the usage guides on
<Infobox title="📖 Entity Linking">
To learn more about entity linking in spaCy, and how to **train and update**
the entity linker predictions, see the usage guides on
To learn more about entity linking in spaCy, and how to **train and update** the
entity linker predictions, see the usage guides on
[entity linking](/usage/linguistic-features#entity-linking) and
[training the entity linker](/usage/training#entity-linker).
@ -307,8 +307,8 @@ its hash, or a hash to get its string:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"I love coffee")
print(doc.vocab.strings[u"coffee"]) # 3197928453018144401
doc = nlp("I love coffee")
print(doc.vocab.strings["coffee"]) # 3197928453018144401
print(doc.vocab.strings[3197928453018144401]) # 'coffee'
```
@ -331,7 +331,7 @@ ever change. Its hash value will also always be the same.
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"I love coffee")
doc = nlp("I love coffee")
for word in doc:
lexeme = doc.vocab[word.text]
print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
@ -372,14 +372,14 @@ from spacy.tokens import Doc
from spacy.vocab import Vocab
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"I love coffee") # Original Doc
print(doc.vocab.strings[u"coffee"]) # 3197928453018144401
doc = nlp("I love coffee") # Original Doc
print(doc.vocab.strings["coffee"]) # 3197928453018144401
print(doc.vocab.strings[3197928453018144401]) # 'coffee' 👍
empty_doc = Doc(Vocab()) # New Doc with empty Vocab
# empty_doc.vocab.strings[3197928453018144401] will raise an error :(
empty_doc.vocab.strings.add(u"coffee") # Add "coffee" and generate hash
empty_doc.vocab.strings.add("coffee") # Add "coffee" and generate hash
print(empty_doc.vocab.strings[3197928453018144401]) # 'coffee' 👍
new_doc = Doc(doc.vocab) # Create new doc with first doc's vocab
@ -396,20 +396,24 @@ it.
## Knowledge Base {#kb}
To support the entity linking task, spaCy stores external knowledge in a
[`KnowledgeBase`](/api/kb). The knowledge base (KB) uses the `Vocab` to store its
data efficiently.
[`KnowledgeBase`](/api/kb). The knowledge base (KB) uses the `Vocab` to store
its data efficiently.
> - **Mention**: A textual occurrence of a named entity, e.g. 'Miss Lovelace'.
> - **KB ID**: A unique identifier refering to a particular real-world concept, e.g. 'Q7259'.
> - **Alias**: A plausible synonym or description for a certain KB ID, e.g. 'Ada Lovelace'.
> - **Prior probability**: The probability of a certain mention resolving to a certain KB ID,
prior to knowing anything about the context in which the mention is used.
> - **Entity vector**: A pretrained word vector capturing the entity description.
> - **KB ID**: A unique identifier referring to a particular real-world concept,
> e.g. 'Q7259'.
> - **Alias**: A plausible synonym or description for a certain KB ID, e.g. 'Ada
> Lovelace'.
> - **Prior probability**: The probability of a certain mention resolving to a
> certain KB ID, prior to knowing anything about the context in which the
> mention is used.
> - **Entity vector**: A pretrained word vector capturing the entity
> description.
A knowledge base is created by first adding all entities to it. Next, for each
potential mention or alias, a list of relevant KB IDs and their prior probabilities
is added. The sum of these prior probabilities should never exceed 1 for any given alias.
potential mention or alias, a list of relevant KB IDs and their prior
probabilities is added. The sum of these prior probabilities should never exceed
1 for any given alias.
```python
### {executable="true"}
@ -436,10 +440,10 @@ print("Number of aliases in KB:", kb.get_size_aliases()) # 2
### Candidate generation
Given a textual entity, the Knowledge Base can provide a list of plausible candidates or
entity identifiers. The [`EntityLinker`](/api/entitylinker) will take this list of candidates
as input, and disambiguate the mention to the most probable identifier, given the
document context.
Given a textual entity, the Knowledge Base can provide a list of plausible
candidates or entity identifiers. The [`EntityLinker`](/api/entitylinker) will
take this list of candidates as input, and disambiguate the mention to the most
probable identifier, given the document context.
```python
### {executable="true"}
@ -520,11 +524,11 @@ python -m spacy download de_core_news_sm
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Hello, world. Here are two sentences.")
doc = nlp("Hello, world. Here are two sentences.")
print([t.text for t in doc])
nlp_de = spacy.load("de_core_news_sm")
doc_de = nlp_de(u"Ich bin ein Berliner.")
doc_de = nlp_de("Ich bin ein Berliner.")
print([t.text for t in doc_de])
```
@ -543,8 +547,8 @@ print([t.text for t in doc_de])
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Peach emoji is where it has always been. Peach is the superior "
u"emoji. It's outranking eggplant 🍑 ")
doc = nlp("Peach emoji is where it has always been. Peach is the superior "
"emoji. It's outranking eggplant 🍑 ")
print(doc[0].text) # 'Peach'
print(doc[1].text) # 'emoji'
print(doc[-1].text) # '🍑'
@ -572,7 +576,7 @@ print(sentences[1].text) # 'Peach is the superior emoji.'
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
apple = doc[0]
print("Fine-grained POS tag", apple.pos_, apple.pos)
print("Coarse-grained POS tag", apple.tag_, apple.tag)
@ -600,20 +604,20 @@ print("Like an email address?", billion.like_email)
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"I love coffee")
doc = nlp("I love coffee")
coffee_hash = nlp.vocab.strings[u"coffee"] # 3197928453018144401
coffee_hash = nlp.vocab.strings["coffee"] # 3197928453018144401
coffee_text = nlp.vocab.strings[coffee_hash] # 'coffee'
print(coffee_hash, coffee_text)
print(doc[2].orth, coffee_hash) # 3197928453018144401
print(doc[2].text, coffee_text) # 'coffee'
beer_hash = doc.vocab.strings.add(u"beer") # 3073001599257881079
beer_hash = doc.vocab.strings.add("beer") # 3073001599257881079
beer_text = doc.vocab.strings[beer_hash] # 'beer'
print(beer_hash, beer_text)
unicorn_hash = doc.vocab.strings.add(u"🦄 ") # 18234233413267120783
unicorn_text = doc.vocab.strings[unicorn_hash] # '🦄 '
unicorn_hash = doc.vocab.strings.add("🦄") # 18234233413267120783
unicorn_text = doc.vocab.strings[unicorn_hash] # '🦄'
print(unicorn_hash, unicorn_text)
```
@ -629,19 +633,17 @@ print(unicorn_hash, unicorn_text)
```python
### {executable="true"}
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"San Francisco considers banning sidewalk delivery robots")
for ent in doc.ents:
print(ent.text, ent.start_char, ent.end_char, ent.label_)
from spacy.tokens import Span
doc = nlp(u"FB is hiring a new VP of global policy")
doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u"ORG"])]
nlp = spacy.load("en_core_web_sm")
doc = nlp("San Francisco considers banning sidewalk delivery robots")
for ent in doc.ents:
print(ent.text, ent.start_char, ent.end_char, ent.label_)
doc = nlp("FB is hiring a new VP of global policy")
doc.ents = [Span(doc, 0, 1, label="ORG")]
for ent in doc.ents:
print(ent.text, ent.start_char, ent.end_char, ent.label_)
```
<Infobox>
@ -657,7 +659,7 @@ import spacy
import random
nlp = spacy.load("en_core_web_sm")
train_data = [(u"Uber blew through $1 million", {"entities": [(0, 4, "ORG")]})]
train_data = [("Uber blew through $1 million", {"entities": [(0, 4, "ORG")]})]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
@ -685,11 +687,11 @@ nlp.to_disk("/model")
```python
from spacy import displacy
doc_dep = nlp(u"This is a sentence.")
doc_dep = nlp("This is a sentence.")
displacy.serve(doc_dep, style="dep")
doc_ent = nlp(u"When Sebastian Thrun started working on self-driving cars at Google "
u"in 2007, few people outside of the company took him seriously.")
doc_ent = nlp("When Sebastian Thrun started working on self-driving cars at Google "
"in 2007, few people outside of the company took him seriously.")
displacy.serve(doc_ent, style="ent")
```
@ -707,7 +709,7 @@ displacy.serve(doc_ent, style="ent")
import spacy
nlp = spacy.load("en_core_web_md")
doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.")
doc = nlp("Apple and banana are similar. Pasta and hippo aren't.")
apple = doc[0]
banana = doc[2]
@ -769,7 +771,7 @@ pattern2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "
matcher.add("GoogleIO", None, pattern1) # Match "Google I/O" or "Google i/o"
matcher.add("HAPPY", set_sentiment, *pattern2) # Match one or more happy emoji
doc = nlp(u"A text about Google I/O 😀😀")
doc = nlp("A text about Google I/O 😀😀")
matches = matcher(doc)
for match_id, start, end in matches:
@ -789,7 +791,7 @@ print("Sentiment", doc.sentiment)
### Minibatched stream processing {#lightning-tour-minibatched}
```python
texts = [u"One document.", u"...", u"Lots of documents"]
texts = ["One document.", "...", "Lots of documents"]
# .pipe streams input, and produces streaming output
iter_texts = (texts[i % 3] for i in range(100000000))
for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50)):
@ -805,8 +807,8 @@ for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50)):
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"When Sebastian Thrun started working on self-driving cars at Google "
u"in 2007, few people outside of the company took him seriously.")
doc = nlp("When Sebastian Thrun started working on self-driving cars at Google "
"in 2007, few people outside of the company took him seriously.")
dep_labels = []
for token in doc:
@ -831,7 +833,7 @@ import spacy
from spacy.attrs import ORTH, LIKE_URL
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"Check out https://spacy.io")
doc = nlp("Check out https://spacy.io")
for token in doc:
print(token.text, token.orth, token.like_url)
@ -877,7 +879,7 @@ def put_spans_around_tokens(doc):
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"This is a test.\\n\\nHello world.")
doc = nlp("This is a test.\\n\\nHello world.")
html = put_spans_around_tokens(doc)
print(html)
```

View File

@ -298,10 +298,10 @@ imports. It also makes it easier to structure and load your training data.
```python
### Simple training loop
TRAIN_DATA = [
(u"Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}),
(u"Google rebrands its business apps", {"entities": [(0, 6, "ORG")]})]
("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}),
("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]})]
nlp = spacy.blank('en')
nlp = spacy.blank("en")
optimizer = nlp.begin_training()
for i in range(20):
random.shuffle(TRAIN_DATA)
@ -498,7 +498,7 @@ like this:
![Custom dependencies](../images/displacy-custom-parser.svg)
```python
doc = nlp(u"find a hotel with good wifi")
doc = nlp("find a hotel with good wifi")
print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != '-'])
# [('find', 'ROOT', 'find'), ('hotel', 'PLACE', 'find'),
# ('good', 'QUALITY', 'wifi'), ('wifi', 'ATTRIBUTE', 'hotel')]

View File

@ -99,8 +99,8 @@ flexibility.
>
> ```python
> matcher = PhraseMatcher(nlp.vocab, attr="POS")
> matcher.add("PATTERN", None, nlp(u"I love cats"))
> doc = nlp(u"You like dogs")
> matcher.add("PATTERN", None, nlp("I love cats"))
> doc = nlp("You like dogs")
> matches = matcher(doc)
> ```
@ -122,9 +122,9 @@ or `POS` for finding sequences of the same part-of-speech tags.
> #### Example
>
> ```python
> doc = nlp(u"I like David Bowie")
> doc = nlp("I like David Bowie")
> with doc.retokenize() as retokenizer:
> attrs = {"LEMMA": u"David Bowie"}
> attrs = {"LEMMA": "David Bowie"}
> retokenizer.merge(doc[2:4], attrs=attrs)
> ```

View File

@ -156,7 +156,7 @@ spaCy or plug in your own machine learning models.
> for itn in range(100):
> for doc, gold in train_data:
> nlp.update([doc], [gold])
> doc = nlp(u"This is a text.")
> doc = nlp("This is a text.")
> print(doc.cats)
> ```
@ -179,13 +179,13 @@ network to assign position-sensitive vectors to each word in the document.
> #### Example
>
> ```python
> doc = nlp(u"I love coffee")
> assert doc.vocab.strings[u"coffee"] == 3197928453018144401
> assert doc.vocab.strings[3197928453018144401] == u"coffee"
> doc = nlp("I love coffee")
> assert doc.vocab.strings["coffee"] == 3197928453018144401
> assert doc.vocab.strings[3197928453018144401] == "coffee"
>
> beer_hash = doc.vocab.strings.add(u"beer")
> assert doc.vocab.strings[u"beer"] == beer_hash
> assert doc.vocab.strings[beer_hash] == u"beer"
> beer_hash = doc.vocab.strings.add("beer")
> assert doc.vocab.strings["beer"] == beer_hash
> assert doc.vocab.strings[beer_hash] == "beer"
> ```
The [`StringStore`](/api/stringstore) now resolves all strings to hash values
@ -275,7 +275,7 @@ language, you can import the class directly, e.g.
>
> ```python
> from spacy import displacy
> doc = nlp(u"This is a sentence about Facebook.")
> doc = nlp("This is a sentence about Facebook.")
> displacy.serve(doc, style="dep") # run the web server
> html = displacy.render(doc, style="ent") # generate HTML
> ```
@ -322,7 +322,7 @@ lookup-based lemmatization and **many new languages**!
> matcher.add('HEARTS', None, [{"ORTH": "❤️", "OP": '+'}])
>
> phrasematcher = PhraseMatcher(nlp.vocab)
> phrasematcher.add("OBAMA", None, nlp(u"Barack Obama"))
> phrasematcher.add("OBAMA", None, nlp("Barack Obama"))
> ```
Patterns can now be added to the matcher by calling
@ -477,12 +477,12 @@ to the `disable` keyword argument on load, or by using
[`disable_pipes`](/api/language#disable_pipes) as a method or context manager:
```diff
- nlp = spacy.load("en", tagger=False, entity=False)
- doc = nlp(u"I don't want parsed", parse=False)
- nlp = spacy.load("en_core_web_sm", tagger=False, entity=False)
- doc = nlp("I don't want parsed", parse=False)
+ nlp = spacy.load("en", disable=["tagger", "ner"])
+ nlp = spacy.load("en_core_web_sm", disable=["tagger", "ner"])
+ with nlp.disable_pipes("parser"):
+ doc = nlp(u"I don't want parsed")
+ doc = nlp("I don't want parsed")
```
To add spaCy's built-in pipeline components to your pipeline, you can still
@ -539,7 +539,7 @@ This means that your application can and should only pass around `Doc`
objects and refer to them as the single source of truth.
```diff
- doc = nlp(u"This is a regular doc")
- doc = nlp("This is a regular doc")
- doc_array = doc.to_array(["ORTH", "POS"])
- doc_with_meta = {"doc_array": doc_array, "meta": get_doc_meta(doc_array)}
@ -556,11 +556,11 @@ utilities that interact with the pipeline, consider moving this logic into its
own extension module.
```diff
- doc = nlp(u"Doc with a standard pipeline")
- doc = nlp("Doc with a standard pipeline")
- meta = get_meta(doc)
+ nlp.add_pipe(meta_component)
+ doc = nlp(u"Doc with a custom pipeline that assigns meta")
+ doc = nlp("Doc with a custom pipeline that assigns meta")
+ meta = doc._.meta
```
@ -572,12 +572,12 @@ to call [`StringStore.add`](/api/stringstore#add) explicitly. You can also now
be sure that the string-to-hash mapping will always match across vocabularies.
```diff
- nlp.vocab.strings[u"coffee"] # 3672
- other_nlp.vocab.strings[u"coffee"] # 40259
- nlp.vocab.strings["coffee"] # 3672
- other_nlp.vocab.strings["coffee"] # 40259
+ nlp.vocab.strings.add(u"coffee")
+ nlp.vocab.strings[u"coffee"] # 3197928453018144401
+ other_nlp.vocab.strings[u"coffee"] # 3197928453018144401
+ nlp.vocab.strings.add("coffee")
+ nlp.vocab.strings["coffee"] # 3197928453018144401
+ other_nlp.vocab.strings["coffee"] # 3197928453018144401
```
### Adding patterns and callbacks to the matcher {#migrating-matcher}

View File

@ -74,8 +74,8 @@ path to [`spacy.load()`](/api/top-level#spacy.load).
```python
nlp_latin = spacy.load("/tmp/la_vectors_wiki_lg")
doc1 = nlp_latin(u"Caecilius est in horto")
doc2 = nlp_latin(u"servus est in atrio")
doc1 = nlp_latin("Caecilius est in horto")
doc2 = nlp_latin("servus est in atrio")
doc1.similarity(doc2)
```
@ -168,10 +168,9 @@ vectors to the vocabulary, you can use the
### Adding vectors
from spacy.vocab import Vocab
vector_data = {u"dog": numpy.random.uniform(-1, 1, (300,)),
u"cat": numpy.random.uniform(-1, 1, (300,)),
u"orange": numpy.random.uniform(-1, 1, (300,))}
vector_data = {"dog": numpy.random.uniform(-1, 1, (300,)),
"cat": numpy.random.uniform(-1, 1, (300,)),
"orange": numpy.random.uniform(-1, 1, (300,))}
vocab = Vocab()
for word, vector in vector_data.items():
vocab.set_vector(word, vector)
@ -241,7 +240,7 @@ import cupy.cuda
from spacy.vectors import Vectors
vector_table = numpy.zeros((3, 300), dtype="f")
vectors = Vectors([u"dog", u"cat", u"orange"], vector_table)
vectors = Vectors(["dog", "cat", "orange"], vector_table)
with cupy.cuda.Device(0):
vectors.data = cupy.asarray(vectors.data)
```
@ -252,6 +251,6 @@ import torch
from spacy.vectors import Vectors
vector_table = numpy.zeros((3, 300), dtype="f")
vectors = Vectors([u"dog", u"cat", u"orange"], vector_table)
vectors = Vectors(["dog", "cat", "orange"], vector_table)
vectors.data = torch.Tensor(vectors.data).cuda(0)
```

View File

@ -48,7 +48,7 @@ import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"This is a sentence.")
doc = nlp("This is a sentence.")
displacy.serve(doc, style="dep")
```
@ -101,7 +101,7 @@ import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
text = u"""In ancient Rome, some neighbors live in three adjacent houses. In the center is the house of Senex, who lives there with wife Domina, son Hero, and several slaves, including head slave Hysterium and the musical's main character Pseudolus. A slave belonging to Hero, Pseudolus wishes to buy, win, or steal his freedom. One of the neighboring houses is owned by Marcus Lycus, who is a buyer and seller of beautiful women; the other belongs to the ancient Erronius, who is abroad searching for his long-lost children (stolen in infancy by pirates). One day, Senex and Domina go on a trip and leave Pseudolus in charge of Hero. Hero confides in Pseudolus that he is in love with the lovely Philia, one of the courtesans in the House of Lycus (albeit still a virgin)."""
text = """In ancient Rome, some neighbors live in three adjacent houses. In the center is the house of Senex, who lives there with wife Domina, son Hero, and several slaves, including head slave Hysterium and the musical's main character Pseudolus. A slave belonging to Hero, Pseudolus wishes to buy, win, or steal his freedom. One of the neighboring houses is owned by Marcus Lycus, who is a buyer and seller of beautiful women; the other belongs to the ancient Erronius, who is abroad searching for his long-lost children (stolen in infancy by pirates). One day, Senex and Domina go on a trip and leave Pseudolus in charge of Hero. Hero confides in Pseudolus that he is in love with the lovely Philia, one of the courtesans in the House of Lycus (albeit still a virgin)."""
doc = nlp(text)
sentence_spans = list(doc.sents)
displacy.serve(sentence_spans, style="dep")
@ -117,7 +117,7 @@ text.
import spacy
from spacy import displacy
text = u"When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
@ -168,7 +168,7 @@ add a headline to each visualization, you can add a `title` to its `user_data`.
User data is never touched or modified by spaCy.
```python
doc = nlp(u"This is a sentence about Google.")
doc = nlp("This is a sentence about Google.")
doc.user_data["title"] = "This is a title"
displacy.serve(doc, style="ent")
```
@ -193,7 +193,7 @@ import spacy
from spacy import displacy
# In[2]:
doc = nlp(u"Rats are various medium-sized, long-tailed rodents.")
doc = nlp("Rats are various medium-sized, long-tailed rodents.")
displacy.render(doc, style="dep")
# In[3]:
@ -209,7 +209,6 @@ rendering if auto-detection fails.
</Infobox>
![displaCy visualizer in a Jupyter notebook](../images/displacy_jupyter.jpg)
Internally, displaCy imports `display` and `HTML` from `IPython.core.display`
@ -236,8 +235,8 @@ import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
doc1 = nlp(u"This is a sentence.")
doc2 = nlp(u"This is another sentence.")
doc1 = nlp("This is a sentence.")
doc2 = nlp("This is another sentence.")
html = displacy.render([doc1, doc2], style="dep", page=True)
```
@ -281,7 +280,7 @@ from spacy import displacy
from pathlib import Path
nlp = spacy.load("en_core_web_sm")
sentences = [u"This is an example.", u"This is another one."]
sentences = ["This is an example.", "This is another one."]
for sent in sentences:
doc = nlp(sent)
svg = displacy.render(doc, style="dep", jupyter=False)

View File

@ -119,14 +119,14 @@
"emoji = Emoji(nlp)",
"nlp.add_pipe(emoji, first=True)",
"",
"doc = nlp(u'This is a test 😻 👍🏿')",
"doc = nlp('This is a test 😻 👍🏿')",
"assert doc._.has_emoji == True",
"assert doc[2:5]._.has_emoji == True",
"assert doc[0]._.is_emoji == False",
"assert doc[4]._.is_emoji == True",
"assert doc[5]._.emoji_desc == u'thumbs up dark skin tone'",
"assert doc[5]._.emoji_desc == 'thumbs up dark skin tone'",
"assert len(doc._.emoji) == 2",
"assert doc._.emoji[1] == (u'👍🏿', 5, u'thumbs up dark skin tone')"
"assert doc._.emoji[1] == ('👍🏿', 5, 'thumbs up dark skin tone')"
],
"author": "Ines Montani",
"author_links": {
@ -747,8 +747,8 @@
"s2v = Sense2VecComponent('/path/to/reddit_vectors-1.1.0')",
"nlp.add_pipe(s2v)",
"",
"doc = nlp(u\"A sentence about natural language processing.\")",
"assert doc[3].text == u'natural language processing'",
"doc = nlp(\"A sentence about natural language processing.\")",
"assert doc[3].text == 'natural language processing'",
"freq = doc[3]._.s2v_freq",
"vector = doc[3]._.s2v_vec",
"most_similar = doc[3]._.s2v_most_similar(3)",
@ -1297,7 +1297,7 @@
"",
"nlp = spacy.load('en')",
"nlp.add_pipe(BeneparComponent('benepar_en'))",
"doc = nlp(u'The time for action is now. It's never too late to do something.')",
"doc = nlp('The time for action is now. It's never too late to do something.')",
"sent = list(doc.sents)[0]",
"print(sent._.parse_string)",
"# (S (NP (NP (DT The) (NN time)) (PP (IN for) (NP (NN action)))) (VP (VBZ is) (ADVP (RB now))) (. .))",

View File

@ -65,7 +65,7 @@ const QuickstartInstall = ({ id, title, description, defaultLang, children }) =>
nlp = {pkg}.load()
</QS>
<QS lang={code} config="example" prompt="python">
doc = nlp(u"{exampleText}")
doc = nlp("{exampleText}")
</QS>
<QS lang={code} config="example" prompt="python">
print([