From 82c16b7943adcee47ee3ea376e32486546d66043 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 12 Sep 2019 16:11:15 +0200 Subject: [PATCH] Remove u-strings and fix formatting [ci skip] --- website/README.md | 13 +- website/docs/api/annotation.md | 206 +++++++++--------- website/docs/api/cython-classes.md | 6 +- website/docs/api/cython-structs.md | 6 +- website/docs/api/dependencyparser.md | 2 +- website/docs/api/doc.md | 79 ++++--- website/docs/api/entitylinker.md | 129 +++++------ website/docs/api/entityrecognizer.md | 12 +- website/docs/api/goldparse.md | 10 +- website/docs/api/language.md | 8 +- website/docs/api/lemmatizer.md | 8 +- website/docs/api/lexeme.md | 18 +- website/docs/api/matcher.md | 4 +- website/docs/api/phrasematcher.md | 16 +- website/docs/api/pipeline-functions.md | 8 +- website/docs/api/sentencizer.md | 2 +- website/docs/api/span.md | 74 +++---- website/docs/api/stringstore.md | 30 +-- website/docs/api/tagger.md | 13 +- website/docs/api/textcategorizer.md | 12 +- website/docs/api/token.md | 60 ++--- website/docs/api/tokenizer.md | 13 +- website/docs/api/top-level.md | 36 +-- website/docs/api/vectors.md | 22 +- website/docs/api/vocab.md | 28 +-- website/docs/usage/101/_named-entities.md | 12 +- website/docs/usage/101/_pos-deps.md | 4 +- website/docs/usage/101/_tokenization.md | 2 +- website/docs/usage/101/_vectors-similarity.md | 8 +- website/docs/usage/adding-languages.md | 8 +- website/docs/usage/index.md | 2 +- website/docs/usage/linguistic-features.md | 145 ++++++------ website/docs/usage/models.md | 6 +- website/docs/usage/processing-pipelines.md | 30 +-- website/docs/usage/rule-based-matching.md | 42 ++-- website/docs/usage/saving-loading.md | 16 +- website/docs/usage/spacy-101.md | 108 ++++----- website/docs/usage/training.md | 8 +- website/docs/usage/v2-1.md | 8 +- website/docs/usage/v2.md | 42 ++-- website/docs/usage/vectors-similarity.md | 15 +- website/docs/usage/visualizers.md | 17 +- website/meta/universe.json | 12 +- website/src/widgets/quickstart-models.js | 2 +- 44 files changed, 644 insertions(+), 658 deletions(-) diff --git a/website/README.md b/website/README.md index be817225d..a02d5a151 100644 --- a/website/README.md +++ b/website/README.md @@ -309,7 +309,7 @@ indented block as plain text and preserve whitespace. ### Using spaCy import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"This is a sentence.") +doc = nlp("This is a sentence.") for token in doc: print(token.text, token.pos_) ``` @@ -335,9 +335,9 @@ from spacy.matcher import Matcher nlp = spacy.load('en_core_web_sm') matcher = Matcher(nlp.vocab) -pattern = [{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}] -matcher.add('HelloWorld', None, pattern) -doc = nlp(u'Hello, world! Hello world!') +pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}] +matcher.add("HelloWorld", None, pattern) +doc = nlp("Hello, world! Hello world!") matches = matcher(doc) ``` @@ -360,7 +360,7 @@ interactive widget defaults to a regular code block. ### {executable="true"} import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"This is a sentence.") +doc = nlp("This is a sentence.") for token in doc: print(token.text, token.pos_) ``` @@ -457,7 +457,8 @@ sit amet dignissim justo congue. ## Setup and installation {#setup} Before running the setup, make sure your versions of -[Node](https://nodejs.org/en/) and [npm](https://www.npmjs.com/) are up to date. Node v10.15 or later is required. +[Node](https://nodejs.org/en/) and [npm](https://www.npmjs.com/) are up to date. 
+Node v10.15 or later is required. ```bash # Clone the repository diff --git a/website/docs/api/annotation.md b/website/docs/api/annotation.md index ac888cec9..2c52d197a 100644 --- a/website/docs/api/annotation.md +++ b/website/docs/api/annotation.md @@ -16,7 +16,7 @@ menu: > ```python > from spacy.lang.en import English > nlp = English() -> tokens = nlp(u"Some\\nspaces and\\ttab characters") +> tokens = nlp("Some\\nspaces and\\ttab characters") > tokens_text = [t.text for t in tokens] > assert tokens_text == ["Some", "\\n", "spaces", " ", "and", "\\t", "tab", "characters"] > ``` @@ -186,63 +186,63 @@ The German part-of-speech tagger uses the annotation scheme. We also map the tags to the simpler Google Universal POS tag set. -| Tag |  POS | Morphology | Description | -| --------- | ------- | ------------------------------------------- | ------------------------------------------------- | -| `$(` | `PUNCT` | `PunctType=brck` | other sentence-internal punctuation mark | -| `$,` | `PUNCT` | `PunctType=comm` | comma | -| `$.` | `PUNCT` | `PunctType=peri` | sentence-final punctuation mark | -| `ADJA` | `ADJ` | | adjective, attributive | -| `ADJD` | `ADJ` | `Variant=short` | adjective, adverbial or predicative | -| `ADV` | `ADV` | | adverb | -| `APPO` | `ADP` | `AdpType=post` | postposition | -| `APPR` | `ADP` | `AdpType=prep` | preposition; circumposition left | -| `APPRART` | `ADP` | `AdpType=prep PronType=art` | preposition with article | -| `APZR` | `ADP` | `AdpType=circ` | circumposition right | -| `ART` | `DET` | `PronType=art` | definite or indefinite article | -| `CARD` | `NUM` | `NumType=card` | cardinal number | -| `FM` | `X` | `Foreign=yes` | foreign language material | -| `ITJ` | `INTJ` | | interjection | -| `KOKOM` | `CONJ` | `ConjType=comp` | comparative conjunction | -| `KON` | `CONJ` | | coordinate conjunction | -| `KOUI` | `SCONJ` | | subordinate conjunction with "zu" and infinitive | -| `KOUS` | `SCONJ` | | subordinate conjunction with sentence | -| `NE` | `PROPN` | | proper noun | -| `NNE` | `PROPN` | | proper noun | -| `NN` | `NOUN` | | noun, singular or mass | -| `PROAV` | `ADV` | `PronType=dem` | pronominal adverb | -| `PDAT` | `DET` | `PronType=dem` | attributive demonstrative pronoun | -| `PDS` | `PRON` | `PronType=dem` | substituting demonstrative pronoun | -| `PIAT` | `DET` | `PronType=ind\|neg\|tot` | attributive indefinite pronoun without determiner | -| `PIS` | `PRON` | `PronType=ind\|neg\|tot` | substituting indefinite pronoun | -| `PPER` | `PRON` | `PronType=prs` | non-reflexive personal pronoun | -| `PPOSAT` | `DET` | `Poss=yes PronType=prs` | attributive possessive pronoun | -| `PPOSS` | `PRON` | `PronType=rel` | substituting possessive pronoun | -| `PRELAT` | `DET` | `PronType=rel` | attributive relative pronoun | -| `PRELS` | `PRON` | `PronType=rel` | substituting relative pronoun | -| `PRF` | `PRON` | `PronType=prs Reflex=yes` | reflexive personal pronoun | -| `PTKA` | `PART` | | particle with adjective or adverb | -| `PTKANT` | `PART` | `PartType=res` | answer particle | -| `PTKNEG` | `PART` | `Negative=yes` | negative particle | -| `PTKVZ` | `PART` | `PartType=vbp` | separable verbal particle | -| `PTKZU` | `PART` | `PartType=inf` | "zu" before infinitive | -| `PWAT` | `DET` | `PronType=int` | attributive interrogative pronoun | -| `PWAV` | `ADV` | `PronType=int` | adverbial interrogative or relative pronoun | -| `PWS` | `PRON` | `PronType=int` | substituting interrogative pronoun | -| `TRUNC` | `X` | `Hyph=yes` | word remnant | -| `VAFIN` | `AUX` | 
`Mood=ind VerbForm=fin` | finite verb, auxiliary | -| `VAIMP` | `AUX` | `Mood=imp VerbForm=fin` | imperative, auxiliary | -| `VAINF` | `AUX` | `VerbForm=inf` | infinitive, auxiliary | -| `VAPP` | `AUX` | `Aspect=perf VerbForm=fin` | perfect participle, auxiliary | -| `VMFIN` | `VERB` | `Mood=ind VerbForm=fin VerbType=mod` | finite verb, modal | -| `VMINF` | `VERB` | `VerbForm=fin VerbType=mod` | infinitive, modal | -| `VMPP` | `VERB` | `Aspect=perf VerbForm=part VerbType=mod` | perfect participle, modal | -| `VVFIN` | `VERB` | `Mood=ind VerbForm=fin` | finite verb, full | -| `VVIMP` | `VERB` | `Mood=imp VerbForm=fin` | imperative, full | -| `VVINF` | `VERB` | `VerbForm=inf` | infinitive, full | -| `VVIZU` | `VERB` | `VerbForm=inf` | infinitive with "zu", full | -| `VVPP` | `VERB` | `Aspect=perf VerbForm=part` | perfect participle, full | -| `XY` | `X` | | non-word containing non-letter | -| `SP` | `SPACE` | | space | +| Tag |  POS | Morphology | Description | +| --------- | ------- | ---------------------------------------- | ------------------------------------------------- | +| `$(` | `PUNCT` | `PunctType=brck` | other sentence-internal punctuation mark | +| `$,` | `PUNCT` | `PunctType=comm` | comma | +| `$.` | `PUNCT` | `PunctType=peri` | sentence-final punctuation mark | +| `ADJA` | `ADJ` | | adjective, attributive | +| `ADJD` | `ADJ` | `Variant=short` | adjective, adverbial or predicative | +| `ADV` | `ADV` | | adverb | +| `APPO` | `ADP` | `AdpType=post` | postposition | +| `APPR` | `ADP` | `AdpType=prep` | preposition; circumposition left | +| `APPRART` | `ADP` | `AdpType=prep PronType=art` | preposition with article | +| `APZR` | `ADP` | `AdpType=circ` | circumposition right | +| `ART` | `DET` | `PronType=art` | definite or indefinite article | +| `CARD` | `NUM` | `NumType=card` | cardinal number | +| `FM` | `X` | `Foreign=yes` | foreign language material | +| `ITJ` | `INTJ` | | interjection | +| `KOKOM` | `CONJ` | `ConjType=comp` | comparative conjunction | +| `KON` | `CONJ` | | coordinate conjunction | +| `KOUI` | `SCONJ` | | subordinate conjunction with "zu" and infinitive | +| `KOUS` | `SCONJ` | | subordinate conjunction with sentence | +| `NE` | `PROPN` | | proper noun | +| `NNE` | `PROPN` | | proper noun | +| `NN` | `NOUN` | | noun, singular or mass | +| `PROAV` | `ADV` | `PronType=dem` | pronominal adverb | +| `PDAT` | `DET` | `PronType=dem` | attributive demonstrative pronoun | +| `PDS` | `PRON` | `PronType=dem` | substituting demonstrative pronoun | +| `PIAT` | `DET` | `PronType=ind\|neg\|tot` | attributive indefinite pronoun without determiner | +| `PIS` | `PRON` | `PronType=ind\|neg\|tot` | substituting indefinite pronoun | +| `PPER` | `PRON` | `PronType=prs` | non-reflexive personal pronoun | +| `PPOSAT` | `DET` | `Poss=yes PronType=prs` | attributive possessive pronoun | +| `PPOSS` | `PRON` | `PronType=rel` | substituting possessive pronoun | +| `PRELAT` | `DET` | `PronType=rel` | attributive relative pronoun | +| `PRELS` | `PRON` | `PronType=rel` | substituting relative pronoun | +| `PRF` | `PRON` | `PronType=prs Reflex=yes` | reflexive personal pronoun | +| `PTKA` | `PART` | | particle with adjective or adverb | +| `PTKANT` | `PART` | `PartType=res` | answer particle | +| `PTKNEG` | `PART` | `Negative=yes` | negative particle | +| `PTKVZ` | `PART` | `PartType=vbp` | separable verbal particle | +| `PTKZU` | `PART` | `PartType=inf` | "zu" before infinitive | +| `PWAT` | `DET` | `PronType=int` | attributive interrogative pronoun | +| `PWAV` | `ADV` | `PronType=int` | 
adverbial interrogative or relative pronoun | +| `PWS` | `PRON` | `PronType=int` | substituting interrogative pronoun | +| `TRUNC` | `X` | `Hyph=yes` | word remnant | +| `VAFIN` | `AUX` | `Mood=ind VerbForm=fin` | finite verb, auxiliary | +| `VAIMP` | `AUX` | `Mood=imp VerbForm=fin` | imperative, auxiliary | +| `VAINF` | `AUX` | `VerbForm=inf` | infinitive, auxiliary | +| `VAPP` | `AUX` | `Aspect=perf VerbForm=fin` | perfect participle, auxiliary | +| `VMFIN` | `VERB` | `Mood=ind VerbForm=fin VerbType=mod` | finite verb, modal | +| `VMINF` | `VERB` | `VerbForm=fin VerbType=mod` | infinitive, modal | +| `VMPP` | `VERB` | `Aspect=perf VerbForm=part VerbType=mod` | perfect participle, modal | +| `VVFIN` | `VERB` | `Mood=ind VerbForm=fin` | finite verb, full | +| `VVIMP` | `VERB` | `Mood=imp VerbForm=fin` | imperative, full | +| `VVINF` | `VERB` | `VerbForm=inf` | infinitive, full | +| `VVIZU` | `VERB` | `VerbForm=inf` | infinitive with "zu", full | +| `VVPP` | `VERB` | `Aspect=perf VerbForm=part` | perfect participle, full | +| `XY` | `X` | | non-word containing non-letter | +| `SP` | `SPACE` | | space | @@ -379,51 +379,51 @@ The German dependency labels use the [TIGER Treebank](http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/index.html) annotation scheme. -| Label | Description | -| ------ | ------------------------------- | -| `ac` | adpositional case marker | -| `adc` | adjective component | -| `ag` | genitive attribute | -| `ams` | measure argument of adjective | -| `app` | apposition | -| `avc` | adverbial phrase component | -| `cc` | comparative complement | -| `cd` | coordinating conjunction | -| `cj` | conjunct | -| `cm` | comparative conjunction | -| `cp` | complementizer | -| `cvc` | collocational verb construction | -| `da` | dative | -| `dm` | discourse marker | -| `ep` | expletive es | -| `ju` | junctor | -| `mnr` | postnominal modifier | -| `mo` | modifier | -| `ng` | negation | -| `nk` | noun kernel element | -| `nmc` | numerical component | -| `oa` | accusative object | -| `oa2` | second accusative object | -| `oc` | clausal object | -| `og` | genitive object | -| `op` | prepositional object | -| `par` | parenthetical element | -| `pd` | predicate | -| `pg` | phrasal genitive | -| `ph` | placeholder | -| `pm` | morphological particle | -| `pnc` | proper noun component | -| `punct` | punctuation | -| `rc` | relative clause | -| `re` | repeated element | -| `rs` | reported speech | -| `sb` | subject | -| `sbp` | passivized subject (PP) | -| `sp` | subject or predicate | -| `svp` | separable verb prefix | -| `uc` | unit component | -| `vo` | vocative | -| `ROOT` | root | +| Label | Description | +| ------- | ------------------------------- | +| `ac` | adpositional case marker | +| `adc` | adjective component | +| `ag` | genitive attribute | +| `ams` | measure argument of adjective | +| `app` | apposition | +| `avc` | adverbial phrase component | +| `cc` | comparative complement | +| `cd` | coordinating conjunction | +| `cj` | conjunct | +| `cm` | comparative conjunction | +| `cp` | complementizer | +| `cvc` | collocational verb construction | +| `da` | dative | +| `dm` | discourse marker | +| `ep` | expletive es | +| `ju` | junctor | +| `mnr` | postnominal modifier | +| `mo` | modifier | +| `ng` | negation | +| `nk` | noun kernel element | +| `nmc` | numerical component | +| `oa` | accusative object | +| `oa2` | second accusative object | +| `oc` | clausal object | +| `og` | genitive object | +| `op` | prepositional object | +| `par` | 
parenthetical element | +| `pd` | predicate | +| `pg` | phrasal genitive | +| `ph` | placeholder | +| `pm` | morphological particle | +| `pnc` | proper noun component | +| `punct` | punctuation | +| `rc` | relative clause | +| `re` | repeated element | +| `rs` | reported speech | +| `sb` | subject | +| `sbp` | passivized subject (PP) | +| `sp` | subject or predicate | +| `svp` | separable verb prefix | +| `uc` | unit component | +| `vo` | vocative | +| `ROOT` | root | diff --git a/website/docs/api/cython-classes.md b/website/docs/api/cython-classes.md index 4d188d90f..77d6fdd10 100644 --- a/website/docs/api/cython-classes.md +++ b/website/docs/api/cython-classes.md @@ -45,9 +45,9 @@ Append a token to the `Doc`. The token can be provided as a > from spacy.vocab cimport Vocab > > doc = Doc(Vocab()) -> lexeme = doc.vocab.get(u'hello') +> lexeme = doc.vocab.get("hello") > doc.push_back(lexeme, True) -> assert doc.text == u'hello ' +> assert doc.text == "hello " > ``` | Name | Type | Description | @@ -164,7 +164,7 @@ vocabulary. > #### Example > > ```python -> lexeme = vocab.get(vocab.mem, u'hello') +> lexeme = vocab.get(vocab.mem, "hello") > ``` | Name | Type | Description | diff --git a/website/docs/api/cython-structs.md b/website/docs/api/cython-structs.md index 0e427a8d5..935bce25d 100644 --- a/website/docs/api/cython-structs.md +++ b/website/docs/api/cython-structs.md @@ -88,7 +88,7 @@ Find a token in a `TokenC*` array by the offset of its first character. > from spacy.tokens.doc cimport Doc, token_by_start > from spacy.vocab cimport Vocab > -> doc = Doc(Vocab(), words=[u'hello', u'world']) +> doc = Doc(Vocab(), words=["hello", "world"]) > assert token_by_start(doc.c, doc.length, 6) == 1 > assert token_by_start(doc.c, doc.length, 4) == -1 > ``` @@ -110,7 +110,7 @@ Find a token in a `TokenC*` array by the offset of its final character. > from spacy.tokens.doc cimport Doc, token_by_end > from spacy.vocab cimport Vocab > -> doc = Doc(Vocab(), words=[u'hello', u'world']) +> doc = Doc(Vocab(), words=["hello", "world"]) > assert token_by_end(doc.c, doc.length, 5) == 0 > assert token_by_end(doc.c, doc.length, 1) == -1 > ``` @@ -134,7 +134,7 @@ attribute, in order to make the parse tree navigation consistent. > from spacy.tokens.doc cimport Doc, set_children_from_heads > from spacy.vocab cimport Vocab > -> doc = Doc(Vocab(), words=[u'Baileys', u'from', u'a', u'shoe']) +> doc = Doc(Vocab(), words=["Baileys", "from", "a", "shoe"]) > doc.c[0].head = 0 > doc.c[1].head = 0 > doc.c[2].head = 3 diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index 58acc4425..df0df3e38 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -58,7 +58,7 @@ and all pipeline components are applied to the `Doc` in order. Both > > ```python > parser = DependencyParser(nlp.vocab) -> doc = nlp(u"This is a sentence.") +> doc = nlp("This is a sentence.") > # This usually happens under the hood > processed = parser(doc) > ``` diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 431d3a092..ad684f51e 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -20,11 +20,11 @@ Construct a `Doc` object. 
The most common way to get a `Doc` object is via the > > ```python > # Construction 1 -> doc = nlp(u"Some text") +> doc = nlp("Some text") > > # Construction 2 > from spacy.tokens import Doc -> words = [u"hello", u"world", u"!"] +> words = ["hello", "world", "!"] > spaces = [True, False, False] > doc = Doc(nlp.vocab, words=words, spaces=spaces) > ``` @@ -45,7 +45,7 @@ Negative indexing is supported, and follows the usual Python semantics, i.e. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > assert doc[0].text == "Give" > assert doc[-1].text == "." > span = doc[1:3] @@ -76,8 +76,8 @@ Iterate over `Token` objects, from which the annotations can be easily accessed. > #### Example > > ```python -> doc = nlp(u'Give it back') -> assert [t.text for t in doc] == [u'Give', u'it', u'back'] +> doc = nlp("Give it back") +> assert [t.text for t in doc] == ["Give", "it", "back"] > ``` This is the main way of accessing [`Token`](/api/token) objects, which are the @@ -96,7 +96,7 @@ Get the number of tokens in the document. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > assert len(doc) == 7 > ``` @@ -114,9 +114,9 @@ details, see the documentation on > > ```python > from spacy.tokens import Doc -> city_getter = lambda doc: any(city in doc.text for city in ('New York', 'Paris', 'Berlin')) -> Doc.set_extension('has_city', getter=city_getter) -> doc = nlp(u'I like New York') +> city_getter = lambda doc: any(city in doc.text for city in ("New York", "Paris", "Berlin")) +> Doc.set_extension("has_city", getter=city_getter) +> doc = nlp("I like New York") > assert doc._.has_city > ``` @@ -192,8 +192,8 @@ the character indices don't map to a valid span. > #### Example > > ```python -> doc = nlp(u"I like New York") -> span = doc.char_span(7, 15, label=u"GPE") +> doc = nlp("I like New York") +> span = doc.char_span(7, 15, label="GPE") > assert span.text == "New York" > ``` @@ -213,8 +213,8 @@ using an average of word vectors. > #### Example > > ```python -> apples = nlp(u"I like apples") -> oranges = nlp(u"I like oranges") +> apples = nlp("I like apples") +> oranges = nlp("I like oranges") > apples_oranges = apples.similarity(oranges) > oranges_apples = oranges.similarity(apples) > assert apples_oranges == oranges_apples @@ -235,7 +235,7 @@ attribute ID. > > ```python > from spacy.attrs import ORTH -> doc = nlp(u"apple apple orange banana") +> doc = nlp("apple apple orange banana") > assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2} > doc.to_array([ORTH]) > # array([[11880], [11880], [7561], [12800]]) @@ -255,7 +255,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor. > #### Example > > ```python -> doc = nlp(u"This is a test") +> doc = nlp("This is a test") > matrix = doc.get_lca_matrix() > # array([[0, 1, 1, 1], [1, 1, 1, 1], [1, 1, 2, 3], [1, 1, 3, 3]], dtype=int32) > ``` @@ -274,7 +274,7 @@ They'll be added to an `"_"` key in the data, e.g. `"_": {"foo": "bar"}`. > #### Example > > ```python -> doc = nlp(u"Hello") +> doc = nlp("Hello") > json_doc = doc.to_json() > ``` > @@ -342,7 +342,7 @@ array of attributes. 
> ```python > from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA > from spacy.tokens import Doc -> doc = nlp(u"Hello world!") +> doc = nlp("Hello world!") > np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) > doc2 = Doc(doc.vocab, words=[t.text for t in doc]) > doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array) @@ -396,7 +396,7 @@ Serialize, i.e. export the document contents to a binary string. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > doc_bytes = doc.to_bytes() > ``` @@ -413,10 +413,9 @@ Deserialize, i.e. import the document contents from a binary string. > > ```python > from spacy.tokens import Doc -> text = u"Give it back! He pleaded." -> doc = nlp(text) -> bytes = doc.to_bytes() -> doc2 = Doc(doc.vocab).from_bytes(bytes) +> doc = nlp("Give it back! He pleaded.") +> doc_bytes = doc.to_bytes() +> doc2 = Doc(doc.vocab).from_bytes(doc_bytes) > assert doc.text == doc2.text > ``` @@ -457,9 +456,9 @@ dictionary mapping attribute names to values as the `"_"` key. > #### Example > > ```python -> doc = nlp(u"I like David Bowie") +> doc = nlp("I like David Bowie") > with doc.retokenize() as retokenizer: -> attrs = {"LEMMA": u"David Bowie"} +> attrs = {"LEMMA": "David Bowie"} > retokenizer.merge(doc[2:4], attrs=attrs) > ``` @@ -489,7 +488,7 @@ underlying lexeme (if they're context-independent lexical attributes like > #### Example > > ```python -> doc = nlp(u"I live in NewYork") +> doc = nlp("I live in NewYork") > with doc.retokenize() as retokenizer: > heads = [(doc[3], 1), doc[2]] > attrs = {"POS": ["PROPN", "PROPN"], @@ -521,9 +520,9 @@ and end token boundaries, the document remains unchanged. > #### Example > > ```python -> doc = nlp(u"Los Angeles start.") +> doc = nlp("Los Angeles start.") > doc.merge(0, len("Los Angeles"), "NNP", "Los Angeles", "GPE") -> assert [t.text for t in doc] == [u"Los Angeles", u"start", u"."] +> assert [t.text for t in doc] == ["Los Angeles", "start", "."] > ``` | Name | Type | Description | @@ -541,11 +540,11 @@ objects, if the entity recognizer has been applied. > #### Example > > ```python -> doc = nlp(u"Mr. Best flew to New York on Saturday morning.") +> doc = nlp("Mr. Best flew to New York on Saturday morning.") > ents = list(doc.ents) > assert ents[0].label == 346 -> assert ents[0].label_ == u"PERSON" -> assert ents[0].text == u"Mr. Best" +> assert ents[0].label_ == "PERSON" +> assert ents[0].text == "Mr. Best" > ``` | Name | Type | Description | @@ -563,10 +562,10 @@ relative clauses. > #### Example > > ```python -> doc = nlp(u"A phrase with another phrase occurs.") +> doc = nlp("A phrase with another phrase occurs.") > chunks = list(doc.noun_chunks) -> assert chunks[0].text == u"A phrase" -> assert chunks[1].text == u"another phrase" +> assert chunks[0].text == "A phrase" +> assert chunks[1].text == "another phrase" > ``` | Name | Type | Description | @@ -583,10 +582,10 @@ will be unavailable. > #### Example > > ```python -> doc = nlp(u"This is a sentence. Here's another...") +> doc = nlp("This is a sentence. Here's another...") > sents = list(doc.sents) > assert len(sents) == 2 -> assert [s.root.text for s in sents] == [u"is", u"'s"] +> assert [s.root.text for s in sents] == ["is", "'s"] > ``` | Name | Type | Description | @@ -600,7 +599,7 @@ A boolean value indicating whether a word vector is associated with the object. 
> #### Example > > ```python -> doc = nlp(u"I like apples") +> doc = nlp("I like apples") > assert doc.has_vector > ``` @@ -616,8 +615,8 @@ vectors. > #### Example > > ```python -> doc = nlp(u"I like apples") -> assert doc.vector.dtype == 'float32' +> doc = nlp("I like apples") +> assert doc.vector.dtype == "float32" > assert doc.vector.shape == (300,) > ``` @@ -632,8 +631,8 @@ The L2 norm of the document's vector representation. > #### Example > > ```python -> doc1 = nlp(u"I like apples") -> doc2 = nlp(u"I like oranges") +> doc1 = nlp("I like apples") +> doc2 = nlp("I like oranges") > doc1.vector_norm # 4.54232424414368 > doc2.vector_norm # 3.304373298575751 > assert doc1.vector_norm != doc2.vector_norm diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 64db50943..88131761f 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -1,6 +1,8 @@ --- title: EntityLinker -teaser: Functionality to disambiguate a named entity in text to a unique knowledge base identifier. +teaser: + Functionality to disambiguate a named entity in text to a unique knowledge + base identifier. tag: class source: spacy/pipeline/pipes.pyx new: 2.2 @@ -13,9 +15,9 @@ via the ID `"entity_linker"`. ## EntityLinker.Model {#model tag="classmethod"} Initialize a model for the pipe. The model should implement the -`thinc.neural.Model` API, and should contain a field `tok2vec` that contains -the context encoder. Wrappers are under development for most major machine -learning libraries. +`thinc.neural.Model` API, and should contain a field `tok2vec` that contains the +context encoder. Wrappers are under development for most major machine learning +libraries. | Name | Type | Description | | ----------- | ------ | ------------------------------------- | @@ -40,30 +42,29 @@ shortcut for this and instantiate the component using its string name and > entity_linker.from_disk("/path/to/model") > ``` -| Name | Type | Description | -| --------------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. | -| `hidden_width` | int | Width of the hidden layer of the entity linking model, defaults to 128. | -| `incl_prior` | bool | Whether or not to include prior probabilities in the model. Defaults to True. | -| `incl_context` | bool | Whether or not to include the local context in the model (if not: only prior probabilites are used). Defaults to True. | -| **RETURNS** | `EntityLinker` | The newly constructed object. | +| Name | Type | Description | +| -------------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The shared vocabulary. | +| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. | +| `hidden_width` | int | Width of the hidden layer of the entity linking model, defaults to 128. | +| `incl_prior` | bool | Whether or not to include prior probabilities in the model. Defaults to True. 
| +| `incl_context` | bool | Whether or not to include the local context in the model (if not: only prior probabilites are used). Defaults to True. | +| **RETURNS** | `EntityLinker` | The newly constructed object. | ## EntityLinker.\_\_call\_\_ {#call tag="method"} Apply the pipe to one document. The document is modified in place, and returned. This usually happens under the hood when the `nlp` object is called on a text and all pipeline components are applied to the `Doc` in order. Both -[`__call__`](/api/entitylinker#call) and -[`pipe`](/api/entitylinker#pipe) delegate to the -[`predict`](/api/entitylinker#predict) and -[`set_annotations`](/api/entitylinker#set_annotations) methods. +[`__call__`](/api/entitylinker#call) and [`pipe`](/api/entitylinker#pipe) +delegate to the [`predict`](/api/entitylinker#predict) and +[`set_annotations`](/api/entitylinker#set_annotations) methods. > #### Example > > ```python > entity_linker = EntityLinker(nlp.vocab) -> doc = nlp(u"This is a sentence.") +> doc = nlp("This is a sentence.") > # This usually happens under the hood > processed = entity_linker(doc) > ``` @@ -107,14 +108,15 @@ Apply the pipeline's model to a batch of docs, without modifying them. > kb_ids, tensors = entity_linker.predict([doc1, doc2]) > ``` -| Name | Type | Description | -| ----------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `docs` | iterable | The documents to predict. | -| **RETURNS** | tuple | A `(kb_ids, tensors)` tuple where `kb_ids` are the model's predicted KB identifiers for the entities in the `docs`, and `tensors` are the token representations used to predict these identifiers. | +| Name | Type | Description | +| ----------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `docs` | iterable | The documents to predict. | +| **RETURNS** | tuple | A `(kb_ids, tensors)` tuple where `kb_ids` are the model's predicted KB identifiers for the entities in the `docs`, and `tensors` are the token representations used to predict these identifiers. | ## EntityLinker.set_annotations {#set_annotations tag="method"} -Modify a batch of documents, using pre-computed entity IDs for a list of named entities. +Modify a batch of documents, using pre-computed entity IDs for a list of named +entities. > #### Example > @@ -124,16 +126,17 @@ Modify a batch of documents, using pre-computed entity IDs for a list of named e > entity_linker.set_annotations([doc1, doc2], kb_ids, tensors) > ``` -| Name | Type | Description | -| ---------- | -------- | --------------------------------------------------------------------------------------------------- | -| `docs` | iterable | The documents to modify. | -| `kb_ids` | iterable | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. | -| `tensors` | iterable | The token representations used to predict the identifiers. | +| Name | Type | Description | +| --------- | -------- | ------------------------------------------------------------------------------------------------- | +| `docs` | iterable | The documents to modify. | +| `kb_ids` | iterable | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. 
| +| `tensors` | iterable | The token representations used to predict the identifiers. | ## EntityLinker.update {#update tag="method"} Learn from a batch of documents and gold-standard information, updating both the -pipe's entity linking model and context encoder. Delegates to [`predict`](/api/entitylinker#predict) and +pipe's entity linking model and context encoder. Delegates to +[`predict`](/api/entitylinker#predict) and [`get_loss`](/api/entitylinker#get_loss). > #### Example @@ -145,18 +148,18 @@ pipe's entity linking model and context encoder. Delegates to [`predict`](/api/e > entity_linker.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer) > ``` -| Name | Type | Description | -| -------- | -------- | ------------------------------------------------------------------------------------------------------------- | -| `docs` | iterable | A batch of documents to learn from. | -| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. | -| `drop` | float | The dropout rate, used both for the EL model and the context encoder. | -| `sgd` | callable | The optimizer for the EL model. Should take two arguments `weights` and `gradient`, and an optional ID. | -| `losses` | dict | Optional record of the loss during training. The value keyed by the model's name is updated. | +| Name | Type | Description | +| -------- | -------- | ------------------------------------------------------------------------------------------------------- | +| `docs` | iterable | A batch of documents to learn from. | +| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. | +| `drop` | float | The dropout rate, used both for the EL model and the context encoder. | +| `sgd` | callable | The optimizer for the EL model. Should take two arguments `weights` and `gradient`, and an optional ID. | +| `losses` | dict | Optional record of the loss during training. The value keyed by the model's name is updated. | ## EntityLinker.get_loss {#get_loss tag="method"} -Find the loss and gradient of loss for the entities in a batch of documents and their -predicted scores. +Find the loss and gradient of loss for the entities in a batch of documents and +their predicted scores. > #### Example > @@ -166,17 +169,18 @@ predicted scores. > loss, d_loss = entity_linker.get_loss(docs, [gold1, gold2], kb_ids, tensors) > ``` -| Name | Type | Description | -| --------------- | -------- | ------------------------------------------------------------ | -| `docs` | iterable | The batch of documents. | -| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. | -| `kb_ids` | iterable | KB identifiers representing the model's predictions. | -| `tensors` | iterable | The token representations used to predict the identifiers | -| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. | +| Name | Type | Description | +| ----------- | -------- | ------------------------------------------------------------ | +| `docs` | iterable | The batch of documents. | +| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. | +| `kb_ids` | iterable | KB identifiers representing the model's predictions. | +| `tensors` | iterable | The token representations used to predict the identifiers | +| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. | ## EntityLinker.set_kb {#set_kb tag="method"} -Define the knowledge base (KB) used for disambiguating named entities to KB identifiers. 
+Define the knowledge base (KB) used for disambiguating named entities to KB +identifiers. > #### Example > @@ -185,15 +189,16 @@ Define the knowledge base (KB) used for disambiguating named entities to KB iden > entity_linker.set_kb(kb) > ``` -| Name | Type | Description | -| --------------- | --------------- | ------------------------------------------------------------ | -| `kb` | `KnowledgeBase` | The [`KnowledgeBase`](/api/kb). | +| Name | Type | Description | +| ---- | --------------- | ------------------------------- | +| `kb` | `KnowledgeBase` | The [`KnowledgeBase`](/api/kb). | ## EntityLinker.begin_training {#begin_training tag="method"} Initialize the pipe for training, using data examples if available. If no model -has been initialized yet, the model is added. -Before calling this method, a knowledge base should have been defined with [`set_kb`](/api/entitylinker#set_kb). +has been initialized yet, the model is added. Before calling this method, a +knowledge base should have been defined with +[`set_kb`](/api/entitylinker#set_kb). > #### Example > @@ -204,12 +209,12 @@ Before calling this method, a knowledge base should have been defined with [`set > optimizer = entity_linker.begin_training(pipeline=nlp.pipeline) > ``` -| Name | Type | Description | -| ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects. | -| `pipeline` | list | Optional list of pipeline components that this component is part of. | -| `sgd` | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`EntityLinker`](/api/entitylinker#create_optimizer) if not set. | -| **RETURNS** | callable | An optimizer. | +| Name | Type | Description | +| ------------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects. | +| `pipeline` | list | Optional list of pipeline components that this component is part of. | +| `sgd` | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`EntityLinker`](/api/entitylinker#create_optimizer) if not set. | +| **RETURNS** | callable | An optimizer. | ## EntityLinker.create_optimizer {#create_optimizer tag="method"} @@ -242,7 +247,6 @@ Modify the pipe's EL model, to use the given parameter values. | -------- | ---- | ---------------------------------------------------------------------------------------------------------- | | `params` | dict | The parameter values to use in the model. At the end of the context, the original parameters are restored. | - ## EntityLinker.to_disk {#to_disk tag="method"} Serialize the pipe to disk. @@ -270,11 +274,11 @@ Load the pipe from disk. Modifies the object in place and returns it. > entity_linker.from_disk("/path/to/entity_linker") > ``` -| Name | Type | Description | -| ----------- | ------------------ | -------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a directory. 
Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. | +| Name | Type | Description | +| ----------- | ---------------- | -------------------------------------------------------------------------- | +| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. | ## Serialization fields {#serialization-fields} @@ -294,4 +298,3 @@ serialization by passing in the string names via the `exclude` argument. | `cfg` | The config file. You usually don't want to exclude this. | | `model` | The binary model data. You usually don't want to exclude this. | | `kb` | The knowledge base. You usually don't want to exclude this. | - diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 46e8b44ee..9a2766c07 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -58,7 +58,7 @@ and all pipeline components are applied to the `Doc` in order. Both > > ```python > ner = EntityRecognizer(nlp.vocab) -> doc = nlp(u"This is a sentence.") +> doc = nlp("This is a sentence.") > # This usually happens under the hood > processed = ner(doc) > ``` @@ -119,11 +119,11 @@ Modify a batch of documents, using pre-computed scores. > ner.set_annotations([doc1, doc2], scores, tensors) > ``` -| Name | Type | Description | -| -------- | -------- | ---------------------------------------------------------- | -| `docs` | iterable | The documents to modify. | -| `scores` | - | The scores to set, produced by `EntityRecognizer.predict`. | -| `tensors`| iterable | The token representations used to predict the scores. | +| Name | Type | Description | +| --------- | -------- | ---------------------------------------------------------- | +| `docs` | iterable | The documents to modify. | +| `scores` | - | The scores to set, produced by `EntityRecognizer.predict`. | +| `tensors` | iterable | The token representations used to predict the scores. | ## EntityRecognizer.update {#update tag="method"} diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md index db7d07795..2dd24316f 100644 --- a/website/docs/api/goldparse.md +++ b/website/docs/api/goldparse.md @@ -23,7 +23,7 @@ gradient for those labels will be zero. | `deps` | iterable | A sequence of strings, representing the syntactic relation types. | | `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings, you can specify missing values by setting the tag to None. | | `cats` | dict | Labels for text classification. Each key in the dictionary may be a string or an int, or a `(start_char, end_char, label)` tuple, indicating that the label is applied to only part of the document (usually a sentence). | -| `links` | dict | Labels for entity linking. A dict with `(start_char, end_char)` keys, and the values being dicts with `kb_id:value` entries, representing external KB IDs mapped to either 1.0 (positive) or 0.0 (negative). | +| `links` | dict | Labels for entity linking. 
A dict with `(start_char, end_char)` keys, and the values being dicts with `kb_id:value` entries, representing external KB IDs mapped to either 1.0 (positive) or 0.0 (negative). | | **RETURNS** | `GoldParse` | The newly constructed object. | ## GoldParse.\_\_len\_\_ {#len tag="method"} @@ -69,7 +69,7 @@ Convert a list of Doc objects into the > ```python > from spacy.gold import docs_to_json > -> doc = nlp(u"I like London") +> doc = nlp("I like London") > json_data = docs_to_json([doc]) > ``` @@ -150,7 +150,7 @@ single-token entity. > ```python > from spacy.gold import biluo_tags_from_offsets > -> doc = nlp(u"I like London.") +> doc = nlp("I like London.") > entities = [(7, 13, "LOC")] > tags = biluo_tags_from_offsets(doc, entities) > assert tags == ["O", "O", "U-LOC", "O"] @@ -172,7 +172,7 @@ entity offsets. > ```python > from spacy.gold import offsets_from_biluo_tags > -> doc = nlp(u"I like London.") +> doc = nlp("I like London.") > tags = ["O", "O", "U-LOC", "O"] > entities = offsets_from_biluo_tags(doc, tags) > assert entities == [(7, 13, "LOC")] @@ -195,7 +195,7 @@ token-based tags, e.g. to overwrite the `doc.ents`. > ```python > from spacy.gold import spans_from_biluo_tags > -> doc = nlp(u"I like London.") +> doc = nlp("I like London.") > tags = ["O", "O", "U-LOC", "O"] > doc.ents = spans_from_biluo_tags(doc, tags) > ``` diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 9a89d01cc..254ad8fb1 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -45,7 +45,7 @@ contain arbitrary whitespace. Alignment into the original string is preserved. > #### Example > > ```python -> doc = nlp(u"An example sentence. Another sentence.") +> doc = nlp("An example sentence. Another sentence.") > assert (doc[0].text, doc[0].head.tag_) == ("An", "NN") > ``` @@ -61,8 +61,8 @@ Pipeline components to prevent from being loaded can now be added as a list to `disable`, instead of specifying one keyword argument per component. ```diff -- doc = nlp(u"I don't want parsed", parse=False) -+ doc = nlp(u"I don't want parsed", disable=["parser"]) +- doc = nlp("I don't want parsed", parse=False) ++ doc = nlp("I don't want parsed", disable=["parser"]) ``` @@ -86,7 +86,7 @@ multiprocessing. > #### Example > > ```python -> texts = [u"One document.", u"...", u"Lots of documents"] +> texts = ["One document.", "...", "Lots of documents"] > for doc in nlp.pipe(texts, batch_size=50): > assert doc.is_parsed > ``` diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index 7bc2691e5..fd71d16cf 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -37,8 +37,8 @@ Lemmatize a string. > from spacy.lemmatizer import Lemmatizer > from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES > lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES) -> lemmas = lemmatizer(u"ducks", u"NOUN") -> assert lemmas == [u"duck"] +> lemmas = lemmatizer("ducks", "NOUN") +> assert lemmas == ["duck"] > ``` | Name | Type | Description | @@ -58,9 +58,9 @@ variable, set on the individual `Language` class. 
> #### Example > > ```python -> lookup = {u"going": u"go"} +> lookup = {"going": "go"} > lemmatizer = Lemmatizer(lookup=lookup) -> assert lemmatizer.lookup(u"going") == u"go" +> assert lemmatizer.lookup("going") == "go" > ``` | Name | Type | Description | diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md index 018dc72d8..398b71708 100644 --- a/website/docs/api/lexeme.md +++ b/website/docs/api/lexeme.md @@ -27,7 +27,7 @@ Change the value of a boolean flag. > > ```python > COOL_FLAG = nlp.vocab.add_flag(lambda text: False) -> nlp.vocab[u'spaCy'].set_flag(COOL_FLAG, True) +> nlp.vocab["spaCy"].set_flag(COOL_FLAG, True) > ``` | Name | Type | Description | @@ -42,9 +42,9 @@ Check the value of a boolean flag. > #### Example > > ```python -> is_my_library = lambda text: text in [u"spaCy", u"Thinc"] +> is_my_library = lambda text: text in ["spaCy", "Thinc"] > MY_LIBRARY = nlp.vocab.add_flag(is_my_library) -> assert nlp.vocab[u"spaCy"].check_flag(MY_LIBRARY) == True +> assert nlp.vocab["spaCy"].check_flag(MY_LIBRARY) == True > ``` | Name | Type | Description | @@ -59,8 +59,8 @@ Compute a semantic similarity estimate. Defaults to cosine over vectors. > #### Example > > ```python -> apple = nlp.vocab[u"apple"] -> orange = nlp.vocab[u"orange"] +> apple = nlp.vocab["apple"] +> orange = nlp.vocab["orange"] > apple_orange = apple.similarity(orange) > orange_apple = orange.similarity(apple) > assert apple_orange == orange_apple @@ -78,7 +78,7 @@ A boolean value indicating whether a word vector is associated with the lexeme. > #### Example > > ```python -> apple = nlp.vocab[u"apple"] +> apple = nlp.vocab["apple"] > assert apple.has_vector > ``` @@ -93,7 +93,7 @@ A real-valued meaning representation. > #### Example > > ```python -> apple = nlp.vocab[u"apple"] +> apple = nlp.vocab["apple"] > assert apple.vector.dtype == "float32" > assert apple.vector.shape == (300,) > ``` @@ -109,8 +109,8 @@ The L2 norm of the lexeme's vector representation. > #### Example > > ```python -> apple = nlp.vocab[u"apple"] -> pasta = nlp.vocab[u"pasta"] +> apple = nlp.vocab["apple"] +> pasta = nlp.vocab["pasta"] > apple.vector_norm # 7.1346845626831055 > pasta.vector_norm # 7.759851932525635 > assert apple.vector_norm != pasta.vector_norm diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index fb0ba1617..84d9ed888 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -50,7 +50,7 @@ Find all token sequences matching the supplied patterns on the `Doc`. > matcher = Matcher(nlp.vocab) > pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] > matcher.add("HelloWorld", None, pattern) -> doc = nlp(u'hello world!') +> doc = nlp("hello world!") > matches = matcher(doc) > ``` @@ -147,7 +147,7 @@ overwritten. > matcher = Matcher(nlp.vocab) > matcher.add("HelloWorld", on_match, [{"LOWER": "hello"}, {"LOWER": "world"}]) > matcher.add("GoogleMaps", on_match, [{"ORTH": "Google"}, {"ORTH": "Maps"}]) -> doc = nlp(u"HELLO WORLD on Google Maps.") +> doc = nlp("HELLO WORLD on Google Maps.") > matches = matcher(doc) > ``` diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md index c61fa575d..36a412e34 100644 --- a/website/docs/api/phrasematcher.md +++ b/website/docs/api/phrasematcher.md @@ -59,8 +59,8 @@ Find all token sequences matching the supplied patterns on the `Doc`. 
> from spacy.matcher import PhraseMatcher > > matcher = PhraseMatcher(nlp.vocab) -> matcher.add("OBAMA", None, nlp(u"Barack Obama")) -> doc = nlp(u"Barack Obama lifts America one last time in emotional farewell") +> matcher.add("OBAMA", None, nlp("Barack Obama")) +> doc = nlp("Barack Obama lifts America one last time in emotional farewell") > matches = matcher(doc) > ``` @@ -99,7 +99,7 @@ patterns. > ```python > matcher = PhraseMatcher(nlp.vocab) > assert len(matcher) == 0 -> matcher.add("OBAMA", None, nlp(u"Barack Obama")) +> matcher.add("OBAMA", None, nlp("Barack Obama")) > assert len(matcher) == 1 > ``` @@ -116,7 +116,7 @@ Check whether the matcher contains rules for a match ID. > ```python > matcher = PhraseMatcher(nlp.vocab) > assert "OBAMA" not in matcher -> matcher.add("OBAMA", None, nlp(u"Barack Obama")) +> matcher.add("OBAMA", None, nlp("Barack Obama")) > assert "OBAMA" in matcher > ``` @@ -140,10 +140,10 @@ overwritten. > print('Matched!', matches) > > matcher = PhraseMatcher(nlp.vocab) -> matcher.add("OBAMA", on_match, nlp(u"Barack Obama")) -> matcher.add("HEALTH", on_match, nlp(u"health care reform"), -> nlp(u"healthcare reform")) -> doc = nlp(u"Barack Obama urges Congress to find courage to defend his healthcare reforms") +> matcher.add("OBAMA", on_match, nlp("Barack Obama")) +> matcher.add("HEALTH", on_match, nlp("health care reform"), +> nlp("healthcare reform")) +> doc = nlp("Barack Obama urges Congress to find courage to defend his healthcare reforms") > matches = matcher(doc) > ``` diff --git a/website/docs/api/pipeline-functions.md b/website/docs/api/pipeline-functions.md index 63b3cd164..6e2b473b1 100644 --- a/website/docs/api/pipeline-functions.md +++ b/website/docs/api/pipeline-functions.md @@ -17,13 +17,13 @@ the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe). > #### Example > > ```python -> texts = [t.text for t in nlp(u"I have a blue car")] +> texts = [t.text for t in nlp("I have a blue car")] > assert texts == ["I", "have", "a", "blue", "car"] > > merge_nps = nlp.create_pipe("merge_noun_chunks") > nlp.add_pipe(merge_nps) > -> texts = [t.text for t in nlp(u"I have a blue car")] +> texts = [t.text for t in nlp("I have a blue car")] > assert texts == ["I", "have", "a blue car"] > ``` @@ -50,13 +50,13 @@ the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe). > #### Example > > ```python -> texts = [t.text for t in nlp(u"I like David Bowie")] +> texts = [t.text for t in nlp("I like David Bowie")] > assert texts == ["I", "like", "David", "Bowie"] > > merge_ents = nlp.create_pipe("merge_entities") > nlp.add_pipe(merge_ents) > -> texts = [t.text for t in nlp(u"I like David Bowie")] +> texts = [t.text for t in nlp("I like David Bowie")] > assert texts == ["I", "like", "David Bowie"] > ``` diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md index 26d205c24..237cd6a8a 100644 --- a/website/docs/api/sentencizer.md +++ b/website/docs/api/sentencizer.md @@ -59,7 +59,7 @@ the component has been added to the pipeline using > nlp = English() > sentencizer = nlp.create_pipe("sentencizer") > nlp.add_pipe(sentencizer) -> doc = nlp(u"This is a sentence. This is another sentence.") +> doc = nlp("This is a sentence. This is another sentence.") > assert list(doc.sents) == 2 > ``` diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 79be81ef8..7e3ce19d0 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -13,13 +13,13 @@ Create a Span object from the slice `doc[start : end]`. 
> #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > span = doc[1:4] -> assert [t.text for t in span] == [u"it", u"back", u"!"] +> assert [t.text for t in span] == ["it", "back", "!"] > ``` | Name | Type | Description | -| ----------- | ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------| +| ----------- | ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------- | | `doc` | `Doc` | The parent document. | | `start` | int | The index of the first token of the span. | | `end` | int | The index of the first token after the span. | @@ -35,7 +35,7 @@ Get a `Token` object. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > span = doc[1:4] > assert span[1].text == "back" > ``` @@ -50,9 +50,9 @@ Get a `Span` object. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > span = doc[1:4] -> assert span[1:3].text == u"back!" +> assert span[1:3].text == "back!" > ``` | Name | Type | Description | @@ -67,9 +67,9 @@ Iterate over `Token` objects. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > span = doc[1:4] -> assert [t.text for t in span] == [u"it", u"back", u"!"] +> assert [t.text for t in span] == ["it", "back", "!"] > ``` | Name | Type | Description | @@ -83,7 +83,7 @@ Get the number of tokens in the span. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > span = doc[1:4] > assert len(span) == 3 > ``` @@ -102,9 +102,9 @@ For details, see the documentation on > > ```python > from spacy.tokens import Span -> city_getter = lambda span: any(city in span.text for city in (u"New York", u"Paris", u"Berlin")) +> city_getter = lambda span: any(city in span.text for city in ("New York", "Paris", "Berlin")) > Span.set_extension("has_city", getter=city_getter) -> doc = nlp(u"I like New York in Autumn") +> doc = nlp("I like New York in Autumn") > assert doc[1:4]._.has_city > ``` @@ -180,7 +180,7 @@ using an average of word vectors. > #### Example > > ```python -> doc = nlp(u"green apples and red oranges") +> doc = nlp("green apples and red oranges") > green_apples = doc[:2] > red_oranges = doc[3:] > apples_oranges = green_apples.similarity(red_oranges) @@ -202,7 +202,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor. > #### Example > > ```python -> doc = nlp(u"I like New York in Autumn") +> doc = nlp("I like New York in Autumn") > span = doc[1:4] > matrix = span.get_lca_matrix() > # array([[0, 0, 0], [0, 1, 2], [0, 2, 2]], dtype=int32) @@ -222,7 +222,7 @@ shape `(N, M)`, where `N` is the length of the document. The values will be > > ```python > from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > span = doc[2:3] > # All strings mapped to integers, for easy export to numpy > np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) @@ -248,11 +248,11 @@ Retokenize the document, such that the span is merged into a single token. 
> #### Example > > ```python -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > span = doc[2:4] > span.merge() > assert len(doc) == 6 -> assert doc[2].text == u"New York" +> assert doc[2].text == "New York" > ``` | Name | Type | Description | @@ -268,12 +268,12 @@ if the entity recognizer has been applied. > #### Example > > ```python -> doc = nlp(u"Mr. Best flew to New York on Saturday morning.") +> doc = nlp("Mr. Best flew to New York on Saturday morning.") > span = doc[0:6] > ents = list(span.ents) > assert ents[0].label == 346 > assert ents[0].label_ == "PERSON" -> assert ents[0].text == u"Mr. Best" +> assert ents[0].text == "Mr. Best" > ``` | Name | Type | Description | @@ -287,10 +287,10 @@ Create a new `Doc` object corresponding to the `Span`, with a copy of the data. > #### Example > > ```python -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > span = doc[2:4] > doc2 = span.as_doc() -> assert doc2.text == u"New York" +> assert doc2.text == "New York" > ``` | Name | Type | Description | @@ -306,12 +306,12 @@ taken. > #### Example > > ```python -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > i, like, new, york, in_, autumn, dot = range(len(doc)) -> assert doc[new].head.text == u"York" -> assert doc[york].head.text == u"like" +> assert doc[new].head.text == "York" +> assert doc[york].head.text == "like" > new_york = doc[new:york+1] -> assert new_york.root.text == u"York" +> assert new_york.root.text == "York" > ``` | Name | Type | Description | @@ -325,9 +325,9 @@ A tuple of tokens coordinated to `span.root`. > #### Example > > ```python -> doc = nlp(u"I like apples and oranges") +> doc = nlp("I like apples and oranges") > apples_conjuncts = doc[2:3].conjuncts -> assert [t.text for t in apples_conjuncts] == [u"oranges"] +> assert [t.text for t in apples_conjuncts] == ["oranges"] > ``` | Name | Type | Description | @@ -341,9 +341,9 @@ Tokens that are to the left of the span, whose heads are within the span. > #### Example > > ```python -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > lefts = [t.text for t in doc[3:7].lefts] -> assert lefts == [u"New"] +> assert lefts == ["New"] > ``` | Name | Type | Description | @@ -357,9 +357,9 @@ Tokens that are to the right of the span, whose heads are within the span. > #### Example > > ```python -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > rights = [t.text for t in doc[2:4].rights] -> assert rights == [u"in"] +> assert rights == ["in"] > ``` | Name | Type | Description | @@ -374,7 +374,7 @@ the span. > #### Example > > ```python -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > assert doc[3:7].n_lefts == 1 > ``` @@ -390,7 +390,7 @@ the span. > #### Example > > ```python -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > assert doc[2:4].n_rights == 1 > ``` @@ -405,9 +405,9 @@ Tokens within the span and tokens which descend from them. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > subtree = [t.text for t in doc[:3].subtree] -> assert subtree == [u"Give", u"it", u"back", u"!"] +> assert subtree == ["Give", "it", "back", "!"] > ``` | Name | Type | Description | @@ -421,7 +421,7 @@ A boolean value indicating whether a word vector is associated with the object. 
> #### Example > > ```python -> doc = nlp(u"I like apples") +> doc = nlp("I like apples") > assert doc[1:].has_vector > ``` @@ -437,7 +437,7 @@ vectors. > #### Example > > ```python -> doc = nlp(u"I like apples") +> doc = nlp("I like apples") > assert doc[1:].vector.dtype == "float32" > assert doc[1:].vector.shape == (300,) > ``` @@ -453,7 +453,7 @@ The L2 norm of the span's vector representation. > #### Example > > ```python -> doc = nlp(u"I like apples") +> doc = nlp("I like apples") > doc[1:].vector_norm # 4.800883928527915 > doc[2:].vector_norm # 6.895897646384268 > assert doc[1:].vector_norm != doc[2:].vector_norm diff --git a/website/docs/api/stringstore.md b/website/docs/api/stringstore.md index 40d27a62a..268f19125 100644 --- a/website/docs/api/stringstore.md +++ b/website/docs/api/stringstore.md @@ -16,7 +16,7 @@ Create the `StringStore`. > > ```python > from spacy.strings import StringStore -> stringstore = StringStore([u"apple", u"orange"]) +> stringstore = StringStore(["apple", "orange"]) > ``` | Name | Type | Description | @@ -31,7 +31,7 @@ Get the number of strings in the store. > #### Example > > ```python -> stringstore = StringStore([u"apple", u"orange"]) +> stringstore = StringStore(["apple", "orange"]) > assert len(stringstore) == 2 > ``` @@ -46,10 +46,10 @@ Retrieve a string from a given hash, or vice versa. > #### Example > > ```python -> stringstore = StringStore([u"apple", u"orange"]) -> apple_hash = stringstore[u"apple"] +> stringstore = StringStore(["apple", "orange"]) +> apple_hash = stringstore["apple"] > assert apple_hash == 8566208034543834098 -> assert stringstore[apple_hash] == u"apple" +> assert stringstore[apple_hash] == "apple" > ``` | Name | Type | Description | @@ -64,9 +64,9 @@ Check whether a string is in the store. > #### Example > > ```python -> stringstore = StringStore([u"apple", u"orange"]) -> assert u"apple" in stringstore -> assert not u"cherry" in stringstore +> stringstore = StringStore(["apple", "orange"]) +> assert "apple" in stringstore +> assert not "cherry" in stringstore > ``` | Name | Type | Description | @@ -82,9 +82,9 @@ store will always include an empty string `''` at position `0`. > #### Example > > ```python -> stringstore = StringStore([u"apple", u"orange"]) +> stringstore = StringStore(["apple", "orange"]) > all_strings = [s for s in stringstore] -> assert all_strings == [u"apple", u"orange"] +> assert all_strings == ["apple", "orange"] > ``` | Name | Type | Description | @@ -98,12 +98,12 @@ Add a string to the `StringStore`. > #### Example > > ```python -> stringstore = StringStore([u"apple", u"orange"]) -> banana_hash = stringstore.add(u"banana") +> stringstore = StringStore(["apple", "orange"]) +> banana_hash = stringstore.add("banana") > assert len(stringstore) == 3 > assert banana_hash == 2525716904149915114 -> assert stringstore[banana_hash] == u"banana" -> assert stringstore[u"banana"] == banana_hash +> assert stringstore[banana_hash] == "banana" +> assert stringstore["banana"] == banana_hash > ``` | Name | Type | Description | @@ -182,7 +182,7 @@ Get a 64-bit hash for a given string. 
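As the examples in this file suggest, `hash_string` produces the same 64-bit value that a `StringStore` assigns to the string. A quick sketch tying the two together:

```python
from spacy.strings import StringStore, hash_string

stringstore = StringStore(["apple"])
# The store resolves the string to the same 64-bit hash that hash_string computes
assert stringstore["apple"] == hash_string("apple") == 8566208034543834098
```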
> > ```python > from spacy.strings import hash_string -> assert hash_string(u"apple") == 8566208034543834098 +> assert hash_string("apple") == 8566208034543834098 > ``` | Name | Type | Description | diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index fc6fc67a6..bd3382f89 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -57,7 +57,7 @@ and all pipeline components are applied to the `Doc` in order. Both > > ```python > tagger = Tagger(nlp.vocab) -> doc = nlp(u"This is a sentence.") +> doc = nlp("This is a sentence.") > # This usually happens under the hood > processed = tagger(doc) > ``` @@ -117,12 +117,11 @@ Modify a batch of documents, using pre-computed scores. > tagger.set_annotations([doc1, doc2], scores, tensors) > ``` -| Name | Type | Description | -| -------- | -------- | ----------------------------------------------------- | -| `docs` | iterable | The documents to modify. | -| `scores` | - | The scores to set, produced by `Tagger.predict`. | -| `tensors`| iterable | The token representations used to predict the scores. | - +| Name | Type | Description | +| --------- | -------- | ----------------------------------------------------- | +| `docs` | iterable | The documents to modify. | +| `scores` | - | The scores to set, produced by `Tagger.predict`. | +| `tensors` | iterable | The token representations used to predict the scores. | ## Tagger.update {#update tag="method"} diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index f7158541b..1a0280265 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -75,7 +75,7 @@ delegate to the [`predict`](/api/textcategorizer#predict) and > > ```python > textcat = TextCategorizer(nlp.vocab) -> doc = nlp(u"This is a sentence.") +> doc = nlp("This is a sentence.") > # This usually happens under the hood > processed = textcat(doc) > ``` @@ -136,11 +136,11 @@ Modify a batch of documents, using pre-computed scores. > textcat.set_annotations([doc1, doc2], scores, tensors) > ``` -| Name | Type | Description | -| -------- | -------- | --------------------------------------------------------- | -| `docs` | iterable | The documents to modify. | -| `scores` | - | The scores to set, produced by `TextCategorizer.predict`. | -| `tensors`| iterable | The token representations used to predict the scores. | +| Name | Type | Description | +| --------- | -------- | --------------------------------------------------------- | +| `docs` | iterable | The documents to modify. | +| `scores` | - | The scores to set, produced by `TextCategorizer.predict`. | +| `tensors` | iterable | The token representations used to predict the scores. | ## TextCategorizer.update {#update tag="method"} diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 8da13454b..8d7ee5928 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -12,9 +12,9 @@ Construct a `Token` object. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > token = doc[0] -> assert token.text == u"Give" +> assert token.text == "Give" > ``` | Name | Type | Description | @@ -31,7 +31,7 @@ The number of unicode characters in the token, i.e. `token.text`. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! 
He pleaded.") > token = doc[0] > assert len(token) == 4 > ``` @@ -50,9 +50,9 @@ For details, see the documentation on > > ```python > from spacy.tokens import Token -> fruit_getter = lambda token: token.text in (u"apple", u"pear", u"banana") +> fruit_getter = lambda token: token.text in ("apple", "pear", "banana") > Token.set_extension("is_fruit", getter=fruit_getter) -> doc = nlp(u"I have an apple") +> doc = nlp("I have an apple") > assert doc[3]._.is_fruit > ``` @@ -128,7 +128,7 @@ Check the value of a boolean flag. > > ```python > from spacy.attrs import IS_TITLE -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > token = doc[0] > assert token.check_flag(IS_TITLE) == True > ``` @@ -145,7 +145,7 @@ Compute a semantic similarity estimate. Defaults to cosine over vectors. > #### Example > > ```python -> apples, _, oranges = nlp(u"apples and oranges") +> apples, _, oranges = nlp("apples and oranges") > apples_oranges = apples.similarity(oranges) > oranges_apples = oranges.similarity(apples) > assert apples_oranges == oranges_apples @@ -163,9 +163,9 @@ Get a neighboring token. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > give_nbor = doc[0].nbor() -> assert give_nbor.text == u"it" +> assert give_nbor.text == "it" > ``` | Name | Type | Description | @@ -181,7 +181,7 @@ dependency tree. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > give = doc[0] > it = doc[1] > assert give.is_ancestor(it) @@ -199,11 +199,11 @@ The rightmost token of this token's syntactic descendants. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > it_ancestors = doc[1].ancestors -> assert [t.text for t in it_ancestors] == [u"Give"] +> assert [t.text for t in it_ancestors] == ["Give"] > he_ancestors = doc[4].ancestors -> assert [t.text for t in he_ancestors] == [u"pleaded"] +> assert [t.text for t in he_ancestors] == ["pleaded"] > ``` | Name | Type | Description | @@ -217,9 +217,9 @@ A tuple of coordinated tokens, not including the token itself. > #### Example > > ```python -> doc = nlp(u"I like apples and oranges") +> doc = nlp("I like apples and oranges") > apples_conjuncts = doc[2].conjuncts -> assert [t.text for t in apples_conjuncts] == [u"oranges"] +> assert [t.text for t in apples_conjuncts] == ["oranges"] > ``` | Name | Type | Description | @@ -233,9 +233,9 @@ A sequence of the token's immediate syntactic children. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > give_children = doc[0].children -> assert [t.text for t in give_children] == [u"it", u"back", u"!"] +> assert [t.text for t in give_children] == ["it", "back", "!"] > ``` | Name | Type | Description | @@ -249,9 +249,9 @@ The leftward immediate children of the word, in the syntactic dependency parse. > #### Example > > ```python -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > lefts = [t.text for t in doc[3].lefts] -> assert lefts == [u'New'] +> assert lefts == ["New"] > ``` | Name | Type | Description | @@ -265,9 +265,9 @@ The rightward immediate children of the word, in the syntactic dependency parse. 
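The `lefts`/`rights` iterators and the `n_lefts`/`n_rights` counts describe the same sets of children, so they can be cross-checked. A small sketch, assuming a model with a dependency parser such as `en_core_web_sm`:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("I like New York in Autumn.")
york = doc[3]
# The iterators and the counts agree on the number of left/right children
assert len(list(york.lefts)) == york.n_lefts == 1
assert len(list(york.rights)) == york.n_rights == 1
```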
> #### Example > > ```python -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > rights = [t.text for t in doc[3].rights] -> assert rights == [u"in"] +> assert rights == ["in"] > ``` | Name | Type | Description | @@ -282,7 +282,7 @@ dependency parse. > #### Example > > ```python -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > assert doc[3].n_lefts == 1 > ``` @@ -298,7 +298,7 @@ dependency parse. > #### Example > > ```python -> doc = nlp(u"I like New York in Autumn.") +> doc = nlp("I like New York in Autumn.") > assert doc[3].n_rights == 1 > ``` @@ -313,9 +313,9 @@ A sequence containing the token and all the token's syntactic descendants. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > give_subtree = doc[0].subtree -> assert [t.text for t in give_subtree] == [u"Give", u"it", u"back", u"!"] +> assert [t.text for t in give_subtree] == ["Give", "it", "back", "!"] > ``` | Name | Type | Description | @@ -330,7 +330,7 @@ unknown. Defaults to `True` for the first token in the `Doc`. > #### Example > > ```python -> doc = nlp(u"Give it back! He pleaded.") +> doc = nlp("Give it back! He pleaded.") > assert doc[4].is_sent_start > assert not doc[5].is_sent_start > ``` @@ -361,7 +361,7 @@ A boolean value indicating whether a word vector is associated with the token. > #### Example > > ```python -> doc = nlp(u"I like apples") +> doc = nlp("I like apples") > apples = doc[2] > assert apples.has_vector > ``` @@ -377,7 +377,7 @@ A real-valued meaning representation. > #### Example > > ```python -> doc = nlp(u"I like apples") +> doc = nlp("I like apples") > apples = doc[2] > assert apples.vector.dtype == "float32" > assert apples.vector.shape == (300,) @@ -394,7 +394,7 @@ The L2 norm of the token's vector representation. > #### Example > > ```python -> doc = nlp(u"I like apples and pasta") +> doc = nlp("I like apples and pasta") > apples = doc[2] > pasta = doc[4] > apples.vector_norm # 6.89589786529541 @@ -425,7 +425,7 @@ The L2 norm of the token's vector representation. | `i` | int | The index of the token within the parent document. | | `ent_type` | int | Named entity type. | | `ent_type_` | unicode | Named entity type. | -| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | +| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | | `ent_iob_` | unicode | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. | | `ent_kb_id` 2.2 | int | Knowledge base ID that refers to the named entity this token is a part of, if any. | | `ent_kb_id_` 2.2 | unicode | Knowledge base ID that refers to the named entity this token is a part of, if any. | diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index ce1ba9a21..63c1e87ea 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -5,7 +5,9 @@ tag: class source: spacy/tokenizer.pyx --- -Segment text, and create `Doc` objects with the discovered segment boundaries. For a deeper understanding, see the docs on [how spaCy's tokenizer works](/usage/linguistic-features#how-tokenizer-works). 
+Segment text, and create `Doc` objects with the discovered segment boundaries. +For a deeper understanding, see the docs on +[how spaCy's tokenizer works](/usage/linguistic-features#how-tokenizer-works). ## Tokenizer.\_\_init\_\_ {#init tag="method"} @@ -49,7 +51,7 @@ Tokenize a string. > #### Example > > ```python -> tokens = tokenizer(u"This is a sentence") +> tokens = tokenizer("This is a sentence") > assert len(tokens) == 4 > ``` @@ -65,7 +67,7 @@ Tokenize a stream of texts. > #### Example > > ```python -> texts = [u"One document.", u"...", u"Lots of documents"] +> texts = ["One document.", "...", "Lots of documents"] > for doc in tokenizer.pipe(texts, batch_size=50): > pass > ``` @@ -109,8 +111,9 @@ if no suffix rules match. Add a special-case tokenization rule. This mechanism is also used to add custom tokenizer exceptions to the language data. See the usage guide on -[adding languages](/usage/adding-languages#tokenizer-exceptions) and [linguistic features](/usage/linguistic-features#special-cases) for more -details and examples. +[adding languages](/usage/adding-languages#tokenizer-exceptions) and +[linguistic features](/usage/linguistic-features#special-cases) for more details +and examples. > #### Example > diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index e9bf48869..0a8f638b2 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -112,10 +112,10 @@ list of available terms, see > #### Example > > ```python -> spacy.explain(u"NORP") +> spacy.explain("NORP") > # Nationalities or religious or political groups > -> doc = nlp(u"Hello world") +> doc = nlp("Hello world") > for word in doc: > print(word.text, word.tag_, spacy.explain(word.tag_)) > # Hello UH interjection @@ -181,8 +181,8 @@ browser. Will run a simple web server. > import spacy > from spacy import displacy > nlp = spacy.load("en_core_web_sm") -> doc1 = nlp(u"This is a sentence.") -> doc2 = nlp(u"This is another sentence.") +> doc1 = nlp("This is a sentence.") +> doc2 = nlp("This is another sentence.") > displacy.serve([doc1, doc2], style="dep") > ``` @@ -192,7 +192,7 @@ browser. Will run a simple web server. | `style` | unicode | Visualization style, `'dep'` or `'ent'`. | `'dep'` | | `page` | bool | Render markup as full HTML page. | `True` | | `minify` | bool | Minify HTML markup. | `False` | -| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` | +| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` | | `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` | | `port` | int | Port to serve visualization. | `5000` | | `host` | unicode | Host to serve visualization. | `'0.0.0.0'` | @@ -207,7 +207,7 @@ Render a dependency parse tree or named entity visualization. > import spacy > from spacy import displacy > nlp = spacy.load("en_core_web_sm") -> doc = nlp(u"This is a sentence.") +> doc = nlp("This is a sentence.") > html = displacy.render(doc, style="dep") > ``` @@ -218,7 +218,7 @@ Render a dependency parse tree or named entity visualization. | `page` | bool | Render markup as full HTML page. | `False` | | `minify` | bool | Minify HTML markup. | `False` | | `jupyter` | bool | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None`. 
| `None` | -| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` | +| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` | | `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` | | **RETURNS** | unicode | Rendered HTML markup. | @@ -262,16 +262,18 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="ent", options=options) > ``` -| Name | Type | Description | Default | -| -------- | ---- | ------------------------------------------------------------------------------------- | ------- | -| `ents` | list | Entity types to highlight (`None` for all types). | `None` | -| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` | +| Name | Type | Description | Default | +| --------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ | +| `ents` | list | Entity types to highlight (`None` for all types). | `None` | +| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` | | `template` 2.2 | unicode | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) | By default, displaCy comes with colors for all [entity types supported by spaCy](/api/annotation#named-entities). If you're using custom entity types, you can use the `colors` setting to add your own -colors for them. Your application or model package can also expose a [`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy) to add custom labels and their colors automatically. +colors for them. Your application or model package can also expose a +[`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy) +to add custom labels and their colors automatically. ## Utility functions {#util source="spacy/util.py"} @@ -649,11 +651,11 @@ for batching. Larger `bufsize` means less bias. > shuffled = itershuffle(values) > ``` -| Name | Type | Description | -| ---------- | -------- | ------------------------------------- | -| `iterable` | iterable | Iterator to shuffle. | -| `bufsize` | int | Items to hold back (default: 1000). | -| **YIELDS** | iterable | The shuffled iterator. | +| Name | Type | Description | +| ---------- | -------- | ----------------------------------- | +| `iterable` | iterable | Iterator to shuffle. | +| `bufsize` | int | Items to hold back (default: 1000). | +| **YIELDS** | iterable | The shuffled iterator. | ### util.filter_spans {#util.filter_spans tag="function" new="2.1.4"} diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index ffc1fc083..bfe0e5f3f 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -26,7 +26,7 @@ you can add vectors to later. > empty_vectors = Vectors(shape=(10000, 300)) > > data = numpy.zeros((3, 300), dtype='f') -> keys = [u"cat", u"dog", u"rat"] +> keys = ["cat", "dog", "rat"] > vectors = Vectors(data=data, keys=keys) > ``` @@ -45,9 +45,9 @@ raised. 
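If a key may be missing from the table, the lookup can be guarded with the membership check documented further down. A minimal sketch, assuming a model that ships with word vectors, such as `en_core_web_md`, is installed:

```python
import spacy

nlp = spacy.load("en_core_web_md")  # assumed: a model that ships with vectors
cat_id = nlp.vocab.strings["cat"]
# Only index into the vector table if the key actually has an entry
if cat_id in nlp.vocab.vectors:
    cat_vector = nlp.vocab.vectors[cat_id]
```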
> #### Example > > ```python -> cat_id = nlp.vocab.strings[u"cat"] +> cat_id = nlp.vocab.strings["cat"] > cat_vector = nlp.vocab.vectors[cat_id] -> assert cat_vector == nlp.vocab[u"cat"].vector +> assert cat_vector == nlp.vocab["cat"].vector > ``` | Name | Type | Description | @@ -62,7 +62,7 @@ Set a vector for the given key. > #### Example > > ```python -> cat_id = nlp.vocab.strings[u"cat"] +> cat_id = nlp.vocab.strings["cat"] > vector = numpy.random.uniform(-1, 1, (300,)) > nlp.vocab.vectors[cat_id] = vector > ``` @@ -109,7 +109,7 @@ Check whether a key has been mapped to a vector entry in the table. > #### Example > > ```python -> cat_id = nlp.vocab.strings[u"cat"] +> cat_id = nlp.vocab.strings["cat"] > nlp.vectors.add(cat_id, numpy.random.uniform(-1, 1, (300,))) > assert cat_id in vectors > ``` @@ -132,9 +132,9 @@ mapping separately. If you need to manage the strings, you should use the > > ```python > vector = numpy.random.uniform(-1, 1, (300,)) -> cat_id = nlp.vocab.strings[u"cat"] +> cat_id = nlp.vocab.strings["cat"] > nlp.vocab.vectors.add(cat_id, vector=vector) -> nlp.vocab.vectors.add(u"dog", row=0) +> nlp.vocab.vectors.add("dog", row=0) > ``` | Name | Type | Description | @@ -218,8 +218,8 @@ Look up one or more keys by row, or vice versa. > #### Example > > ```python -> row = nlp.vocab.vectors.find(key=u"cat") -> rows = nlp.vocab.vectors.find(keys=[u"cat", u"dog"]) +> row = nlp.vocab.vectors.find(key="cat") +> rows = nlp.vocab.vectors.find(keys=["cat", "dog"]) > key = nlp.vocab.vectors.find(row=256) > keys = nlp.vocab.vectors.find(rows=[18, 256, 985]) > ``` @@ -241,7 +241,7 @@ vector table. > > ```python > vectors = Vectors(shape(1, 300)) -> vectors.add(u"cat", numpy.random.uniform(-1, 1, (300,))) +> vectors.add("cat", numpy.random.uniform(-1, 1, (300,))) > rows, dims = vectors.shape > assert rows == 1 > assert dims == 300 @@ -276,7 +276,7 @@ If a table is full, it can be resized using > > ```python > vectors = Vectors(shape=(1, 300)) -> vectors.add(u"cat", numpy.random.uniform(-1, 1, (300,))) +> vectors.add("cat", numpy.random.uniform(-1, 1, (300,))) > assert vectors.is_full > ``` diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index 22bfe324e..78e5f7541 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -18,7 +18,7 @@ Create the vocabulary. > > ```python > from spacy.vocab import Vocab -> vocab = Vocab(strings=[u"hello", u"world"]) +> vocab = Vocab(strings=["hello", "world"]) > ``` | Name | Type | Description | @@ -36,7 +36,7 @@ Get the current number of lexemes in the vocabulary. > #### Example > > ```python -> doc = nlp(u"This is a sentence.") +> doc = nlp("This is a sentence.") > assert len(nlp.vocab) > 0 > ``` @@ -52,8 +52,8 @@ unicode string is given, a new lexeme is created and stored. > #### Example > > ```python -> apple = nlp.vocab.strings[u"apple"] -> assert nlp.vocab[apple] == nlp.vocab[u"apple"] +> apple = nlp.vocab.strings["apple"] +> assert nlp.vocab[apple] == nlp.vocab["apple"] > ``` | Name | Type | Description | @@ -84,8 +84,8 @@ given string, you need to look it up in > #### Example > > ```python -> apple = nlp.vocab.strings[u"apple"] -> oov = nlp.vocab.strings[u"dskfodkfos"] +> apple = nlp.vocab.strings["apple"] +> oov = nlp.vocab.strings["dskfodkfos"] > assert apple in nlp.vocab > assert oov not in nlp.vocab > ``` @@ -106,11 +106,11 @@ using `token.check_flag(flag_id)`. 
> > ```python > def is_my_product(text): -> products = [u"spaCy", u"Thinc", u"displaCy"] +> products = ["spaCy", "Thinc", "displaCy"] > return text in products > > MY_PRODUCT = nlp.vocab.add_flag(is_my_product) -> doc = nlp(u"I like spaCy") +> doc = nlp("I like spaCy") > assert doc[2].check_flag(MY_PRODUCT) == True > ``` @@ -170,7 +170,7 @@ or hash value. If no vectors data is loaded, a `ValueError` is raised. > #### Example > > ```python -> nlp.vocab.get_vector(u"apple") +> nlp.vocab.get_vector("apple") > ``` | Name | Type | Description | @@ -186,7 +186,7 @@ or hash value. > #### Example > > ```python -> nlp.vocab.set_vector(u"apple", array([...])) +> nlp.vocab.set_vector("apple", array([...])) > ``` | Name | Type | Description | @@ -202,8 +202,8 @@ Words can be looked up by string or hash value. > #### Example > > ```python -> if nlp.vocab.has_vector(u"apple"): -> vector = nlp.vocab.get_vector(u"apple") +> if nlp.vocab.has_vector("apple"): +> vector = nlp.vocab.get_vector("apple") > ``` | Name | Type | Description | @@ -282,9 +282,9 @@ Load state from a binary string. > #### Example > > ```python -> apple_id = nlp.vocab.strings[u"apple"] +> apple_id = nlp.vocab.strings["apple"] > assert type(apple_id) == int -> PERSON = nlp.vocab.strings[u"PERSON"] +> PERSON = nlp.vocab.strings["PERSON"] > assert type(PERSON) == int > ``` diff --git a/website/docs/usage/101/_named-entities.md b/website/docs/usage/101/_named-entities.md index a282ec370..1ecaf9fe7 100644 --- a/website/docs/usage/101/_named-entities.md +++ b/website/docs/usage/101/_named-entities.md @@ -1,5 +1,5 @@ A named entity is a "real-world object" that's assigned a name – for example, a -person, a country, a product or a book title. spaCy can **recognize** +person, a country, a product or a book title. spaCy can **recognize** [various types](/api/annotation#named-entities) of named entities in a document, by asking the model for a **prediction**. Because models are statistical and strongly depend on the examples they were trained on, this doesn't always work @@ -12,7 +12,7 @@ Named entities are available as the `ents` property of a `Doc`: import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion") +doc = nlp("Apple is looking at buying U.K. startup for $1 billion") for ent in doc.ents: print(ent.text, ent.start_char, ent.end_char, ent.label_) @@ -23,10 +23,10 @@ for ent in doc.ents: > - **End:** Index of end of entity in the `Doc`. > - **Label:** Entity label, i.e. type. -| Text | Start | End | Label | Description | -| ----------- | :---: | :-: | ------- | ---------------------------------------------------- | -| Apple | 0 | 5 | `ORG` | Companies, agencies, institutions. | -| U.K. | 27 | 31 | `GPE` | Geopolitical entity, i.e. countries, cities, states. | +| Text | Start | End | Label | Description | +| ----------- | :---: | :-: | ------- | ---------------------------------------------------- | +| Apple | 0 | 5 | `ORG` | Companies, agencies, institutions. | +| U.K. | 27 | 31 | `GPE` | Geopolitical entity, i.e. countries, cities, states. | | \$1 billion | 44 | 54 | `MONEY` | Monetary values, including unit. 
| Using spaCy's built-in [displaCy visualizer](/usage/visualizers), here's what diff --git a/website/docs/usage/101/_pos-deps.md b/website/docs/usage/101/_pos-deps.md index d86ee123d..b0e2b33b8 100644 --- a/website/docs/usage/101/_pos-deps.md +++ b/website/docs/usage/101/_pos-deps.md @@ -15,8 +15,8 @@ need to add an underscore `_` to its name: ### {executable="true"} import spacy -nlp = spacy.load('en_core_web_sm') -doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion') +nlp = spacy.load("en_core_web_sm") +doc = nlp("Apple is looking at buying U.K. startup for $1 billion") for token in doc: print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, diff --git a/website/docs/usage/101/_tokenization.md b/website/docs/usage/101/_tokenization.md index e5f3d3080..764f1e62a 100644 --- a/website/docs/usage/101/_tokenization.md +++ b/website/docs/usage/101/_tokenization.md @@ -9,7 +9,7 @@ tokens, and we can iterate over them: import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion") +doc = nlp("Apple is looking at buying U.K. startup for $1 billion") for token in doc: print(token.text) ``` diff --git a/website/docs/usage/101/_vectors-similarity.md b/website/docs/usage/101/_vectors-similarity.md index 2001d1481..73c35950f 100644 --- a/website/docs/usage/101/_vectors-similarity.md +++ b/website/docs/usage/101/_vectors-similarity.md @@ -48,8 +48,8 @@ norm, which can be used to normalize vectors. ### {executable="true"} import spacy -nlp = spacy.load('en_core_web_md') -tokens = nlp(u'dog cat banana afskfsd') +nlp = spacy.load("en_core_web_md") +tokens = nlp("dog cat banana afskfsd") for token in tokens: print(token.text, token.has_vector, token.vector_norm, token.is_oov) @@ -88,8 +88,8 @@ definition of similarity. ### {executable="true"} import spacy -nlp = spacy.load('en_core_web_md') # make sure to use larger model! -tokens = nlp(u'dog cat banana') +nlp = spacy.load("en_core_web_md") # make sure to use larger model! +tokens = nlp("dog cat banana") for token1 in tokens: for token2 in tokens: diff --git a/website/docs/usage/adding-languages.md b/website/docs/usage/adding-languages.md index 6f8955326..d89891297 100644 --- a/website/docs/usage/adding-languages.md +++ b/website/docs/usage/adding-languages.md @@ -276,7 +276,7 @@ the lowercase spelling of a word exists, norms should always be in lowercase. > #### Norms vs. lemmas > > ```python -> doc = nlp(u"I'm gonna realise") +> doc = nlp("I'm gonna realise") > norms = [token.norm_ for token in doc] > lemmas = [token.lemma_ for token in doc] > assert norms == ["i", "am", "going", "to", "realize"] @@ -396,10 +396,10 @@ iterators: > #### Noun chunks example > > ```python -> doc = nlp(u"A phrase with another phrase occurs.") +> doc = nlp("A phrase with another phrase occurs.") > chunks = list(doc.noun_chunks) -> assert chunks[0].text == u"A phrase" -> assert chunks[1].text == u"another phrase" +> assert chunks[0].text == "A phrase" +> assert chunks[1].text == "another phrase" > ``` | Language | Code | Source | diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md index 1ffd0de0d..1d6c0574c 100644 --- a/website/docs/usage/index.md +++ b/website/docs/usage/index.md @@ -392,7 +392,7 @@ from is called `spacy`. So, when using spaCy, never call anything else `spacy`. 
```python -doc = nlp(u"They are") +doc = nlp("They are") print(doc[0].lemma_) # -PRON- ``` diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index fc1f159ce..a91135d70 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -69,7 +69,6 @@ of the two. The system works as follows: morphological information, without consulting the context of the token. The lemmatizer also accepts list-based exception files, acquired from [WordNet](https://wordnet.princeton.edu/). - ## Dependency Parsing {#dependency-parse model="parser"} @@ -93,7 +92,7 @@ get the noun chunks in a document, simply iterate over import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers") +doc = nlp("Autonomous cars shift insurance liability toward manufacturers") for chunk in doc.noun_chunks: print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text) @@ -124,7 +123,7 @@ get the string value with `.dep_`. import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers") +doc = nlp("Autonomous cars shift insurance liability toward manufacturers") for token in doc: print(token.text, token.dep_, token.head.text, token.head.pos_, [child for child in token.children]) @@ -161,7 +160,7 @@ import spacy from spacy.symbols import nsubj, VERB nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers") +doc = nlp("Autonomous cars shift insurance liability toward manufacturers") # Finding a verb with a subject from below — good verbs = set() @@ -204,7 +203,7 @@ children. import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"bright red apples on the tree") +doc = nlp("bright red apples on the tree") print([token.text for token in doc[2].lefts]) # ['bright', 'red'] print([token.text for token in doc[2].rights]) # ['on'] print(doc[2].n_lefts) # 2 @@ -216,7 +215,7 @@ print(doc[2].n_rights) # 1 import spacy nlp = spacy.load("de_core_news_sm") -doc = nlp(u"schöne rote Äpfel auf dem Baum") +doc = nlp("schöne rote Äpfel auf dem Baum") print([token.text for token in doc[2].lefts]) # ['schöne', 'rote'] print([token.text for token in doc[2].rights]) # ['auf'] ``` @@ -240,7 +239,7 @@ sequence of tokens. You can walk up the tree with the import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Credit and mortgage account holders must submit their requests") +doc = nlp("Credit and mortgage account holders must submit their requests") root = [token for token in doc if token.head == token][0] subject = list(root.lefts)[0] @@ -270,7 +269,7 @@ end-point of a range, don't forget to `+1`! import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Credit and mortgage account holders must submit their requests") +doc = nlp("Credit and mortgage account holders must submit their requests") span = doc[doc[4].left_edge.i : doc[4].right_edge.i+1] with doc.retokenize() as retokenizer: retokenizer.merge(span) @@ -311,7 +310,7 @@ import spacy from spacy import displacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers") +doc = nlp("Autonomous cars shift insurance liability toward manufacturers") # Since this is an interactive Jupyter environment, we can use displacy.render here displacy.render(doc, style='dep') ``` @@ -336,7 +335,7 @@ the `nlp` object. 
```python nlp = spacy.load("en_core_web_sm", disable=["parser"]) nlp = English().from_disk("/model", disable=["parser"]) -doc = nlp(u"I don't want parsed", disable=["parser"]) +doc = nlp("I don't want parsed", disable=["parser"]) ``` @@ -350,10 +349,10 @@ Language class via [`from_disk`](/api/language#from_disk). ```diff + nlp = spacy.load("en_core_web_sm", disable=["parser"]) -+ doc = nlp(u"I don't want parsed", disable=["parser"]) ++ doc = nlp("I don't want parsed", disable=["parser"]) - nlp = spacy.load("en_core_web_sm", parser=False) -- doc = nlp(u"I don't want parsed", parse=False) +- doc = nlp("I don't want parsed", parse=False) ``` @@ -398,7 +397,7 @@ on a token, it will return an empty string. import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"San Francisco considers banning sidewalk delivery robots") +doc = nlp("San Francisco considers banning sidewalk delivery robots") # document level ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] @@ -407,8 +406,8 @@ print(ents) # token level ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_] ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_] -print(ent_san) # [u'San', u'B', u'GPE'] -print(ent_francisco) # [u'Francisco', u'I', u'GPE'] +print(ent_san) # ['San', 'B', 'GPE'] +print(ent_francisco) # ['Francisco', 'I', 'GPE'] ``` | Text | ent_iob | ent_iob\_ | ent_type\_ | Description | @@ -435,18 +434,17 @@ import spacy from spacy.tokens import Span nlp = spacy.load("en_core_web_sm") -doc = nlp(u"FB is hiring a new Vice President of global policy") +doc = nlp("FB is hiring a new Vice President of global policy") ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] print('Before', ents) # the model didn't recognise "FB" as an entity :( -ORG = doc.vocab.strings[u"ORG"] # get hash value of entity label -fb_ent = Span(doc, 0, 1, label=ORG) # create a Span for the new entity +fb_ent = Span(doc, 0, 1, label="ORG") # create a Span for the new entity doc.ents = list(doc.ents) + [fb_ent] ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] print('After', ents) -# [(u'FB', 0, 2, 'ORG')] 🎉 +# [('FB', 0, 2, 'ORG')] 🎉 ``` Keep in mind that you need to create a `Span` with the start and end index of @@ -468,13 +466,13 @@ import spacy from spacy.attrs import ENT_IOB, ENT_TYPE nlp = spacy.load("en_core_web_sm") -doc = nlp.make_doc(u"London is a big city in the United Kingdom.") +doc = nlp.make_doc("London is a big city in the United Kingdom.") print("Before", doc.ents) # [] header = [ENT_IOB, ENT_TYPE] attr_array = numpy.zeros((len(doc), len(header))) attr_array[0, 0] = 3 # B -attr_array[0, 1] = doc.vocab.strings[u"GPE"] +attr_array[0, 1] = doc.vocab.strings["GPE"] doc.from_array(header, attr_array) print("After", doc.ents) # [London] ``` @@ -533,8 +531,8 @@ train_data = [ ``` ```python -doc = Doc(nlp.vocab, [u"rats", u"make", u"good", u"pets"]) -gold = GoldParse(doc, entities=[u"U-ANIMAL", u"O", u"O", u"O"]) +doc = Doc(nlp.vocab, ["rats", "make", "good", "pets"]) +gold = GoldParse(doc, entities=["U-ANIMAL", "O", "O", "O"]) ``` @@ -565,7 +563,7 @@ For more details and examples, see the import spacy from spacy import displacy -text = u"When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously." +text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously." 
nlp = spacy.load("en_core_web_sm") doc = nlp(text) @@ -578,29 +576,27 @@ import DisplacyEntHtml from 'images/displacy-ent2.html' ## Entity Linking {#entity-linking} -To ground the named entities into the "real-world", -spaCy provides functionality to perform entity linking, which resolves a textual entity -to a unique identifier from a knowledge base (KB). - -The default model assigns WikiData identifiers, but you can create your own -[`KnowledgeBase`](/api/kb) and [train a new Entity Linking model](/usage/training#entity-linker) using -that custom-made KB. +To ground the named entities into the "real-world", spaCy provides functionality +to perform entity linking, which resolves a textual entity to a unique +identifier from a knowledge base (KB). +The default model assigns WikiData identifiers, but you can create your own +[`KnowledgeBase`](/api/kb) and +[train a new Entity Linking model](/usage/training#entity-linker) using that +custom-made KB. -### Accessing entity identifiers {#accessing} - -The annotated KB identifier is accessible as either a hash value -or as a string, using the attributes -`ent.kb_id` and `ent.kb_id_` of a [`Span`](/api/span) object, -or the `ent_kb_id` and `ent_kb_id_` attributes of a [`Token`](/api/token) object. +### Accessing entity identifiers {#entity-linking-accessing} +The annotated KB identifier is accessible as either a hash value or as a string, +using the attributes `ent.kb_id` and `ent.kb_id_` of a [`Span`](/api/span) +object, or the `ent_kb_id` and `ent_kb_id_` attributes of a +[`Token`](/api/token) object. ```python -### {executable="true"} import spacy nlp = spacy.load("my_custom_el_model") -doc = nlp(u"Ada Lovelace was born in London") +doc = nlp("Ada Lovelace was born in London") # document level ents = [(e.text, e.label_, e.kb_id_) for e in doc.ents] @@ -615,14 +611,14 @@ print(ent_ada_1) # ['Lovelace', 'PERSON', 'Q7259'] print(ent_london_5) # ['London', 'GPE', 'Q84'] ``` -| Text | ent_type\_ | ent_kb_id\_ | -| --------- | ---------- | ------------ | -| Ada | `"PERSON"` | `"Q7259"` | -| Lovelace | `"PERSON"` | `"Q7259"` | -| was | `""` | `""` | -| born | `""` | `""` | -| in | `""` | `""` | -| London | `"GPE"` | `"Q84"` | +| Text | ent_type\_ | ent_kb_id\_ | +| -------- | ---------- | ----------- | +| Ada | `"PERSON"` | `"Q7259"` | +| Lovelace | `"PERSON"` | `"Q7259"` | +| was | - | - | +| born | - | - | +| in | - | - | +| London | `"GPE"` | `"Q84"` | ## Tokenization {#tokenization} @@ -692,53 +688,36 @@ this specific field. Here's how to add a special case rule to an existing ```python ### {executable="true"} import spacy -from spacy.symbols import ORTH, LEMMA, POS, TAG +from spacy.symbols import ORTH nlp = spacy.load("en_core_web_sm") -doc = nlp(u"gimme that") # phrase to tokenize +doc = nlp("gimme that") # phrase to tokenize print([w.text for w in doc]) # ['gimme', 'that'] -# add special case rule -special_case = [{ORTH: u"gim", LEMMA: u"give", POS: u"VERB"}, {ORTH: u"me"}] -nlp.tokenizer.add_special_case(u"gimme", special_case) +# Add special case rule +special_case = [{ORTH: "gim"}, {ORTH: "me"}] +nlp.tokenizer.add_special_case("gimme", special_case) -# check new tokenization -print([w.text for w in nlp(u"gimme that")]) # ['gim', 'me', 'that'] - -# Pronoun lemma is returned as -PRON-! 
-print([w.lemma_ for w in nlp(u"gimme that")]) # ['give', '-PRON-', 'that'] +# Check new tokenization +print([w.text for w in nlp("gimme that")]) # ['gim', 'me', 'that'] ``` - - -For details on spaCy's custom pronoun lemma `-PRON-`, -[see here](/usage/#pron-lemma). - - - The special case doesn't have to match an entire whitespace-delimited substring. The tokenizer will incrementally split off punctuation, and keep looking up the remaining substring: ```python -assert "gimme" not in [w.text for w in nlp(u"gimme!")] -assert "gimme" not in [w.text for w in nlp(u'("...gimme...?")')] +assert "gimme" not in [w.text for w in nlp("gimme!")] +assert "gimme" not in [w.text for w in nlp('("...gimme...?")')] ``` The special case rules have precedence over the punctuation splitting: ```python -special_case = [{ORTH: u"...gimme...?", LEMMA: u"give", TAG: u"VB"}] -nlp.tokenizer.add_special_case(u"...gimme...?", special_case) -assert len(nlp(u"...gimme...?")) == 1 +nlp.tokenizer.add_special_case("...gimme...?", [{ORTH: "...gimme...?"}]) +assert len(nlp("...gimme...?")) == 1 ``` -Because the special-case rules allow you to set arbitrary token attributes, such -as the part-of-speech, lemma, etc, they make a good mechanism for arbitrary -fix-up rules. Having this logic live in the tokenizer isn't very satisfying from -a design perspective, however, so the API may eventually be exposed on the -[`Language`](/api/language) class itself. - ### How spaCy's tokenizer works {#how-tokenizer-works} spaCy introduces a novel tokenization algorithm, that gives a better balance @@ -838,7 +817,7 @@ def custom_tokenizer(nlp): nlp = spacy.load("en_core_web_sm") nlp.tokenizer = custom_tokenizer(nlp) -doc = nlp(u"hello-world.") +doc = nlp("hello-world.") print([t.text for t in doc]) ``` @@ -955,7 +934,7 @@ class WhitespaceTokenizer(object): nlp = spacy.load("en_core_web_sm") nlp.tokenizer = WhitespaceTokenizer(nlp.vocab) -doc = nlp(u"What's happened to me? he thought. It wasn't a dream.") +doc = nlp("What's happened to me? he thought. It wasn't a dream.") print([t.text for t in doc]) ``` @@ -980,7 +959,7 @@ from spacy.tokens import Doc from spacy.lang.en import English nlp = English() -doc = Doc(nlp.vocab, words=[u"Hello", u",", u"world", u"!"], +doc = Doc(nlp.vocab, words=["Hello", ",", "world", "!"], spaces=[False, True, False, False]) print([(t.text, t.text_with_ws, t.whitespace_) for t in doc]) ``` @@ -997,8 +976,8 @@ from spacy.tokens import Doc from spacy.lang.en import English nlp = English() -bad_spaces = Doc(nlp.vocab, words=[u"Hello", u",", u"world", u"!"]) -good_spaces = Doc(nlp.vocab, words=[u"Hello", u",", u"world", u"!"], +bad_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"]) +good_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"], spaces=[False, True, False, False]) print(bad_spaces.text) # 'Hello , world !' @@ -1280,7 +1259,7 @@ that yields [`Span`](/api/span) objects. import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"This is a sentence. This is another sentence.") +doc = nlp("This is a sentence. This is another sentence.") for sent in doc.sents: print(sent.text) ``` @@ -1300,7 +1279,7 @@ from spacy.lang.en import English nlp = English() # just the language with no model sentencizer = nlp.create_pipe("sentencizer") nlp.add_pipe(sentencizer) -doc = nlp(u"This is a sentence. This is another sentence.") +doc = nlp("This is a sentence. This is another sentence.") for sent in doc.sents: print(sent.text) ``` @@ -1336,7 +1315,7 @@ take advantage of dependency-based sentence segmentation. 
### {executable="true"} import spacy -text = u"this is a sentence...hello...and another sentence." +text = "this is a sentence...hello...and another sentence." nlp = spacy.load("en_core_web_sm") doc = nlp(text) diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index 5df4ab458..a8a478949 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -120,7 +120,7 @@ python -m spacy download en_core_web_sm ```python import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"This is a sentence.") +doc = nlp("This is a sentence.") ``` @@ -197,7 +197,7 @@ nlp = spacy.load("en_core_web_sm") # load model package "en_core_web_s nlp = spacy.load("/path/to/en_core_web_sm") # load package from a directory nlp = spacy.load("en") # load model with shortcut link "en" -doc = nlp(u"This is a sentence.") +doc = nlp("This is a sentence.") ``` @@ -269,7 +269,7 @@ also `import` it and then call its `load()` method with no arguments: import en_core_web_sm nlp = en_core_web_sm.load() -doc = nlp(u"This is a sentence.") +doc = nlp("This is a sentence.") ``` How you choose to load your models ultimately depends on personal preference. diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 51a57d7f5..dcd182965 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -20,7 +20,7 @@ component** on the `Doc`, in order. It then returns the processed `Doc` that you can work with. ```python -doc = nlp(u"This is a text") +doc = nlp("This is a text") ``` When processing large volumes of text, the statistical models are usually more @@ -29,7 +29,7 @@ efficient if you let them work on batches of texts. spaCy's processed `Doc` objects. The batching is done internally. ```diff -texts = [u"This is a text", u"These are lots of texts", u"..."] +texts = ["This is a text", "These are lots of texts", "..."] - docs = [nlp(text) for text in texts] + docs = list(nlp.pipe(texts)) ``` @@ -172,7 +172,7 @@ which is then processed by the component next in the pipeline. ```python ### The pipeline under the hood -doc = nlp.make_doc(u"This is a sentence") # create a Doc from raw text +doc = nlp.make_doc("This is a sentence") # create a Doc from raw text for name, proc in nlp.pipeline: # iterate over components in order doc = proc(doc) # apply each component ``` @@ -263,12 +263,12 @@ blocks. ### Disable for block # 1. Use as a contextmanager with nlp.disable_pipes("tagger", "parser"): - doc = nlp(u"I won't be tagged and parsed") -doc = nlp(u"I will be tagged and parsed") + doc = nlp("I won't be tagged and parsed") +doc = nlp("I will be tagged and parsed") # 2. Restore manually disabled = nlp.disable_pipes("ner") -doc = nlp(u"I won't have named entities") +doc = nlp("I won't have named entities") disabled.restore() ``` @@ -295,11 +295,11 @@ initializing a Language class via [`from_disk`](/api/language#from_disk). 
```diff - nlp = spacy.load('en', tagger=False, entity=False) -- doc = nlp(u"I don't want parsed", parse=False) +- doc = nlp("I don't want parsed", parse=False) + nlp = spacy.load("en", disable=["ner"]) + nlp.remove_pipe("parser") -+ doc = nlp(u"I don't want parsed") ++ doc = nlp("I don't want parsed") ``` @@ -376,7 +376,7 @@ def my_component(doc): nlp = spacy.load("en_core_web_sm") nlp.add_pipe(my_component, name="print_info", last=True) print(nlp.pipe_names) # ['tagger', 'parser', 'ner', 'print_info'] -doc = nlp(u"This is a sentence.") +doc = nlp("This is a sentence.") ``` @@ -426,14 +426,14 @@ class EntityMatcher(object): return doc nlp = spacy.load("en_core_web_sm") -terms = (u"cat", u"dog", u"tree kangaroo", u"giant sea spider") +terms = ("cat", "dog", "tree kangaroo", "giant sea spider") entity_matcher = EntityMatcher(nlp, terms, "ANIMAL") nlp.add_pipe(entity_matcher, after="ner") print(nlp.pipe_names) # The components in the pipeline -doc = nlp(u"This is a text about Barack Obama and a tree kangaroo") +doc = nlp("This is a text about Barack Obama and a tree kangaroo") print([(ent.text, ent.label_) for ent in doc.ents]) ``` @@ -471,7 +471,7 @@ def custom_sentencizer(doc): nlp = spacy.load("en_core_web_sm") nlp.add_pipe(custom_sentencizer, before="parser") # Insert before the parser -doc = nlp(u"This is. A sentence. | This is. Another sentence.") +doc = nlp("This is. A sentence. | This is. Another sentence.") for sent in doc.sents: print(sent.text) ``` @@ -517,7 +517,7 @@ config parameters are passed all the way down from components with custom settings: ```python -nlp = spacy.load("your_custom_model", terms=(u"tree kangaroo"), label="ANIMAL") +nlp = spacy.load("your_custom_model", terms=["tree kangaroo"], label="ANIMAL") ``` @@ -617,7 +617,7 @@ raise an `AttributeError`. ### Example from spacy.tokens import Doc, Span, Token -fruits = [u"apple", u"pear", u"banana", u"orange", u"strawberry"] +fruits = ["apple", "pear", "banana", "orange", "strawberry"] is_fruit_getter = lambda token: token.text in fruits has_fruit_getter = lambda obj: any([t.text in fruits for t in obj]) @@ -629,7 +629,7 @@ Span.set_extension("has_fruit", getter=has_fruit_getter) > #### Usage example > > ```python -> doc = nlp(u"I have an apple and a melon") +> doc = nlp("I have an apple and a melon") > assert doc[3]._.is_fruit # get Token attributes > assert not doc[0]._.is_fruit > assert doc._.has_fruit # get Doc attributes diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 1d67625a5..4c398ecd0 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -90,7 +90,7 @@ the pattern is not going to produce any results. When developing complex patterns, make sure to check examples against spaCy's tokenization: ```python -doc = nlp(u"A complex-example,!") +doc = nlp("A complex-example,!") print([token.text for token in doc]) ``` @@ -113,7 +113,7 @@ matcher = Matcher(nlp.vocab) pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}] matcher.add("HelloWorld", None, pattern) -doc = nlp(u"Hello, world! Hello world!") +doc = nlp("Hello, world! 
Hello world!") matches = matcher(doc) for match_id, start, end in matches: string_id = nlp.vocab.strings[match_id] # Get string representation @@ -447,7 +447,7 @@ def add_event_ent(matcher, doc, i, matches): pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}] matcher.add("GoogleIO", add_event_ent, pattern) -doc = nlp(u"This is a text about Google I/O") +doc = nlp("This is a text about Google I/O") matches = matcher(doc) ``` @@ -539,7 +539,7 @@ class BadHTMLMerger(object): nlp = spacy.load("en_core_web_sm") html_merger = BadHTMLMerger(nlp) nlp.add_pipe(html_merger, last=True) # Add component to the pipeline -doc = nlp(u"Hello
<br>world! <br/>This is a test.") +doc = nlp("Hello<br>world! <br/>
This is a test.") for token in doc: print(token.text, token._.bad_html) @@ -617,7 +617,7 @@ def collect_sents(matcher, doc, i, matches): pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"}, {"POS": "ADJ"}] matcher.add("FacebookIs", collect_sents, pattern) # add pattern -doc = nlp(u"I'd say that Facebook is evil. – Facebook is pretty cool, right?") +doc = nlp("I'd say that Facebook is evil. – Facebook is pretty cool, right?") matches = matcher(doc) # Serve visualization of sentences containing match with displaCy @@ -673,7 +673,7 @@ pattern = [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"}, {"ORTH": "-", "OP": "?"}, {"SHAPE": "ddd"}] matcher.add("PHONE_NUMBER", None, pattern) -doc = nlp(u"Call me at (123) 456 789 or (123) 456 789!") +doc = nlp("Call me at (123) 456 789 or (123) 456 789!") print([t.text for t in doc]) matches = matcher(doc) for match_id, start, end in matches: @@ -719,8 +719,8 @@ from spacy.matcher import Matcher nlp = English() # We only want the tokenizer, so no need to load a model matcher = Matcher(nlp.vocab) -pos_emoji = [u"😀", u"😃", u"😂", u"🤣", u"😊", u"😍"] # Positive emoji -neg_emoji = [u"😞", u"😠", u"😩", u"😢", u"😭", u"😒"] # Negative emoji +pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"] # Positive emoji +neg_emoji = ["😞", "😠", "😩", "😢", "😭", "😒"] # Negative emoji # Add patterns to match one or more emoji tokens pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji] @@ -740,7 +740,7 @@ matcher.add("SAD", label_sentiment, *neg_patterns) # Add negative pattern # Add pattern for valid hashtag, i.e. '#' plus any ASCII token matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}]) -doc = nlp(u"Hello world 😀 #MondayMotivation") +doc = nlp("Hello world 😀 #MondayMotivation") matches = matcher(doc) for match_id, start, end in matches: string_id = doc.vocab.strings[match_id] # Look up string ID @@ -797,7 +797,7 @@ matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}]) # Register token extension Token.set_extension("is_hashtag", default=False) -doc = nlp(u"Hello world 😀 #MondayMotivation") +doc = nlp("Hello world 😀 #MondayMotivation") matches = matcher(doc) hashtags = [] for match_id, start, end in matches: @@ -838,13 +838,13 @@ from spacy.matcher import PhraseMatcher nlp = spacy.load('en_core_web_sm') matcher = PhraseMatcher(nlp.vocab) -terms = [u"Barack Obama", u"Angela Merkel", u"Washington, D.C."] +terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."] # Only run nlp.make_doc to speed things up patterns = [nlp.make_doc(text) for text in terms] matcher.add("TerminologyList", None, *patterns) -doc = nlp(u"German Chancellor Angela Merkel and US President Barack Obama " - u"converse in the Oval Office inside the White House in Washington, D.C.") +doc = nlp("German Chancellor Angela Merkel and US President Barack Obama " + "converse in the Oval Office inside the White House in Washington, D.C.") matches = matcher(doc) for match_id, start, end in matches: span = doc[start:end] @@ -853,8 +853,8 @@ for match_id, start, end in matches: Since spaCy is used for processing both the patterns and the text to be matched, you won't have to worry about specific tokenization – for example, you can -simply pass in `nlp(u"Washington, D.C.")` and won't have to write a complex -token pattern covering the exact tokenization of the term. +simply pass in `nlp("Washington, D.C.")` and won't have to write a complex token +pattern covering the exact tokenization of the term. 
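For very large terminology lists, the pattern `Doc` objects can also be created as a stream rather than one at a time. A sketch along the lines of the example above; the model name and terms are placeholders:

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]
# Tokenizer.pipe streams the pattern Docs without running the full pipeline
patterns = list(nlp.tokenizer.pipe(terms))
matcher.add("TerminologyList", None, *patterns)
```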
@@ -889,10 +889,10 @@ from spacy.matcher import PhraseMatcher nlp = English() matcher = PhraseMatcher(nlp.vocab, attr="LOWER") -patterns = [nlp.make_doc(name) for name in [u"Angela Merkel", u"Barack Obama"]] +patterns = [nlp.make_doc(name) for name in ["Angela Merkel", "Barack Obama"]] matcher.add("Names", None, *patterns) -doc = nlp(u"angela merkel and us president barack Obama") +doc = nlp("angela merkel and us president barack Obama") for match_id, start, end in matcher(doc): print("Matched based on lowercase token text:", doc[start:end]) ``` @@ -924,9 +924,9 @@ from spacy.matcher import PhraseMatcher nlp = English() matcher = PhraseMatcher(nlp.vocab, attr="SHAPE") -matcher.add("IP", None, nlp(u"127.0.0.1"), nlp(u"127.127.0.0")) +matcher.add("IP", None, nlp("127.0.0.1"), nlp("127.127.0.0")) -doc = nlp(u"Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.") +doc = nlp("Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.") for match_id, start, end in matcher(doc): print("Matched based on token shape:", doc[start:end]) ``` @@ -982,7 +982,7 @@ patterns = [{"label": "ORG", "pattern": "Apple"}, ruler.add_patterns(patterns) nlp.add_pipe(ruler) -doc = nlp(u"Apple is opening its first big office in San Francisco.") +doc = nlp("Apple is opening its first big office in San Francisco.") print([(ent.text, ent.label_) for ent in doc.ents]) ``` @@ -1006,7 +1006,7 @@ patterns = [{"label": "ORG", "pattern": "MyCorp Inc."}] ruler.add_patterns(patterns) nlp.add_pipe(ruler) -doc = nlp(u"MyCorp Inc. is a company in the U.S.") +doc = nlp("MyCorp Inc. is a company in the U.S.") print([(ent.text, ent.label_) for ent in doc.ents]) ``` diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index 1ad4824fa..d592277aa 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -64,7 +64,7 @@ _then_ loads in the binary data. You can read more about this process > #### Example > > ```python -> doc = nlp(u"This is a text.") +> doc = nlp("This is a text.") > data = pickle.dumps(doc) > ``` @@ -84,8 +84,8 @@ the _same_ `Vocab` object, it will only be included once. ```python ### Pickling objects with shared data {highlight="8-9"} -doc1 = nlp(u"Hello world") -doc2 = nlp(u"This is a test") +doc1 = nlp("Hello world") +doc2 = nlp("This is a test") doc1_data = pickle.dumps(doc1) doc2_data = pickle.dumps(doc2) @@ -347,7 +347,7 @@ spaCy is now able to create the pipeline component `'snek'`: >>> nlp = English() >>> snek = nlp.create_pipe("snek") # this now works! 🐍🎉 >>> nlp.add_pipe(snek) ->>> doc = nlp(u"I am snek") +>>> doc = nlp("I am snek") --..,_ _,.--. `'.'. .'`__ o `;__. '.'. .'.'` '---'` ` @@ -497,8 +497,8 @@ If you're training a named entity recognition model for a custom domain, you may end up training different labels that don't have pre-defined colors in the [`displacy` visualizer](/usage/visualizers#ent). The `spacy_displacy_colors` entry point lets you define a dictionary of entity labels mapped to their color -values. It's added to the pre-defined colors and can also overwrite -existing values. +values. It's added to the pre-defined colors and can also overwrite existing +values. > #### Domain-specific NER labels > @@ -528,8 +528,8 @@ setup( ``` After installing the package, the the custom colors will be used when -visualizing text with `displacy`. Whenever the label `SNEK` is assigned, it -will be displayed in `#3dff74`. +visualizing text with `displacy`. 
Whenever the label `SNEK` is assigned, it will +be displayed in `#3dff74`. import DisplaCyEntSnekHtml from 'images/displacy-ent-snek.html' diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md index 081b6d896..12d789410 100644 --- a/website/docs/usage/spacy-101.md +++ b/website/docs/usage/spacy-101.md @@ -179,7 +179,7 @@ processed `Doc`: import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion") +doc = nlp("Apple is looking at buying U.K. startup for $1 billion") for token in doc: print(token.text, token.pos_, token.dep_) ``` @@ -240,8 +240,8 @@ of a model, see the usage guides on -To learn more about entity linking in spaCy, and how to **train and update** -the entity linker predictions, see the usage guides on +To learn more about entity linking in spaCy, and how to **train and update** the +entity linker predictions, see the usage guides on [entity linking](/usage/linguistic-features#entity-linking) and [training the entity linker](/usage/training#entity-linker). @@ -307,8 +307,8 @@ its hash, or a hash to get its string: import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"I love coffee") -print(doc.vocab.strings[u"coffee"]) # 3197928453018144401 +doc = nlp("I love coffee") +print(doc.vocab.strings["coffee"]) # 3197928453018144401 print(doc.vocab.strings[3197928453018144401]) # 'coffee' ``` @@ -331,7 +331,7 @@ ever change. Its hash value will also always be the same. import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"I love coffee") +doc = nlp("I love coffee") for word in doc: lexeme = doc.vocab[word.text] print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_, @@ -372,14 +372,14 @@ from spacy.tokens import Doc from spacy.vocab import Vocab nlp = spacy.load("en_core_web_sm") -doc = nlp(u"I love coffee") # Original Doc -print(doc.vocab.strings[u"coffee"]) # 3197928453018144401 +doc = nlp("I love coffee") # Original Doc +print(doc.vocab.strings["coffee"]) # 3197928453018144401 print(doc.vocab.strings[3197928453018144401]) # 'coffee' 👍 empty_doc = Doc(Vocab()) # New Doc with empty Vocab # empty_doc.vocab.strings[3197928453018144401] will raise an error :( -empty_doc.vocab.strings.add(u"coffee") # Add "coffee" and generate hash +empty_doc.vocab.strings.add("coffee") # Add "coffee" and generate hash print(empty_doc.vocab.strings[3197928453018144401]) # 'coffee' 👍 new_doc = Doc(doc.vocab) # Create new doc with first doc's vocab @@ -396,20 +396,24 @@ it. ## Knowledge Base {#kb} To support the entity linking task, spaCy stores external knowledge in a -[`KnowledgeBase`](/api/kb). The knowledge base (KB) uses the `Vocab` to store its -data efficiently. +[`KnowledgeBase`](/api/kb). The knowledge base (KB) uses the `Vocab` to store +its data efficiently. > - **Mention**: A textual occurrence of a named entity, e.g. 'Miss Lovelace'. -> - **KB ID**: A unique identifier refering to a particular real-world concept, e.g. 'Q7259'. -> - **Alias**: A plausible synonym or description for a certain KB ID, e.g. 'Ada Lovelace'. -> - **Prior probability**: The probability of a certain mention resolving to a certain KB ID, -prior to knowing anything about the context in which the mention is used. -> - **Entity vector**: A pretrained word vector capturing the entity description. - -A knowledge base is created by first adding all entities to it. Next, for each -potential mention or alias, a list of relevant KB IDs and their prior probabilities -is added. 
The sum of these prior probabilities should never exceed 1 for any given alias. +> - **KB ID**: A unique identifier refering to a particular real-world concept, +> e.g. 'Q7259'. +> - **Alias**: A plausible synonym or description for a certain KB ID, e.g. 'Ada +> Lovelace'. +> - **Prior probability**: The probability of a certain mention resolving to a +> certain KB ID, prior to knowing anything about the context in which the +> mention is used. +> - **Entity vector**: A pretrained word vector capturing the entity +> description. +A knowledge base is created by first adding all entities to it. Next, for each +potential mention or alias, a list of relevant KB IDs and their prior +probabilities is added. The sum of these prior probabilities should never exceed +1 for any given alias. ```python ### {executable="true"} @@ -436,10 +440,10 @@ print("Number of aliases in KB:", kb.get_size_aliases()) # 2 ### Candidate generation -Given a textual entity, the Knowledge Base can provide a list of plausible candidates or -entity identifiers. The [`EntityLinker`](/api/entitylinker) will take this list of candidates -as input, and disambiguate the mention to the most probable identifier, given the -document context. +Given a textual entity, the Knowledge Base can provide a list of plausible +candidates or entity identifiers. The [`EntityLinker`](/api/entitylinker) will +take this list of candidates as input, and disambiguate the mention to the most +probable identifier, given the document context. ```python ### {executable="true"} @@ -520,11 +524,11 @@ python -m spacy download de_core_news_sm import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Hello, world. Here are two sentences.") +doc = nlp("Hello, world. Here are two sentences.") print([t.text for t in doc]) nlp_de = spacy.load("de_core_news_sm") -doc_de = nlp_de(u"Ich bin ein Berliner.") +doc_de = nlp_de("Ich bin ein Berliner.") print([t.text for t in doc_de]) ``` @@ -543,8 +547,8 @@ print([t.text for t in doc_de]) import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Peach emoji is where it has always been. Peach is the superior " - u"emoji. It's outranking eggplant 🍑 ") +doc = nlp("Peach emoji is where it has always been. Peach is the superior " + "emoji. It's outranking eggplant 🍑 ") print(doc[0].text) # 'Peach' print(doc[1].text) # 'emoji' print(doc[-1].text) # '🍑' @@ -572,7 +576,7 @@ print(sentences[1].text) # 'Peach is the superior emoji.' import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion") +doc = nlp("Apple is looking at buying U.K. 
startup for $1 billion") apple = doc[0] print("Fine-grained POS tag", apple.pos_, apple.pos) print("Coarse-grained POS tag", apple.tag_, apple.tag) @@ -600,20 +604,20 @@ print("Like an email address?", billion.like_email) import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"I love coffee") +doc = nlp("I love coffee") -coffee_hash = nlp.vocab.strings[u"coffee"] # 3197928453018144401 +coffee_hash = nlp.vocab.strings["coffee"] # 3197928453018144401 coffee_text = nlp.vocab.strings[coffee_hash] # 'coffee' print(coffee_hash, coffee_text) print(doc[2].orth, coffee_hash) # 3197928453018144401 print(doc[2].text, coffee_text) # 'coffee' -beer_hash = doc.vocab.strings.add(u"beer") # 3073001599257881079 +beer_hash = doc.vocab.strings.add("beer") # 3073001599257881079 beer_text = doc.vocab.strings[beer_hash] # 'beer' print(beer_hash, beer_text) -unicorn_hash = doc.vocab.strings.add(u"🦄 ") # 18234233413267120783 -unicorn_text = doc.vocab.strings[unicorn_hash] # '🦄 ' +unicorn_hash = doc.vocab.strings.add("🦄") # 18234233413267120783 +unicorn_text = doc.vocab.strings[unicorn_hash] # '🦄' print(unicorn_hash, unicorn_text) ``` @@ -629,19 +633,17 @@ print(unicorn_hash, unicorn_text) ```python ### {executable="true"} import spacy - -nlp = spacy.load("en_core_web_sm") -doc = nlp(u"San Francisco considers banning sidewalk delivery robots") -for ent in doc.ents: - print(ent.text, ent.start_char, ent.end_char, ent.label_) - from spacy.tokens import Span -doc = nlp(u"FB is hiring a new VP of global policy") -doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u"ORG"])] +nlp = spacy.load("en_core_web_sm") +doc = nlp("San Francisco considers banning sidewalk delivery robots") for ent in doc.ents: print(ent.text, ent.start_char, ent.end_char, ent.label_) +doc = nlp("FB is hiring a new VP of global policy") +doc.ents = [Span(doc, 0, 1, label="ORG")] +for ent in doc.ents: + print(ent.text, ent.start_char, ent.end_char, ent.label_) ``` @@ -657,7 +659,7 @@ import spacy import random nlp = spacy.load("en_core_web_sm") -train_data = [(u"Uber blew through $1 million", {"entities": [(0, 4, "ORG")]})] +train_data = [("Uber blew through $1 million", {"entities": [(0, 4, "ORG")]})] other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"] with nlp.disable_pipes(*other_pipes): @@ -685,11 +687,11 @@ nlp.to_disk("/model") ```python from spacy import displacy -doc_dep = nlp(u"This is a sentence.") +doc_dep = nlp("This is a sentence.") displacy.serve(doc_dep, style="dep") -doc_ent = nlp(u"When Sebastian Thrun started working on self-driving cars at Google " - u"in 2007, few people outside of the company took him seriously.") +doc_ent = nlp("When Sebastian Thrun started working on self-driving cars at Google " + "in 2007, few people outside of the company took him seriously.") displacy.serve(doc_ent, style="ent") ``` @@ -707,7 +709,7 @@ displacy.serve(doc_ent, style="ent") import spacy nlp = spacy.load("en_core_web_md") -doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.") +doc = nlp("Apple and banana are similar. 
Pasta and hippo aren't.") apple = doc[0] banana = doc[2] @@ -769,7 +771,7 @@ pattern2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", " matcher.add("GoogleIO", None, pattern1) # Match "Google I/O" or "Google i/o" matcher.add("HAPPY", set_sentiment, *pattern2) # Match one or more happy emoji -doc = nlp(u"A text about Google I/O 😀😀") +doc = nlp("A text about Google I/O 😀😀") matches = matcher(doc) for match_id, start, end in matches: @@ -789,7 +791,7 @@ print("Sentiment", doc.sentiment) ### Minibatched stream processing {#lightning-tour-minibatched} ```python -texts = [u"One document.", u"...", u"Lots of documents"] +texts = ["One document.", "...", "Lots of documents"] # .pipe streams input, and produces streaming output iter_texts = (texts[i % 3] for i in range(100000000)) for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50)): @@ -805,8 +807,8 @@ for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50)): import spacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"When Sebastian Thrun started working on self-driving cars at Google " - u"in 2007, few people outside of the company took him seriously.") +doc = nlp("When Sebastian Thrun started working on self-driving cars at Google " + "in 2007, few people outside of the company took him seriously.") dep_labels = [] for token in doc: @@ -831,7 +833,7 @@ import spacy from spacy.attrs import ORTH, LIKE_URL nlp = spacy.load("en_core_web_sm") -doc = nlp(u"Check out https://spacy.io") +doc = nlp("Check out https://spacy.io") for token in doc: print(token.text, token.orth, token.like_url) @@ -877,7 +879,7 @@ def put_spans_around_tokens(doc): nlp = spacy.load("en_core_web_sm") -doc = nlp(u"This is a test.\\n\\nHello world.") +doc = nlp("This is a test.\\n\\nHello world.") html = put_spans_around_tokens(doc) print(html) ``` diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 9489615bc..f84fd0ed4 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -298,10 +298,10 @@ imports. It also makes it easier to structure and load your training data. ```python ### Simple training loop TRAIN_DATA = [ - (u"Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}), - (u"Google rebrands its business apps", {"entities": [(0, 6, "ORG")]})] + ("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}), + ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]})] -nlp = spacy.blank('en') +nlp = spacy.blank("en") optimizer = nlp.begin_training() for i in range(20): random.shuffle(TRAIN_DATA) @@ -498,7 +498,7 @@ like this: ![Custom dependencies](../images/displacy-custom-parser.svg) ```python -doc = nlp(u"find a hotel with good wifi") +doc = nlp("find a hotel with good wifi") print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != '-']) # [('find', 'ROOT', 'find'), ('hotel', 'PLACE', 'find'), # ('good', 'QUALITY', 'wifi'), ('wifi', 'ATTRIBUTE', 'hotel')] diff --git a/website/docs/usage/v2-1.md b/website/docs/usage/v2-1.md index d3c9fb504..4a8ef5a37 100644 --- a/website/docs/usage/v2-1.md +++ b/website/docs/usage/v2-1.md @@ -99,8 +99,8 @@ flexibility. > > ```python > matcher = PhraseMatcher(nlp.vocab, attr="POS") -> matcher.add("PATTERN", None, nlp(u"I love cats")) -> doc = nlp(u"You like dogs") +> matcher.add("PATTERN", None, nlp("I love cats")) +> doc = nlp("You like dogs") > matches = matcher(doc) > ``` @@ -122,9 +122,9 @@ or `POS` for finding sequences of the same part-of-speech tags. 
> #### Example > > ```python -> doc = nlp(u"I like David Bowie") +> doc = nlp("I like David Bowie") > with doc.retokenize() as retokenizer: -> attrs = {"LEMMA": u"David Bowie"} +> attrs = {"LEMMA": "David Bowie"} > retokenizer.merge(doc[2:4], attrs=attrs) > ``` diff --git a/website/docs/usage/v2.md b/website/docs/usage/v2.md index 9e54106c7..d7011fb2d 100644 --- a/website/docs/usage/v2.md +++ b/website/docs/usage/v2.md @@ -156,7 +156,7 @@ spaCy or plug in your own machine learning models. > for itn in range(100): > for doc, gold in train_data: > nlp.update([doc], [gold]) -> doc = nlp(u"This is a text.") +> doc = nlp("This is a text.") > print(doc.cats) > ``` @@ -179,13 +179,13 @@ network to assign position-sensitive vectors to each word in the document. > #### Example > > ```python -> doc = nlp(u"I love coffee") -> assert doc.vocab.strings[u"coffee"] == 3197928453018144401 -> assert doc.vocab.strings[3197928453018144401] == u"coffee" +> doc = nlp("I love coffee") +> assert doc.vocab.strings["coffee"] == 3197928453018144401 +> assert doc.vocab.strings[3197928453018144401] == "coffee" > -> beer_hash = doc.vocab.strings.add(u"beer") -> assert doc.vocab.strings[u"beer"] == beer_hash -> assert doc.vocab.strings[beer_hash] == u"beer" +> beer_hash = doc.vocab.strings.add("beer") +> assert doc.vocab.strings["beer"] == beer_hash +> assert doc.vocab.strings[beer_hash] == "beer" > ``` The [`StringStore`](/api/stringstore) now resolves all strings to hash values @@ -275,7 +275,7 @@ language, you can import the class directly, e.g. > > ```python > from spacy import displacy -> doc = nlp(u"This is a sentence about Facebook.") +> doc = nlp("This is a sentence about Facebook.") > displacy.serve(doc, style="dep") # run the web server > html = displacy.render(doc, style="ent") # generate HTML > ``` @@ -322,7 +322,7 @@ lookup-based lemmatization – and **many new languages**! > matcher.add('HEARTS', None, [{"ORTH": "❤️", "OP": '+'}]) > > phrasematcher = PhraseMatcher(nlp.vocab) -> phrasematcher.add("OBAMA", None, nlp(u"Barack Obama")) +> phrasematcher.add("OBAMA", None, nlp("Barack Obama")) > ``` Patterns can now be added to the matcher by calling @@ -477,12 +477,12 @@ to the `disable` keyword argument on load, or by using [`disable_pipes`](/api/language#disable_pipes) as a method or context manager: ```diff -- nlp = spacy.load("en", tagger=False, entity=False) -- doc = nlp(u"I don't want parsed", parse=False) +- nlp = spacy.load("en_core_web_sm", tagger=False, entity=False) +- doc = nlp("I don't want parsed", parse=False) -+ nlp = spacy.load("en", disable=["tagger", "ner"]) ++ nlp = spacy.load("en_core_web_sm", disable=["tagger", "ner"]) + with nlp.disable_pipes("parser"): -+ doc = nlp(u"I don't want parsed") ++ doc = nlp("I don't want parsed") ``` To add spaCy's built-in pipeline components to your pipeline, you can still @@ -539,7 +539,7 @@ This means that your application can – and should – only pass around `Doc` objects and refer to them as the single source of truth. ```diff -- doc = nlp(u"This is a regular doc") +- doc = nlp("This is a regular doc") - doc_array = doc.to_array(["ORTH", "POS"]) - doc_with_meta = {"doc_array": doc_array, "meta": get_doc_meta(doc_array)} @@ -556,11 +556,11 @@ utilities that interact with the pipeline, consider moving this logic into its own extension module. 
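For illustration, here is a minimal sketch of what a component like the
`meta_component` used in the snippet below could look like. The extension name
`meta` and the values stored in it are assumptions made for this example, not
part of spaCy's API:

```python
import spacy
from spacy.tokens import Doc

# Register a custom attribute – the name "meta" and its contents are
# hypothetical and only serve to illustrate the pattern.
Doc.set_extension("meta", default=None)

def meta_component(doc):
    doc._.meta = {"n_tokens": len(doc), "has_urls": any(t.like_url for t in doc)}
    return doc

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(meta_component, last=True)
doc = nlp("Doc with a custom pipeline that assigns meta")
print(doc._.meta)
```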
```diff -- doc = nlp(u"Doc with a standard pipeline") +- doc = nlp("Doc with a standard pipeline") - meta = get_meta(doc) + nlp.add_pipe(meta_component) -+ doc = nlp(u"Doc with a custom pipeline that assigns meta") ++ doc = nlp("Doc with a custom pipeline that assigns meta") + meta = doc._.meta ``` @@ -572,12 +572,12 @@ to call [`StringStore.add`](/api/stringstore#add) explicitly. You can also now be sure that the string-to-hash mapping will always match across vocabularies. ```diff -- nlp.vocab.strings[u"coffee"] # 3672 -- other_nlp.vocab.strings[u"coffee"] # 40259 +- nlp.vocab.strings["coffee"] # 3672 +- other_nlp.vocab.strings["coffee"] # 40259 -+ nlp.vocab.strings.add(u"coffee") -+ nlp.vocab.strings[u"coffee"] # 3197928453018144401 -+ other_nlp.vocab.strings[u"coffee"] # 3197928453018144401 ++ nlp.vocab.strings.add("coffee") ++ nlp.vocab.strings["coffee"] # 3197928453018144401 ++ other_nlp.vocab.strings["coffee"] # 3197928453018144401 ``` ### Adding patterns and callbacks to the matcher {#migrating-matcher} diff --git a/website/docs/usage/vectors-similarity.md b/website/docs/usage/vectors-similarity.md index f7c9d1cd9..53648f66e 100644 --- a/website/docs/usage/vectors-similarity.md +++ b/website/docs/usage/vectors-similarity.md @@ -74,8 +74,8 @@ path to [`spacy.load()`](/api/top-level#spacy.load). ```python nlp_latin = spacy.load("/tmp/la_vectors_wiki_lg") -doc1 = nlp_latin(u"Caecilius est in horto") -doc2 = nlp_latin(u"servus est in atrio") +doc1 = nlp_latin("Caecilius est in horto") +doc2 = nlp_latin("servus est in atrio") doc1.similarity(doc2) ``` @@ -168,10 +168,9 @@ vectors to the vocabulary, you can use the ### Adding vectors from spacy.vocab import Vocab -vector_data = {u"dog": numpy.random.uniform(-1, 1, (300,)), - u"cat": numpy.random.uniform(-1, 1, (300,)), - u"orange": numpy.random.uniform(-1, 1, (300,))} - +vector_data = {"dog": numpy.random.uniform(-1, 1, (300,)), + "cat": numpy.random.uniform(-1, 1, (300,)), + "orange": numpy.random.uniform(-1, 1, (300,))} vocab = Vocab() for word, vector in vector_data.items(): vocab.set_vector(word, vector) @@ -241,7 +240,7 @@ import cupy.cuda from spacy.vectors import Vectors vector_table = numpy.zeros((3, 300), dtype="f") -vectors = Vectors([u"dog", u"cat", u"orange"], vector_table) +vectors = Vectors(["dog", "cat", "orange"], vector_table) with cupy.cuda.Device(0): vectors.data = cupy.asarray(vectors.data) ``` @@ -252,6 +251,6 @@ import torch from spacy.vectors import Vectors vector_table = numpy.zeros((3, 300), dtype="f") -vectors = Vectors([u"dog", u"cat", u"orange"], vector_table) +vectors = Vectors(["dog", "cat", "orange"], vector_table) vectors.data = torch.Tensor(vectors.data).cuda(0) ``` diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md index 6172d2f48..dd0b0eb50 100644 --- a/website/docs/usage/visualizers.md +++ b/website/docs/usage/visualizers.md @@ -48,7 +48,7 @@ import spacy from spacy import displacy nlp = spacy.load("en_core_web_sm") -doc = nlp(u"This is a sentence.") +doc = nlp("This is a sentence.") displacy.serve(doc, style="dep") ``` @@ -101,7 +101,7 @@ import spacy from spacy import displacy nlp = spacy.load("en_core_web_sm") -text = u"""In ancient Rome, some neighbors live in three adjacent houses. In the center is the house of Senex, who lives there with wife Domina, son Hero, and several slaves, including head slave Hysterium and the musical's main character Pseudolus. A slave belonging to Hero, Pseudolus wishes to buy, win, or steal his freedom. 
One of the neighboring houses is owned by Marcus Lycus, who is a buyer and seller of beautiful women; the other belongs to the ancient Erronius, who is abroad searching for his long-lost children (stolen in infancy by pirates). One day, Senex and Domina go on a trip and leave Pseudolus in charge of Hero. Hero confides in Pseudolus that he is in love with the lovely Philia, one of the courtesans in the House of Lycus (albeit still a virgin).""" +text = """In ancient Rome, some neighbors live in three adjacent houses. In the center is the house of Senex, who lives there with wife Domina, son Hero, and several slaves, including head slave Hysterium and the musical's main character Pseudolus. A slave belonging to Hero, Pseudolus wishes to buy, win, or steal his freedom. One of the neighboring houses is owned by Marcus Lycus, who is a buyer and seller of beautiful women; the other belongs to the ancient Erronius, who is abroad searching for his long-lost children (stolen in infancy by pirates). One day, Senex and Domina go on a trip and leave Pseudolus in charge of Hero. Hero confides in Pseudolus that he is in love with the lovely Philia, one of the courtesans in the House of Lycus (albeit still a virgin).""" doc = nlp(text) sentence_spans = list(doc.sents) displacy.serve(sentence_spans, style="dep") @@ -117,7 +117,7 @@ text. import spacy from spacy import displacy -text = u"When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously." +text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously." nlp = spacy.load("en_core_web_sm") doc = nlp(text) @@ -168,7 +168,7 @@ add a headline to each visualization, you can add a `title` to its `user_data`. User data is never touched or modified by spaCy. ```python -doc = nlp(u"This is a sentence about Google.") +doc = nlp("This is a sentence about Google.") doc.user_data["title"] = "This is a title" displacy.serve(doc, style="ent") ``` @@ -193,7 +193,7 @@ import spacy from spacy import displacy # In[2]: -doc = nlp(u"Rats are various medium-sized, long-tailed rodents.") +doc = nlp("Rats are various medium-sized, long-tailed rodents.") displacy.render(doc, style="dep") # In[3]: @@ -209,7 +209,6 @@ rendering if auto-detection fails. 
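If auto-detection ever misfires, a short sketch of overriding it explicitly via
the `jupyter` keyword argument – assuming the same example document as above:

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Rats are various medium-sized, long-tailed rodents.")

# Force notebook-style output (only meaningful when IPython is available) ...
displacy.render(doc, style="dep", jupyter=True)
# ... or request raw markup even when running inside a notebook.
html = displacy.render(doc, style="dep", jupyter=False)
```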
- ![displaCy visualizer in a Jupyter notebook](../images/displacy_jupyter.jpg) Internally, displaCy imports `display` and `HTML` from `IPython.core.display` @@ -236,8 +235,8 @@ import spacy from spacy import displacy nlp = spacy.load("en_core_web_sm") -doc1 = nlp(u"This is a sentence.") -doc2 = nlp(u"This is another sentence.") +doc1 = nlp("This is a sentence.") +doc2 = nlp("This is another sentence.") html = displacy.render([doc1, doc2], style="dep", page=True) ``` @@ -281,7 +280,7 @@ from spacy import displacy from pathlib import Path nlp = spacy.load("en_core_web_sm") -sentences = [u"This is an example.", u"This is another one."] +sentences = ["This is an example.", "This is another one."] for sent in sentences: doc = nlp(sent) svg = displacy.render(doc, style="dep", jupyter=False) diff --git a/website/meta/universe.json b/website/meta/universe.json index 2997f9300..f9dae7ead 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -119,14 +119,14 @@ "emoji = Emoji(nlp)", "nlp.add_pipe(emoji, first=True)", "", - "doc = nlp(u'This is a test 😻 👍🏿')", + "doc = nlp('This is a test 😻 👍🏿')", "assert doc._.has_emoji == True", "assert doc[2:5]._.has_emoji == True", "assert doc[0]._.is_emoji == False", "assert doc[4]._.is_emoji == True", - "assert doc[5]._.emoji_desc == u'thumbs up dark skin tone'", + "assert doc[5]._.emoji_desc == 'thumbs up dark skin tone'", "assert len(doc._.emoji) == 2", - "assert doc._.emoji[1] == (u'👍🏿', 5, u'thumbs up dark skin tone')" + "assert doc._.emoji[1] == ('👍🏿', 5, 'thumbs up dark skin tone')" ], "author": "Ines Montani", "author_links": { @@ -747,8 +747,8 @@ "s2v = Sense2VecComponent('/path/to/reddit_vectors-1.1.0')", "nlp.add_pipe(s2v)", "", - "doc = nlp(u\"A sentence about natural language processing.\")", - "assert doc[3].text == u'natural language processing'", + "doc = nlp(\"A sentence about natural language processing.\")", + "assert doc[3].text == 'natural language processing'", "freq = doc[3]._.s2v_freq", "vector = doc[3]._.s2v_vec", "most_similar = doc[3]._.s2v_most_similar(3)", @@ -1297,7 +1297,7 @@ "", "nlp = spacy.load('en')", "nlp.add_pipe(BeneparComponent('benepar_en'))", - "doc = nlp(u'The time for action is now. It's never too late to do something.')", + "doc = nlp('The time for action is now. It's never too late to do something.')", "sent = list(doc.sents)[0]", "print(sent._.parse_string)", "# (S (NP (NP (DT The) (NN time)) (PP (IN for) (NP (NN action)))) (VP (VBZ is) (ADVP (RB now))) (. .))", diff --git a/website/src/widgets/quickstart-models.js b/website/src/widgets/quickstart-models.js index 83bb4527b..d116fae0a 100644 --- a/website/src/widgets/quickstart-models.js +++ b/website/src/widgets/quickstart-models.js @@ -65,7 +65,7 @@ const QuickstartInstall = ({ id, title, description, defaultLang, children }) => nlp = {pkg}.load() - doc = nlp(u"{exampleText}") + doc = nlp("{exampleText}") print([