Mirror of https://github.com/explosion/spaCy.git
Fix typos, text, examples and formatting
Commit b2324be3e9 (parent dcb10da615)
@@ -7,7 +7,7 @@
 "Lightning tour": "lightning-tour",
 "What's new in v2.0": "v2"
 },
-"Workflows": {
+"Guides": {
 "POS tagging": "pos-tagging",
 "Using the parse": "dependency-parse",
 "Entity recognition": "entity-recognition",
@@ -2,9 +2,9 @@
 
 p
 | When you call #[code nlp] on a text, spaCy first tokenizes the text to
-| produce a #[code Doc] object. The #[code Doc] is the processed in several
+| produce a #[code Doc] object. The #[code Doc] is then processed in several
 | different steps – this is also referred to as the
-| #[strong processing pipeline]. The pipeline used by our
+| #[strong processing pipeline]. The pipeline used by the
 | #[+a("/docs/usage/models") default models] consists of a
 | vectorizer, a tagger, a parser and an entity recognizer. Each pipeline
 | component returns the processed #[code Doc], which is then passed on to
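For context, the pipeline this hunk describes can be inspected at runtime. A minimal sketch, assuming spaCy v2.x and an installed 'en' model (the model name is illustrative):

    import spacy

    nlp = spacy.load('en')             # load the default English model
    print(nlp.pipe_names)              # e.g. ['tagger', 'parser', 'ner']

    doc = nlp(u'This is a sentence.')  # runs every pipeline component and returns a Doc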
@@ -28,7 +28,7 @@ p
 | #[strong Text:] The original word text.#[br]
 | #[strong Lemma:] The base form of the word.#[br]
 | #[strong POS:] The simple part-of-speech tag.#[br]
-| #[strong Tag:] ...#[br]
+| #[strong Tag:] The detailed part-of-speech tag.#[br]
 | #[strong Dep:] Syntactic dependency, i.e. the relation between tokens.#[br]
 | #[strong Shape:] The word shape – capitalisation, punctuation, digits.#[br]
 | #[strong is alpha:] Is the token an alpha character?#[br]
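The attributes listed in this hunk correspond to Token properties. A minimal sketch, assuming the 'en' model is installed:

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

    for token in doc:
        # text, lemma, coarse POS, detailed tag, dependency label,
        # word shape, and whether the token is alphabetic
        print(token.text, token.lemma_, token.pos_, token.tag_,
              token.dep_, token.shape_, token.is_alpha)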
@@ -33,3 +33,8 @@ p
 +annotation-row(["from_bytes", "object", "nlp.from_bytes(bytes)"], style)
 +annotation-row(["to_disk", "-", "nlp.to_disk('/path')"], style)
 +annotation-row(["from_disk", "object", "nlp.from_disk('/path')"], style)
+
++code.
+moby_dick = open('moby_dick.txt', 'r') # open a large document
+doc = nlp(moby_dick) # process it
+doc.to_disk('/moby_dick.bin') # save the processed Doc
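Note that the snippet added here passes the open file object straight to nlp; a runnable variant would read the text first. A minimal sketch, assuming spaCy v2.x, an installed 'en' model and a local moby_dick.txt (the file name comes from the example):

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.load('en')

    text = open('moby_dick.txt', 'r').read()   # read the raw text, not the file object
    doc = nlp(text)                            # process it
    doc.to_disk('/moby_dick.bin')              # save the processed Doc

    new_doc = Doc(nlp.vocab).from_disk('/moby_dick.bin')   # restore it later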
@@ -2,11 +2,11 @@
 
 p
 | During processing, spaCy first #[strong tokenizes] the text, i.e.
-| segments it into words, punctuation and so on. For example, punctuation
-| at the end of a sentence should be split off – whereas "U.K." should
-| remain one token. This is done by applying rules specific to each
-| language. Each #[code Doc] consists of individual tokens, and we can
-| simply iterate over them:
+| segments it into words, punctuation and so on. This is done by applying
+| rules specific to each language. For example, punctuation at the end of a
+| sentence should be split off – whereas "U.K." should remain one token.
+| Each #[code Doc] consists of individual tokens, and we can simply iterate
+| over them:
 
 +code.
 for token in doc:
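The loop this hunk leads into iterates over the tokens of a Doc. A minimal sketch, assuming the 'en' model:

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

    for token in doc:
        print(token.text)   # 'U.K.' stays one token; '$' and '1' are split apart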
@@ -6,7 +6,7 @@ p
 | vectors can be generated using an algorithm like
 | #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. Most of spaCy's
 | #[+a("/docs/usage/models") default models] come with
-| #[strong 300-dimensional vectors], that look like this:
+| #[strong 300-dimensional vectors] that look like this:
 
 +code("banana.vector", false, false, 250).
 array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01,
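A minimal sketch of inspecting the vectors described here, assuming a model that actually ships with word vectors is linked to 'en' (smaller models may not include them):

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'banana sasquatch')

    for token in doc:
        # has_vector flags whether a real vector is available for this token
        print(token.text, token.has_vector, token.vector_norm, len(token.vector))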
@@ -52,7 +52,7 @@ p
 assert ent_san == [u'San', u'B', u'GPE']
 assert ent_francisco == [u'Francisco', u'I', u'GPE']
 
-+table(["Text", "ent_iob", "ent.iob_", "ent_type", "ent_type_", "Description"])
++table(["Text", "ent_iob", "ent_iob_", "ent_type", "ent_type_", "Description"])
 - var style = [0, 1, 1, 1, 1, 0]
 +annotation-row(["San", 3, "B", 381, "GPE", "beginning of an entity"], style)
 +annotation-row(["Francisco", 1, "I", 381, "GPE", "inside an entity"], style)
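The asserts in this hunk come from token-level entity attributes. A minimal sketch, assuming the 'en' model; the sentence is purely illustrative:

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'San Francisco considers banning sidewalk delivery robots')

    # ent_iob_ is the IOB code ('B', 'I' or 'O'), ent_type_ the entity label
    ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_]
    ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_]

    assert ent_san == [u'San', u'B', u'GPE']
    assert ent_francisco == [u'Francisco', u'I', u'GPE']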
@@ -344,8 +344,7 @@ p
 | Since spaCy v2.0 comes with better support for customising the
 | processing pipeline components, the #[code parser], #[code tagger]
 | and #[code entity] keyword arguments have been replaced with
-| #[code disable], which takes a list of
-| #[+a("/docs/usage/language-processing-pipeline") pipeline component names].
+| #[code disable], which takes a list of pipeline component names.
 | This lets you disable both default and custom components when loading
 | a model, or initialising a Language class via
 | #[+api("language-from_disk") #[code from_disk]].
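A minimal sketch of the disable keyword described here, assuming spaCy v2.x (the paths and component names are illustrative):

    import spacy
    from spacy.lang.en import English

    # skip pipeline components by name when loading a model ...
    nlp = spacy.load('en', disable=['parser', 'ner'])

    # ... or, per the text above, when initialising a Language class from disk
    nlp = English().from_disk('/path/to/model', disable=['tagger'])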
@@ -2,16 +2,12 @@
 
 include ../../_includes/_mixins
 
-p
-| Once you have loaded the #[code nlp] object, you can call it as though
-| it were a function. This allows you to process a single unicode string.
-
 +h(2, "multithreading") Multi-threading with #[code .pipe()]
 
 p
 | If you have a sequence of documents to process, you should use the
-| #[+api("language#pipe") #[code .pipe()]] method. The #[code .pipe()]
-| method takes an iterator of texts, and accumulates an internal buffer,
+| #[+api("language#pipe") #[code .pipe()]] method. The method takes an
+| iterator of texts, and accumulates an internal buffer,
 | which it works on in parallel. It then yields the documents in order,
 | one-by-one. After a long and bitter struggle, the global interpreter
 | lock was freed around spaCy's main parsing loop in v0.100.3. This means
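A minimal sketch of the .pipe() pattern described here; texts can be any iterable of unicode strings and nlp is an already loaded model:

    texts = [u'First document.', u'Second document.', u'Third document.']

    # .pipe() buffers and processes the texts in batches, yielding Docs in order
    for doc in nlp.pipe(texts, batch_size=50):
        print(len(doc))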
@@ -209,5 +209,5 @@ p
 | spaCy v2.0 solves this with a clear distinction between setting up
 | the instance and loading the data.
 
-+code-new nlp = English.from_disk('/path/to/data')
++code-new nlp = English().from_disk('/path/to/data')
 +code-old nlp = spacy.load('en', path='/path/to/data')
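For context, the corrected form in +code-new separates constructing the Language instance from loading its data. A minimal sketch, assuming spaCy v2.x and data previously saved with nlp.to_disk():

    from spacy.lang.en import English

    nlp = English()                        # set up a blank English pipeline
    nlp = nlp.from_disk('/path/to/data')   # then load the saved model data into it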
@@ -81,6 +81,12 @@ p
 nlp = spacy.load('en')
 doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
 
+p
+| Even though a #[code Doc] is processed – e.g. split into individual words
+| and annotated – it still holds #[strong all information of the original text],
+| like whitespace characters. This way, you'll never lose any information
+| when processing text with spaCy.
+
 +h(3, "annotations-token") Tokenization
 
 include _spacy-101/_tokenization
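The claim that a processed Doc keeps all information of the original text can be checked directly. A minimal sketch:

    import spacy

    nlp = spacy.load('en')
    text = u'Apple is looking at buying U.K. startup for $1 billion'
    doc = nlp(text)

    # the Doc reproduces the input exactly, including whitespace
    assert doc.text == text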
@@ -180,8 +180,8 @@ p
 p
 | If you don't need the web server and just want to generate the markup
 | – for example, to export it to a file or serve it in a custom
-| way – you can use #[+api("displacy#render") #[code displacy.render]]
-| instead. It works the same, but returns a string containing the markup.
+| way – you can use #[+api("displacy#render") #[code displacy.render]].
+| It works the same way, but returns a string containing the markup.
 
 +code("Example").
 import spacy
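A minimal sketch of generating the markup without the web server, as described in this hunk ('parse.html' is a hypothetical output file):

    import io
    import spacy
    from spacy import displacy

    nlp = spacy.load('en')
    doc = nlp(u'This is a sentence.')

    # render() returns the markup as a string instead of serving it
    html = displacy.render(doc, style='dep', page=True)
    with io.open('parse.html', 'w', encoding='utf-8') as f:
        f.write(html)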
@@ -220,10 +220,32 @@ p
 | a standalone graphic.) So instead of rendering all #[code Doc]s at one,
 | loop over them and export them separately.
 
+
++h(3, "examples-export-svg") Example: Export SVG graphics of dependency parses
+
++code("Example").
+import spacy
+from spacy import displacy
+from pathlib import Path
+
+nlp = spacy.load('en')
+sentences = ["This is an example.", "This is another one."]
+for sent in sentences:
+doc = nlp(sentence)
+svg = displacy.render(doc, style='dep')
+file_name = '-'.join([w.text for w in doc if not w.is_punct]) + '.svg'
+output_path = Path('/images/' + file_name)
+output_path.open('w', encoding='utf-8').write(svg)
+
+p
+| The above code will generate the dependency visualizations and them to
+| two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg].
+
+
 +h(2, "jupyter") Using displaCy in Jupyter notebooks
 
 p
-| displaCy is able to detect whether you're within a
+| displaCy is able to detect whether you're working in a
 | #[+a("https://jupyter.org") Jupyter] notebook, and will return markup
 | that can be rendered in a cell straight away. When you export your
 | notebook, the visualizations will be included as HTML.
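One slip in the example added above: the loop variable is sent, but nlp() is called on sentence (and the closing sentence presumably means "and save them to two files"). A corrected, runnable variant of the same sketch ('/images' is the example's output directory):

    import spacy
    from spacy import displacy
    from pathlib import Path

    nlp = spacy.load('en')
    sentences = [u'This is an example.', u'This is another one.']

    for sent in sentences:
        doc = nlp(sent)                                  # use the loop variable
        svg = displacy.render(doc, style='dep')
        file_name = '-'.join([w.text for w in doc if not w.is_punct]) + '.svg'
        output_path = Path('/images') / file_name
        output_path.open('w', encoding='utf-8').write(svg)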
@@ -257,28 +279,6 @@ p
 html = displacy.render(doc, style='dep')
 return display(HTML(html))
 
-+h(2, "examples") Usage examples
-
-+h(3, "examples-export-svg") Export SVG graphics of dependency parses
-
-+code("Example").
-import spacy
-from spacy import displacy
-from pathlib import Path
-
-nlp = spacy.load('en')
-sentences = ["This is an example.", "This is another one."]
-for sent in sentences:
-doc = nlp(sentence)
-svg = displacy.render(doc, style='dep')
-file_name = '-'.join([w.text for w in doc if not w.is_punct]) + '.svg'
-output_path = Path('/images/' + file_name)
-output_path.open('w', encoding='utf-8').write(svg)
-
-p
-| The above code will generate the dependency visualizations and them to
-| two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg].
-
 +h(2, "manual-usage") Rendering data manually
 
 p