Fix typos, text, examples and formatting

ines 2017-05-25 11:17:21 +02:00
parent dcb10da615
commit b2324be3e9
12 changed files with 51 additions and 45 deletions

View File

@ -7,7 +7,7 @@
"Lightning tour": "lightning-tour",
"What's new in v2.0": "v2"
},
"Workflows": {
"Guides": {
"POS tagging": "pos-tagging",
"Using the parse": "dependency-parse",
"Entity recognition": "entity-recognition",

View File

@ -2,9 +2,9 @@
p
| When you call #[code nlp] on a text, spaCy first tokenizes the text to
- | produce a #[code Doc] object. The #[code Doc] is the processed in several
+ | produce a #[code Doc] object. The #[code Doc] is then processed in several
| different steps – this is also referred to as the
- | #[strong processing pipeline]. The pipeline used by our
+ | #[strong processing pipeline]. The pipeline used by the
| #[+a("/docs/usage/models") default models] consists of a
| vectorizer, a tagger, a parser and an entity recognizer. Each pipeline
| component returns the processed #[code Doc], which is then passed on to
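p
| As a quick illustration of the pipeline idea – a sketch assuming the v2.x
| API, where the component names depend on the model – you can inspect which
| components an #[code nlp] object will apply:
+code.
    import spacy

    nlp = spacy.load('en')
    print(nlp.pipe_names)  # component names, e.g. ['tagger', 'parser', 'ner']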

View File

@ -28,7 +28,7 @@ p
| #[strong Text:] The original word text.#[br]
| #[strong Lemma:] The base form of the word.#[br]
| #[strong POS:] The simple part-of-speech tag.#[br]
- | #[strong Tag:] ...#[br]
+ | #[strong Tag:] The detailed part-of-speech tag.#[br]
| #[strong Dep:] Syntactic dependency, i.e. the relation between tokens.#[br]
| #[strong Shape:] The word shape – capitalisation, punctuation, digits.#[br]
| #[strong is alpha:] Is the token an alpha character?#[br]
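p
| For example, a minimal sketch that prints these attributes for each token
| of a processed #[code Doc]:
+code.
    for token in doc:
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
              token.shape_, token.is_alpha)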

View File

@ -33,3 +33,8 @@ p
+annotation-row(["from_bytes", "object", "nlp.from_bytes(bytes)"], style)
+annotation-row(["to_disk", "-", "nlp.to_disk('/path')"], style)
+annotation-row(["from_disk", "object", "nlp.from_disk('/path')"], style)
+ +code.
+     moby_dick = open('moby_dick.txt', 'r').read() # open and read a large document
+     doc = nlp(moby_dick)                          # process it
+     doc.to_disk('/moby_dick.bin')                 # save the processed Doc
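p
| To read the #[code Doc] back in, a minimal sketch (assuming the v2
| serialization API) creates an empty #[code Doc] and restores the saved
| data from disk:
+code.
    from spacy.tokens import Doc   # to create an empty Doc
    from spacy.vocab import Vocab  # to create an empty Vocab

    doc = Doc(Vocab()).from_disk('/moby_dick.bin')  # load the processed Doc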

View File

@ -2,11 +2,11 @@
p
| During processing, spaCy first #[strong tokenizes] the text, i.e.
- | segments it into words, punctuation and so on. For example, punctuation
- | at the end of a sentence should be split off – whereas "U.K." should
- | remain one token. This is done by applying rules specific to each
- | language. Each #[code Doc] consists of individual tokens, and we can
- | simply iterate over them:
+ | segments it into words, punctuation and so on. This is done by applying
+ | rules specific to each language. For example, punctuation at the end of a
+ | sentence should be split off – whereas "U.K." should remain one token.
+ | Each #[code Doc] consists of individual tokens, and we can simply iterate
+ | over them:
+code.
    for token in doc:
        print(token.text)  # loop body is cut off in this hunk; printing the token text is an assumption

View File

@ -6,7 +6,7 @@ p
| vectors can be generated using an algorithm like
| #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. Most of spaCy's
| #[+a("/docs/usage/models") default models] come with
- | #[strong 300-dimensional vectors], that look like this:
+ | #[strong 300-dimensional vectors] that look like this:
+code("banana.vector", false, false, 250).
array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01,
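p
| As a rough sketch of looking a vector up (assuming a loaded model that
| ships with word vectors):
+code.
    doc = nlp(u'banana')
    print(doc[0].has_vector)    # whether a vector is available for this token
    print(doc[0].vector.shape)  # e.g. (300,) for 300-dimensional vectors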

View File

@ -52,7 +52,7 @@ p
assert ent_san == [u'San', u'B', u'GPE']
assert ent_francisco == [u'Francisco', u'I', u'GPE']
+table(["Text", "ent_iob", "ent.iob_", "ent_type", "ent_type_", "Description"])
+table(["Text", "ent_iob", "ent_iob_", "ent_type", "ent_type_", "Description"])
- var style = [0, 1, 1, 1, 1, 0]
+annotation-row(["San", 3, "B", 381, "GPE", "beginning of an entity"], style)
+annotation-row(["Francisco", 1, "I", 381, "GPE", "inside an entity"], style)

View File

@ -344,8 +344,7 @@ p
| Since spaCy v2.0 comes with better support for customising the
| processing pipeline components, the #[code parser], #[code tagger]
| and #[code entity] keyword arguments have been replaced with
- | #[code disable], which takes a list of
- | #[+a("/docs/usage/language-processing-pipeline") pipeline component names].
+ | #[code disable], which takes a list of pipeline component names.
| This lets you disable both default and custom components when loading
| a model, or initialising a Language class via
| #[+api("language-from_disk") #[code from_disk]].

View File

@ -2,16 +2,12 @@
include ../../_includes/_mixins
p
| Once you have loaded the #[code nlp] object, you can call it as though
| it were a function. This allows you to process a single unicode string.
+h(2, "multithreading") Multi-threading with #[code .pipe()]
p
| If you have a sequence of documents to process, you should use the
| #[+api("language#pipe") #[code .pipe()]] method. The #[code .pipe()]
| method takes an iterator of texts, and accumulates an internal buffer,
| #[+api("language#pipe") #[code .pipe()]] method. The method takes an
| iterator of texts, and accumulates an internal buffer,
| which it works on in parallel. It then yields the documents in order,
| one-by-one. After a long and bitter struggle, the global interpreter
| lock was freed around spaCy's main parsing loop in v0.100.3. This means
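p
| A rough sketch of consuming a stream of texts with #[code .pipe()] – the
| keyword arguments shown are assumptions, not required:
+code.
    texts = [u'First document.', u'Second document.']
    for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
        print(doc.is_parsed)  # documents come back fully processed, in order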

View File

@ -209,5 +209,5 @@ p
| spaCy v2.0 solves this with a clear distinction between setting up
| the instance and loading the data.
- +code-new nlp = English.from_disk('/path/to/data')
+ +code-new nlp = English().from_disk('/path/to/data')
+code-old nlp = spacy.load('en', path='/path/to/data')

View File

@ -81,6 +81,12 @@ p
nlp = spacy.load('en')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
+ p
+ | Even though a #[code Doc] is processed – e.g. split into individual words
+ | and annotated – it still holds #[strong all information of the original text],
+ | like whitespace characters. This way, you'll never lose any information
+ | when processing text with spaCy.
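p
| As a quick sketch, one way to verify this is that the processed
| #[code Doc]'s text always matches the original input string:
+code.
    text = u'Apple is looking at buying U.K. startup for $1 billion'
    doc = nlp(text)
    assert doc.text == text  # whitespace and original text are preserved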
+h(3, "annotations-token") Tokenization
include _spacy-101/_tokenization

View File

@ -180,8 +180,8 @@ p
p
| If you don't need the web server and just want to generate the markup –
| for example, to export it to a file or serve it in a custom
- | way – you can use #[+api("displacy#render") #[code displacy.render]]
- | instead. It works the same, but returns a string containing the markup.
+ | way – you can use #[+api("displacy#render") #[code displacy.render]].
+ | It works the same way, but returns a string containing the markup.
+code("Example").
import spacy
@ -220,10 +220,32 @@ p
| a standalone graphic.) So instead of rendering all #[code Doc]s at once,
| loop over them and export them separately.
+h(3, "examples-export-svg") Example: Export SVG graphics of dependency parses
+code("Example").
import spacy
from spacy import displacy
from pathlib import Path
nlp = spacy.load('en')
sentences = ["This is an example.", "This is another one."]
for sent in sentences:
doc = nlp(sentence)
svg = displacy.render(doc, style='dep')
file_name = '-'.join([w.text for w in doc if not w.is_punct]) + '.svg'
output_path = Path('/images/' + file_name)
output_path.open('w', encoding='utf-8').write(svg)
p
| The above code will generate the dependency visualizations and them to
| two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg].
+h(2, "jupyter") Using displaCy in Jupyter notebooks
p
- | displaCy is able to detect whether you're within a
+ | displaCy is able to detect whether you're working in a
| #[+a("https://jupyter.org") Jupyter] notebook, and will return markup
| that can be rendered in a cell straight away. When you export your
| notebook, the visualizations will be included as HTML.
@ -257,28 +279,6 @@ p
html = displacy.render(doc, style='dep')
return display(HTML(html))
+h(2, "examples") Usage examples
+h(3, "examples-export-svg") Export SVG graphics of dependency parses
+code("Example").
import spacy
from spacy import displacy
from pathlib import Path
nlp = spacy.load('en')
sentences = ["This is an example.", "This is another one."]
for sent in sentences:
doc = nlp(sentence)
svg = displacy.render(doc, style='dep')
file_name = '-'.join([w.text for w in doc if not w.is_punct]) + '.svg'
output_path = Path('/images/' + file_name)
output_path.open('w', encoding='utf-8').write(svg)
p
| The above code will generate the dependency visualizations and them to
| two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg].
+h(2, "manual-usage") Rendering data manually
p