diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json
index 9f51df5c4..a611151b3 100644
--- a/website/docs/usage/_data.json
+++ b/website/docs/usage/_data.json
@@ -7,7 +7,7 @@
         "Lightning tour": "lightning-tour",
         "What's new in v2.0": "v2"
     },
-    "Workflows": {
+    "Guides": {
         "POS tagging": "pos-tagging",
         "Using the parse": "dependency-parse",
         "Entity recognition": "entity-recognition",
diff --git a/website/docs/usage/_spacy-101/_pipelines.jade b/website/docs/usage/_spacy-101/_pipelines.jade
index d984a4708..db095ef04 100644
--- a/website/docs/usage/_spacy-101/_pipelines.jade
+++ b/website/docs/usage/_spacy-101/_pipelines.jade
@@ -2,9 +2,9 @@

 p
     | When you call #[code nlp] on a text, spaCy first tokenizes the text to
-    | produce a #[code Doc] object. The #[code Doc] is the processed in several
+    | produce a #[code Doc] object. The #[code Doc] is then processed in several
     | different steps – this is also referred to as the
-    | #[strong processing pipeline]. The pipeline used by our
+    | #[strong processing pipeline]. The pipeline used by the
     | #[+a("/docs/usage/models") default models] consists of a
     | vectorizer, a tagger, a parser and an entity recognizer. Each pipeline
     | component returns the processed #[code Doc], which is then passed on to
diff --git a/website/docs/usage/_spacy-101/_pos-deps.jade b/website/docs/usage/_spacy-101/_pos-deps.jade
index 5aa719c23..b42847aee 100644
--- a/website/docs/usage/_spacy-101/_pos-deps.jade
+++ b/website/docs/usage/_spacy-101/_pos-deps.jade
@@ -28,7 +28,7 @@ p
     | #[strong Text:] The original word text.#[br]
     | #[strong Lemma:] The base form of the word.#[br]
     | #[strong POS:] The simple part-of-speech tag.#[br]
-    | #[strong Tag:] ...#[br]
+    | #[strong Tag:] The detailed part-of-speech tag.#[br]
     | #[strong Dep:] Syntactic dependency, i.e. the relation between tokens.#[br]
     | #[strong Shape:] The word shape – capitalisation, punctuation, digits.#[br]
     | #[strong is alpha:] Is the token an alpha character?#[br]
diff --git a/website/docs/usage/_spacy-101/_serialization.jade b/website/docs/usage/_spacy-101/_serialization.jade
index b6a889014..f3926dd9c 100644
--- a/website/docs/usage/_spacy-101/_serialization.jade
+++ b/website/docs/usage/_spacy-101/_serialization.jade
@@ -33,3 +33,8 @@ p
     +annotation-row(["from_bytes", "object", "nlp.from_bytes(bytes)"], style)
     +annotation-row(["to_disk", "-", "nlp.to_disk('/path')"], style)
     +annotation-row(["from_disk", "object", "nlp.from_disk('/path')"], style)
+
++code.
+    moby_dick = open('moby_dick.txt', 'r').read() # open and read a large document
+    doc = nlp(moby_dick) # process it
+    doc.to_disk('/moby_dick.bin') # save the processed Doc
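The serialization hunk above adds the save-to-disk snippet; the full round trip is worth seeing in one place. The following is a minimal sketch, not part of the diff – it assumes spaCy v2.0 with the English model installed, and the paths are placeholders:

    import spacy
    from spacy.lang.en import English
    from spacy.tokens import Doc

    nlp = spacy.load('en')
    doc = nlp(u'This is a sentence.')

    # whole pipeline: write everything to a directory, then restore it
    # into a freshly constructed instance
    nlp.to_disk('/path/to/en_model')
    nlp2 = English().from_disk('/path/to/en_model')

    # single Doc: serialize to a bytestring and rebuild it against a Vocab
    doc_bytes = doc.to_bytes()
    new_doc = Doc(nlp2.vocab).from_bytes(doc_bytes)
    assert new_doc.text == doc.text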
diff --git a/website/docs/usage/_spacy-101/_tokenization.jade b/website/docs/usage/_spacy-101/_tokenization.jade
index 28fd448b4..64e3f5881 100644
--- a/website/docs/usage/_spacy-101/_tokenization.jade
+++ b/website/docs/usage/_spacy-101/_tokenization.jade
@@ -2,11 +2,11 @@

 p
     | During processing, spaCy first #[strong tokenizes] the text, i.e.
-    | segments it into words, punctuation and so on. For example, punctuation
-    | at the end of a sentence should be split off – whereas "U.K." should
-    | remain one token. This is done by applying rules specific to each
-    | language. Each #[code Doc] consists of individual tokens, and we can
-    | simply iterate over them:
+    | segments it into words, punctuation and so on. This is done by applying
+    | rules specific to each language. For example, punctuation at the end of a
+    | sentence should be split off – whereas "U.K." should remain one token.
+    | Each #[code Doc] consists of individual tokens, and we can simply iterate
+    | over them:

 +code.
     for token in doc:
diff --git a/website/docs/usage/_spacy-101/_word-vectors.jade b/website/docs/usage/_spacy-101/_word-vectors.jade
index 4ed8e4c78..cbb9d06f2 100644
--- a/website/docs/usage/_spacy-101/_word-vectors.jade
+++ b/website/docs/usage/_spacy-101/_word-vectors.jade
@@ -6,7 +6,7 @@ p
     | vectors can be generated using an algorithm like
     | #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. Most of spaCy's
     | #[+a("/docs/usage/models") default models] come with
-    | #[strong 300-dimensional vectors], that look like this:
+    | #[strong 300-dimensional vectors] that look like this:

 +code("banana.vector", false, false, 250).
     array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01,
diff --git a/website/docs/usage/entity-recognition.jade b/website/docs/usage/entity-recognition.jade
index bcad07baa..527c14dde 100644
--- a/website/docs/usage/entity-recognition.jade
+++ b/website/docs/usage/entity-recognition.jade
@@ -52,7 +52,7 @@ p
     assert ent_san == [u'San', u'B', u'GPE']
     assert ent_francisco == [u'Francisco', u'I', u'GPE']

-+table(["Text", "ent_iob", "ent.iob_", "ent_type", "ent_type_", "Description"])
++table(["Text", "ent_iob", "ent_iob_", "ent_type", "ent_type_", "Description"])
     - var style = [0, 1, 1, 1, 1, 0]
     +annotation-row(["San", 3, "B", 381, "GPE", "beginning of an entity"], style)
     +annotation-row(["Francisco", 1, "I", 381, "GPE", "inside an entity"], style)
diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade
index 8bb92caae..948212d82 100644
--- a/website/docs/usage/language-processing-pipeline.jade
+++ b/website/docs/usage/language-processing-pipeline.jade
@@ -344,8 +344,7 @@ p
     | Since spaCy v2.0 comes with better support for customising the
     | processing pipeline components, the #[code parser], #[code tagger]
     | and #[code entity] keyword arguments have been replaced with
-    | #[code disable], which takes a list of
-    | #[+a("/docs/usage/language-processing-pipeline") pipeline component names].
+    | #[code disable], which takes a list of pipeline component names.
     | This lets you disable both default and custom components when loading
     | a model, or initialising a Language class via
     | #[+api("language-from_disk") #[code from_disk]].
diff --git a/website/docs/usage/production-use.jade b/website/docs/usage/production-use.jade
index 68a313d8a..c7f872c6d 100644
--- a/website/docs/usage/production-use.jade
+++ b/website/docs/usage/production-use.jade
@@ -2,16 +2,12 @@

 include ../../_includes/_mixins

-p
-    | Once you have loaded the #[code nlp] object, you can call it as though
-    | it were a function. This allows you to process a single unicode string.
-
 +h(2, "multithreading") Multi-threading with #[code .pipe()]

 p
     | If you have a sequence of documents to process, you should use the
-    | #[+api("language#pipe") #[code .pipe()]] method. The #[code .pipe()]
-    | method takes an iterator of texts, and accumulates an internal buffer,
+    | #[+api("language#pipe") #[code .pipe()]] method. The method takes an
+    | iterator of texts, and accumulates an internal buffer,
     | which it works on in parallel. It then yields the documents in order,
     | one-by-one. After a long and bitter struggle, the global interpreter
     | lock was freed around spaCy's main parsing loop in v0.100.3. This means
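Since the production-use hunk leans on `.pipe()`, a short usage sketch may help. It is not part of the diff, and the `batch_size` and `n_threads` values are only illustrative settings to tune:

    import spacy

    nlp = spacy.load('en')
    texts = [u'This is the first document.',
             u'And this is the second.',
             u'Here is a third one.']

    # .pipe() buffers the incoming texts and works on the buffer in
    # parallel, yielding the processed Docs one by one, in order
    for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
        print([(token.text, token.pos_) for token in doc])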
diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade
index 413b86477..477db925c 100644
--- a/website/docs/usage/saving-loading.jade
+++ b/website/docs/usage/saving-loading.jade
@@ -209,5 +209,5 @@ p
     | spaCy v2.0 solves this with a clear distinction between setting up
     | the instance and loading the data.

-+code-new nlp = English.from_disk('/path/to/data')
++code-new nlp = English().from_disk('/path/to/data')
 +code-old nlp = spacy.load('en', path='/path/to/data')
diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade
index f8779b52f..47d49ad40 100644
--- a/website/docs/usage/spacy-101.jade
+++ b/website/docs/usage/spacy-101.jade
@@ -81,6 +81,12 @@ p
     nlp = spacy.load('en')
     doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

+p
+    | Even though a #[code Doc] is processed – e.g. split into individual words
+    | and annotated – it still holds #[strong all information of the original text],
+    | like whitespace characters. This way, you'll never lose any information
+    | when processing text with spaCy.
+
 +h(3, "annotations-token") Tokenization

 include _spacy-101/_tokenization
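The paragraph added to spacy-101.jade above – that a processed `Doc` still holds all information of the original text – can be checked directly. A small sketch, assuming the English model:

    import spacy

    nlp = spacy.load('en')
    text = u'Apple is looking at buying U.K. startup for $1 billion'
    doc = nlp(text)

    # the Doc retains the original string, whitespace included
    assert doc.text == text
    # each token records its trailing whitespace, so the input is
    # reconstructable from the tokens alone
    assert ''.join(token.text_with_ws for token in doc) == text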
diff --git a/website/docs/usage/visualizers.jade b/website/docs/usage/visualizers.jade
index 385fa0fd0..90a343700 100644
--- a/website/docs/usage/visualizers.jade
+++ b/website/docs/usage/visualizers.jade
@@ -180,8 +180,8 @@ p
 p
     | If you don't need the web server and just want to generate the markup
     | – for example, to export it to a file or serve it in a custom
-    | way – you can use #[+api("displacy#render") #[code displacy.render]]
-    | instead. It works the same, but returns a string containing the markup.
+    | way – you can use #[+api("displacy#render") #[code displacy.render]].
+    | It works the same way, but returns a string containing the markup.

 +code("Example").
     import spacy
@@ -220,10 +220,32 @@ p
     | a standalone graphic.) So instead of rendering all #[code Doc]s at once,
     | loop over them and export them separately.
+
++h(3, "examples-export-svg") Example: Export SVG graphics of dependency parses
+
++code("Example").
+    import spacy
+    from spacy import displacy
+    from pathlib import Path
+
+    nlp = spacy.load('en')
+    sentences = ["This is an example.", "This is another one."]
+    for sent in sentences:
+        doc = nlp(sent)
+        svg = displacy.render(doc, style='dep')
+        file_name = '-'.join([w.text for w in doc if not w.is_punct]) + '.svg'
+        output_path = Path('/images/' + file_name)
+        output_path.open('w', encoding='utf-8').write(svg)
+
+p
+    | The above code will generate the dependency visualizations and write them
+    | to two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg].
+
+

 +h(2, "jupyter") Using displaCy in Jupyter notebooks

 p
-    | displaCy is able to detect whether you're within a
+    | displaCy is able to detect whether you're working in a
     | #[+a("https://jupyter.org") Jupyter] notebook, and will return markup
     | that can be rendered in a cell straight away. When you export your
     | notebook, the visualizations will be included as HTML.
@@ -257,28 +279,6 @@ p
     html = displacy.render(doc, style='dep')
     return display(HTML(html))

-+h(2, "examples") Usage examples
-
-+h(3, "examples-export-svg") Export SVG graphics of dependency parses
-
-+code("Example").
-    import spacy
-    from spacy import displacy
-    from pathlib import Path
-
-    nlp = spacy.load('en')
-    sentences = ["This is an example.", "This is another one."]
-    for sent in sentences:
-        doc = nlp(sentence)
-        svg = displacy.render(doc, style='dep')
-        file_name = '-'.join([w.text for w in doc if not w.is_punct]) + '.svg'
-        output_path = Path('/images/' + file_name)
-        output_path.open('w', encoding='utf-8').write(svg)
-
-p
-    | The above code will generate the dependency visualizations and them to
-    | two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg].
-
 +h(2, "manual-usage") Rendering data manually

 p
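The final hunk cuts off just as the "Rendering data manually" section begins. For orientation, displaCy's manual mode takes pre-formatted dicts instead of #[code Doc] objects. The snippet below is a sketch of that input format for the entity style; the text and character offsets are made up for illustration:

    from spacy import displacy

    # one dict per 'document'; 'ents' lists character offsets and labels
    ex = [{'text': 'But Google is starting from behind.',
           'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],
           'title': None}]
    html = displacy.render(ex, style='ent', manual=True)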