From 10d05c2b9274073da0edac0379e3a42d97816992 Mon Sep 17 00:00:00 2001
From: ines
Date: Sun, 28 May 2017 01:30:12 +0200
Subject: [PATCH] Fix typos, wording and formatting

---
 .../docs/usage/_spacy-101/_similarity.jade  |  2 +-
 .../usage/language-processing-pipeline.jade |  2 +-
 website/docs/usage/spacy-101.jade           | 10 ++-
 website/docs/usage/v2.jade                  | 85 +++++++++----------
 4 files changed, 49 insertions(+), 50 deletions(-)

diff --git a/website/docs/usage/_spacy-101/_similarity.jade b/website/docs/usage/_spacy-101/_similarity.jade
index c99bc9658..6eed1eb7f 100644
--- a/website/docs/usage/_spacy-101/_similarity.jade
+++ b/website/docs/usage/_spacy-101/_similarity.jade
@@ -5,7 +5,7 @@ p
     | #[strong how similar they are]. Predicting similarity is useful for
     | building recommendation systems or flagging duplicates. For example, you
     | can suggest a user content that's similar to what they're currently
-    | looking at, or label a support ticket as a duplicate, if it's very
+    | looking at, or label a support ticket as a duplicate if it's very
     | similar to an already existing one.
 
 p
diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade
index 1392fc2f8..ffad01ead 100644
--- a/website/docs/usage/language-processing-pipeline.jade
+++ b/website/docs/usage/language-processing-pipeline.jade
@@ -144,7 +144,7 @@ p
 +table(["Argument", "Type", "Description"])
     +row
         +cell #[code vocab]
-        +cell #[coce Vocab]
+        +cell #[code Vocab]
         +cell
             | Shared data between components, including strings, morphology,
             | vectors etc.
diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade
index 8b2d0c17e..6a1f780dc 100644
--- a/website/docs/usage/spacy-101.jade
+++ b/website/docs/usage/spacy-101.jade
@@ -65,7 +65,7 @@ p
     | spaCy provides a variety of linguistic annotations to give you insights
     | into a text's grammatical structure. This includes the word types,
     | i.e. the parts of speech, and how the words are related to each other.
-    | For example, if you're analysing text, it makes a #[em huge] difference
+    | For example, if you're analysing text, it makes a huge difference
     | whether a noun is the subject of a sentence, or the object – or whether
     | "google" is used as a verb, or refers to the website or company in a
     | specific context.
@@ -119,9 +119,11 @@ include _spacy-101/_named-entities
 
 +infobox
     | To learn more about entity recognition in spaCy, how to
-    | #[strong add your own entities] to a document and how to train and update
-    | the entity predictions of a model, see the usage guide on
-    | #[+a("/docs/usage/entity-recognition") named entity recognition].
+    | #[strong add your own entities] to a document and how to
+    | #[strong train and update] the entity predictions of a model, see the
+    | usage guides on
+    | #[+a("/docs/usage/entity-recognition") named entity recognition] and
+    | #[+a("/docs/usage/training-ner") training the named entity recognizer].
 
 +h(2, "vectors-similarity") Word vectors and similarity
 +tag-model("vectors")
diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade
index 23b234c43..25aae8706 100644
--- a/website/docs/usage/v2.jade
+++ b/website/docs/usage/v2.jade
@@ -20,19 +20,18 @@ p
     nlp = Language(pipeline=['my_factory', mycomponent])
 
 p
-    | It's now much easier to customise the pipeline with your own components.
-    | Components are functions that receive a #[code Doc] object, modify and
-    | return it. If your component is stateful, you'll want to create a new one
-    | for each pipeline. You can do that by defining and registering a factory
-    | which receives the shared #[code Vocab] object and returns a component.
-
-p
-    | spaCy's default components – the vectorizer, tagger, parser and entity
-    | recognizer, can be added to your pipeline by using their string IDs.
-    | This way, you won't have to worry about finding and implementing them –
-    | to use the default tagger, simply add #[code "tagger"] to the pipeline,
+    | It's now much easier to #[strong customise the pipeline] with your own
+    | components, functions that receive a #[code Doc] object, modify and
+    | return it. If your component is stateful, you can define and register a
+    | factory which receives the shared #[code Vocab] object and returns a
+    |  component. spaCy's default components can be added to your pipeline by
+    | using their string IDs. This way, you won't have to worry about finding
+    | and implementing them – simply add #[code "tagger"] to the pipeline,
     | and spaCy will know what to do.
 
++image
+    include ../../assets/img/docs/pipeline.svg
+
 +infobox
     | #[strong API:] #[+api("language") #[code Language]]
     | #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text]
@@ -96,11 +95,10 @@ p
     | #[code Language] class, or load a model that initialises one. This allows
     | languages to contain more custom data, e.g. lemmatizer lookup tables, or
     | complex regular expressions. The language data has also been tidied up
-    | and simplified. It's now also possible to overwrite the functions that
-    | compute lexical attributes like #[code like_num], and supply
-    | language-specific syntax iterators, e.g. to determine noun chunks. spaCy
-    | now also supports simple lookup-based lemmatization. The data is stored
-    | in a dictionary mapping a string to its lemma.
+    | and simplified. spaCy now also supports simple lookup-based lemmatization.
+
++image
+    include ../../assets/img/docs/language_data.svg
 
 +infobox
     | #[strong API:] #[+api("language") #[code Language]]
@@ -111,13 +109,10 @@ p
 
 +aside-code("Example").
     from spacy.matcher import Matcher
-    from spacy.attrs import LOWER, IS_PUNCT
     matcher = Matcher(nlp.vocab)
-    matcher.add('HelloWorld', None,
-                [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
-                [{LOWER: 'hello'}, {LOWER: 'world'}])
+    matcher.add('HEARTS', None, [{'ORTH': '❤️', 'OP': '+'}])
     assert len(matcher) == 1
-    assert 'HelloWorld' in matcher
+    assert 'HEARTS' in matcher
 
 p
     | Patterns can now be added to the matcher by calling
@@ -157,28 +152,8 @@ p
         +cell #[+api("language#to_disk") #[code Language.to_disk]]
 
     +row
-        +cell #[code Tokenizer.load]
-        +cell
-            | #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]]
-            | #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]]
-
-    +row
-        +cell #[code Tagger.load]
-        +cell
-            | #[+api("tagger#from_disk") #[code Tagger.from_disk]]
-            | #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
-
-    +row
-        +cell #[code DependencyParser.load]
-        +cell
-            | #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
-            | #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
-
-    +row
-        +cell #[code EntityRecognizer.load]
-        +cell
-            | #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
-            | #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
+        +cell #[code Language.create_make_doc]
+        +cell #[+api("language#attributes") #[code Language.tokenizer]]
 
     +row
         +cell
@@ -212,6 +187,28 @@ p
             | #[+api("stringstore#to_disk") #[code StringStore.to_disk]]
             | #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]]
 
+    +row
+        +cell #[code Tokenizer.load]
+        +cell -
+
+    +row
+        +cell #[code Tagger.load]
+        +cell
+            | #[+api("tagger#from_disk") #[code Tagger.from_disk]]
+            | #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
+
+    +row
+        +cell #[code DependencyParser.load]
+        +cell
+            | #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
+            | #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
+
+    +row
+        +cell #[code EntityRecognizer.load]
+        +cell
+            | #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
+            | #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
+
     +row
         +cell #[code Matcher.load]
         +cell -
@@ -232,7 +229,7 @@ p
 
     +row
         +cell #[code Doc.read_bytes]
-        +cell
+        +cell #[+api("binder") #[code Binder]]
 
     +row
         +cell #[code Token.is_ancestor_of]
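
Editor's note, not part of the patch: the rewritten v2.jade paragraph above describes pipeline components and factories in prose only. A minimal sketch of that pattern, assuming the spaCy v2 alpha API current at the time of this commit; the names MyComponent and my_factory are hypothetical, echoing the patch's own nlp = Language(pipeline=['my_factory', mycomponent]) example line, and the Language.factories registration hook shown in the trailing comments is an assumption about the alpha API, not something the patch confirms.

    class MyComponent(object):
        # A stateful component: a callable that receives a Doc, modifies it
        # and returns it, keeping its state on the instance.
        def __init__(self, vocab):
            self.vocab = vocab  # the shared Vocab handed over by the factory

        def __call__(self, doc):
            # modify the Doc in place here, then return it to the pipeline
            return doc

    def my_factory(vocab):
        # The factory receives the shared Vocab and returns a fresh component,
        # so each pipeline gets its own stateful instance.
        return MyComponent(vocab)

    # Assumed registration and wiring in the v2 alpha (hypothetical):
    # Language.factories['my_factory'] = my_factory
    # nlp = Language(pipeline=['my_factory', mycomponent])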
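
Likewise, the matcher aside above shows the new single-pattern add() call but not a full match loop. A short, self-contained usage sketch of the v2 Matcher API it demonstrates: spacy.load('en') assumes an installed English model, the example text is illustrative, and because 'OP': '+' matches one or more tokens, overlapping candidate spans may be returned.

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.load('en')  # assumes the English model is installed
    matcher = Matcher(nlp.vocab)

    # add() takes a match ID, an optional on_match callback and one or more
    # patterns; token attributes are plain string keys in v2
    matcher.add('HEARTS', None, [{'ORTH': '❤️', 'OP': '+'}])

    doc = nlp(u'I ❤️ ❤️ spaCy')
    for match_id, start, end in matcher(doc):
        # match_id is the hash of 'HEARTS'; resolve it via the StringStore
        print(doc.vocab.strings[match_id], doc[start:end].text)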