From 040553ca5920c366259b4ffc6b31547ecf7a254c Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 5 Jun 2017 13:33:01 +0200 Subject: [PATCH 1/4] Update architecture and features table --- .../docs/usage/_spacy-101/_architecture.jade | 45 ++++++++++++++++++- website/docs/usage/spacy-101.jade | 41 +++-------------- 2 files changed, 51 insertions(+), 35 deletions(-) diff --git a/website/docs/usage/_spacy-101/_architecture.jade b/website/docs/usage/_spacy-101/_architecture.jade index 4905171e7..c5a85f0b0 100644 --- a/website/docs/usage/_spacy-101/_architecture.jade +++ b/website/docs/usage/_spacy-101/_architecture.jade @@ -70,14 +70,57 @@ p +cell Map strings to and from hash values. +row - +row +cell #[+api("tokenizer") #[code Tokenizer]] +cell | Segment text, and create #[code Doc] objects with the discovered | segment boundaries. + +row + +cell #[code Lemmatizer] + +cell + | Determine the base forms of words. + +row +cell #[+api("matcher") #[code Matcher]] +cell | Match sequences of tokens, based on pattern rules, similar to | regular expressions. + + ++h(3, "architecture-pipeline") Pipeline components + ++table(["Name", "Description"]) + +row + +cell #[+api("tagger") #[code Tagger]] + +cell Annotate part-of-speech tags on #[code Doc] objects. + + +row + +cell #[+api("dependencyparser") #[code DependencyParser]] + +cell Annotate syntactic dependencies on #[code Doc] objects. + + +row + +cell #[+api("entityrecognizer") #[code EntityRecognizer]] + +cell + | Annotate named entities, e.g. persons or products, on #[code Doc] + | objects. + ++h(3, "architecture-other") Other classes + ++table(["Name", "Description"]) + +row + +cell #[+api("vectors") #[code Vectors]] + +cell Container class for vector data keyed by string. + + +row + +cell #[+api("binder") #[code Binder]] + +cell Container class for serializing collections of #[code Doc] objects. + + +row + +cell #[+api("goldparse") #[code GoldParse]] + +cell Collection for training annotations. + + +row + +cell #[+api("goldcorpus") #[code GoldCorpus]] + +cell + | An annotated corpus, using the JSON file format. Manages + | annotations for tagging, dependency parsing and NER. diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 5b7908651..4c7a8b09d 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -110,6 +110,13 @@ p | between individual tokens, like subject or object. +cell #[+procon("pro")] + +row + +cell #[strong Lemmatization] + +cell + | Assigning the base forms of words. For example, the lemma of + | "was" is "be", and the lemma of "rats" is "rat". + +cell #[+procon("pro")] + +row +cell #[strong Sentence Boundary Detection] (SBD) +cell Finding and segmenting individual sentences. @@ -274,40 +281,6 @@ include _spacy-101/_language-data include _spacy-101/_architecture.jade -+h(3, "architecture-pipeline") Pipeline components - -+table(["Name", "Description"]) - +row - +cell #[+api("tagger") #[code Tagger]] - +cell Annotate part-of-speech tags on #[code Doc] objects. - - +row - +cell #[+api("dependencyparser") #[code DependencyParser]] - +cell Annotate syntactic dependencies on #[code Doc] objects. - - +row - +cell #[+api("entityrecognizer") #[code EntityRecognizer]] - +cell - | Annotate named entities, e.g. persons or products, on #[code Doc] - | objects. - -+h(3, "architecture-other") Other classes - -+table(["Name", "Description"]) - +row - +cell #[+api("binder") #[code Binder]] - +cell Container class for serializing collections of #[code Doc] objects. 
- - +row - +cell #[+api("goldparse") #[code GoldParse]] - +cell Collection for training annotations. - - +row - +cell #[+api("goldcorpus") #[code GoldCorpus]] - +cell - | An annotated corpus, using the JSON file format. Manages - | annotations for tagging, dependency parsing and NER. - +h(2, "community") Community & FAQ p From 9f55c0d4f6d17bdf1dbb76bf991bba180d43621b Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 5 Jun 2017 13:33:11 +0200 Subject: [PATCH 2/4] Add Vectors class --- website/docs/api/_data.json | 7 +++++++ website/docs/api/vectors.jade | 7 +++++++ 2 files changed, 14 insertions(+) create mode 100644 website/docs/api/vectors.jade diff --git a/website/docs/api/_data.json b/website/docs/api/_data.json index 16dd816bd..a2e447dc8 100644 --- a/website/docs/api/_data.json +++ b/website/docs/api/_data.json @@ -24,6 +24,7 @@ "Lexeme": "lexeme", "Vocab": "vocab", "StringStore": "stringstore", + "Vectors": "vectors", "GoldParse": "goldparse", "GoldCorpus": "goldcorpus", "Binder": "binder" @@ -164,6 +165,12 @@ "source": "spacy/tokens/binder.pyx" }, + "vectors": { + "title": "Vectors", + "tag": "class", + "source": "spacy/vectors.pyx" + }, + "annotation": { "title": "Annotation Specifications" } diff --git a/website/docs/api/vectors.jade b/website/docs/api/vectors.jade new file mode 100644 index 000000000..ef9aa2b52 --- /dev/null +++ b/website/docs/api/vectors.jade @@ -0,0 +1,7 @@ +//- 💫 DOCS > API > VECTORS + +include ../../_includes/_mixins + +p A container class for vector data keyed by string. + ++under-construction From fd35d910b8b6b5b1aad7201ec3943d6f64049cc7 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 5 Jun 2017 14:13:38 +0200 Subject: [PATCH 3/4] Update v2 docs and benchmarks --- website/docs/usage/v2.jade | 73 ++++++++++++++++++++++++++------------ 1 file changed, 50 insertions(+), 23 deletions(-) diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade index 2e00a4a16..c68b7ee9c 100644 --- a/website/docs/usage/v2.jade +++ b/website/docs/usage/v2.jade @@ -22,7 +22,7 @@ p | entirely new #[strong deep learning-powered models] for spaCy's tagger, | parser and entity recognizer. The new models are #[strong 20x smaller] | than the linear models that have powered spaCy until now: from 300 MB to - | only 14 MB. + | only 15 MB. 
p | We've also made several usability improvements that are @@ -247,12 +247,12 @@ p | #[code spacy.lang.xx] +row - +cell #[code spacy.orth] - +cell #[code spacy.lang.xx.lex_attrs] + +cell #[code orth] + +cell #[code lang.xx.lex_attrs] +row - +cell #[code cli.model] - +cell - + +cell #[code syntax.syntax_iterators] + +cell #[code lang.xx.syntax_iterators] +row +cell #[code Language.save_to_directory] @@ -266,8 +266,6 @@ p +cell | #[code Vocab.load] | #[code Vocab.load_lexemes] - | #[code Vocab.load_vectors] - | #[code Vocab.load_vectors_from_bin_loc] +cell | #[+api("vocab#from_disk") #[code Vocab.from_disk]] | #[+api("vocab#from_bytes") #[code Vocab.from_bytes]] @@ -275,10 +273,24 @@ p +row +cell | #[code Vocab.dump] + +cell + | #[+api("vocab#to_disk") #[code Vocab.to_disk]]#[br] + | #[+api("vocab#to_bytes") #[code Vocab.to_bytes]] + + +row + +cell + | #[code Vocab.load_vectors] + | #[code Vocab.load_vectors_from_bin_loc] + +cell + | #[+api("vectors#from_disk") #[code Vectors.from_disk]] + | #[+api("vectors#from_bytes") #[code Vectors.from_bytes]] + + +row + +cell | #[code Vocab.dump_vectors] +cell - | #[+api("vocab#to_disk") #[code Vocab.to_disk]] - | #[+api("vocab#to_bytes") #[code Vocab.to_bytes]] + | #[+api("vectors#to_disk") #[code Vectors.to_disk]] + | #[+api("vectors#to_bytes") #[code Vectors.to_bytes]] +row +cell @@ -296,7 +308,9 @@ p +row +cell #[code Tokenizer.load] - +cell - + +cell + | #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]] + | #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]] +row +cell #[code Tagger.load] @@ -342,6 +356,10 @@ p +cell #[code Token.is_ancestor_of] +cell #[+api("token#is_ancestor") #[code Token.is_ancestor]] + +row + +cell #[code cli.model] + +cell - + +h(2, "migrating") Migrating from spaCy 1.x p @@ -466,18 +484,27 @@ p +h(2, "benchmarks") Benchmarks ++under-construction + ++aside("Data sources") + | #[strong Parser, tagger, NER:] #[+a("https://www.gabormelli.com/RKB/OntoNotes_Corpus") OntoNotes 5]#[br] + | #[strong Word vectors:] #[+a("http://commoncrawl.org") Common Crawl]#[br] + +p The evaluation was conducted on raw text with no gold standard information. 
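+
+p
+    | In the table below, #[strong UAS] and #[strong LAS] are the parser's
+    | unlabelled and labelled attachment scores, #[strong NER F] is the
+    | F-score of the entity recognizer, #[strong POS] is the part-of-speech
+    | tagging accuracy and #[strong w/s] is the processing speed in words
+    | per second.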
+ +table(["Model", "Version", "Type", "UAS", "LAS", "NER F", "POS", "w/s"]) - +row - +cell #[code en_core_web_sm] - for cell in ["2.0.0", "neural", "", "", "", "", ""] - +cell=cell + mixin benchmark-row(name, details, values, highlight, style) + +row(style) + +cell #[code=name] + for cell in details + +cell=cell + for cell, i in values + +cell.u-text-right + if highlight && highlight[i] + strong=cell + else + !=cell - +row - +cell #[code es_dep_web_sm] - for cell in ["2.0.0", "neural", "", "", "", "", ""] - +cell=cell - - +row("divider") - +cell #[code en_core_web_sm] - for cell in ["1.1.0", "linear", "", "", "", "", ""] - +cell=cell + +benchmark-row("en_core_web_sm", ["2.0.0", "neural"], ["91.2", "89.2", "82.6", "96.6", "10,300"], [1, 1, 1, 0, 0]) + +benchmark-row("en_core_web_sm", ["1.2.0", "linear"], ["86.6", "83.8", "78.5", "96.6", "25,700"], [0, 0, 0, 0, 1], "divider") + +benchmark-row("en_core_web_md", ["1.2.1", "linear"], ["90.6", "88.5", "81.4", "96.7", "18,800"], [0, 0, 0, 1, 0]) From a3f9745a14479c52d5aa185027a6ac1692a34208 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 5 Jun 2017 15:37:33 +0200 Subject: [PATCH 4/4] Update similarity usage guide and examples --- website/docs/usage/_data.json | 2 +- .../docs/usage/_spacy-101/_similarity.jade | 4 +- .../docs/usage/word-vectors-similarities.jade | 133 ++++++++++++++++-- 3 files changed, 128 insertions(+), 11 deletions(-) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 4d8dbb165..81deeb402 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -11,7 +11,7 @@ "POS tagging": "pos-tagging", "Using the parse": "dependency-parse", "Entity recognition": "entity-recognition", - "Word vectors": "word-vectors-similarities", + "Vectors & similarity": "word-vectors-similarities", "Custom tokenization": "customizing-tokenizer", "Rule-based matching": "rule-based-matching", "Adding languages": "adding-languages", diff --git a/website/docs/usage/_spacy-101/_similarity.jade b/website/docs/usage/_spacy-101/_similarity.jade index 6eed1eb7f..e8ce692f0 100644 --- a/website/docs/usage/_spacy-101/_similarity.jade +++ b/website/docs/usage/_spacy-101/_similarity.jade @@ -29,11 +29,11 @@ p | #[strong #[+procon("con", 16)] similarity:] dissimilar (lower is less similar) +table(["", "dog", "cat", "banana"]) - each cells, label in {"dog": [1.00, 0.80, 0.24], "cat": [0.80, 1.00, 0.28], "banana": [0.24, 0.28, 1.00]} + each cells, label in {"dog": [1, 0.8, 0.24], "cat": [0.8, 1, 0.28], "banana": [0.24, 0.28, 1]} +row +cell.u-text-label.u-color-theme=label for cell in cells - +cell #[code=cell.toFixed(2)] + +cell.u-text-center #[code=cell.toFixed(2)] | #[+procon(cell < 0.5 ? "con" : cell != 1 ? "pro" : "neutral")] p diff --git a/website/docs/usage/word-vectors-similarities.jade b/website/docs/usage/word-vectors-similarities.jade index e5935cfb6..63ed01776 100644 --- a/website/docs/usage/word-vectors-similarities.jade +++ b/website/docs/usage/word-vectors-similarities.jade @@ -8,10 +8,8 @@ p | to train these vectors is the #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec] | family of algorithms. The default | #[+a("/docs/usage/models#available") English model] installs - | 300-dimensional vectors trained on the Common Crawl - | corpus using the #[+a("http://nlp.stanford.edu/projects/glove/") GloVe] - | algorithm. The GloVe common crawl vectors have become a de facto - | standard for practical NLP. + | 300-dimensional vectors trained on the + | #[+a("http://commoncrawl.org") Common Crawl] corpus. 
+aside("Tip: Training a word2vec model") | If you need to train a word2vec model, we recommend the implementation in @@ -23,6 +21,129 @@ p include _spacy-101/_similarity include _spacy-101/_word-vectors ++h(2, "similarity-context") Similarities in context + +p + | Aside from spaCy's built-in word vectors, which were trained on a lot of + | text with a wide vocabulary, the parsing, tagging and NER models also + | rely on vector representations of the #[strong meanings of words in context]. + | As the first component of the + | #[+a("/docs/usage/language-processing-pipeline") processing pipeline], the + | tensorizer encodes a document's internal meaning representations as an + | array of floats, also called a tensor. This allows spaCy to make a + | reasonable guess at a word's meaning, based on its surrounding words. + | Even if a word hasn't been seen before, spaCy will know #[em something] + | about it. Because spaCy uses a 4-layer convolutional network, the + | tensors are sensitive to up to #[strong four words on either side] of a + | word. + +p + | For example, here are three sentences containing the out-of-vocabulary + | word "labrador" in different contexts. + ++code. + doc1 = nlp(u"The labrador barked.") + doc2 = nlp(u"The labrador swam.") + doc3 = nlp(u"the labrador people live in canada.") + + for doc in [doc1, doc2, doc3]: + labrador = doc[1] + dog = nlp(u"dog") + print(labrador.similarity(dog)) + +p + | Even though the model has never seen the word "labrador", it can make a + | fairly accurate prediction of its similarity to "dog" in different + | contexts. + ++table(["Context", "labrador.similarity(dog)"]) + +row + +cell The #[strong labrador] barked. + +cell #[code 0.56] #[+procon("pro")] + + +row + +cell The #[strong labrador] swam. + +cell #[code 0.48] #[+procon("con")] + + +row + +cell the #[strong labrador] people live in canada. + +cell #[code 0.39] #[+procon("con")] + +p + | The same also works for whole documents. Here, the variance of the + | similarities is lower, as all words and their order are taken into + | account. However, the context-specific similarity is often still + | reflected pretty accurately. + ++code. + doc1 = nlp(u"Paris is the largest city in France.") + doc2 = nlp(u"Ljubljana is the capital of Lithuania.") + doc3 = nlp(u"An emu is a large bird.") + + for doc in [doc1, doc2, doc3]: + for other_doc in [doc1, doc2, doc3]: + print(doc.similarity(other_doc)) + +p + | Even though the sentences about Paris and Ljubljana consist of different + | words and entities, they both describe the same concept and are seen as + | more similar than the sentence about emus. In this case, even a misspelled + | version of "Ljubljana" would still produce very similar results. + ++table + - var examples = {"Paris is the largest city in France.": [1, 0.84, 0.65], "Ljubljana is the capital of Lithuania.": [0.84, 1, 0.52], "An emu is a large bird.": [0.65, 0.52, 1]} + - var counter = 0 + + +row + +row + +cell + for _, label in examples + +cell=label + + each cells, label in examples + +row(counter ? null : "divider") + +cell=label + for cell in cells + +cell.u-text-center #[code=cell.toFixed(2)] + | #[+procon(cell < 0.7 ? "con" : cell != 1 ? "pro" : "neutral")] + - counter++ + +p + | Sentences that consist of the same words in different order will likely + | be seen as very similar – but never identical. + ++code. 
+ docs = [nlp(u"dog bites man"), nlp(u"man bites dog"), + nlp(u"man dog bites"), nlp(u"dog man bites")] + + for doc in docs: + for other_doc in docs: + print(doc.similarity(other_doc)) + +p + | Interestingly, "man bites dog" and "man dog bites" are seen as slightly + | more similar than "man bites dog" and "dog bites man". This may be a + | conincidence – or the result of "man" being interpreted as both sentence's + | subject. + ++table + - var examples = {"dog bites man": [1, 0.9, 0.89, 0.92], "man bites dog": [0.9, 1, 0.93, 0.9], "man dog bites": [0.89, 0.93, 1, 0.92], "dog man bites": [0.92, 0.9, 0.92, 1]} + - var counter = 0 + + +row + +row + +cell + for _, label in examples + +cell.u-text-center=label + + each cells, label in examples + +row(counter ? null : "divider") + +cell=label + for cell in cells + +cell.u-text-center #[code=cell.toFixed(2)] + | #[+procon(cell < 0.7 ? "con" : cell != 1 ? "pro" : "neutral")] + - counter++ + +h(2, "custom") Customising word vectors +under-construction @@ -36,7 +157,3 @@ p | behaviours by modifying the #[code doc.user_hooks], | #[code doc.user_span_hooks] and #[code doc.user_token_hooks] | dictionaries. - -+h(2, "similarity") Similarity - -+under-construction