Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-11-05 17:11:24 +01:00
commit bc4dc2da4e
16 changed files with 408 additions and 315 deletions

View File

@ -11,6 +11,23 @@ mixin section(id)
block
//- Accordion (collapsible sections)
title - [string] Section title.
id - [string] Optional section ID for permalinks.
level - [integer] Headline level for section title.
mixin accordion(title, id, level)
section.o-accordion.o-block
+h(level || 4).o-no-block(id=id)
button.o-accordion__button.o-grid.o-grid--vcenter.o-grid--space.js-accordion(aria-expanded="false")=title
svg.o-accordion__icon(width="20" height="20" viewBox="0 0 10 10" aria-hidden="true" focusable="false")
rect.o-accordion__hide(height="8" width="2" y="1" x="4")
rect(height="2" width="8" y="4" x="1")
.o-accordion__content(hidden="")
block
//- Headlines Helper Mixin
level - [integer] 1, 2, 3, 4, or 5

View File

@ -50,7 +50,7 @@ for id in CURRENT_MODELS
+cell
span(data-tpl=id data-tpl-key=field) #[em n/a]
+row(data-tpl=id data-tpl-key="compat-wrapper" style="display: none")
+row(data-tpl=id data-tpl-key="compat-wrapper" hidden="")
+cell
+label Compat #[+help("Latest compatible model version for your spaCy installation").u-color-subtle]
+cell
@ -58,15 +58,15 @@ for id in CURRENT_MODELS
select.o-field__select.u-text-small(data-tpl=id data-tpl-key="compat")
div(data-tpl=id data-tpl-key="compat-versions")  
section(data-tpl=id data-tpl-key="benchmarks" style="display: none")
section(data-tpl=id data-tpl-key="benchmarks" hidden="")
+grid.o-block-small
for keys, label in MODEL_BENCHMARKS
.u-flex-full.u-padding-small(data-tpl=id data-tpl-key=label.toLowerCase() style="display: none")
.u-flex-full.u-padding-small(data-tpl=id data-tpl-key=label.toLowerCase() hidden="")
+table.o-block-small
+row("head")
+head-cell(colspan="2")=(MODEL_META["benchmark_" + label] || label)
for label, field in keys
+row(style="display: none")
+row(hidden="")
+cell.u-nowrap
+label=label
if MODEL_META[field]

View File

@ -41,6 +41,7 @@ if IS_PAGE
https://medium.com/dev-channel/es6-modules-in-chrome-canary-m60-ba588dfb8ab7
- ProgressBar = "new ProgressBar('.js-progress');"
- Accordion = "new Accordion('.js-accordion');"
- Changelog = "new Changelog('" + SOCIAL.github + "', 'spacy');"
- NavHighlighter = "new NavHighlighter('data-section', 'data-nav');"
- GitHubEmbed = "new GitHubEmbed('" + SOCIAL.github + "', 'data-gh-embed');"
@ -57,6 +58,7 @@ if environment == "deploy"
if IS_PAGE
!=NavHighlighter
!=GitHubEmbed
!=Accordion
if HAS_MODELS
!=ModelLoader
if compare_models
@ -74,6 +76,8 @@ else
!=NavHighlighter
| import GitHubEmbed from '/assets/js/github-embed.js';
!=GitHubEmbed
| import Accordion from '/assets/js/accordion.js';
!=Accordion
if HAS_MODELS
| import { ModelLoader } from '/assets/js/models.js';
!=ModelLoader

View File

@ -1,108 +1,112 @@
//- 💫 DOCS > API > ANNOTATION > DEPENDENCY LABELS
+h(3, "dependency-parsing-english") English dependency labels
p
| The English dependency labels use the #[+a("http://www.clearnlp.com") ClearNLP]
| #[+a("http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf") CLEAR Style].
| This section lists the syntactic dependency labels assigned by
| spaCy's #[+a("/models") models]. The individual labels are
| language-specific and depend on the training corpus.
+table(["Label", "Description"])
+dep-row("acomp", "adjectival complement")
+dep-row("advcl", "adverbial clause modifier")
+dep-row("advmod", "adverbial modifier")
+dep-row("agent", "agent")
+dep-row("amod", "adjectival modifier")
+dep-row("appos", "appositional modifier")
+dep-row("attr", "attribute")
+dep-row("aux", "auxiliary")
+dep-row("auxpass", "auxiliary (passive)")
+dep-row("cc", "coordinating conjunction")
+dep-row("ccomp", "clausal complement")
+dep-row("complm", "complementizer")
+dep-row("conj", "conjunct")
+dep-row("cop", "copula")
+dep-row("csubj", "clausal subject")
+dep-row("csubjpass", "clausal subject (passive)")
+dep-row("dep", "unclassified dependent")
+dep-row("det", "determiner")
+dep-row("dobj", "direct object")
+dep-row("expl", "expletive")
+dep-row("hmod", "modifier in hyphenation")
+dep-row("hyph", "hyphen")
+dep-row("infmod", "infinitival modifier")
+dep-row("intj", "interjection")
+dep-row("iobj", "indirect object")
+dep-row("mark", "marker")
+dep-row("meta", "meta modifier")
+dep-row("neg", "negation modifier")
+dep-row("nmod", "modifier of nominal")
+dep-row("nn", "noun compound modifier")
+dep-row("npadvmod", "noun phrase as adverbial modifier")
+dep-row("nsubj", "nominal subject")
+dep-row("nsubjpass", "nominal subject (passive)")
+dep-row("num", "number modifier")
+dep-row("number", "number compound modifier")
+dep-row("oprd", "object predicate")
+dep-row("obj", "object")
+dep-row("obl", "oblique nominal")
+dep-row("parataxis", "parataxis")
+dep-row("partmod", "participal modifier")
+dep-row("pcomp", "complement of preposition")
+dep-row("pobj", "object of preposition")
+dep-row("poss", "possession modifier")
+dep-row("possessive", "possessive modifier")
+dep-row("preconj", "pre-correlative conjunction")
+dep-row("prep", "prepositional modifier")
+dep-row("prt", "particle")
+dep-row("punct", "punctuation")
+dep-row("quantmod", "modifier of quantifier")
+dep-row("rcmod", "relative clause modifier")
+dep-row("root", "root")
+dep-row("xcomp", "open clausal complement")
+accordion("English", "dependency-parsing-english")
p
| The English dependency labels use the
| #[+a("http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf") CLEAR Style]
| by #[+a("http://www.clearnlp.com") ClearNLP].
+h(3, "dependency-parsing-german") German dependency labels
+table(["Label", "Description"])
+dep-row("acomp", "adjectival complement")
+dep-row("advcl", "adverbial clause modifier")
+dep-row("advmod", "adverbial modifier")
+dep-row("agent", "agent")
+dep-row("amod", "adjectival modifier")
+dep-row("appos", "appositional modifier")
+dep-row("attr", "attribute")
+dep-row("aux", "auxiliary")
+dep-row("auxpass", "auxiliary (passive)")
+dep-row("cc", "coordinating conjunction")
+dep-row("ccomp", "clausal complement")
+dep-row("complm", "complementizer")
+dep-row("conj", "conjunct")
+dep-row("cop", "copula")
+dep-row("csubj", "clausal subject")
+dep-row("csubjpass", "clausal subject (passive)")
+dep-row("dep", "unclassified dependent")
+dep-row("det", "determiner")
+dep-row("dobj", "direct object")
+dep-row("expl", "expletive")
+dep-row("hmod", "modifier in hyphenation")
+dep-row("hyph", "hyphen")
+dep-row("infmod", "infinitival modifier")
+dep-row("intj", "interjection")
+dep-row("iobj", "indirect object")
+dep-row("mark", "marker")
+dep-row("meta", "meta modifier")
+dep-row("neg", "negation modifier")
+dep-row("nmod", "modifier of nominal")
+dep-row("nn", "noun compound modifier")
+dep-row("npadvmod", "noun phrase as adverbial modifier")
+dep-row("nsubj", "nominal subject")
+dep-row("nsubjpass", "nominal subject (passive)")
+dep-row("num", "number modifier")
+dep-row("number", "number compound modifier")
+dep-row("oprd", "object predicate")
+dep-row("obj", "object")
+dep-row("obl", "oblique nominal")
+dep-row("parataxis", "parataxis")
+dep-row("partmod", "participal modifier")
+dep-row("pcomp", "complement of preposition")
+dep-row("pobj", "object of preposition")
+dep-row("poss", "possession modifier")
+dep-row("possessive", "possessive modifier")
+dep-row("preconj", "pre-correlative conjunction")
+dep-row("prep", "prepositional modifier")
+dep-row("prt", "particle")
+dep-row("punct", "punctuation")
+dep-row("quantmod", "modifier of quantifier")
+dep-row("rcmod", "relative clause modifier")
+dep-row("root", "root")
+dep-row("xcomp", "open clausal complement")
p
| The German dependency labels use the
| #[+a("http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/index.html") TIGER Treebank]
| annotation scheme.
+accordion("German", "dependency-parsing-german")
p
| The German dependency labels use the
| #[+a("http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/index.html") TIGER Treebank]
| annotation scheme.
+table(["Label", "Description"])
+dep-row("ac", "adpositional case marker")
+dep-row("adc", "adjective component")
+dep-row("ag", "genitive attribute")
+dep-row("ams", "measure argument of adjective")
+dep-row("app", "apposition")
+dep-row("avc", "adverbial phrase component")
+dep-row("cc", "comparative complement")
+dep-row("cd", "coordinating conjunction")
+dep-row("cj", "conjunct")
+dep-row("cm", "comparative conjunction")
+dep-row("cp", "complementizer")
+dep-row("cvc", "collocational verb construction")
+dep-row("da", "dative")
+dep-row("dh", "discourse-level head")
+dep-row("dm", "discourse marker")
+dep-row("ep", "expletive es")
+dep-row("hd", "head")
+dep-row("ju", "junctor")
+dep-row("mnr", "postnominal modifier")
+dep-row("mo", "modifier")
+dep-row("ng", "negation")
+dep-row("nk", "noun kernel element")
+dep-row("nmc", "numerical component")
+dep-row("oa", "accusative object")
+dep-row("oa", "second accusative object")
+dep-row("oc", "clausal object")
+dep-row("og", "genitive object")
+dep-row("op", "prepositional object")
+dep-row("par", "parenthetical element")
+dep-row("pd", "predicate")
+dep-row("pg", "phrasal genitive")
+dep-row("ph", "placeholder")
+dep-row("pm", "morphological particle")
+dep-row("pnc", "proper noun component")
+dep-row("rc", "relative clause")
+dep-row("re", "repeated element")
+dep-row("rs", "reported speech")
+dep-row("sb", "subject")
+table(["Label", "Description"])
+dep-row("ac", "adpositional case marker")
+dep-row("adc", "adjective component")
+dep-row("ag", "genitive attribute")
+dep-row("ams", "measure argument of adjective")
+dep-row("app", "apposition")
+dep-row("avc", "adverbial phrase component")
+dep-row("cc", "comparative complement")
+dep-row("cd", "coordinating conjunction")
+dep-row("cj", "conjunct")
+dep-row("cm", "comparative conjunction")
+dep-row("cp", "complementizer")
+dep-row("cvc", "collocational verb construction")
+dep-row("da", "dative")
+dep-row("dh", "discourse-level head")
+dep-row("dm", "discourse marker")
+dep-row("ep", "expletive es")
+dep-row("hd", "head")
+dep-row("ju", "junctor")
+dep-row("mnr", "postnominal modifier")
+dep-row("mo", "modifier")
+dep-row("ng", "negation")
+dep-row("nk", "noun kernel element")
+dep-row("nmc", "numerical component")
+dep-row("oa", "accusative object")
+dep-row("oa", "second accusative object")
+dep-row("oc", "clausal object")
+dep-row("og", "genitive object")
+dep-row("op", "prepositional object")
+dep-row("par", "parenthetical element")
+dep-row("pd", "predicate")
+dep-row("pg", "phrasal genitive")
+dep-row("ph", "placeholder")
+dep-row("pm", "morphological particle")
+dep-row("pnc", "proper noun component")
+dep-row("rc", "relative clause")
+dep-row("re", "repeated element")
+dep-row("rs", "reported speech")
+dep-row("sb", "subject")

View File

@ -1,134 +1,138 @@
//- 💫 DOCS > API > ANNOTATION > POS TAGS
+h(3, "pos-tagging-english") English part-of-speech tag scheme
p
| The English part-of-speech tagger uses the
| #[+a("https://catalog.ldc.upenn.edu/LDC2013T19") OntoNotes 5] version of
| the Penn Treebank tag set. We also map the tags to the simpler Google
| Universal POS tag set.
| This section lists the fine-grained and coarse-grained part-of-speech
| tags assigned by spaCy's #[+a("/models") models]. The individual mapping
| is specific to the training corpus and can be defined in the respective
| language data's #[+a("/usage/adding-languages#tag-map") #[code tag_map.py]].
+table(["Tag", "POS", "Morphology", "Description"])
+pos-row("-LRB-", "PUNCT", "PunctType=brck PunctSide=ini", "left round bracket")
+pos-row("-PRB-", "PUNCT", "PunctType=brck PunctSide=fin", "right round bracket")
+pos-row(",", "PUNCT", "PunctType=comm", "punctuation mark, comma")
+pos-row(":", "PUNCT", "", "punctuation mark, colon or ellipsis")
+pos-row(".", "PUNCT", "PunctType=peri", "punctuation mark, sentence closer")
+pos-row("''", "PUNCT", "PunctType=quot PunctSide=fin", "closing quotation mark")
+pos-row("\"\"", "PUNCT", "PunctType=quot PunctSide=fin", "closing quotation mark")
+pos-row("#", "SYM", "SymType=numbersign", "symbol, number sign")
+pos-row("``", "PUNCT", "PunctType=quot PunctSide=ini", "opening quotation mark")
+pos-row("$", "SYM", "SymType=currency", "symbol, currency")
+pos-row("ADD", "X", "", "email")
+pos-row("AFX", "ADJ", "Hyph=yes", "affix")
+pos-row("BES", "VERB", "", 'auxiliary "be"')
+pos-row("CC", "CONJ", "ConjType=coor", "conjunction, coordinating")
+pos-row("CD", "NUM", "NumType=card", "cardinal number")
+pos-row("DT", "DET", "determiner")
+pos-row("EX", "ADV", "AdvType=ex", "existential there")
+pos-row("FW", "X", "Foreign=yes", "foreign word")
+pos-row("GW", "X", "", "additional word in multi-word expression")
+pos-row("HVS", "VERB", "", 'forms of "have"')
+pos-row("HYPH", "PUNCT", "PunctType=dash", "punctuation mark, hyphen")
+pos-row("IN", "ADP", "", "conjunction, subordinating or preposition")
+pos-row("JJ", "ADJ", "Degree=pos", "adjective")
+pos-row("JJR", "ADJ", "Degree=comp", "adjective, comparative")
+pos-row("JJS", "ADJ", "Degree=sup", "adjective, superlative")
+pos-row("LS", "PUNCT", "NumType=ord", "list item marker")
+pos-row("MD", "VERB", "VerbType=mod", "verb, modal auxiliary")
+pos-row("NFP", "PUNCT", "", "superfluous punctuation")
+pos-row("NIL", "", "", "missing tag")
+pos-row("NN", "NOUN", "Number=sing", "noun, singular or mass")
+pos-row("NNP", "PROPN", "NounType=prop Number=sign", "noun, proper singular")
+pos-row("NNPS", "PROPN", "NounType=prop Number=plur", "noun, proper plural")
+pos-row("NNS", "NOUN", "Number=plur", "noun, plural")
+pos-row("PDT", "ADJ", "AdjType=pdt PronType=prn", "predeterminer")
+pos-row("POS", "PART", "Poss=yes", "possessive ending")
+pos-row("PRP", "PRON", "PronType=prs", "pronoun, personal")
+pos-row("PRP$", "ADJ", "PronType=prs Poss=yes", "pronoun, possessive")
+pos-row("RB", "ADV", "Degree=pos", "adverb")
+pos-row("RBR", "ADV", "Degree=comp", "adverb, comparative")
+pos-row("RBS", "ADV", "Degree=sup", "adverb, superlative")
+pos-row("RP", "PART", "", "adverb, particle")
+pos-row("SP", "SPACE", "", "space")
+pos-row("SYM", "SYM", "", "symbol")
+pos-row("TO", "PART", "PartType=inf VerbForm=inf", "infinitival to")
+pos-row("UH", "INTJ", "", "interjection")
+pos-row("VB", "VERB", "VerbForm=inf", "verb, base form")
+pos-row("VBD", "VERB", "VerbForm=fin Tense=past", "verb, past tense")
+pos-row("VBG", "VERB", "VerbForm=part Tense=pres Aspect=prog", "verb, gerund or present participle")
+pos-row("VBN", "VERB", "VerbForm=part Tense=past Aspect=perf", "verb, past participle")
+pos-row("VBP", "VERB", "VerbForm=fin Tense=pres", "verb, non-3rd person singular present")
+pos-row("VBZ", "VERB", "VerbForm=fin Tense=pres Number=sing Person=3", "verb, 3rd person singular present")
+pos-row("WDT", "ADJ", "PronType=int|rel", "wh-determiner")
+pos-row("WP", "NOUN", "PronType=int|rel", "wh-pronoun, personal")
+pos-row("WP$", "ADJ", "Poss=yes PronType=int|rel", "wh-pronoun, possessive")
+pos-row("WRB", "ADV", "PronType=int|rel", "wh-adverb")
+pos-row("XX", "X", "", "unknown")
+accordion("English", "pos-tagging-english")
p
| The English part-of-speech tagger uses the
| #[+a("https://catalog.ldc.upenn.edu/LDC2013T19") OntoNotes 5] version of
| the Penn Treebank tag set. We also map the tags to the simpler Google
| Universal POS tag set.
+h(3, "pos-tagging-german") German part-of-speech tag scheme
+table(["Tag", "POS", "Morphology", "Description"])
+pos-row("-LRB-", "PUNCT", "PunctType=brck PunctSide=ini", "left round bracket")
+pos-row("-PRB-", "PUNCT", "PunctType=brck PunctSide=fin", "right round bracket")
+pos-row(",", "PUNCT", "PunctType=comm", "punctuation mark, comma")
+pos-row(":", "PUNCT", "", "punctuation mark, colon or ellipsis")
+pos-row(".", "PUNCT", "PunctType=peri", "punctuation mark, sentence closer")
+pos-row("''", "PUNCT", "PunctType=quot PunctSide=fin", "closing quotation mark")
+pos-row("\"\"", "PUNCT", "PunctType=quot PunctSide=fin", "closing quotation mark")
+pos-row("#", "SYM", "SymType=numbersign", "symbol, number sign")
+pos-row("``", "PUNCT", "PunctType=quot PunctSide=ini", "opening quotation mark")
+pos-row("$", "SYM", "SymType=currency", "symbol, currency")
+pos-row("ADD", "X", "", "email")
+pos-row("AFX", "ADJ", "Hyph=yes", "affix")
+pos-row("BES", "VERB", "", 'auxiliary "be"')
+pos-row("CC", "CONJ", "ConjType=coor", "conjunction, coordinating")
+pos-row("CD", "NUM", "NumType=card", "cardinal number")
+pos-row("DT", "DET", "determiner")
+pos-row("EX", "ADV", "AdvType=ex", "existential there")
+pos-row("FW", "X", "Foreign=yes", "foreign word")
+pos-row("GW", "X", "", "additional word in multi-word expression")
+pos-row("HVS", "VERB", "", 'forms of "have"')
+pos-row("HYPH", "PUNCT", "PunctType=dash", "punctuation mark, hyphen")
+pos-row("IN", "ADP", "", "conjunction, subordinating or preposition")
+pos-row("JJ", "ADJ", "Degree=pos", "adjective")
+pos-row("JJR", "ADJ", "Degree=comp", "adjective, comparative")
+pos-row("JJS", "ADJ", "Degree=sup", "adjective, superlative")
+pos-row("LS", "PUNCT", "NumType=ord", "list item marker")
+pos-row("MD", "VERB", "VerbType=mod", "verb, modal auxiliary")
+pos-row("NFP", "PUNCT", "", "superfluous punctuation")
+pos-row("NIL", "", "", "missing tag")
+pos-row("NN", "NOUN", "Number=sing", "noun, singular or mass")
+pos-row("NNP", "PROPN", "NounType=prop Number=sign", "noun, proper singular")
+pos-row("NNPS", "PROPN", "NounType=prop Number=plur", "noun, proper plural")
+pos-row("NNS", "NOUN", "Number=plur", "noun, plural")
+pos-row("PDT", "ADJ", "AdjType=pdt PronType=prn", "predeterminer")
+pos-row("POS", "PART", "Poss=yes", "possessive ending")
+pos-row("PRP", "PRON", "PronType=prs", "pronoun, personal")
+pos-row("PRP$", "ADJ", "PronType=prs Poss=yes", "pronoun, possessive")
+pos-row("RB", "ADV", "Degree=pos", "adverb")
+pos-row("RBR", "ADV", "Degree=comp", "adverb, comparative")
+pos-row("RBS", "ADV", "Degree=sup", "adverb, superlative")
+pos-row("RP", "PART", "", "adverb, particle")
+pos-row("SP", "SPACE", "", "space")
+pos-row("SYM", "SYM", "", "symbol")
+pos-row("TO", "PART", "PartType=inf VerbForm=inf", "infinitival to")
+pos-row("UH", "INTJ", "", "interjection")
+pos-row("VB", "VERB", "VerbForm=inf", "verb, base form")
+pos-row("VBD", "VERB", "VerbForm=fin Tense=past", "verb, past tense")
+pos-row("VBG", "VERB", "VerbForm=part Tense=pres Aspect=prog", "verb, gerund or present participle")
+pos-row("VBN", "VERB", "VerbForm=part Tense=past Aspect=perf", "verb, past participle")
+pos-row("VBP", "VERB", "VerbForm=fin Tense=pres", "verb, non-3rd person singular present")
+pos-row("VBZ", "VERB", "VerbForm=fin Tense=pres Number=sing Person=3", "verb, 3rd person singular present")
+pos-row("WDT", "ADJ", "PronType=int|rel", "wh-determiner")
+pos-row("WP", "NOUN", "PronType=int|rel", "wh-pronoun, personal")
+pos-row("WP$", "ADJ", "Poss=yes PronType=int|rel", "wh-pronoun, possessive")
+pos-row("WRB", "ADV", "PronType=int|rel", "wh-adverb")
+pos-row("XX", "X", "", "unknown")
p
| The German part-of-speech tagger uses the
| #[+a("http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/index.html") TIGER Treebank]
| annotation scheme. We also map the tags to the simpler Google
| Universal POS tag set.
+accordion("German", "pos-tagging-german")
p
| The German part-of-speech tagger uses the
| #[+a("http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/index.html") TIGER Treebank]
| annotation scheme. We also map the tags to the simpler Google
| Universal POS tag set.
+table(["Tag", "POS", "Morphology", "Description"])
+pos-row("$(", "PUNCT", "PunctType=brck", "other sentence-internal punctuation mark")
+pos-row("$,", "PUNCT", "PunctType=comm", "comma")
+pos-row("$.", "PUNCT", "PunctType=peri", "sentence-final punctuation mark")
+pos-row("ADJA", "ADJ", "", "adjective, attributive")
+pos-row("ADJD", "ADJ", "Variant=short", "adjective, adverbial or predicative")
+pos-row("ADV", "ADV", "", "adverb")
+pos-row("APPO", "ADP", "AdpType=post", "postposition")
+pos-row("APPR", "ADP", "AdpType=prep", "preposition; circumposition left")
+pos-row("APPRART", "ADP", "AdpType=prep PronType=art", "preposition with article")
+pos-row("APZR", "ADP", "AdpType=circ", "circumposition right")
+pos-row("ART", "DET", "PronType=art", "definite or indefinite article")
+pos-row("CARD", "NUM", "NumType=card", "cardinal number")
+pos-row("FM", "X", "Foreign=yes", "foreign language material")
+pos-row("ITJ", "INTJ", "", "interjection")
+pos-row("KOKOM", "CONJ", "ConjType=comp", "comparative conjunction")
+pos-row("KON", "CONJ", "", "coordinate conjunction")
+pos-row("KOUI", "SCONJ", "", 'subordinate conjunction with "zu" and infinitive')
+pos-row("KOUS", "SCONJ", "", "subordinate conjunction with sentence")
+pos-row("NE", "PROPN", "", "proper noun")
+pos-row("NNE", "PROPN", "", "proper noun")
+pos-row("NN", "NOUN", "", "noun, singular or mass")
+pos-row("PAV", "ADV", "PronType=dem", "pronominal adverb")
+pos-row("PROAV", "ADV", "PronType=dem", "pronominal adverb")
+pos-row("PDAT", "DET", "PronType=dem", "attributive demonstrative pronoun")
+pos-row("PDS", "PRON", "PronType=dem", "substituting demonstrative pronoun")
+pos-row("PIAT", "DET", "PronType=ind|neg|tot", "attributive indefinite pronoun without determiner")
+pos-row("PIDAT", "DET", "AdjType=pdt PronType=ind|neg|tot", "attributive indefinite pronoun with determiner")
+pos-row("PIS", "PRON", "PronType=ind|neg|tot", "substituting indefinite pronoun")
+pos-row("PPER", "PRON", "PronType=prs", "non-reflexive personal pronoun")
+pos-row("PPOSAT", "DET", "Poss=yes PronType=prs", "attributive possessive pronoun")
+pos-row("PPOSS", "PRON", "PronType=rel", "substituting possessive pronoun")
+pos-row("PRELAT", "DET", "PronType=rel", "attributive relative pronoun")
+pos-row("PRELS", "PRON", "PronType=rel", "substituting relative pronoun")
+pos-row("PRF", "PRON", "PronType=prs Reflex=yes", "reflexive personal pronoun")
+pos-row("PTKA", "PART", "", "particle with adjective or adverb")
+pos-row("PTKANT", "PART", "PartType=res", "answer particle")
+pos-row("PTKNEG", "PART", "Negative=yes", "negative particle")
+pos-row("PTKVZ", "PART", "PartType=vbp", "separable verbal particle")
+pos-row("PTKZU", "PART", "PartType=inf", '"zu" before infinitive')
+pos-row("PWAT", "DET", "PronType=int", "attributive interrogative pronoun")
+pos-row("PWAV", "ADV", "PronType=int", "adverbial interrogative or relative pronoun")
+pos-row("PWS", "PRON", "PronType=int", "substituting interrogative pronoun")
+pos-row("TRUNC", "X", "Hyph=yes", "word remnant")
+pos-row("VAFIN", "AUX", "Mood=ind VerbForm=fin", "finite verb, auxiliary")
+pos-row("VAIMP", "AUX", "Mood=imp VerbForm=fin", "imperative, auxiliary")
+pos-row("VAINF", "AUX", "VerbForm=inf", "infinitive, auxiliary")
+pos-row("VAPP", "AUX", "Aspect=perf VerbForm=fin", "perfect participle, auxiliary")
+pos-row("VMFIN", "VERB", "Mood=ind VerbForm=fin VerbType=mod", "finite verb, modal")
+pos-row("VMINF", "VERB", "VerbForm=fin VerbType=mod", "infinitive, modal")
+pos-row("VMPP", "VERB", "Aspect=perf VerbForm=part VerbType=mod", "perfect participle, modal")
+pos-row("VVFIN", "VERB", "Mood=ind VerbForm=fin", "finite verb, full")
+pos-row("VVIMP", "VERB", "Mood=imp VerbForm=fin", "imperative, full")
+pos-row("VVINF", "VERB", "VerbForm=inf", "infinitive, full")
+pos-row("VVIZU", "VERB", "VerbForm=inf", 'infinitive with "zu", full')
+pos-row("VVPP", "VERB", "Aspect=perf VerbForm=part", "perfect participle, full")
+pos-row("XY", "X", "", "non-word containing non-letter")
+pos-row("SP", "SPACE", "", "space")
+table(["Tag", "POS", "Morphology", "Description"])
+pos-row("$(", "PUNCT", "PunctType=brck", "other sentence-internal punctuation mark")
+pos-row("$,", "PUNCT", "PunctType=comm", "comma")
+pos-row("$.", "PUNCT", "PunctType=peri", "sentence-final punctuation mark")
+pos-row("ADJA", "ADJ", "", "adjective, attributive")
+pos-row("ADJD", "ADJ", "Variant=short", "adjective, adverbial or predicative")
+pos-row("ADV", "ADV", "", "adverb")
+pos-row("APPO", "ADP", "AdpType=post", "postposition")
+pos-row("APPR", "ADP", "AdpType=prep", "preposition; circumposition left")
+pos-row("APPRART", "ADP", "AdpType=prep PronType=art", "preposition with article")
+pos-row("APZR", "ADP", "AdpType=circ", "circumposition right")
+pos-row("ART", "DET", "PronType=art", "definite or indefinite article")
+pos-row("CARD", "NUM", "NumType=card", "cardinal number")
+pos-row("FM", "X", "Foreign=yes", "foreign language material")
+pos-row("ITJ", "INTJ", "", "interjection")
+pos-row("KOKOM", "CONJ", "ConjType=comp", "comparative conjunction")
+pos-row("KON", "CONJ", "", "coordinate conjunction")
+pos-row("KOUI", "SCONJ", "", 'subordinate conjunction with "zu" and infinitive')
+pos-row("KOUS", "SCONJ", "", "subordinate conjunction with sentence")
+pos-row("NE", "PROPN", "", "proper noun")
+pos-row("NNE", "PROPN", "", "proper noun")
+pos-row("NN", "NOUN", "", "noun, singular or mass")
+pos-row("PAV", "ADV", "PronType=dem", "pronominal adverb")
+pos-row("PROAV", "ADV", "PronType=dem", "pronominal adverb")
+pos-row("PDAT", "DET", "PronType=dem", "attributive demonstrative pronoun")
+pos-row("PDS", "PRON", "PronType=dem", "substituting demonstrative pronoun")
+pos-row("PIAT", "DET", "PronType=ind|neg|tot", "attributive indefinite pronoun without determiner")
+pos-row("PIDAT", "DET", "AdjType=pdt PronType=ind|neg|tot", "attributive indefinite pronoun with determiner")
+pos-row("PIS", "PRON", "PronType=ind|neg|tot", "substituting indefinite pronoun")
+pos-row("PPER", "PRON", "PronType=prs", "non-reflexive personal pronoun")
+pos-row("PPOSAT", "DET", "Poss=yes PronType=prs", "attributive possessive pronoun")
+pos-row("PPOSS", "PRON", "PronType=rel", "substituting possessive pronoun")
+pos-row("PRELAT", "DET", "PronType=rel", "attributive relative pronoun")
+pos-row("PRELS", "PRON", "PronType=rel", "substituting relative pronoun")
+pos-row("PRF", "PRON", "PronType=prs Reflex=yes", "reflexive personal pronoun")
+pos-row("PTKA", "PART", "", "particle with adjective or adverb")
+pos-row("PTKANT", "PART", "PartType=res", "answer particle")
+pos-row("PTKNEG", "PART", "Negative=yes", "negative particle")
+pos-row("PTKVZ", "PART", "PartType=vbp", "separable verbal particle")
+pos-row("PTKZU", "PART", "PartType=inf", '"zu" before infinitive')
+pos-row("PWAT", "DET", "PronType=int", "attributive interrogative pronoun")
+pos-row("PWAV", "ADV", "PronType=int", "adverbial interrogative or relative pronoun")
+pos-row("PWS", "PRON", "PronType=int", "substituting interrogative pronoun")
+pos-row("TRUNC", "X", "Hyph=yes", "word remnant")
+pos-row("VAFIN", "AUX", "Mood=ind VerbForm=fin", "finite verb, auxiliary")
+pos-row("VAIMP", "AUX", "Mood=imp VerbForm=fin", "imperative, auxiliary")
+pos-row("VAINF", "AUX", "VerbForm=inf", "infinitive, auxiliary")
+pos-row("VAPP", "AUX", "Aspect=perf VerbForm=fin", "perfect participle, auxiliary")
+pos-row("VMFIN", "VERB", "Mood=ind VerbForm=fin VerbType=mod", "finite verb, modal")
+pos-row("VMINF", "VERB", "VerbForm=fin VerbType=mod", "infinitive, modal")
+pos-row("VMPP", "VERB", "Aspect=perf VerbForm=part VerbType=mod", "perfect participle, modal")
+pos-row("VVFIN", "VERB", "Mood=ind VerbForm=fin", "finite verb, full")
+pos-row("VVIMP", "VERB", "Mood=imp VerbForm=fin", "imperative, full")
+pos-row("VVINF", "VERB", "VerbForm=inf", "infinitive, full")
+pos-row("VVIZU", "VERB", "VerbForm=inf", 'infinitive with "zu", full')
+pos-row("VVPP", "VERB", "Aspect=perf VerbForm=part", "perfect participle, full")
+pos-row("XY", "X", "", "non-word containing non-letter")
+pos-row("SP", "SPACE", "", "space")

View File

@ -0,0 +1,55 @@
//- 💫 DOCS > API > ANNOTATION > TEXT PROCESSING
+aside-code("Example").
from spacy.lang.en import English
nlp = English()
tokens = nlp('Some\nspaces and\ttab characters')
tokens_text = [t.text for t in tokens]
assert tokens_text == ['Some', '\n', 'spaces', ' ', 'and',
'\t', 'tab', 'characters']
p
| Tokenization standards are based on the
| #[+a("https://catalog.ldc.upenn.edu/LDC2013T19") OntoNotes 5] corpus.
| The tokenizer differs from most by including
| #[strong tokens for significant whitespace]. Any sequence of
| whitespace characters beyond a single space (#[code ' ']) is included
| as a token. The whitespace tokens are useful for much the same reason
| punctuation is it's often an important delimiter in the text. By
| preserving it in the token output, we are able to maintain a simple
| alignment between the tokens and the original string, and we ensure
| that #[strong no information is lost] during processing.
+h(3, "lemmatization") Lemmatization
+aside("Examples")
| In English, this means:#[br]
| #[strong Adjectives]: happier, happiest → happy#[br]
| #[strong Adverbs]: worse, worst → badly#[br]
| #[strong Nouns]: dogs, children → dog, child#[br]
| #[strong Verbs]: writes, wirting, wrote, written → write
p
| A lemma is the uninflected form of a word. The English lemmatization
| data is taken from #[+a("https://wordnet.princeton.edu") WordNet].
| Lookup tables are taken from
| #[+a("http://www.lexiconista.com/datasets/lemmatization/") Lexiconista].
| spaCy also adds a #[strong special case for pronouns]: all pronouns
| are lemmatized to the special token #[code -PRON-].
+infobox("About spaCy's custom pronoun lemma", "⚠️")
| Unlike verbs and common nouns, there's no clear base form of a personal
| pronoun. Should the lemma of "me" be "I", or should we normalize person
| as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a
| novel symbol, #[code -PRON-], which is used as the lemma for
| all personal pronouns.
+h(3, "sentence-boundary") Sentence boundary detection
p
| Sentence boundaries are calculated from the syntactic parse tree, so
| features such as punctuation and capitalisation play an important but
| non-decisive role in determining the sentence boundaries. Usually this
| means that the sentence boundaries will at least coincide with clause
| boundaries, even given poorly punctuated text.

View File

@ -205,10 +205,8 @@
"title": "Annotation Specifications",
"teaser": "Schemes used for labels, tags and training data.",
"menu": {
"Tokenization": "tokenization",
"Sentence Boundaries": "sbd",
"Text Processing": "text-processing",
"POS Tagging": "pos-tagging",
"Lemmatization": "lemmatization",
"Dependencies": "dependency-parsing",
"Named Entities": "named-entities",
"Models & Training": "training"

View File

@ -2,43 +2,9 @@
include ../_includes/_mixins
p This document describes the target annotations spaCy is trained to predict.
+section("tokenization")
+h(2, "tokenization") Tokenization
p
| Tokenization standards are based on the
| #[+a("https://catalog.ldc.upenn.edu/LDC2013T19") OntoNotes 5] corpus.
| The tokenizer differs from most by including tokens for significant
| whitespace. Any sequence of whitespace characters beyond a single space
| (#[code ' ']) is included as a token.
+aside-code("Example").
from spacy.lang.en import English
nlp = English()
tokens = nlp('Some\nspaces and\ttab characters')
tokens_text = [t.text for t in tokens]
assert tokens_text == ['Some', '\n', 'spaces', ' ', 'and',
'\t', 'tab', 'characters']
p
| The whitespace tokens are useful for much the same reason punctuation is
| it's often an important delimiter in the text. By preserving it in the
| token output, we are able to maintain a simple alignment between the
| tokens and the original string, and we ensure that no information is
| lost during processing.
+section("sbd")
+h(2, "sentence-boundary") Sentence boundary detection
p
| Sentence boundaries are calculated from the syntactic parse tree, so
| features such as punctuation and capitalisation play an important but
| non-decisive role in determining the sentence boundaries. Usually this
| means that the sentence boundaries will at least coincide with clause
| boundaries, even given poorly punctuated text.
+section("text-processing")
+h(2, "text-processing") Text Processing
include _annotation/_text-processing
+section("pos-tagging")
+h(2, "pos-tagging") Part-of-speech Tagging
@ -50,30 +16,6 @@ p This document describes the target annotations spaCy is trained to predict.
include _annotation/_pos-tags
+section("lemmatization")
+h(2, "lemmatization") Lemmatization
p A "lemma" is the uninflected form of a word. In English, this means:
+list
+item #[strong Adjectives]: The form like "happy", not "happier" or "happiest"
+item #[strong Adverbs]: The form like "badly", not "worse" or "worst"
+item #[strong Nouns]: The form like "dog", not "dogs"; like "child", not "children"
+item #[strong Verbs]: The form like "write", not "writes", "writing", "wrote" or "written"
p
| The lemmatization data is taken from
| #[+a("https://wordnet.princeton.edu") WordNet]. However, we also add a
| special case for pronouns: all pronouns are lemmatized to the special
| token #[code -PRON-].
+infobox("About spaCy's custom pronoun lemma")
| Unlike verbs and common nouns, there's no clear base form of a personal
| pronoun. Should the lemma of "me" be "I", or should we normalize person
| as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a
| novel symbol, #[code -PRON-], which is used as the lemma for
| all personal pronouns.
+section("dependency-parsing")
+h(2, "dependency-parsing") Syntactic Dependency Parsing

View File

@ -31,6 +31,9 @@ main > *:not(footer) li a,
main aside a
@extend .u-link
a:focus
outline: 1px dotted $color-theme
//- Selection

View File

@ -74,6 +74,42 @@
border-radius: $border-radius
box-shadow: $box-shadow
//- Accordion
.o-accordion
&:not(:last-child)
margin-bottom: 2rem
.o-accordion__content
margin-top: 3rem
.o-accordion__button
font: inherit
border-radius: $border-radius
width: 100%
padding: 1.5rem 2rem
background: $color-subtle-light
&[aria-expanded="true"]
border-bottom: 3px solid $color-subtle
border-bottom-left-radius: 0
border-bottom-right-radius: 0
.o-accordion__hide
display: none
&:focus:not([aria-expanded="true"])
background: $color-subtle
.o-accordion__icon
@include size(2.5rem)
background: $color-theme
color: $color-back
border-radius: 50%
padding: 0.35rem
pointer-events: none
//- Box
.o-box

View File

@ -0,0 +1,25 @@
'use strict';
import { $$ } from './util.js';
export default class Accordion {
    /**
     * Simple, collapsible accordion sections.
     * Inspired by: https://inclusive-components.design/collapsible-sections/
     * @param {string} selector - Query selector of button element.
     */
    constructor(selector) {
        // $$ is the project's querySelectorAll helper (see ./util.js)
        [...$$(selector)].forEach(btn =>
            btn.addEventListener('click', this.onClick.bind(this)));
    }

    /**
     * Toggle aria-expanded attribute on button and visibility of section.
     * @param {Event} event - Click event; event.target is the accordion button.
     */
    onClick({ target }) {
        // '=== true' already yields a boolean, so the original '|| false'
        // fallback was redundant and has been dropped.
        const expanded = target.getAttribute('aria-expanded') === 'true';
        target.setAttribute('aria-expanded', !expanded);
        // The content panel is the element following the button's heading
        // wrapper (see the accordion mixin markup).
        target.parentElement.nextElementSibling.hidden = expanded;
    }
}

View File

@ -101,9 +101,9 @@ export class ModelLoader {
showError(modelId) {
const tpl = new Templater(modelId);
tpl.get('table').removeAttribute('data-loading');
tpl.get('error').style.display = 'block';
tpl.get('error').hidden = false;
for (let key of ['sources', 'pipeline', 'vecs', 'author', 'license']) {
tpl.get(key).parentElement.parentElement.style.display = 'none';
tpl.get(key).parentElement.parentElement.hidden = true;
}
}
@ -114,13 +114,12 @@ export class ModelLoader {
const modelId = `${data.lang}_${data.name}`;
const model = `${modelId}-${data.version}`;
const tpl = new Templater(modelId);
tpl.get('error').style.display = 'none';
this.renderDetails(tpl, data)
this.renderBenchmarks(tpl, data.accuracy, data.speed);
this.renderCompat(tpl, modelId);
tpl.get('download').setAttribute('href', `${this.repo}/releases/tag/${model}`);
tpl.get('table').removeAttribute('data-loading');
tpl.get('error').style.display = 'none';
tpl.get('error').hidden = true;
}
renderDetails(tpl, { version, size, description, notes, author, url,
@ -133,9 +132,9 @@ export class ModelLoader {
if (license) tpl.fill('license', formats.license(license, this.licenses[license]), true);
if (sources) tpl.fill('sources', formats.sources(sources));
if (vectors) tpl.fill('vecs', formats.vectors(vectors));
else tpl.get('vecs').parentElement.parentElement.style.display = 'none';
else tpl.get('vecs').parentElement.parentElement.hidden = true;
if (pipeline && pipeline.length) tpl.fill('pipeline', formats.pipeline(pipeline), true);
else tpl.get('pipeline').parentElement.parentElement.style.display = 'none';
else tpl.get('pipeline').parentElement.parentElement.hidden = true;
}
renderBenchmarks(tpl, accuracy = {}, speed = {}) {
@ -143,7 +142,7 @@ export class ModelLoader {
this.renderTable(tpl, 'parser', accuracy, val => val.toFixed(2));
this.renderTable(tpl, 'ner', accuracy, val => val.toFixed(2));
this.renderTable(tpl, 'speed', speed, Math.round);
tpl.get('benchmarks').style.display = 'block';
tpl.get('benchmarks').hidden = false;
}
renderTable(tpl, id, benchmarks, converter = val => val) {
@ -151,13 +150,13 @@ export class ModelLoader {
for (let key of Object.keys(this.benchKeys[id])) {
if (benchmarks[key]) tpl
.fill(key, convertNumber(converter(benchmarks[key])))
.parentElement.style.display = 'table-row';
.parentElement.hidden = false;
}
tpl.get(id).style.display = 'block';
tpl.get(id).hidden = false;
}
renderCompat(tpl, modelId) {
tpl.get('compat-wrapper').style.display = 'table-row';
tpl.get('compat-wrapper').hidden = false;
const header = '<option selected disabled>spaCy version</option>';
const options = Object.keys(this.compat)
.map(v => `<option value="${v}">v${v}</option>`)
@ -197,8 +196,8 @@ export class ModelComparer {
this.colors = CHART_COLORS;
this.fonts = CHART_FONTS;
this.defaultModels = defaultModels;
this.tpl.get('result').style.display = 'block';
this.tpl.get('error').style.display = 'none';
this.tpl.get('result').hidden = false;
this.tpl.get('error').hidden = true;
this.fetchCompat()
.then(compat => this.init(compat))
.catch(this.showError.bind(this))
@ -257,8 +256,8 @@ export class ModelComparer {
showError(err) {
console.error(err || 'Error');
this.tpl.get('result').style.display = 'none';
this.tpl.get('error').style.display = 'block';
this.tpl.get('result').hidden = true;
this.tpl.get('error').hidden = false;
}
onSelect(ev) {
@ -301,8 +300,8 @@ export class ModelComparer {
this.chart.update();
[model1, model2].forEach((model, i) => this.renderTable(metaKeys, i + 1, model));
this.tpl.get('result').removeAttribute('data-loading');
this.tpl.get('error').style.display = 'none';
this.tpl.get('result').style.display = 'block';
this.tpl.get('error').hidden = true;
this.tpl.get('result').hidden = false;
}
renderTable(metaKeys, i, { lang, name, version, size, description,

View File

@ -12,6 +12,7 @@ import ProgressBar from './progress.js';
import NavHighlighter from './nav-highlighter.js';
import Changelog from './changelog.js';
import GitHubEmbed from './github-embed.js';
import Accordion from './accordion.js';
import { ModelLoader, ModelComparer } from './models.js';
// Assign to window so they are bundled by rollup
@ -19,5 +20,6 @@ window.ProgressBar = ProgressBar;
window.NavHighlighter = NavHighlighter;
window.Changelog = Changelog;
window.GitHubEmbed = GitHubEmbed;
window.Accordion = Accordion;
window.ModelLoader = ModelLoader;
window.ModelComparer = ModelComparer;

View File

@ -30,7 +30,7 @@ div(data-tpl=TPL data-tpl-key="error")
| overview of the
| #[+a(gh("spacy-models") + "/releases") latest model releases].
div(data-tpl=TPL data-tpl-key="result" style="display: none")
div(data-tpl=TPL data-tpl-key="result" hidden="")
+chart("compare_accuracy", 350)
+aside-code("Download", "text")

View File

@ -181,6 +181,10 @@ p
+annotation-row(["their", "ADJ", "poss", "requests"], style)
+annotation-row(["requests", "NOUN", "dobj", "submit"], style)
+h(3, "dep-scheme") Dependency label scheme
include ../../api/_annotation/_dep-labels
+h(3, "displacy") Visualizing dependencies
p

View File

@ -2,8 +2,6 @@
include ../_spacy-101/_pos-deps
//-+aside("Help spaCy's output is wrong!")
+h(3, "rule-based-morphology") Rule-based morphology
p
@ -70,4 +68,6 @@ p
| list-based exception files, acquired from
| #[+a("https://wordnet.princeton.edu/") WordNet].
+h(3, "pos-scheme") Part-of-speech tag scheme
include ../../api/_annotation/_pos-tags