From 614ca6fb41a84237e765f4b1ee3c4ef4d0ef5acc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 18 Dec 2016 17:42:10 +0100 Subject: [PATCH] Split annotation specs into files to they can be included in different places --- .../docs/api/_annotation/_named-entities.jade | 73 ++++++++++ website/docs/api/_annotation/_pos-tags.jade | 136 ++++++++++++++++++ website/docs/api/annotation.jade | 79 +--------- website/docs/usage/entity-recognition.jade | 74 +--------- 4 files changed, 212 insertions(+), 150 deletions(-) create mode 100644 website/docs/api/_annotation/_named-entities.jade create mode 100644 website/docs/api/_annotation/_pos-tags.jade diff --git a/website/docs/api/_annotation/_named-entities.jade b/website/docs/api/_annotation/_named-entities.jade new file mode 100644 index 000000000..476659d4a --- /dev/null +++ b/website/docs/api/_annotation/_named-entities.jade @@ -0,0 +1,73 @@ +//- 💫 DOCS > API > ANNOTATION > NAMED ENTITIES + ++table([ "Type", "Description" ]) + +row + +cell #[code PERSON] + +cell People, including fictional. + + +row + +cell #[code NORP] + +cell Nationalities or religious or political groups. + + +row + +cell #[code FACILITY] + +cell Buildings, airports, highways, bridges, etc. + + +row + +cell #[code ORG] + +cell Companies, agencies, institutions, etc. + + +row + +cell #[code GPE] + +cell Countries, cities, states. + + +row + +cell #[code LOC] + +cell Non-GPE locations, mountain ranges, bodies of water. + + +row + +cell #[code PRODUCT] + +cell Objects, vehicles, foods, etc. (Not services.) + + +row + +cell #[code EVENT] + +cell Named hurricanes, battles, wars, sports events, etc. + + +row + +cell #[code WORK_OF_ART] + +cell Titles of books, songs, etc. + + +row + +cell #[code LANGUAGE] + +cell Any named language. + +p The following values are also annotated in a style similar to names: + ++table([ "Type", "Description" ]) + +row + +cell #[code DATE] + +cell Absolute or relative dates or periods. 
+ + +row + +cell #[code TIME] + +cell Times smaller than a day. + + +row + +cell #[code PERCENT] + +cell Percentage, including "%". + + +row + +cell #[code MONEY] + +cell Monetary values, including unit. + + +row + +cell #[code QUANTITY] + +cell Measurements, as of weight or distance. + + +row + +cell #[code ORDINAL] + +cell "first", "second", etc. + + +row + +cell #[code CARDINAL] + +cell Numerals that do not fall under another type. diff --git a/website/docs/api/_annotation/_pos-tags.jade b/website/docs/api/_annotation/_pos-tags.jade new file mode 100644 index 000000000..d3f561c3f --- /dev/null +++ b/website/docs/api/_annotation/_pos-tags.jade @@ -0,0 +1,136 @@ +//- 💫 DOCS > API > ANNOTATION > POS TAGS + +mixin pos-row(...row) + +row + each cell in row + +cell + each item in cell.split(" ") + if item + | #[code=item] + +p + | The part-of-speech tagger uses the + | #[+a("https://catalog.ldc.upenn.edu/LDC2013T19") OntoNotes 5] version of + | the Penn Treebank tag set. We also map the tags to the simpler Google + | Universal POS tag set. 
+ ++h(3, "pos-tagging-english") English part-of-speech tag scheme + ++table(["Tag", "POS", "Morphology"]) + +pos-row("-LRB-", "PUNCT", "PunctType=brck PunctSide=ini") + +pos-row("-RRB-", "PUNCT", "PunctType=brck PunctSide=fin") + +pos-row(",", "PUNCT", "PunctType=comm") + +pos-row(":", "PUNCT", "") + +pos-row(".", "PUNCT", "PunctType=peri") + +pos-row("''", "PUNCT", "PunctType=quot PunctSide=fin") + +pos-row("\"\"", "PUNCT", "PunctType=quot PunctSide=fin") + +pos-row("#", "SYM", "SymType=numbersign") + +pos-row("``", "PUNCT", "PunctType=quot PunctSide=ini") + +pos-row("$", "SYM", "SymType=currency") + +pos-row("ADD", "X", "") + +pos-row("AFX", "ADJ", "Hyph=yes") + +pos-row("BES", "VERB", "") + +pos-row("CC", "CONJ", "ConjType=coor") + +pos-row("CD", "NUM", "NumType=card") + +pos-row("DT", "DET", "") + +pos-row("EX", "ADV", "AdvType=ex") + +pos-row("FW", "X", "Foreign=yes") + +pos-row("GW", "X", "") + +pos-row("HVS", "VERB", "") + +pos-row("HYPH", "PUNCT", "PunctType=dash") + +pos-row("IN", "ADP", "") + +pos-row("JJ", "ADJ", "Degree=pos") + +pos-row("JJR", "ADJ", "Degree=comp") + +pos-row("JJS", "ADJ", "Degree=sup") + +pos-row("LS", "PUNCT", "NumType=ord") + +pos-row("MD", "VERB", "VerbType=mod") + +pos-row("NFP", "PUNCT", "") + +pos-row("NIL", "", "") + +pos-row("NN", "NOUN", "Number=sing") + +pos-row("NNP", "PROPN", "NounType=prop Number=sing") + +pos-row("NNPS", "PROPN", "NounType=prop Number=plur") + +pos-row("NNS", "NOUN", "Number=plur") + +pos-row("PDT", "ADJ", "AdjType=pdt PronType=prn") + +pos-row("POS", "PART", "Poss=yes") + +pos-row("PRP", "PRON", "PronType=prs") + +pos-row("PRP$", "ADJ", "PronType=prs Poss=yes") + +pos-row("RB", "ADV", "Degree=pos") + +pos-row("RBR", "ADV", "Degree=comp") + +pos-row("RBS", "ADV", "Degree=sup") + +pos-row("RP", "PART", "") + +pos-row("SP", "SPACE", "") + +pos-row("SYM", "SYM", "") + +pos-row("TO", "PART", "PartType=inf VerbForm=inf") + +pos-row("UH", "INTJ", "") + +pos-row("VB", "VERB", "VerbForm=inf") + +pos-row("VBD", 
"VERB", "VerbForm=fin Tense=past") + +pos-row("VBG", "VERB", "VerbForm=part Tense=pres Aspect=prog") + +pos-row("VBN", "VERB", "VerbForm=part Tense=past Aspect=perf") + +pos-row("VBP", "VERB", "VerbForm=fin Tense=pres") + +pos-row("VBZ", "VERB", "VerbForm=fin Tense=pres Number=sing Person=3") + +pos-row("WDT", "ADJ", "PronType=int|rel") + +pos-row("WP", "NOUN", "PronType=int|rel") + +pos-row("WP$", "ADJ", "Poss=yes PronType=int|rel") + +pos-row("WRB", "ADV", "PronType=int|rel") + +pos-row("XX", "X", "") + ++h(3, "pos-tagging-german") German part-of-speech tag scheme + ++table(["Tag", "POS", "Morphology"]) + +pos-row("$(", "PUNCT", "PunctType=brck") + +pos-row("$,", "PUNCT", "PunctType=comm") + +pos-row("$.", "PUNCT", "PunctType=peri") + +pos-row("ADJA", "ADJ", "") + +pos-row("ADJD", "ADJ", "Variant=short") + +pos-row("ADV", "ADV", "") + +pos-row("APPO", "ADP", "AdpType=post") + +pos-row("APPR", "ADP", "AdpType=prep") + +pos-row("APPRART", "ADP", "AdpType=prep PronType=art") + +pos-row("APZR", "ADP", "AdpType=circ") + +pos-row("ART", "DET", "PronType=art") + +pos-row("CARD", "NUM", "NumType=card") + +pos-row("FM", "X", "Foreign=yes") + +pos-row("ITJ", "INTJ", "") + +pos-row("KOKOM", "CONJ", "ConjType=comp") + +pos-row("KON", "CONJ", "") + +pos-row("KOUI", "SCONJ", "") + +pos-row("KOUS", "SCONJ", "") + +pos-row("NE", "PROPN", "") + +pos-row("NNE", "PROPN", "") + +pos-row("NN", "NOUN", "") + +pos-row("PAV", "ADV", "PronType=dem") + +pos-row("PROAV", "ADV", "PronType=dem") + +pos-row("PDAT", "DET", "PronType=dem") + +pos-row("PDS", "PRON", "PronType=dem") + +pos-row("PIAT", "DET", "PronType=ind|neg|tot") + +pos-row("PIDAT", "DET", "AdjType=pdt PronType=ind|neg|tot") + +pos-row("PIS", "PRON", "PronType=ind|neg|tot") + +pos-row("PPER", "PRON", "PronType=prs") + +pos-row("PPOSAT", "DET", "Poss=yes PronType=prs") + +pos-row("PPOSS", "PRON", "PronType=rel") + +pos-row("PRELAT", "DET", "PronType=rel") + +pos-row("PRELS", "PRON", "PronType=rel") + +pos-row("PRF", "PRON", 
"PronType=prs Reflex=yes") + +pos-row("PTKA", "PART", "") + +pos-row("PTKANT", "PART", "PartType=res") + +pos-row("PTKNEG", "PART", "Negative=yes") + +pos-row("PTKVZ", "PART", "PartType=vbp") + +pos-row("PTKZU", "PART", "PartType=inf") + +pos-row("PWAT", "DET", "PronType=int") + +pos-row("PWAV", "ADV", "PronType=int") + +pos-row("PWS", "PRON", "PronType=int") + +pos-row("TRUNC", "X", "Hyph=yes") + +pos-row("VAFIN", "AUX", "Mood=ind VerbForm=fin") + +pos-row("VAIMP", "AUX", "Mood=imp VerbForm=fin") + +pos-row("VAINF", "AUX", "VerbForm=inf") + +pos-row("VAPP", "AUX", "Aspect=perf VerbForm=fin") + +pos-row("VMFIN", "VERB", "Mood=ind VerbForm=fin VerbType=mod") + +pos-row("VMINF", "VERB", "VerbForm=fin VerbType=mod") + +pos-row("VMPP", "VERB", "Aspect=perf VerbForm=part VerbType=mod") + +pos-row("VVFIN", "VERB", "Mood=ind VerbForm=fin") + +pos-row("VVIMP", "VERB", "Mood=imp VerbForm=fin") + +pos-row("VVINF", "VERB", "VerbForm=inf") + +pos-row("VVIZU", "VERB", "VerbForm=inf") + +pos-row("VVPP", "VERB", "Aspect=perf VerbForm=part") + +pos-row("XY", "X", "") + +pos-row("SP", "SPACE", "") diff --git a/website/docs/api/annotation.jade b/website/docs/api/annotation.jade index 1875c3882..de678b472 100644 --- a/website/docs/api/annotation.jade +++ b/website/docs/api/annotation.jade @@ -38,12 +38,7 @@ p +h(2, "pos-tagging") Part-of-speech Tagging -p - | The part-of-speech tagger uses the - | #[+a("https://catalog.ldc.upenn.edu/LDC2013T19") OntoNotes 5] version of - | the Penn Treebank tag set. We also map the tags to the simpler Google - | Universal POS Tag set. See - | #[+src(gh("spaCy", "spacy/tagger.pyx")) tagger.pyx] for details. +include _annotation/_pos-tags +h(2, "lemmatization") Lemmatization @@ -71,74 +66,4 @@ p +h(2, "named-entities") Named Entity Recognition -+table([ "Type", "Description" ]) - +row - +cell #[code PERSON] - +cell People, including fictional. - - +row - +cell #[code NORP] - +cell Nationalities or religious or political groups. 
- - +row - +cell #[code FACILITY] - +cell Buildings, airports, highways, bridges, etc. - - +row - +cell #[code ORG] - +cell Companies, agencies, institutions, etc. - - +row - +cell #[code GPE] - +cell Countries, cities, states. - - +row - +cell #[code LOC] - +cell Non-GPE locations, mountain ranges, bodies of water. - - +row - +cell #[code PRODUCT] - +cell Objects, vehicles, foods, etc. (Not services.) - - +row - +cell #[code EVENT] - +cell Named hurricanes, battles, wars, sports events, etc. - - +row - +cell #[code WORK_OF_ART] - +cell Titles of books, songs, etc. - - +row - +cell #[code LANGUAGE] - +cell Any named language. - -p The following values are also annotated in a style similar to names: - -+table([ "Type", "Description" ]) - +row - +cell #[code DATE] - +cell Absolute or relative dates or periods. - - +row - +cell #[code TIME] - +cell Times smaller than a day. - - +row - +cell #[code PERCENT] - +cell Percentage, including "%". - - +row - +cell #[code MONEY] - +cell Monetary values, including unit. - - +row - +cell #[code QUANTITY] - +cell Measurements, as of weight or distance. - - +row - +cell #[code ORDINAL] - +cell "first", "second", etc. - - +row - +cell #[code CARDINAL] - +cell Numerals that do not fall under another type. +include _annotation/_named-entities diff --git a/website/docs/usage/entity-recognition.jade b/website/docs/usage/entity-recognition.jade index ed29142f4..4b62a290b 100644 --- a/website/docs/usage/entity-recognition.jade +++ b/website/docs/usage/entity-recognition.jade @@ -119,79 +119,7 @@ p +h(2, "entity-types") Built-in entity types -+h(3, "entity-types-named") Named types - -+table([ "Type", "Description" ]) - +row - +cell #[code PERSON] - +cell People, including fictional. - - +row - +cell #[code NORP] - +cell Nationalities or religious or political groups. - - +row - +cell #[code FACILITY] - +cell Buildings, airports, highways, bridges, etc. - - +row - +cell #[code ORG] - +cell Companies, agencies, institutions, etc. 
- - +row - +cell #[code GPE] - +cell Countries, cities, states. - - +row - +cell #[code LOC] - +cell Non-GPE locations, mountain ranges, bodies of water. - - +row - +cell #[code PRODUCT] - +cell Objects, vehicles, foods, etc. (Not services.) - - +row - +cell #[code EVENT] - +cell Named hurricanes, battles, wars, sports events, etc. - - +row - +cell #[code WORK_OF_ART] - +cell Titles of books, songs, etc. - - +row - +cell #[code LANGUAGE] - +cell Any named language - -+h(3, "entity-types-numeric") Numeric types - -+table([ "Type", "Description" ]) - +row - +cell #[code DATE] - +cell Absolute or relative dates or periods. - - +row - +cell #[code TIME] - +cell Times smaller than a day. - - +row - +cell #[code PERCENT] - +cell Percentage, including "%". - - +row - +cell #[code MONEY] - +cell Monetary values, including unit. - - +row - +cell #[code QUANTITY] - +cell Measurements, as of weight or distance. - - +row - +cell #[code ORDINAL] - +cell "first", "second", etc. - - +row - +cell #[code CARDINAL] - +cell Numerals that do not fall under another type. +include ../api/_annotation/_named-entities +aside("Install") | The #[+api("load") spacy.load()] function configures a pipeline that