Split annotation specs into files so they can be included in different places

2025-11-08 20:07:51 +03:00 · 2016-12-18 17:42:10 +01:00 · 2016-12-18 17:42:10 +01:00 · 614ca6fb41
commit 614ca6fb41
parent ac95779a75
4 changed files with 212 additions and 150 deletions
--- a/website/docs/api/_annotation/_named-entities.jade
+++ b/website/docs/api/_annotation/_named-entities.jade
@ -0,0 +1,73 @@
+//- 💫 DOCS > API > ANNOTATION > NAMED ENTITIES
+
++table([ "Type", "Description" ])
+    +row
+        +cell #[code PERSON]
+        +cell People, including fictional.
+
+    +row
+        +cell #[code NORP]
+        +cell Nationalities or religious or political groups.
+
+    +row
+        +cell #[code FACILITY]
+        +cell Buildings, airports, highways, bridges, etc.
+
+    +row
+        +cell #[code ORG]
+        +cell Companies, agencies, institutions, etc.
+
+    +row
+        +cell #[code GPE]
+        +cell Countries, cities, states.
+
+    +row
+        +cell #[code LOC]
+        +cell Non-GPE locations, mountain ranges, bodies of water.
+
+    +row
+        +cell #[code PRODUCT]
+        +cell Objects, vehicles, foods, etc. (Not services.)
+
+    +row
+        +cell #[code EVENT]
+        +cell Named hurricanes, battles, wars, sports events, etc.
+
+    +row
+        +cell #[code WORK_OF_ART]
+        +cell Titles of books, songs, etc.
+
+    +row
+        +cell #[code LANGUAGE]
+        +cell Any named language.
+
+p The following values are also annotated in a style similar to names:
+
++table([ "Type", "Description" ])
+    +row
+        +cell #[code DATE]
+        +cell Absolute or relative dates or periods.
+
+    +row
+        +cell #[code TIME]
+        +cell Times smaller than a day.
+
+    +row
+        +cell #[code PERCENT]
+        +cell Percentage, including "%".
+
+    +row
+        +cell #[code MONEY]
+        +cell Monetary values, including unit.
+
+    +row
+        +cell #[code QUANTITY]
+        +cell Measurements, as of weight or distance.
+
+    +row
+        +cell #[code ORDINAL]
+        +cell "first", "second", etc.
+
+    +row
+        +cell #[code CARDINAL]
+        +cell Numerals that do not fall under another type.
--- a/website/docs/api/_annotation/_pos-tags.jade
+++ b/website/docs/api/_annotation/_pos-tags.jade
@ -0,0 +1,136 @@
+//- 💫 DOCS > API > ANNOTATION > POS TAGS
+
+//- Render one table row from a variable number of cell strings.
+//- Each cell string is split on single spaces and every non-empty piece is
+//- output as its own inline code element; an empty string gives an empty cell.
+mixin pos-row(...row)
+    +row
+        each cell in row
+            +cell
+                each item in cell.split(" ")
+                    if item
+                        |  #[code=item]
+
+p
+    |  The part-of-speech tagger uses the
+    |  #[+a("https://catalog.ldc.upenn.edu/LDC2013T19") OntoNotes 5] version of
+    |  the Penn Treebank tag set. We also map the tags to the simpler Google
+    |  Universal POS tag set.
+
++h(3, "pos-tagging-english") English part-of-speech tag scheme
+
++table(["Tag", "POS", "Morphology"])
+    +pos-row("-LRB-", "PUNCT", "PunctType=brck PunctSide=ini")
+    +pos-row("-RRB-", "PUNCT", "PunctType=brck PunctSide=fin")
+    +pos-row(",", "PUNCT", "PunctType=comm")
+    +pos-row(":", "PUNCT", "")
+    +pos-row(".", "PUNCT", "PunctType=peri")
+    +pos-row("''", "PUNCT", "PunctType=quot PunctSide=fin")
+    +pos-row("\"\"", "PUNCT", "PunctType=quot PunctSide=fin")
+    +pos-row("#", "SYM", "SymType=numbersign")
+    +pos-row("``", "PUNCT", "PunctType=quot PunctSide=ini")
+    +pos-row("$", "SYM", "SymType=currency")
+    +pos-row("ADD", "X", "")
+    +pos-row("AFX", "ADJ", "Hyph=yes")
+    +pos-row("BES", "VERB", "")
+    +pos-row("CC", "CONJ", "ConjType=coor")
+    +pos-row("CD", "NUM", "NumType=card")
+    +pos-row("DT", "DET", "")
+    +pos-row("EX", "ADV", "AdvType=ex")
+    +pos-row("FW", "X", "Foreign=yes")
+    +pos-row("GW", "X", "")
+    +pos-row("HVS", "VERB", "")
+    +pos-row("HYPH", "PUNCT", "PunctType=dash")
+    +pos-row("IN", "ADP", "")
+    +pos-row("JJ", "ADJ", "Degree=pos")
+    +pos-row("JJR", "ADJ", "Degree=comp")
+    +pos-row("JJS", "ADJ", "Degree=sup")
+    +pos-row("LS", "PUNCT", "NumType=ord")
+    +pos-row("MD", "VERB", "VerbType=mod")
+    +pos-row("NFP", "PUNCT", "")
+    +pos-row("NIL", "", "")
+    +pos-row("NN", "NOUN", "Number=sing")
+    +pos-row("NNP", "PROPN", "NounType=prop Number=sing")
+    +pos-row("NNPS", "PROPN", "NounType=prop Number=plur")
+    +pos-row("NNS", "NOUN", "Number=plur")
+    +pos-row("PDT", "ADJ", "AdjType=pdt PronType=prn")
+    +pos-row("POS", "PART", "Poss=yes")
+    +pos-row("PRP", "PRON", "PronType=prs")
+    +pos-row("PRP$", "ADJ", "PronType=prs Poss=yes")
+    +pos-row("RB", "ADV", "Degree=pos")
+    +pos-row("RBR", "ADV", "Degree=comp")
+    +pos-row("RBS", "ADV", "Degree=sup")
+    +pos-row("RP", "PART", "")
+    +pos-row("SP", "SPACE", "")
+    +pos-row("SYM", "SYM", "")
+    +pos-row("TO", "PART", "PartType=inf VerbForm=inf")
+    +pos-row("UH", "INTJ", "")
+    +pos-row("VB", "VERB", "VerbForm=inf")
+    +pos-row("VBD", "VERB", "VerbForm=fin Tense=past")
+    +pos-row("VBG", "VERB", "VerbForm=part Tense=pres Aspect=prog")
+    +pos-row("VBN", "VERB", "VerbForm=part Tense=past Aspect=perf")
+    +pos-row("VBP", "VERB", "VerbForm=fin Tense=pres")
+    +pos-row("VBZ", "VERB", "VerbForm=fin Tense=pres Number=sing Person=3")
+    +pos-row("WDT", "ADJ", "PronType=int|rel")
+    +pos-row("WP", "NOUN", "PronType=int|rel")
+    +pos-row("WP$", "ADJ", "Poss=yes PronType=int|rel")
+    +pos-row("WRB", "ADV", "PronType=int|rel")
+    +pos-row("XX", "X", "")
+
++h(3, "pos-tagging-german") German part-of-speech tag scheme
+
++table(["Tag", "POS", "Morphology"])
+    +pos-row("$(", "PUNCT", "PunctType=brck")
+    +pos-row("$,", "PUNCT", "PunctType=comm")
+    +pos-row("$.", "PUNCT", "PunctType=peri")
+    +pos-row("ADJA", "ADJ", "")
+    +pos-row("ADJD", "ADJ", "Variant=short")
+    +pos-row("ADV", "ADV", "")
+    +pos-row("APPO", "ADP", "AdpType=post")
+    +pos-row("APPR", "ADP", "AdpType=prep")
+    +pos-row("APPRART", "ADP", "AdpType=prep PronType=art")
+    +pos-row("APZR", "ADP", "AdpType=circ")
+    +pos-row("ART", "DET", "PronType=art")
+    +pos-row("CARD", "NUM", "NumType=card")
+    +pos-row("FM", "X", "Foreign=yes")
+    +pos-row("ITJ", "INTJ", "")
+    +pos-row("KOKOM", "CONJ", "ConjType=comp")
+    +pos-row("KON", "CONJ", "")
+    +pos-row("KOUI", "SCONJ", "")
+    +pos-row("KOUS", "SCONJ", "")
+    +pos-row("NE", "PROPN", "")
+    +pos-row("NNE", "PROPN", "")
+    +pos-row("NN", "NOUN", "")
+    +pos-row("PAV", "ADV", "PronType=dem")
+    +pos-row("PROAV", "ADV", "PronType=dem")
+    +pos-row("PDAT", "DET", "PronType=dem")
+    +pos-row("PDS", "PRON", "PronType=dem")
+    +pos-row("PIAT", "DET", "PronType=ind|neg|tot")
+    +pos-row("PIDAT", "DET", "AdjType=pdt PronType=ind|neg|tot")
+    +pos-row("PIS", "PRON", "PronType=ind|neg|tot")
+    +pos-row("PPER", "PRON", "PronType=prs")
+    +pos-row("PPOSAT", "DET", "Poss=yes PronType=prs")
+    +pos-row("PPOSS", "PRON", "PronType=rel")
+    +pos-row("PRELAT", "DET", "PronType=rel")
+    +pos-row("PRELS", "PRON", "PronType=rel")
+    +pos-row("PRF", "PRON", "PronType=prs Reflex=yes")
+    +pos-row("PTKA", "PART", "")
+    +pos-row("PTKANT", "PART", "PartType=res")
+    +pos-row("PTKNEG", "PART", "Negative=yes")
+    +pos-row("PTKVZ", "PART", "PartType=vbp")
+    +pos-row("PTKZU", "PART", "PartType=inf")
+    +pos-row("PWAT", "DET", "PronType=int")
+    +pos-row("PWAV", "ADV", "PronType=int")
+    +pos-row("PWS", "PRON", "PronType=int")
+    +pos-row("TRUNC", "X", "Hyph=yes")
+    +pos-row("VAFIN", "AUX", "Mood=ind VerbForm=fin")
+    +pos-row("VAIMP", "AUX", "Mood=imp VerbForm=fin")
+    +pos-row("VAINF", "AUX", "VerbForm=inf")
+    +pos-row("VAPP", "AUX", "Aspect=perf VerbForm=fin")
+    +pos-row("VMFIN", "VERB", "Mood=ind VerbForm=fin VerbType=mod")
+    +pos-row("VMINF", "VERB", "VerbForm=fin VerbType=mod")
+    +pos-row("VMPP", "VERB", "Aspect=perf VerbForm=part VerbType=mod")
+    +pos-row("VVFIN", "VERB", "Mood=ind VerbForm=fin")
+    +pos-row("VVIMP", "VERB", "Mood=imp VerbForm=fin")
+    +pos-row("VVINF", "VERB", "VerbForm=inf")
+    +pos-row("VVIZU", "VERB", "VerbForm=inf")
+    +pos-row("VVPP", "VERB", "Aspect=perf VerbForm=part")
+    +pos-row("XY", "X", "")
+    +pos-row("SP", "SPACE", "")
--- a/website/docs/api/annotation.jade
+++ b/website/docs/api/annotation.jade
@ -38,12 +38,7 @@ p

 +h(2, "pos-tagging") Part-of-speech Tagging

-p
-    |  The part-of-speech tagger uses the
-    |  #[+a("https://catalog.ldc.upenn.edu/LDC2013T19") OntoNotes 5] version of
-    |  the Penn Treebank tag set. We also map the tags to the simpler Google
-    |  Universal POS Tag set. See
-    |  #[+src(gh("spaCy", "spacy/tagger.pyx")) tagger.pyx] for details.
+include _annotation/_pos-tags

 +h(2, "lemmatization") Lemmatization

@ -71,74 +66,4 @@ p

 +h(2, "named-entities") Named Entity Recognition

-+table([ "Type", "Description" ])
-    +row
-        +cell #[code PERSON]
-        +cell People, including fictional.
-
-    +row
-        +cell #[code NORP]
-        +cell Nationalities or religious or political groups.
-
-    +row
-        +cell #[code FACILITY]
-        +cell Buildings, airports, highways, bridges, etc.
-
-    +row
-        +cell #[code ORG]
-        +cell Companies, agencies, institutions, etc.
-
-    +row
-        +cell #[code GPE]
-        +cell Countries, cities, states.
-
-    +row
-        +cell #[code LOC]
-        +cell Non-GPE locations, mountain ranges, bodies of water.
-
-    +row
-        +cell #[code PRODUCT]
-        +cell Objects, vehicles, foods, etc. (Not services.)
-
-    +row
-        +cell #[code EVENT]
-        +cell Named hurricanes, battles, wars, sports events, etc.
-
-    +row
-        +cell #[code WORK_OF_ART]
-        +cell Titles of books, songs, etc.
-
-    +row
-        +cell #[code LANGUAGE]
-        +cell Any named language.
-
-p The following values are also annotated in a style similar to names:
-
-+table([ "Type", "Description" ])
-    +row
-        +cell #[code DATE]
-        +cell Absolute or relative dates or periods.
-
-    +row
-        +cell #[code TIME]
-        +cell Times smaller than a day.
-
-    +row
-        +cell #[code PERCENT]
-        +cell Percentage, including "%".
-
-    +row
-        +cell #[code MONEY]
-        +cell Monetary values, including unit.
-
-    +row
-        +cell #[code QUANTITY]
-        +cell Measurements, as of weight or distance.
-
-    +row
-        +cell #[code ORDINAL]
-        +cell "first", "second", etc.
-
-    +row
-        +cell #[code CARDINAL]
-        +cell Numerals that do not fall under another type.
+include _annotation/_named-entities
--- a/website/docs/usage/entity-recognition.jade
+++ b/website/docs/usage/entity-recognition.jade
@ -119,79 +119,7 @@ p

 +h(2, "entity-types") Built-in entity types

-+h(3, "entity-types-named") Named types
-
-+table([ "Type", "Description" ])
-    +row
-        +cell #[code PERSON]
-        +cell People, including fictional.
-
-    +row
-        +cell #[code NORP]
-        +cell Nationalities or religious or political groups.
-
-    +row
-        +cell #[code FACILITY]
-        +cell Buildings, airports, highways, bridges, etc.
-
-    +row
-        +cell #[code ORG]
-        +cell Companies, agencies, institutions, etc.
-
-    +row
-        +cell #[code GPE]
-        +cell Countries, cities, states.
-
-    +row
-        +cell #[code LOC]
-        +cell Non-GPE locations, mountain ranges, bodies of water.
-
-    +row
-        +cell #[code PRODUCT]
-        +cell Objects, vehicles, foods, etc. (Not services.)
-
-    +row
-        +cell #[code EVENT]
-        +cell Named hurricanes, battles, wars, sports events, etc.
-
-    +row
-        +cell #[code WORK_OF_ART]
-        +cell Titles of books, songs, etc.
-
-    +row
-        +cell #[code LANGUAGE]
-        +cell Any named language
-
-+h(3, "entity-types-numeric") Numeric types
-
-+table([ "Type", "Description" ])
-    +row
-        +cell #[code DATE]
-        +cell Absolute or relative dates or periods.
-
-    +row
-        +cell #[code TIME]
-        +cell Times smaller than a day.
-
-    +row
-        +cell #[code PERCENT]
-        +cell Percentage, including "%".
-
-    +row
-        +cell #[code MONEY]
-        +cell Monetary values, including unit.
-
-    +row
-        +cell #[code QUANTITY]
-        +cell Measurements, as of weight or distance.
-
-    +row
-        +cell #[code ORDINAL]
-        +cell "first", "second", etc.
-
-    +row
-        +cell #[code CARDINAL]
-        +cell Numerals that do not fall under another type.
+include ../api/_annotation/_named-entities

 +aside("Install")
    |  The #[+api("load") spacy.load()] function configures a pipeline that