From 5453821a9f93390c3cefbc4d976aad823594ff7c Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 30 Oct 2017 13:53:49 +0100 Subject: [PATCH] Update NER annotation scheme Add note on training data sources and include coarse-grained Wikipedia scheme --- spacy/glossary.py | 12 +++++- website/api/_annotation/_named-entities.jade | 40 ++++++++++++++++++-- website/usage/_install/_changelog.jade | 2 +- 3 files changed, 48 insertions(+), 6 deletions(-) diff --git a/spacy/glossary.py b/spacy/glossary.py index 78e61f8a7..c17cb7467 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -300,5 +300,15 @@ GLOSSARY = { 'MONEY': 'Monetary values, including unit', 'QUANTITY': 'Measurements, as of weight or distance', 'ORDINAL': '"first", "second", etc.', - 'CARDINAL': 'Numerals that do not fall under another type' + 'CARDINAL': 'Numerals that do not fall under another type', + + + # Named Entity Recognition + # Wikipedia + # http://www.sciencedirect.com/science/article/pii/S0004370212000276 + # https://pdfs.semanticscholar.org/5744/578cc243d92287f47448870bb426c66cc941.pdf + + 'PER': 'Named person or family.', + 'MISC': ('Miscellaneous entities, e.g. events, nationalities, ' + 'products or works of art'), } diff --git a/website/api/_annotation/_named-entities.jade b/website/api/_annotation/_named-entities.jade index 93e705c72..4cc8a707f 100644 --- a/website/api/_annotation/_named-entities.jade +++ b/website/api/_annotation/_named-entities.jade @@ -1,6 +1,11 @@ //- 💫 DOCS > API > ANNOTATION > NAMED ENTITIES -+table([ "Type", "Description" ]) +p + | Models trained on the + | #[+a("https://catalog.ldc.upenn.edu/ldc2013t19") OntoNotes 5] corpus + | support the following entity types: + ++table(["Type", "Description"]) +row +cell #[code PERSON] +cell People, including fictional. @@ -45,9 +50,6 @@ +cell #[code LANGUAGE] +cell Any named language. 
-p The following values are also annotated in a style similar to names: - -+table([ "Type", "Description" ]) +row +cell #[code DATE] +cell Absolute or relative dates or periods. @@ -75,3 +77,33 @@ p The following values are also annotated in a style similar to names: +row +cell #[code CARDINAL] +cell Numerals that do not fall under another type. + ++h(4, "ner-wikipedia-scheme") Wikipedia scheme + +p + | Models trained on the Wikipedia corpus + | (#[+a("http://www.sciencedirect.com/science/article/pii/S0004370212000276") Nothman et al., 2013]) + | use a less fine-grained NER annotation scheme and recognise the + | following entities: + ++table(["Type", "Description"]) + +row + +cell #[code PER] + +cell Named person or family. + + +row + +cell #[code LOC] + +cell + | Name of politically or geographically defined location (cities, + | provinces, countries, international regions, bodies of water, + | mountains). + + +row + +cell #[code ORG] + +cell Named corporate, governmental, or other organizational entity. + + +row + +cell #[code MISC] + +cell + | Miscellaneous entities, e.g. events, nationalities, products or + | works of art. diff --git a/website/usage/_install/_changelog.jade b/website/usage/_install/_changelog.jade index e966b6695..7b802ce63 100644 --- a/website/usage/_install/_changelog.jade +++ b/website/usage/_install/_changelog.jade @@ -3,7 +3,7 @@ +h(2, "changelog") Changelog +button(gh("spacy") + "/releases", false, "secondary", "small").u-float-right.u-nowrap View releases -div(data-tpl="changelog" data-tpl-key="error") +div(data-tpl="changelog" data-tpl-key="error" style="display: none") +infobox | Unable to load changelog from GitHub. Please see the | #[+a(gh("spacy") + "/releases") releases page] instead.