diff --git a/examples/training/training-data.json b/examples/training/training-data.json index 532ab4ea8..7737b9a14 100644 --- a/examples/training/training-data.json +++ b/examples/training/training-data.json @@ -1,1103 +1,641 @@ [ - { - "id": 0, - "paragraphs": [ - { - "sentences": [ - { - "tokens": [ - { - "dep": "prep", - "head": 44, - "tag": "IN", - "orth": "In" - }, - { - "dep": "det", - "head": 3, - "tag": "DT", - "orth": "an" - }, - { - "dep": "nn", - "head": 2, - "tag": "NNP", - "orth": "Oct." - }, - { - "dep": "num", - "head": 1, - "tag": "CD", - "orth": "19" - }, - { - "dep": "pobj", - "head": -4, - "tag": "NN", - "orth": "review" - }, - { - "dep": "prep", - "head": -1, - "tag": "IN", - "orth": "of" - }, - { - "dep": "punct", - "head": 2, - "tag": "``", - "orth": "``" - }, - { - "dep": "det", - "head": 1, - "tag": "DT", - "orth": "The" - }, - { - "dep": "pobj", - "head": -3, - "tag": "NN", - "orth": "Misanthrope" - }, - { - "dep": "punct", - "head": -1, - "tag": "''", - "orth": "''" - }, - { - "dep": "prep", - "head": -2, - "tag": "IN", - "orth": "at" - }, - { - "dep": "poss", - "head": 3, - "tag": "NNP", - "orth": "Chicago" - }, - { - "dep": "possessive", - "head": -1, - "tag": "POS", - "orth": "'s" - }, - { - "dep": "nn", - "head": 1, - "tag": "NNP", - "orth": "Goodman" - }, - { - "dep": "pobj", - "head": -4, - "tag": "NNP", - "orth": "Theatre" - }, - { - "dep": "punct", - "head": 4, - "tag": "-LRB-", - "orth": "-LRB-" - }, - { - "dep": "punct", - "head": 3, - "tag": "``", - "orth": "``" - }, - { - "dep": "amod", - "head": 1, - "tag": "VBN", - "orth": "Revitalized" - }, - { - "dep": "nsubj", - "head": 1, - "tag": "NNS", - "orth": "Classics" - }, - { - "dep": "dep", - "head": -15, - "tag": "VBP", - "orth": "Take" - }, - { - "dep": "det", - "head": 1, - "tag": "DT", - "orth": "the" - }, - { - "dep": "dobj", - "head": -2, - "tag": "NN", - "orth": "Stage" - }, - { - "dep": "prep", - "head": -3, - "tag": "IN", - "orth": "in" - }, - { - "dep": "nn", - "head": 
1, - "tag": "NNP", - "orth": "Windy" - }, - { - "dep": "pobj", - "head": -2, - "tag": "NNP", - "orth": "City" - }, - { - "dep": "punct", - "head": -6, - "tag": ",", - "orth": "," - }, - { - "dep": "punct", - "head": -7, - "tag": "''", - "orth": "''" - }, - { - "dep": "dep", - "head": -8, - "tag": "NN", - "orth": "Leisure" - }, - { - "dep": "cc", - "head": -1, - "tag": "CC", - "orth": "&" - }, - { - "dep": "conj", - "head": -2, - "tag": "NNS", - "orth": "Arts" - }, - { - "dep": "punct", - "head": -11, - "tag": "-RRB-", - "orth": "-RRB-" - }, - { - "dep": "punct", - "head": 13, - "tag": ",", - "orth": "," - }, - { - "dep": "det", - "head": 1, - "tag": "DT", - "orth": "the" - }, - { - "dep": "nsubjpass", - "head": 11, - "tag": "NN", - "orth": "role" - }, - { - "dep": "prep", - "head": -1, - "tag": "IN", - "orth": "of" - }, - { - "dep": "pobj", - "head": -1, - "tag": "NNP", - "orth": "Celimene" - }, - { - "dep": "punct", - "head": -3, - "tag": ",", - "orth": "," - }, - { - "dep": "partmod", - "head": -4, - "tag": "VBN", - "orth": "played" - }, - { - "dep": "prep", - "head": -1, - "tag": "IN", - "orth": "by" - }, - { - "dep": "nn", - "head": 1, - "tag": "NNP", - "orth": "Kim" - }, - { - "dep": "pobj", - "head": -2, - "tag": "NNP", - "orth": "Cattrall" - }, - { - "dep": "punct", - "head": -8, - "tag": ",", - "orth": "," - }, - { - "dep": "auxpass", - "head": 2, - "tag": "VBD", - "orth": "was" - }, - { - "dep": "advmod", - "head": 1, - "tag": "RB", - "orth": "mistakenly" - }, - { - "dep": "ROOT", - "head": 0, - "tag": "VBN", - "orth": "attributed" - }, - { - "dep": "prep", - "head": -1, - "tag": "TO", - "orth": "to" - }, - { - "dep": "nn", - "head": 1, - "tag": "NNP", - "orth": "Christina" - }, - { - "dep": "pobj", - "head": -2, - "tag": "NNP", - "orth": "Haag" - }, - { - "dep": "punct", - "head": -4, - "tag": ".", - "orth": "." 
- } - ] - } - ] - } - ] - }, - { - "id": 1, - "paragraphs": [ - { - "sentences": [ - { - "tokens": [ - { - "dep": "nn", - "head": 1, - "tag": "NNP", - "orth": "Ms." - }, - { - "dep": "nsubj", - "head": 1, - "tag": "NNP", - "orth": "Haag" - }, - { - "dep": "ROOT", - "head": 0, - "tag": "VBZ", - "orth": "plays" - }, - { - "dep": "dobj", - "head": -1, - "tag": "NNP", - "orth": "Elianti" - }, - { - "dep": "punct", - "head": -2, - "tag": ".", - "orth": "." - } - ] - } - ] - } - ] - }, - { - "id": 2, - "paragraphs": [ - { - "sentences": [ - { - "tokens": [ - { - "dep": "nn", - "head": 3, - "tag": "NNP", - "orth": "Rolls-Royce" - }, - { - "dep": "nn", - "head": 2, - "tag": "NNP", - "orth": "Motor" - }, - { - "dep": "nn", - "head": 1, - "tag": "NNPS", - "orth": "Cars" - }, - { - "dep": "nsubj", - "head": 1, - "tag": "NNP", - "orth": "Inc." - }, - { - "dep": "ROOT", - "head": 0, - "tag": "VBD", - "orth": "said" - }, - { - "dep": "nsubj", - "head": 1, - "tag": "PRP", - "orth": "it" - }, - { - "dep": "ccomp", - "head": -2, - "tag": "VBZ", - "orth": "expects" - }, - { - "dep": "poss", - "head": 2, - "tag": "PRP$", - "orth": "its" - }, - { - "dep": "nn", - "head": 1, - "tag": "NNP", - "orth": "U.S." - }, - { - "dep": "nsubj", - "head": 3, - "tag": "NNS", - "orth": "sales" - }, - { - "dep": "aux", - "head": 2, - "tag": "TO", - "orth": "to" - }, - { - "dep": "cop", - "head": 1, - "tag": "VB", - "orth": "remain" - }, - { - "dep": "xcomp", - "head": -6, - "tag": "JJ", - "orth": "steady" - }, - { - "dep": "prep", - "head": -1, - "tag": "IN", - "orth": "at" - }, - { - "dep": "quantmod", - "head": 1, - "tag": "IN", - "orth": "about" - }, - { - "dep": "num", - "head": 1, - "tag": "CD", - "orth": "1,200" - }, - { - "dep": "pobj", - "head": -3, - "tag": "NNS", - "orth": "cars" - }, - { - "dep": "prep", - "head": -5, - "tag": "IN", - "orth": "in" - }, - { - "dep": "pobj", - "head": -1, - "tag": "CD", - "orth": "1990" - }, - { - "dep": "punct", - "head": -15, - "tag": ".", - "orth": "." 
- } - ] - } - ] - } - ] - }, - { - "id": 3, - "paragraphs": [ - { - "sentences": [ - { - "tokens": [ - { - "dep": "det", - "head": 3, - "tag": "DT", - "orth": "The" - }, - { - "dep": "nn", - "head": 2, - "tag": "NN", - "orth": "luxury" - }, - { - "dep": "nn", - "head": 1, - "tag": "NN", - "orth": "auto" - }, - { - "dep": "nsubj", - "head": 3, - "tag": "NN", - "orth": "maker" - }, - { - "dep": "amod", - "head": 1, - "tag": "JJ", - "orth": "last" - }, - { - "dep": "tmod", - "head": 1, - "tag": "NN", - "orth": "year" - }, - { - "dep": "ROOT", - "head": 0, - "tag": "VBD", - "orth": "sold" - }, - { - "dep": "num", - "head": 1, - "tag": "CD", - "orth": "1,214" - }, - { - "dep": "dobj", - "head": -2, - "tag": "NNS", - "orth": "cars" - }, - { - "dep": "prep", - "head": -3, - "tag": "IN", - "orth": "in" - }, - { - "dep": "det", - "head": 1, - "tag": "DT", - "orth": "the" - }, - { - "dep": "pobj", - "head": -2, - "tag": "NNP", - "orth": "U.S." - } - ] - } - ] - } - ] - }, - { - "id": 4, - "paragraphs": [ - { - "sentences": [ - { - "tokens": [ - { - "dep": "nn", - "head": 1, - "tag": "NNP", - "orth": "Howard" - }, - { - "dep": "nsubj", - "head": 8, - "tag": "NNP", - "orth": "Mosher" - }, - { - "dep": "punct", - "head": -1, - "tag": ",", - "orth": "," - }, - { - "dep": "appos", - "head": -2, - "tag": "NN", - "orth": "president" - }, - { - "dep": "cc", - "head": -1, - "tag": "CC", - "orth": "and" - }, - { - "dep": "amod", - "head": 2, - "tag": "JJ", - "orth": "chief" - }, - { - "dep": "nn", - "head": 1, - "tag": "NN", - "orth": "executive" - }, - { - "dep": "conj", - "head": -4, - "tag": "NN", - "orth": "officer" - }, - { - "dep": "punct", - "head": -7, - "tag": ",", - "orth": "," - }, - { - "dep": "ROOT", - "head": 0, - "tag": "VBD", - "orth": "said" - }, - { - "dep": "nsubj", - "head": 1, - "tag": "PRP", - "orth": "he" - }, - { - "dep": "ccomp", - "head": -2, - "tag": "VBZ", - "orth": "anticipates" - }, - { - "dep": "dobj", - "head": -1, - "tag": "NN", - "orth": "growth" - }, 
- { - "dep": "prep", - "head": -1, - "tag": "IN", - "orth": "for" - }, - { - "dep": "det", - "head": 3, - "tag": "DT", - "orth": "the" - }, - { - "dep": "nn", - "head": 2, - "tag": "NN", - "orth": "luxury" - }, - { - "dep": "nn", - "head": 1, - "tag": "NN", - "orth": "auto" - }, - { - "dep": "pobj", - "head": -4, - "tag": "NN", - "orth": "maker" - }, - { - "dep": "prep", - "head": -6, - "tag": "IN", - "orth": "in" - }, - { - "dep": "pobj", - "head": -1, - "tag": "NNP", - "orth": "Britain" - }, - { - "dep": "cc", - "head": -1, - "tag": "CC", - "orth": "and" - }, - { - "dep": "conj", - "head": -2, - "tag": "NNP", - "orth": "Europe" - }, - { - "dep": "punct", - "head": -4, - "tag": ",", - "orth": "," - }, - { - "dep": "cc", - "head": -5, - "tag": "CC", - "orth": "and" - }, - { - "dep": "conj", - "head": -6, - "tag": "IN", - "orth": "in" - }, - { - "dep": "amod", - "head": 1, - "tag": "JJ", - "orth": "Far" - }, - { - "dep": "amod", - "head": 1, - "tag": "JJ", - "orth": "Eastern" - }, - { - "dep": "pobj", - "head": -3, - "tag": "NNS", - "orth": "markets" - }, - { - "dep": "punct", - "head": -19, - "tag": ".", - "orth": "." - } - ] - } - ] - } - ] - }, - { - "id": 5, - "paragraphs": [ - { - "sentences": [ - { - "tokens": [ - { - "dep": "nn", - "head": 2, - "tag": "NNP", - "orth": "BELL" - }, - { - "dep": "nn", - "head": 1, - "tag": "NNP", - "orth": "INDUSTRIES" - }, - { - "dep": "nsubj", - "head": 1, - "tag": "NNP", - "orth": "Inc." 
- }, - { - "dep": "ROOT", - "head": 0, - "tag": "VBD", - "orth": "increased" - }, - { - "dep": "poss", - "head": 1, - "tag": "PRP$", - "orth": "its" - }, - { - "dep": "dobj", - "head": -2, - "tag": "NN", - "orth": "quarterly" - }, - { - "dep": "prep", - "head": -3, - "tag": "TO", - "orth": "to" - }, - { - "dep": "num", - "head": 1, - "tag": "CD", - "orth": "10" - }, - { - "dep": "pobj", - "head": -2, - "tag": "NNS", - "orth": "cents" - }, - { - "dep": "prep", - "head": -6, - "tag": "IN", - "orth": "from" - }, - { - "dep": "num", - "head": 1, - "tag": "CD", - "orth": "seven" - }, - { - "dep": "pobj", - "head": -2, - "tag": "NNS", - "orth": "cents" - }, - { - "dep": "det", - "head": 1, - "tag": "DT", - "orth": "a" - }, - { - "dep": "npadvmod", - "head": -2, - "tag": "NN", - "orth": "share" - }, - { - "dep": "punct", - "head": -11, - "tag": ".", - "orth": "." - } - ] - } - ] - } - ] - }, - { - "id": 6, - "paragraphs": [ - { - "sentences": [ - { - "tokens": [ - { - "dep": "det", - "head": 2, - "tag": "DT", - "orth": "The" - }, - { - "dep": "amod", - "head": 1, - "tag": "JJ", - "orth": "new" - }, - { - "dep": "nsubj", - "head": 3, - "tag": "NN", - "orth": "rate" - }, - { - "dep": "aux", - "head": 2, - "tag": "MD", - "orth": "will" - }, - { - "dep": "cop", - "head": 1, - "tag": "VB", - "orth": "be" - }, - { - "dep": "ROOT", - "head": 0, - "tag": "JJ", - "orth": "payable" - }, - { - "dep": "tmod", - "head": -1, - "tag": "NNP", - "orth": "Feb." - }, - { - "dep": "num", - "head": -1, - "tag": "CD", - "orth": "15" - }, - { - "dep": "punct", - "head": -3, - "tag": ".", - "orth": "." 
- } - ] - } - ] - } - ] - }, - { - "id": 7, - "paragraphs": [ - { - "sentences": [ - { - "tokens": [ - { - "dep": "det", - "head": 2, - "tag": "DT", - "orth": "A" - }, - { - "dep": "nn", - "head": 1, - "tag": "NN", - "orth": "record" - }, - { - "dep": "nsubjpass", - "head": 4, - "tag": "NN", - "orth": "date" - }, - { - "dep": "aux", - "head": 3, - "tag": "VBZ", - "orth": "has" - }, - { - "dep": "neg", - "head": 2, - "tag": "RB", - "orth": "n't" - }, - { - "dep": "auxpass", - "head": 1, - "tag": "VBN", - "orth": "been" - }, - { - "dep": "ROOT", - "head": 0, - "tag": "VBN", - "orth": "set" - }, - { - "dep": "punct", - "head": -1, - "tag": ".", - "orth": "." - } - ] - } - ] - } - ] - }, - { - "id": 8, - "paragraphs": [ - { - "sentences": [ - { - "tokens": [ - { - "dep": "nsubj", - "head": 7, - "tag": "NNP", - "orth": "Bell" - }, - { - "dep": "punct", - "head": -1, - "tag": ",", - "orth": "," - }, - { - "dep": "partmod", - "head": -2, - "tag": "VBN", - "orth": "based" - }, - { - "dep": "prep", - "head": -1, - "tag": "IN", - "orth": "in" - }, - { - "dep": "nn", - "head": 1, - "tag": "NNP", - "orth": "Los" - }, - { - "dep": "pobj", - "head": -2, - "tag": "NNP", - "orth": "Angeles" - }, - { - "dep": "punct", - "head": -6, - "tag": ",", - "orth": "," - }, - { - "dep": "ROOT", - "head": 0, - "tag": "VBZ", - "orth": "makes" - }, - { - "dep": "cc", - "head": -1, - "tag": "CC", - "orth": "and" - }, - { - "dep": "conj", - "head": -2, - "tag": "VBZ", - "orth": "distributes" - }, - { - "dep": "amod", - "head": 5, - "tag": "JJ", - "orth": "electronic" - }, - { - "dep": "punct", - "head": -1, - "tag": ",", - "orth": "," - }, - { - "dep": "conj", - "head": -2, - "tag": "NN", - "orth": "computer" - }, - { - "dep": "cc", - "head": -3, - "tag": "CC", - "orth": "and" - }, - { - "dep": "conj", - "head": -4, - "tag": "NN", - "orth": "building" - }, - { - "dep": "dobj", - "head": -8, - "tag": "NNS", - "orth": "products" - }, - { - "dep": "punct", - "head": -9, - "tag": ".", - "orth": "." 
- } - ] - } - ] - } - ] - } -] + { + "id": "wsj_0200", + "paragraphs": [ + { + "raw": "In an Oct. 19 review of \"The Misanthrope\" at Chicago's Goodman Theatre (\"Revitalized Classics Take the Stage in Windy City,\" Leisure & Arts), the role of Celimene, played by Kim Cattrall, was mistakenly attributed to Christina Haag. Ms. Haag plays Elianti.", + "sentences": [ + { + "tokens": [ + { + "head": 44, + "dep": "prep", + "tag": "IN", + "orth": "In", + "ner": "O", + "id": 0 + }, + { + "head": 3, + "dep": "det", + "tag": "DT", + "orth": "an", + "ner": "O", + "id": 1 + }, + { + "head": 2, + "dep": "nmod", + "tag": "NNP", + "orth": "Oct.", + "ner": "B-DATE", + "id": 2 + }, + { + "head": -1, + "dep": "nummod", + "tag": "CD", + "orth": "19", + "ner": "L-DATE", + "id": 3 + }, + { + "head": -4, + "dep": "pobj", + "tag": "NN", + "orth": "review", + "ner": "O", + "id": 4 + }, + { + "head": -1, + "dep": "prep", + "tag": "IN", + "orth": "of", + "ner": "O", + "id": 5 + }, + { + "head": 2, + "dep": "punct", + "tag": "``", + "orth": "``", + "ner": "O", + "id": 6 + }, + { + "head": 1, + "dep": "det", + "tag": "DT", + "orth": "The", + "ner": "B-WORK_OF_ART", + "id": 7 + }, + { + "head": -3, + "dep": "pobj", + "tag": "NN", + "orth": "Misanthrope", + "ner": "L-WORK_OF_ART", + "id": 8 + }, + { + "head": -1, + "dep": "punct", + "tag": "''", + "orth": "''", + "ner": "O", + "id": 9 + }, + { + "head": -2, + "dep": "prep", + "tag": "IN", + "orth": "at", + "ner": "O", + "id": 10 + }, + { + "head": 3, + "dep": "poss", + "tag": "NNP", + "orth": "Chicago", + "ner": "U-GPE", + "id": 11 + }, + { + "head": -1, + "dep": "case", + "tag": "POS", + "orth": "'s", + "ner": "O", + "id": 12 + }, + { + "head": 1, + "dep": "compound", + "tag": "NNP", + "orth": "Goodman", + "ner": "B-FAC", + "id": 13 + }, + { + "head": -4, + "dep": "pobj", + "tag": "NNP", + "orth": "Theatre", + "ner": "L-FAC", + "id": 14 + }, + { + "head": 4, + "dep": "punct", + "tag": "-LRB-", + "orth": "(", + "ner": "O", + "id": 15 + }, + { 
+ "head": 3, + "dep": "punct", + "tag": "``", + "orth": "``", + "ner": "O", + "id": 16 + }, + { + "head": 1, + "dep": "amod", + "tag": "VBN", + "orth": "Revitalized", + "ner": "B-WORK_OF_ART", + "id": 17 + }, + { + "head": 1, + "dep": "nsubj", + "tag": "NNS", + "orth": "Classics", + "ner": "I-WORK_OF_ART", + "id": 18 + }, + { + "head": -15, + "dep": "appos", + "tag": "VBP", + "orth": "Take", + "ner": "I-WORK_OF_ART", + "id": 19 + }, + { + "head": 1, + "dep": "det", + "tag": "DT", + "orth": "the", + "ner": "I-WORK_OF_ART", + "id": 20 + }, + { + "head": -2, + "dep": "dobj", + "tag": "NN", + "orth": "Stage", + "ner": "I-WORK_OF_ART", + "id": 21 + }, + { + "head": -3, + "dep": "prep", + "tag": "IN", + "orth": "in", + "ner": "I-WORK_OF_ART", + "id": 22 + }, + { + "head": 1, + "dep": "compound", + "tag": "NNP", + "orth": "Windy", + "ner": "I-WORK_OF_ART", + "id": 23 + }, + { + "head": -2, + "dep": "pobj", + "tag": "NNP", + "orth": "City", + "ner": "L-WORK_OF_ART", + "id": 24 + }, + { + "head": -6, + "dep": "punct", + "tag": ",", + "orth": ",", + "ner": "O", + "id": 25 + }, + { + "head": -7, + "dep": "punct", + "tag": "''", + "orth": "''", + "ner": "O", + "id": 26 + }, + { + "head": -8, + "dep": "npadvmod", + "tag": "NN", + "orth": "Leisure", + "ner": "B-ORG", + "id": 27 + }, + { + "head": -1, + "dep": "cc", + "tag": "CC", + "orth": "&", + "ner": "I-ORG", + "id": 28 + }, + { + "head": -2, + "dep": "conj", + "tag": "NNS", + "orth": "Arts", + "ner": "L-ORG", + "id": 29 + }, + { + "head": -11, + "dep": "punct", + "tag": "-RRB-", + "orth": ")", + "ner": "O", + "id": 30 + }, + { + "head": 13, + "dep": "punct", + "tag": ",", + "orth": ",", + "ner": "O", + "id": 31 + }, + { + "head": 1, + "dep": "det", + "tag": "DT", + "orth": "the", + "ner": "O", + "id": 32 + }, + { + "head": 11, + "dep": "nsubjpass", + "tag": "NN", + "orth": "role", + "ner": "O", + "id": 33 + }, + { + "head": -1, + "dep": "prep", + "tag": "IN", + "orth": "of", + "ner": "O", + "id": 34 + }, + { + "head": -1, + 
"dep": "pobj", + "tag": "NNP", + "orth": "Celimene", + "ner": "U-PERSON", + "id": 35 + }, + { + "head": -3, + "dep": "punct", + "tag": ",", + "orth": ",", + "ner": "O", + "id": 36 + }, + { + "head": -4, + "dep": "acl", + "tag": "VBN", + "orth": "played", + "ner": "O", + "id": 37 + }, + { + "head": -1, + "dep": "agent", + "tag": "IN", + "orth": "by", + "ner": "O", + "id": 38 + }, + { + "head": 1, + "dep": "compound", + "tag": "NNP", + "orth": "Kim", + "ner": "B-PERSON", + "id": 39 + }, + { + "head": -2, + "dep": "pobj", + "tag": "NNP", + "orth": "Cattrall", + "ner": "L-PERSON", + "id": 40 + }, + { + "head": -8, + "dep": "punct", + "tag": ",", + "orth": ",", + "ner": "O", + "id": 41 + }, + { + "head": 2, + "dep": "auxpass", + "tag": "VBD", + "orth": "was", + "ner": "O", + "id": 42 + }, + { + "head": 1, + "dep": "advmod", + "tag": "RB", + "orth": "mistakenly", + "ner": "O", + "id": 43 + }, + { + "head": 0, + "dep": "root", + "tag": "VBN", + "orth": "attributed", + "ner": "O", + "id": 44 + }, + { + "head": -1, + "dep": "prep", + "tag": "IN", + "orth": "to", + "ner": "O", + "id": 45 + }, + { + "head": 1, + "dep": "compound", + "tag": "NNP", + "orth": "Christina", + "ner": "B-PERSON", + "id": 46 + }, + { + "head": -2, + "dep": "pobj", + "tag": "NNP", + "orth": "Haag", + "ner": "L-PERSON", + "id": 47 + }, + { + "head": -4, + "dep": "punct", + "tag": ".", + "orth": ".", + "ner": "O", + "id": 48 + } + ], + "brackets": [ + { + "first": 2, + "last": 3, + "label": "NML" + }, + { + "first": 1, + "last": 4, + "label": "NP" + }, + { + "first": 7, + "last": 8, + "label": "NP-TTL" + }, + { + "first": 11, + "last": 12, + "label": "NP" + }, + { + "first": 11, + "last": 14, + "label": "NP" + }, + { + "first": 10, + "last": 14, + "label": "PP-LOC" + }, + { + "first": 6, + "last": 14, + "label": "NP" + }, + { + "first": 5, + "last": 14, + "label": "PP" + }, + { + "first": 1, + "last": 14, + "label": "NP" + }, + { + "first": 17, + "last": 18, + "label": "NP-SBJ" + }, + { + "first": 20, + 
"last": 21, + "label": "NP" + }, + { + "first": 23, + "last": 24, + "label": "NP" + }, + { + "first": 22, + "last": 24, + "label": "PP-LOC" + }, + { + "first": 19, + "last": 24, + "label": "VP" + }, + { + "first": 17, + "last": 24, + "label": "S-HLN" + }, + { + "first": 27, + "last": 29, + "label": "NP-TMP" + }, + { + "first": 15, + "last": 30, + "label": "NP" + }, + { + "first": 1, + "last": 30, + "label": "NP" + }, + { + "first": 0, + "last": 30, + "label": "PP-LOC" + }, + { + "first": 32, + "last": 33, + "label": "NP" + }, + { + "first": 35, + "last": 35, + "label": "NP" + }, + { + "first": 34, + "last": 35, + "label": "PP" + }, + { + "first": 32, + "last": 35, + "label": "NP" + }, + { + "first": 39, + "last": 40, + "label": "NP-LGS" + }, + { + "first": 38, + "last": 40, + "label": "PP" + }, + { + "first": 37, + "last": 40, + "label": "VP" + }, + { + "first": 32, + "last": 41, + "label": "NP-SBJ-2" + }, + { + "first": 43, + "last": 43, + "label": "ADVP-MNR" + }, + { + "first": 46, + "last": 47, + "label": "NP" + }, + { + "first": 45, + "last": 47, + "label": "PP-CLR" + }, + { + "first": 44, + "last": 47, + "label": "VP" + }, + { + "first": 42, + "last": 47, + "label": "VP" + }, + { + "first": 0, + "last": 48, + "label": "S" + } + ] + }, + { + "tokens": [ + { + "head": 1, + "dep": "compound", + "tag": "NNP", + "orth": "Ms.", + "ner": "O", + "id": 0 + }, + { + "head": 1, + "dep": "nsubj", + "tag": "NNP", + "orth": "Haag", + "ner": "U-PERSON", + "id": 1 + }, + { + "head": 0, + "dep": "root", + "tag": "VBZ", + "orth": "plays", + "ner": "O", + "id": 2 + }, + { + "head": -1, + "dep": "dobj", + "tag": "NNP", + "orth": "Elianti", + "ner": "U-PERSON", + "id": 3 + }, + { + "head": -2, + "dep": "punct", + "tag": ".", + "orth": ".", + "ner": "O", + "id": 4 + } + ], + "brackets": [ + { + "first": 0, + "last": 1, + "label": "NP-SBJ" + }, + { + "first": 3, + "last": 3, + "label": "NP" + }, + { + "first": 2, + "last": 3, + "label": "VP" + }, + { + "first": 0, + "last": 4, + 
"label": "S" + } + ] + } + ] + } + ] + } + ] diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 5ffc493c3..83d4917f6 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -101,7 +101,7 @@ def generate_meta(): def generate_pipeline(): prints("If set to 'True', the default pipeline is used. If set to 'False', " "the pipeline will be disabled. Components should be specified as a " - "comma-separated list of component names, e.g. tensorizer, tagger, " + "comma-separated list of component names, e.g. tagger, " "parser, ner. For more information, see the docs on processing pipelines.", title="Enter your model's pipeline components") pipeline = util.get_raw_input("Pipeline components", True) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index bd2ca766a..f3327a1d7 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -7,8 +7,8 @@ from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos class Lemmatizer(object): @classmethod - def load(cls, path, index=None, exc=None, rules=None): - return cls(index or {}, exc or {}, rules or {}) + def load(cls, path, index=None, exc=None, rules=None, lookup=None): + return cls(index or {}, exc or {}, rules or {}, lookup or {}) def __init__(self, index=None, exceptions=None, rules=None, lookup=None): self.index = index if index is not None else {} @@ -26,10 +26,10 @@ class Lemmatizer(object): elif univ_pos in (PUNCT, 'PUNCT', 'punct'): univ_pos = 'punct' else: - return set([string.lower()]) + return list(set([string.lower()])) # See Issue #435 for example of where this logic is requied. 
if self.is_base_form(univ_pos, morphology): - return set([string.lower()]) + return list(set([string.lower()])) lemmas = lemmatize(string, self.index.get(univ_pos, {}), self.exc.get(univ_pos, {}), self.rules.get(univ_pos, [])) @@ -108,4 +108,4 @@ def lemmatize(string, index, exceptions, rules): forms.extend(oov_forms) if not forms: forms.append(string) - return set(forms) + return list(set(forms)) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 7845ab4e7..090a07fe8 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -172,7 +172,7 @@ cdef class Morphology: cdef unicode py_string = self.strings[orth] if self.lemmatizer is None: return self.strings.add(py_string.lower()) - cdef set lemma_strings + cdef list lemma_strings cdef unicode lemma_string lemma_strings = self.lemmatizer(py_string, univ_pos, morphology) lemma_string = sorted(lemma_strings)[0] diff --git a/spacy/tests/spans/test_span.py b/spacy/tests/spans/test_span.py index 5e7c638b6..4050809b5 100644 --- a/spacy/tests/spans/test_span.py +++ b/spacy/tests/spans/test_span.py @@ -100,7 +100,7 @@ def test_spans_are_hashable(en_tokenizer): assert hash(span1) != hash(span2) span3 = tokens[0:2] assert hash(span3) == hash(span1) - + def test_spans_by_character(doc): span1 = doc[1:-2] @@ -117,3 +117,9 @@ def test_span_to_array(doc): assert arr[0, 0] == span[0].orth assert arr[0, 1] == len(span[0]) + +@pytest.mark.xfail +def test_span_as_doc(doc): + span = doc[4:10] + span_doc = span.as_doc() + assert span.text == span_doc.text diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index 414ee809e..b7375e2e0 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -181,7 +181,7 @@ mixin codepen(slug, height, default_tab) alt_file - [string] alternative file path used in footer and link button height - [integer] height of code preview in px -mixin github(repo, file, alt_file, height) +mixin github(repo, file, alt_file, height, language) - var 
branch = ALPHA ? "develop" : "master" - var height = height || 250 diff --git a/website/api/_annotation/_training.jade b/website/api/_annotation/_training.jade new file mode 100644 index 000000000..3b11eb2f5 --- /dev/null +++ b/website/api/_annotation/_training.jade @@ -0,0 +1,46 @@ +//- 💫 DOCS > API > ANNOTATION > TRAINING + +p + | spaCy takes training data in JSON format. The built-in + | #[+api("cli#convert") #[code convert]] command helps you convert the + | #[code .conllu] format used by the + | #[+a("https://github.com/UniversalDependencies") Universal Dependencies corpora] + | to spaCy's training format. + ++aside("Annotating entities") + | Named entities are provided in the #[+a("/api/annotation#biluo") BILUO] + | notation. Tokens outside an entity are set to #[code "O"] and tokens + | that are part of an entity are set to the entity label, prefixed by the + | BILUO marker. For example, #[code "B-ORG"] describes the first token of + | a multi-token #[code ORG] entity and #[code "U-PERSON"] a single + | token representing a #[code PERSON] entity. + ++code("Example structure"). + [{ + "id": int, # ID of the document within the corpus + "paragraphs": [{ # list of paragraphs in the corpus + "raw": string, # raw text of the paragraph + "sentences": [{ # list of sentences in the paragraph + "tokens": [{ # list of tokens in the sentence + "id": int, # index of the token in the document + "dep": string, # dependency label + "head": int, # offset of token head relative to token index + "tag": string, # part-of-speech tag + "orth": string, # verbatim text of the token + "ner": string # BILUO label, e.g. 
"O" or "B-ORG" + }], + "brackets": [{ # phrase structure (NOT USED by current models) + "first": int, # index of first token + "last": int, # index of last token + "label": string # phrase label + }] + }] + }] + }] + +p + | Here's an example of dependencies, part-of-speech tags and names + | entities, taken from the English Wall Street Journal portion of the Penn + | Treebank: + ++github("spacy", "examples/training/training-data.json", false, false, "json") diff --git a/website/api/_data.json b/website/api/_data.json index d85b103dc..ba7997690 100644 --- a/website/api/_data.json +++ b/website/api/_data.json @@ -154,13 +154,16 @@ "tokenizer": { "title": "Tokenizer", + "teaser": "Segment text into words, punctuations marks etc.", "tag": "class", "source": "spacy/tokenizer.pyx" }, "lemmatizer": { "title": "Lemmatizer", - "tag": "class" + "teaser": "Assign the base forms of words.", + "tag": "class", + "source": "spacy/lemmatizer.py" }, "tagger": { diff --git a/website/api/annotation.jade b/website/api/annotation.jade index efada23d7..c65cd3983 100644 --- a/website/api/annotation.jade +++ b/website/api/annotation.jade @@ -101,31 +101,4 @@ p This document describes the target annotations spaCy is trained to predict. +section("training") +h(2, "json-input") JSON input format for training - +under-construction - - p spaCy takes training data in the following format: - - +code("Example structure"). 
- doc: { - id: string, - paragraphs: [{ - raw: string, - sents: [int], - tokens: [{ - start: int, - tag: string, - head: int, - dep: string - }], - ner: [{ - start: int, - end: int, - label: string - }], - brackets: [{ - start: int, - end: int, - label: string - }] - }] - } + include _annotation/_training diff --git a/website/api/lemmatizer.jade b/website/api/lemmatizer.jade index 9699395b1..eb061f10a 100644 --- a/website/api/lemmatizer.jade +++ b/website/api/lemmatizer.jade @@ -2,4 +2,159 @@ include ../_includes/_mixins -+under-construction +p + | The #[code Lemmatizer] supports simple part-of-speech-sensitive suffix + | rules and lookup tables. + ++h(2, "init") Lemmatizer.__init__ + +tag method + +p Create a #[code Lemmatizer]. + ++aside-code("Example"). + from spacy.lemmatizer import Lemmatizer + lemmatizer = Lemmatizer() + ++table(["Name", "Type", "Description"]) + +row + +cell #[code index] + +cell dict / #[code None] + +cell Inventory of lemmas in the language. + + +row + +cell #[code exceptions] + +cell dict / #[code None] + +cell Mapping of string forms to lemmas that bypass the #[code rules]. + + +row + +cell #[code rules] + +cell dict / #[code None] + +cell List of suffix rewrite rules. + + +row + +cell #[code lookup] + +cell dict / #[code None] + +cell Lookup table mapping strings to their lemmas. + + +row("foot") + +cell returns + +cell #[code Lemmatizer] + +cell The newly created object. + ++h(2, "call") Lemmatizer.__call__ + +tag method + +p Lemmatize a string. + ++aside-code("Example"). + from spacy.lemmatizer import Lemmatizer + from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES + lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES) + lemmas = lemmatizer(u'ducks', u'NOUN') + assert lemmas == [u'duck'] + ++table(["Name", "Type", "Description"]) + +row + +cell #[code string] + +cell unicode + +cell The string to lemmatize, e.g. the token text. 
+ + +row + +cell #[code univ_pos] + +cell unicode / int + +cell The token's universal part-of-speech tag. + + +row + +cell #[code morphology] + +cell dict / #[code None] + +cell + | Morphological features following the + | #[+a("http://universaldependencies.org/") Universal Dependencies] + | scheme. + + +row("foot") + +cell returns + +cell list + +cell The available lemmas for the string. + ++h(2, "lookup") Lemmatizer.lookup + +tag method + +tag-new(2) + +p + | Look up a lemma in the lookup table, if available. If no lemma is found, + | the original string is returned. Languages can provide a + | #[+a("/usage/adding-languages#lemmatizer") lookup table] via the + | #[code lemma_lookup] variable, set on the individual #[code Language] + | class. + ++aside-code("Example"). + lookup = {u'going': u'go'} + lemmatizer = Lemmatizer(lookup=lookup) + assert lemmatizer.lookup(u'going') == u'go' + ++table(["Name", "Type", "Description"]) + +row + +cell #[code string] + +cell unicode + +cell The string to look up. + + +row("foot") + +cell returns + +cell unicode + +cell The lemma if the string was found, otherwise the original string. + ++h(2, "is_base_form") Lemmatizer.is_base_form + +tag method + +p + | Check whether we're dealing with an uninflected paradigm, so we can + | avoid lemmatization entirely. + ++aside-code("Example"). + pos = 'verb' + morph = {'VerbForm': 'inf'} + is_base_form = lemmatizer.is_base_form(pos, morph) + assert is_base_form == True + ++table(["Name", "Type", "Description"]) + +row + +cell #[code univ_pos] + +cell unicode / int + +cell The token's universal part-of-speech tag. + + +row + +cell #[code morphology] + +cell dict + +cell The token's morphological features. + + +row("foot") + +cell returns + +cell bool + +cell + | Whether the token's part-of-speech tag and morphological features + | describe a base form. 
+ ++h(2, "attributes") Attributes + ++table(["Name", "Type", "Description"]) + +row + +cell #[code index] + +cell dict / #[code None] + +cell Inventory of lemmas in the language. + + +row + +cell #[code exc] + +cell dict / #[code None] + +cell Mapping of string forms to lemmas that bypass the #[code rules]. + + +row + +cell #[code rules] + +cell dict / #[code None] + +cell List of suffix rewrite rules. + + +row + +cell #[code lookup_table] + +tag-new(2) + +cell dict / #[code None] + +cell The lemma lookup table, if available. diff --git a/website/usage/_training/_tagger-parser.jade b/website/usage/_training/_tagger-parser.jade index 4011464c7..a62b9d43e 100644 --- a/website/usage/_training/_tagger-parser.jade +++ b/website/usage/_training/_tagger-parser.jade @@ -1,3 +1,7 @@ //- 💫 DOCS > USAGE > TRAINING > TAGGER & PARSER +under-construction + ++h(3, "training-json") JSON format for training + +include ../../api/_annotation/_training