mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 12:18:04 +03:00
82159b5c19
* Updates/bugfixes for NER/IOB converters * Converter formats `ner` and `iob` use autodetect to choose a converter if possible * `iob2json` is reverted to handle sentence-per-line data like `word1|pos1|ent1 word2|pos2|ent2` * Fix bug in `merge_sentences()` so the second sentence in each batch isn't skipped * `conll_ner2json` is made more general so it can handle more formats with whitespace-separated columns * Supports all formats where the first column is the token and the final column is the IOB tag; if present, the second column is the POS tag * As in CoNLL 2003 NER, blank lines separate sentences, `-DOCSTART- -X- O O` separates documents * Add option for segmenting sentences (new flag `-s`) * Parser-based sentence segmentation with a provided model, otherwise with sentencizer (new option `-b` to specify model) * Can group sentences into documents with `n_sents` as long as sentence segmentation is available * Only applies automatic segmentation when there are no existing delimiters in the data * Provide info about settings applied during conversion with warnings and suggestions if settings conflict or might not be optimal. * Add tests for common formats * Add '(default)' back to docs for -c auto * Add document count back to output * Revert changes to converter output message * Use explicit tabs in convert CLI test data * Adjust/add messages for n_sents=1 default * Add sample NER data to training examples * Update README * Add links in docs to example NER data * Define msg within converters
349 lines
7.9 KiB
JSON
349 lines
7.9 KiB
JSON
[
|
|
{
|
|
"id":0,
|
|
"paragraphs":[
|
|
{
|
|
"sentences":[
|
|
{
|
|
"tokens":[
|
|
{
|
|
"orth":"When",
|
|
"tag":"WRB",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"Sebastian",
|
|
"tag":"NNP",
|
|
"ner":"B-PERSON"
|
|
},
|
|
{
|
|
"orth":"Thrun",
|
|
"tag":"NNP",
|
|
"ner":"L-PERSON"
|
|
},
|
|
{
|
|
"orth":"started",
|
|
"tag":"VBD",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"working",
|
|
"tag":"VBG",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"on",
|
|
"tag":"IN",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"self",
|
|
"tag":"NN",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"-",
|
|
"tag":"HYPH",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"driving",
|
|
"tag":"VBG",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"cars",
|
|
"tag":"NNS",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"at",
|
|
"tag":"IN",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"Google",
|
|
"tag":"NNP",
|
|
"ner":"U-ORG"
|
|
},
|
|
{
|
|
"orth":"in",
|
|
"tag":"IN",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"2007",
|
|
"tag":"CD",
|
|
"ner":"U-DATE"
|
|
},
|
|
{
|
|
"orth":",",
|
|
"tag":",",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"few",
|
|
"tag":"JJ",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"people",
|
|
"tag":"NNS",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"outside",
|
|
"tag":"RB",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"of",
|
|
"tag":"IN",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"the",
|
|
"tag":"DT",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"company",
|
|
"tag":"NN",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"took",
|
|
"tag":"VBD",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"him",
|
|
"tag":"PRP",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"seriously",
|
|
"tag":"RB",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":".",
|
|
"tag":".",
|
|
"ner":"O"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tokens":[
|
|
{
|
|
"orth":"\u201c",
|
|
"tag":"''",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"I",
|
|
"tag":"PRP",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"can",
|
|
"tag":"MD",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"tell",
|
|
"tag":"VB",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"you",
|
|
"tag":"PRP",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"very",
|
|
"tag":"RB",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"senior",
|
|
"tag":"JJ",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"CEOs",
|
|
"tag":"NNS",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"of",
|
|
"tag":"IN",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"major",
|
|
"tag":"JJ",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"American",
|
|
"tag":"JJ",
|
|
"ner":"U-NORP"
|
|
},
|
|
{
|
|
"orth":"car",
|
|
"tag":"NN",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"companies",
|
|
"tag":"NNS",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"would",
|
|
"tag":"MD",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"shake",
|
|
"tag":"VB",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"my",
|
|
"tag":"PRP$",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"hand",
|
|
"tag":"NN",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"and",
|
|
"tag":"CC",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"turn",
|
|
"tag":"VB",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"away",
|
|
"tag":"RB",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"because",
|
|
"tag":"IN",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"I",
|
|
"tag":"PRP",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"was",
|
|
"tag":"VBD",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"n\u2019t",
|
|
"tag":"RB",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"worth",
|
|
"tag":"JJ",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"talking",
|
|
"tag":"VBG",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"to",
|
|
"tag":"IN",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":",",
|
|
"tag":",",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"\u201d",
|
|
"tag":"''",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"said",
|
|
"tag":"VBD",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"Thrun",
|
|
"tag":"NNP",
|
|
"ner":"U-PERSON"
|
|
},
|
|
{
|
|
"orth":",",
|
|
"tag":",",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"in",
|
|
"tag":"IN",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"an",
|
|
"tag":"DT",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"interview",
|
|
"tag":"NN",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"with",
|
|
"tag":"IN",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"Recode",
|
|
"tag":"NNP",
|
|
"ner":"U-ORG"
|
|
},
|
|
{
|
|
"orth":"earlier",
|
|
"tag":"RBR",
|
|
"ner":"B-DATE"
|
|
},
|
|
{
|
|
"orth":"this",
|
|
"tag":"DT",
|
|
"ner":"I-DATE"
|
|
},
|
|
{
|
|
"orth":"week",
|
|
"tag":"NN",
|
|
"ner":"L-DATE"
|
|
},
|
|
{
|
|
"orth":".",
|
|
"tag":".",
|
|
"ner":"O"
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|
|
] |