mirror of
https://github.com/explosion/spaCy.git
synced 2025-12-25 02:53:14 +03:00
* Updates/bugfixes for NER/IOB converters
* Converter formats `ner` and `iob` use autodetect to choose a converter if
possible
* `iob2json` is reverted to handle sentence-per-line data like
`word1|pos1|ent1 word2|pos2|ent2`
* Fix bug in `merge_sentences()` so the second sentence in each batch isn't
skipped
* `conll_ner2json` is made more general so it can handle more formats with
whitespace-separated columns
* Supports all formats where the first column is the token and the final
column is the IOB tag; if present, the second column is the POS tag
* As in CoNLL 2003 NER, blank lines separate sentences, `-DOCSTART- -X- O O`
separates documents
* Add option for segmenting sentences (new flag `-s`)
* Parser-based sentence segmentation with a provided model, otherwise with
sentencizer (new option `-b` to specify model)
* Can group sentences into documents with `n_sents` as long as sentence
segmentation is available
* Only applies automatic segmentation when there are no existing delimiters
in the data
* Provide info about settings applied during conversion with warnings and
suggestions if settings conflict or might not be optimal.
* Add tests for common formats
* Add '(default)' back to docs for -c auto
* Add document count back to output
* Revert changes to converter output message
* Use explicit tabs in convert CLI test data
* Adjust/add messages for n_sents=1 default
* Add sample NER data to training examples
* Update README
* Add links in docs to example NER data
* Define msg within converters
353 lines
7.9 KiB
JSON
353 lines
7.9 KiB
JSON
[
|
|
{
|
|
"id":0,
|
|
"paragraphs":[
|
|
{
|
|
"sentences":[
|
|
{
|
|
"tokens":[
|
|
{
|
|
"orth":"When",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"Sebastian",
|
|
"tag":"-",
|
|
"ner":"B-PERSON"
|
|
},
|
|
{
|
|
"orth":"Thrun",
|
|
"tag":"-",
|
|
"ner":"L-PERSON"
|
|
},
|
|
{
|
|
"orth":"started",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"working",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"on",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"self",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"-",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"driving",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"cars",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"at",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"Google",
|
|
"tag":"-",
|
|
"ner":"U-ORG"
|
|
},
|
|
{
|
|
"orth":"in",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"2007",
|
|
"tag":"-",
|
|
"ner":"U-DATE"
|
|
},
|
|
{
|
|
"orth":",",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"few",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"people",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"outside",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"of",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"the",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"company",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"took",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"him",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"seriously",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":".",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tokens":[
|
|
{
|
|
"orth":"\u201c",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"tokens":[
|
|
{
|
|
"orth":"I",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"can",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"tell",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"you",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"very",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"senior",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"CEOs",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"of",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"major",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"American",
|
|
"tag":"-",
|
|
"ner":"U-NORP"
|
|
},
|
|
{
|
|
"orth":"car",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"companies",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"would",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"shake",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"my",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"hand",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"and",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"turn",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"away",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"because",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"I",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"was",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"n\u2019t",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"worth",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"talking",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"to",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":",",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"\u201d",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"said",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"Thrun",
|
|
"tag":"-",
|
|
"ner":"U-PERSON"
|
|
},
|
|
{
|
|
"orth":",",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"in",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"an",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"interview",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"with",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
},
|
|
{
|
|
"orth":"Recode",
|
|
"tag":"-",
|
|
"ner":"U-ORG"
|
|
},
|
|
{
|
|
"orth":"earlier",
|
|
"tag":"-",
|
|
"ner":"B-DATE"
|
|
},
|
|
{
|
|
"orth":"this",
|
|
"tag":"-",
|
|
"ner":"I-DATE"
|
|
},
|
|
{
|
|
"orth":"week",
|
|
"tag":"-",
|
|
"ner":"L-DATE"
|
|
},
|
|
{
|
|
"orth":".",
|
|
"tag":"-",
|
|
"ner":"O"
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|
|
] |