mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 00:46:28 +03:00
Update develop from master
This commit is contained in:
commit
1a65c5b7af
BIN
.github/contributors/Schibsted.png
vendored
Normal file
BIN
.github/contributors/Schibsted.png
vendored
Normal file
Binary file not shown.
After Width: | Height: | Size: 259 KiB |
|
@ -12,15 +12,15 @@ class KerasSimilarityShim(object):
|
|||
|
||||
@classmethod
|
||||
def load(cls, path, nlp, max_length=100, get_features=None):
|
||||
|
||||
|
||||
if get_features is None:
|
||||
get_features = get_word_ids
|
||||
|
||||
with (path / 'config.json').open() as file_:
|
||||
|
||||
with (path / "config.json").open() as file_:
|
||||
model = model_from_json(file_.read())
|
||||
with (path / 'model').open('rb') as file_:
|
||||
with (path / "model").open("rb") as file_:
|
||||
weights = pickle.load(file_)
|
||||
|
||||
|
||||
embeddings = get_embeddings(nlp.vocab)
|
||||
weights.insert(1, embeddings)
|
||||
model.set_weights(weights)
|
||||
|
@ -33,8 +33,8 @@ class KerasSimilarityShim(object):
|
|||
self.max_length = max_length
|
||||
|
||||
def __call__(self, doc):
|
||||
doc.user_hooks['similarity'] = self.predict
|
||||
doc.user_span_hooks['similarity'] = self.predict
|
||||
doc.user_hooks["similarity"] = self.predict
|
||||
doc.user_span_hooks["similarity"] = self.predict
|
||||
|
||||
return doc
|
||||
|
||||
|
@ -48,24 +48,24 @@ class KerasSimilarityShim(object):
|
|||
|
||||
def get_embeddings(vocab, nr_unk=100):
|
||||
# the extra +1 is for a zero vector representing sentence-final padding
|
||||
num_vectors = max(lex.rank for lex in vocab) + 2
|
||||
|
||||
num_vectors = max(lex.rank for lex in vocab) + 2
|
||||
|
||||
# create random vectors for OOV tokens
|
||||
oov = np.random.normal(size=(nr_unk, vocab.vectors_length))
|
||||
oov = oov / oov.sum(axis=1, keepdims=True)
|
||||
|
||||
vectors = np.zeros((num_vectors + nr_unk, vocab.vectors_length), dtype='float32')
|
||||
vectors[1:(nr_unk + 1), ] = oov
|
||||
|
||||
vectors = np.zeros((num_vectors + nr_unk, vocab.vectors_length), dtype="float32")
|
||||
vectors[1 : (nr_unk + 1),] = oov
|
||||
for lex in vocab:
|
||||
if lex.has_vector and lex.vector_norm > 0:
|
||||
vectors[nr_unk + lex.rank + 1] = lex.vector / lex.vector_norm
|
||||
vectors[nr_unk + lex.rank + 1] = lex.vector / lex.vector_norm
|
||||
|
||||
return vectors
|
||||
|
||||
|
||||
def get_word_ids(docs, max_length=100, nr_unk=100):
|
||||
Xs = np.zeros((len(docs), max_length), dtype='int32')
|
||||
|
||||
Xs = np.zeros((len(docs), max_length), dtype="int32")
|
||||
|
||||
for i, doc in enumerate(docs):
|
||||
for j, token in enumerate(doc):
|
||||
if j == max_length:
|
||||
|
|
7
examples/training/ner_example_data/README.md
Normal file
7
examples/training/ner_example_data/README.md
Normal file
|
@ -0,0 +1,7 @@
|
|||
## Examples of NER/IOB data that can be converted with `spacy convert`
|
||||
|
||||
spacy JSON training files were generated with:
|
||||
|
||||
```
|
||||
python -m spacy convert -c iob -s -n 10 -b en file.iob
|
||||
```
|
2
examples/training/ner_example_data/ner-sent-per-line.iob
Normal file
2
examples/training/ner_example_data/ner-sent-per-line.iob
Normal file
|
@ -0,0 +1,2 @@
|
|||
When|WRB|O Sebastian|NNP|B-PERSON Thrun|NNP|I-PERSON started|VBD|O working|VBG|O on|IN|O self|NN|O -|HYPH|O driving|VBG|O cars|NNS|O at|IN|O Google|NNP|B-ORG in|IN|O 2007|CD|B-DATE ,|,|O few|JJ|O people|NNS|O outside|RB|O of|IN|O the|DT|O company|NN|O took|VBD|O him|PRP|O seriously|RB|O .|.|O
|
||||
“|''|O I|PRP|O can|MD|O tell|VB|O you|PRP|O very|RB|O senior|JJ|O CEOs|NNS|O of|IN|O major|JJ|O American|JJ|B-NORP car|NN|O companies|NNS|O would|MD|O shake|VB|O my|PRP$|O hand|NN|O and|CC|O turn|VB|O away|RB|O because|IN|O I|PRP|O was|VBD|O n’t|RB|O worth|JJ|O talking|VBG|O to|IN|O ,|,|O ”|''|O said|VBD|O Thrun|NNP|B-PERSON ,|,|O in|IN|O an|DT|O interview|NN|O with|IN|O Recode|NNP|B-ORG earlier|RBR|B-DATE this|DT|I-DATE week|NN|I-DATE .|.|O
|
349
examples/training/ner_example_data/ner-sent-per-line.json
Normal file
349
examples/training/ner_example_data/ner-sent-per-line.json
Normal file
|
@ -0,0 +1,349 @@
|
|||
[
|
||||
{
|
||||
"id":0,
|
||||
"paragraphs":[
|
||||
{
|
||||
"sentences":[
|
||||
{
|
||||
"tokens":[
|
||||
{
|
||||
"orth":"When",
|
||||
"tag":"WRB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"Sebastian",
|
||||
"tag":"NNP",
|
||||
"ner":"B-PERSON"
|
||||
},
|
||||
{
|
||||
"orth":"Thrun",
|
||||
"tag":"NNP",
|
||||
"ner":"L-PERSON"
|
||||
},
|
||||
{
|
||||
"orth":"started",
|
||||
"tag":"VBD",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"working",
|
||||
"tag":"VBG",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"on",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"self",
|
||||
"tag":"NN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"-",
|
||||
"tag":"HYPH",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"driving",
|
||||
"tag":"VBG",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"cars",
|
||||
"tag":"NNS",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"at",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"Google",
|
||||
"tag":"NNP",
|
||||
"ner":"U-ORG"
|
||||
},
|
||||
{
|
||||
"orth":"in",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"2007",
|
||||
"tag":"CD",
|
||||
"ner":"U-DATE"
|
||||
},
|
||||
{
|
||||
"orth":",",
|
||||
"tag":",",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"few",
|
||||
"tag":"JJ",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"people",
|
||||
"tag":"NNS",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"outside",
|
||||
"tag":"RB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"of",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"the",
|
||||
"tag":"DT",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"company",
|
||||
"tag":"NN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"took",
|
||||
"tag":"VBD",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"him",
|
||||
"tag":"PRP",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"seriously",
|
||||
"tag":"RB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":".",
|
||||
"tag":".",
|
||||
"ner":"O"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"tokens":[
|
||||
{
|
||||
"orth":"\u201c",
|
||||
"tag":"''",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"I",
|
||||
"tag":"PRP",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"can",
|
||||
"tag":"MD",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"tell",
|
||||
"tag":"VB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"you",
|
||||
"tag":"PRP",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"very",
|
||||
"tag":"RB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"senior",
|
||||
"tag":"JJ",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"CEOs",
|
||||
"tag":"NNS",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"of",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"major",
|
||||
"tag":"JJ",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"American",
|
||||
"tag":"JJ",
|
||||
"ner":"U-NORP"
|
||||
},
|
||||
{
|
||||
"orth":"car",
|
||||
"tag":"NN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"companies",
|
||||
"tag":"NNS",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"would",
|
||||
"tag":"MD",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"shake",
|
||||
"tag":"VB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"my",
|
||||
"tag":"PRP$",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"hand",
|
||||
"tag":"NN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"and",
|
||||
"tag":"CC",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"turn",
|
||||
"tag":"VB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"away",
|
||||
"tag":"RB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"because",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"I",
|
||||
"tag":"PRP",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"was",
|
||||
"tag":"VBD",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"n\u2019t",
|
||||
"tag":"RB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"worth",
|
||||
"tag":"JJ",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"talking",
|
||||
"tag":"VBG",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"to",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":",",
|
||||
"tag":",",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"\u201d",
|
||||
"tag":"''",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"said",
|
||||
"tag":"VBD",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"Thrun",
|
||||
"tag":"NNP",
|
||||
"ner":"U-PERSON"
|
||||
},
|
||||
{
|
||||
"orth":",",
|
||||
"tag":",",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"in",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"an",
|
||||
"tag":"DT",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"interview",
|
||||
"tag":"NN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"with",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"Recode",
|
||||
"tag":"NNP",
|
||||
"ner":"U-ORG"
|
||||
},
|
||||
{
|
||||
"orth":"earlier",
|
||||
"tag":"RBR",
|
||||
"ner":"B-DATE"
|
||||
},
|
||||
{
|
||||
"orth":"this",
|
||||
"tag":"DT",
|
||||
"ner":"I-DATE"
|
||||
},
|
||||
{
|
||||
"orth":"week",
|
||||
"tag":"NN",
|
||||
"ner":"L-DATE"
|
||||
},
|
||||
{
|
||||
"orth":".",
|
||||
"tag":".",
|
||||
"ner":"O"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
|
@ -0,0 +1,70 @@
|
|||
-DOCSTART- -X- O O
|
||||
|
||||
When WRB _ O
|
||||
Sebastian NNP _ B-PERSON
|
||||
Thrun NNP _ I-PERSON
|
||||
started VBD _ O
|
||||
working VBG _ O
|
||||
on IN _ O
|
||||
self NN _ O
|
||||
- HYPH _ O
|
||||
driving VBG _ O
|
||||
cars NNS _ O
|
||||
at IN _ O
|
||||
Google NNP _ B-ORG
|
||||
in IN _ O
|
||||
2007 CD _ B-DATE
|
||||
, , _ O
|
||||
few JJ _ O
|
||||
people NNS _ O
|
||||
outside RB _ O
|
||||
of IN _ O
|
||||
the DT _ O
|
||||
company NN _ O
|
||||
took VBD _ O
|
||||
him PRP _ O
|
||||
seriously RB _ O
|
||||
. . _ O
|
||||
|
||||
“ '' _ O
|
||||
I PRP _ O
|
||||
can MD _ O
|
||||
tell VB _ O
|
||||
you PRP _ O
|
||||
very RB _ O
|
||||
senior JJ _ O
|
||||
CEOs NNS _ O
|
||||
of IN _ O
|
||||
major JJ _ O
|
||||
American JJ _ B-NORP
|
||||
car NN _ O
|
||||
companies NNS _ O
|
||||
would MD _ O
|
||||
shake VB _ O
|
||||
my PRP$ _ O
|
||||
hand NN _ O
|
||||
and CC _ O
|
||||
turn VB _ O
|
||||
away RB _ O
|
||||
because IN _ O
|
||||
I PRP _ O
|
||||
was VBD _ O
|
||||
n’t RB _ O
|
||||
worth JJ _ O
|
||||
talking VBG _ O
|
||||
to IN _ O
|
||||
, , _ O
|
||||
” '' _ O
|
||||
said VBD _ O
|
||||
Thrun NNP _ B-PERSON
|
||||
, , _ O
|
||||
in IN _ O
|
||||
an DT _ O
|
||||
interview NN _ O
|
||||
with IN _ O
|
||||
Recode NNP _ B-ORG
|
||||
earlier RBR _ B-DATE
|
||||
this DT _ I-DATE
|
||||
week NN _ I-DATE
|
||||
. . _ O
|
||||
|
|
@ -0,0 +1,349 @@
|
|||
[
|
||||
{
|
||||
"id":0,
|
||||
"paragraphs":[
|
||||
{
|
||||
"sentences":[
|
||||
{
|
||||
"tokens":[
|
||||
{
|
||||
"orth":"When",
|
||||
"tag":"WRB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"Sebastian",
|
||||
"tag":"NNP",
|
||||
"ner":"B-PERSON"
|
||||
},
|
||||
{
|
||||
"orth":"Thrun",
|
||||
"tag":"NNP",
|
||||
"ner":"L-PERSON"
|
||||
},
|
||||
{
|
||||
"orth":"started",
|
||||
"tag":"VBD",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"working",
|
||||
"tag":"VBG",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"on",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"self",
|
||||
"tag":"NN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"-",
|
||||
"tag":"HYPH",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"driving",
|
||||
"tag":"VBG",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"cars",
|
||||
"tag":"NNS",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"at",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"Google",
|
||||
"tag":"NNP",
|
||||
"ner":"U-ORG"
|
||||
},
|
||||
{
|
||||
"orth":"in",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"2007",
|
||||
"tag":"CD",
|
||||
"ner":"U-DATE"
|
||||
},
|
||||
{
|
||||
"orth":",",
|
||||
"tag":",",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"few",
|
||||
"tag":"JJ",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"people",
|
||||
"tag":"NNS",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"outside",
|
||||
"tag":"RB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"of",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"the",
|
||||
"tag":"DT",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"company",
|
||||
"tag":"NN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"took",
|
||||
"tag":"VBD",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"him",
|
||||
"tag":"PRP",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"seriously",
|
||||
"tag":"RB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":".",
|
||||
"tag":".",
|
||||
"ner":"O"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"tokens":[
|
||||
{
|
||||
"orth":"\u201c",
|
||||
"tag":"''",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"I",
|
||||
"tag":"PRP",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"can",
|
||||
"tag":"MD",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"tell",
|
||||
"tag":"VB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"you",
|
||||
"tag":"PRP",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"very",
|
||||
"tag":"RB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"senior",
|
||||
"tag":"JJ",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"CEOs",
|
||||
"tag":"NNS",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"of",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"major",
|
||||
"tag":"JJ",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"American",
|
||||
"tag":"JJ",
|
||||
"ner":"U-NORP"
|
||||
},
|
||||
{
|
||||
"orth":"car",
|
||||
"tag":"NN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"companies",
|
||||
"tag":"NNS",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"would",
|
||||
"tag":"MD",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"shake",
|
||||
"tag":"VB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"my",
|
||||
"tag":"PRP$",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"hand",
|
||||
"tag":"NN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"and",
|
||||
"tag":"CC",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"turn",
|
||||
"tag":"VB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"away",
|
||||
"tag":"RB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"because",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"I",
|
||||
"tag":"PRP",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"was",
|
||||
"tag":"VBD",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"n\u2019t",
|
||||
"tag":"RB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"worth",
|
||||
"tag":"JJ",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"talking",
|
||||
"tag":"VBG",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"to",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":",",
|
||||
"tag":",",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"\u201d",
|
||||
"tag":"''",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"said",
|
||||
"tag":"VBD",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"Thrun",
|
||||
"tag":"NNP",
|
||||
"ner":"U-PERSON"
|
||||
},
|
||||
{
|
||||
"orth":",",
|
||||
"tag":",",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"in",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"an",
|
||||
"tag":"DT",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"interview",
|
||||
"tag":"NN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"with",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"Recode",
|
||||
"tag":"NNP",
|
||||
"ner":"U-ORG"
|
||||
},
|
||||
{
|
||||
"orth":"earlier",
|
||||
"tag":"RBR",
|
||||
"ner":"B-DATE"
|
||||
},
|
||||
{
|
||||
"orth":"this",
|
||||
"tag":"DT",
|
||||
"ner":"I-DATE"
|
||||
},
|
||||
{
|
||||
"orth":"week",
|
||||
"tag":"NN",
|
||||
"ner":"L-DATE"
|
||||
},
|
||||
{
|
||||
"orth":".",
|
||||
"tag":".",
|
||||
"ner":"O"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
|
@ -0,0 +1,66 @@
|
|||
When WRB O
|
||||
Sebastian NNP B-PERSON
|
||||
Thrun NNP I-PERSON
|
||||
started VBD O
|
||||
working VBG O
|
||||
on IN O
|
||||
self NN O
|
||||
- HYPH O
|
||||
driving VBG O
|
||||
cars NNS O
|
||||
at IN O
|
||||
Google NNP B-ORG
|
||||
in IN O
|
||||
2007 CD B-DATE
|
||||
, , O
|
||||
few JJ O
|
||||
people NNS O
|
||||
outside RB O
|
||||
of IN O
|
||||
the DT O
|
||||
company NN O
|
||||
took VBD O
|
||||
him PRP O
|
||||
seriously RB O
|
||||
. . O
|
||||
“ '' O
|
||||
I PRP O
|
||||
can MD O
|
||||
tell VB O
|
||||
you PRP O
|
||||
very RB O
|
||||
senior JJ O
|
||||
CEOs NNS O
|
||||
of IN O
|
||||
major JJ O
|
||||
American JJ B-NORP
|
||||
car NN O
|
||||
companies NNS O
|
||||
would MD O
|
||||
shake VB O
|
||||
my PRP$ O
|
||||
hand NN O
|
||||
and CC O
|
||||
turn VB O
|
||||
away RB O
|
||||
because IN O
|
||||
I PRP O
|
||||
was VBD O
|
||||
n’t RB O
|
||||
worth JJ O
|
||||
talking VBG O
|
||||
to IN O
|
||||
, , O
|
||||
” '' O
|
||||
said VBD O
|
||||
Thrun NNP B-PERSON
|
||||
, , O
|
||||
in IN O
|
||||
an DT O
|
||||
interview NN O
|
||||
with IN O
|
||||
Recode NNP B-ORG
|
||||
earlier RBR B-DATE
|
||||
this DT I-DATE
|
||||
week NN I-DATE
|
||||
. . O
|
|
@ -0,0 +1,353 @@
|
|||
[
|
||||
{
|
||||
"id":0,
|
||||
"paragraphs":[
|
||||
{
|
||||
"sentences":[
|
||||
{
|
||||
"tokens":[
|
||||
{
|
||||
"orth":"When",
|
||||
"tag":"WRB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"Sebastian",
|
||||
"tag":"NNP",
|
||||
"ner":"B-PERSON"
|
||||
},
|
||||
{
|
||||
"orth":"Thrun",
|
||||
"tag":"NNP",
|
||||
"ner":"L-PERSON"
|
||||
},
|
||||
{
|
||||
"orth":"started",
|
||||
"tag":"VBD",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"working",
|
||||
"tag":"VBG",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"on",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"self",
|
||||
"tag":"NN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"-",
|
||||
"tag":"HYPH",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"driving",
|
||||
"tag":"VBG",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"cars",
|
||||
"tag":"NNS",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"at",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"Google",
|
||||
"tag":"NNP",
|
||||
"ner":"U-ORG"
|
||||
},
|
||||
{
|
||||
"orth":"in",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"2007",
|
||||
"tag":"CD",
|
||||
"ner":"U-DATE"
|
||||
},
|
||||
{
|
||||
"orth":",",
|
||||
"tag":",",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"few",
|
||||
"tag":"JJ",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"people",
|
||||
"tag":"NNS",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"outside",
|
||||
"tag":"RB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"of",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"the",
|
||||
"tag":"DT",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"company",
|
||||
"tag":"NN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"took",
|
||||
"tag":"VBD",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"him",
|
||||
"tag":"PRP",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"seriously",
|
||||
"tag":"RB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":".",
|
||||
"tag":".",
|
||||
"ner":"O"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"tokens":[
|
||||
{
|
||||
"orth":"\u201c",
|
||||
"tag":"''",
|
||||
"ner":"O"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"tokens":[
|
||||
{
|
||||
"orth":"I",
|
||||
"tag":"PRP",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"can",
|
||||
"tag":"MD",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"tell",
|
||||
"tag":"VB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"you",
|
||||
"tag":"PRP",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"very",
|
||||
"tag":"RB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"senior",
|
||||
"tag":"JJ",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"CEOs",
|
||||
"tag":"NNS",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"of",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"major",
|
||||
"tag":"JJ",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"American",
|
||||
"tag":"JJ",
|
||||
"ner":"U-NORP"
|
||||
},
|
||||
{
|
||||
"orth":"car",
|
||||
"tag":"NN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"companies",
|
||||
"tag":"NNS",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"would",
|
||||
"tag":"MD",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"shake",
|
||||
"tag":"VB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"my",
|
||||
"tag":"PRP$",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"hand",
|
||||
"tag":"NN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"and",
|
||||
"tag":"CC",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"turn",
|
||||
"tag":"VB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"away",
|
||||
"tag":"RB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"because",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"I",
|
||||
"tag":"PRP",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"was",
|
||||
"tag":"VBD",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"n\u2019t",
|
||||
"tag":"RB",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"worth",
|
||||
"tag":"JJ",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"talking",
|
||||
"tag":"VBG",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"to",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":",",
|
||||
"tag":",",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"\u201d",
|
||||
"tag":"''",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"said",
|
||||
"tag":"VBD",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"Thrun",
|
||||
"tag":"NNP",
|
||||
"ner":"U-PERSON"
|
||||
},
|
||||
{
|
||||
"orth":",",
|
||||
"tag":",",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"in",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"an",
|
||||
"tag":"DT",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"interview",
|
||||
"tag":"NN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"with",
|
||||
"tag":"IN",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"Recode",
|
||||
"tag":"NNP",
|
||||
"ner":"U-ORG"
|
||||
},
|
||||
{
|
||||
"orth":"earlier",
|
||||
"tag":"RBR",
|
||||
"ner":"B-DATE"
|
||||
},
|
||||
{
|
||||
"orth":"this",
|
||||
"tag":"DT",
|
||||
"ner":"I-DATE"
|
||||
},
|
||||
{
|
||||
"orth":"week",
|
||||
"tag":"NN",
|
||||
"ner":"L-DATE"
|
||||
},
|
||||
{
|
||||
"orth":".",
|
||||
"tag":".",
|
||||
"ner":"O"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
66
examples/training/ner_example_data/ner-token-per-line.iob
Normal file
66
examples/training/ner_example_data/ner-token-per-line.iob
Normal file
|
@ -0,0 +1,66 @@
|
|||
When O
|
||||
Sebastian B-PERSON
|
||||
Thrun I-PERSON
|
||||
started O
|
||||
working O
|
||||
on O
|
||||
self O
|
||||
- O
|
||||
driving O
|
||||
cars O
|
||||
at O
|
||||
Google B-ORG
|
||||
in O
|
||||
2007 B-DATE
|
||||
, O
|
||||
few O
|
||||
people O
|
||||
outside O
|
||||
of O
|
||||
the O
|
||||
company O
|
||||
took O
|
||||
him O
|
||||
seriously O
|
||||
. O
|
||||
“ O
|
||||
I O
|
||||
can O
|
||||
tell O
|
||||
you O
|
||||
very O
|
||||
senior O
|
||||
CEOs O
|
||||
of O
|
||||
major O
|
||||
American B-NORP
|
||||
car O
|
||||
companies O
|
||||
would O
|
||||
shake O
|
||||
my O
|
||||
hand O
|
||||
and O
|
||||
turn O
|
||||
away O
|
||||
because O
|
||||
I O
|
||||
was O
|
||||
n’t O
|
||||
worth O
|
||||
talking O
|
||||
to O
|
||||
, O
|
||||
” O
|
||||
said O
|
||||
Thrun B-PERSON
|
||||
, O
|
||||
in O
|
||||
an O
|
||||
interview O
|
||||
with O
|
||||
Recode B-ORG
|
||||
earlier B-DATE
|
||||
this I-DATE
|
||||
week I-DATE
|
||||
. O
|
353
examples/training/ner_example_data/ner-token-per-line.json
Normal file
353
examples/training/ner_example_data/ner-token-per-line.json
Normal file
|
@ -0,0 +1,353 @@
|
|||
[
|
||||
{
|
||||
"id":0,
|
||||
"paragraphs":[
|
||||
{
|
||||
"sentences":[
|
||||
{
|
||||
"tokens":[
|
||||
{
|
||||
"orth":"When",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"Sebastian",
|
||||
"tag":"-",
|
||||
"ner":"B-PERSON"
|
||||
},
|
||||
{
|
||||
"orth":"Thrun",
|
||||
"tag":"-",
|
||||
"ner":"L-PERSON"
|
||||
},
|
||||
{
|
||||
"orth":"started",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"working",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"on",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"self",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"-",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"driving",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"cars",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"at",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"Google",
|
||||
"tag":"-",
|
||||
"ner":"U-ORG"
|
||||
},
|
||||
{
|
||||
"orth":"in",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"2007",
|
||||
"tag":"-",
|
||||
"ner":"U-DATE"
|
||||
},
|
||||
{
|
||||
"orth":",",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"few",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"people",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"outside",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"of",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"the",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"company",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"took",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"him",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"seriously",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":".",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"tokens":[
|
||||
{
|
||||
"orth":"\u201c",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"tokens":[
|
||||
{
|
||||
"orth":"I",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"can",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"tell",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"you",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"very",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"senior",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"CEOs",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"of",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"major",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"American",
|
||||
"tag":"-",
|
||||
"ner":"U-NORP"
|
||||
},
|
||||
{
|
||||
"orth":"car",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"companies",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"would",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"shake",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"my",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"hand",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"and",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"turn",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"away",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"because",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"I",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"was",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"n\u2019t",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"worth",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"talking",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"to",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":",",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"\u201d",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"said",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"Thrun",
|
||||
"tag":"-",
|
||||
"ner":"U-PERSON"
|
||||
},
|
||||
{
|
||||
"orth":",",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"in",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"an",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"interview",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"with",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
},
|
||||
{
|
||||
"orth":"Recode",
|
||||
"tag":"-",
|
||||
"ner":"U-ORG"
|
||||
},
|
||||
{
|
||||
"orth":"earlier",
|
||||
"tag":"-",
|
||||
"ner":"B-DATE"
|
||||
},
|
||||
{
|
||||
"orth":"this",
|
||||
"tag":"-",
|
||||
"ner":"I-DATE"
|
||||
},
|
||||
{
|
||||
"orth":"week",
|
||||
"tag":"-",
|
||||
"ner":"L-DATE"
|
||||
},
|
||||
{
|
||||
"orth":".",
|
||||
"tag":"-",
|
||||
"ner":"O"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
|
@ -80,7 +80,7 @@ def main(model_name, unlabelled_loc):
|
|||
nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses)
|
||||
print("Losses", losses)
|
||||
print("R. Losses", r_losses)
|
||||
print(nlp.get_pipe('ner').model.unseen_classes)
|
||||
print(nlp.get_pipe("ner").model.unseen_classes)
|
||||
test_text = "Do you like horses?"
|
||||
doc = nlp(test_text)
|
||||
print("Entities in '%s'" % test_text)
|
||||
|
@ -88,7 +88,5 @@ def main(model_name, unlabelled_loc):
|
|||
print(ent.label_, ent.text)
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
plac.call(main)
|
||||
|
|
|
@ -24,7 +24,7 @@ from spacy.util import minibatch, compounding
|
|||
output_dir=("Optional output directory", "option", "o", Path),
|
||||
n_texts=("Number of texts to train from", "option", "t", int),
|
||||
n_iter=("Number of training iterations", "option", "n", int),
|
||||
init_tok2vec=("Pretrained tok2vec weights", "option", "t2v", Path)
|
||||
init_tok2vec=("Pretrained tok2vec weights", "option", "t2v", Path),
|
||||
)
|
||||
def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None):
|
||||
if output_dir is not None:
|
||||
|
@ -43,11 +43,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None
|
|||
# nlp.create_pipe works for built-ins that are registered with spaCy
|
||||
if "textcat" not in nlp.pipe_names:
|
||||
textcat = nlp.create_pipe(
|
||||
"textcat",
|
||||
config={
|
||||
"exclusive_classes": True,
|
||||
"architecture": "simple_cnn",
|
||||
}
|
||||
"textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"}
|
||||
)
|
||||
nlp.add_pipe(textcat, last=True)
|
||||
# otherwise, get it, so we can add labels to it
|
||||
|
|
|
@ -5,12 +5,14 @@ import plac
|
|||
from pathlib import Path
|
||||
from wasabi import Printer
|
||||
import srsly
|
||||
import re
|
||||
|
||||
from .converters import conllu2json, iob2json, conll_ner2json
|
||||
from .converters import ner_jsonl2json
|
||||
|
||||
|
||||
# Converters are matched by file extension. To add a converter, add a new
|
||||
# Converters are matched by file extension except for ner/iob, which are
|
||||
# matched by file extension and content. To add a converter, add a new
|
||||
# entry to this dict with the file extension mapped to the converter function
|
||||
# imported from /converters.
|
||||
CONVERTERS = {
|
||||
|
@ -31,7 +33,9 @@ FILE_TYPES_STDOUT = ("json", "jsonl")
|
|||
input_file=("Input file", "positional", None, str),
|
||||
output_dir=("Output directory. '-' for stdout.", "positional", None, str),
|
||||
file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str),
|
||||
n_sents=("Number of sentences per doc", "option", "n", int),
|
||||
n_sents=("Number of sentences per doc (0 to disable)", "option", "n", int),
|
||||
seg_sents=("Segment sentences (for -c ner)", "flag", "s"),
|
||||
model=("Model for sentence segmentation (for -s)", "option", "b", str),
|
||||
converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
|
||||
lang=("Language (if tokenizer required)", "option", "l", str),
|
||||
morphology=("Enable appending morphology to tags", "flag", "m", bool),
|
||||
|
@ -41,6 +45,8 @@ def convert(
|
|||
output_dir="-",
|
||||
file_type="json",
|
||||
n_sents=1,
|
||||
seg_sents=False,
|
||||
model=None,
|
||||
morphology=False,
|
||||
converter="auto",
|
||||
lang=None,
|
||||
|
@ -70,14 +76,33 @@ def convert(
|
|||
msg.fail("Input file not found", input_path, exits=1)
|
||||
if output_dir != "-" and not Path(output_dir).exists():
|
||||
msg.fail("Output directory not found", output_dir, exits=1)
|
||||
input_data = input_path.open("r", encoding="utf-8").read()
|
||||
if converter == "auto":
|
||||
converter = input_path.suffix[1:]
|
||||
if converter == "ner" or converter == "iob":
|
||||
converter_autodetect = autodetect_ner_format(input_data)
|
||||
if converter_autodetect == "ner":
|
||||
msg.info("Auto-detected token-per-line NER format")
|
||||
converter = converter_autodetect
|
||||
elif converter_autodetect == "iob":
|
||||
msg.info("Auto-detected sentence-per-line NER format")
|
||||
converter = converter_autodetect
|
||||
else:
|
||||
msg.warn(
|
||||
"Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
|
||||
)
|
||||
if converter not in CONVERTERS:
|
||||
msg.fail("Can't find converter for {}".format(converter), exits=1)
|
||||
# Use converter function to convert data
|
||||
func = CONVERTERS[converter]
|
||||
input_data = input_path.open("r", encoding="utf-8").read()
|
||||
data = func(input_data, n_sents=n_sents, use_morphology=morphology, lang=lang)
|
||||
data = func(
|
||||
input_data,
|
||||
n_sents=n_sents,
|
||||
seg_sents=seg_sents,
|
||||
use_morphology=morphology,
|
||||
lang=lang,
|
||||
model=model,
|
||||
)
|
||||
if output_dir != "-":
|
||||
# Export data to a file
|
||||
suffix = ".{}".format(file_type)
|
||||
|
@ -88,10 +113,31 @@ def convert(
|
|||
srsly.write_jsonl(output_file, data)
|
||||
elif file_type == "msg":
|
||||
srsly.write_msgpack(output_file, data)
|
||||
msg.good("Generated output file ({} documents)".format(len(data)), output_file)
|
||||
msg.good(
|
||||
"Generated output file ({} documents): {}".format(len(data), output_file)
|
||||
)
|
||||
else:
|
||||
# Print to stdout
|
||||
if file_type == "json":
|
||||
srsly.write_json("-", data)
|
||||
elif file_type == "jsonl":
|
||||
srsly.write_jsonl("-", data)
|
||||
|
||||
|
||||
def autodetect_ner_format(input_data):
|
||||
# guess format from the first 20 lines
|
||||
lines = input_data.split("\n")[:20]
|
||||
format_guesses = {"ner": 0, "iob": 0}
|
||||
iob_re = re.compile(r"\S+\|(O|[IB]-\S+)")
|
||||
ner_re = re.compile(r"\S+\s+(O|[IB]-\S+)$")
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
if iob_re.search(line):
|
||||
format_guesses["iob"] += 1
|
||||
if ner_re.search(line):
|
||||
format_guesses["ner"] += 1
|
||||
if format_guesses["iob"] == 0 and format_guesses["ner"] > 0:
|
||||
return "ner"
|
||||
if format_guesses["ner"] == 0 and format_guesses["iob"] > 0:
|
||||
return "iob"
|
||||
return None
|
||||
|
|
|
@ -1,17 +1,89 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from wasabi import Printer
|
||||
|
||||
from ...gold import iob_to_biluo
|
||||
from ...lang.xx import MultiLanguage
|
||||
from ...tokens.doc import Doc
|
||||
from ...util import load_model
|
||||
|
||||
|
||||
def conll_ner2json(input_data, **kwargs):
|
||||
def conll_ner2json(input_data, n_sents=10, seg_sents=False, model=None, **kwargs):
|
||||
"""
|
||||
Convert files in the CoNLL-2003 NER format into JSON format for use with
|
||||
train cli.
|
||||
Convert files in the CoNLL-2003 NER format and similar
|
||||
whitespace-separated columns into JSON format for use with train cli.
|
||||
|
||||
The first column is the tokens, the final column is the IOB tags. If an
|
||||
additional second column is present, the second column is the tags.
|
||||
|
||||
Sentences are separated with whitespace and documents can be separated
|
||||
using the line "-DOCSTART- -X- O O".
|
||||
|
||||
Sample format:
|
||||
|
||||
-DOCSTART- -X- O O
|
||||
|
||||
I O
|
||||
like O
|
||||
London B-GPE
|
||||
and O
|
||||
New B-GPE
|
||||
York I-GPE
|
||||
City I-GPE
|
||||
. O
|
||||
|
||||
"""
|
||||
delimit_docs = "-DOCSTART- -X- O O"
|
||||
msg = Printer()
|
||||
doc_delimiter = "-DOCSTART- -X- O O"
|
||||
# check for existing delimiters, which should be preserved
|
||||
if "\n\n" in input_data and seg_sents:
|
||||
msg.warn(
|
||||
"Sentence boundaries found, automatic sentence segmentation with "
|
||||
"`-s` disabled."
|
||||
)
|
||||
seg_sents = False
|
||||
if doc_delimiter in input_data and n_sents:
|
||||
msg.warn(
|
||||
"Document delimiters found, automatic document segmentation with "
|
||||
"`-n` disabled."
|
||||
)
|
||||
n_sents = 0
|
||||
# do document segmentation with existing sentences
|
||||
if "\n\n" in input_data and doc_delimiter not in input_data and n_sents:
|
||||
n_sents_info(msg, n_sents)
|
||||
input_data = segment_docs(input_data, n_sents, doc_delimiter)
|
||||
# do sentence segmentation with existing documents
|
||||
if "\n\n" not in input_data and doc_delimiter in input_data and seg_sents:
|
||||
input_data = segment_sents_and_docs(input_data, 0, "", model=model, msg=msg)
|
||||
# do both sentence segmentation and document segmentation according
|
||||
# to options
|
||||
if "\n\n" not in input_data and doc_delimiter not in input_data:
|
||||
# sentence segmentation required for document segmentation
|
||||
if n_sents > 0 and not seg_sents:
|
||||
msg.warn(
|
||||
"No sentence boundaries found to use with option `-n {}`. "
|
||||
"Use `-s` to automatically segment sentences or `-n 0` "
|
||||
"to disable.".format(n_sents)
|
||||
)
|
||||
else:
|
||||
n_sents_info(msg, n_sents)
|
||||
input_data = segment_sents_and_docs(
|
||||
input_data, n_sents, doc_delimiter, model=model, msg=msg
|
||||
)
|
||||
# provide warnings for problematic data
|
||||
if "\n\n" not in input_data:
|
||||
msg.warn(
|
||||
"No sentence boundaries found. Use `-s` to automatically segment "
|
||||
"sentences."
|
||||
)
|
||||
if doc_delimiter not in input_data:
|
||||
msg.warn(
|
||||
"No document delimiters found. Use `-n` to automatically group "
|
||||
"sentences into documents."
|
||||
)
|
||||
output_docs = []
|
||||
for doc in input_data.strip().split(delimit_docs):
|
||||
for doc in input_data.strip().split(doc_delimiter):
|
||||
doc = doc.strip()
|
||||
if not doc:
|
||||
continue
|
||||
|
@ -21,7 +93,19 @@ def conll_ner2json(input_data, **kwargs):
|
|||
if not sent:
|
||||
continue
|
||||
lines = [line.strip() for line in sent.split("\n") if line.strip()]
|
||||
words, tags, chunks, iob_ents = zip(*[line.split() for line in lines])
|
||||
cols = list(zip(*[line.split() for line in lines]))
|
||||
if len(cols) < 2:
|
||||
raise ValueError(
|
||||
"The token-per-line NER file is not formatted correctly. "
|
||||
"Try checking whitespace and delimiters. See "
|
||||
"https://spacy.io/api/cli#convert"
|
||||
)
|
||||
words = cols[0]
|
||||
iob_ents = cols[-1]
|
||||
if len(cols) > 2:
|
||||
tags = cols[1]
|
||||
else:
|
||||
tags = ["-"] * len(words)
|
||||
biluo_ents = iob_to_biluo(iob_ents)
|
||||
output_doc.append(
|
||||
{
|
||||
|
@ -36,3 +120,53 @@ def conll_ner2json(input_data, **kwargs):
|
|||
)
|
||||
output_doc = []
|
||||
return output_docs
|
||||
|
||||
|
||||
def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
|
||||
sentencizer = None
|
||||
if model:
|
||||
nlp = load_model(model)
|
||||
if "parser" in nlp.pipe_names:
|
||||
msg.info("Segmenting sentences with parser from model '{}'.".format(model))
|
||||
sentencizer = nlp.get_pipe("parser")
|
||||
if not sentencizer:
|
||||
msg.info(
|
||||
"Segmenting sentences with sentencizer. (Use `-b model` for "
|
||||
"improved parser-based sentence segmentation.)"
|
||||
)
|
||||
nlp = MultiLanguage()
|
||||
sentencizer = nlp.create_pipe("sentencizer")
|
||||
lines = doc.strip().split("\n")
|
||||
words = [line.strip().split()[0] for line in lines]
|
||||
nlpdoc = Doc(nlp.vocab, words=words)
|
||||
sentencizer(nlpdoc)
|
||||
lines_with_segs = []
|
||||
sent_count = 0
|
||||
for i, token in enumerate(nlpdoc):
|
||||
if token.is_sent_start:
|
||||
if n_sents and sent_count % n_sents == 0:
|
||||
lines_with_segs.append(doc_delimiter)
|
||||
lines_with_segs.append("")
|
||||
sent_count += 1
|
||||
lines_with_segs.append(lines[i])
|
||||
return "\n".join(lines_with_segs)
|
||||
|
||||
|
||||
def segment_docs(input_data, n_sents, doc_delimiter):
|
||||
sent_delimiter = "\n\n"
|
||||
sents = input_data.split(sent_delimiter)
|
||||
docs = [sents[i : i + n_sents] for i in range(0, len(sents), n_sents)]
|
||||
input_data = ""
|
||||
for doc in docs:
|
||||
input_data += sent_delimiter + doc_delimiter
|
||||
input_data += sent_delimiter.join(doc)
|
||||
return input_data
|
||||
|
||||
|
||||
def n_sents_info(msg, n_sents):
|
||||
msg.info("Grouping every {} sentences into a document.".format(n_sents))
|
||||
if n_sents == 1:
|
||||
msg.warn(
|
||||
"To generate better training data, you may want to group "
|
||||
"sentences into documents with `-n 10`."
|
||||
)
|
||||
|
|
|
@ -2,17 +2,30 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
from wasabi import Printer
|
||||
|
||||
from ...gold import iob_to_biluo
|
||||
from ...util import minibatch
|
||||
from .conll_ner2json import n_sents_info
|
||||
|
||||
|
||||
def iob2json(input_data, n_sents=10, *args, **kwargs):
|
||||
"""
|
||||
Convert IOB files into JSON format for use with train cli.
|
||||
Convert IOB files with one sentence per line and tags separated with '|'
|
||||
into JSON format for use with train cli. IOB and IOB2 are accepted.
|
||||
|
||||
Sample formats:
|
||||
|
||||
I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
|
||||
I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
|
||||
I|PRP|O like|VBP|O London|NNP|I-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
|
||||
I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
|
||||
"""
|
||||
sentences = read_iob(input_data.split("\n"))
|
||||
docs = merge_sentences(sentences, n_sents)
|
||||
msg = Printer()
|
||||
docs = read_iob(input_data.split("\n"))
|
||||
if n_sents > 0:
|
||||
n_sents_info(msg, n_sents)
|
||||
docs = merge_sentences(docs, n_sents)
|
||||
return docs
|
||||
|
||||
|
||||
|
@ -21,7 +34,7 @@ def read_iob(raw_sents):
|
|||
for line in raw_sents:
|
||||
if not line.strip():
|
||||
continue
|
||||
tokens = [re.split("[^\w\-]", line.strip())]
|
||||
tokens = [t.split("|") for t in line.split()]
|
||||
if len(tokens[0]) == 3:
|
||||
words, pos, iob = zip(*tokens)
|
||||
elif len(tokens[0]) == 2:
|
||||
|
@ -29,7 +42,7 @@ def read_iob(raw_sents):
|
|||
pos = ["-"] * len(words)
|
||||
else:
|
||||
raise ValueError(
|
||||
"The iob/iob2 file is not formatted correctly. Try checking whitespace and delimiters."
|
||||
"The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
|
||||
)
|
||||
biluo = iob_to_biluo(iob)
|
||||
sentences.append(
|
||||
|
@ -40,7 +53,7 @@ def read_iob(raw_sents):
|
|||
)
|
||||
sentences = [{"tokens": sent} for sent in sentences]
|
||||
paragraphs = [{"sentences": [sent]} for sent in sentences]
|
||||
docs = [{"id": 0, "paragraphs": [para]} for para in paragraphs]
|
||||
docs = [{"id": i, "paragraphs": [para]} for i, para in enumerate(paragraphs)]
|
||||
return docs
|
||||
|
||||
|
||||
|
@ -50,7 +63,7 @@ def merge_sentences(docs, n_sents):
|
|||
group = list(group)
|
||||
first = group.pop(0)
|
||||
to_extend = first["paragraphs"][0]["sentences"]
|
||||
for sent in group[1:]:
|
||||
for sent in group:
|
||||
to_extend.extend(sent["paragraphs"][0]["sentences"])
|
||||
merged.append(first)
|
||||
return merged
|
||||
|
|
|
@ -18,6 +18,7 @@ class CroatianDefaults(Language.Defaults):
|
|||
)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
resources = {"lemma_lookup": "lemma_lookup.json"}
|
||||
|
||||
|
||||
class Croatian(Language):
|
||||
|
|
1313609
spacy/lang/hr/lemma_lookup.json
Normal file
1313609
spacy/lang/hr/lemma_lookup.json
Normal file
File diff suppressed because it is too large
Load Diff
15
spacy/lang/hr/lemma_lookup_license.txt
Normal file
15
spacy/lang/hr/lemma_lookup_license.txt
Normal file
|
@ -0,0 +1,15 @@
|
|||
The list of Croatian lemmas was extracted from the reldi-tagger repository (https://github.com/clarinsi/reldi-tagger).
|
||||
Reldi-tagger is licesned under the Apache 2.0 licence.
|
||||
|
||||
@InProceedings{ljubesic16-new,
|
||||
author = {Nikola Ljubešić and Filip Klubička and Željko Agić and Ivo-Pavao Jazbec},
|
||||
title = {New Inflectional Lexicons and Training Corpora for Improved Morphosyntactic Annotation of Croatian and Serbian},
|
||||
booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)},
|
||||
year = {2016},
|
||||
date = {23-28},
|
||||
location = {Portorož, Slovenia},
|
||||
editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
|
||||
publisher = {European Language Resources Association (ELRA)},
|
||||
address = {Paris, France},
|
||||
isbn = {978-2-9517408-9-1}
|
||||
}
|
|
@ -58,7 +58,8 @@ def check_spaces(text, tokens):
|
|||
yield prev_end != idx
|
||||
prev_end = idx + len(token)
|
||||
start = prev_end
|
||||
yield False
|
||||
if start > 0:
|
||||
yield False
|
||||
|
||||
|
||||
class KoreanTokenizer(DummyTokenizer):
|
||||
|
|
|
@ -21,6 +21,7 @@ class SerbianDefaults(Language.Defaults):
|
|||
)
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = STOP_WORDS
|
||||
resources = {"lemma_lookup": "lemma_lookup.json"}
|
||||
|
||||
|
||||
class Serbian(Language):
|
||||
|
|
|
@ -12,13 +12,14 @@ Example sentences to test spaCy and its language models.
|
|||
|
||||
sentences = [
|
||||
# Translations from English
|
||||
"Apple планира куповину америчког стартапа за $1 милијарду."
|
||||
"Apple планира куповину америчког стартапа за $1 милијарду.",
|
||||
"Беспилотни аутомобили пребацују одговорност осигурања на произвођаче.",
|
||||
"Лондон је велики град у Уједињеном Краљевству.",
|
||||
"Где си ти?",
|
||||
"Ко је председник Француске?",
|
||||
# Serbian common and slang
|
||||
"Moj ћале је инжењер!",
|
||||
"Новак Ђоковић је најбољи тенисер света." "У Пироту има добрих кафана!",
|
||||
"Новак Ђоковић је најбољи тенисер света.",
|
||||
"У Пироту има добрих кафана!",
|
||||
"Музеј Николе Тесле се налази у Београду.",
|
||||
]
|
||||
|
|
253316
spacy/lang/sr/lemma_lookup.json
Executable file
253316
spacy/lang/sr/lemma_lookup.json
Executable file
File diff suppressed because it is too large
Load Diff
32
spacy/lang/sr/lemma_lookup_licence.txt
Normal file
32
spacy/lang/sr/lemma_lookup_licence.txt
Normal file
|
@ -0,0 +1,32 @@
|
|||
Copyright @InProceedings{ljubesic16-new,
|
||||
author = {Nikola Ljubešić and Filip Klubička and Željko Agić and Ivo-Pavao Jazbec},
|
||||
title = {New Inflectional Lexicons and Training Corpora for Improved Morphosyntactic Annotation of Croatian and Serbian},
|
||||
booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)},
|
||||
year = {2016},
|
||||
date = {23-28},
|
||||
location = {Portorož, Slovenia},
|
||||
editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
|
||||
publisher = {European Language Resources Association (ELRA)},
|
||||
address = {Paris, France},
|
||||
isbn = {978-2-9517408-9-1}
|
||||
}
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
||||
|
||||
The licence of Serbian lemmas was adopted from Serbian lexicon:
|
||||
- sr.lexicon (https://github.com/clarinsi/reldi-tagger/blob/master/sr.lexicon)
|
||||
|
||||
Changelog:
|
||||
- Lexicon is translated into cyrilic
|
||||
- Word order is sorted
|
|
@ -15,6 +15,7 @@ _abbrev_exc = [
|
|||
{ORTH: "пет", LEMMA: "петак", NORM: "петак"},
|
||||
{ORTH: "суб", LEMMA: "субота", NORM: "субота"},
|
||||
{ORTH: "нед", LEMMA: "недеља", NORM: "недеља"},
|
||||
|
||||
# Months abbreviations
|
||||
{ORTH: "јан", LEMMA: "јануар", NORM: "јануар"},
|
||||
{ORTH: "феб", LEMMA: "фебруар", NORM: "фебруар"},
|
||||
|
@ -27,7 +28,7 @@ _abbrev_exc = [
|
|||
{ORTH: "септ", LEMMA: "септембар", NORM: "септембар"},
|
||||
{ORTH: "окт", LEMMA: "октобар", NORM: "октобар"},
|
||||
{ORTH: "нов", LEMMA: "новембар", NORM: "новембар"},
|
||||
{ORTH: "дец", LEMMA: "децембар", NORM: "децембар"},
|
||||
{ORTH: "дец", LEMMA: "децембар", NORM: "децембар"}
|
||||
]
|
||||
|
||||
|
||||
|
|
|
@ -41,8 +41,8 @@ class BaseDefaults(object):
|
|||
def create_lemmatizer(cls, nlp=None, lookups=None):
|
||||
if lookups is None:
|
||||
lookups = cls.create_lookups(nlp=nlp)
|
||||
lemma_rules, lemma_index, lemma_exc, lemma_lookup = util.get_lemma_tables(lookups)
|
||||
return Lemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup)
|
||||
rules, index, exc, lookup = util.get_lemma_tables(lookups)
|
||||
return Lemmatizer(index, exc, rules, lookup)
|
||||
|
||||
@classmethod
|
||||
def create_lookups(cls, nlp=None):
|
||||
|
|
|
@ -142,10 +142,34 @@ TOKEN_PATTERN_SCHEMA = {
|
|||
"title": "Token is whitespace",
|
||||
"$ref": "#/definitions/boolean_value",
|
||||
},
|
||||
"IS_BRACKET": {
|
||||
"title": "Token is a bracket",
|
||||
"$ref": "#/definitions/boolean_value",
|
||||
},
|
||||
"IS_QUOTE": {
|
||||
"title": "Token is a quotation mark",
|
||||
"$ref": "#/definitions/boolean_value",
|
||||
},
|
||||
"IS_LEFT_PUNCT": {
|
||||
"title": "Token is a left punctuation mark",
|
||||
"$ref": "#/definitions/boolean_value",
|
||||
},
|
||||
"IS_RIGHT_PUNCT": {
|
||||
"title": "Token is a right punctuation mark",
|
||||
"$ref": "#/definitions/boolean_value",
|
||||
},
|
||||
"IS_CURRENCY": {
|
||||
"title": "Token is a currency symbol",
|
||||
"$ref": "#/definitions/boolean_value",
|
||||
},
|
||||
"IS_STOP": {
|
||||
"title": "Token is stop word",
|
||||
"$ref": "#/definitions/boolean_value",
|
||||
},
|
||||
"IS_SENT_START": {
|
||||
"title": "Token is the first in a sentence",
|
||||
"$ref": "#/definitions/boolean_value",
|
||||
},
|
||||
"LIKE_NUM": {
|
||||
"title": "Token resembles a number",
|
||||
"$ref": "#/definitions/boolean_value",
|
||||
|
|
|
@ -258,7 +258,7 @@ cdef class Begin:
|
|||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||
cdef int preset_ent_label = st.B_(0).ent_type
|
||||
cdef attr_t preset_ent_label = st.B_(0).ent_type
|
||||
# If we're the last token of the input, we can't B -- must U or O.
|
||||
if st.B(1) == -1:
|
||||
return False
|
||||
|
@ -395,6 +395,9 @@ cdef class Last:
|
|||
return False
|
||||
elif not st.entity_is_open():
|
||||
return False
|
||||
elif st.B_(0).ent_iob == 1 and st.B_(1).ent_iob != 1:
|
||||
# If a preset entity has I followed by not-I, is L
|
||||
return True
|
||||
elif st.E_(0).ent_type != label:
|
||||
return False
|
||||
elif st.B_(1).ent_iob == 1:
|
||||
|
|
|
@ -103,6 +103,11 @@ def he_tokenizer():
|
|||
return get_lang_class("he").Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def hr_tokenizer():
|
||||
return get_lang_class("hr").Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def hu_tokenizer():
|
||||
return get_lang_class("hu").Defaults.create_tokenizer()
|
||||
|
|
|
@ -99,6 +99,41 @@ def test_doc_retokenize_spans_merge_tokens(en_tokenizer):
|
|||
assert doc[0].ent_type_ == "GPE"
|
||||
|
||||
|
||||
def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
|
||||
text = "The players start."
|
||||
heads = [1, 1, 0, -1]
|
||||
tokens = en_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=["DT", "NN", "VBZ", "."], pos=["DET", "NOUN", "VERB", "PUNCT"], heads=heads)
|
||||
assert len(doc) == 4
|
||||
assert doc[0].text == "The"
|
||||
assert doc[0].tag_ == "DT"
|
||||
assert doc[0].pos_ == "DET"
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[0:2])
|
||||
assert len(doc) == 3
|
||||
assert doc[0].text == "The players"
|
||||
assert doc[0].tag_ == "NN"
|
||||
assert doc[0].pos_ == "NOUN"
|
||||
assert doc[0].lemma_ == "The players"
|
||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=["DT", "NN", "VBZ", "."], pos=["DET", "NOUN", "VERB", "PUNCT"], heads=heads)
|
||||
assert len(doc) == 4
|
||||
assert doc[0].text == "The"
|
||||
assert doc[0].tag_ == "DT"
|
||||
assert doc[0].pos_ == "DET"
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[0:2])
|
||||
retokenizer.merge(doc[2:4])
|
||||
assert len(doc) == 2
|
||||
assert doc[0].text == "The players"
|
||||
assert doc[0].tag_ == "NN"
|
||||
assert doc[0].pos_ == "NOUN"
|
||||
assert doc[0].lemma_ == "The players"
|
||||
assert doc[1].text == "start ."
|
||||
assert doc[1].tag_ == "VBZ"
|
||||
assert doc[1].pos_ == "VERB"
|
||||
assert doc[1].lemma_ == "start ."
|
||||
|
||||
|
||||
def test_doc_retokenize_spans_merge_heads(en_tokenizer):
|
||||
text = "I found a pilates class near work."
|
||||
heads = [1, 0, 2, 1, -3, -1, -1, -6]
|
||||
|
@ -182,7 +217,7 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
|
|||
assert len(doc) == 15
|
||||
|
||||
|
||||
def test_doc_retokenize_spans_entity_merge_iob():
|
||||
def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
|
||||
# Test entity IOB stays consistent after merging
|
||||
words = ["a", "b", "c", "d", "e"]
|
||||
doc = Doc(Vocab(), words=words)
|
||||
|
@ -195,10 +230,23 @@ def test_doc_retokenize_spans_entity_merge_iob():
|
|||
assert doc[2].ent_iob_ == "I"
|
||||
assert doc[3].ent_iob_ == "B"
|
||||
with doc.retokenize() as retokenizer:
|
||||
retokenizer.merge(doc[0:1])
|
||||
retokenizer.merge(doc[0:2])
|
||||
assert len(doc) == len(words) - 1
|
||||
assert doc[0].ent_iob_ == "B"
|
||||
assert doc[1].ent_iob_ == "I"
|
||||
|
||||
# Test that IOB stays consistent with provided IOB
|
||||
words = ["a", "b", "c", "d", "e"]
|
||||
doc = Doc(Vocab(), words=words)
|
||||
with doc.retokenize() as retokenizer:
|
||||
attrs = {"ent_type": "ent-abc", "ent_iob": 1}
|
||||
retokenizer.merge(doc[0:3], attrs=attrs)
|
||||
retokenizer.merge(doc[3:5], attrs=attrs)
|
||||
assert doc[0].ent_iob_ == "B"
|
||||
assert doc[1].ent_iob_ == "I"
|
||||
|
||||
# if no parse/heads, the first word in the span is the root and provides
|
||||
# default values
|
||||
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
|
||||
doc = Doc(Vocab(), words=words)
|
||||
doc.ents = [
|
||||
|
@@ -215,7 +263,53 @@ def test_doc_retokenize_spans_entity_merge_iob():
        retokenizer.merge(doc[7:9])
    assert len(doc) == 6
    assert doc[3].ent_iob_ == "B"
    assert doc[4].ent_iob_ == "I"
    assert doc[3].ent_type_ == "ent-de"
    assert doc[4].ent_iob_ == "B"
    assert doc[4].ent_type_ == "ent-fg"

    # if there is a parse, span.root provides default values
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    heads = [0, -1, 1, -3, -4, -5, -1, -7, -8]
    ents = [
        (3, 5, "ent-de"),
        (5, 7, "ent-fg"),
    ]
    deps = ["dep"] * len(words)
    en_vocab.strings.add("ent-de")
    en_vocab.strings.add("ent-fg")
    en_vocab.strings.add("dep")
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
    assert doc[2:4].root == doc[3]  # root of 'c d' is d
    assert doc[4:6].root == doc[4]  # root of 'e f' is e
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[2:4])
        retokenizer.merge(doc[4:6])
        retokenizer.merge(doc[7:9])
    assert len(doc) == 6
    assert doc[2].ent_iob_ == "B"
    assert doc[2].ent_type_ == "ent-de"
    assert doc[3].ent_iob_ == "I"
    assert doc[3].ent_type_ == "ent-de"
    assert doc[4].ent_iob_ == "B"
    assert doc[4].ent_type_ == "ent-fg"

    # check that B is preserved if span[start] is B
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    heads = [0, -1, 1, 1, -4, -5, -1, -7, -8]
    ents = [
        (3, 5, "ent-de"),
        (5, 7, "ent-de"),
    ]
    deps = ["dep"] * len(words)
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[3:5])
        retokenizer.merge(doc[5:7])
    assert len(doc) == 7
    assert doc[3].ent_iob_ == "B"
    assert doc[3].ent_type_ == "ent-de"
    assert doc[4].ent_iob_ == "B"
    assert doc[4].ent_type_ == "ent-de"


def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):
20
spacy/tests/lang/hr/test_lemma.py
Normal file

@@ -0,0 +1,20 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize(
    "string,lemma",
    [
        ("trčao", "trčati"),
        ("adekvatnim", "adekvatan"),
        ("dekontaminacijama", "dekontaminacija"),
        ("filologovih", "filologov"),
        ("je", "biti"),
        ("se", "sebe"),
    ],
)
def test_hr_lemmatizer_lookup_assigns(hr_tokenizer, string, lemma):
    tokens = hr_tokenizer(string)
    assert tokens[0].lemma_ == lemma
@@ -45,3 +45,8 @@ def test_ko_tokenizer_full_tags(ko_tokenizer, text, expected_tags):
def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos):
    pos = [token.pos_ for token in ko_tokenizer(text)]
    assert pos == expected_pos.split()


def test_ko_empty_doc(ko_tokenizer):
    tokens = ko_tokenizer("")
    assert len(tokens) == 0
20
spacy/tests/lang/sr/test_lemmatizer.py
Normal file

@@ -0,0 +1,20 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize(
    "string,lemma",
    [
        ("најадекватнији", "адекватан"),
        ("матурирао", "матурирати"),
        ("планираћемо", "планирати"),
        ("певају", "певати"),
        ("нама", "ми"),
        ("се", "себе"),
    ],
)
def test_sr_lemmatizer_lookup_assigns(sr_tokenizer, string, lemma):
    tokens = sr_tokenizer(string)
    assert tokens[0].lemma_ == lemma
@@ -6,8 +6,13 @@ import pytest

@pytest.mark.parametrize(
    "text,norms,lemmas",
    [("о.г.", ["ове године"], ["ова година"]), ("чет.", ["четвртак"], ["четвртак"]),
     ("гђа", ["госпођа"], ["госпођа"]), ("ил'", ["или"], ["или"])])
    [
        ("о.г.", ["ове године"], ["ова година"]),
        ("чет.", ["четвртак"], ["четвртак"]),
        ("гђа", ["госпођа"], ["госпођа"]),
        ("ил'", ["или"], ["или"]),
    ],
)
def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas):
    tokens = sr_tokenizer(text)
    assert len(tokens) == 1
@@ -380,3 +380,33 @@ def test_attr_pipeline_checks(en_vocab):
    matcher(doc1)
    matcher(doc2)
    matcher(doc3)


@pytest.mark.parametrize(
    "pattern,text",
    [
        ([{"IS_ALPHA": True}], "a"),
        ([{"IS_ASCII": True}], "a"),
        ([{"IS_DIGIT": True}], "1"),
        ([{"IS_LOWER": True}], "a"),
        ([{"IS_UPPER": True}], "A"),
        ([{"IS_TITLE": True}], "Aaaa"),
        ([{"IS_PUNCT": True}], "."),
        ([{"IS_SPACE": True}], "\n"),
        ([{"IS_BRACKET": True}], "["),
        ([{"IS_QUOTE": True}], '"'),
        ([{"IS_LEFT_PUNCT": True}], "``"),
        ([{"IS_RIGHT_PUNCT": True}], "''"),
        ([{"IS_STOP": True}], "the"),
        ([{"LIKE_NUM": True}], "1"),
        ([{"LIKE_URL": True}], "http://example.com"),
        ([{"LIKE_EMAIL": True}], "mail@example.com"),
    ],
)
def test_matcher_schema_token_attributes(en_vocab, pattern, text):
    matcher = Matcher(en_vocab)
    doc = Doc(en_vocab, words=text.split(" "))
    matcher.add("Rule", None, pattern)
    assert len(matcher) == 1
    matches = matcher(doc)
    assert len(matches) == 1
@@ -13,6 +13,28 @@ from spacy.lemmatizer import Lemmatizer
from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part


@pytest.mark.xfail
def test_issue1061():
    '''Test special-case works after tokenizing. Was caching problem.'''
    text = 'I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_.'
    tokenizer = English.Defaults.create_tokenizer()
    doc = tokenizer(text)
    assert 'MATH' in [w.text for w in doc]
    assert '_MATH_' not in [w.text for w in doc]

    tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}])
    doc = tokenizer(text)
    assert '_MATH_' in [w.text for w in doc]
    assert 'MATH' not in [w.text for w in doc]

    # For sanity, check it works when pipeline is clean.
    tokenizer = English.Defaults.create_tokenizer()
    tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}])
    doc = tokenizer(text)
    assert '_MATH_' in [w.text for w in doc]
    assert 'MATH' not in [w.text for w in doc]


@pytest.mark.xfail(
    reason="g is split off as a unit, as the suffix regular expression cannot look back further (variable-width)"
)
@@ -329,3 +329,4 @@ def test_issue_1971_4(en_vocab):
    matches = matcher(doc)
    # Uncommenting this caused a segmentation fault
    assert len(matches) == 1
    assert matches[0] == (en_vocab.strings["TEST"], 0, 3)
57
spacy/tests/regression/test_issue4190.py
Normal file

@@ -0,0 +1,57 @@
# coding: utf8
from __future__ import unicode_literals

from spacy.lang.en import English

import spacy
from spacy.tokenizer import Tokenizer

from spacy.tests.util import make_tempdir


def test_issue4190():
    test_string = "Test c."

    # Load default language
    nlp_1 = English()
    doc_1a = nlp_1(test_string)
    result_1a = [token.text for token in doc_1a]

    # Modify tokenizer
    customize_tokenizer(nlp_1)
    doc_1b = nlp_1(test_string)
    result_1b = [token.text for token in doc_1b]

    # Save and Reload
    with make_tempdir() as model_dir:
        nlp_1.to_disk(model_dir)
        nlp_2 = spacy.load(model_dir)

    # This should be the modified tokenizer
    doc_2 = nlp_2(test_string)
    result_2 = [token.text for token in doc_2]

    assert result_1b == result_2


def customize_tokenizer(nlp):
    prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)
    infix_re = spacy.util.compile_infix_regex(nlp.Defaults.infixes)

    # remove all exceptions where a single letter is followed by a period (e.g. 'h.')
    exceptions = {
        k: v
        for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
        if not (len(k) == 2 and k[1] == ".")
    }
    new_tokenizer = Tokenizer(
        nlp.vocab,
        exceptions,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
    )

    nlp.tokenizer = new_tokenizer
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
import pytest

from spacy.lang.en import English
from spacy.cli.converters import conllu2json
from spacy.cli.converters import conllu2json, iob2json, conll_ner2json
from spacy.cli.pretrain import make_docs
@@ -32,6 +32,95 @@ def test_cli_converters_conllu2json():
    assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]


def test_cli_converters_iob2json():
    lines = [
        "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
        "I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
        "I|PRP|O like|VBP|O London|NNP|I-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O",
        "I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O",
    ]
    input_data = "\n".join(lines)
    converted = iob2json(input_data, n_sents=10)
    assert len(converted) == 1
    assert converted[0]["id"] == 0
    assert len(converted[0]["paragraphs"]) == 1
    assert len(converted[0]["paragraphs"][0]["sentences"]) == 4
    for i in range(0, 4):
        sent = converted[0]["paragraphs"][0]["sentences"][i]
        assert len(sent["tokens"]) == 8
        tokens = sent["tokens"]
        # fmt: off
        assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
        assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
        # fmt: on


def test_cli_converters_conll_ner2json():
    lines = [
        "-DOCSTART- -X- O O",
        "",
        "I\tO",
        "like\tO",
        "London\tB-GPE",
        "and\tO",
        "New\tB-GPE",
        "York\tI-GPE",
        "City\tI-GPE",
        ".\tO",
        "",
        "I O",
        "like O",
        "London B-GPE",
        "and O",
        "New B-GPE",
        "York I-GPE",
        "City I-GPE",
        ". O",
        "",
        "I PRP O",
        "like VBP O",
        "London NNP B-GPE",
        "and CC O",
        "New NNP B-GPE",
        "York NNP I-GPE",
        "City NNP I-GPE",
        ". . O",
        "",
        "I PRP _ O",
        "like VBP _ O",
        "London NNP _ B-GPE",
        "and CC _ O",
        "New NNP _ B-GPE",
        "York NNP _ I-GPE",
        "City NNP _ I-GPE",
        ". . _ O",
        "",
        "I\tPRP\t_\tO",
        "like\tVBP\t_\tO",
        "London\tNNP\t_\tB-GPE",
        "and\tCC\t_\tO",
        "New\tNNP\t_\tB-GPE",
        "York\tNNP\t_\tI-GPE",
        "City\tNNP\t_\tI-GPE",
        ".\t.\t_\tO",
    ]
    input_data = "\n".join(lines)
    converted = conll_ner2json(input_data, n_sents=10)
    print(converted)
    assert len(converted) == 1
    assert converted[0]["id"] == 0
    assert len(converted[0]["paragraphs"]) == 1
    assert len(converted[0]["paragraphs"][0]["sentences"]) == 5
    for i in range(0, 5):
        sent = converted[0]["paragraphs"][0]["sentences"][i]
        assert len(sent["tokens"]) == 8
        tokens = sent["tokens"]
        # fmt: off
        assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
        assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
        # fmt: on


def test_pretrain_make_docs():
    nlp = English()
@@ -441,8 +441,13 @@ cdef class Tokenizer:
            self.infix_finditer = re.compile(data["infix_finditer"]).finditer
        if data.get("token_match"):
            self.token_match = re.compile(data["token_match"]).match
        for string, substrings in data.get("rules", {}).items():
            self.add_special_case(string, substrings)
        if data.get("rules"):
            # make sure to hard reset the cache to remove data from the default exceptions
            self._rules = {}
            self._cache = PreshMap()
            for string, substrings in data.get("rules", {}).items():
                self.add_special_case(string, substrings)

        return self
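For readers skimming the diff, a minimal round-trip sketch of the behaviour this cache reset targets (the custom special case and example text are made up for illustration): after deserialization, only the rules that were actually serialized are in effect, without stale splits left over from the default exceptions.

```python
# Hypothetical round trip; assumes spaCy 2.x with this patch applied.
from spacy.lang.en import English
from spacy.symbols import ORTH

nlp = English()
nlp.tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
data = nlp.tokenizer.to_bytes()

nlp2 = English()
nlp2.tokenizer.from_bytes(data)
# The custom special case survives the round trip and is applied on tokenization.
assert "_MATH_" in [t.text for t in nlp2("I like _MATH_ a lot")]
```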
@@ -109,13 +109,8 @@ cdef class Retokenizer:

    def __exit__(self, *args):
        # Do the actual merging here
        if len(self.merges) > 1:
            _bulk_merge(self.doc, self.merges)
        elif len(self.merges) == 1:
            (span, attrs) = self.merges[0]
            start = span.start
            end = span.end
            _merge(self.doc, start, end, attrs)
        if len(self.merges) >= 1:
            _merge(self.doc, self.merges)
        # Iterate in order, to keep things simple.
        for start_char, orths, heads, attrs in sorted(self.splits):
            # Resolve token index
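As a usage-level sketch of the path this simplification unifies (illustrative only; the sentence and spans are arbitrary), any number of merges queued inside one `retokenize()` block is applied through the same bulk routine on exit:

```python
# Illustrative only: one or many queued merges go through the same bulk path.
from spacy.lang.en import English

nlp = English()
doc = nlp("New York City is in New York")
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[0:3])  # "New York City"
    retokenizer.merge(doc[5:7])  # "New York"
assert len(doc) == 4
assert doc[0].text == "New York City"
```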
@@ -140,95 +135,7 @@ cdef class Retokenizer:
            _split(self.doc, token_index, orths, head_indices, attrs)


def _merge(Doc doc, int start, int end, attributes):
    """Retokenize the document, such that the span at
    `doc.text[start_idx : end_idx]` is merged into a single token. If
    `start_idx` and `end_idx `do not mark start and end token boundaries,
    the document remains unchanged.
    start_idx (int): Character index of the start of the slice to merge.
    end_idx (int): Character index after the end of the slice to merge.
    **attributes: Attributes to assign to the merged token. By default,
        attributes are inherited from the syntactic root of the span.
    RETURNS (Token): The newly merged token, or `None` if the start and end
        indices did not fall at token boundaries.
    """
    cdef Span span = doc[start:end]
    cdef int start_char = span.start_char
    cdef int end_char = span.end_char
    # Resize the doc.tensor, if it's set. Let the last row for each token stand
    # for the merged region. To do this, we create a boolean array indicating
    # whether the row is to be deleted, then use numpy.delete
    if doc.tensor is not None and doc.tensor.size != 0:
        doc.tensor = _resize_tensor(doc.tensor, [(start, end)])
    # Get LexemeC for newly merged token
    new_orth = ''.join([t.text_with_ws for t in span])
    if span[-1].whitespace_:
        new_orth = new_orth[:-len(span[-1].whitespace_)]
    cdef const LexemeC* lex = doc.vocab.get(doc.mem, new_orth)
    # House the new merged token where it starts
    cdef TokenC* token = &doc.c[start]
    token.spacy = doc.c[end-1].spacy
    for attr_name, attr_value in attributes.items():
        if attr_name == "_":  # Set extension attributes
            for ext_attr_key, ext_attr_value in attr_value.items():
                doc[start]._.set(ext_attr_key, ext_attr_value)
        elif attr_name == TAG:
            doc.vocab.morphology.assign_tag(token, attr_value)
        else:
            # Set attributes on both token and lexeme to take care of token
            # attribute vs. lexical attribute without having to enumerate them.
            # If an attribute name is not valid, set_struct_attr will ignore it.
            Token.set_struct_attr(token, attr_name, attr_value)
            Lexeme.set_struct_attr(<LexemeC*>lex, attr_name, attr_value)
    # Make sure ent_iob remains consistent
    if doc.c[end].ent_iob == 1 and token.ent_iob in (0, 2):
        if token.ent_type == doc.c[end].ent_type:
            token.ent_iob = 3
        else:
            # If they're not the same entity type, let them be two entities
            doc.c[end].ent_iob = 3
    # Begin by setting all the head indices to absolute token positions
    # This is easier to work with for now than the offsets
    # Before thinking of something simpler, beware the case where a
    # dependency bridges over the entity. Here the alignment of the
    # tokens changes.
    span_root = span.root.i
    token.dep = span.root.dep
    # We update token.lex after keeping span root and dep, since
    # setting token.lex will change span.start and span.end properties
    # as it modifies the character offsets in the doc
    token.lex = lex
    for i in range(doc.length):
        doc.c[i].head += i
    # Set the head of the merged token, and its dep relation, from the Span
    token.head = doc.c[span_root].head
    # Adjust deps before shrinking tokens
    # Tokens which point into the merged token should now point to it
    # Subtract the offset from all tokens which point to >= end
    offset = (end - start) - 1
    for i in range(doc.length):
        head_idx = doc.c[i].head
        if start <= head_idx < end:
            doc.c[i].head = start
        elif head_idx >= end:
            doc.c[i].head -= offset
    # Now compress the token array
    for i in range(end, doc.length):
        doc.c[i - offset] = doc.c[i]
    for i in range(doc.length - offset, doc.length):
        memset(&doc.c[i], 0, sizeof(TokenC))
        doc.c[i].lex = &EMPTY_LEXEME
    doc.length -= offset
    for i in range(doc.length):
        # ...And, set heads back to a relative position
        doc.c[i].head -= i
    # Set the left/right children, left/right edges
    set_children_from_heads(doc.c, doc.length)
    # Return the merged Python object
    return doc[start]


def _bulk_merge(Doc doc, merges):
def _merge(Doc doc, merges):
    """Retokenize the document, such that the spans described in 'merges'
    are merged into a single token. This method assumes that the merges
    are in the same order at which they appear in the doc, and that merges
@@ -256,6 +163,26 @@ def _bulk_merge(Doc doc, merges):
        spans.append(span)
        # House the new merged token where it starts
        token = &doc.c[start]
        # Initially set attributes to attributes of span root
        token.tag = doc.c[span.root.i].tag
        token.pos = doc.c[span.root.i].pos
        token.morph = doc.c[span.root.i].morph
        token.ent_iob = doc.c[span.root.i].ent_iob
        token.ent_type = doc.c[span.root.i].ent_type
        merged_iob = token.ent_iob
        # If span root is part of an entity, merged token is B-ENT
        if token.ent_iob in (1, 3):
            merged_iob = 3
            # If start token is I-ENT and previous token is of the same
            # type, then I-ENT (could check I-ENT from start to span root)
            if doc.c[start].ent_iob == 1 and start > 0 \
                    and doc.c[start].ent_type == token.ent_type \
                    and doc.c[start - 1].ent_type == token.ent_type:
                merged_iob = 1
        token.ent_iob = merged_iob
        # Unset attributes that don't match new token
        token.lemma = 0
        token.norm = 0
        tokens[merge_index] = token
    # Resize the doc.tensor, if it's set. Let the last row for each token stand
    # for the merged region. To do this, we create a boolean array indicating
@@ -351,17 +278,7 @@ def _bulk_merge(Doc doc, merges):
    # Set the left/right children, left/right edges
    set_children_from_heads(doc.c, doc.length)
    # Make sure ent_iob remains consistent
    for (span, _) in merges:
        if(span.end < len(offsets)):
            # If it's not the last span
            token_after_span_position = offsets[span.end]
            if doc.c[token_after_span_position].ent_iob == 1\
            and doc.c[token_after_span_position - 1].ent_iob in (0, 2):
                if doc.c[token_after_span_position - 1].ent_type == doc.c[token_after_span_position].ent_type:
                    doc.c[token_after_span_position - 1].ent_iob = 3
                else:
                    # If they're not the same entity type, let them be two entities
                    doc.c[token_after_span_position].ent_iob = 3
    make_iob_consistent(doc.c, doc.length)
    # Return the merged Python object
    return doc[spans[0].start]
@@ -480,3 +397,12 @@ def _validate_extensions(extensions):
            raise ValueError(Errors.E118.format(attr=key))
        if not is_writable_attr(extension):
            raise ValueError(Errors.E119.format(attr=key))


cdef make_iob_consistent(TokenC* tokens, int length):
    cdef int i
    if tokens[0].ent_iob == 1:
        tokens[0].ent_iob = 3
    for i in range(1, length):
        if tokens[i].ent_iob == 1 and tokens[i - 1].ent_type != tokens[i].ent_type:
            tokens[i].ent_iob = 3
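The rule this helper enforces can be restated in plain Python (a sketch for clarity only, not the Cython implementation above): an I tag may only continue an entity of the same type as the previous token; otherwise it is promoted to B.

```python
# Pure-Python restatement of the IOB repair rule (illustrative sketch).
def iob_consistent(tags):
    # tags: list of (iob, ent_type) pairs with iob in {"O", "B", "I"}
    fixed = []
    for i, (iob, ent_type) in enumerate(tags):
        # An I tag at position 0, or following a different entity type, becomes B
        if iob == "I" and (i == 0 or tags[i - 1][1] != ent_type):
            iob = "B"
        fixed.append((iob, ent_type))
    return fixed


assert iob_consistent([("I", "ORG"), ("I", "ORG"), ("I", "GPE")]) == [
    ("B", "ORG"),
    ("I", "ORG"),
    ("B", "GPE"),
]
```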
@@ -145,6 +145,8 @@ $ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter]
| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | option | Type of file to create (see below). |
| `--converter`, `-c` <Tag variant="new">2</Tag> | option | Name of converter to use (see below). |
| `--n-sents`, `-n` | option | Number of sentences per document. |
| `--seg-sents`, `-s` <Tag variant="new">2.2</Tag> | flag | Segment sentences (for `-c ner`). |
| `--model`, `-b` <Tag variant="new">2.2</Tag> | option | Model for parser-based sentence segmentation (for `-s`). |
| `--morphology`, `-m` | option | Enable appending morphology to tags. |
| `--lang`, `-l` <Tag variant="new">2.1</Tag> | option | Language code (if tokenizer required). |
| `--help`, `-h` | flag | Show help message and available arguments. |
@@ -174,10 +176,10 @@ All output files generated by this command are compatible with

| ID | Description |
| ------------------------------ | --------------------------------------------------------------- |
| `auto` | Automatically pick converter based on file extension (default). |
| `auto` | Automatically pick converter based on file extension and file content (default). |
| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. |
| `ner` | Tab-based named entity recognition format. |
| `iob` | IOB or IOB2 named entity recognition format. |
| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
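The same converters are importable from Python, which can be handy for quick checks of the formats described above. A minimal sketch with a made-up one-sentence input in the `word|POS|B-ENT` variant:

```python
# Sketch only: convert a single IOB-annotated sentence without going through the CLI.
from spacy.cli.converters import iob2json

input_data = "I|PRP|O like|VBP|O London|NNP|B-GPE .|.|O"
docs = iob2json(input_data, n_sents=10)
tokens = docs[0]["paragraphs"][0]["sentences"][0]["tokens"]
assert [t["orth"] for t in tokens] == ["I", "like", "London", "."]
```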

## Train {#train}
@@ -639,7 +639,7 @@ Yield an infinite series of linearly decaying values.

Shuffle an iterator. This works by holding `bufsize` items back and yielding
them sometime later. Obviously, this is not unbiased – but should be good enough
for batching. Larger `buffsize` means less bias.
for batching. Larger `bufsize` means less bias.

> #### Example
>
@@ -648,11 +648,11 @@ for batching. Larger `buffsize` means less bias.
> shuffled = itershuffle(values)
> ```

| Name | Type | Description |
| ---------- | -------- | ---------------------- |
| `iterable` | iterable | Iterator to shuffle. |
| `buffsize` | int | Items to hold back. |
| **YIELDS** | iterable | The shuffled iterator. |
| Name | Type | Description |
| ---------- | -------- | ------------------------------------- |
| `iterable` | iterable | Iterator to shuffle. |
| `bufsize` | int | Items to hold back (default: 1000). |
| **YIELDS** | iterable | The shuffled iterator. |
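A complete version of the usage example with an explicit `bufsize` (values are arbitrary; a sketch rather than part of the documented example):

```python
# Quick sanity sketch: shuffling yields the same items, in a different order.
from spacy.util import itershuffle

values = list(range(1000))
shuffled = list(itershuffle(values, bufsize=100))
assert sorted(shuffled) == values
```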

### util.filter_spans {#util.filter_spans tag="function" new="2.1.4"}
@@ -26,6 +26,14 @@ import PosDeps101 from 'usage/101/\_pos-deps.md'

<PosDeps101 />

<Infobox title="📖 Part-of-speech tag scheme">

For a list of the fine-grained and coarse-grained part-of-speech tags assigned
by spaCy's models across different languages, see the
[POS tag scheme documentation](/api/annotation#pos-tagging).

</Infobox>

### Rule-based morphology {#rule-based-morphology}

Inflectional morphology is the process by which a root form of a word is
@@ -61,14 +69,7 @@ of the two. The system works as follows:
morphological information, without consulting the context of the token. The
lemmatizer also accepts list-based exception files, acquired from
[WordNet](https://wordnet.princeton.edu/).

<Infobox title="📖 Part-of-speech tag scheme">

For a list of the fine-grained and coarse-grained part-of-speech tags assigned
by spaCy's models across different languages, see the
[POS tag scheme documentation](/api/annotation#pos-tagging).

</Infobox>


## Dependency Parsing {#dependency-parse model="parser"}
@@ -289,7 +290,7 @@ for token in doc:

For a list of the syntactic dependency labels assigned by spaCy's models across
different languages, see the
[dependency label scheme documentation](/api/annotation#pos-tagging).
[dependency label scheme documentation](/api/annotation#dependency-parsing).

</Infobox>
@@ -10,6 +10,7 @@
        "en_vectors_web_lg",
        "en_pytt_bertbaseuncased_lg",
        "en_pytt_robertabase_lg",
        "en_pytt_distilbertbaseuncased_lg",
        "en_pytt_xlnetbasecased_lg"
    ],
    "example": "This is a sentence.",
@@ -1562,7 +1562,7 @@
    },
    {
        "id": "pyInflect",
        "slogan": "A python module for word inflections",
        "slogan": "A Python module for word inflections",
        "description": "This package uses the [spaCy 2.0 extensions](https://spacy.io/usage/processing-pipelines#extensions) to add word inflections to the system.",
        "github": "bjascob/pyInflect",
        "pip": "pyinflect",
@@ -1582,6 +1582,29 @@
        "category": ["pipeline"],
        "tags": ["inflection"]
    },
    {
        "id": "lemminflect",
        "slogan": "A Python module for English lemmatization and inflection",
        "description": "LemmInflect uses a dictionary approach to lemmatize English words and inflect them into forms specified by a user supplied [Universal Dependencies](https://universaldependencies.org/u/pos/) or [Penn Treebank](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html) tag. The library works with out-of-vocabulary (OOV) words by applying neural network techniques to classify word forms and choose the appropriate morphing rules. The system acts as a standalone module or as an extension to spaCy.",
        "github": "bjascob/LemmInflect",
        "pip": "lemminflect",
        "thumb": "https://raw.githubusercontent.com/bjascob/LemmInflect/master/docs/img/icons8-citrus-80.png",
        "code_example": [
            "import spacy",
            "import lemminflect",
            "",
            "nlp = spacy.load('en_core_web_sm')",
            "doc = nlp('I am testing this example.')",
            "doc[2]._.lemma() # 'test'",
            "doc[4]._.inflect('NNS') # 'examples'"
        ],
        "author": "Brad Jascob",
        "author_links": {
            "github": "bjascob"
        },
        "category": ["pipeline"],
        "tags": ["inflection", "lemmatizer"]
    },
    {
        "id": "blackstone",
        "title": "Blackstone",
@@ -1744,6 +1767,21 @@
            "twitter": "yanaiela",
            "website": "https://yanaiela.github.io"
        }
    },
    {
        "id": "presidio",
        "title": "Presidio",
        "slogan": "Context aware, pluggable and customizable data protection and PII data anonymization",
        "description": "Presidio *(Origin from Latin praesidium ‘protection, garrison’)* helps to ensure sensitive text is properly managed and governed. It provides fast ***analytics*** and ***anonymization*** for sensitive text such as credit card numbers, names, locations, social security numbers, bitcoin wallets, US phone numbers and financial data. Presidio analyzes the text using predefined or custom recognizers to identify entities, patterns, formats, and checksums with relevant context.",
        "url": "https://aka.ms/presidio",
        "image": "https://raw.githubusercontent.com/microsoft/presidio/master/docs/assets/before-after.png",
        "github": "microsoft/presidio",
        "category": ["standalone"],
        "thumb": "https://avatars0.githubusercontent.com/u/6154722",
        "author": "Microsoft",
        "author_links": {
            "github": "microsoft"
        }
    }
],
@@ -8,6 +8,8 @@ import Icon from './icon'
import classes from '../styles/link.module.sass'
import { isString } from './util'

const internalRegex = /(http(s?)):\/\/(prodi.gy|spacy.io|irl.spacy.io)/gi

const Whitespace = ({ children }) => (
    // Ensure that links are always wrapped in spaces
    <> {children} </>

@@ -68,13 +70,15 @@ const Link = ({
            </Wrapper>
        )
    }
    const isInternal = internalRegex.test(dest)
    const rel = isInternal ? null : 'noopener nofollow noreferrer'
    return (
        <Wrapper>
            <OutboundLink
                href={dest}
                className={linkClassNames}
                target="_blank"
                rel="noopener nofollow noreferrer"
                rel={rel}
                {...other}
            >
                {content}