Update develop from master

This commit is contained in:
Matthew Honnibal 2019-09-08 18:21:41 +02:00
commit 1a65c5b7af
47 changed files with 1569315 additions and 184 deletions

BIN
.github/contributors/Schibsted.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 259 KiB

View File

@ -16,9 +16,9 @@ class KerasSimilarityShim(object):
if get_features is None: if get_features is None:
get_features = get_word_ids get_features = get_word_ids
with (path / 'config.json').open() as file_: with (path / "config.json").open() as file_:
model = model_from_json(file_.read()) model = model_from_json(file_.read())
with (path / 'model').open('rb') as file_: with (path / "model").open("rb") as file_:
weights = pickle.load(file_) weights = pickle.load(file_)
embeddings = get_embeddings(nlp.vocab) embeddings = get_embeddings(nlp.vocab)
@ -33,8 +33,8 @@ class KerasSimilarityShim(object):
self.max_length = max_length self.max_length = max_length
def __call__(self, doc): def __call__(self, doc):
doc.user_hooks['similarity'] = self.predict doc.user_hooks["similarity"] = self.predict
doc.user_span_hooks['similarity'] = self.predict doc.user_span_hooks["similarity"] = self.predict
return doc return doc
@ -54,8 +54,8 @@ def get_embeddings(vocab, nr_unk=100):
oov = np.random.normal(size=(nr_unk, vocab.vectors_length)) oov = np.random.normal(size=(nr_unk, vocab.vectors_length))
oov = oov / oov.sum(axis=1, keepdims=True) oov = oov / oov.sum(axis=1, keepdims=True)
vectors = np.zeros((num_vectors + nr_unk, vocab.vectors_length), dtype='float32') vectors = np.zeros((num_vectors + nr_unk, vocab.vectors_length), dtype="float32")
vectors[1:(nr_unk + 1), ] = oov vectors[1 : (nr_unk + 1),] = oov
for lex in vocab: for lex in vocab:
if lex.has_vector and lex.vector_norm > 0: if lex.has_vector and lex.vector_norm > 0:
vectors[nr_unk + lex.rank + 1] = lex.vector / lex.vector_norm vectors[nr_unk + lex.rank + 1] = lex.vector / lex.vector_norm
@ -64,7 +64,7 @@ def get_embeddings(vocab, nr_unk=100):
def get_word_ids(docs, max_length=100, nr_unk=100): def get_word_ids(docs, max_length=100, nr_unk=100):
Xs = np.zeros((len(docs), max_length), dtype='int32') Xs = np.zeros((len(docs), max_length), dtype="int32")
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
for j, token in enumerate(doc): for j, token in enumerate(doc):

View File

@ -0,0 +1,7 @@
## Examples of NER/IOB data that can be converted with `spacy convert`
The spaCy JSON training files were generated with:
```
python -m spacy convert -c iob -s -n 10 -b en file.iob
```

View File

@ -0,0 +1,2 @@
When|WRB|O Sebastian|NNP|B-PERSON Thrun|NNP|I-PERSON started|VBD|O working|VBG|O on|IN|O self|NN|O -|HYPH|O driving|VBG|O cars|NNS|O at|IN|O Google|NNP|B-ORG in|IN|O 2007|CD|B-DATE ,|,|O few|JJ|O people|NNS|O outside|RB|O of|IN|O the|DT|O company|NN|O took|VBD|O him|PRP|O seriously|RB|O .|.|O
“|''|O I|PRP|O can|MD|O tell|VB|O you|PRP|O very|RB|O senior|JJ|O CEOs|NNS|O of|IN|O major|JJ|O American|JJ|B-NORP car|NN|O companies|NNS|O would|MD|O shake|VB|O my|PRP$|O hand|NN|O and|CC|O turn|VB|O away|RB|O because|IN|O I|PRP|O was|VBD|O n’t|RB|O worth|JJ|O talking|VBG|O to|IN|O ,|,|O ”|''|O said|VBD|O Thrun|NNP|B-PERSON ,|,|O in|IN|O an|DT|O interview|NN|O with|IN|O Recode|NNP|B-ORG earlier|RBR|B-DATE this|DT|I-DATE week|NN|I-DATE .|.|O

View File

@ -0,0 +1,349 @@
[
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"When",
"tag":"WRB",
"ner":"O"
},
{
"orth":"Sebastian",
"tag":"NNP",
"ner":"B-PERSON"
},
{
"orth":"Thrun",
"tag":"NNP",
"ner":"L-PERSON"
},
{
"orth":"started",
"tag":"VBD",
"ner":"O"
},
{
"orth":"working",
"tag":"VBG",
"ner":"O"
},
{
"orth":"on",
"tag":"IN",
"ner":"O"
},
{
"orth":"self",
"tag":"NN",
"ner":"O"
},
{
"orth":"-",
"tag":"HYPH",
"ner":"O"
},
{
"orth":"driving",
"tag":"VBG",
"ner":"O"
},
{
"orth":"cars",
"tag":"NNS",
"ner":"O"
},
{
"orth":"at",
"tag":"IN",
"ner":"O"
},
{
"orth":"Google",
"tag":"NNP",
"ner":"U-ORG"
},
{
"orth":"in",
"tag":"IN",
"ner":"O"
},
{
"orth":"2007",
"tag":"CD",
"ner":"U-DATE"
},
{
"orth":",",
"tag":",",
"ner":"O"
},
{
"orth":"few",
"tag":"JJ",
"ner":"O"
},
{
"orth":"people",
"tag":"NNS",
"ner":"O"
},
{
"orth":"outside",
"tag":"RB",
"ner":"O"
},
{
"orth":"of",
"tag":"IN",
"ner":"O"
},
{
"orth":"the",
"tag":"DT",
"ner":"O"
},
{
"orth":"company",
"tag":"NN",
"ner":"O"
},
{
"orth":"took",
"tag":"VBD",
"ner":"O"
},
{
"orth":"him",
"tag":"PRP",
"ner":"O"
},
{
"orth":"seriously",
"tag":"RB",
"ner":"O"
},
{
"orth":".",
"tag":".",
"ner":"O"
}
]
},
{
"tokens":[
{
"orth":"\u201c",
"tag":"''",
"ner":"O"
},
{
"orth":"I",
"tag":"PRP",
"ner":"O"
},
{
"orth":"can",
"tag":"MD",
"ner":"O"
},
{
"orth":"tell",
"tag":"VB",
"ner":"O"
},
{
"orth":"you",
"tag":"PRP",
"ner":"O"
},
{
"orth":"very",
"tag":"RB",
"ner":"O"
},
{
"orth":"senior",
"tag":"JJ",
"ner":"O"
},
{
"orth":"CEOs",
"tag":"NNS",
"ner":"O"
},
{
"orth":"of",
"tag":"IN",
"ner":"O"
},
{
"orth":"major",
"tag":"JJ",
"ner":"O"
},
{
"orth":"American",
"tag":"JJ",
"ner":"U-NORP"
},
{
"orth":"car",
"tag":"NN",
"ner":"O"
},
{
"orth":"companies",
"tag":"NNS",
"ner":"O"
},
{
"orth":"would",
"tag":"MD",
"ner":"O"
},
{
"orth":"shake",
"tag":"VB",
"ner":"O"
},
{
"orth":"my",
"tag":"PRP$",
"ner":"O"
},
{
"orth":"hand",
"tag":"NN",
"ner":"O"
},
{
"orth":"and",
"tag":"CC",
"ner":"O"
},
{
"orth":"turn",
"tag":"VB",
"ner":"O"
},
{
"orth":"away",
"tag":"RB",
"ner":"O"
},
{
"orth":"because",
"tag":"IN",
"ner":"O"
},
{
"orth":"I",
"tag":"PRP",
"ner":"O"
},
{
"orth":"was",
"tag":"VBD",
"ner":"O"
},
{
"orth":"n\u2019t",
"tag":"RB",
"ner":"O"
},
{
"orth":"worth",
"tag":"JJ",
"ner":"O"
},
{
"orth":"talking",
"tag":"VBG",
"ner":"O"
},
{
"orth":"to",
"tag":"IN",
"ner":"O"
},
{
"orth":",",
"tag":",",
"ner":"O"
},
{
"orth":"\u201d",
"tag":"''",
"ner":"O"
},
{
"orth":"said",
"tag":"VBD",
"ner":"O"
},
{
"orth":"Thrun",
"tag":"NNP",
"ner":"U-PERSON"
},
{
"orth":",",
"tag":",",
"ner":"O"
},
{
"orth":"in",
"tag":"IN",
"ner":"O"
},
{
"orth":"an",
"tag":"DT",
"ner":"O"
},
{
"orth":"interview",
"tag":"NN",
"ner":"O"
},
{
"orth":"with",
"tag":"IN",
"ner":"O"
},
{
"orth":"Recode",
"tag":"NNP",
"ner":"U-ORG"
},
{
"orth":"earlier",
"tag":"RBR",
"ner":"B-DATE"
},
{
"orth":"this",
"tag":"DT",
"ner":"I-DATE"
},
{
"orth":"week",
"tag":"NN",
"ner":"L-DATE"
},
{
"orth":".",
"tag":".",
"ner":"O"
}
]
}
]
}
]
}
]

View File

@ -0,0 +1,70 @@
-DOCSTART- -X- O O
When WRB _ O
Sebastian NNP _ B-PERSON
Thrun NNP _ I-PERSON
started VBD _ O
working VBG _ O
on IN _ O
self NN _ O
- HYPH _ O
driving VBG _ O
cars NNS _ O
at IN _ O
Google NNP _ B-ORG
in IN _ O
2007 CD _ B-DATE
, , _ O
few JJ _ O
people NNS _ O
outside RB _ O
of IN _ O
the DT _ O
company NN _ O
took VBD _ O
him PRP _ O
seriously RB _ O
. . _ O
“ '' _ O
I PRP _ O
can MD _ O
tell VB _ O
you PRP _ O
very RB _ O
senior JJ _ O
CEOs NNS _ O
of IN _ O
major JJ _ O
American JJ _ B-NORP
car NN _ O
companies NNS _ O
would MD _ O
shake VB _ O
my PRP$ _ O
hand NN _ O
and CC _ O
turn VB _ O
away RB _ O
because IN _ O
I PRP _ O
was VBD _ O
n’t RB _ O
worth JJ _ O
talking VBG _ O
to IN _ O
, , _ O
” '' _ O
said VBD _ O
Thrun NNP _ B-PERSON
, , _ O
in IN _ O
an DT _ O
interview NN _ O
with IN _ O
Recode NNP _ B-ORG
earlier RBR _ B-DATE
this DT _ I-DATE
week NN _ I-DATE
. . _ O

View File

@ -0,0 +1,349 @@
[
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"When",
"tag":"WRB",
"ner":"O"
},
{
"orth":"Sebastian",
"tag":"NNP",
"ner":"B-PERSON"
},
{
"orth":"Thrun",
"tag":"NNP",
"ner":"L-PERSON"
},
{
"orth":"started",
"tag":"VBD",
"ner":"O"
},
{
"orth":"working",
"tag":"VBG",
"ner":"O"
},
{
"orth":"on",
"tag":"IN",
"ner":"O"
},
{
"orth":"self",
"tag":"NN",
"ner":"O"
},
{
"orth":"-",
"tag":"HYPH",
"ner":"O"
},
{
"orth":"driving",
"tag":"VBG",
"ner":"O"
},
{
"orth":"cars",
"tag":"NNS",
"ner":"O"
},
{
"orth":"at",
"tag":"IN",
"ner":"O"
},
{
"orth":"Google",
"tag":"NNP",
"ner":"U-ORG"
},
{
"orth":"in",
"tag":"IN",
"ner":"O"
},
{
"orth":"2007",
"tag":"CD",
"ner":"U-DATE"
},
{
"orth":",",
"tag":",",
"ner":"O"
},
{
"orth":"few",
"tag":"JJ",
"ner":"O"
},
{
"orth":"people",
"tag":"NNS",
"ner":"O"
},
{
"orth":"outside",
"tag":"RB",
"ner":"O"
},
{
"orth":"of",
"tag":"IN",
"ner":"O"
},
{
"orth":"the",
"tag":"DT",
"ner":"O"
},
{
"orth":"company",
"tag":"NN",
"ner":"O"
},
{
"orth":"took",
"tag":"VBD",
"ner":"O"
},
{
"orth":"him",
"tag":"PRP",
"ner":"O"
},
{
"orth":"seriously",
"tag":"RB",
"ner":"O"
},
{
"orth":".",
"tag":".",
"ner":"O"
}
]
},
{
"tokens":[
{
"orth":"\u201c",
"tag":"''",
"ner":"O"
},
{
"orth":"I",
"tag":"PRP",
"ner":"O"
},
{
"orth":"can",
"tag":"MD",
"ner":"O"
},
{
"orth":"tell",
"tag":"VB",
"ner":"O"
},
{
"orth":"you",
"tag":"PRP",
"ner":"O"
},
{
"orth":"very",
"tag":"RB",
"ner":"O"
},
{
"orth":"senior",
"tag":"JJ",
"ner":"O"
},
{
"orth":"CEOs",
"tag":"NNS",
"ner":"O"
},
{
"orth":"of",
"tag":"IN",
"ner":"O"
},
{
"orth":"major",
"tag":"JJ",
"ner":"O"
},
{
"orth":"American",
"tag":"JJ",
"ner":"U-NORP"
},
{
"orth":"car",
"tag":"NN",
"ner":"O"
},
{
"orth":"companies",
"tag":"NNS",
"ner":"O"
},
{
"orth":"would",
"tag":"MD",
"ner":"O"
},
{
"orth":"shake",
"tag":"VB",
"ner":"O"
},
{
"orth":"my",
"tag":"PRP$",
"ner":"O"
},
{
"orth":"hand",
"tag":"NN",
"ner":"O"
},
{
"orth":"and",
"tag":"CC",
"ner":"O"
},
{
"orth":"turn",
"tag":"VB",
"ner":"O"
},
{
"orth":"away",
"tag":"RB",
"ner":"O"
},
{
"orth":"because",
"tag":"IN",
"ner":"O"
},
{
"orth":"I",
"tag":"PRP",
"ner":"O"
},
{
"orth":"was",
"tag":"VBD",
"ner":"O"
},
{
"orth":"n\u2019t",
"tag":"RB",
"ner":"O"
},
{
"orth":"worth",
"tag":"JJ",
"ner":"O"
},
{
"orth":"talking",
"tag":"VBG",
"ner":"O"
},
{
"orth":"to",
"tag":"IN",
"ner":"O"
},
{
"orth":",",
"tag":",",
"ner":"O"
},
{
"orth":"\u201d",
"tag":"''",
"ner":"O"
},
{
"orth":"said",
"tag":"VBD",
"ner":"O"
},
{
"orth":"Thrun",
"tag":"NNP",
"ner":"U-PERSON"
},
{
"orth":",",
"tag":",",
"ner":"O"
},
{
"orth":"in",
"tag":"IN",
"ner":"O"
},
{
"orth":"an",
"tag":"DT",
"ner":"O"
},
{
"orth":"interview",
"tag":"NN",
"ner":"O"
},
{
"orth":"with",
"tag":"IN",
"ner":"O"
},
{
"orth":"Recode",
"tag":"NNP",
"ner":"U-ORG"
},
{
"orth":"earlier",
"tag":"RBR",
"ner":"B-DATE"
},
{
"orth":"this",
"tag":"DT",
"ner":"I-DATE"
},
{
"orth":"week",
"tag":"NN",
"ner":"L-DATE"
},
{
"orth":".",
"tag":".",
"ner":"O"
}
]
}
]
}
]
}
]

View File

@ -0,0 +1,66 @@
When WRB O
Sebastian NNP B-PERSON
Thrun NNP I-PERSON
started VBD O
working VBG O
on IN O
self NN O
- HYPH O
driving VBG O
cars NNS O
at IN O
Google NNP B-ORG
in IN O
2007 CD B-DATE
, , O
few JJ O
people NNS O
outside RB O
of IN O
the DT O
company NN O
took VBD O
him PRP O
seriously RB O
. . O
“ '' O
I PRP O
can MD O
tell VB O
you PRP O
very RB O
senior JJ O
CEOs NNS O
of IN O
major JJ O
American JJ B-NORP
car NN O
companies NNS O
would MD O
shake VB O
my PRP$ O
hand NN O
and CC O
turn VB O
away RB O
because IN O
I PRP O
was VBD O
n’t RB O
worth JJ O
talking VBG O
to IN O
, , O
” '' O
said VBD O
Thrun NNP B-PERSON
, , O
in IN O
an DT O
interview NN O
with IN O
Recode NNP B-ORG
earlier RBR B-DATE
this DT I-DATE
week NN I-DATE
. . O

View File

@ -0,0 +1,353 @@
[
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"When",
"tag":"WRB",
"ner":"O"
},
{
"orth":"Sebastian",
"tag":"NNP",
"ner":"B-PERSON"
},
{
"orth":"Thrun",
"tag":"NNP",
"ner":"L-PERSON"
},
{
"orth":"started",
"tag":"VBD",
"ner":"O"
},
{
"orth":"working",
"tag":"VBG",
"ner":"O"
},
{
"orth":"on",
"tag":"IN",
"ner":"O"
},
{
"orth":"self",
"tag":"NN",
"ner":"O"
},
{
"orth":"-",
"tag":"HYPH",
"ner":"O"
},
{
"orth":"driving",
"tag":"VBG",
"ner":"O"
},
{
"orth":"cars",
"tag":"NNS",
"ner":"O"
},
{
"orth":"at",
"tag":"IN",
"ner":"O"
},
{
"orth":"Google",
"tag":"NNP",
"ner":"U-ORG"
},
{
"orth":"in",
"tag":"IN",
"ner":"O"
},
{
"orth":"2007",
"tag":"CD",
"ner":"U-DATE"
},
{
"orth":",",
"tag":",",
"ner":"O"
},
{
"orth":"few",
"tag":"JJ",
"ner":"O"
},
{
"orth":"people",
"tag":"NNS",
"ner":"O"
},
{
"orth":"outside",
"tag":"RB",
"ner":"O"
},
{
"orth":"of",
"tag":"IN",
"ner":"O"
},
{
"orth":"the",
"tag":"DT",
"ner":"O"
},
{
"orth":"company",
"tag":"NN",
"ner":"O"
},
{
"orth":"took",
"tag":"VBD",
"ner":"O"
},
{
"orth":"him",
"tag":"PRP",
"ner":"O"
},
{
"orth":"seriously",
"tag":"RB",
"ner":"O"
},
{
"orth":".",
"tag":".",
"ner":"O"
}
]
},
{
"tokens":[
{
"orth":"\u201c",
"tag":"''",
"ner":"O"
}
]
},
{
"tokens":[
{
"orth":"I",
"tag":"PRP",
"ner":"O"
},
{
"orth":"can",
"tag":"MD",
"ner":"O"
},
{
"orth":"tell",
"tag":"VB",
"ner":"O"
},
{
"orth":"you",
"tag":"PRP",
"ner":"O"
},
{
"orth":"very",
"tag":"RB",
"ner":"O"
},
{
"orth":"senior",
"tag":"JJ",
"ner":"O"
},
{
"orth":"CEOs",
"tag":"NNS",
"ner":"O"
},
{
"orth":"of",
"tag":"IN",
"ner":"O"
},
{
"orth":"major",
"tag":"JJ",
"ner":"O"
},
{
"orth":"American",
"tag":"JJ",
"ner":"U-NORP"
},
{
"orth":"car",
"tag":"NN",
"ner":"O"
},
{
"orth":"companies",
"tag":"NNS",
"ner":"O"
},
{
"orth":"would",
"tag":"MD",
"ner":"O"
},
{
"orth":"shake",
"tag":"VB",
"ner":"O"
},
{
"orth":"my",
"tag":"PRP$",
"ner":"O"
},
{
"orth":"hand",
"tag":"NN",
"ner":"O"
},
{
"orth":"and",
"tag":"CC",
"ner":"O"
},
{
"orth":"turn",
"tag":"VB",
"ner":"O"
},
{
"orth":"away",
"tag":"RB",
"ner":"O"
},
{
"orth":"because",
"tag":"IN",
"ner":"O"
},
{
"orth":"I",
"tag":"PRP",
"ner":"O"
},
{
"orth":"was",
"tag":"VBD",
"ner":"O"
},
{
"orth":"n\u2019t",
"tag":"RB",
"ner":"O"
},
{
"orth":"worth",
"tag":"JJ",
"ner":"O"
},
{
"orth":"talking",
"tag":"VBG",
"ner":"O"
},
{
"orth":"to",
"tag":"IN",
"ner":"O"
},
{
"orth":",",
"tag":",",
"ner":"O"
},
{
"orth":"\u201d",
"tag":"''",
"ner":"O"
},
{
"orth":"said",
"tag":"VBD",
"ner":"O"
},
{
"orth":"Thrun",
"tag":"NNP",
"ner":"U-PERSON"
},
{
"orth":",",
"tag":",",
"ner":"O"
},
{
"orth":"in",
"tag":"IN",
"ner":"O"
},
{
"orth":"an",
"tag":"DT",
"ner":"O"
},
{
"orth":"interview",
"tag":"NN",
"ner":"O"
},
{
"orth":"with",
"tag":"IN",
"ner":"O"
},
{
"orth":"Recode",
"tag":"NNP",
"ner":"U-ORG"
},
{
"orth":"earlier",
"tag":"RBR",
"ner":"B-DATE"
},
{
"orth":"this",
"tag":"DT",
"ner":"I-DATE"
},
{
"orth":"week",
"tag":"NN",
"ner":"L-DATE"
},
{
"orth":".",
"tag":".",
"ner":"O"
}
]
}
]
}
]
}
]

View File

@ -0,0 +1,66 @@
When O
Sebastian B-PERSON
Thrun I-PERSON
started O
working O
on O
self O
- O
driving O
cars O
at O
Google B-ORG
in O
2007 B-DATE
, O
few O
people O
outside O
of O
the O
company O
took O
him O
seriously O
. O
“ O
I O
can O
tell O
you O
very O
senior O
CEOs O
of O
major O
American B-NORP
car O
companies O
would O
shake O
my O
hand O
and O
turn O
away O
because O
I O
was O
n’t O
worth O
talking O
to O
, O
” O
said O
Thrun B-PERSON
, O
in O
an O
interview O
with O
Recode B-ORG
earlier B-DATE
this I-DATE
week I-DATE
. O

View File

@ -0,0 +1,353 @@
[
{
"id":0,
"paragraphs":[
{
"sentences":[
{
"tokens":[
{
"orth":"When",
"tag":"-",
"ner":"O"
},
{
"orth":"Sebastian",
"tag":"-",
"ner":"B-PERSON"
},
{
"orth":"Thrun",
"tag":"-",
"ner":"L-PERSON"
},
{
"orth":"started",
"tag":"-",
"ner":"O"
},
{
"orth":"working",
"tag":"-",
"ner":"O"
},
{
"orth":"on",
"tag":"-",
"ner":"O"
},
{
"orth":"self",
"tag":"-",
"ner":"O"
},
{
"orth":"-",
"tag":"-",
"ner":"O"
},
{
"orth":"driving",
"tag":"-",
"ner":"O"
},
{
"orth":"cars",
"tag":"-",
"ner":"O"
},
{
"orth":"at",
"tag":"-",
"ner":"O"
},
{
"orth":"Google",
"tag":"-",
"ner":"U-ORG"
},
{
"orth":"in",
"tag":"-",
"ner":"O"
},
{
"orth":"2007",
"tag":"-",
"ner":"U-DATE"
},
{
"orth":",",
"tag":"-",
"ner":"O"
},
{
"orth":"few",
"tag":"-",
"ner":"O"
},
{
"orth":"people",
"tag":"-",
"ner":"O"
},
{
"orth":"outside",
"tag":"-",
"ner":"O"
},
{
"orth":"of",
"tag":"-",
"ner":"O"
},
{
"orth":"the",
"tag":"-",
"ner":"O"
},
{
"orth":"company",
"tag":"-",
"ner":"O"
},
{
"orth":"took",
"tag":"-",
"ner":"O"
},
{
"orth":"him",
"tag":"-",
"ner":"O"
},
{
"orth":"seriously",
"tag":"-",
"ner":"O"
},
{
"orth":".",
"tag":"-",
"ner":"O"
}
]
},
{
"tokens":[
{
"orth":"\u201c",
"tag":"-",
"ner":"O"
}
]
},
{
"tokens":[
{
"orth":"I",
"tag":"-",
"ner":"O"
},
{
"orth":"can",
"tag":"-",
"ner":"O"
},
{
"orth":"tell",
"tag":"-",
"ner":"O"
},
{
"orth":"you",
"tag":"-",
"ner":"O"
},
{
"orth":"very",
"tag":"-",
"ner":"O"
},
{
"orth":"senior",
"tag":"-",
"ner":"O"
},
{
"orth":"CEOs",
"tag":"-",
"ner":"O"
},
{
"orth":"of",
"tag":"-",
"ner":"O"
},
{
"orth":"major",
"tag":"-",
"ner":"O"
},
{
"orth":"American",
"tag":"-",
"ner":"U-NORP"
},
{
"orth":"car",
"tag":"-",
"ner":"O"
},
{
"orth":"companies",
"tag":"-",
"ner":"O"
},
{
"orth":"would",
"tag":"-",
"ner":"O"
},
{
"orth":"shake",
"tag":"-",
"ner":"O"
},
{
"orth":"my",
"tag":"-",
"ner":"O"
},
{
"orth":"hand",
"tag":"-",
"ner":"O"
},
{
"orth":"and",
"tag":"-",
"ner":"O"
},
{
"orth":"turn",
"tag":"-",
"ner":"O"
},
{
"orth":"away",
"tag":"-",
"ner":"O"
},
{
"orth":"because",
"tag":"-",
"ner":"O"
},
{
"orth":"I",
"tag":"-",
"ner":"O"
},
{
"orth":"was",
"tag":"-",
"ner":"O"
},
{
"orth":"n\u2019t",
"tag":"-",
"ner":"O"
},
{
"orth":"worth",
"tag":"-",
"ner":"O"
},
{
"orth":"talking",
"tag":"-",
"ner":"O"
},
{
"orth":"to",
"tag":"-",
"ner":"O"
},
{
"orth":",",
"tag":"-",
"ner":"O"
},
{
"orth":"\u201d",
"tag":"-",
"ner":"O"
},
{
"orth":"said",
"tag":"-",
"ner":"O"
},
{
"orth":"Thrun",
"tag":"-",
"ner":"U-PERSON"
},
{
"orth":",",
"tag":"-",
"ner":"O"
},
{
"orth":"in",
"tag":"-",
"ner":"O"
},
{
"orth":"an",
"tag":"-",
"ner":"O"
},
{
"orth":"interview",
"tag":"-",
"ner":"O"
},
{
"orth":"with",
"tag":"-",
"ner":"O"
},
{
"orth":"Recode",
"tag":"-",
"ner":"U-ORG"
},
{
"orth":"earlier",
"tag":"-",
"ner":"B-DATE"
},
{
"orth":"this",
"tag":"-",
"ner":"I-DATE"
},
{
"orth":"week",
"tag":"-",
"ner":"L-DATE"
},
{
"orth":".",
"tag":"-",
"ner":"O"
}
]
}
]
}
]
}
]

View File

@ -80,7 +80,7 @@ def main(model_name, unlabelled_loc):
nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses) nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses)
print("Losses", losses) print("Losses", losses)
print("R. Losses", r_losses) print("R. Losses", r_losses)
print(nlp.get_pipe('ner').model.unseen_classes) print(nlp.get_pipe("ner").model.unseen_classes)
test_text = "Do you like horses?" test_text = "Do you like horses?"
doc = nlp(test_text) doc = nlp(test_text)
print("Entities in '%s'" % test_text) print("Entities in '%s'" % test_text)
@ -88,7 +88,5 @@ def main(model_name, unlabelled_loc):
print(ent.label_, ent.text) print(ent.label_, ent.text)
if __name__ == "__main__": if __name__ == "__main__":
plac.call(main) plac.call(main)

View File

@ -24,7 +24,7 @@ from spacy.util import minibatch, compounding
output_dir=("Optional output directory", "option", "o", Path), output_dir=("Optional output directory", "option", "o", Path),
n_texts=("Number of texts to train from", "option", "t", int), n_texts=("Number of texts to train from", "option", "t", int),
n_iter=("Number of training iterations", "option", "n", int), n_iter=("Number of training iterations", "option", "n", int),
init_tok2vec=("Pretrained tok2vec weights", "option", "t2v", Path) init_tok2vec=("Pretrained tok2vec weights", "option", "t2v", Path),
) )
def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None): def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None):
if output_dir is not None: if output_dir is not None:
@ -43,11 +43,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None
# nlp.create_pipe works for built-ins that are registered with spaCy # nlp.create_pipe works for built-ins that are registered with spaCy
if "textcat" not in nlp.pipe_names: if "textcat" not in nlp.pipe_names:
textcat = nlp.create_pipe( textcat = nlp.create_pipe(
"textcat", "textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"}
config={
"exclusive_classes": True,
"architecture": "simple_cnn",
}
) )
nlp.add_pipe(textcat, last=True) nlp.add_pipe(textcat, last=True)
# otherwise, get it, so we can add labels to it # otherwise, get it, so we can add labels to it

View File

@ -5,12 +5,14 @@ import plac
from pathlib import Path from pathlib import Path
from wasabi import Printer from wasabi import Printer
import srsly import srsly
import re
from .converters import conllu2json, iob2json, conll_ner2json from .converters import conllu2json, iob2json, conll_ner2json
from .converters import ner_jsonl2json from .converters import ner_jsonl2json
# Converters are matched by file extension. To add a converter, add a new # Converters are matched by file extension except for ner/iob, which are
# matched by file extension and content. To add a converter, add a new
# entry to this dict with the file extension mapped to the converter function # entry to this dict with the file extension mapped to the converter function
# imported from /converters. # imported from /converters.
CONVERTERS = { CONVERTERS = {
@ -31,7 +33,9 @@ FILE_TYPES_STDOUT = ("json", "jsonl")
input_file=("Input file", "positional", None, str), input_file=("Input file", "positional", None, str),
output_dir=("Output directory. '-' for stdout.", "positional", None, str), output_dir=("Output directory. '-' for stdout.", "positional", None, str),
file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str), file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str),
n_sents=("Number of sentences per doc", "option", "n", int), n_sents=("Number of sentences per doc (0 to disable)", "option", "n", int),
seg_sents=("Segment sentences (for -c ner)", "flag", "s"),
model=("Model for sentence segmentation (for -s)", "option", "b", str),
converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str), converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
lang=("Language (if tokenizer required)", "option", "l", str), lang=("Language (if tokenizer required)", "option", "l", str),
morphology=("Enable appending morphology to tags", "flag", "m", bool), morphology=("Enable appending morphology to tags", "flag", "m", bool),
@ -41,6 +45,8 @@ def convert(
output_dir="-", output_dir="-",
file_type="json", file_type="json",
n_sents=1, n_sents=1,
seg_sents=False,
model=None,
morphology=False, morphology=False,
converter="auto", converter="auto",
lang=None, lang=None,
@ -70,14 +76,33 @@ def convert(
msg.fail("Input file not found", input_path, exits=1) msg.fail("Input file not found", input_path, exits=1)
if output_dir != "-" and not Path(output_dir).exists(): if output_dir != "-" and not Path(output_dir).exists():
msg.fail("Output directory not found", output_dir, exits=1) msg.fail("Output directory not found", output_dir, exits=1)
input_data = input_path.open("r", encoding="utf-8").read()
if converter == "auto": if converter == "auto":
converter = input_path.suffix[1:] converter = input_path.suffix[1:]
if converter == "ner" or converter == "iob":
converter_autodetect = autodetect_ner_format(input_data)
if converter_autodetect == "ner":
msg.info("Auto-detected token-per-line NER format")
converter = converter_autodetect
elif converter_autodetect == "iob":
msg.info("Auto-detected sentence-per-line NER format")
converter = converter_autodetect
else:
msg.warn(
"Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
)
if converter not in CONVERTERS: if converter not in CONVERTERS:
msg.fail("Can't find converter for {}".format(converter), exits=1) msg.fail("Can't find converter for {}".format(converter), exits=1)
# Use converter function to convert data # Use converter function to convert data
func = CONVERTERS[converter] func = CONVERTERS[converter]
input_data = input_path.open("r", encoding="utf-8").read() data = func(
data = func(input_data, n_sents=n_sents, use_morphology=morphology, lang=lang) input_data,
n_sents=n_sents,
seg_sents=seg_sents,
use_morphology=morphology,
lang=lang,
model=model,
)
if output_dir != "-": if output_dir != "-":
# Export data to a file # Export data to a file
suffix = ".{}".format(file_type) suffix = ".{}".format(file_type)
@ -88,10 +113,31 @@ def convert(
srsly.write_jsonl(output_file, data) srsly.write_jsonl(output_file, data)
elif file_type == "msg": elif file_type == "msg":
srsly.write_msgpack(output_file, data) srsly.write_msgpack(output_file, data)
msg.good("Generated output file ({} documents)".format(len(data)), output_file) msg.good(
"Generated output file ({} documents): {}".format(len(data), output_file)
)
else: else:
# Print to stdout # Print to stdout
if file_type == "json": if file_type == "json":
srsly.write_json("-", data) srsly.write_json("-", data)
elif file_type == "jsonl": elif file_type == "jsonl":
srsly.write_jsonl("-", data) srsly.write_jsonl("-", data)
def autodetect_ner_format(input_data):
    """Guess which NER converter fits the raw text of an input file.

    Only the first 20 lines are inspected. A line counts as sentence-per-line
    ("iob") evidence if it contains a `token|tag` pair whose tag starts with
    `O`, `I-` or `B-`, and as token-per-line ("ner") evidence if it ends with
    whitespace followed by `O` or an `I-`/`B-` tag.

    input_data (unicode): The raw contents of the input file.
    RETURNS (unicode): "ner" if only token-per-line evidence was found, "iob"
        if only sentence-per-line evidence was found, otherwise None (no
        evidence at all, or conflicting evidence).
    """
    # Bound the split: only the first 20 lines are needed, so there is no
    # reason to break a potentially large file into every one of its lines.
    head = [line.strip() for line in input_data.split("\n", 20)[:20]]
    iob_re = re.compile(r"\S+\|(O|[IB]-\S+)")
    ner_re = re.compile(r"\S+\s+(O|[IB]-\S+)$")
    # Only "did any line match" matters for the decision below, so booleans
    # are sufficient; no per-format counts are needed.
    looks_iob = any(iob_re.search(line) for line in head)
    looks_ner = any(ner_re.search(line) for line in head)
    if looks_ner and not looks_iob:
        return "ner"
    if looks_iob and not looks_ner:
        return "iob"
    return None

View File

@ -1,17 +1,89 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from wasabi import Printer
from ...gold import iob_to_biluo from ...gold import iob_to_biluo
from ...lang.xx import MultiLanguage
from ...tokens.doc import Doc
from ...util import load_model
def conll_ner2json(input_data, **kwargs): def conll_ner2json(input_data, n_sents=10, seg_sents=False, model=None, **kwargs):
""" """
Convert files in the CoNLL-2003 NER format into JSON format for use with Convert files in the CoNLL-2003 NER format and similar
train cli. whitespace-separated columns into JSON format for use with train cli.
The first column is the tokens, the final column is the IOB tags. If an
additional second column is present, the second column is the tags.
Sentences are separated with whitespace and documents can be separated
using the line "-DOCSTART- -X- O O".
Sample format:
-DOCSTART- -X- O O
I O
like O
London B-GPE
and O
New B-GPE
York I-GPE
City I-GPE
. O
""" """
delimit_docs = "-DOCSTART- -X- O O" msg = Printer()
doc_delimiter = "-DOCSTART- -X- O O"
# check for existing delimiters, which should be preserved
if "\n\n" in input_data and seg_sents:
msg.warn(
"Sentence boundaries found, automatic sentence segmentation with "
"`-s` disabled."
)
seg_sents = False
if doc_delimiter in input_data and n_sents:
msg.warn(
"Document delimiters found, automatic document segmentation with "
"`-n` disabled."
)
n_sents = 0
# do document segmentation with existing sentences
if "\n\n" in input_data and doc_delimiter not in input_data and n_sents:
n_sents_info(msg, n_sents)
input_data = segment_docs(input_data, n_sents, doc_delimiter)
# do sentence segmentation with existing documents
if "\n\n" not in input_data and doc_delimiter in input_data and seg_sents:
input_data = segment_sents_and_docs(input_data, 0, "", model=model, msg=msg)
# do both sentence segmentation and document segmentation according
# to options
if "\n\n" not in input_data and doc_delimiter not in input_data:
# sentence segmentation required for document segmentation
if n_sents > 0 and not seg_sents:
msg.warn(
"No sentence boundaries found to use with option `-n {}`. "
"Use `-s` to automatically segment sentences or `-n 0` "
"to disable.".format(n_sents)
)
else:
n_sents_info(msg, n_sents)
input_data = segment_sents_and_docs(
input_data, n_sents, doc_delimiter, model=model, msg=msg
)
# provide warnings for problematic data
if "\n\n" not in input_data:
msg.warn(
"No sentence boundaries found. Use `-s` to automatically segment "
"sentences."
)
if doc_delimiter not in input_data:
msg.warn(
"No document delimiters found. Use `-n` to automatically group "
"sentences into documents."
)
output_docs = [] output_docs = []
for doc in input_data.strip().split(delimit_docs): for doc in input_data.strip().split(doc_delimiter):
doc = doc.strip() doc = doc.strip()
if not doc: if not doc:
continue continue
@ -21,7 +93,19 @@ def conll_ner2json(input_data, **kwargs):
if not sent: if not sent:
continue continue
lines = [line.strip() for line in sent.split("\n") if line.strip()] lines = [line.strip() for line in sent.split("\n") if line.strip()]
words, tags, chunks, iob_ents = zip(*[line.split() for line in lines]) cols = list(zip(*[line.split() for line in lines]))
if len(cols) < 2:
raise ValueError(
"The token-per-line NER file is not formatted correctly. "
"Try checking whitespace and delimiters. See "
"https://spacy.io/api/cli#convert"
)
words = cols[0]
iob_ents = cols[-1]
if len(cols) > 2:
tags = cols[1]
else:
tags = ["-"] * len(words)
biluo_ents = iob_to_biluo(iob_ents) biluo_ents = iob_to_biluo(iob_ents)
output_doc.append( output_doc.append(
{ {
@ -36,3 +120,53 @@ def conll_ner2json(input_data, **kwargs):
) )
output_doc = [] output_doc = []
return output_docs return output_docs
def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
    """Insert sentence boundaries (and optionally document delimiters) into
    token-per-line data that has no blank lines between sentences.

    The first whitespace-separated column of every line is used as the token
    text. Sentence starts are predicted by the parser of `model` if a model is
    given and has a "parser" pipe, otherwise by the rule-based sentencizer.

    doc (unicode): Token-per-line input, one token (plus columns) per line.
    n_sents (int): Number of sentences per document. If falsy, no document
        delimiters are inserted.
    doc_delimiter (unicode): Line to insert before the first sentence of each
        group of `n_sents` sentences.
    model (unicode): Optional name or path of a model to load for
        parser-based sentence segmentation.
    msg (Printer): Optional printer for status messages. A new Printer is
        created if None is passed.
    RETURNS (unicode): The input lines joined with "\n", with an empty line
        inserted before each sentence start and, if `n_sents` is set,
        `doc_delimiter` before every `n_sents`-th sentence.
    """
    # `msg` defaults to None but status messages are always printed below, so
    # fall back to a fresh Printer instead of failing with AttributeError.
    if msg is None:
        msg = Printer()
    sentencizer = None
    if model:
        nlp = load_model(model)
        if "parser" in nlp.pipe_names:
            msg.info("Segmenting sentences with parser from model '{}'.".format(model))
            sentencizer = nlp.get_pipe("parser")
    if not sentencizer:
        msg.info(
            "Segmenting sentences with sentencizer. (Use `-b model` for "
            "improved parser-based sentence segmentation.)"
        )
        nlp = MultiLanguage()
        sentencizer = nlp.create_pipe("sentencizer")
    lines = doc.strip().split("\n")
    # One token per input line, so token index i corresponds to lines[i].
    words = [line.strip().split()[0] for line in lines]
    nlpdoc = Doc(nlp.vocab, words=words)
    sentencizer(nlpdoc)
    lines_with_segs = []
    sent_count = 0
    for i, token in enumerate(nlpdoc):
        if token.is_sent_start:
            # Open a new document every n_sents sentences (never if n_sents
            # is 0), then mark the sentence boundary with an empty line.
            if n_sents and sent_count % n_sents == 0:
                lines_with_segs.append(doc_delimiter)
            lines_with_segs.append("")
            sent_count += 1
        lines_with_segs.append(lines[i])
    return "\n".join(lines_with_segs)
def segment_docs(input_data, n_sents, doc_delimiter):
    """Group already sentence-segmented data into documents.

    Sentences are taken to be separated by an empty line ("\n\n"). Every
    group of `n_sents` consecutive sentences is prefixed with an empty line
    followed directly by `doc_delimiter`.

    input_data (unicode): Input text with "\n\n" between sentences.
    n_sents (int): Number of sentences per document. Must be non-zero, since
        it is used as the step of a range.
    doc_delimiter (unicode): Marker written in front of every document.
    RETURNS (unicode): The regrouped text.
    """
    sent_delimiter = "\n\n"
    sents = input_data.split(sent_delimiter)
    # Collect the pieces and join once instead of growing a string with `+=`
    # on every iteration.
    pieces = []
    for start in range(0, len(sents), n_sents):
        pieces.append(sent_delimiter + doc_delimiter)
        pieces.append(sent_delimiter.join(sents[start : start + n_sents]))
    return "".join(pieces)
def n_sents_info(msg, n_sents):
    """Report how sentences will be grouped into documents.

    Always prints an info message with the group size, and additionally a
    warning suggesting `-n 10` when every sentence becomes its own document.

    msg (Printer): Object providing `info` and `warn` methods for output.
    n_sents (int): Number of sentences per document.
    """
    grouping_note = "Grouping every {} sentences into a document.".format(n_sents)
    msg.info(grouping_note)
    if n_sents != 1:
        return
    msg.warn(
        "To generate better training data, you may want to group "
        "sentences into documents with `-n 10`."
    )

View File

@ -2,17 +2,30 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import re import re
from wasabi import Printer
from ...gold import iob_to_biluo from ...gold import iob_to_biluo
from ...util import minibatch from ...util import minibatch
from .conll_ner2json import n_sents_info
def iob2json(input_data, n_sents=10, *args, **kwargs): def iob2json(input_data, n_sents=10, *args, **kwargs):
""" """
Convert IOB files into JSON format for use with train cli. Convert IOB files with one sentence per line and tags separated with '|'
into JSON format for use with train cli. IOB and IOB2 are accepted.
Sample formats:
I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
I|PRP|O like|VBP|O London|NNP|I-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
""" """
sentences = read_iob(input_data.split("\n")) msg = Printer()
docs = merge_sentences(sentences, n_sents) docs = read_iob(input_data.split("\n"))
if n_sents > 0:
n_sents_info(msg, n_sents)
docs = merge_sentences(docs, n_sents)
return docs return docs
@ -21,7 +34,7 @@ def read_iob(raw_sents):
for line in raw_sents: for line in raw_sents:
if not line.strip(): if not line.strip():
continue continue
tokens = [re.split("[^\w\-]", line.strip())] tokens = [t.split("|") for t in line.split()]
if len(tokens[0]) == 3: if len(tokens[0]) == 3:
words, pos, iob = zip(*tokens) words, pos, iob = zip(*tokens)
elif len(tokens[0]) == 2: elif len(tokens[0]) == 2:
@ -29,7 +42,7 @@ def read_iob(raw_sents):
pos = ["-"] * len(words) pos = ["-"] * len(words)
else: else:
raise ValueError( raise ValueError(
"The iob/iob2 file is not formatted correctly. Try checking whitespace and delimiters." "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
) )
biluo = iob_to_biluo(iob) biluo = iob_to_biluo(iob)
sentences.append( sentences.append(
@ -40,7 +53,7 @@ def read_iob(raw_sents):
) )
sentences = [{"tokens": sent} for sent in sentences] sentences = [{"tokens": sent} for sent in sentences]
paragraphs = [{"sentences": [sent]} for sent in sentences] paragraphs = [{"sentences": [sent]} for sent in sentences]
docs = [{"id": 0, "paragraphs": [para]} for para in paragraphs] docs = [{"id": i, "paragraphs": [para]} for i, para in enumerate(paragraphs)]
return docs return docs
@ -50,7 +63,7 @@ def merge_sentences(docs, n_sents):
group = list(group) group = list(group)
first = group.pop(0) first = group.pop(0)
to_extend = first["paragraphs"][0]["sentences"] to_extend = first["paragraphs"][0]["sentences"]
for sent in group[1:]: for sent in group:
to_extend.extend(sent["paragraphs"][0]["sentences"]) to_extend.extend(sent["paragraphs"][0]["sentences"])
merged.append(first) merged.append(first)
return merged return merged

View File

@ -18,6 +18,7 @@ class CroatianDefaults(Language.Defaults):
) )
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = STOP_WORDS stop_words = STOP_WORDS
resources = {"lemma_lookup": "lemma_lookup.json"}
class Croatian(Language): class Croatian(Language):

1313609
spacy/lang/hr/lemma_lookup.json Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,15 @@
The list of Croatian lemmas was extracted from the reldi-tagger repository (https://github.com/clarinsi/reldi-tagger).
Reldi-tagger is licensed under the Apache 2.0 licence.
@InProceedings{ljubesic16-new,
author = {Nikola Ljubešić and Filip Klubička and Željko Agić and Ivo-Pavao Jazbec},
title = {New Inflectional Lexicons and Training Corpora for Improved Morphosyntactic Annotation of Croatian and Serbian},
booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)},
year = {2016},
date = {23-28},
location = {Portorož, Slovenia},
editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
publisher = {European Language Resources Association (ELRA)},
address = {Paris, France},
isbn = {978-2-9517408-9-1}
}

View File

@ -58,6 +58,7 @@ def check_spaces(text, tokens):
yield prev_end != idx yield prev_end != idx
prev_end = idx + len(token) prev_end = idx + len(token)
start = prev_end start = prev_end
if start > 0:
yield False yield False

View File

@ -21,6 +21,7 @@ class SerbianDefaults(Language.Defaults):
) )
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS stop_words = STOP_WORDS
resources = {"lemma_lookup": "lemma_lookup.json"}
class Serbian(Language): class Serbian(Language):

View File

@ -12,13 +12,14 @@ Example sentences to test spaCy and its language models.
sentences = [ sentences = [
# Translations from English # Translations from English
"Apple планира куповину америчког стартапа за $1 милијарду." "Apple планира куповину америчког стартапа за $1 милијарду.",
"Беспилотни аутомобили пребацују одговорност осигурања на произвођаче.", "Беспилотни аутомобили пребацују одговорност осигурања на произвођаче.",
"Лондон је велики град у Уједињеном Краљевству.", "Лондон је велики град у Уједињеном Краљевству.",
"Где си ти?", "Где си ти?",
"Ко је председник Француске?", "Ко је председник Француске?",
# Serbian common and slang # Serbian common and slang
"Moj ћале је инжењер!", "Moj ћале је инжењер!",
"Новак Ђоковић је најбољи тенисер света." "У Пироту има добрих кафана!", "Новак Ђоковић је најбољи тенисер света.",
"У Пироту има добрих кафана!",
"Музеј Николе Тесле се налази у Београду.", "Музеј Николе Тесле се налази у Београду.",
] ]

253316
spacy/lang/sr/lemma_lookup.json Executable file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,32 @@
Copyright @InProceedings{ljubesic16-new,
author = {Nikola Ljubešić and Filip Klubička and Željko Agić and Ivo-Pavao Jazbec},
title = {New Inflectional Lexicons and Training Corpora for Improved Morphosyntactic Annotation of Croatian and Serbian},
booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)},
year = {2016},
date = {23-28},
location = {Portorož, Slovenia},
editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
publisher = {European Language Resources Association (ELRA)},
address = {Paris, France},
isbn = {978-2-9517408-9-1}
}
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
The licence of Serbian lemmas was adopted from Serbian lexicon:
- sr.lexicon (https://github.com/clarinsi/reldi-tagger/blob/master/sr.lexicon)
Changelog:
- Lexicon is translated into Cyrillic
- Word order is sorted

View File

@ -15,6 +15,7 @@ _abbrev_exc = [
{ORTH: "пет", LEMMA: "петак", NORM: "петак"}, {ORTH: "пет", LEMMA: "петак", NORM: "петак"},
{ORTH: "суб", LEMMA: "субота", NORM: "субота"}, {ORTH: "суб", LEMMA: "субота", NORM: "субота"},
{ORTH: "нед", LEMMA: "недеља", NORM: "недеља"}, {ORTH: "нед", LEMMA: "недеља", NORM: "недеља"},
# Months abbreviations # Months abbreviations
{ORTH: "јан", LEMMA: "јануар", NORM: "јануар"}, {ORTH: "јан", LEMMA: "јануар", NORM: "јануар"},
{ORTH: "феб", LEMMA: "фебруар", NORM: "фебруар"}, {ORTH: "феб", LEMMA: "фебруар", NORM: "фебруар"},
@ -27,7 +28,7 @@ _abbrev_exc = [
{ORTH: "септ", LEMMA: "септембар", NORM: "септембар"}, {ORTH: "септ", LEMMA: "септембар", NORM: "септембар"},
{ORTH: "окт", LEMMA: "октобар", NORM: "октобар"}, {ORTH: "окт", LEMMA: "октобар", NORM: "октобар"},
{ORTH: "нов", LEMMA: "новембар", NORM: "новембар"}, {ORTH: "нов", LEMMA: "новембар", NORM: "новембар"},
{ORTH: "дец", LEMMA: "децембар", NORM: "децембар"}, {ORTH: "дец", LEMMA: "децембар", NORM: "децембар"}
] ]

View File

@ -41,8 +41,8 @@ class BaseDefaults(object):
def create_lemmatizer(cls, nlp=None, lookups=None): def create_lemmatizer(cls, nlp=None, lookups=None):
if lookups is None: if lookups is None:
lookups = cls.create_lookups(nlp=nlp) lookups = cls.create_lookups(nlp=nlp)
lemma_rules, lemma_index, lemma_exc, lemma_lookup = util.get_lemma_tables(lookups) rules, index, exc, lookup = util.get_lemma_tables(lookups)
return Lemmatizer(lemma_index, lemma_exc, lemma_rules, lemma_lookup) return Lemmatizer(index, exc, rules, lookup)
@classmethod @classmethod
def create_lookups(cls, nlp=None): def create_lookups(cls, nlp=None):

View File

@ -142,10 +142,34 @@ TOKEN_PATTERN_SCHEMA = {
"title": "Token is whitespace", "title": "Token is whitespace",
"$ref": "#/definitions/boolean_value", "$ref": "#/definitions/boolean_value",
}, },
"IS_BRACKET": {
"title": "Token is a bracket",
"$ref": "#/definitions/boolean_value",
},
"IS_QUOTE": {
"title": "Token is a quotation mark",
"$ref": "#/definitions/boolean_value",
},
"IS_LEFT_PUNCT": {
"title": "Token is a left punctuation mark",
"$ref": "#/definitions/boolean_value",
},
"IS_RIGHT_PUNCT": {
"title": "Token is a right punctuation mark",
"$ref": "#/definitions/boolean_value",
},
"IS_CURRENCY": {
"title": "Token is a currency symbol",
"$ref": "#/definitions/boolean_value",
},
"IS_STOP": { "IS_STOP": {
"title": "Token is stop word", "title": "Token is stop word",
"$ref": "#/definitions/boolean_value", "$ref": "#/definitions/boolean_value",
}, },
"IS_SENT_START": {
"title": "Token is the first in a sentence",
"$ref": "#/definitions/boolean_value",
},
"LIKE_NUM": { "LIKE_NUM": {
"title": "Token resembles a number", "title": "Token resembles a number",
"$ref": "#/definitions/boolean_value", "$ref": "#/definitions/boolean_value",

View File

@ -258,7 +258,7 @@ cdef class Begin:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob cdef int preset_ent_iob = st.B_(0).ent_iob
cdef int preset_ent_label = st.B_(0).ent_type cdef attr_t preset_ent_label = st.B_(0).ent_type
# If we're the last token of the input, we can't B -- must U or O. # If we're the last token of the input, we can't B -- must U or O.
if st.B(1) == -1: if st.B(1) == -1:
return False return False
@ -395,6 +395,9 @@ cdef class Last:
return False return False
elif not st.entity_is_open(): elif not st.entity_is_open():
return False return False
elif st.B_(0).ent_iob == 1 and st.B_(1).ent_iob != 1:
# If a preset entity has I followed by not-I, is L
return True
elif st.E_(0).ent_type != label: elif st.E_(0).ent_type != label:
return False return False
elif st.B_(1).ent_iob == 1: elif st.B_(1).ent_iob == 1:

View File

@ -103,6 +103,11 @@ def he_tokenizer():
return get_lang_class("he").Defaults.create_tokenizer() return get_lang_class("he").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def hr_tokenizer():
    """Session-scoped fixture: tokenizer created from the "hr" language defaults."""
    hr_defaults = get_lang_class("hr").Defaults
    return hr_defaults.create_tokenizer()
@pytest.fixture @pytest.fixture
def hu_tokenizer(): def hu_tokenizer():
return get_lang_class("hu").Defaults.create_tokenizer() return get_lang_class("hu").Defaults.create_tokenizer()

View File

@ -99,6 +99,41 @@ def test_doc_retokenize_spans_merge_tokens(en_tokenizer):
assert doc[0].ent_type_ == "GPE" assert doc[0].ent_type_ == "GPE"
def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer):
    """Check the default attributes of merged tokens.

    With no explicit attrs passed to `merge`, the merged token is expected to
    carry the tag and pos of the span's syntactic root, and its lemma is
    expected to equal the merged text.
    """
    heads = [1, 1, 0, -1]
    tokens = en_tokenizer("The players start.")

    def build_doc():
        # Build a fresh annotated 4-token doc and verify its initial state.
        built = get_doc(
            tokens.vocab,
            words=[t.text for t in tokens],
            tags=["DT", "NN", "VBZ", "."],
            pos=["DET", "NOUN", "VERB", "PUNCT"],
            heads=heads,
        )
        assert len(built) == 4
        assert built[0].text == "The"
        assert built[0].tag_ == "DT"
        assert built[0].pos_ == "DET"
        return built

    # Case 1: a single merge of "The players".
    doc = build_doc()
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2])
    assert len(doc) == 3
    merged = doc[0]
    assert merged.text == "The players"
    assert merged.tag_ == "NN"
    assert merged.pos_ == "NOUN"
    assert merged.lemma_ == "The players"

    # Case 2: two merges queued in the same retokenize block.
    doc = build_doc()
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2])
        retokenizer.merge(doc[2:4])
    assert len(doc) == 2
    left, right = doc[0], doc[1]
    assert left.text == "The players"
    assert left.tag_ == "NN"
    assert left.pos_ == "NOUN"
    assert left.lemma_ == "The players"
    assert right.text == "start ."
    assert right.tag_ == "VBZ"
    assert right.pos_ == "VERB"
    assert right.lemma_ == "start ."
def test_doc_retokenize_spans_merge_heads(en_tokenizer): def test_doc_retokenize_spans_merge_heads(en_tokenizer):
text = "I found a pilates class near work." text = "I found a pilates class near work."
heads = [1, 0, 2, 1, -3, -1, -1, -6] heads = [1, 0, 2, 1, -3, -1, -1, -6]
@ -182,7 +217,7 @@ def test_doc_retokenize_spans_entity_merge(en_tokenizer):
assert len(doc) == 15 assert len(doc) == 15
def test_doc_retokenize_spans_entity_merge_iob(): def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
# Test entity IOB stays consistent after merging # Test entity IOB stays consistent after merging
words = ["a", "b", "c", "d", "e"] words = ["a", "b", "c", "d", "e"]
doc = Doc(Vocab(), words=words) doc = Doc(Vocab(), words=words)
@ -195,10 +230,23 @@ def test_doc_retokenize_spans_entity_merge_iob():
assert doc[2].ent_iob_ == "I" assert doc[2].ent_iob_ == "I"
assert doc[3].ent_iob_ == "B" assert doc[3].ent_iob_ == "B"
with doc.retokenize() as retokenizer: with doc.retokenize() as retokenizer:
retokenizer.merge(doc[0:1]) retokenizer.merge(doc[0:2])
assert len(doc) == len(words) - 1
assert doc[0].ent_iob_ == "B" assert doc[0].ent_iob_ == "B"
assert doc[1].ent_iob_ == "I" assert doc[1].ent_iob_ == "I"
# Test that IOB stays consistent with provided IOB
words = ["a", "b", "c", "d", "e"]
doc = Doc(Vocab(), words=words)
with doc.retokenize() as retokenizer:
attrs = {"ent_type": "ent-abc", "ent_iob": 1}
retokenizer.merge(doc[0:3], attrs=attrs)
retokenizer.merge(doc[3:5], attrs=attrs)
assert doc[0].ent_iob_ == "B"
assert doc[1].ent_iob_ == "I"
# if no parse/heads, the first word in the span is the root and provides
# default values
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"] words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
doc = Doc(Vocab(), words=words) doc = Doc(Vocab(), words=words)
doc.ents = [ doc.ents = [
@ -215,7 +263,53 @@ def test_doc_retokenize_spans_entity_merge_iob():
retokenizer.merge(doc[7:9]) retokenizer.merge(doc[7:9])
assert len(doc) == 6 assert len(doc) == 6
assert doc[3].ent_iob_ == "B" assert doc[3].ent_iob_ == "B"
assert doc[4].ent_iob_ == "I" assert doc[3].ent_type_ == "ent-de"
assert doc[4].ent_iob_ == "B"
assert doc[4].ent_type_ == "ent-fg"
# if there is a parse, span.root provides default values
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
heads = [ 0, -1, 1, -3, -4, -5, -1, -7, -8 ]
ents = [
(3, 5, "ent-de"),
(5, 7, "ent-fg"),
]
deps = ["dep"] * len(words)
en_vocab.strings.add("ent-de")
en_vocab.strings.add("ent-fg")
en_vocab.strings.add("dep")
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
assert doc[2:4].root == doc[3] # root of 'c d' is d
assert doc[4:6].root == doc[4] # root is 'e f' is e
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[2:4])
retokenizer.merge(doc[4:6])
retokenizer.merge(doc[7:9])
assert len(doc) == 6
assert doc[2].ent_iob_ == "B"
assert doc[2].ent_type_ == "ent-de"
assert doc[3].ent_iob_ == "I"
assert doc[3].ent_type_ == "ent-de"
assert doc[4].ent_iob_ == "B"
assert doc[4].ent_type_ == "ent-fg"
# check that B is preserved if span[start] is B
words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
heads = [ 0, -1, 1, 1, -4, -5, -1, -7, -8 ]
ents = [
(3, 5, "ent-de"),
(5, 7, "ent-de"),
]
deps = ["dep"] * len(words)
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps, ents=ents)
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[3:5])
retokenizer.merge(doc[5:7])
assert len(doc) == 7
assert doc[3].ent_iob_ == "B"
assert doc[3].ent_type_ == "ent-de"
assert doc[4].ent_iob_ == "B"
assert doc[4].ent_type_ == "ent-de"
def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer): def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):

View File

@ -0,0 +1,20 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize(
    "string,lemma",
    [
        ("trčao", "trčati"),
        ("adekvatnim", "adekvatan"),
        ("dekontaminacijama", "dekontaminacija"),
        ("filologovih", "filologov"),
        ("je", "biti"),
        ("se", "sebe"),
    ],
)
def test_hr_lemmatizer_lookup_assigns(hr_tokenizer, string, lemma):
    """The first token of each tokenized word form carries the expected lemma."""
    first_token = hr_tokenizer(string)[0]
    assert first_token.lemma_ == lemma

View File

@ -45,3 +45,8 @@ def test_ko_tokenizer_full_tags(ko_tokenizer, text, expected_tags):
def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos): def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos):
pos = [token.pos_ for token in ko_tokenizer(text)] pos = [token.pos_ for token in ko_tokenizer(text)]
assert pos == expected_pos.split() assert pos == expected_pos.split()
def test_ko_empty_doc(ko_tokenizer):
    """Tokenizing the empty string yields a doc with no tokens."""
    empty_doc = ko_tokenizer("")
    assert len(empty_doc) == 0

View File

@ -0,0 +1,20 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize(
    "string,lemma",
    [
        ("најадекватнији", "адекватан"),
        ("матурирао", "матурирати"),
        ("планираћемо", "планирати"),
        ("певају", "певати"),
        ("нама", "ми"),
        ("се", "себе"),
    ],
)
def test_sr_lemmatizer_lookup_assigns(sr_tokenizer, string, lemma):
    """The first token of each tokenized word form carries the expected lemma."""
    first_token = sr_tokenizer(string)[0]
    assert first_token.lemma_ == lemma

View File

@ -6,8 +6,13 @@ import pytest
@pytest.mark.parametrize( @pytest.mark.parametrize(
"text,norms,lemmas", "text,norms,lemmas",
[("о.г.", ["ове године"], ["ова година"]), ("чет.", ["четвртак"], ["четвртак"]), [
("гђа", ["госпођа"], ["госпођа"]), ("ил'", ["или"], ["или"])]) ("о.г.", ["ове године"], ["ова година"]),
("чет.", ["четвртак"], ["четвртак"]),
("гђа", ["госпођа"], ["госпођа"]),
("ил'", ["или"], ["или"]),
],
)
def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas): def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas):
tokens = sr_tokenizer(text) tokens = sr_tokenizer(text)
assert len(tokens) == 1 assert len(tokens) == 1

View File

@ -380,3 +380,33 @@ def test_attr_pipeline_checks(en_vocab):
matcher(doc1) matcher(doc1)
matcher(doc2) matcher(doc2)
matcher(doc3) matcher(doc3)
@pytest.mark.parametrize(
    "pattern,text",
    [
        ([{"IS_ALPHA": True}], "a"),
        ([{"IS_ASCII": True}], "a"),
        ([{"IS_DIGIT": True}], "1"),
        ([{"IS_LOWER": True}], "a"),
        ([{"IS_UPPER": True}], "A"),
        ([{"IS_TITLE": True}], "Aaaa"),
        ([{"IS_PUNCT": True}], "."),
        ([{"IS_SPACE": True}], "\n"),
        ([{"IS_BRACKET": True}], "["),
        ([{"IS_QUOTE": True}], '"'),
        ([{"IS_LEFT_PUNCT": True}], "``"),
        ([{"IS_RIGHT_PUNCT": True}], "''"),
        ([{"IS_STOP": True}], "the"),
        ([{"LIKE_NUM": True}], "1"),
        ([{"LIKE_URL": True}], "http://example.com"),
        ([{"LIKE_EMAIL": True}], "mail@example.com"),
    ],
)
def test_matcher_schema_token_attributes(en_vocab, pattern, text):
    """Each single-attribute pattern is accepted and matches its sample once."""
    rule_matcher = Matcher(en_vocab)
    words = text.split(" ")
    doc = Doc(en_vocab, words=words)
    rule_matcher.add("Rule", None, pattern)
    # Exactly one rule is registered ...
    assert len(rule_matcher) == 1
    # ... and it fires exactly once on the sample doc.
    found = rule_matcher(doc)
    assert len(found) == 1

View File

@ -13,6 +13,28 @@ from spacy.lemmatizer import Lemmatizer
from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part
@pytest.mark.xfail
def test_issue1061():
    """Test special-case works after tokenizing. Was caching problem."""
    text = "I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_."

    def token_texts(tok):
        # Token texts produced by `tok` for the shared sample sentence.
        return [w.text for w in tok(text)]

    tokenizer = English.Defaults.create_tokenizer()
    # Without a special case, the underscores are split off.
    before = token_texts(tokenizer)
    assert "MATH" in before
    assert "_MATH_" not in before

    # Adding the special case after a first tokenization must take effect.
    tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
    after = token_texts(tokenizer)
    assert "_MATH_" in after
    assert "MATH" not in after

    # For sanity, check it works when pipeline is clean.
    clean_tokenizer = English.Defaults.create_tokenizer()
    clean_tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
    clean = token_texts(clean_tokenizer)
    assert "_MATH_" in clean
    assert "MATH" not in clean
@pytest.mark.xfail( @pytest.mark.xfail(
reason="g is split of as a unit, as the suffix regular expression can not look back further (variable-width)" reason="g is split of as a unit, as the suffix regular expression can not look back further (variable-width)"
) )

View File

@ -329,3 +329,4 @@ def test_issue_1971_4(en_vocab):
matches = matcher(doc) matches = matcher(doc)
# Uncommenting this caused a segmentation fault # Uncommenting this caused a segmentation fault
assert len(matches) == 1 assert len(matches) == 1
assert matches[0] == (en_vocab.strings["TEST"], 0, 3)

View File

@ -0,0 +1,57 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.en import English
import spacy
from spacy.tokenizer import Tokenizer
from spacy.tests.util import make_tempdir
def test_issue4190():
    """A customized tokenizer must tokenize identically after save and reload."""
    test_string = "Test c."

    def token_texts(nlp):
        # Run the pipeline on the sample string and collect the token texts.
        return [token.text for token in nlp(test_string)]

    # Load default language and tokenize once before the tokenizer is swapped.
    nlp_1 = English()
    token_texts(nlp_1)

    # Modify tokenizer
    customize_tokenizer(nlp_1)
    modified_result = token_texts(nlp_1)

    # Save and Reload
    with make_tempdir() as model_dir:
        nlp_1.to_disk(model_dir)
        nlp_2 = spacy.load(model_dir)

    # This should be the modified tokenizer
    reloaded_result = token_texts(nlp_2)
    assert modified_result == reloaded_result
def customize_tokenizer(nlp):
    """Replace `nlp.tokenizer` with one that has fewer tokenizer exceptions.

    The new tokenizer is built from the language defaults' prefix, suffix and
    infix rules and reuses the current tokenizer's `token_match`. Exceptions
    whose key is two characters long with "." as the second character
    (e.g. 'h.') are left out.
    """
    defaults = nlp.Defaults
    prefix_re = spacy.util.compile_prefix_regex(defaults.prefixes)
    suffix_re = spacy.util.compile_suffix_regex(defaults.suffixes)
    infix_re = spacy.util.compile_infix_regex(defaults.infixes)

    # remove all exceptions where a single letter is followed by a period (e.g. 'h.')
    exceptions = {}
    for key, value in dict(defaults.tokenizer_exceptions).items():
        if len(key) == 2 and key[1] == ".":
            continue
        exceptions[key] = value

    nlp.tokenizer = Tokenizer(
        nlp.vocab,
        exceptions,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
    )

View File

@ -4,7 +4,7 @@ from __future__ import unicode_literals
import pytest import pytest
from spacy.lang.en import English from spacy.lang.en import English
from spacy.cli.converters import conllu2json from spacy.cli.converters import conllu2json, iob2json, conll_ner2json
from spacy.cli.pretrain import make_docs from spacy.cli.pretrain import make_docs
@ -32,6 +32,95 @@ def test_cli_converters_conllu2json():
assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"] assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
def test_cli_converters_iob2json():
    """Four sentence-per-line IOB/IOB2 variants convert to the same tokens.

    The input covers two-column and three-column lines, each once with
    `I-GPE` and once with `B-GPE` on "London". With `n_sents=10` all four
    sentences are expected to be merged into one document.
    """
    input_data = "\n".join(
        [
            "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
            "I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
            "I|PRP|O like|VBP|O London|NNP|I-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O",
            "I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O",
        ]
    )
    converted = iob2json(input_data, n_sents=10)
    assert len(converted) == 1
    merged_doc = converted[0]
    assert merged_doc["id"] == 0
    assert len(merged_doc["paragraphs"]) == 1
    sentences = merged_doc["paragraphs"][0]["sentences"]
    assert len(sentences) == 4
    # fmt: off
    expected_orth = ["I", "like", "London", "and", "New", "York", "City", "."]
    expected_ner = ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
    # fmt: on
    for sent in sentences:
        tokens = sent["tokens"]
        assert len(tokens) == 8
        assert [t["orth"] for t in tokens] == expected_orth
        assert [t["ner"] for t in tokens] == expected_ner
def test_cli_converters_conll_ner2json():
    """Five CoNLL NER layout variants convert to the same tokens.

    The input starts with a -DOCSTART- line, followed by five sentences
    separated by blank lines: tab-separated two-column, space-separated
    two-, three- and four-column, and tab-separated four-column. With
    `n_sents=10` all five sentences are expected in a single document.
    """
    lines = [
        "-DOCSTART- -X- O O",
        "",
        "I\tO",
        "like\tO",
        "London\tB-GPE",
        "and\tO",
        "New\tB-GPE",
        "York\tI-GPE",
        "City\tI-GPE",
        ".\tO",
        "",
        "I O",
        "like O",
        "London B-GPE",
        "and O",
        "New B-GPE",
        "York I-GPE",
        "City I-GPE",
        ". O",
        "",
        "I PRP O",
        "like VBP O",
        "London NNP B-GPE",
        "and CC O",
        "New NNP B-GPE",
        "York NNP I-GPE",
        "City NNP I-GPE",
        ". . O",
        "",
        "I PRP _ O",
        "like VBP _ O",
        "London NNP _ B-GPE",
        "and CC _ O",
        "New NNP _ B-GPE",
        "York NNP _ I-GPE",
        "City NNP _ I-GPE",
        ". . _ O",
        "",
        "I\tPRP\t_\tO",
        "like\tVBP\t_\tO",
        "London\tNNP\t_\tB-GPE",
        "and\tCC\t_\tO",
        "New\tNNP\t_\tB-GPE",
        "York\tNNP\t_\tI-GPE",
        "City\tNNP\t_\tI-GPE",
        ".\t.\t_\tO",
    ]
    input_data = "\n".join(lines)
    converted = conll_ner2json(input_data, n_sents=10)
    assert len(converted) == 1
    assert converted[0]["id"] == 0
    assert len(converted[0]["paragraphs"]) == 1
    sentences = converted[0]["paragraphs"][0]["sentences"]
    assert len(sentences) == 5
    # Every layout variant must produce the same words and BILUO tags.
    for sent in sentences:
        tokens = sent["tokens"]
        assert len(tokens) == 8
        # fmt: off
        assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
        assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
        # fmt: on
def test_pretrain_make_docs(): def test_pretrain_make_docs():
nlp = English() nlp = English()

View File

@ -441,8 +441,13 @@ cdef class Tokenizer:
self.infix_finditer = re.compile(data["infix_finditer"]).finditer self.infix_finditer = re.compile(data["infix_finditer"]).finditer
if data.get("token_match"): if data.get("token_match"):
self.token_match = re.compile(data["token_match"]).match self.token_match = re.compile(data["token_match"]).match
if data.get("rules"):
# make sure to hard reset the cache to remove data from the default exceptions
self._rules = {}
self._cache = PreshMap()
for string, substrings in data.get("rules", {}).items(): for string, substrings in data.get("rules", {}).items():
self.add_special_case(string, substrings) self.add_special_case(string, substrings)
return self return self

View File

@ -109,13 +109,8 @@ cdef class Retokenizer:
def __exit__(self, *args): def __exit__(self, *args):
# Do the actual merging here # Do the actual merging here
if len(self.merges) > 1: if len(self.merges) >= 1:
_bulk_merge(self.doc, self.merges) _merge(self.doc, self.merges)
elif len(self.merges) == 1:
(span, attrs) = self.merges[0]
start = span.start
end = span.end
_merge(self.doc, start, end, attrs)
# Iterate in order, to keep things simple. # Iterate in order, to keep things simple.
for start_char, orths, heads, attrs in sorted(self.splits): for start_char, orths, heads, attrs in sorted(self.splits):
# Resolve token index # Resolve token index
@ -140,95 +135,7 @@ cdef class Retokenizer:
_split(self.doc, token_index, orths, head_indices, attrs) _split(self.doc, token_index, orths, head_indices, attrs)
def _merge(Doc doc, int start, int end, attributes): def _merge(Doc doc, merges):
"""Retokenize the document, such that the span at
`doc.text[start_idx : end_idx]` is merged into a single token. If
`start_idx` and `end_idx `do not mark start and end token boundaries,
the document remains unchanged.
start_idx (int): Character index of the start of the slice to merge.
end_idx (int): Character index after the end of the slice to merge.
**attributes: Attributes to assign to the merged token. By default,
attributes are inherited from the syntactic root of the span.
RETURNS (Token): The newly merged token, or `None` if the start and end
indices did not fall at token boundaries.
"""
cdef Span span = doc[start:end]
cdef int start_char = span.start_char
cdef int end_char = span.end_char
# Resize the doc.tensor, if it's set. Let the last row for each token stand
# for the merged region. To do this, we create a boolean array indicating
# whether the row is to be deleted, then use numpy.delete
if doc.tensor is not None and doc.tensor.size != 0:
doc.tensor = _resize_tensor(doc.tensor, [(start, end)])
# Get LexemeC for newly merged token
new_orth = ''.join([t.text_with_ws for t in span])
if span[-1].whitespace_:
new_orth = new_orth[:-len(span[-1].whitespace_)]
cdef const LexemeC* lex = doc.vocab.get(doc.mem, new_orth)
# House the new merged token where it starts
cdef TokenC* token = &doc.c[start]
token.spacy = doc.c[end-1].spacy
for attr_name, attr_value in attributes.items():
if attr_name == "_": # Set extension attributes
for ext_attr_key, ext_attr_value in attr_value.items():
doc[start]._.set(ext_attr_key, ext_attr_value)
elif attr_name == TAG:
doc.vocab.morphology.assign_tag(token, attr_value)
else:
# Set attributes on both token and lexeme to take care of token
# attribute vs. lexical attribute without having to enumerate them.
# If an attribute name is not valid, set_struct_attr will ignore it.
Token.set_struct_attr(token, attr_name, attr_value)
Lexeme.set_struct_attr(<LexemeC*>lex, attr_name, attr_value)
# Make sure ent_iob remains consistent
if doc.c[end].ent_iob == 1 and token.ent_iob in (0, 2):
if token.ent_type == doc.c[end].ent_type:
token.ent_iob = 3
else:
# If they're not the same entity type, let them be two entities
doc.c[end].ent_iob = 3
# Begin by setting all the head indices to absolute token positions
# This is easier to work with for now than the offsets
# Before thinking of something simpler, beware the case where a
# dependency bridges over the entity. Here the alignment of the
# tokens changes.
span_root = span.root.i
token.dep = span.root.dep
# We update token.lex after keeping span root and dep, since
# setting token.lex will change span.start and span.end properties
# as it modifies the character offsets in the doc
token.lex = lex
for i in range(doc.length):
doc.c[i].head += i
# Set the head of the merged token, and its dep relation, from the Span
token.head = doc.c[span_root].head
# Adjust deps before shrinking tokens
# Tokens which point into the merged token should now point to it
# Subtract the offset from all tokens which point to >= end
offset = (end - start) - 1
for i in range(doc.length):
head_idx = doc.c[i].head
if start <= head_idx < end:
doc.c[i].head = start
elif head_idx >= end:
doc.c[i].head -= offset
# Now compress the token array
for i in range(end, doc.length):
doc.c[i - offset] = doc.c[i]
for i in range(doc.length - offset, doc.length):
memset(&doc.c[i], 0, sizeof(TokenC))
doc.c[i].lex = &EMPTY_LEXEME
doc.length -= offset
for i in range(doc.length):
# ...And, set heads back to a relative position
doc.c[i].head -= i
# Set the left/right children, left/right edges
set_children_from_heads(doc.c, doc.length)
# Return the merged Python object
return doc[start]
def _bulk_merge(Doc doc, merges):
"""Retokenize the document, such that the spans described in 'merges' """Retokenize the document, such that the spans described in 'merges'
are merged into a single token. This method assumes that the merges are merged into a single token. This method assumes that the merges
are in the same order at which they appear in the doc, and that merges are in the same order at which they appear in the doc, and that merges
@ -256,6 +163,26 @@ def _bulk_merge(Doc doc, merges):
spans.append(span) spans.append(span)
# House the new merged token where it starts # House the new merged token where it starts
token = &doc.c[start] token = &doc.c[start]
# Initially set attributes to attributes of span root
token.tag = doc.c[span.root.i].tag
token.pos = doc.c[span.root.i].pos
token.morph = doc.c[span.root.i].morph
token.ent_iob = doc.c[span.root.i].ent_iob
token.ent_type = doc.c[span.root.i].ent_type
merged_iob = token.ent_iob
# If span root is part of an entity, merged token is B-ENT
if token.ent_iob in (1, 3):
merged_iob = 3
# If start token is I-ENT and previous token is of the same
# type, then I-ENT (could check I-ENT from start to span root)
if doc.c[start].ent_iob == 1 and start > 0 \
and doc.c[start].ent_type == token.ent_type \
and doc.c[start - 1].ent_type == token.ent_type:
merged_iob = 1
token.ent_iob = merged_iob
# Unset attributes that don't match new token
token.lemma = 0
token.norm = 0
tokens[merge_index] = token tokens[merge_index] = token
# Resize the doc.tensor, if it's set. Let the last row for each token stand # Resize the doc.tensor, if it's set. Let the last row for each token stand
# for the merged region. To do this, we create a boolean array indicating # for the merged region. To do this, we create a boolean array indicating
@ -351,17 +278,7 @@ def _bulk_merge(Doc doc, merges):
# Set the left/right children, left/right edges # Set the left/right children, left/right edges
set_children_from_heads(doc.c, doc.length) set_children_from_heads(doc.c, doc.length)
# Make sure ent_iob remains consistent # Make sure ent_iob remains consistent
for (span, _) in merges: make_iob_consistent(doc.c, doc.length)
if(span.end < len(offsets)):
# If it's not the last span
token_after_span_position = offsets[span.end]
if doc.c[token_after_span_position].ent_iob == 1\
and doc.c[token_after_span_position - 1].ent_iob in (0, 2):
if doc.c[token_after_span_position - 1].ent_type == doc.c[token_after_span_position].ent_type:
doc.c[token_after_span_position - 1].ent_iob = 3
else:
# If they're not the same entity type, let them be two entities
doc.c[token_after_span_position].ent_iob = 3
# Return the merged Python object # Return the merged Python object
return doc[spans[0].start] return doc[spans[0].start]
@ -480,3 +397,12 @@ def _validate_extensions(extensions):
raise ValueError(Errors.E118.format(attr=key)) raise ValueError(Errors.E118.format(attr=key))
if not is_writable_attr(extension): if not is_writable_attr(extension):
raise ValueError(Errors.E119.format(attr=key)) raise ValueError(Errors.E119.format(attr=key))
cdef make_iob_consistent(TokenC* tokens, int length):
    # Repair entity IOB codes in place so that no token is marked as the
    # continuation of an entity (ent_iob == 1, "I") without a preceding token
    # of the same entity type. Such tokens are promoted to entity starts
    # (ent_iob == 3, "B").
    #
    # tokens: C array of token structs to fix up in place.
    # length: number of valid entries in `tokens`.
    cdef int i
    # Nothing to repair in an empty array; this also avoids touching
    # tokens[0] when there is no token at all.
    if length <= 0:
        return
    # The first token has no predecessor, so it can never continue an entity.
    if tokens[0].ent_iob == 1:
        tokens[0].ent_iob = 3
    for i in range(1, length):
        # An "I" token whose predecessor carries a different entity type
        # must start a new entity.
        if tokens[i].ent_iob == 1 and tokens[i - 1].ent_type != tokens[i].ent_type:
            tokens[i].ent_iob = 3

View File

@ -145,6 +145,8 @@ $ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter]
| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | option | Type of file to create (see below). | | `--file-type`, `-t` <Tag variant="new">2.1</Tag> | option | Type of file to create (see below). |
| `--converter`, `-c` <Tag variant="new">2</Tag> | option | Name of converter to use (see below). | | `--converter`, `-c` <Tag variant="new">2</Tag> | option | Name of converter to use (see below). |
| `--n-sents`, `-n` | option | Number of sentences per document. | | `--n-sents`, `-n` | option | Number of sentences per document. |
| `--seg-sents`, `-s` <Tag variant="new">2.2</Tag> | flag | Segment sentences (for `-c ner`) |
| `--model`, `-b` <Tag variant="new">2.2</Tag> | option | Model for parser-based sentence segmentation (for `-s`) |
| `--morphology`, `-m` | option | Enable appending morphology to tags. | | `--morphology`, `-m` | option | Enable appending morphology to tags. |
| `--lang`, `-l` <Tag variant="new">2.1</Tag> | option | Language code (if tokenizer required). | | `--lang`, `-l` <Tag variant="new">2.1</Tag> | option | Language code (if tokenizer required). |
| `--help`, `-h` | flag | Show help message and available arguments. | | `--help`, `-h` | flag | Show help message and available arguments. |
@ -174,10 +176,10 @@ All output files generated by this command are compatible with
| ID | Description | | ID | Description |
| ------------------------------ | --------------------------------------------------------------- | | ------------------------------ | --------------------------------------------------------------- |
| `auto` | Automatically pick converter based on file extension (default). | | `auto` | Automatically pick converter based on file extension and file content (default). |
| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. | | `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. |
| `ner` | Tab-based named entity recognition format. | | `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
| `iob` | IOB or IOB2 named entity recognition format. | | `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
## Train {#train} ## Train {#train}

View File

@ -639,7 +639,7 @@ Yield an infinite series of linearly decaying values.
Shuffle an iterator. This works by holding `bufsize` items back and yielding Shuffle an iterator. This works by holding `bufsize` items back and yielding
them sometime later. Obviously, this is not unbiased but should be good enough them sometime later. Obviously, this is not unbiased but should be good enough
for batching. Larger `buffsize` means less bias. for batching. Larger `bufsize` means less bias.
> #### Example > #### Example
> >
@ -649,9 +649,9 @@ for batching. Larger `buffsize` means less bias.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ---------- | -------- | ---------------------- | | ---------- | -------- | ------------------------------------- |
| `iterable` | iterable | Iterator to shuffle. | | `iterable` | iterable | Iterator to shuffle. |
| `buffsize` | int | Items to hold back. | | `bufsize` | int | Items to hold back (default: 1000). |
| **YIELDS** | iterable | The shuffled iterator. | | **YIELDS** | iterable | The shuffled iterator. |
### util.filter_spans {#util.filter_spans tag="function" new="2.1.4"} ### util.filter_spans {#util.filter_spans tag="function" new="2.1.4"}

View File

@ -26,6 +26,14 @@ import PosDeps101 from 'usage/101/\_pos-deps.md'
<PosDeps101 /> <PosDeps101 />
<Infobox title="📖 Part-of-speech tag scheme">
For a list of the fine-grained and coarse-grained part-of-speech tags assigned
by spaCy's models across different languages, see the
[POS tag scheme documentation](/api/annotation#pos-tagging).
</Infobox>
### Rule-based morphology {#rule-based-morphology} ### Rule-based morphology {#rule-based-morphology}
Inflectional morphology is the process by which a root form of a word is Inflectional morphology is the process by which a root form of a word is
@ -62,13 +70,6 @@ of the two. The system works as follows:
lemmatizer also accepts list-based exception files, acquired from lemmatizer also accepts list-based exception files, acquired from
[WordNet](https://wordnet.princeton.edu/). [WordNet](https://wordnet.princeton.edu/).
<Infobox title="📖 Part-of-speech tag scheme">
For a list of the fine-grained and coarse-grained part-of-speech tags assigned
by spaCy's models across different languages, see the
[POS tag scheme documentation](/api/annotation#pos-tagging).
</Infobox>
## Dependency Parsing {#dependency-parse model="parser"} ## Dependency Parsing {#dependency-parse model="parser"}
@ -289,7 +290,7 @@ for token in doc:
For a list of the syntactic dependency labels assigned by spaCy's models across For a list of the syntactic dependency labels assigned by spaCy's models across
different languages, see the different languages, see the
[dependency label scheme documentation](/api/annotation#pos-tagging). [dependency label scheme documentation](/api/annotation#dependency-parsing).
</Infobox> </Infobox>

View File

@ -10,6 +10,7 @@
"en_vectors_web_lg", "en_vectors_web_lg",
"en_pytt_bertbaseuncased_lg", "en_pytt_bertbaseuncased_lg",
"en_pytt_robertabase_lg", "en_pytt_robertabase_lg",
"en_pytt_distilbertbaseuncased_lg",
"en_pytt_xlnetbasecased_lg" "en_pytt_xlnetbasecased_lg"
], ],
"example": "This is a sentence.", "example": "This is a sentence.",

View File

@ -1562,7 +1562,7 @@
}, },
{ {
"id": "pyInflect", "id": "pyInflect",
"slogan": "A python module for word inflections", "slogan": "A Python module for word inflections",
"description": "This package uses the [spaCy 2.0 extensions](https://spacy.io/usage/processing-pipelines#extensions) to add word inflections to the system.", "description": "This package uses the [spaCy 2.0 extensions](https://spacy.io/usage/processing-pipelines#extensions) to add word inflections to the system.",
"github": "bjascob/pyInflect", "github": "bjascob/pyInflect",
"pip": "pyinflect", "pip": "pyinflect",
@ -1582,6 +1582,29 @@
"category": ["pipeline"], "category": ["pipeline"],
"tags": ["inflection"] "tags": ["inflection"]
}, },
{
"id": "lemminflect",
"slogan": "A Python module for English lemmatization and inflection",
"description": "LemmInflect uses a dictionary approach to lemmatize English words and inflect them into forms specified by a user supplied [Universal Dependencies](https://universaldependencies.org/u/pos/) or [Penn Treebank](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html) tag. The library works with out-of-vocabulary (OOV) words by applying neural network techniques to classify word forms and choose the appropriate morphing rules. The system acts as a standalone module or as an extension to spaCy.",
"github": "bjascob/LemmInflect",
"pip": "lemminflect",
"thumb": "https://raw.githubusercontent.com/bjascob/LemmInflect/master/docs/img/icons8-citrus-80.png",
"code_example": [
"import spacy",
"import lemminflect",
"",
"nlp = spacy.load('en_core_web_sm')",
"doc = nlp('I am testing this example.')",
"doc[2]._.lemma() # 'test'",
"doc[4]._.inflect('NNS') # 'examples'"
],
"author": "Brad Jascob",
"author_links": {
"github": "bjascob"
},
"category": ["pipeline"],
"tags": ["inflection", "lemmatizer"]
},
{ {
"id": "blackstone", "id": "blackstone",
"title": "Blackstone", "title": "Blackstone",
@ -1744,6 +1767,21 @@
"twitter": "yanaiela", "twitter": "yanaiela",
"website": "https://yanaiela.github.io" "website": "https://yanaiela.github.io"
} }
},
{
"id": "presidio",
"title": "Presidio",
"slogan": "Context aware, pluggable and customizable data protection and PII data anonymization",
"description": "Presidio *(from the Latin praesidium, meaning protection or garrison)* helps to ensure sensitive text is properly managed and governed. It provides fast ***analytics*** and ***anonymization*** for sensitive text such as credit card numbers, names, locations, social security numbers, bitcoin wallets, US phone numbers and financial data. Presidio analyzes the text using predefined or custom recognizers to identify entities, patterns, formats, and checksums with relevant context.",
"url": "https://aka.ms/presidio",
"image": "https://raw.githubusercontent.com/microsoft/presidio/master/docs/assets/before-after.png",
"github": "microsoft/presidio",
"category": ["standalone"],
"thumb": "https://avatars0.githubusercontent.com/u/6154722",
"author": "Microsoft",
"author_links": {
"github": "microsoft"
}
} }
], ],

View File

@ -8,6 +8,8 @@ import Icon from './icon'
import classes from '../styles/link.module.sass' import classes from '../styles/link.module.sass'
import { isString } from './util' import { isString } from './util'
// Matches http(s) URLs that point at prodi.gy, spacy.io or irl.spacy.io
// (case-insensitive).
// NOTE: deliberately no `g` flag. A global regex remembers `lastIndex` between
// `.test()` calls, so reusing it across strings makes every other call on a
// matching URL return false. Dots are escaped so they only match a literal ".".
const internalRegex = /(http(s?)):\/\/(prodi\.gy|spacy\.io|irl\.spacy\.io)/i
const Whitespace = ({ children }) => ( const Whitespace = ({ children }) => (
// Ensure that links are always wrapped in spaces // Ensure that links are always wrapped in spaces
<> {children} </> <> {children} </>
@ -68,13 +70,15 @@ const Link = ({
</Wrapper> </Wrapper>
) )
} }
const isInternal = internalRegex.test(dest)
const rel = isInternal ? null : 'noopener nofollow noreferrer'
return ( return (
<Wrapper> <Wrapper>
<OutboundLink <OutboundLink
href={dest} href={dest}
className={linkClassNames} className={linkClassNames}
target="_blank" target="_blank"
rel="noopener nofollow noreferrer" rel={rel}
{...other} {...other}
> >
{content} {content}