mirror of
https://github.com/explosion/spaCy.git
synced 2025-05-31 03:03:17 +03:00
morphologizer: avoid recreating label tuple for each token (#9764)
* morphologizer: avoid recreating label tuple for each token

The `labels` property converts the dictionary key set to a tuple. This property was used for every annotated token, recreating the tuple over and over again. Construct the tuple once in the set_annotations function and reuse it.

On a Finnish pipeline that I was experimenting with, this results in a speedup of ~15% (~13000 -> ~15000 WPS).

* tagger: avoid recreating label tuple for each token
This commit is contained in:
parent
c19f0c1604
commit
72f7f4e68a
|
@@ -231,12 +231,13 @@ class Morphologizer(Tagger):
|
||||||
cdef Vocab vocab = self.vocab
|
cdef Vocab vocab = self.vocab
|
||||||
cdef bint overwrite = self.cfg["overwrite"]
|
cdef bint overwrite = self.cfg["overwrite"]
|
||||||
cdef bint extend = self.cfg["extend"]
|
cdef bint extend = self.cfg["extend"]
|
||||||
|
labels = self.labels
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
doc_tag_ids = batch_tag_ids[i]
|
doc_tag_ids = batch_tag_ids[i]
|
||||||
if hasattr(doc_tag_ids, "get"):
|
if hasattr(doc_tag_ids, "get"):
|
||||||
doc_tag_ids = doc_tag_ids.get()
|
doc_tag_ids = doc_tag_ids.get()
|
||||||
for j, tag_id in enumerate(doc_tag_ids):
|
for j, tag_id in enumerate(doc_tag_ids):
|
||||||
morph = self.labels[tag_id]
|
morph = labels[tag_id]
|
||||||
# set morph
|
# set morph
|
||||||
if doc.c[j].morph == 0 or overwrite or extend:
|
if doc.c[j].morph == 0 or overwrite or extend:
|
||||||
if overwrite and extend:
|
if overwrite and extend:
|
||||||
|
|
|
@@ -166,13 +166,14 @@ class Tagger(TrainablePipe):
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
cdef Vocab vocab = self.vocab
|
cdef Vocab vocab = self.vocab
|
||||||
cdef bint overwrite = self.cfg["overwrite"]
|
cdef bint overwrite = self.cfg["overwrite"]
|
||||||
|
labels = self.labels
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
doc_tag_ids = batch_tag_ids[i]
|
doc_tag_ids = batch_tag_ids[i]
|
||||||
if hasattr(doc_tag_ids, "get"):
|
if hasattr(doc_tag_ids, "get"):
|
||||||
doc_tag_ids = doc_tag_ids.get()
|
doc_tag_ids = doc_tag_ids.get()
|
||||||
for j, tag_id in enumerate(doc_tag_ids):
|
for j, tag_id in enumerate(doc_tag_ids):
|
||||||
if doc.c[j].tag == 0 or overwrite:
|
if doc.c[j].tag == 0 or overwrite:
|
||||||
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
|
doc.c[j].tag = self.vocab.strings[labels[tag_id]]
|
||||||
|
|
||||||
def update(self, examples, *, drop=0., sgd=None, losses=None):
|
def update(self, examples, *, drop=0., sgd=None, losses=None):
|
||||||
"""Learn from a batch of documents and gold-standard information,
|
"""Learn from a batch of documents and gold-standard information,
|
||||||
|
|
Loading…
Reference in New Issue
Block a user