mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
morphologizer: avoid recreating label tuple for each token (#9764)
* morphologizer: avoid recreating label tuple for each token

  The `labels` property converts the dictionary key set to a tuple. This property was used for every annotated token, recreating the tuple over and over again. Construct the tuple once in the `set_annotations` function and reuse it. On a Finnish pipeline that I was experimenting with, this results in a speedup of ~15% (~13000 -> ~15000 WPS).

* tagger: avoid recreating label tuple for each token
This commit is contained in:
parent
c19f0c1604
commit
72f7f4e68a
|
@ -231,12 +231,13 @@ class Morphologizer(Tagger):
|
|||
cdef Vocab vocab = self.vocab
|
||||
cdef bint overwrite = self.cfg["overwrite"]
|
||||
cdef bint extend = self.cfg["extend"]
|
||||
labels = self.labels
|
||||
for i, doc in enumerate(docs):
|
||||
doc_tag_ids = batch_tag_ids[i]
|
||||
if hasattr(doc_tag_ids, "get"):
|
||||
doc_tag_ids = doc_tag_ids.get()
|
||||
for j, tag_id in enumerate(doc_tag_ids):
|
||||
morph = self.labels[tag_id]
|
||||
morph = labels[tag_id]
|
||||
# set morph
|
||||
if doc.c[j].morph == 0 or overwrite or extend:
|
||||
if overwrite and extend:
|
||||
|
|
|
@ -166,13 +166,14 @@ class Tagger(TrainablePipe):
|
|||
cdef Doc doc
|
||||
cdef Vocab vocab = self.vocab
|
||||
cdef bint overwrite = self.cfg["overwrite"]
|
||||
labels = self.labels
|
||||
for i, doc in enumerate(docs):
|
||||
doc_tag_ids = batch_tag_ids[i]
|
||||
if hasattr(doc_tag_ids, "get"):
|
||||
doc_tag_ids = doc_tag_ids.get()
|
||||
for j, tag_id in enumerate(doc_tag_ids):
|
||||
if doc.c[j].tag == 0 or overwrite:
|
||||
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
|
||||
doc.c[j].tag = self.vocab.strings[labels[tag_id]]
|
||||
|
||||
def update(self, examples, *, drop=0., sgd=None, losses=None):
|
||||
"""Learn from a batch of documents and gold-standard information,
|
||||
|
|
Loading…
Reference in New Issue
Block a user