mirror of
https://github.com/explosion/spaCy.git
synced 2025-05-31 03:03:17 +03:00
morphologizer: avoid recreating label tuple for each token (#9764)
* morphologizer: avoid recreating label tuple for each token

The `labels` property converts the dictionary key set to a tuple. This property was used for every annotated token, recreating the tuple over and over again. Construct the tuple once in the set_annotations function and reuse it.

On a Finnish pipeline that I was experimenting with, this results in a speedup of ~15% (~13000 -> ~15000 WPS).

* tagger: avoid recreating label tuple for each token
This commit is contained in:
parent
c19f0c1604
commit
72f7f4e68a
|
@@ -231,12 +231,13 @@ class Morphologizer(Tagger):
|
||||||
cdef Vocab vocab = self.vocab
|
cdef Vocab vocab = self.vocab
|
||||||
cdef bint overwrite = self.cfg["overwrite"]
|
cdef bint overwrite = self.cfg["overwrite"]
|
||||||
cdef bint extend = self.cfg["extend"]
|
cdef bint extend = self.cfg["extend"]
|
||||||
|
labels = self.labels
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
doc_tag_ids = batch_tag_ids[i]
|
doc_tag_ids = batch_tag_ids[i]
|
||||||
if hasattr(doc_tag_ids, "get"):
|
if hasattr(doc_tag_ids, "get"):
|
||||||
doc_tag_ids = doc_tag_ids.get()
|
doc_tag_ids = doc_tag_ids.get()
|
||||||
for j, tag_id in enumerate(doc_tag_ids):
|
for j, tag_id in enumerate(doc_tag_ids):
|
||||||
morph = self.labels[tag_id]
|
morph = labels[tag_id]
|
||||||
# set morph
|
# set morph
|
||||||
if doc.c[j].morph == 0 or overwrite or extend:
|
if doc.c[j].morph == 0 or overwrite or extend:
|
||||||
if overwrite and extend:
|
if overwrite and extend:
|
||||||
|
|
|
@@ -166,13 +166,14 @@ class Tagger(TrainablePipe):
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
cdef Vocab vocab = self.vocab
|
cdef Vocab vocab = self.vocab
|
||||||
cdef bint overwrite = self.cfg["overwrite"]
|
cdef bint overwrite = self.cfg["overwrite"]
|
||||||
|
labels = self.labels
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
doc_tag_ids = batch_tag_ids[i]
|
doc_tag_ids = batch_tag_ids[i]
|
||||||
if hasattr(doc_tag_ids, "get"):
|
if hasattr(doc_tag_ids, "get"):
|
||||||
doc_tag_ids = doc_tag_ids.get()
|
doc_tag_ids = doc_tag_ids.get()
|
||||||
for j, tag_id in enumerate(doc_tag_ids):
|
for j, tag_id in enumerate(doc_tag_ids):
|
||||||
if doc.c[j].tag == 0 or overwrite:
|
if doc.c[j].tag == 0 or overwrite:
|
||||||
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
|
doc.c[j].tag = self.vocab.strings[labels[tag_id]]
|
||||||
|
|
||||||
def update(self, examples, *, drop=0., sgd=None, losses=None):
|
def update(self, examples, *, drop=0., sgd=None, losses=None):
|
||||||
"""Learn from a batch of documents and gold-standard information,
|
"""Learn from a batch of documents and gold-standard information,
|
||||||
|
|
Loading…
Reference in New Issue
Block a user