* Use an _AttributeCodec based on ORTH for words. There is still no mechanism for handling out-of-vocabulary (OOV) words.

2025-07-16 03:02:41 +03:00 · 2015-07-18 22:43:18 +02:00 · 2015-07-18 22:43:18 +02:00 · 5b4c78bbb2
commit 5b4c78bbb2
parent 82d84b0f2b
1 changed files with 2 additions and 15 deletions
--- a/spacy/serialize/packer.pyx
+++ b/spacy/serialize/packer.pyx
@@ -8,7 +8,7 @@ from libcpp.pair cimport pair
 from cymem.cymem cimport Address, Pool
 from preshed.maps cimport PreshMap

-from ..attrs cimport ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
+from ..attrs cimport ORTH, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 from ..tokens.doc cimport Doc
 from ..vocab cimport Vocab
 from ..typedefs cimport attr_t
@@ -34,17 +34,6 @@ cimport cython
 #       Entity tag


-def make_vocab_codec(Vocab vocab):
-    cdef int length = len(vocab)
-    cdef Address mem = Address(length, sizeof(float))
-    probs = <float*>mem.ptr
-    cdef int i
-    for i in range(length):
-        probs[i] = <float>c_exp(vocab.lexemes[i].prob)
-    cdef float[:] cv_probs = <float[:len(vocab)]>probs
-    return HuffmanCodec(cv_probs)
-
-
 cdef class _BinaryCodec:
    def encode(self, attr_t[:] msg, BitArray bits):
        cdef int i
@@ -112,9 +101,7 @@ cdef class Packer:
        attrs = []

        for attr, freqs in list_of_attr_freqs:
-            if attr == ID:
-                codecs.append(make_vocab_codec(vocab))
-            elif attr == SPACY:
+            if attr == SPACY:
                codecs.append(_BinaryCodec())
            else:
                codecs.append(_AttributeCodec(freqs))