* Use an AttributeCodec based on ORTH for words. There is still no out-of-vocabulary (OOV) handling mechanism.

This commit is contained in:
Matthew Honnibal 2015-07-18 22:43:18 +02:00
parent 82d84b0f2b
commit 5b4c78bbb2

View File

@ -8,7 +8,7 @@ from libcpp.pair cimport pair
from cymem.cymem cimport Address, Pool
from preshed.maps cimport PreshMap
from ..attrs cimport ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
from ..attrs cimport ORTH, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
from ..typedefs cimport attr_t
@ -34,17 +34,6 @@ cimport cython
# Entity tag
# Build a HuffmanCodec over the vocabulary from the lexemes' `prob` values.
# Takes a Vocab and returns HuffmanCodec(cv_probs), where cv_probs[i] holds
# c_exp(vocab.lexemes[i].prob) for every i in range(len(vocab)).
def make_vocab_codec(Vocab vocab):
cdef int length = len(vocab)
# Address is constructed with `length` elements of sizeof(float) each; the
# float buffer used below is reached through `mem.ptr`.
cdef Address mem = Address(length, sizeof(float))
probs = <float*>mem.ptr
cdef int i
for i in range(length):
# presumably `prob` is a log-probability and c_exp is C's exp() -- TODO confirm
probs[i] = <float>c_exp(vocab.lexemes[i].prob)
# Expose the raw float pointer as a typed memoryview with len(vocab) entries.
cdef float[:] cv_probs = <float[:len(vocab)]>probs
# NOTE(review): cv_probs views memory reached through `mem`, which is local to
# this function -- confirm HuffmanCodec does not keep the view past this call.
return HuffmanCodec(cv_probs)
cdef class _BinaryCodec:
def encode(self, attr_t[:] msg, BitArray bits):
cdef int i
@ -112,9 +101,7 @@ cdef class Packer:
attrs = []
for attr, freqs in list_of_attr_freqs:
if attr == ID:
codecs.append(make_vocab_codec(vocab))
elif attr == SPACY:
if attr == SPACY:
codecs.append(_BinaryCodec())
else:
codecs.append(_AttributeCodec(freqs))