* Use an AttributeCodec based on ORTH for words. There is still no mechanism for handling out-of-vocabulary (OOV) words.

This commit is contained in:
Matthew Honnibal 2015-07-18 22:43:18 +02:00
parent 82d84b0f2b
commit 5b4c78bbb2

View File

@ -8,7 +8,7 @@ from libcpp.pair cimport pair
from cymem.cymem cimport Address, Pool from cymem.cymem cimport Address, Pool
from preshed.maps cimport PreshMap from preshed.maps cimport PreshMap
from ..attrs cimport ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE from ..attrs cimport ORTH, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..vocab cimport Vocab from ..vocab cimport Vocab
from ..typedefs cimport attr_t from ..typedefs cimport attr_t
@ -34,17 +34,6 @@ cimport cython
# Entity tag # Entity tag
def make_vocab_codec(Vocab vocab):
    """Build a HuffmanCodec weighted by the vocabulary's lexeme probabilities.

    One float weight is computed per vocabulary entry, in index order, as
    ``c_exp(vocab.lexemes[i].prob)``; the resulting array is handed to
    ``HuffmanCodec`` as a typed memoryview.

    NOTE(review): ``prob`` is presumably a log-probability (hence the
    exponentiation) -- confirm against the lexeme definition.
    """
    cdef int n_entries = len(vocab)
    cdef int idx
    # The Address object owns the raw buffer, so it must stay alive for as
    # long as the pointer / memoryview below are in use.
    cdef Address buf = Address(n_entries, sizeof(float))
    cdef float* weights = <float*>buf.ptr
    for idx in range(n_entries):
        weights[idx] = <float>c_exp(vocab.lexemes[idx].prob)
    # Expose the C array as a 1-D float memoryview for the codec constructor.
    cdef float[:] weights_view = <float[:len(vocab)]>weights
    return HuffmanCodec(weights_view)
cdef class _BinaryCodec: cdef class _BinaryCodec:
def encode(self, attr_t[:] msg, BitArray bits): def encode(self, attr_t[:] msg, BitArray bits):
cdef int i cdef int i
@ -112,9 +101,7 @@ cdef class Packer:
attrs = [] attrs = []
for attr, freqs in list_of_attr_freqs: for attr, freqs in list_of_attr_freqs:
if attr == ID: if attr == SPACY:
codecs.append(make_vocab_codec(vocab))
elif attr == SPACY:
codecs.append(_BinaryCodec()) codecs.append(_BinaryCodec())
else: else:
codecs.append(_AttributeCodec(freqs)) codecs.append(_AttributeCodec(freqs))