* Rename NORM1 and NORM2 attrs to lower and norm

This commit is contained in:
Matthew Honnibal 2015-01-24 06:17:03 +11:00
parent 75feb52c5d
commit fda94271af
9 changed files with 42 additions and 47 deletions

View File

@ -20,8 +20,8 @@ def get_lex_props(string):
'flags': get_flags(string),
'length': len(string),
'orth': string,
'norm1': string.lower(),
'norm2': string,
'lower': string.lower(),
'norm': string,
'shape': orth.word_shape(string),
'prefix': string[0],
'suffix': string[-3:],

View File

@ -2,13 +2,14 @@ from ..attrs cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7
from ..attrs cimport FLAG8, FLAG9, FLAG10
from ..attrs cimport ORTH as _ORTH
from ..attrs cimport SHAPE as _SHAPE
from ..attrs cimport NORM1 as _NORM1
from ..attrs cimport NORM2 as _NORM2
from ..attrs cimport LOWER as _LOWER
from ..attrs cimport NORM as _NORM
from ..attrs cimport CLUSTER as _CLUSTER
from ..attrs cimport PREFIX as _PREFIX
from ..attrs cimport SUFFIX as _SUFFIX
from ..attrs cimport LEMMA as _LEMMA
from ..attrs cimport POS as _POS
from ..attrs cimport TAG as _TAG
cpdef enum:
@ -26,10 +27,11 @@ cpdef enum:
ORTH = _ORTH
SHAPE = _SHAPE
LOWER = _NORM1
NORM2 = _NORM2
LOWER = _LOWER
NORM = _NORM
PREFIX = _PREFIX
SUFFIX = _SUFFIX
CLUSTER = _CLUSTER
LEMMA = _LEMMA
POS = _POS
TAG = _TAG

View File

@ -1,5 +1,5 @@
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
from .typedefs cimport ID, ORTH, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from .typedefs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from .structs cimport LexemeC
from .strings cimport StringStore
@ -21,15 +21,15 @@ cdef class Lexeme:
cdef readonly attr_t length
cdef readonly attr_t orth
cdef readonly attr_t norm1
cdef readonly attr_t norm2
cdef readonly attr_t lower
cdef readonly attr_t norm
cdef readonly attr_t shape
cdef readonly attr_t prefix
cdef readonly attr_t suffix
cdef readonly unicode orth_
cdef readonly unicode norm1_
cdef readonly unicode norm2_
cdef readonly unicode lower_
cdef readonly unicode norm_
cdef readonly unicode shape_
cdef readonly unicode prefix_
cdef readonly unicode suffix_
@ -50,15 +50,15 @@ cdef class Lexeme:
py.length = ptr.length
py.orth = ptr.orth
py.norm1 = ptr.norm1
py.norm2 = ptr.norm2
py.lower = ptr.lower
py.norm = ptr.norm
py.shape = ptr.shape
py.prefix = ptr.prefix
py.suffix = ptr.suffix
py.orth_ = strings[ptr.orth]
py.norm1_ = strings[ptr.norm1]
py.norm2_ = strings[ptr.norm2]
py.lower_ = strings[ptr.lower]
py.norm_ = strings[ptr.norm]
py.shape_ = strings[ptr.shape]
py.prefix_ = strings[ptr.prefix]
py.suffix_ = strings[ptr.suffix]
@ -80,10 +80,10 @@ cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
return lex.id
elif feat_name == ORTH:
return lex.orth
elif feat_name == NORM1:
return lex.norm1
elif feat_name == NORM2:
return lex.norm2
elif feat_name == LOWER:
return lex.lower
elif feat_name == NORM:
return lex.norm
elif feat_name == SHAPE:
return lex.shape
elif feat_name == PREFIX:

View File

@ -17,8 +17,8 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store
const float* empty_vec) except -1:
lex.length = props['length']
lex.orth = string_store[props['orth']]
lex.norm1 = string_store[props['norm1']]
lex.norm2 = string_store[props['norm2']]
lex.lower = string_store[props['lower']]
lex.norm = string_store[props['norm']]
lex.shape = string_store[props['shape']]
lex.prefix = string_store[props['prefix']]
lex.suffix = string_store[props['suffix']]

View File

@ -12,8 +12,8 @@ cdef struct LexemeC:
attr_t length
attr_t orth
attr_t norm1
attr_t norm2
attr_t lower
attr_t norm
attr_t shape
attr_t prefix
attr_t suffix

View File

@ -51,8 +51,8 @@ cdef class Token:
cdef readonly attr_t cluster
cdef readonly attr_t length
cdef readonly attr_t orth
cdef readonly attr_t norm1
cdef readonly attr_t norm2
cdef readonly attr_t lower
cdef readonly attr_t norm
cdef readonly attr_t shape
cdef readonly attr_t prefix
cdef readonly attr_t suffix

View File

@ -7,7 +7,7 @@ from preshed.counter cimport PreshCounter
from .vocab cimport EMPTY_LEXEME
from .typedefs cimport attr_id_t, attr_t
from .typedefs cimport LEMMA
from .typedefs cimport ID, ORTH, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from .typedefs cimport POS, LEMMA
from unidecode import unidecode
@ -44,10 +44,10 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
return lex.id
elif feat_name == ORTH:
return lex.orth
elif feat_name == NORM1:
return lex.norm1
elif feat_name == NORM2:
return lex.norm2
elif feat_name == LOWER:
return lex.lower
elif feat_name == NORM:
return lex.norm
elif feat_name == SHAPE:
return lex.shape
elif feat_name == PREFIX:
@ -223,8 +223,8 @@ cdef class Token:
self.cluster = t.lex.cluster
self.length = t.lex.length
self.orth = t.lex.orth
self.norm1 = t.lex.norm1
self.norm2 = t.lex.norm2
self.lower = t.lex.lower
self.norm = t.lex.norm
self.shape = t.lex.shape
self.prefix = t.lex.prefix
self.suffix = t.lex.suffix
@ -254,12 +254,6 @@ cdef class Token:
"""
return self._seq.data[self.i].lex.length
def check_flag(self, attr_id_t flag):
return self.flags & (1 << flag)
def is_pos(self, univ_tag_t pos):
return self.tag == pos
property head:
"""The token predicted by the parser to be the head of the current token."""
def __get__(self):
@ -267,7 +261,6 @@ cdef class Token:
return Token(self._seq, self.i + t.head)
property string:
"""The unicode string of the word, with no whitespace padding."""
def __get__(self):
cdef const TokenC* t = &self._seq.data[self.i]
if t.lex.orth == 0:
@ -279,13 +272,13 @@ cdef class Token:
def __get__(self):
return self._seq.vocab.strings[self.orth]
property norm1_:
property lower_:
def __get__(self):
return self._seq.vocab.strings[self.norm1]
return self._seq.vocab.strings[self.lower]
property norm2_:
property norm_:
def __get__(self):
return self._seq.vocab.strings[self.norm2]
return self._seq.vocab.strings[self.norm]
property shape_:
def __get__(self):

View File

@ -90,8 +90,8 @@ cpdef enum attr_id_t:
ID
ORTH
NORM1
NORM2
LOWER
NORM
SHAPE
PREFIX
SUFFIX

View File

@ -195,8 +195,8 @@ cdef class Vocab:
for i in range(self.lexemes.size()):
# Cast away the const, cos we can modify our lexemes
lex = <LexemeC*>self.lexemes[i]
if lex.norm1 < vectors.size():
lex.repvec = vectors[lex.norm1]
if lex.lower < vectors.size():
lex.repvec = vectors[lex.lower]
else:
lex.repvec = EMPTY_VEC