mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
* Rename NORM1 and NORM2 attrs to lower and norm
This commit is contained in:
parent
75feb52c5d
commit
fda94271af
|
@ -20,8 +20,8 @@ def get_lex_props(string):
|
|||
'flags': get_flags(string),
|
||||
'length': len(string),
|
||||
'orth': string,
|
||||
'norm1': string.lower(),
|
||||
'norm2': string,
|
||||
'lower': string.lower(),
|
||||
'norm': string,
|
||||
'shape': orth.word_shape(string),
|
||||
'prefix': string[0],
|
||||
'suffix': string[-3:],
|
||||
|
|
|
@ -2,13 +2,14 @@ from ..attrs cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7
|
|||
from ..attrs cimport FLAG8, FLAG9, FLAG10
|
||||
from ..attrs cimport ORTH as _ORTH
|
||||
from ..attrs cimport SHAPE as _SHAPE
|
||||
from ..attrs cimport NORM1 as _NORM1
|
||||
from ..attrs cimport NORM2 as _NORM2
|
||||
from ..attrs cimport LOWER as _LOWER
|
||||
from ..attrs cimport NORM as _NORM
|
||||
from ..attrs cimport CLUSTER as _CLUSTER
|
||||
from ..attrs cimport PREFIX as _PREFIX
|
||||
from ..attrs cimport SUFFIX as _SUFFIX
|
||||
from ..attrs cimport LEMMA as _LEMMA
|
||||
from ..attrs cimport POS as _POS
|
||||
from ..attrs cimport TAG as _TAG
|
||||
|
||||
|
||||
cpdef enum:
|
||||
|
@ -26,10 +27,11 @@ cpdef enum:
|
|||
|
||||
ORTH = _ORTH
|
||||
SHAPE = _SHAPE
|
||||
LOWER = _NORM1
|
||||
NORM2 = _NORM2
|
||||
LOWER = _LOWER
|
||||
NORM = _NORM
|
||||
PREFIX = _PREFIX
|
||||
SUFFIX = _SUFFIX
|
||||
CLUSTER = _CLUSTER
|
||||
LEMMA = _LEMMA
|
||||
POS = _POS
|
||||
TAG = _TAG
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
|
||||
from .typedefs cimport ID, ORTH, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||
from .typedefs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||
from .structs cimport LexemeC
|
||||
from .strings cimport StringStore
|
||||
|
||||
|
@ -21,15 +21,15 @@ cdef class Lexeme:
|
|||
cdef readonly attr_t length
|
||||
|
||||
cdef readonly attr_t orth
|
||||
cdef readonly attr_t norm1
|
||||
cdef readonly attr_t norm2
|
||||
cdef readonly attr_t lower
|
||||
cdef readonly attr_t norm
|
||||
cdef readonly attr_t shape
|
||||
cdef readonly attr_t prefix
|
||||
cdef readonly attr_t suffix
|
||||
|
||||
cdef readonly unicode orth_
|
||||
cdef readonly unicode norm1_
|
||||
cdef readonly unicode norm2_
|
||||
cdef readonly unicode lower_
|
||||
cdef readonly unicode norm_
|
||||
cdef readonly unicode shape_
|
||||
cdef readonly unicode prefix_
|
||||
cdef readonly unicode suffix_
|
||||
|
@ -50,15 +50,15 @@ cdef class Lexeme:
|
|||
py.length = ptr.length
|
||||
|
||||
py.orth = ptr.orth
|
||||
py.norm1 = ptr.norm1
|
||||
py.norm2 = ptr.norm2
|
||||
py.lower = ptr.lower
|
||||
py.norm = ptr.norm
|
||||
py.shape = ptr.shape
|
||||
py.prefix = ptr.prefix
|
||||
py.suffix = ptr.suffix
|
||||
|
||||
py.orth_ = strings[ptr.orth]
|
||||
py.norm1_ = strings[ptr.norm1]
|
||||
py.norm2_ = strings[ptr.norm2]
|
||||
py.lower_ = strings[ptr.lower]
|
||||
py.norm_ = strings[ptr.norm]
|
||||
py.shape_ = strings[ptr.shape]
|
||||
py.prefix_ = strings[ptr.prefix]
|
||||
py.suffix_ = strings[ptr.suffix]
|
||||
|
@ -80,10 +80,10 @@ cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
|
|||
return lex.id
|
||||
elif feat_name == ORTH:
|
||||
return lex.orth
|
||||
elif feat_name == NORM1:
|
||||
return lex.norm1
|
||||
elif feat_name == NORM2:
|
||||
return lex.norm2
|
||||
elif feat_name == LOWER:
|
||||
return lex.lower
|
||||
elif feat_name == NORM:
|
||||
return lex.norm
|
||||
elif feat_name == SHAPE:
|
||||
return lex.shape
|
||||
elif feat_name == PREFIX:
|
||||
|
|
|
@ -17,8 +17,8 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store
|
|||
const float* empty_vec) except -1:
|
||||
lex.length = props['length']
|
||||
lex.orth = string_store[props['orth']]
|
||||
lex.norm1 = string_store[props['norm1']]
|
||||
lex.norm2 = string_store[props['norm2']]
|
||||
lex.lower = string_store[props['lower']]
|
||||
lex.norm = string_store[props['norm']]
|
||||
lex.shape = string_store[props['shape']]
|
||||
lex.prefix = string_store[props['prefix']]
|
||||
lex.suffix = string_store[props['suffix']]
|
||||
|
|
|
@ -12,8 +12,8 @@ cdef struct LexemeC:
|
|||
attr_t length
|
||||
|
||||
attr_t orth
|
||||
attr_t norm1
|
||||
attr_t norm2
|
||||
attr_t lower
|
||||
attr_t norm
|
||||
attr_t shape
|
||||
attr_t prefix
|
||||
attr_t suffix
|
||||
|
|
|
@ -51,8 +51,8 @@ cdef class Token:
|
|||
cdef readonly attr_t cluster
|
||||
cdef readonly attr_t length
|
||||
cdef readonly attr_t orth
|
||||
cdef readonly attr_t norm1
|
||||
cdef readonly attr_t norm2
|
||||
cdef readonly attr_t lower
|
||||
cdef readonly attr_t norm
|
||||
cdef readonly attr_t shape
|
||||
cdef readonly attr_t prefix
|
||||
cdef readonly attr_t suffix
|
||||
|
|
|
@ -7,7 +7,7 @@ from preshed.counter cimport PreshCounter
|
|||
from .vocab cimport EMPTY_LEXEME
|
||||
from .typedefs cimport attr_id_t, attr_t
|
||||
from .typedefs cimport LEMMA
|
||||
from .typedefs cimport ID, ORTH, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||
from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||
from .typedefs cimport POS, LEMMA
|
||||
|
||||
from unidecode import unidecode
|
||||
|
@ -44,10 +44,10 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
|
|||
return lex.id
|
||||
elif feat_name == ORTH:
|
||||
return lex.orth
|
||||
elif feat_name == NORM1:
|
||||
return lex.norm1
|
||||
elif feat_name == NORM2:
|
||||
return lex.norm2
|
||||
elif feat_name == LOWER:
|
||||
return lex.lower
|
||||
elif feat_name == NORM:
|
||||
return lex.norm
|
||||
elif feat_name == SHAPE:
|
||||
return lex.shape
|
||||
elif feat_name == PREFIX:
|
||||
|
@ -223,8 +223,8 @@ cdef class Token:
|
|||
self.cluster = t.lex.cluster
|
||||
self.length = t.lex.length
|
||||
self.orth = t.lex.orth
|
||||
self.norm1 = t.lex.norm1
|
||||
self.norm2 = t.lex.norm2
|
||||
self.lower = t.lex.lower
|
||||
self.norm = t.lex.norm
|
||||
self.shape = t.lex.shape
|
||||
self.prefix = t.lex.prefix
|
||||
self.suffix = t.lex.suffix
|
||||
|
@ -254,12 +254,6 @@ cdef class Token:
|
|||
"""
|
||||
return self._seq.data[self.i].lex.length
|
||||
|
||||
def check_flag(self, attr_id_t flag):
|
||||
return self.flags & (1 << flag)
|
||||
|
||||
def is_pos(self, univ_tag_t pos):
|
||||
return self.tag == pos
|
||||
|
||||
property head:
|
||||
"""The token predicted by the parser to be the head of the current token."""
|
||||
def __get__(self):
|
||||
|
@ -267,7 +261,6 @@ cdef class Token:
|
|||
return Token(self._seq, self.i + t.head)
|
||||
|
||||
property string:
|
||||
"""The unicode string of the word, with no whitespace padding."""
|
||||
def __get__(self):
|
||||
cdef const TokenC* t = &self._seq.data[self.i]
|
||||
if t.lex.orth == 0:
|
||||
|
@ -279,13 +272,13 @@ cdef class Token:
|
|||
def __get__(self):
|
||||
return self._seq.vocab.strings[self.orth]
|
||||
|
||||
property norm1_:
|
||||
property lower_:
|
||||
def __get__(self):
|
||||
return self._seq.vocab.strings[self.norm1]
|
||||
return self._seq.vocab.strings[self.lower]
|
||||
|
||||
property norm2_:
|
||||
property norm_:
|
||||
def __get__(self):
|
||||
return self._seq.vocab.strings[self.norm2]
|
||||
return self._seq.vocab.strings[self.norm]
|
||||
|
||||
property shape_:
|
||||
def __get__(self):
|
||||
|
|
|
@ -90,8 +90,8 @@ cpdef enum attr_id_t:
|
|||
|
||||
ID
|
||||
ORTH
|
||||
NORM1
|
||||
NORM2
|
||||
LOWER
|
||||
NORM
|
||||
SHAPE
|
||||
PREFIX
|
||||
SUFFIX
|
||||
|
|
|
@ -195,8 +195,8 @@ cdef class Vocab:
|
|||
for i in range(self.lexemes.size()):
|
||||
# Cast away the const, cos we can modify our lexemes
|
||||
lex = <LexemeC*>self.lexemes[i]
|
||||
if lex.norm1 < vectors.size():
|
||||
lex.repvec = vectors[lex.norm1]
|
||||
if lex.lower < vectors.size():
|
||||
lex.repvec = vectors[lex.lower]
|
||||
else:
|
||||
lex.repvec = EMPTY_VEC
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user