* Rename sic to orth

Matthew Honnibal 2015-01-23 02:08:25 +11:00
parent 93d4bd6c2e
commit 5ed8b2b98f
13 changed files with 63 additions and 63 deletions

View File

@@ -19,7 +19,7 @@ def get_lex_props(string):
     return {
         'flags': get_flags(string),
         'length': len(string),
-        'sic': string,
+        'orth': string,
         'norm1': string.lower(),
         'norm2': string,
         'shape': orth.word_shape(string),
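After this change, the lexical-property dict built by get_lex_props is keyed 'orth' instead of 'sic'. A minimal caller-side sketch (the example string is illustrative, and the import path is assumed since the file name is not shown in this diff):

    # Hypothetical usage, assuming get_lex_props is importable from this module.
    props = get_lex_props(u'Apple')
    assert props['orth'] == u'Apple'          # the verbatim text; was props['sic']
    assert props['norm1'] == u'apple'         # lower-cased form, per the dict above
    assert props['length'] == len(u'Apple')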

View File

@ -1,6 +1,6 @@
from ..attrs cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7
from ..attrs cimport FLAG8, FLAG9, FLAG10
from ..attrs cimport SIC as _SIC
from ..attrs cimport ORTH as _ORTH
from ..attrs cimport SHAPE as _SHAPE
from ..attrs cimport NORM1 as _NORM1
from ..attrs cimport NORM2 as _NORM2
@ -24,7 +24,7 @@ cpdef enum:
LIKE_NUM = FLAG9
IS_STOP = FLAG10
SIC = _SIC
ORTH = _ORTH
SHAPE = _SHAPE
LOWER = _NORM1
NORM2 = _NORM2
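This enum re-exports the package-wide attribute IDs under their public names, so downstream code now selects the attribute as ORTH. A caller-side sketch, consistent with the doctest later in this commit:

    # Sketch: the attrs module exposes the renamed constant.
    from spacy.en import attrs        # module path taken from the doctest below
    feature_id = attrs.ORTH           # was attrs.SIC before this commit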

View File

@@ -70,7 +70,7 @@ cpdef enum misc_t:

 cpdef enum:
-    P2_sic
+    P2_orth
     P2_cluster
     P2_shape
     P2_prefix
@@ -78,7 +78,7 @@ cpdef enum:
     P2_pos
     P2_lemma

-    P1_sic
+    P1_orth
     P1_cluster
     P1_shape
     P1_prefix
@@ -86,7 +86,7 @@ cpdef enum:
     P1_pos
     P1_lemma

-    W_sic
+    W_orth
     W_cluster
     W_shape
     W_prefix
@@ -94,7 +94,7 @@ cpdef enum:
     W_pos
     W_lemma

-    N1_sic
+    N1_orth
     N1_cluster
     N1_shape
     N1_prefix
@@ -102,7 +102,7 @@ cpdef enum:
     N1_pos
     N1_lemma

-    N2_sic
+    N2_orth
     N2_cluster
     N2_shape
     N2_prefix
@@ -169,11 +169,11 @@ POS_TAGS = {

 POS_TEMPLATES = (
-    (W_sic,),
+    (W_orth,),
     (P1_lemma, P1_pos),
     (P2_lemma, P2_pos),
-    (N1_sic,),
-    (N2_sic,),
+    (N1_orth,),
+    (N2_orth,),

     (W_suffix,),
     (W_prefix,),
@@ -181,7 +181,7 @@ POS_TEMPLATES = (
     (P1_pos,),
     (P2_pos,),
     (P1_pos, P2_pos),
-    (P1_pos, W_sic),
+    (P1_pos, W_orth),
     (P1_suffix,),
     (N1_suffix,),
@@ -272,21 +272,21 @@ cdef class EnPosTagger:
     cdef int set_morph(self, const int i, TokenC* tokens) except -1:
         cdef const PosTag* tag = &self.tags[tokens[i].tag]
         tokens[i].pos = tag.pos
-        cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.sic)
+        cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth)
         if cached is NULL:
             cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
             cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
             cached.morph = tag.morph
-            self._morph_cache.set(tag.id, tokens[i].lex.sic, <void*>cached)
+            self._morph_cache.set(tag.id, tokens[i].lex.orth, <void*>cached)
         tokens[i].lemma = cached.lemma
         tokens[i].morph = cached.morph

     cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1:
         if self.lemmatizer is None:
-            return lex.sic
-        cdef unicode py_string = self.strings[lex.sic]
+            return lex.orth
+        cdef unicode py_string = self.strings[lex.orth]
         if pos != NOUN and pos != VERB and pos != ADJ:
-            return lex.sic
+            return lex.orth
         cdef set lemma_strings
         cdef unicode lemma_string
         lemma_strings = self.lemmatizer(py_string, pos)
@@ -301,29 +301,29 @@ cdef class EnPosTagger:
         cdef dict entries
         cdef dict props
         cdef int lemma
-        cdef id_t sic
+        cdef id_t orth
         cdef int pos
         for pos_str, entries in exc.items():
             pos = self.tag_names.index(pos_str)
             for form_str, props in entries.items():
                 lemma_str = props.get('L', form_str)
-                sic = self.strings[form_str]
+                orth = self.strings[form_str]
                 cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
                 cached.lemma = self.strings[lemma_str]
                 set_morph_from_dict(&cached.morph, props)
-                self._morph_cache.set(pos, sic, <void*>cached)
+                self._morph_cache.set(pos, orth, <void*>cached)

 cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1:
-    _fill_from_token(&context[P2_sic], &tokens[i-2])
-    _fill_from_token(&context[P1_sic], &tokens[i-1])
-    _fill_from_token(&context[W_sic], &tokens[i])
-    _fill_from_token(&context[N1_sic], &tokens[i+1])
-    _fill_from_token(&context[N2_sic], &tokens[i+2])
+    _fill_from_token(&context[P2_orth], &tokens[i-2])
+    _fill_from_token(&context[P1_orth], &tokens[i-1])
+    _fill_from_token(&context[W_orth], &tokens[i])
+    _fill_from_token(&context[N1_orth], &tokens[i+1])
+    _fill_from_token(&context[N2_orth], &tokens[i+2])

 cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
-    context[0] = t.lex.sic
+    context[0] = t.lex.orth
     context[1] = t.lex.cluster
     context[2] = t.lex.shape
     context[3] = t.lex.prefix
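The P2_orth through N2_orth enum values are indices into a flat atom_t feature array: fill_context copies the same block of lexical fields for each of the five tokens in a window around position i. A pure-Python sketch of the pattern (the block width and dict-based tokens are illustrative; the real code works on C structs and pads the token array so i-2 and i+2 stay in bounds):

    # Illustrative re-creation of the fill_context pattern in plain Python.
    FIELDS_PER_TOKEN = 8                       # assumed block width
    P2, P1, W, N1, N2 = (n * FIELDS_PER_TOKEN for n in range(5))

    def fill_context(context, i, tokens):
        window = ((P2, tokens[i - 2]), (P1, tokens[i - 1]), (W, tokens[i]),
                  (N1, tokens[i + 1]), (N2, tokens[i + 2]))
        for offset, tok in window:
            context[offset + 0] = tok['orth']      # was tok['sic']
            context[offset + 1] = tok['cluster']
            context[offset + 2] = tok['shape']
            context[offset + 3] = tok['prefix']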

View File

@@ -1,5 +1,5 @@
 from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
-from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
+from .typedefs cimport ID, ORTH, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from .structs cimport LexemeC
 from .strings cimport StringStore
@@ -14,20 +14,20 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings,
                               const float* empty_vec) except -1

 cdef class Lexeme:
-    cdef readonly ndarray vec
+    cdef readonly ndarray repvec

     cdef readonly flags_t flags
     cdef readonly attr_t id
     cdef readonly attr_t length

-    cdef readonly attr_t sic
+    cdef readonly attr_t orth
     cdef readonly attr_t norm1
     cdef readonly attr_t norm2
     cdef readonly attr_t shape
     cdef readonly attr_t prefix
     cdef readonly attr_t suffix

-    cdef readonly unicode sic_
+    cdef readonly unicode orth_
     cdef readonly unicode norm1_
     cdef readonly unicode norm2_
     cdef readonly unicode shape_
@@ -49,14 +49,14 @@ cdef class Lexeme:
     py.id = ptr.id
     py.length = ptr.length
-    py.sic = ptr.sic
+    py.orth = ptr.orth
     py.norm1 = ptr.norm1
     py.norm2 = ptr.norm2
     py.shape = ptr.shape
     py.prefix = ptr.prefix
     py.suffix = ptr.suffix

-    py.sic_ = strings[ptr.sic]
+    py.orth_ = strings[ptr.orth]
     py.norm1_ = strings[ptr.norm1]
     py.norm2_ = strings[ptr.norm2]
     py.shape_ = strings[ptr.shape]
@@ -78,8 +78,8 @@ cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
         return check_flag(lex, feat_name)
     elif feat_name == ID:
         return lex.id
-    elif feat_name == SIC:
-        return lex.sic
+    elif feat_name == ORTH:
+        return lex.orth
     elif feat_name == NORM1:
         return lex.norm1
     elif feat_name == NORM2:
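get_attr maps an attribute ID onto the matching LexemeC field with a plain if/elif chain, since it has to run without the GIL. A Python sketch of the same dispatch (the constants are stand-ins and the field list is abbreviated):

    # Stand-in attribute IDs; the real values come from the typedefs module.
    ID, ORTH, NORM1, NORM2 = range(4)

    def get_attr(lex, feat_name):
        # Mirrors the if/elif chain above on a lexeme-like object.
        if feat_name == ID:
            return lex.id
        elif feat_name == ORTH:        # was SIC before this commit
            return lex.orth
        elif feat_name == NORM1:
            return lex.norm1
        elif feat_name == NORM2:
            return lex.norm2
        return 0                       # flag attributes are handled separately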

View File

@@ -16,7 +16,7 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
 cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store,
                               const float* empty_vec) except -1:
     lex.length = props['length']
-    lex.sic = string_store[props['sic']]
+    lex.orth = string_store[props['orth']]
     lex.norm1 = string_store[props['norm1']]
     lex.norm2 = string_store[props['norm2']]
     lex.shape = string_store[props['shape']]
@@ -34,4 +34,4 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store
 cdef class Lexeme:
     """A dummy docstring"""
     def __cinit__(self, int vec_size):
-        self.vec = numpy.ndarray(shape=(vec_size,), dtype=numpy.float32)
+        self.repvec = numpy.ndarray(shape=(vec_size,), dtype=numpy.float32)

View File

@@ -11,7 +11,7 @@ cdef struct LexemeC:
     attr_t id
     attr_t length
-    attr_t sic
+    attr_t orth
     attr_t norm1
     attr_t norm2
     attr_t shape

View File

@@ -26,7 +26,7 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
         context[5] = 0
         context[6] = 0
     else:
-        context[0] = token.lex.sic
+        context[0] = token.lex.orth
        context[1] = token.lemma
        context[2] = token.tag
        context[3] = token.lex.cluster

View File

@@ -50,7 +50,7 @@ cdef class Token:
     cdef readonly attr_t idx
     cdef readonly attr_t cluster
     cdef readonly attr_t length
-    cdef readonly attr_t sic
+    cdef readonly attr_t orth
     cdef readonly attr_t norm1
     cdef readonly attr_t norm2
     cdef readonly attr_t shape

View File

@@ -7,7 +7,7 @@ from preshed.counter cimport PreshCounter
 from .vocab cimport EMPTY_LEXEME
 from .typedefs cimport attr_id_t, attr_t
 from .typedefs cimport LEMMA
-from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
+from .typedefs cimport ID, ORTH, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from .typedefs cimport POS, LEMMA

 from unidecode import unidecode
@@ -42,8 +42,8 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
         return check_flag(lex, feat_name)
     elif feat_name == ID:
         return lex.id
-    elif feat_name == SIC:
-        return lex.sic
+    elif feat_name == ORTH:
+        return lex.orth
     elif feat_name == NORM1:
         return lex.norm1
     elif feat_name == NORM2:
@@ -97,8 +97,8 @@ cdef class Tokens:
         for i in range(self.length):
             if start is None:
                 start = i
-            if self.data[i].lex.sic == period or self.data[i].lex.sic == exclamation or \
-                    self.data[i].lex.sic == question:
+            if self.data[i].lex.orth == period or self.data[i].lex.orth == exclamation or \
+                    self.data[i].lex.orth == question:
                 spans.append((start, i+1))
                 start = None
         if start is not None:
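This loop compares each token's integer orth ID against pre-interned IDs for the sentence-final punctuation marks, so the inner loop never touches unicode objects. A sketch of the interning step such code relies on (the variable names come from the hunk above, but where they are assigned is not shown in this diff, so the setup is assumed):

    # Assumed setup: intern the boundary markers once, then the loop
    # above compares plain integers.
    period = self.vocab.strings['.']
    exclamation = self.vocab.strings['!']
    question = self.vocab.strings['?']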
@@ -176,9 +176,9 @@ cdef class Tokens:
         >>> from spacy.en import English, attrs
         >>> nlp = English()
         >>> tokens = nlp(u'apple apple orange banana')
-        >>> tokens.count_by(attrs.SIC)
+        >>> tokens.count_by(attrs.ORTH)
         {12800L: 1, 11880L: 2, 7561L: 1}
-        >>> tokens.to_array([attrs.SIC])
+        >>> tokens.to_array([attrs.ORTH])
         array([[11880],
                [11880],
                [ 7561],
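As the doctest shows, count_by and to_array key on integer ORTH IDs rather than strings. A hedged sketch of mapping the counts back to text through the string store, continuing the doctest's names:

    # Sketch, continuing the doctest above: decode the ID-keyed counts.
    counts = tokens.count_by(attrs.ORTH)
    by_text = {nlp.vocab.strings[orth_id]: n for orth_id, n in counts.items()}
    # e.g. {u'apple': 2, u'orange': 1, u'banana': 1}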
@@ -222,7 +222,7 @@ cdef class Token:
         self.idx = t.idx
         self.cluster = t.lex.cluster
         self.length = t.lex.length
-        self.sic = t.lex.sic
+        self.orth = t.lex.orth
         self.norm1 = t.lex.norm1
         self.norm2 = t.lex.norm2
         self.shape = t.lex.shape
@@ -270,14 +270,14 @@ cdef class Token:
         """The unicode string of the word, with no whitespace padding."""
         def __get__(self):
             cdef const TokenC* t = &self._seq.data[self.i]
-            if t.lex.sic == 0:
+            if t.lex.orth == 0:
                 return ''
-            cdef unicode py_ustr = self._seq.vocab.strings[t.lex.sic]
+            cdef unicode py_ustr = self._seq.vocab.strings[t.lex.orth]
             return py_ustr

-    property sic_:
+    property orth_:
         def __get__(self):
-            return self._seq.vocab.strings[self.sic]
+            return self._seq.vocab.strings[self.orth]

     property norm1_:
         def __get__(self):
View File

@@ -89,7 +89,7 @@ cpdef enum attr_id_t:
     FLAG63

     ID
-    SIC
+    ORTH
     NORM1
     NORM2
     SHAPE

View File

@@ -10,8 +10,8 @@ def EN():
 def test_possess(EN):
     tokens = EN("Mike's")
-    assert EN.vocab.strings[tokens[0].sic] == b"Mike"
-    assert EN.vocab.strings[tokens[1].sic] == b"'s"
+    assert EN.vocab.strings[tokens[0].orth] == "Mike"
+    assert EN.vocab.strings[tokens[1].orth] == "'s"
     assert len(tokens) == 2

View File

@@ -33,17 +33,17 @@ def test_punct(EN):
 def test_digits(EN):
     tokens = EN('The year: 1984.')
     assert len(tokens) == 5
-    assert tokens[0].sic == EN.vocab['The'].sic
-    assert tokens[3].sic == EN.vocab['1984'].sic
+    assert tokens[0].orth == EN.vocab['The'].orth
+    assert tokens[3].orth == EN.vocab['1984'].orth

 def test_contraction(EN):
     tokens = EN("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].sic == EN.vocab["n't"].sic
+    assert tokens[1].orth == EN.vocab["n't"].orth
     tokens = EN("i said don't!")
     assert len(tokens) == 5
-    assert tokens[4].sic == EN.vocab['!'].sic
+    assert tokens[4].orth == EN.vocab['!'].orth

 def test_contraction_punct(EN):

View File

@@ -11,24 +11,24 @@ def EN():
 def test_neq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['bye'].sic != addr.sic
+    assert EN.vocab['bye'].orth != addr.orth

 def test_eq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['Hello'].sic == addr.sic
+    assert EN.vocab['Hello'].orth == addr.orth

 def test_case_neq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['hello'].sic != addr.sic
+    assert EN.vocab['hello'].orth != addr.orth

 def test_punct_neq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['Hello,'].sic != addr.sic
+    assert EN.vocab['Hello,'].orth != addr.orth

 def test_shape_attr(EN):
     example = EN.vocab['example']
-    assert example.sic != example.shape
+    assert example.orth != example.shape