Rename sic to orth

Matthew Honnibal 2015-01-23 02:08:25 +11:00
parent 93d4bd6c2e
commit 5ed8b2b98f
13 changed files with 63 additions and 63 deletions
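
In user-facing terms, the commit renames the attribute holding the ID of a token's verbatim text from sic to orth (and sic_ to orth_ for the unicode view). A minimal before/after sketch, assuming the spacy.en.English entry point shown in this commit's docstrings and tests:

    # Hedged usage sketch; English() and vocab.strings are taken from the
    # docstrings and tests elsewhere in this commit.
    from spacy.en import English

    nlp = English()
    tokens = nlp(u"Mike's")

    orth_id = tokens[0].orth                        # was tokens[0].sic: integer ID
    assert tokens[0].orth_ == u"Mike"               # was tokens[0].sic_: the string
    assert nlp.vocab.strings[orth_id] == u"Mike"    # same lookup the tests use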

View File

@@ -19,7 +19,7 @@ def get_lex_props(string):
     return {
         'flags': get_flags(string),
         'length': len(string),
-        'sic': string,
+        'orth': string,
         'norm1': string.lower(),
         'norm2': string,
         'shape': orth.word_shape(string),
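
The hunk above renames the 'sic' key of the lexeme-properties dict. For orientation, a rough mock of what get_lex_props(u"Apple") would return after the rename; the flag and shape values are stand-ins for the get_flags and orth.word_shape helpers, which this diff does not show:

    # Mocked post-rename output of get_lex_props(u"Apple"); stand-in values marked.
    props = {
        'flags': 0,            # bit field from get_flags (stand-in value)
        'length': 5,           # len(u"Apple")
        'orth': u'Apple',      # renamed from 'sic': the verbatim word form
        'norm1': u'apple',     # lowercased form
        'norm2': u'Apple',     # unchanged at this point in the code
        'shape': u'Xxxxx',     # plausible orth.word_shape output (stand-in)
    }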

View File

@@ -1,6 +1,6 @@
 from ..attrs cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7
 from ..attrs cimport FLAG8, FLAG9, FLAG10
-from ..attrs cimport SIC as _SIC
+from ..attrs cimport ORTH as _ORTH
 from ..attrs cimport SHAPE as _SHAPE
 from ..attrs cimport NORM1 as _NORM1
 from ..attrs cimport NORM2 as _NORM2
@@ -24,7 +24,7 @@ cpdef enum:
     LIKE_NUM = FLAG9
     IS_STOP = FLAG10

-    SIC = _SIC
+    ORTH = _ORTH
     SHAPE = _SHAPE
     LOWER = _NORM1
     NORM2 = _NORM2
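
The cimport-as-alias lines exist because this module re-publishes the shared attribute IDs as members of its own cpdef enum; cimporting ORTH directly would collide with the enum member of the same name. A hypothetical pure-Python rendering of the pattern:

    # Hypothetical pure-Python rendering of the alias/re-export pattern.
    from spacy.attrs import ORTH as _ORTH   # private alias avoids a name clash
    ORTH = _ORTH                            # public name in this module's namespace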

View File

@@ -70,7 +70,7 @@ cpdef enum misc_t:

 cpdef enum:
-    P2_sic
+    P2_orth
     P2_cluster
     P2_shape
     P2_prefix
@@ -78,7 +78,7 @@ cpdef enum:
     P2_pos
     P2_lemma

-    P1_sic
+    P1_orth
     P1_cluster
     P1_shape
     P1_prefix
@@ -86,7 +86,7 @@ cpdef enum:
     P1_pos
     P1_lemma

-    W_sic
+    W_orth
     W_cluster
     W_shape
     W_prefix
@@ -94,7 +94,7 @@ cpdef enum:
     W_pos
     W_lemma

-    N1_sic
+    N1_orth
     N1_cluster
     N1_shape
     N1_prefix
@@ -102,7 +102,7 @@ cpdef enum:
     N1_pos
     N1_lemma

-    N2_sic
+    N2_orth
     N2_cluster
     N2_shape
     N2_prefix
@@ -169,11 +169,11 @@ POS_TAGS = {
 POS_TEMPLATES = (
-    (W_sic,),
+    (W_orth,),
     (P1_lemma, P1_pos),
     (P2_lemma, P2_pos),
-    (N1_sic,),
-    (N2_sic,),
+    (N1_orth,),
+    (N2_orth,),

     (W_suffix,),
     (W_prefix,),
@@ -181,7 +181,7 @@ POS_TEMPLATES = (
     (P1_pos,),
     (P2_pos,),
     (P1_pos, P2_pos),
-    (P1_pos, W_sic),
+    (P1_pos, W_orth),
     (P1_suffix,),
     (N1_suffix,),
@@ -272,21 +272,21 @@ cdef class EnPosTagger:
     cdef int set_morph(self, const int i, TokenC* tokens) except -1:
         cdef const PosTag* tag = &self.tags[tokens[i].tag]
         tokens[i].pos = tag.pos
-        cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.sic)
+        cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth)
         if cached is NULL:
             cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
             cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
             cached.morph = tag.morph
-            self._morph_cache.set(tag.id, tokens[i].lex.sic, <void*>cached)
+            self._morph_cache.set(tag.id, tokens[i].lex.orth, <void*>cached)
         tokens[i].lemma = cached.lemma
         tokens[i].morph = cached.morph

     cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1:
         if self.lemmatizer is None:
-            return lex.sic
-        cdef unicode py_string = self.strings[lex.sic]
+            return lex.orth
+        cdef unicode py_string = self.strings[lex.orth]
         if pos != NOUN and pos != VERB and pos != ADJ:
-            return lex.sic
+            return lex.orth
         cdef set lemma_strings
         cdef unicode lemma_string
         lemma_strings = self.lemmatizer(py_string, pos)
@@ -301,29 +301,29 @@ cdef class EnPosTagger:
         cdef dict entries
         cdef dict props
         cdef int lemma
-        cdef id_t sic
+        cdef id_t orth
         cdef int pos
         for pos_str, entries in exc.items():
             pos = self.tag_names.index(pos_str)
             for form_str, props in entries.items():
                 lemma_str = props.get('L', form_str)
-                sic = self.strings[form_str]
+                orth = self.strings[form_str]
                 cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
                 cached.lemma = self.strings[lemma_str]
                 set_morph_from_dict(&cached.morph, props)
-                self._morph_cache.set(pos, sic, <void*>cached)
+                self._morph_cache.set(pos, orth, <void*>cached)


 cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1:
-    _fill_from_token(&context[P2_sic], &tokens[i-2])
-    _fill_from_token(&context[P1_sic], &tokens[i-1])
-    _fill_from_token(&context[W_sic], &tokens[i])
-    _fill_from_token(&context[N1_sic], &tokens[i+1])
-    _fill_from_token(&context[N2_sic], &tokens[i+2])
+    _fill_from_token(&context[P2_orth], &tokens[i-2])
+    _fill_from_token(&context[P1_orth], &tokens[i-1])
+    _fill_from_token(&context[W_orth], &tokens[i])
+    _fill_from_token(&context[N1_orth], &tokens[i+1])
+    _fill_from_token(&context[N2_orth], &tokens[i+2])


 cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
-    context[0] = t.lex.sic
+    context[0] = t.lex.orth
     context[1] = t.lex.cluster
     context[2] = t.lex.shape
     context[3] = t.lex.prefix
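
The enum slots renamed above (P2_orth through N2_orth) are offsets into one flat feature-context array: each of the five window positions owns a fixed-width block of lexical attributes, and fill_context copies a two-token window either side of position i. An illustrative Python sketch of the same layout, with the block width assumed:

    # Illustrative sketch of the flat feature-context pattern (sizes assumed).
    ATTRS_PER_TOKEN = 8   # assumed block width: orth, cluster, shape, prefix, ...

    def fill_from_token(context, offset, token):
        context[offset + 0] = token['orth']     # renamed slot, was 'sic'
        context[offset + 1] = token['cluster']
        context[offset + 2] = token['shape']
        context[offset + 3] = token['prefix']
        # remaining slots (suffix, pos, lemma, ...) follow the same scheme

    def fill_context(context, i, tokens):
        # Two tokens of left context, the word itself, two of right context.
        for k, j in enumerate(range(i - 2, i + 3)):
            fill_from_token(context, k * ATTRS_PER_TOKEN, tokens[j])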

View File

@@ -1,5 +1,5 @@
 from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
-from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
+from .typedefs cimport ID, ORTH, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from .structs cimport LexemeC
 from .strings cimport StringStore
@@ -14,20 +14,20 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings,
                               const float* empty_vec) except -1

 cdef class Lexeme:
-    cdef readonly ndarray vec
+    cdef readonly ndarray repvec

     cdef readonly flags_t flags
     cdef readonly attr_t id
     cdef readonly attr_t length

-    cdef readonly attr_t sic
+    cdef readonly attr_t orth
     cdef readonly attr_t norm1
     cdef readonly attr_t norm2
     cdef readonly attr_t shape
     cdef readonly attr_t prefix
     cdef readonly attr_t suffix

-    cdef readonly unicode sic_
+    cdef readonly unicode orth_
     cdef readonly unicode norm1_
     cdef readonly unicode norm2_
     cdef readonly unicode shape_
@@ -49,14 +49,14 @@ cdef class Lexeme:
         py.id = ptr.id
         py.length = ptr.length
-        py.sic = ptr.sic
+        py.orth = ptr.orth
         py.norm1 = ptr.norm1
         py.norm2 = ptr.norm2
         py.shape = ptr.shape
         py.prefix = ptr.prefix
         py.suffix = ptr.suffix
-        py.sic_ = strings[ptr.sic]
+        py.orth_ = strings[ptr.orth]
         py.norm1_ = strings[ptr.norm1]
         py.norm2_ = strings[ptr.norm2]
         py.shape_ = strings[ptr.shape]
@@ -78,8 +78,8 @@ cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
         return check_flag(lex, feat_name)
     elif feat_name == ID:
         return lex.id
-    elif feat_name == SIC:
-        return lex.sic
+    elif feat_name == ORTH:
+        return lex.orth
     elif feat_name == NORM1:
         return lex.norm1
     elif feat_name == NORM2:
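
get_attr maps an attribute ID onto the matching LexemeC field with an if/elif chain rather than a dict, since the function is declared nogil and cannot touch Python objects. An illustrative Python analogue (enum values assumed):

    # Illustrative Python analogue of get_attr's dispatch; field names from
    # the diff, enum values assumed.
    ID, ORTH, NORM1, NORM2 = range(4)

    def get_attr(lex, feat_name):
        # Mirrors the if/elif chain; the real code avoids a dict so it can run nogil.
        if feat_name == ID:
            return lex['id']
        elif feat_name == ORTH:
            return lex['orth']
        elif feat_name == NORM1:
            return lex['norm1']
        elif feat_name == NORM2:
            return lex['norm2']
        else:
            raise ValueError(feat_name)

    assert get_attr({'id': 7, 'orth': 42, 'norm1': 0, 'norm2': 0}, ORTH) == 42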

View File

@@ -16,7 +16,7 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
 cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store,
                               const float* empty_vec) except -1:
     lex.length = props['length']
-    lex.sic = string_store[props['sic']]
+    lex.orth = string_store[props['orth']]
     lex.norm1 = string_store[props['norm1']]
     lex.norm2 = string_store[props['norm2']]
     lex.shape = string_store[props['shape']]
@@ -34,4 +34,4 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store
 cdef class Lexeme:
     """A dummy docstring"""
     def __cinit__(self, int vec_size):
-        self.vec = numpy.ndarray(shape=(vec_size,), dtype=numpy.float32)
+        self.repvec = numpy.ndarray(shape=(vec_size,), dtype=numpy.float32)

View File

@@ -11,7 +11,7 @@ cdef struct LexemeC:
     attr_t id
     attr_t length

-    attr_t sic
+    attr_t orth
     attr_t norm1
     attr_t norm2
     attr_t shape

View File

@@ -26,7 +26,7 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
         context[5] = 0
         context[6] = 0
     else:
-        context[0] = token.lex.sic
+        context[0] = token.lex.orth
         context[1] = token.lemma
         context[2] = token.tag
         context[3] = token.lex.cluster

View File

@@ -50,7 +50,7 @@ cdef class Token:
     cdef readonly attr_t idx
     cdef readonly attr_t cluster
     cdef readonly attr_t length
-    cdef readonly attr_t sic
+    cdef readonly attr_t orth
     cdef readonly attr_t norm1
     cdef readonly attr_t norm2
     cdef readonly attr_t shape

View File

@@ -7,7 +7,7 @@ from preshed.counter cimport PreshCounter
 from .vocab cimport EMPTY_LEXEME
 from .typedefs cimport attr_id_t, attr_t
 from .typedefs cimport LEMMA
-from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
+from .typedefs cimport ID, ORTH, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from .typedefs cimport POS, LEMMA

 from unidecode import unidecode
@@ -42,8 +42,8 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
         return check_flag(lex, feat_name)
     elif feat_name == ID:
         return lex.id
-    elif feat_name == SIC:
-        return lex.sic
+    elif feat_name == ORTH:
+        return lex.orth
     elif feat_name == NORM1:
         return lex.norm1
     elif feat_name == NORM2:
@@ -97,8 +97,8 @@ cdef class Tokens:
         for i in range(self.length):
             if start is None:
                 start = i
-            if self.data[i].lex.sic == period or self.data[i].lex.sic == exclamation or \
-                    self.data[i].lex.sic == question:
+            if self.data[i].lex.orth == period or self.data[i].lex.orth == exclamation or \
+                    self.data[i].lex.orth == question:
                 spans.append((start, i+1))
                 start = None
         if start is not None:
@@ -176,9 +176,9 @@ cdef class Tokens:
        >>> from spacy.en import English, attrs
        >>> nlp = English()
        >>> tokens = nlp(u'apple apple orange banana')
-       >>> tokens.count_by(attrs.SIC)
+       >>> tokens.count_by(attrs.ORTH)
        {12800L: 1, 11880L: 2, 7561L: 1}
-       >>> tokens.to_array([attrs.SIC])
+       >>> tokens.to_array([attrs.ORTH])
        array([[11880],
               [11880],
               [ 7561],
@@ -222,7 +222,7 @@ cdef class Token:
         self.idx = t.idx
         self.cluster = t.lex.cluster
         self.length = t.lex.length
-        self.sic = t.lex.sic
+        self.orth = t.lex.orth
         self.norm1 = t.lex.norm1
         self.norm2 = t.lex.norm2
         self.shape = t.lex.shape
@@ -270,14 +270,14 @@ cdef class Token:
         """The unicode string of the word, with no whitespace padding."""
         def __get__(self):
             cdef const TokenC* t = &self._seq.data[self.i]
-            if t.lex.sic == 0:
+            if t.lex.orth == 0:
                 return ''
-            cdef unicode py_ustr = self._seq.vocab.strings[t.lex.sic]
+            cdef unicode py_ustr = self._seq.vocab.strings[t.lex.orth]
             return py_ustr

-    property sic_:
+    property orth_:
         def __get__(self):
-            return self._seq.vocab.strings[self.sic]
+            return self._seq.vocab.strings[self.orth]

     property norm1_:
         def __get__(self):
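
Throughout the diff, orth is an integer key into the vocab's StringStore and orth_ is the reverse lookup; ID 0 doubles as the empty/missing string, which is why the text property above checks t.lex.orth == 0. A minimal mock of that interning contract:

    # Minimal mock of the id <-> string interning behind orth / orth_.
    # Real spaCy uses a C-level StringStore; this shows only the contract.
    class MockStringStore:
        def __init__(self):
            self._strings = [u'']      # id 0 reserved for the empty/missing string
            self._ids = {}

        def __getitem__(self, key):
            if isinstance(key, int):   # id -> string (what orth_ does)
                return self._strings[key]
            if key not in self._ids:   # string -> id, interning on first sight
                self._ids[key] = len(self._strings)
                self._strings.append(key)
            return self._ids[key]

    strings = MockStringStore()
    orth = strings[u'apple']           # intern: returns a stable integer id
    assert strings[orth] == u'apple'   # reverse lookup, as in token.orth_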

View File

@@ -89,7 +89,7 @@ cpdef enum attr_id_t:
     FLAG63

     ID
-    SIC
+    ORTH
     NORM1
     NORM2
     SHAPE

View File

@@ -10,8 +10,8 @@ def EN():
 def test_possess(EN):
     tokens = EN("Mike's")
-    assert EN.vocab.strings[tokens[0].sic] == b"Mike"
-    assert EN.vocab.strings[tokens[1].sic] == b"'s"
+    assert EN.vocab.strings[tokens[0].orth] == "Mike"
+    assert EN.vocab.strings[tokens[1].orth] == "'s"
     assert len(tokens) == 2

View File

@@ -33,17 +33,17 @@ def test_punct(EN):
 def test_digits(EN):
     tokens = EN('The year: 1984.')
     assert len(tokens) == 5
-    assert tokens[0].sic == EN.vocab['The'].sic
-    assert tokens[3].sic == EN.vocab['1984'].sic
+    assert tokens[0].orth == EN.vocab['The'].orth
+    assert tokens[3].orth == EN.vocab['1984'].orth


 def test_contraction(EN):
     tokens = EN("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].sic == EN.vocab["n't"].sic
+    assert tokens[1].orth == EN.vocab["n't"].orth
     tokens = EN("i said don't!")
     assert len(tokens) == 5
-    assert tokens[4].sic == EN.vocab['!'].sic
+    assert tokens[4].orth == EN.vocab['!'].orth


 def test_contraction_punct(EN):

View File

@@ -11,24 +11,24 @@ def EN():
 def test_neq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['bye'].sic != addr.sic
+    assert EN.vocab['bye'].orth != addr.orth


 def test_eq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['Hello'].sic == addr.sic
+    assert EN.vocab['Hello'].orth == addr.orth


 def test_case_neq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['hello'].sic != addr.sic
+    assert EN.vocab['hello'].orth != addr.orth


 def test_punct_neq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['Hello,'].sic != addr.sic
+    assert EN.vocab['Hello,'].orth != addr.orth


 def test_shape_attr(EN):
     example = EN.vocab['example']
-    assert example.sic != example.shape
+    assert example.orth != example.shape