mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
* Rename sic to orth
This commit is contained in:
parent
93d4bd6c2e
commit
5ed8b2b98f
|
@ -19,7 +19,7 @@ def get_lex_props(string):
|
|||
return {
|
||||
'flags': get_flags(string),
|
||||
'length': len(string),
|
||||
'sic': string,
|
||||
'orth': string,
|
||||
'norm1': string.lower(),
|
||||
'norm2': string,
|
||||
'shape': orth.word_shape(string),
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from ..attrs cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7
|
||||
from ..attrs cimport FLAG8, FLAG9, FLAG10
|
||||
from ..attrs cimport SIC as _SIC
|
||||
from ..attrs cimport ORTH as _ORTH
|
||||
from ..attrs cimport SHAPE as _SHAPE
|
||||
from ..attrs cimport NORM1 as _NORM1
|
||||
from ..attrs cimport NORM2 as _NORM2
|
||||
|
@ -24,7 +24,7 @@ cpdef enum:
|
|||
LIKE_NUM = FLAG9
|
||||
IS_STOP = FLAG10
|
||||
|
||||
SIC = _SIC
|
||||
ORTH = _ORTH
|
||||
SHAPE = _SHAPE
|
||||
LOWER = _NORM1
|
||||
NORM2 = _NORM2
|
||||
|
|
|
@ -70,7 +70,7 @@ cpdef enum misc_t:
|
|||
|
||||
|
||||
cpdef enum:
|
||||
P2_sic
|
||||
P2_orth
|
||||
P2_cluster
|
||||
P2_shape
|
||||
P2_prefix
|
||||
|
@ -78,7 +78,7 @@ cpdef enum:
|
|||
P2_pos
|
||||
P2_lemma
|
||||
|
||||
P1_sic
|
||||
P1_orth
|
||||
P1_cluster
|
||||
P1_shape
|
||||
P1_prefix
|
||||
|
@ -86,7 +86,7 @@ cpdef enum:
|
|||
P1_pos
|
||||
P1_lemma
|
||||
|
||||
W_sic
|
||||
W_orth
|
||||
W_cluster
|
||||
W_shape
|
||||
W_prefix
|
||||
|
@ -94,7 +94,7 @@ cpdef enum:
|
|||
W_pos
|
||||
W_lemma
|
||||
|
||||
N1_sic
|
||||
N1_orth
|
||||
N1_cluster
|
||||
N1_shape
|
||||
N1_prefix
|
||||
|
@ -102,7 +102,7 @@ cpdef enum:
|
|||
N1_pos
|
||||
N1_lemma
|
||||
|
||||
N2_sic
|
||||
N2_orth
|
||||
N2_cluster
|
||||
N2_shape
|
||||
N2_prefix
|
||||
|
@ -169,11 +169,11 @@ POS_TAGS = {
|
|||
|
||||
|
||||
POS_TEMPLATES = (
|
||||
(W_sic,),
|
||||
(W_orth,),
|
||||
(P1_lemma, P1_pos),
|
||||
(P2_lemma, P2_pos),
|
||||
(N1_sic,),
|
||||
(N2_sic,),
|
||||
(N1_orth,),
|
||||
(N2_orth,),
|
||||
|
||||
(W_suffix,),
|
||||
(W_prefix,),
|
||||
|
@ -181,7 +181,7 @@ POS_TEMPLATES = (
|
|||
(P1_pos,),
|
||||
(P2_pos,),
|
||||
(P1_pos, P2_pos),
|
||||
(P1_pos, W_sic),
|
||||
(P1_pos, W_orth),
|
||||
(P1_suffix,),
|
||||
(N1_suffix,),
|
||||
|
||||
|
@ -272,21 +272,21 @@ cdef class EnPosTagger:
|
|||
cdef int set_morph(self, const int i, TokenC* tokens) except -1:
|
||||
cdef const PosTag* tag = &self.tags[tokens[i].tag]
|
||||
tokens[i].pos = tag.pos
|
||||
cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.sic)
|
||||
cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth)
|
||||
if cached is NULL:
|
||||
cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
|
||||
cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
|
||||
cached.morph = tag.morph
|
||||
self._morph_cache.set(tag.id, tokens[i].lex.sic, <void*>cached)
|
||||
self._morph_cache.set(tag.id, tokens[i].lex.orth, <void*>cached)
|
||||
tokens[i].lemma = cached.lemma
|
||||
tokens[i].morph = cached.morph
|
||||
|
||||
cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1:
|
||||
if self.lemmatizer is None:
|
||||
return lex.sic
|
||||
cdef unicode py_string = self.strings[lex.sic]
|
||||
return lex.orth
|
||||
cdef unicode py_string = self.strings[lex.orth]
|
||||
if pos != NOUN and pos != VERB and pos != ADJ:
|
||||
return lex.sic
|
||||
return lex.orth
|
||||
cdef set lemma_strings
|
||||
cdef unicode lemma_string
|
||||
lemma_strings = self.lemmatizer(py_string, pos)
|
||||
|
@ -301,29 +301,29 @@ cdef class EnPosTagger:
|
|||
cdef dict entries
|
||||
cdef dict props
|
||||
cdef int lemma
|
||||
cdef id_t sic
|
||||
cdef id_t orth
|
||||
cdef int pos
|
||||
for pos_str, entries in exc.items():
|
||||
pos = self.tag_names.index(pos_str)
|
||||
for form_str, props in entries.items():
|
||||
lemma_str = props.get('L', form_str)
|
||||
sic = self.strings[form_str]
|
||||
orth = self.strings[form_str]
|
||||
cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
|
||||
cached.lemma = self.strings[lemma_str]
|
||||
set_morph_from_dict(&cached.morph, props)
|
||||
self._morph_cache.set(pos, sic, <void*>cached)
|
||||
self._morph_cache.set(pos, orth, <void*>cached)
|
||||
|
||||
|
||||
cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1:
|
||||
_fill_from_token(&context[P2_sic], &tokens[i-2])
|
||||
_fill_from_token(&context[P1_sic], &tokens[i-1])
|
||||
_fill_from_token(&context[W_sic], &tokens[i])
|
||||
_fill_from_token(&context[N1_sic], &tokens[i+1])
|
||||
_fill_from_token(&context[N2_sic], &tokens[i+2])
|
||||
_fill_from_token(&context[P2_orth], &tokens[i-2])
|
||||
_fill_from_token(&context[P1_orth], &tokens[i-1])
|
||||
_fill_from_token(&context[W_orth], &tokens[i])
|
||||
_fill_from_token(&context[N1_orth], &tokens[i+1])
|
||||
_fill_from_token(&context[N2_orth], &tokens[i+2])
|
||||
|
||||
|
||||
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
|
||||
context[0] = t.lex.sic
|
||||
context[0] = t.lex.orth
|
||||
context[1] = t.lex.cluster
|
||||
context[2] = t.lex.shape
|
||||
context[3] = t.lex.prefix
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
|
||||
from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||
from .typedefs cimport ID, ORTH, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||
from .structs cimport LexemeC
|
||||
from .strings cimport StringStore
|
||||
|
||||
|
@ -14,20 +14,20 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings,
|
|||
const float* empty_vec) except -1
|
||||
|
||||
cdef class Lexeme:
|
||||
cdef readonly ndarray vec
|
||||
cdef readonly ndarray repvec
|
||||
|
||||
cdef readonly flags_t flags
|
||||
cdef readonly attr_t id
|
||||
cdef readonly attr_t length
|
||||
|
||||
cdef readonly attr_t sic
|
||||
cdef readonly attr_t orth
|
||||
cdef readonly attr_t norm1
|
||||
cdef readonly attr_t norm2
|
||||
cdef readonly attr_t shape
|
||||
cdef readonly attr_t prefix
|
||||
cdef readonly attr_t suffix
|
||||
|
||||
cdef readonly unicode sic_
|
||||
cdef readonly unicode orth_
|
||||
cdef readonly unicode norm1_
|
||||
cdef readonly unicode norm2_
|
||||
cdef readonly unicode shape_
|
||||
|
@ -49,14 +49,14 @@ cdef class Lexeme:
|
|||
py.id = ptr.id
|
||||
py.length = ptr.length
|
||||
|
||||
py.sic = ptr.sic
|
||||
py.orth = ptr.orth
|
||||
py.norm1 = ptr.norm1
|
||||
py.norm2 = ptr.norm2
|
||||
py.shape = ptr.shape
|
||||
py.prefix = ptr.prefix
|
||||
py.suffix = ptr.suffix
|
||||
|
||||
py.sic_ = strings[ptr.sic]
|
||||
py.orth_ = strings[ptr.orth]
|
||||
py.norm1_ = strings[ptr.norm1]
|
||||
py.norm2_ = strings[ptr.norm2]
|
||||
py.shape_ = strings[ptr.shape]
|
||||
|
@ -78,8 +78,8 @@ cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
|
|||
return check_flag(lex, feat_name)
|
||||
elif feat_name == ID:
|
||||
return lex.id
|
||||
elif feat_name == SIC:
|
||||
return lex.sic
|
||||
elif feat_name == ORTH:
|
||||
return lex.orth
|
||||
elif feat_name == NORM1:
|
||||
return lex.norm1
|
||||
elif feat_name == NORM2:
|
||||
|
|
|
@ -16,7 +16,7 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
|||
cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store,
|
||||
const float* empty_vec) except -1:
|
||||
lex.length = props['length']
|
||||
lex.sic = string_store[props['sic']]
|
||||
lex.orth = string_store[props['orth']]
|
||||
lex.norm1 = string_store[props['norm1']]
|
||||
lex.norm2 = string_store[props['norm2']]
|
||||
lex.shape = string_store[props['shape']]
|
||||
|
@ -34,4 +34,4 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store
|
|||
cdef class Lexeme:
|
||||
"""A dummy docstring"""
|
||||
def __cinit__(self, int vec_size):
|
||||
self.vec = numpy.ndarray(shape=(vec_size,), dtype=numpy.float32)
|
||||
self.repvec = numpy.ndarray(shape=(vec_size,), dtype=numpy.float32)
|
||||
|
|
|
@ -11,7 +11,7 @@ cdef struct LexemeC:
|
|||
attr_t id
|
||||
attr_t length
|
||||
|
||||
attr_t sic
|
||||
attr_t orth
|
||||
attr_t norm1
|
||||
attr_t norm2
|
||||
attr_t shape
|
||||
|
|
|
@ -26,7 +26,7 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
|
|||
context[5] = 0
|
||||
context[6] = 0
|
||||
else:
|
||||
context[0] = token.lex.sic
|
||||
context[0] = token.lex.orth
|
||||
context[1] = token.lemma
|
||||
context[2] = token.tag
|
||||
context[3] = token.lex.cluster
|
||||
|
|
|
@ -50,7 +50,7 @@ cdef class Token:
|
|||
cdef readonly attr_t idx
|
||||
cdef readonly attr_t cluster
|
||||
cdef readonly attr_t length
|
||||
cdef readonly attr_t sic
|
||||
cdef readonly attr_t orth
|
||||
cdef readonly attr_t norm1
|
||||
cdef readonly attr_t norm2
|
||||
cdef readonly attr_t shape
|
||||
|
|
|
@ -7,7 +7,7 @@ from preshed.counter cimport PreshCounter
|
|||
from .vocab cimport EMPTY_LEXEME
|
||||
from .typedefs cimport attr_id_t, attr_t
|
||||
from .typedefs cimport LEMMA
|
||||
from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||
from .typedefs cimport ID, ORTH, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||
from .typedefs cimport POS, LEMMA
|
||||
|
||||
from unidecode import unidecode
|
||||
|
@ -42,8 +42,8 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
|
|||
return check_flag(lex, feat_name)
|
||||
elif feat_name == ID:
|
||||
return lex.id
|
||||
elif feat_name == SIC:
|
||||
return lex.sic
|
||||
elif feat_name == ORTH:
|
||||
return lex.orth
|
||||
elif feat_name == NORM1:
|
||||
return lex.norm1
|
||||
elif feat_name == NORM2:
|
||||
|
@ -97,8 +97,8 @@ cdef class Tokens:
|
|||
for i in range(self.length):
|
||||
if start is None:
|
||||
start = i
|
||||
if self.data[i].lex.sic == period or self.data[i].lex.sic == exclamation or \
|
||||
self.data[i].lex.sic == question:
|
||||
if self.data[i].lex.orth == period or self.data[i].lex.orth == exclamation or \
|
||||
self.data[i].lex.orth == question:
|
||||
spans.append((start, i+1))
|
||||
start = None
|
||||
if start is not None:
|
||||
|
@ -176,9 +176,9 @@ cdef class Tokens:
|
|||
>>> from spacy.en import English, attrs
|
||||
>>> nlp = English()
|
||||
>>> tokens = nlp(u'apple apple orange banana')
|
||||
>>> tokens.count_by(attrs.SIC)
|
||||
>>> tokens.count_by(attrs.ORTH)
|
||||
{12800L: 1, 11880L: 2, 7561L: 1}
|
||||
>>> tokens.to_array([attrs.SIC])
|
||||
>>> tokens.to_array([attrs.ORTH])
|
||||
array([[11880],
|
||||
[11880],
|
||||
[ 7561],
|
||||
|
@ -222,7 +222,7 @@ cdef class Token:
|
|||
self.idx = t.idx
|
||||
self.cluster = t.lex.cluster
|
||||
self.length = t.lex.length
|
||||
self.sic = t.lex.sic
|
||||
self.orth = t.lex.orth
|
||||
self.norm1 = t.lex.norm1
|
||||
self.norm2 = t.lex.norm2
|
||||
self.shape = t.lex.shape
|
||||
|
@ -270,14 +270,14 @@ cdef class Token:
|
|||
"""The unicode string of the word, with no whitespace padding."""
|
||||
def __get__(self):
|
||||
cdef const TokenC* t = &self._seq.data[self.i]
|
||||
if t.lex.sic == 0:
|
||||
if t.lex.orth == 0:
|
||||
return ''
|
||||
cdef unicode py_ustr = self._seq.vocab.strings[t.lex.sic]
|
||||
cdef unicode py_ustr = self._seq.vocab.strings[t.lex.orth]
|
||||
return py_ustr
|
||||
|
||||
property sic_:
|
||||
property orth_:
|
||||
def __get__(self):
|
||||
return self._seq.vocab.strings[self.sic]
|
||||
return self._seq.vocab.strings[self.orth]
|
||||
|
||||
property norm1_:
|
||||
def __get__(self):
|
||||
|
|
|
@ -89,7 +89,7 @@ cpdef enum attr_id_t:
|
|||
FLAG63
|
||||
|
||||
ID
|
||||
SIC
|
||||
ORTH
|
||||
NORM1
|
||||
NORM2
|
||||
SHAPE
|
||||
|
|
|
@ -10,8 +10,8 @@ def EN():
|
|||
|
||||
def test_possess(EN):
|
||||
tokens = EN("Mike's")
|
||||
assert EN.vocab.strings[tokens[0].sic] == b"Mike"
|
||||
assert EN.vocab.strings[tokens[1].sic] == b"'s"
|
||||
assert EN.vocab.strings[tokens[0].orth] == "Mike"
|
||||
assert EN.vocab.strings[tokens[1].orth] == "'s"
|
||||
assert len(tokens) == 2
|
||||
|
||||
|
||||
|
|
|
@ -33,17 +33,17 @@ def test_punct(EN):
|
|||
def test_digits(EN):
|
||||
tokens = EN('The year: 1984.')
|
||||
assert len(tokens) == 5
|
||||
assert tokens[0].sic == EN.vocab['The'].sic
|
||||
assert tokens[3].sic == EN.vocab['1984'].sic
|
||||
assert tokens[0].orth == EN.vocab['The'].orth
|
||||
assert tokens[3].orth == EN.vocab['1984'].orth
|
||||
|
||||
|
||||
def test_contraction(EN):
|
||||
tokens = EN("don't giggle")
|
||||
assert len(tokens) == 3
|
||||
assert tokens[1].sic == EN.vocab["n't"].sic
|
||||
assert tokens[1].orth == EN.vocab["n't"].orth
|
||||
tokens = EN("i said don't!")
|
||||
assert len(tokens) == 5
|
||||
assert tokens[4].sic == EN.vocab['!'].sic
|
||||
assert tokens[4].orth == EN.vocab['!'].orth
|
||||
|
||||
|
||||
def test_contraction_punct(EN):
|
||||
|
|
|
@ -11,24 +11,24 @@ def EN():
|
|||
|
||||
def test_neq(EN):
|
||||
addr = EN.vocab['Hello']
|
||||
assert EN.vocab['bye'].sic != addr.sic
|
||||
assert EN.vocab['bye'].orth != addr.orth
|
||||
|
||||
|
||||
def test_eq(EN):
|
||||
addr = EN.vocab['Hello']
|
||||
assert EN.vocab['Hello'].sic == addr.sic
|
||||
assert EN.vocab['Hello'].orth == addr.orth
|
||||
|
||||
|
||||
def test_case_neq(EN):
|
||||
addr = EN.vocab['Hello']
|
||||
assert EN.vocab['hello'].sic != addr.sic
|
||||
assert EN.vocab['hello'].orth != addr.orth
|
||||
|
||||
|
||||
def test_punct_neq(EN):
|
||||
addr = EN.vocab['Hello']
|
||||
assert EN.vocab['Hello,'].sic != addr.sic
|
||||
assert EN.vocab['Hello,'].orth != addr.orth
|
||||
|
||||
|
||||
def test_shape_attr(EN):
|
||||
example = EN.vocab['example']
|
||||
assert example.sic != example.shape
|
||||
assert example.orth != example.shape
|
||||
|
|
Loading…
Reference in New Issue
Block a user