* Rename sic to orth

2025-07-31 18:39:49 +03:00 · 2015-01-23 02:08:25 +11:00 · 2015-01-23 02:08:25 +11:00 · 5ed8b2b98f
commit 5ed8b2b98f
parent 93d4bd6c2e
13 changed files with 63 additions and 63 deletions
--- a/spacy/en/__init__.py
+++ b/spacy/en/__init__.py
@@ -19,7 +19,7 @@ def get_lex_props(string):
    return {
        'flags': get_flags(string),
        'length': len(string),
-        'sic': string,
+        'orth': string,
        'norm1': string.lower(),
        'norm2': string,
        'shape': orth.word_shape(string),
--- a/spacy/en/attrs.pxd
+++ b/spacy/en/attrs.pxd
@@ -1,6 +1,6 @@
 from ..attrs cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7
 from ..attrs cimport FLAG8, FLAG9, FLAG10
-from ..attrs cimport SIC as _SIC
+from ..attrs cimport ORTH as _ORTH
 from ..attrs cimport SHAPE as _SHAPE
 from ..attrs cimport NORM1 as _NORM1
 from ..attrs cimport NORM2 as _NORM2
@@ -24,7 +24,7 @@ cpdef enum:
    LIKE_NUM = FLAG9
    IS_STOP = FLAG10

-    SIC = _SIC
+    ORTH = _ORTH
    SHAPE = _SHAPE
    LOWER = _NORM1
    NORM2 = _NORM2
--- a/spacy/en/pos.pyx
+++ b/spacy/en/pos.pyx
@@ -70,7 +70,7 @@ cpdef enum misc_t:


 cpdef enum:
-    P2_sic
+    P2_orth
    P2_cluster
    P2_shape
    P2_prefix
@@ -78,7 +78,7 @@ cpdef enum:
    P2_pos
    P2_lemma

-    P1_sic
+    P1_orth
    P1_cluster
    P1_shape
    P1_prefix
@@ -86,7 +86,7 @@ cpdef enum:
    P1_pos
    P1_lemma

-    W_sic
+    W_orth
    W_cluster
    W_shape
    W_prefix
@@ -94,7 +94,7 @@ cpdef enum:
    W_pos
    W_lemma

-    N1_sic
+    N1_orth
    N1_cluster
    N1_shape
    N1_prefix
@@ -102,7 +102,7 @@ cpdef enum:
    N1_pos
    N1_lemma

-    N2_sic
+    N2_orth
    N2_cluster
    N2_shape
    N2_prefix
@@ -169,11 +169,11 @@ POS_TAGS = {


 POS_TEMPLATES = (
-    (W_sic,),
+    (W_orth,),
    (P1_lemma, P1_pos),
    (P2_lemma, P2_pos),
-    (N1_sic,),
-    (N2_sic,),
+    (N1_orth,),
+    (N2_orth,),

    (W_suffix,),
    (W_prefix,),
@@ -181,7 +181,7 @@ POS_TEMPLATES = (
    (P1_pos,),
    (P2_pos,),
    (P1_pos, P2_pos),
-    (P1_pos, W_sic),
+    (P1_pos, W_orth),
    (P1_suffix,),
    (N1_suffix,),

@@ -272,21 +272,21 @@ cdef class EnPosTagger:
    cdef int set_morph(self, const int i, TokenC* tokens) except -1:
        cdef const PosTag* tag = &self.tags[tokens[i].tag]
        tokens[i].pos = tag.pos
-        cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.sic)
+        cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth)
        if cached is NULL:
            cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
            cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
            cached.morph = tag.morph
-            self._morph_cache.set(tag.id, tokens[i].lex.sic, <void*>cached)
+            self._morph_cache.set(tag.id, tokens[i].lex.orth, <void*>cached)
        tokens[i].lemma = cached.lemma
        tokens[i].morph = cached.morph

    cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1:
        if self.lemmatizer is None:
-            return lex.sic
-        cdef unicode py_string = self.strings[lex.sic]
+            return lex.orth
+        cdef unicode py_string = self.strings[lex.orth]
        if pos != NOUN and pos != VERB and pos != ADJ:
-            return lex.sic
+            return lex.orth
        cdef set lemma_strings
        cdef unicode lemma_string
        lemma_strings = self.lemmatizer(py_string, pos)
@@ -301,29 +301,29 @@ cdef class EnPosTagger:
        cdef dict entries
        cdef dict props
        cdef int lemma
-        cdef id_t sic
+        cdef id_t orth
        cdef int pos
        for pos_str, entries in exc.items():
            pos = self.tag_names.index(pos_str)
            for form_str, props in entries.items():
                lemma_str = props.get('L', form_str)
-                sic = self.strings[form_str]
+                orth = self.strings[form_str]
                cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
                cached.lemma = self.strings[lemma_str]
                set_morph_from_dict(&cached.morph, props)
-                self._morph_cache.set(pos, sic, <void*>cached)
+                self._morph_cache.set(pos, orth, <void*>cached)
 

 cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1:
-    _fill_from_token(&context[P2_sic], &tokens[i-2])
-    _fill_from_token(&context[P1_sic], &tokens[i-1])
-    _fill_from_token(&context[W_sic], &tokens[i])
-    _fill_from_token(&context[N1_sic], &tokens[i+1])
-    _fill_from_token(&context[N2_sic], &tokens[i+2])
+    _fill_from_token(&context[P2_orth], &tokens[i-2])
+    _fill_from_token(&context[P1_orth], &tokens[i-1])
+    _fill_from_token(&context[W_orth], &tokens[i])
+    _fill_from_token(&context[N1_orth], &tokens[i+1])
+    _fill_from_token(&context[N2_orth], &tokens[i+2])


 cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
-    context[0] = t.lex.sic
+    context[0] = t.lex.orth
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -1,5 +1,5 @@
 from .typedefs cimport hash_t, flags_t, id_t, len_t, tag_t, attr_t, attr_id_t
-from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
+from .typedefs cimport ID, ORTH, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from .structs cimport LexemeC
 from .strings cimport StringStore

@@ -14,20 +14,20 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings,
                              const float* empty_vec) except -1
 
 cdef class Lexeme:
-    cdef readonly ndarray vec
+    cdef readonly ndarray repvec

    cdef readonly flags_t flags
    cdef readonly attr_t id
    cdef readonly attr_t length

-    cdef readonly attr_t sic
+    cdef readonly attr_t orth
    cdef readonly attr_t norm1
    cdef readonly attr_t norm2
    cdef readonly attr_t shape
    cdef readonly attr_t prefix
    cdef readonly attr_t suffix

-    cdef readonly unicode sic_
+    cdef readonly unicode orth_
    cdef readonly unicode norm1_
    cdef readonly unicode norm2_
    cdef readonly unicode shape_
@@ -49,14 +49,14 @@ cdef class Lexeme:
        py.id = ptr.id
        py.length = ptr.length

-        py.sic = ptr.sic
+        py.orth = ptr.orth
        py.norm1 = ptr.norm1
        py.norm2 = ptr.norm2
        py.shape = ptr.shape
        py.prefix = ptr.prefix
        py.suffix = ptr.suffix

-        py.sic_ = strings[ptr.sic]
+        py.orth_ = strings[ptr.orth]
        py.norm1_ = strings[ptr.norm1]
        py.norm2_ = strings[ptr.norm2]
        py.shape_ = strings[ptr.shape]
@@ -78,8 +78,8 @@ cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
        return check_flag(lex, feat_name)
    elif feat_name == ID:
        return lex.id
-    elif feat_name == SIC:
-        return lex.sic
+    elif feat_name == ORTH:
+        return lex.orth
    elif feat_name == NORM1:
        return lex.norm1
    elif feat_name == NORM2:
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -16,7 +16,7 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
 cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store,
                              const float* empty_vec) except -1:
    lex.length = props['length']
-    lex.sic = string_store[props['sic']]
+    lex.orth = string_store[props['orth']]
    lex.norm1 = string_store[props['norm1']] 
    lex.norm2 = string_store[props['norm2']] 
    lex.shape = string_store[props['shape']] 
@@ -34,4 +34,4 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store
 cdef class Lexeme:
    """A dummy docstring"""
    def __cinit__(self, int vec_size):
-        self.vec = numpy.ndarray(shape=(vec_size,), dtype=numpy.float32)
+        self.repvec = numpy.ndarray(shape=(vec_size,), dtype=numpy.float32)
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -11,7 +11,7 @@ cdef struct LexemeC:
    attr_t id
    attr_t length

-    attr_t sic
+    attr_t orth
    attr_t norm1
    attr_t norm2
    attr_t shape
--- a/spacy/syntax/_parse_features.pyx
+++ b/spacy/syntax/_parse_features.pyx
@@ -26,7 +26,7 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
        context[5] = 0
        context[6] = 0
    else:
-        context[0] = token.lex.sic
+        context[0] = token.lex.orth
        context[1] = token.lemma
        context[2] = token.tag
        context[3] = token.lex.cluster
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -50,7 +50,7 @@ cdef class Token:
    cdef readonly attr_t idx
    cdef readonly attr_t cluster
    cdef readonly attr_t length
-    cdef readonly attr_t sic
+    cdef readonly attr_t orth
    cdef readonly attr_t norm1
    cdef readonly attr_t norm2
    cdef readonly attr_t shape
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -7,7 +7,7 @@ from preshed.counter cimport PreshCounter
 from .vocab cimport EMPTY_LEXEME
 from .typedefs cimport attr_id_t, attr_t
 from .typedefs cimport LEMMA
-from .typedefs cimport ID, SIC, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
+from .typedefs cimport ID, ORTH, NORM1, NORM2, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from .typedefs cimport POS, LEMMA

 from unidecode import unidecode
@@ -42,8 +42,8 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
        return check_flag(lex, feat_name)
    elif feat_name == ID:
        return lex.id
-    elif feat_name == SIC:
-        return lex.sic
+    elif feat_name == ORTH:
+        return lex.orth
    elif feat_name == NORM1:
        return lex.norm1
    elif feat_name == NORM2:
@@ -97,8 +97,8 @@ cdef class Tokens:
        for i in range(self.length):
            if start is None:
                start = i
-            if self.data[i].lex.sic == period or self.data[i].lex.sic == exclamation or \
-              self.data[i].lex.sic == question:
+            if self.data[i].lex.orth == period or self.data[i].lex.orth == exclamation or \
+              self.data[i].lex.orth == question:
                spans.append((start, i+1))
                start = None
        if start is not None:
@@ -176,9 +176,9 @@ cdef class Tokens:
          >>> from spacy.en import English, attrs
          >>> nlp = English()
          >>> tokens = nlp(u'apple apple orange banana')
-          >>> tokens.count_by(attrs.SIC)
+          >>> tokens.count_by(attrs.ORTH)
          {12800L: 1, 11880L: 2, 7561L: 1}
-          >>> tokens.to_array([attrs.SIC])
+          >>> tokens.to_array([attrs.ORTH])
          array([[11880],
                 [11880],
                 [ 7561],
@@ -222,7 +222,7 @@ cdef class Token:
        self.idx = t.idx
        self.cluster = t.lex.cluster
        self.length = t.lex.length
-        self.sic = t.lex.sic
+        self.orth = t.lex.orth
        self.norm1 = t.lex.norm1
        self.norm2 = t.lex.norm2
        self.shape = t.lex.shape
@@ -270,14 +270,14 @@ cdef class Token:
        """The unicode string of the word, with no whitespace padding."""
        def __get__(self):
            cdef const TokenC* t = &self._seq.data[self.i]
-            if t.lex.sic == 0:
+            if t.lex.orth == 0:
                return ''
-            cdef unicode py_ustr = self._seq.vocab.strings[t.lex.sic]
+            cdef unicode py_ustr = self._seq.vocab.strings[t.lex.orth]
            return py_ustr

-    property sic_:
+    property orth_:
        def __get__(self):
-            return self._seq.vocab.strings[self.sic]
+            return self._seq.vocab.strings[self.orth]

    property norm1_:
        def __get__(self):
--- a/spacy/typedefs.pxd
+++ b/spacy/typedefs.pxd
@@ -89,7 +89,7 @@ cpdef enum attr_id_t:
    FLAG63

    ID
-    SIC
+    ORTH
    NORM1
    NORM2
    SHAPE
--- a/tests/test_contractions.py
+++ b/tests/test_contractions.py
@@ -10,8 +10,8 @@ def EN():

 def test_possess(EN):
    tokens = EN("Mike's")
-    assert EN.vocab.strings[tokens[0].sic] == b"Mike"
-    assert EN.vocab.strings[tokens[1].sic] == b"'s"
+    assert EN.vocab.strings[tokens[0].orth] == "Mike"
+    assert EN.vocab.strings[tokens[1].orth] == "'s"
    assert len(tokens) == 2


--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -33,17 +33,17 @@ def test_punct(EN):
 def test_digits(EN):
    tokens = EN('The year: 1984.')
    assert len(tokens) == 5
-    assert tokens[0].sic == EN.vocab['The'].sic
-    assert tokens[3].sic == EN.vocab['1984'].sic
+    assert tokens[0].orth == EN.vocab['The'].orth
+    assert tokens[3].orth == EN.vocab['1984'].orth


 def test_contraction(EN):
    tokens = EN("don't giggle")
    assert len(tokens) == 3
-    assert tokens[1].sic == EN.vocab["n't"].sic
+    assert tokens[1].orth == EN.vocab["n't"].orth
    tokens = EN("i said don't!")
    assert len(tokens) == 5
-    assert tokens[4].sic == EN.vocab['!'].sic
+    assert tokens[4].orth == EN.vocab['!'].orth


 def test_contraction_punct(EN):
--- a/tests/test_vocab.py
+++ b/tests/test_vocab.py
@@ -11,24 +11,24 @@ def EN():

 def test_neq(EN):
    addr = EN.vocab['Hello']
-    assert EN.vocab['bye'].sic != addr.sic
+    assert EN.vocab['bye'].orth != addr.orth


 def test_eq(EN):
    addr = EN.vocab['Hello']
-    assert EN.vocab['Hello'].sic == addr.sic
+    assert EN.vocab['Hello'].orth == addr.orth


 def test_case_neq(EN):
    addr = EN.vocab['Hello']
-    assert EN.vocab['hello'].sic != addr.sic
+    assert EN.vocab['hello'].orth != addr.orth


 def test_punct_neq(EN):
    addr = EN.vocab['Hello']
-    assert EN.vocab['Hello,'].sic != addr.sic
+    assert EN.vocab['Hello,'].orth != addr.orth


 def test_shape_attr(EN):
    example = EN.vocab['example']
-    assert example.sic != example.shape
+    assert example.orth != example.shape