* Move POS tag definitions to parts_of_speech.pxd

2025-11-07 19:37:38 +03:00 · 2015-01-25 16:31:07 +11:00 · 2015-01-25 16:31:07 +11:00 · 12b034e3ef
commit 12b034e3ef
parent 7431c133d8
7 changed files with 15 additions and 50 deletions
--- a/spacy/en/pos.pxd
+++ b/spacy/en/pos.pxd
@@ -4,7 +4,7 @@ from cymem.cymem cimport Pool
 from .._ml cimport Model
 from ..strings cimport StringStore
 from ..structs cimport TokenC, LexemeC, Morphology, PosTag
-from ..typedefs cimport univ_tag_t
+from ..parts_of_speech cimport univ_pos_t
 from .lemmatizer import Lemmatizer


@@ -21,5 +21,5 @@ cdef class EnPosTagger:
    cdef readonly int n_tags

    cdef int set_morph(self, const int i, TokenC* tokens) except -1
-    cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1
+    cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1

--- a/spacy/en/pos.pyx
+++ b/spacy/en/pos.pyx
@@ -8,9 +8,9 @@ from libc.string cimport memset
 from cymem.cymem cimport Address
 from thinc.typedefs cimport atom_t, weight_t

-from ..typedefs cimport univ_tag_t
-from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
-from ..typedefs cimport X, PUNCT, EOL
+from ..parts_of_speech cimport univ_pos_t
+from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
+from ..parts_of_speech cimport X, PUNCT, EOL
 from ..typedefs cimport id_t
 from ..structs cimport TokenC, Morphology, LexemeC
 from ..tokens cimport Tokens
@@ -282,7 +282,7 @@ cdef class EnPosTagger:
        tokens[i].lemma = cached.lemma
        tokens[i].morph = cached.morph

-    cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1:
+    cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1:
        if self.lemmatizer is None:
            return lex.orth
        cdef unicode py_string = self.strings[lex.orth]
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -1,6 +1,7 @@
 from libc.stdint cimport uint8_t, uint32_t

-from .typedefs cimport flags_t, attr_t, id_t, hash_t, univ_tag_t
+from .typedefs cimport flags_t, attr_t, id_t, hash_t
+from .parts_of_speech cimport univ_pos_t


 cdef struct LexemeC:
@@ -37,13 +38,13 @@ cdef struct Morphology:
 cdef struct PosTag:
    Morphology morph
    int id
-    univ_tag_t pos
+    univ_pos_t pos


 cdef struct TokenC:
    const LexemeC* lex
    Morphology morph
-    univ_tag_t pos
+    univ_pos_t pos
    int tag
    int idx
    int lemma
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -6,7 +6,8 @@ cimport numpy
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport atom_t

-from .typedefs cimport flags_t, attr_id_t, attr_t, univ_tag_t
+from .typedefs cimport flags_t, attr_id_t, attr_t
+from .parts_of_speech cimport univ_pos_t
 from .structs cimport Morphology, TokenC, LexemeC
 from .vocab cimport Vocab
 from .strings cimport StringStore
@@ -66,7 +67,7 @@ cdef class Token:
    cdef readonly float sentiment
    cdef readonly attr_t flags
    cdef readonly attr_t lemma
-    cdef readonly univ_tag_t pos
+    cdef readonly univ_pos_t pos
    cdef readonly attr_t tag
    cdef readonly attr_t dep
    cdef readonly ndarray repvec
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -8,7 +8,7 @@ from .typedefs cimport attr_id_t, attr_t
 from .typedefs cimport LEMMA
 from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from .typedefs cimport POS, LEMMA
-from .typedefs import UNIV_TAG_NAMES
+from .parts_of_speech import UNIV_POS_NAMES

 from unidecode import unidecode

@@ -325,7 +325,7 @@ cdef class Token:

    property pos_:
        def __get__(self):
-            id_to_string = {id_: string for string, id_ in UNIV_TAG_NAMES.items()}
+            id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}
            return id_to_string[self.pos]

    property tag_:
--- a/spacy/typedefs.pxd
+++ b/spacy/typedefs.pxd
@@ -2,25 +2,6 @@ from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
 from libc.stdint cimport uint8_t


-# Google universal tag set
-cpdef enum univ_tag_t:
-    NO_TAG
-    ADJ
-    ADV
-    ADP
-    CONJ
-    DET
-    NOUN
-    NUM
-    PRON
-    PRT
-    VERB
-    X
-    PUNCT
-    EOL
-    N_UNIV_TAGS
-
-
 # Reserve 64 values for flag features
 cpdef enum attr_id_t:
    FLAG0
--- a/spacy/typedefs.pyx
+++ b/spacy/typedefs.pyx
@@ -1,19 +1 @@
-from __future__ import unicode_literals

-
-UNIV_TAG_NAMES = {
-    "NO_TAG": NO_TAG,
-    "ADJ": ADJ,
-    "ADV": ADV,
-    "ADP": ADP,
-    "CONJ": CONJ,
-    "DET": DET,
-    "NOUN": NOUN,
-    "NUM": NUM,
-    "PRON": PRON,
-    "PRT": PRT,
-    "VERB": VERB,
-    "X": X,
-    "PUNCT": PUNCT,
-    "EOL": EOL
-}