* Move POS tag definitions to parts_of_speech.pxd

This commit is contained in:
Matthew Honnibal 2015-01-25 16:31:07 +11:00
parent 7431c133d8
commit 12b034e3ef
7 changed files with 15 additions and 50 deletions

View File

@@ -4,7 +4,7 @@ from cymem.cymem cimport Pool
 from .._ml cimport Model
 from ..strings cimport StringStore
 from ..structs cimport TokenC, LexemeC, Morphology, PosTag
-from ..typedefs cimport univ_tag_t
+from ..parts_of_speech cimport univ_pos_t
 from .lemmatizer import Lemmatizer
@@ -21,5 +21,5 @@ cdef class EnPosTagger:
 cdef readonly int n_tags
 cdef int set_morph(self, const int i, TokenC* tokens) except -1
-cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1
+cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1

View File

@@ -8,9 +8,9 @@ from libc.string cimport memset
 from cymem.cymem cimport Address
 from thinc.typedefs cimport atom_t, weight_t
-from ..typedefs cimport univ_tag_t
-from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
-from ..typedefs cimport X, PUNCT, EOL
+from ..parts_of_speech cimport univ_pos_t
+from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
+from ..parts_of_speech cimport X, PUNCT, EOL
 from ..typedefs cimport id_t
 from ..structs cimport TokenC, Morphology, LexemeC
 from ..tokens cimport Tokens
@@ -282,7 +282,7 @@ cdef class EnPosTagger:
 tokens[i].lemma = cached.lemma
 tokens[i].morph = cached.morph
-cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1:
+cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1:
 if self.lemmatizer is None:
 return lex.orth
 cdef unicode py_string = self.strings[lex.orth]

View File

@@ -1,6 +1,7 @@
 from libc.stdint cimport uint8_t, uint32_t
-from .typedefs cimport flags_t, attr_t, id_t, hash_t, univ_tag_t
+from .typedefs cimport flags_t, attr_t, id_t, hash_t
+from .parts_of_speech cimport univ_pos_t
 cdef struct LexemeC:
@@ -37,13 +38,13 @@ cdef struct Morphology:
 cdef struct PosTag:
 Morphology morph
 int id
-univ_tag_t pos
+univ_pos_t pos
 cdef struct TokenC:
 const LexemeC* lex
 Morphology morph
-univ_tag_t pos
+univ_pos_t pos
 int tag
 int idx
 int lemma

View File

@@ -6,7 +6,8 @@ cimport numpy
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport atom_t
-from .typedefs cimport flags_t, attr_id_t, attr_t, univ_tag_t
+from .typedefs cimport flags_t, attr_id_t, attr_t
+from .parts_of_speech cimport univ_pos_t
 from .structs cimport Morphology, TokenC, LexemeC
 from .vocab cimport Vocab
 from .strings cimport StringStore
@@ -66,7 +67,7 @@ cdef class Token:
 cdef readonly float sentiment
 cdef readonly attr_t flags
 cdef readonly attr_t lemma
-cdef readonly univ_tag_t pos
+cdef readonly univ_pos_t pos
 cdef readonly attr_t tag
 cdef readonly attr_t dep
 cdef readonly ndarray repvec

View File

@@ -8,7 +8,7 @@ from .typedefs cimport attr_id_t, attr_t
 from .typedefs cimport LEMMA
 from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from .typedefs cimport POS, LEMMA
-from .typedefs import UNIV_TAG_NAMES
+from .parts_of_speech import UNIV_POS_NAMES
 from unidecode import unidecode
@@ -325,7 +325,7 @@ cdef class Token:
 property pos_:
 def __get__(self):
-id_to_string = {id_: string for string, id_ in UNIV_TAG_NAMES.items()}
+id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}
 return id_to_string[self.pos]
 property tag_:

View File

@@ -2,25 +2,6 @@ from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
 from libc.stdint cimport uint8_t
-# Google universal tag set
-cpdef enum univ_tag_t:
-NO_TAG
-ADJ
-ADV
-ADP
-CONJ
-DET
-NOUN
-NUM
-PRON
-PRT
-VERB
-X
-PUNCT
-EOL
-N_UNIV_TAGS
 # Reserve 64 values for flag features
 cpdef enum attr_id_t:
 FLAG0

View File

@@ -1,19 +1 @@
 from __future__ import unicode_literals
-UNIV_TAG_NAMES = {
-"NO_TAG": NO_TAG,
-"ADJ": ADJ,
-"ADV": ADV,
-"ADP": ADP,
-"CONJ": CONJ,
-"DET": DET,
-"NOUN": NOUN,
-"NUM": NUM,
-"PRON": PRON,
-"PRT": PRT,
-"VERB": VERB,
-"X": X,
-"PUNCT": PUNCT,
-"EOL": EOL
-}