* Move POS tag definitions to parts_of_speech.pxd

This commit is contained in:
Matthew Honnibal 2015-01-25 16:31:07 +11:00
parent 7431c133d8
commit 12b034e3ef
7 changed files with 15 additions and 50 deletions

View File

@ -4,7 +4,7 @@ from cymem.cymem cimport Pool
from .._ml cimport Model from .._ml cimport Model
from ..strings cimport StringStore from ..strings cimport StringStore
from ..structs cimport TokenC, LexemeC, Morphology, PosTag from ..structs cimport TokenC, LexemeC, Morphology, PosTag
from ..typedefs cimport univ_tag_t from ..parts_of_speech cimport univ_pos_t
from .lemmatizer import Lemmatizer from .lemmatizer import Lemmatizer
@ -21,5 +21,5 @@ cdef class EnPosTagger:
cdef readonly int n_tags cdef readonly int n_tags
cdef int set_morph(self, const int i, TokenC* tokens) except -1 cdef int set_morph(self, const int i, TokenC* tokens) except -1
cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1 cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1

View File

@ -8,9 +8,9 @@ from libc.string cimport memset
from cymem.cymem cimport Address from cymem.cymem cimport Address
from thinc.typedefs cimport atom_t, weight_t from thinc.typedefs cimport atom_t, weight_t
from ..typedefs cimport univ_tag_t from ..parts_of_speech cimport univ_pos_t
from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
from ..typedefs cimport X, PUNCT, EOL from ..parts_of_speech cimport X, PUNCT, EOL
from ..typedefs cimport id_t from ..typedefs cimport id_t
from ..structs cimport TokenC, Morphology, LexemeC from ..structs cimport TokenC, Morphology, LexemeC
from ..tokens cimport Tokens from ..tokens cimport Tokens
@ -282,7 +282,7 @@ cdef class EnPosTagger:
tokens[i].lemma = cached.lemma tokens[i].lemma = cached.lemma
tokens[i].morph = cached.morph tokens[i].morph = cached.morph
cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1: cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1:
if self.lemmatizer is None: if self.lemmatizer is None:
return lex.orth return lex.orth
cdef unicode py_string = self.strings[lex.orth] cdef unicode py_string = self.strings[lex.orth]

View File

@ -1,6 +1,7 @@
from libc.stdint cimport uint8_t, uint32_t from libc.stdint cimport uint8_t, uint32_t
from .typedefs cimport flags_t, attr_t, id_t, hash_t, univ_tag_t from .typedefs cimport flags_t, attr_t, id_t, hash_t
from .parts_of_speech cimport univ_pos_t
cdef struct LexemeC: cdef struct LexemeC:
@ -37,13 +38,13 @@ cdef struct Morphology:
cdef struct PosTag: cdef struct PosTag:
Morphology morph Morphology morph
int id int id
univ_tag_t pos univ_pos_t pos
cdef struct TokenC: cdef struct TokenC:
const LexemeC* lex const LexemeC* lex
Morphology morph Morphology morph
univ_tag_t pos univ_pos_t pos
int tag int tag
int idx int idx
int lemma int lemma

View File

@ -6,7 +6,8 @@ cimport numpy
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t from thinc.typedefs cimport atom_t
from .typedefs cimport flags_t, attr_id_t, attr_t, univ_tag_t from .typedefs cimport flags_t, attr_id_t, attr_t
from .parts_of_speech cimport univ_pos_t
from .structs cimport Morphology, TokenC, LexemeC from .structs cimport Morphology, TokenC, LexemeC
from .vocab cimport Vocab from .vocab cimport Vocab
from .strings cimport StringStore from .strings cimport StringStore
@ -66,7 +67,7 @@ cdef class Token:
cdef readonly float sentiment cdef readonly float sentiment
cdef readonly attr_t flags cdef readonly attr_t flags
cdef readonly attr_t lemma cdef readonly attr_t lemma
cdef readonly univ_tag_t pos cdef readonly univ_pos_t pos
cdef readonly attr_t tag cdef readonly attr_t tag
cdef readonly attr_t dep cdef readonly attr_t dep
cdef readonly ndarray repvec cdef readonly ndarray repvec

View File

@ -8,7 +8,7 @@ from .typedefs cimport attr_id_t, attr_t
from .typedefs cimport LEMMA from .typedefs cimport LEMMA
from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from .typedefs cimport POS, LEMMA from .typedefs cimport POS, LEMMA
from .typedefs import UNIV_TAG_NAMES from .parts_of_speech import UNIV_POS_NAMES
from unidecode import unidecode from unidecode import unidecode
@ -325,7 +325,7 @@ cdef class Token:
property pos_: property pos_:
def __get__(self): def __get__(self):
id_to_string = {id_: string for string, id_ in UNIV_TAG_NAMES.items()} id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}
return id_to_string[self.pos] return id_to_string[self.pos]
property tag_: property tag_:

View File

@ -2,25 +2,6 @@ from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
from libc.stdint cimport uint8_t from libc.stdint cimport uint8_t
# Google universal tag set
cpdef enum univ_tag_t:
NO_TAG
ADJ
ADV
ADP
CONJ
DET
NOUN
NUM
PRON
PRT
VERB
X
PUNCT
EOL
N_UNIV_TAGS
# Reserve 64 values for flag features # Reserve 64 values for flag features
cpdef enum attr_id_t: cpdef enum attr_id_t:
FLAG0 FLAG0

View File

@ -1,19 +1 @@
from __future__ import unicode_literals
UNIV_TAG_NAMES = {
"NO_TAG": NO_TAG,
"ADJ": ADJ,
"ADV": ADV,
"ADP": ADP,
"CONJ": CONJ,
"DET": DET,
"NOUN": NOUN,
"NUM": NUM,
"PRON": PRON,
"PRT": PRT,
"VERB": VERB,
"X": X,
"PUNCT": PUNCT,
"EOL": EOL
}