mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
* Move POS tag definitions to parts_of_speech.pxd
This commit is contained in:
parent
7431c133d8
commit
12b034e3ef
|
@ -4,7 +4,7 @@ from cymem.cymem cimport Pool
|
||||||
from .._ml cimport Model
|
from .._ml cimport Model
|
||||||
from ..strings cimport StringStore
|
from ..strings cimport StringStore
|
||||||
from ..structs cimport TokenC, LexemeC, Morphology, PosTag
|
from ..structs cimport TokenC, LexemeC, Morphology, PosTag
|
||||||
from ..typedefs cimport univ_tag_t
|
from ..parts_of_speech cimport univ_pos_t
|
||||||
from .lemmatizer import Lemmatizer
|
from .lemmatizer import Lemmatizer
|
||||||
|
|
||||||
|
|
||||||
|
@ -21,5 +21,5 @@ cdef class EnPosTagger:
|
||||||
cdef readonly int n_tags
|
cdef readonly int n_tags
|
||||||
|
|
||||||
cdef int set_morph(self, const int i, TokenC* tokens) except -1
|
cdef int set_morph(self, const int i, TokenC* tokens) except -1
|
||||||
cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1
|
cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1
|
||||||
|
|
||||||
|
|
|
@ -8,9 +8,9 @@ from libc.string cimport memset
|
||||||
from cymem.cymem cimport Address
|
from cymem.cymem cimport Address
|
||||||
from thinc.typedefs cimport atom_t, weight_t
|
from thinc.typedefs cimport atom_t, weight_t
|
||||||
|
|
||||||
from ..typedefs cimport univ_tag_t
|
from ..parts_of_speech cimport univ_pos_t
|
||||||
from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
|
from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
|
||||||
from ..typedefs cimport X, PUNCT, EOL
|
from ..parts_of_speech cimport X, PUNCT, EOL
|
||||||
from ..typedefs cimport id_t
|
from ..typedefs cimport id_t
|
||||||
from ..structs cimport TokenC, Morphology, LexemeC
|
from ..structs cimport TokenC, Morphology, LexemeC
|
||||||
from ..tokens cimport Tokens
|
from ..tokens cimport Tokens
|
||||||
|
@ -282,7 +282,7 @@ cdef class EnPosTagger:
|
||||||
tokens[i].lemma = cached.lemma
|
tokens[i].lemma = cached.lemma
|
||||||
tokens[i].morph = cached.morph
|
tokens[i].morph = cached.morph
|
||||||
|
|
||||||
cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1:
|
cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1:
|
||||||
if self.lemmatizer is None:
|
if self.lemmatizer is None:
|
||||||
return lex.orth
|
return lex.orth
|
||||||
cdef unicode py_string = self.strings[lex.orth]
|
cdef unicode py_string = self.strings[lex.orth]
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
from libc.stdint cimport uint8_t, uint32_t
|
from libc.stdint cimport uint8_t, uint32_t
|
||||||
|
|
||||||
from .typedefs cimport flags_t, attr_t, id_t, hash_t, univ_tag_t
|
from .typedefs cimport flags_t, attr_t, id_t, hash_t
|
||||||
|
from .parts_of_speech cimport univ_pos_t
|
||||||
|
|
||||||
|
|
||||||
cdef struct LexemeC:
|
cdef struct LexemeC:
|
||||||
|
@ -37,13 +38,13 @@ cdef struct Morphology:
|
||||||
cdef struct PosTag:
|
cdef struct PosTag:
|
||||||
Morphology morph
|
Morphology morph
|
||||||
int id
|
int id
|
||||||
univ_tag_t pos
|
univ_pos_t pos
|
||||||
|
|
||||||
|
|
||||||
cdef struct TokenC:
|
cdef struct TokenC:
|
||||||
const LexemeC* lex
|
const LexemeC* lex
|
||||||
Morphology morph
|
Morphology morph
|
||||||
univ_tag_t pos
|
univ_pos_t pos
|
||||||
int tag
|
int tag
|
||||||
int idx
|
int idx
|
||||||
int lemma
|
int lemma
|
||||||
|
|
|
@ -6,7 +6,8 @@ cimport numpy
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from thinc.typedefs cimport atom_t
|
from thinc.typedefs cimport atom_t
|
||||||
|
|
||||||
from .typedefs cimport flags_t, attr_id_t, attr_t, univ_tag_t
|
from .typedefs cimport flags_t, attr_id_t, attr_t
|
||||||
|
from .parts_of_speech cimport univ_pos_t
|
||||||
from .structs cimport Morphology, TokenC, LexemeC
|
from .structs cimport Morphology, TokenC, LexemeC
|
||||||
from .vocab cimport Vocab
|
from .vocab cimport Vocab
|
||||||
from .strings cimport StringStore
|
from .strings cimport StringStore
|
||||||
|
@ -66,7 +67,7 @@ cdef class Token:
|
||||||
cdef readonly float sentiment
|
cdef readonly float sentiment
|
||||||
cdef readonly attr_t flags
|
cdef readonly attr_t flags
|
||||||
cdef readonly attr_t lemma
|
cdef readonly attr_t lemma
|
||||||
cdef readonly univ_tag_t pos
|
cdef readonly univ_pos_t pos
|
||||||
cdef readonly attr_t tag
|
cdef readonly attr_t tag
|
||||||
cdef readonly attr_t dep
|
cdef readonly attr_t dep
|
||||||
cdef readonly ndarray repvec
|
cdef readonly ndarray repvec
|
||||||
|
|
|
@ -8,7 +8,7 @@ from .typedefs cimport attr_id_t, attr_t
|
||||||
from .typedefs cimport LEMMA
|
from .typedefs cimport LEMMA
|
||||||
from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||||
from .typedefs cimport POS, LEMMA
|
from .typedefs cimport POS, LEMMA
|
||||||
from .typedefs import UNIV_TAG_NAMES
|
from .parts_of_speech import UNIV_POS_NAMES
|
||||||
|
|
||||||
from unidecode import unidecode
|
from unidecode import unidecode
|
||||||
|
|
||||||
|
@ -325,7 +325,7 @@ cdef class Token:
|
||||||
|
|
||||||
property pos_:
|
property pos_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
id_to_string = {id_: string for string, id_ in UNIV_TAG_NAMES.items()}
|
id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}
|
||||||
return id_to_string[self.pos]
|
return id_to_string[self.pos]
|
||||||
|
|
||||||
property tag_:
|
property tag_:
|
||||||
|
|
|
@ -2,25 +2,6 @@ from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
|
||||||
from libc.stdint cimport uint8_t
|
from libc.stdint cimport uint8_t
|
||||||
|
|
||||||
|
|
||||||
# Google universal tag set
|
|
||||||
cpdef enum univ_tag_t:
|
|
||||||
NO_TAG
|
|
||||||
ADJ
|
|
||||||
ADV
|
|
||||||
ADP
|
|
||||||
CONJ
|
|
||||||
DET
|
|
||||||
NOUN
|
|
||||||
NUM
|
|
||||||
PRON
|
|
||||||
PRT
|
|
||||||
VERB
|
|
||||||
X
|
|
||||||
PUNCT
|
|
||||||
EOL
|
|
||||||
N_UNIV_TAGS
|
|
||||||
|
|
||||||
|
|
||||||
# Reserve 64 values for flag features
|
# Reserve 64 values for flag features
|
||||||
cpdef enum attr_id_t:
|
cpdef enum attr_id_t:
|
||||||
FLAG0
|
FLAG0
|
||||||
|
|
|
@ -1,19 +1 @@
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
|
|
||||||
UNIV_TAG_NAMES = {
|
|
||||||
"NO_TAG": NO_TAG,
|
|
||||||
"ADJ": ADJ,
|
|
||||||
"ADV": ADV,
|
|
||||||
"ADP": ADP,
|
|
||||||
"CONJ": CONJ,
|
|
||||||
"DET": DET,
|
|
||||||
"NOUN": NOUN,
|
|
||||||
"NUM": NUM,
|
|
||||||
"PRON": PRON,
|
|
||||||
"PRT": PRT,
|
|
||||||
"VERB": VERB,
|
|
||||||
"X": X,
|
|
||||||
"PUNCT": PUNCT,
|
|
||||||
"EOL": EOL
|
|
||||||
}
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user