mirror of https://github.com/explosion/spaCy.git
* Add pos_tag method to Language

parent 99b5cefa88
commit fcd9490d56
@@ -6,6 +6,7 @@ from cymem.cymem cimport Pool
 from .typedefs cimport hash_t
 from .tokens cimport Tokens
 from .lexeme cimport Lexeme
+from .pos cimport Tagger as PosTagger
 from .utf8string cimport StringStore
 
 
@@ -40,11 +41,14 @@ cdef class Language:
     cdef PreshMap _specials
     cpdef readonly Lexicon lexicon
 
+    cpdef readonly PosTagger pos_tagger
+
     cdef object _prefix_re
     cdef object _suffix_re
     cdef object _infix_re
 
     cpdef Tokens tokenize(self, unicode text)
+    cpdef Tokens pos_tag(self, Tokens t)
 
     cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
     cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,
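These first two hunks look like the declaration side of the change (cdef/cpdef signatures, presumably a .pxd file); they expose the new state and method to Python as well as to Cython: a readonly attribute can be read but not reassigned from Python, and a cpdef method gets both a Python entry point and a fast C-level call path. A minimal standalone sketch of those modifiers (hypothetical class, not part of the commit):

    cdef class Example:
        # Readable as example.tagger from Python; assignable only from Cython code.
        cdef readonly object tagger

        # Callable from Python, and as a direct C call from other Cython code.
        cpdef int tag_count(self, int n):
            return n * 2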
@@ -23,6 +23,8 @@ from . import util
 from .util import read_lang_data
 from .tokens import Tokens
 
+from .pos cimport Tagger as PosTagger
+
 
 cdef class Language:
     def __init__(self, name):
@@ -39,6 +41,10 @@ cdef class Language:
         self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
         self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
         self._load_special_tokenization(rules)
+        if path.exists(path.join(util.DATA_DIR, name, 'pos')):
+            self.pos_tagger = PosTagger(path.join(util.DATA_DIR, name, 'pos'))
+        else:
+            self.pos_tagger = None
 
     cpdef Tokens tokenize(self, unicode string):
         """Tokenize a string.
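This hunk makes the tagger optional: the constructor only builds a PosTagger when a trained 'pos' directory exists under the language's data directory, and leaves pos_tagger as None otherwise, so pos_tag (below) can degrade to a no-op instead of raising. A sketch of the same load-if-present pattern in plain Python, with DATA_DIR and tagger_cls as stand-ins for spaCy's actual names:

    from os import path

    DATA_DIR = '/usr/local/share/spacy'  # assumption: placeholder for util.DATA_DIR

    def maybe_load_tagger(name, tagger_cls):
        # Load a tagger only when its model files are present; callers
        # must tolerate None (pos_tag returns its input unchanged then).
        model_dir = path.join(DATA_DIR, name, 'pos')
        return tagger_cls(model_dir) if path.exists(model_dir) else None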
@@ -87,6 +93,16 @@ cdef class Language:
             self._tokenize(tokens, &span, start, i)
         return tokens
 
+    cpdef Tokens pos_tag(self, Tokens t):
+        if self.pos_tagger is None:
+            return t
+        cdef int i
+        t.pos[-1] = self.pos_tagger.encode_pos('EOL')
+        t.pos[-2] = self.pos_tagger.encode_pos('EOL')
+        for i in range(t.length):
+            t.pos[i] = self.pos_tagger.predict(i, t, t.pos[i-1], t.pos[i-2])
+        return t
+
     cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
         cdef vector[Lexeme*] prefixes
         cdef vector[Lexeme*] suffixes
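The two writes to t.pos[-1] and t.pos[-2] before the loop seed the tag history: at i == 0 the feature lookups t.pos[i-1] and t.pos[i-2] resolve to indices -1 and -2, which, assuming t.pos supports Python-style negative indexing, wrap to the end of the array where the EOL sentinels were just written; those slots are only overwritten at the very end of the left-to-right pass. A minimal pure-Python sketch of the same greedy pattern, with predict standing in for the trained tagger (assumes length >= 2):

    def greedy_tag(length, predict, EOL=0):
        # Tag positions left to right, feeding the two previous
        # predictions back in as features.
        pos = [None] * length   # unset tags; never read thanks to the sentinels
        pos[-1] = EOL           # history for i == 0 (reads pos[-1] and pos[-2])
        pos[-2] = EOL           # pos[-1] also serves i == 1 (reads pos[0], pos[-1])
        for i in range(length):
            pos[i] = predict(i, pos[i - 1], pos[i - 2])
        return pos

    # Hypothetical toy predictor: alternates two tags, ignoring real features.
    print(greedy_tag(4, lambda i, prev1, prev2: i % 2))  # -> [0, 1, 0, 1]

With a real model the call would presumably look like tokens = EN.pos_tag(EN.tokenize(text)), though the exact entry point is an assumption here, not something shown in this commit.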