* Add pos_tag method to Language

This commit is contained in:
Matthew Honnibal 2014-11-02 14:21:43 +11:00
parent 99b5cefa88
commit fcd9490d56
2 changed files with 20 additions and 0 deletions

View File

@ -6,6 +6,7 @@ from cymem.cymem cimport Pool
from .typedefs cimport hash_t from .typedefs cimport hash_t
from .tokens cimport Tokens from .tokens cimport Tokens
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .pos cimport Tagger as PosTagger
from .utf8string cimport StringStore from .utf8string cimport StringStore
@ -40,11 +41,14 @@ cdef class Language:
cdef PreshMap _specials cdef PreshMap _specials
cpdef readonly Lexicon lexicon cpdef readonly Lexicon lexicon
cpdef readonly PosTagger pos_tagger
cdef object _prefix_re cdef object _prefix_re
cdef object _suffix_re cdef object _suffix_re
cdef object _infix_re cdef object _infix_re
cpdef Tokens tokenize(self, unicode text) cpdef Tokens tokenize(self, unicode text)
cpdef Tokens pos_tag(self, Tokens t)
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1 cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes, cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,

View File

@ -23,6 +23,8 @@ from . import util
from .util import read_lang_data from .util import read_lang_data
from .tokens import Tokens from .tokens import Tokens
from .pos cimport Tagger as PosTagger
cdef class Language: cdef class Language:
def __init__(self, name): def __init__(self, name):
@ -39,6 +41,10 @@ cdef class Language:
self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes')) self.lexicon.load(path.join(util.DATA_DIR, name, 'lexemes'))
self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings')) self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
self._load_special_tokenization(rules) self._load_special_tokenization(rules)
if path.exists(path.join(util.DATA_DIR, name, 'pos')):
self.pos_tagger = PosTagger(path.join(util.DATA_DIR, name, 'pos'))
else:
self.pos_tagger = None
cpdef Tokens tokenize(self, unicode string): cpdef Tokens tokenize(self, unicode string):
"""Tokenize a string. """Tokenize a string.
@ -87,6 +93,16 @@ cdef class Language:
self._tokenize(tokens, &span, start, i) self._tokenize(tokens, &span, start, i)
return tokens return tokens
cpdef Tokens pos_tag(self, Tokens t):
    """Fill in the part-of-speech slots of ``t`` in place and return it.

    When no tagger is loaded (``self.pos_tagger`` is None) ``t`` is
    returned untouched.  Otherwise every index in ``range(t.length)`` is
    tagged left to right: the tagger's ``predict`` receives the index, the
    tokens, and the values stored at the two preceding ``t.pos`` slots.
    """
    cdef int idx
    if self.pos_tagger is not None:
        # Seed the two slots before index 0 with the encoded 'EOL' tag so
        # the first predictions have a "previous tag" context to read.
        # NOTE(review): assumes t.pos has at least two writable slots
        # before index 0 -- confirm against the Tokens implementation.
        t.pos[-1] = self.pos_tagger.encode_pos('EOL')
        t.pos[-2] = self.pos_tagger.encode_pos('EOL')
        for idx in range(t.length):
            # Each prediction is stored immediately, so it is visible as
            # context to the next iteration.
            t.pos[idx] = self.pos_tagger.predict(idx, t, t.pos[idx-1], t.pos[idx-2])
    return t
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1: cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
cdef vector[Lexeme*] prefixes cdef vector[Lexeme*] prefixes
cdef vector[Lexeme*] suffixes cdef vector[Lexeme*] suffixes