Mirror of https://github.com/explosion/spaCy.git, synced 2025-06-02 20:23:12 +03:00
* Messily fix morphology and POS tags on special tokens.

commit 9976aa976e
parent 81d878beb2
@@ -21,7 +21,8 @@ from .tokens import Tokens
 
 
 cdef class Tokenizer:
-    def __init__(self, Vocab vocab, rules, prefix_re, suffix_re, infix_re):
+    def __init__(self, Vocab vocab, rules, prefix_re, suffix_re, infix_re,
+                 pos_tags, tag_names):
         self.mem = Pool()
         self._cache = PreshMap()
         self._specials = PreshMap()
@@ -29,10 +30,10 @@ cdef class Tokenizer:
         self._suffix_re = suffix_re
         self._infix_re = infix_re
         self.vocab = vocab
-        self._load_special_tokenization(rules)
+        self._load_special_tokenization(rules, pos_tags, tag_names)
 
     @classmethod
-    def from_dir(cls, Vocab vocab, object data_dir):
+    def from_dir(cls, Vocab vocab, object data_dir, object pos_tags, object tag_names):
         if not path.exists(data_dir):
             raise IOError("Directory %s not found -- cannot load Tokenizer." % data_dir)
         if not path.isdir(data_dir):
@@ -41,7 +42,7 @@ cdef class Tokenizer:
         assert path.exists(data_dir) and path.isdir(data_dir)
         rules, prefix_re, suffix_re, infix_re = util.read_lang_data(data_dir)
         return cls(vocab, rules, re.compile(prefix_re), re.compile(suffix_re),
-                   re.compile(infix_re))
+                   re.compile(infix_re), pos_tags, tag_names)
 
     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
@@ -234,7 +235,7 @@ cdef class Tokenizer:
         match = self._suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0
 
-    def _load_special_tokenization(self, object rules):
+    def _load_special_tokenization(self, object rules, object tag_map, object tag_names):
         '''Add a special-case tokenization rule.
         '''
         cdef int i
@@ -255,6 +256,13 @@ cdef class Tokenizer:
                 tokens[i].lex = <Lexeme*>self.vocab.get(self.vocab.mem, &string)
                 if lemma:
                     tokens[i].lemma = self.vocab.strings[lemma]
+                if 'pos' in props:
+                    # TODO: Clean up this mess...
+                    tokens[i].fine_pos = tag_names.index(props['pos'])
+                    tokens[i].pos = tag_map[props['pos']][0]
+                    # These are defaults, which can be over-ridden by the
+                    # token-specific props.
+                    set_morph_from_dict(&tokens[i].morph, tag_map[props['pos']][1])
                 set_morph_from_dict(&tokens[i].morph, props)
             cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
             cached.length = len(substrings)
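For readers following the change, here is a small, hedged sketch of the data shapes the new pos_tags/tag_map and tag_names arguments appear to assume, inferred only from how the diff indexes them (tag_names.index(props['pos']), tag_map[props['pos']][0] and [1]). The concrete tags, the 'F'/'L' rule keys, and all values below are illustrative placeholders, not spaCy's actual language data.

# Illustrative only: plausible shapes for the new arguments, not spaCy's real data.

# tag_names: ordered list of fine-grained tag strings; the diff assigns
# tokens[i].fine_pos = tag_names.index(props['pos']), so list position acts as the tag id.
tag_names = ['NN', 'RB', 'VBZ']

# pos_tags / tag_map: fine tag -> (coarse POS id, default morphology dict).
# The diff reads tag_map[props['pos']][0] for the coarse tag and passes
# tag_map[props['pos']][1] to set_morph_from_dict as per-tag defaults.
tag_map = {
    'NN':  (0, {'number': 'sing'}),
    'RB':  (1, {}),
    'VBZ': (2, {'person': 3, 'tense': 'pres'}),
}

# A special-case rule of the kind _load_special_tokenization consumes: the
# surface string maps to per-token property dicts, which may now carry 'pos'.
# (The 'F'/'L' key names are assumptions for illustration.)
rules = {
    "isn't": [
        {'F': 'is',  'L': 'be',  'pos': 'VBZ'},
        {'F': "n't", 'L': 'not', 'pos': 'RB'},
    ],
}

# With data of this shape, construction now threads the tag information through, e.g.:
# tokenizer = Tokenizer.from_dir(vocab, data_dir, tag_map, tag_names)

Note that set_morph_from_dict(&tokens[i].morph, props) still runs after the tag_map defaults are applied, so morphology given directly in a rule's props overrides the per-tag defaults.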