* Work on fixing special-cases, reading them in as JSON objects so that they can specify lemmas

This commit is contained in:
Matthew Honnibal 2014-12-09 14:48:01 +11:00
parent cda9ea9a4a
commit 302e09018b
8 changed files with 136 additions and 70 deletions
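The special cases are now read from a per-language specials.json file instead of the old (chunk, tokens) pairs, so each substring can carry its own lemma and morphological features. A minimal sketch of the structure ujson.load is expected to return, assuming only the "F" (surface form) and "L" (optional lemma) keys consumed by _load_special_tokenization below; the concrete entries are illustrative, chosen to match the updated tests:

rules = {
    "ain't": [{"F": "ai", "L": "be"}, {"F": "n't", "L": "not"}],
    "we'll": [{"F": "we"}, {"F": "'ll", "L": "will"}],
}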

View File

@@ -10,6 +10,7 @@ cpdef enum en_person_t:
FIRST
SECOND
THIRD
NON_THIRD
cpdef enum en_number_t:
@@ -17,14 +18,22 @@ cpdef enum en_number_t:
SINGULAR
PLURAL
MASS
CARDINAL
ORDINAL
cpdef enum en_gender_t:
NO_GENDER
MASCULINE
FEMININE
NEUTER
cpdef enum en_case_t:
NO_CASE
NOMINATIVE
GENITIVE
ACCUSATIVE
REFLEXIVE
DEMONYM
cpdef enum en_tenspect_t:
@@ -37,23 +46,12 @@ cpdef enum en_tenspect_t:
MODAL
cpdef enum en_case_t:
NO_CASE
NOMINATIVE
ACCUSATIVE
GENITIVE
DEMONYM
cpdef enum misc_t:
NO_MISC
COMPARATIVE
SUPERLATIVE
RELATIVE
NAME
URL
EMAIL
EMOTICON
# Flags

View File

@@ -38,6 +38,8 @@ import orth
from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
from .tagger cimport X, PUNCT, EOL
from .tokens cimport Morphology
POS_TAGS = {
'NULL': (NO_TAG, {}),
@@ -152,7 +154,8 @@ cdef class English(Language):
for i in range(tokens.length):
fill_pos_context(context, i, t)
t[i].pos = self.pos_tagger.predict(context)
#self.morphalyser.set_token(&t[i])
_merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
def train_pos(self, Tokens tokens, golds):
cdef int i
@@ -162,11 +165,27 @@ cdef class English(Language):
for i in range(tokens.length):
fill_pos_context(context, i, t)
t[i].pos = self.pos_tagger.predict(context, [golds[i]])
t[i].morph = self.pos_tagger.tags[t[i].pos].morph
#self.analyse_morph(&t[i].lemma, &t[i].morph, t[i].pos, t[i].lex)
_merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
c += t[i].pos == golds[i]
return c
cdef int _merge_morph(Morphology* tok_morph, const Morphology* pos_morph) except -1:
if tok_morph.number == 0:
tok_morph.number = pos_morph.number
if tok_morph.tenspect == 0:
tok_morph.tenspect = pos_morph.tenspect
if tok_morph.mood == 0:
tok_morph.mood = pos_morph.mood
if tok_morph.gender == 0:
tok_morph.gender = pos_morph.gender
if tok_morph.person == 0:
tok_morph.person = pos_morph.person
if tok_morph.case == 0:
tok_morph.case = pos_morph.case
if tok_morph.misc == 0:
tok_morph.misc = pos_morph.misc
EN = English('en')
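For reference, a rough pure-Python analogue of _merge_morph above (an illustration, not the committed Cython): a field value of 0 is treated as unset, so morphology already set on the token, e.g. by a special-case rule, takes precedence over the POS tag's defaults.

def merge_morph(tok_morph, pos_morph):
    # Only fields still at 0 (unset) inherit the value from the POS tag.
    for field in ('number', 'tenspect', 'mood', 'gender', 'person', 'case', 'misc'):
        if getattr(tok_morph, field) == 0:
            setattr(tok_morph, field, getattr(pos_morph, field))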

View File

@@ -9,7 +9,7 @@ from .typedefs cimport hash_t
from .tokens cimport Tokens, TokenC
from .lexeme cimport Lexeme
from .tagger cimport Tagger
from .tagger cimport PosTag
from .tagger cimport univ_tag_t
from .utf8string cimport StringStore, UniStr
@@ -38,11 +38,12 @@ cdef class Language:
cdef object _suffix_re
cdef object _infix_re
cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
cpdef Tokens tokens_from_list(self, list strings)
cpdef Tokens tokenize(self, unicode text)
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
vector[Lexeme*] *suffixes) except NULL

View File

@@ -28,6 +28,7 @@ from .util import read_lang_data
from .tokens import Tokens
from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS
from .tokens cimport Morphology
cdef class Language:
@@ -53,27 +54,27 @@ cdef class Language:
if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1:
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
if self.lemmatizer is None:
return lex.sic
if pos.pos != NOUN and pos.pos != VERB and pos.pos != ADJ:
if pos != NOUN and pos != VERB and pos != ADJ:
return lex.sic
cdef int lemma = <int><size_t>self._lemmas.get(pos.pos, lex.sic)
cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
if lemma != 0:
return lemma
cdef bytes py_string = self.lexicon.strings[lex.sic]
cdef set lemma_strings
cdef bytes lemma_string
if pos.pos == NOUN:
if pos == NOUN:
lemma_strings = self.lemmatizer.noun(py_string)
elif pos.pos == VERB:
elif pos == VERB:
lemma_strings = self.lemmatizer.verb(py_string)
else:
assert pos.pos == ADJ
assert pos == ADJ
lemma_strings = self.lemmatizer.adj(py_string)
lemma_string = sorted(lemma_strings)[0]
lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i
self._lemmas.set(pos.pos, lex.sic, <void*>lemma)
self._lemmas.set(pos, lex.sic, <void*>lemma)
return lemma
cpdef Tokens tokens_from_list(self, list strings):
@@ -111,6 +112,7 @@ cdef class Language:
return tokens
cdef int i = 0
cdef int start = 0
cdef bint cache_hit
cdef Py_UNICODE* chars = string
cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
cdef UniStr span
@@ -118,10 +120,8 @@
if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
if start < i:
slice_unicode(&span, chars, start, i)
lexemes = <const Lexeme* const*>self._cache.get(span.key)
if lexemes != NULL:
tokens.extend(start, lexemes, 0)
else:
cache_hit = self._try_cache(start, span.key, tokens)
if not cache_hit:
self._tokenize(tokens, &span, start, i)
in_ws = not in_ws
start = i
@@ -130,13 +130,32 @@
i += 1
if start < i:
slice_unicode(&span, chars, start, i)
lexemes = <const Lexeme* const*>self._cache.get(span.key)
if lexemes != NULL:
tokens.extend(start, lexemes, 0)
else:
cache_hit = self._try_cache(start, span.key, tokens)
if not cache_hit:
self._tokenize(tokens, &span, start, i)
return tokens
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
cdef int i
specials = <TokenC*>self._specials.get(key)
if specials != NULL:
i = 0
while specials[i].lex != NULL:
tokens.push_back(idx, specials[i].lex)
tokens.data[tokens.length - 1].pos = specials[i].pos
tokens.data[tokens.length - 1].morph = specials[i].morph
tokens.data[tokens.length - 1].lemma = specials[i].lemma
tokens.data[tokens.length - 1].sense = specials[i].sense
i += 1
return True
else:
cached = <const Lexeme* const*>self._cache.get(key)
if cached != NULL:
tokens.extend(idx, cached, 0)
return True
else:
return False
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
cdef vector[Lexeme*] prefixes
cdef vector[Lexeme*] suffixes
@@ -190,10 +209,10 @@ cdef class Language:
break
return string
cdef int _attach_tokens(self, Tokens tokens,
int idx, UniStr* string,
cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
vector[const Lexeme*] *prefixes,
vector[const Lexeme*] *suffixes) except -1:
cdef bint cache_hit
cdef int split
cdef const Lexeme* const* lexemes
cdef Lexeme* lexeme
@@ -201,10 +220,9 @@ cdef class Language:
if prefixes.size():
idx = tokens.extend(idx, prefixes.data(), prefixes.size())
if string.n != 0:
lexemes = <const Lexeme* const*>self._cache.get(string.key)
if lexemes != NULL:
idx = tokens.extend(idx, lexemes, 0)
cache_hit = self._try_cache(idx, string.key, tokens)
if cache_hit:
idx = tokens.data[tokens.length - 1].idx + 1
else:
split = self._find_infix(string.chars, string.n)
if split == 0 or split == -1:
@@ -247,30 +265,42 @@ cdef class Language:
match = self._suffix_re.search(string)
return (match.end() - match.start()) if match is not None else 0
def _load_special_tokenization(self, token_rules):
'''Load special-case tokenization rules.
Loads special-case tokenization rules into the Language._cache,
read from data/<lang>/tokenization. The special cases are loaded before
any language data is tokenized, giving them priority. For instance,
the English tokenization rules map "ain't" to ["are", "not"].
Args:
token_rules (list): A list of (chunk, tokens) pairs, where chunk is
a string and tokens is a list of strings.
def _load_special_tokenization(self, object rules):
'''Add a special-case tokenization rule.
'''
cdef int i
cdef unicode chunk
cdef list substrings
cdef unicode form
cdef unicode lemma
cdef dict props
cdef Lexeme** lexemes
cdef hash_t hashed
cdef UniStr string
for uni_string, substrings in token_rules:
lexemes = <Lexeme**>self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
for i, substring in enumerate(substrings):
slice_unicode(&string, substring, 0, len(substring))
lexemes[i] = <Lexeme*>self.lexicon.get(self.lexicon.mem, &string)
lexemes[i + 1] = NULL
slice_unicode(&string, uni_string, 0, len(uni_string))
self._specials.set(string.key, lexemes)
self._cache.set(string.key, lexemes)
for chunk, substrings in sorted(rules.items()):
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings):
form = props['F']
lemma = props.get("L", None)
slice_unicode(&string, form, 0, len(form))
tokens[i].lex = <Lexeme*>self.lexicon.get(self.lexicon.mem, &string)
if lemma:
tokens[i].lemma = self.lexicon.strings[lemma]
set_morph_from_dict(&tokens[i].morph, props)
# Null-terminated array
tokens[i+1].lex = NULL
slice_unicode(&string, chunk, 0, len(chunk))
self._specials.set(string.key, tokens)
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
morph.number = props.get('number', 0)
morph.tenspect = props.get('tenspect', 0)
morph.mood = props.get('mood', 0)
morph.gender = props.get('gender', 0)
morph.person = props.get('person', 0)
morph.case = props.get('case', 0)
morph.misc = props.get('misc', 0)
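Besides "F" and "L", any of the field names read by set_morph_from_dict can appear in a substring's JSON object to pre-set morphology on a special-cased token; _merge_morph then leaves those fields untouched at tagging time. A hypothetical entry (the chunk and the person value are illustrative, not taken from the data files):

rules = {"y'all": [{"F": "y'", "L": "you", "person": 2}, {"F": "all"}]}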
cdef class Lexicon:

View File

@@ -21,7 +21,6 @@ cdef struct Morphology:
uint8_t misc
cdef struct TokenC:
const Lexeme* lex
Morphology morph

View File

@@ -13,7 +13,8 @@ def utf8open(loc, mode='r'):
def read_lang_data(name):
data_dir = path.join(DATA_DIR, name)
tokenization = read_tokenization(name)
with open(path.join(data_dir, 'specials.json')) as file_:
tokenization = ujson.load(file_)
prefix = read_prefix(data_dir)
suffix = read_suffix(data_dir)
infix = read_infix(data_dir)
@@ -26,12 +27,17 @@ def read_prefix(data_dir):
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
return expression
def read_suffix(data_dir):
with utf8open(path.join(data_dir, 'suffix')) as file_:
with utf8open(path.join(data_dir, 'suffix')) as file_:
entries = file_.read().split('\n')
expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
# TODO: Fix this hack!
expression += r'|(?<=[a-z0-9])\.$'
expression += r'|(?<=[0-9])km$'
return expression
def read_infix(data_dir):
with utf8open(path.join(data_dir, 'infix')) as file_:
entries = file_.read().split('\n')

View File

@@ -20,15 +20,18 @@ def test_apostrophe():
def test_LL():
tokens = EN.tokenize("we'll")
assert len(tokens) == 2
assert tokens[1].string == "will"
assert tokens[1].string == "'ll"
assert tokens[1].lemma == "will"
assert tokens[0].string == "we"
def test_aint():
tokens = EN.tokenize("ain't")
assert len(tokens) == 2
assert tokens[0].string == "are"
assert tokens[1].string == "not"
assert tokens[0].string == "ai"
assert tokens[0].lemma == "be"
assert tokens[1].string == "n't"
assert tokens[1].lemma == "not"
def test_capitalized():
@@ -38,7 +41,8 @@ def test_capitalized():
assert len(tokens) == 2
tokens = EN.tokenize("Ain't")
assert len(tokens) == 2
assert tokens[0].string == "Are"
assert tokens[0].string == "Ai"
assert tokens[0].lemma == "be"
def test_punct():

View File

@@ -34,7 +34,7 @@ def test_digits():
def test_contraction():
tokens = EN.tokenize("don't giggle")
assert len(tokens) == 3
assert tokens[1].sic == EN.lexicon["not"]['sic']
assert tokens[1].sic == EN.lexicon["n't"]['sic']
tokens = EN.tokenize("i said don't!")
assert len(tokens) == 5
assert tokens[4].sic == EN.lexicon['!']['sic']
@@ -71,30 +71,39 @@ def test_cnts1():
tokens = EN.tokenize(text)
assert len(tokens) == 8
def test_cnts2():
text = u"""U.N. regulations are not a part of their concern."""
tokens = EN.tokenize(text)
assert len(tokens) == 10
def test_cnts3():
text = u"“Isn't it?”"
tokens = EN.tokenize(text)
assert len(tokens) == 6
words = [t.string for t in tokens]
assert len(words) == 6
def test_cnts4():
text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
tokens = EN.tokenize(text)
assert len(tokens) == 15
words = [t.string for t in tokens]
assert len(words) == 15
def test_cnts5():
text = """'Me too!', Mr. P. Delaware cried. """
tokens = EN.tokenize(text)
assert len(tokens) == 11
def test_cnts6():
text = u'They ran about 10km.'
tokens = EN.tokenize(text)
assert len(tokens) == 6
words = [t.string for t in tokens]
assert len(words) == 6
#def test_cnts7():
# text = 'But then the 6,000-year ice age came...'