* Work on fixing special-cases, reading them in as JSON objects so that they can specify lemmas

Matthew Honnibal 2014-12-09 14:48:01 +11:00
parent cda9ea9a4a
commit 302e09018b
8 changed files with 136 additions and 70 deletions

View File

@@ -10,6 +10,7 @@ cpdef enum en_person_t:
     FIRST
     SECOND
     THIRD
+    NON_THIRD
 
 cpdef enum en_number_t:
@@ -17,14 +18,22 @@ cpdef enum en_number_t:
     SINGULAR
     PLURAL
     MASS
+    CARDINAL
+    ORDINAL
 
 cpdef enum en_gender_t:
     NO_GENDER
     MASCULINE
     FEMININE
+    NEUTER
+
+
+cpdef enum en_case_t:
+    NO_CASE
+    NOMINATIVE
+    GENITIVE
+    ACCUSATIVE
+    REFLEXIVE
+    DEMONYM
 
 cpdef enum en_tenspect_t:
@@ -37,23 +46,12 @@ cpdef enum en_tenspect_t:
     MODAL
 
-cpdef enum en_case_t:
-    NO_CASE
-    NOMINATIVE
-    ACCUSATIVE
-    GENITIVE
-    DEMONYM
-
 cpdef enum misc_t:
     NO_MISC
     COMPARATIVE
     SUPERLATIVE
     RELATIVE
     NAME
-    URL
-    EMAIL
-    EMOTICON
 
 # Flags
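The enum changes above extend the morphological feature inventories (NON_THIRD person, CARDINAL/ORDINAL number, NEUTER gender), move en_case_t ahead of the tense/aspect block while adding REFLEXIVE, and drop URL/EMAIL/EMOTICON from misc_t. Assuming the usual cpdef enum numbering, each list starts at 0, and 0 consistently means "feature unset" in the Morphology handling further down. A rough Python mirror of the new case enum, purely illustrative and not part of the commit:

    # Illustrative mirror of the reordered en_case_t enum; the numbering is an
    # assumption based on default cpdef enum behaviour (first member == 0).
    from enum import IntEnum

    class EnCase(IntEnum):
        NO_CASE = 0
        NOMINATIVE = 1
        GENITIVE = 2
        ACCUSATIVE = 3
        REFLEXIVE = 4
        DEMONYM = 5

    # A hypothetical special-case property dict could then set the feature
    # numerically, e.g. marking a possessive clitic as genitive:
    props = {'F': "'s", 'case': int(EnCase.GENITIVE)}
    assert props['case'] == 2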

View File

@@ -38,6 +38,8 @@ import orth
 from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
 from .tagger cimport X, PUNCT, EOL
+from .tokens cimport Morphology
 
 POS_TAGS = {
     'NULL': (NO_TAG, {}),
@@ -152,7 +154,8 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context)
-            #self.morphalyser.set_token(&t[i])
+            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
+            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
 
     def train_pos(self, Tokens tokens, golds):
         cdef int i
@@ -162,11 +165,27 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context, [golds[i]])
-            t[i].morph = self.pos_tagger.tags[t[i].pos].morph
-            #self.analyse_morph(&t[i].lemma, &t[i].morph, t[i].pos, t[i].lex)
+            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
+            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
             c += t[i].pos == golds[i]
         return c
 
+
+cdef int _merge_morph(Morphology* tok_morph, const Morphology* pos_morph) except -1:
+    if tok_morph.number == 0:
+        tok_morph.number = pos_morph.number
+    if tok_morph.tenspect == 0:
+        tok_morph.tenspect = pos_morph.tenspect
+    if tok_morph.mood == 0:
+        tok_morph.mood = pos_morph.mood
+    if tok_morph.gender == 0:
+        tok_morph.gender = pos_morph.gender
+    if tok_morph.person == 0:
+        tok_morph.person = pos_morph.person
+    if tok_morph.case == 0:
+        tok_morph.case = pos_morph.case
+    if tok_morph.misc == 0:
+        tok_morph.misc = pos_morph.misc
+
 
 EN = English('en')
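The new _merge_morph helper gives precedence to morphology that is already set on the token (for instance by a special-case rule) and only fills fields that are still zero from the predicted tag's morphology, after which the lemma is looked up from the tag's universal POS. A plain-Python model of that merge, not part of the commit, with field names taken from the Morphology struct:

    # Per-field "first writer wins" merge, mirroring _merge_morph above.
    # A zero field means "unset"; values set by a special-case rule survive tagging.
    MORPH_FIELDS = ('number', 'tenspect', 'mood', 'gender', 'person', 'case', 'misc')

    def merge_morph(tok_morph, pos_morph):
        for field in MORPH_FIELDS:
            if tok_morph.get(field, 0) == 0:
                tok_morph[field] = pos_morph.get(field, 0)

    tok = {'person': 3}                    # e.g. fixed by a special-case rule
    tag = {'person': 1, 'number': 1}       # morphology attached to the predicted tag
    merge_morph(tok, tag)
    assert tok == {'person': 3, 'number': 1, 'tenspect': 0, 'mood': 0,
                   'gender': 0, 'case': 0, 'misc': 0}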

View File

@@ -9,7 +9,7 @@ from .typedefs cimport hash_t
 from .tokens cimport Tokens, TokenC
 from .lexeme cimport Lexeme
 from .tagger cimport Tagger
-from .tagger cimport PosTag
+from .tagger cimport univ_tag_t
 from .utf8string cimport StringStore, UniStr
@@ -38,11 +38,12 @@ cdef class Language:
     cdef object _suffix_re
     cdef object _infix_re
 
-    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
 
     cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)
 
+    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
     cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
     cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
                                 vector[Lexeme*] *suffixes) except NULL

View File

@@ -28,6 +28,7 @@ from .util import read_lang_data
 from .tokens import Tokens
 from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS
+from .tokens cimport Morphology
 
 cdef class Language:
@@ -53,27 +54,27 @@ cdef class Language:
         if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
             self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
 
-    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1:
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
         if self.lemmatizer is None:
             return lex.sic
-        if pos.pos != NOUN and pos.pos != VERB and pos.pos != ADJ:
+        if pos != NOUN and pos != VERB and pos != ADJ:
             return lex.sic
-        cdef int lemma = <int><size_t>self._lemmas.get(pos.pos, lex.sic)
+        cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
         if lemma != 0:
             return lemma
         cdef bytes py_string = self.lexicon.strings[lex.sic]
         cdef set lemma_strings
         cdef bytes lemma_string
-        if pos.pos == NOUN:
+        if pos == NOUN:
             lemma_strings = self.lemmatizer.noun(py_string)
-        elif pos.pos == VERB:
+        elif pos == VERB:
             lemma_strings = self.lemmatizer.verb(py_string)
         else:
-            assert pos.pos == ADJ
+            assert pos == ADJ
             lemma_strings = self.lemmatizer.adj(py_string)
         lemma_string = sorted(lemma_strings)[0]
         lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i
-        self._lemmas.set(pos.pos, lex.sic, <void*>lemma)
+        self._lemmas.set(pos, lex.sic, <void*>lemma)
         return lemma
 
     cpdef Tokens tokens_from_list(self, list strings):
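lemmatize now takes the bare universal tag (univ_tag_t) instead of a PosTag pointer, so the special-case loader and the tagger can call it the same way. The flow is: leave everything except nouns, verbs and adjectives untouched, consult a (pos, sic) cache, otherwise take the alphabetically first candidate from the lemmatizer and cache it. A self-contained Python sketch of that logic, with a stub standing in for the real WordNet-based lemmatizer:

    # Sketch of the lemmatization flow; the real code works on interned string
    # IDs and a C hash table, so plain strings and a dict stand in here.
    NOUN, VERB, ADJ = 'NOUN', 'VERB', 'ADJ'

    class StubLemmatizer:
        """Stand-in for the WordNet-based lemmatizer (assumed interface)."""
        def noun(self, s): return {s.rstrip('s')}
        def verb(self, s): return {'be'} if s in ('am', 'are', 'is', 'ai') else {s}
        def adj(self, s):  return {s}

    _lemma_cache = {}                      # models the (pos, sic) -> lemma table

    def lemmatize(lemmatizer, pos, string):
        if pos not in (NOUN, VERB, ADJ):
            return string                  # other parts of speech keep their form
        if (pos, string) in _lemma_cache:
            return _lemma_cache[pos, string]
        candidates = {NOUN: lemmatizer.noun, VERB: lemmatizer.verb,
                      ADJ: lemmatizer.adj}[pos](string)
        lemma = sorted(candidates)[0]      # deterministic: first alphabetically
        _lemma_cache[pos, string] = lemma
        return lemma

    assert lemmatize(StubLemmatizer(), VERB, 'ai') == 'be'
    assert lemmatize(StubLemmatizer(), NOUN, 'regulations') == 'regulation'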
@@ -111,6 +112,7 @@ cdef class Language:
             return tokens
         cdef int i = 0
         cdef int start = 0
+        cdef bint cache_hit
         cdef Py_UNICODE* chars = string
         cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
         cdef UniStr span
@@ -118,10 +120,8 @@ cdef class Language:
             if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
                 if start < i:
                     slice_unicode(&span, chars, start, i)
-                    lexemes = <const Lexeme* const*>self._cache.get(span.key)
-                    if lexemes != NULL:
-                        tokens.extend(start, lexemes, 0)
-                    else:
+                    cache_hit = self._try_cache(start, span.key, tokens)
+                    if not cache_hit:
                         self._tokenize(tokens, &span, start, i)
                 in_ws = not in_ws
                 start = i
@@ -130,13 +130,32 @@ cdef class Language:
             i += 1
         if start < i:
             slice_unicode(&span, chars, start, i)
-            lexemes = <const Lexeme* const*>self._cache.get(span.key)
-            if lexemes != NULL:
-                tokens.extend(start, lexemes, 0)
-            else:
+            cache_hit = self._try_cache(start, span.key, tokens)
+            if not cache_hit:
                 self._tokenize(tokens, &span, start, i)
         return tokens
 
+    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
+        cdef int i
+        specials = <TokenC*>self._specials.get(key)
+        if specials != NULL:
+            i = 0
+            while specials[i].lex != NULL:
+                tokens.push_back(idx, specials[i].lex)
+                tokens.data[tokens.length - 1].pos = specials[i].pos
+                tokens.data[tokens.length - 1].morph = specials[i].morph
+                tokens.data[tokens.length - 1].lemma = specials[i].lemma
+                tokens.data[tokens.length - 1].sense = specials[i].sense
+                i += 1
+            return True
+        else:
+            cached = <const Lexeme* const*>self._cache.get(key)
+            if cached != NULL:
+                tokens.extend(i, cached, 0)
+                return True
+            else:
+                return False
+
     cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
         cdef vector[Lexeme*] prefixes
         cdef vector[Lexeme*] suffixes
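All cache lookups during tokenization now go through _try_cache. The specials table holds fully specified TokenC arrays (lexeme plus pos, morphology, lemma and sense), while the ordinary cache still holds bare lexeme arrays, so only special cases can carry the lemmas read from the JSON rules. A dict-based model of the lookup order, illustrative only:

    # Model of _try_cache's lookup order: specials first, then the plain cache,
    # otherwise report a miss so the caller falls back to _tokenize().
    def try_cache(key, specials, cache, out_tokens):
        if key in specials:
            for tok in specials[key]:      # pre-annotated tokens win
                out_tokens.append(dict(tok))
            return True
        if key in cache:
            for lex in cache[key]:         # bare lexemes, no annotations
                out_tokens.append({'lex': lex})
            return True
        return False

    specials = {"ain't": [{'lex': 'ai', 'lemma': 'be'},
                          {'lex': "n't", 'lemma': 'not'}]}
    out = []
    assert try_cache("ain't", specials, {}, out)
    assert out[0]['lemma'] == 'be' and out[1]['lex'] == "n't"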
@@ -190,10 +209,10 @@ cdef class Language:
                     break
         return string
 
-    cdef int _attach_tokens(self, Tokens tokens,
-                            int idx, UniStr* string,
+    cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
                             vector[const Lexeme*] *prefixes,
                             vector[const Lexeme*] *suffixes) except -1:
+        cdef bint cache_hit
         cdef int split
         cdef const Lexeme* const* lexemes
         cdef Lexeme* lexeme
@@ -201,10 +220,9 @@ cdef class Language:
         if prefixes.size():
             idx = tokens.extend(idx, prefixes.data(), prefixes.size())
         if string.n != 0:
-            lexemes = <const Lexeme* const*>self._cache.get(string.key)
-            if lexemes != NULL:
-                idx = tokens.extend(idx, lexemes, 0)
+            cache_hit = self._try_cache(idx, string.key, tokens)
+            if cache_hit:
+                idx = tokens.data[tokens.length - 1].idx + 1
             else:
                 split = self._find_infix(string.chars, string.n)
                 if split == 0 or split == -1:
@@ -247,30 +265,42 @@ cdef class Language:
         match = self._suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0
 
-    def _load_special_tokenization(self, token_rules):
-        '''Load special-case tokenization rules.
-
-        Loads special-case tokenization rules into the Language._cache cache,
-        read from data/<lang>/tokenization . The special cases are loaded before
-        any language data is tokenized, giving these priority. For instance,
-        the English tokenization rules map "ain't" to ["are", "not"].
-
-        Args:
-            token_rules (list): A list of (chunk, tokens) pairs, where chunk is
-                a string and tokens is a list of strings.
+    def _load_special_tokenization(self, object rules):
+        '''Add a special-case tokenization rule.
         '''
+        cdef int i
+        cdef unicode chunk
+        cdef list substrings
+        cdef unicode form
+        cdef unicode lemma
+        cdef dict props
         cdef Lexeme** lexemes
         cdef hash_t hashed
         cdef UniStr string
-        for uni_string, substrings in token_rules:
-            lexemes = <Lexeme**>self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
-            for i, substring in enumerate(substrings):
-                slice_unicode(&string, substring, 0, len(substring))
-                lexemes[i] = <Lexeme*>self.lexicon.get(self.lexicon.mem, &string)
-            lexemes[i + 1] = NULL
-            slice_unicode(&string, uni_string, 0, len(uni_string))
-            self._specials.set(string.key, lexemes)
-            self._cache.set(string.key, lexemes)
+        for chunk, substrings in sorted(rules.items()):
+            tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
+            for i, props in enumerate(substrings):
+                form = props['F']
+                lemma = props.get("L", None)
+                slice_unicode(&string, form, 0, len(form))
+                tokens[i].lex = <Lexeme*>self.lexicon.get(self.lexicon.mem, &string)
+                if lemma:
+                    tokens[i].lemma = self.lexicon.strings[lemma]
+                set_morph_from_dict(&tokens[i].morph, props)
+            # Null-terminated array
+            tokens[i+1].lex = NULL
+            slice_unicode(&string, chunk, 0, len(chunk))
+            self._specials.set(string.key, tokens)
+
+
+cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
+    morph.number = props.get('number', 0)
+    morph.tenspect = props.get('tenspect', 0)
+    morph.mood = props.get('mood', 0)
+    morph.gender = props.get('gender', 0)
+    morph.person = props.get('person', 0)
+    morph.case = props.get('case', 0)
+    morph.misc = props.get('misc', 0)
+
 
 cdef class Lexicon:
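_load_special_tokenization now takes a mapping from the surface chunk to a list of per-token property dicts rather than (chunk, tokens) pairs. From the code, 'F' is the token's form, the optional 'L' is its lemma, and set_morph_from_dict accepts integer-valued 'number', 'tenspect', 'mood', 'gender', 'person', 'case' and 'misc' keys, all defaulting to 0 (unset). The contents of specials.json are not part of this diff, but an entry of roughly the following shape would satisfy the loader (the morphology numbers here are hypothetical):

    # Hypothetical specials.json entry, written as the Python object that
    # ujson.load would return; the real data file is not shown in this commit.
    rules = {
        "ain't": [
            {"F": "ai", "L": "be", "person": 3, "number": 1},
            {"F": "n't", "L": "not"},
        ],
    }

    for chunk, substrings in sorted(rules.items()):
        for props in substrings:
            form = props["F"]              # required: the surface form
            lemma = props.get("L", None)   # optional: fixed lemma
            morph = {k: props.get(k, 0)    # unspecified features stay 0 / unset
                     for k in ("number", "tenspect", "mood", "gender",
                               "person", "case", "misc")}
            print(chunk, form, lemma, morph)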

View File

@@ -21,7 +21,6 @@ cdef struct Morphology:
     uint8_t misc
 
 cdef struct TokenC:
     const Lexeme* lex
     Morphology morph

View File

@@ -13,7 +13,8 @@ def utf8open(loc, mode='r'):
 
 def read_lang_data(name):
     data_dir = path.join(DATA_DIR, name)
-    tokenization = read_tokenization(name)
+    with open(path.join(data_dir, 'specials.json')) as file_:
+        tokenization = ujson.load(file_)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
     infix = read_infix(data_dir)
@@ -26,12 +27,17 @@ def read_prefix(data_dir):
         expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
     return expression
 
 def read_suffix(data_dir):
     with utf8open(path.join(data_dir, 'suffix')) as file_:
         entries = file_.read().split('\n')
         expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
+        # TODO: Fix this hack!
+        expression += r'|(?<=[a-z0-9])\.$'
+        expression += r'|(?<=[0-9])km$'
     return expression
 
 def read_infix(data_dir):
     with utf8open(path.join(data_dir, 'infix')) as file_:
         entries = file_.read().split('\n')
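The two appended patterns are an acknowledged hack (hence the TODO): they peel a trailing period off any token ending in a lower-case letter or digit, and split a trailing "km" off a number, which is what lets test_cnts6 below count "They ran about 10km." as six tokens. A minimal standalone demonstration of just these two patterns, applied repeatedly from the right the way the tokenizer's suffix loop does:

    # Demonstration of only the two hacked-in suffix patterns; the real suffix
    # expression also contains everything read from the 'suffix' data file.
    import re

    suffix_re = re.compile(r'(?<=[a-z0-9])\.$|(?<=[0-9])km$')

    def split_suffixes(string):
        suffixes = []
        while True:
            match = suffix_re.search(string)
            if match is None:
                break
            suffixes.insert(0, string[match.start():])   # peel off the suffix
            string = string[:match.start()]
        return [string] + suffixes

    assert split_suffixes('10km.') == ['10', 'km', '.']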

View File

@@ -20,15 +20,18 @@ def test_apostrophe():
 
 def test_LL():
     tokens = EN.tokenize("we'll")
     assert len(tokens) == 2
-    assert tokens[1].string == "will"
+    assert tokens[1].string == "'ll"
+    assert tokens[1].lemma == "will"
     assert tokens[0].string == "we"
 
 def test_aint():
     tokens = EN.tokenize("ain't")
     assert len(tokens) == 2
-    assert tokens[0].string == "are"
-    assert tokens[1].string == "not"
+    assert tokens[0].string == "ai"
+    assert tokens[0].lemma == "be"
+    assert tokens[1].string == "n't"
+    assert tokens[1].lemma == "not"
 
 def test_capitalized():
@@ -38,7 +41,8 @@ def test_capitalized():
     assert len(tokens) == 2
     tokens = EN.tokenize("Ain't")
     assert len(tokens) == 2
-    assert tokens[0].string == "Are"
+    assert tokens[0].string == "Ai"
+    assert tokens[0].lemma == "be"
 
 def test_punct():

View File

@@ -34,7 +34,7 @@ def test_digits():
 def test_contraction():
     tokens = EN.tokenize("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].sic == EN.lexicon["not"]['sic']
+    assert tokens[1].sic == EN.lexicon["n't"]['sic']
     tokens = EN.tokenize("i said don't!")
     assert len(tokens) == 5
     assert tokens[4].sic == EN.lexicon['!']['sic']
@@ -71,30 +71,39 @@ def test_cnts1():
     tokens = EN.tokenize(text)
     assert len(tokens) == 8
 
 def test_cnts2():
     text = u"""U.N. regulations are not a part of their concern."""
     tokens = EN.tokenize(text)
     assert len(tokens) == 10
 
 def test_cnts3():
     text = u"“Isn't it?”"
     tokens = EN.tokenize(text)
-    assert len(tokens) == 6
+    words = [t.string for t in tokens]
+    assert len(words) == 6
 
 def test_cnts4():
     text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
     tokens = EN.tokenize(text)
-    assert len(tokens) == 15
+    words = [t.string for t in tokens]
+    assert len(words) == 15
 
 def test_cnts5():
     text = """'Me too!', Mr. P. Delaware cried. """
     tokens = EN.tokenize(text)
     assert len(tokens) == 11
 
 def test_cnts6():
     text = u'They ran about 10km.'
     tokens = EN.tokenize(text)
-    assert len(tokens) == 6
+    words = [t.string for t in tokens]
+    assert len(words) == 6
 
 #def test_cnts7():
 #    text = 'But then the 6,000-year ice age came...'