Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-13 10:46:29 +03:00

Commit 302e09018b (parent: cda9ea9a4a)

    Work on fixing special-cases, reading them in as JSON objects so that they can specify lemmas
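
The loader added in spacy/lang.pyx below reads each sub-token's surface form from props['F'], an optional lemma from props.get("L"), and optional morphology fields via set_morph_from_dict. Based on that code and the updated tests, a data/en/specials.json entry presumably looks something like the following sketch (illustrative only; not taken from the repository's data files):

# Hypothetical shape of the special-case rules, mirrored here as a Python dict.
# Keys are the chunks to special-case; values list the sub-tokens, each with a
# surface form "F", an optional lemma "L", and optional morphology fields
# ("number", "tenspect", "mood", "gender", "person", "case", "misc").
SPECIALS = {
    "ain't": [
        {"F": "ai", "L": "be"},
        {"F": "n't", "L": "not"},
    ],
    "we'll": [
        {"F": "we"},
        {"F": "'ll", "L": "will"},
    ],
}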
spacy/en.pxd (24 lines changed)

@@ -10,6 +10,7 @@ cpdef enum en_person_t:
     FIRST
     SECOND
     THIRD
     NON_THIRD


 cpdef enum en_number_t:

@@ -17,14 +18,22 @@ cpdef enum en_number_t:
     SINGULAR
     PLURAL
     MASS
     CARDINAL
     ORDINAL


 cpdef enum en_gender_t:
     NO_GENDER
     MASCULINE
     FEMININE
     NEUTER


+cpdef enum en_case_t:
+    NO_CASE
+    NOMINATIVE
+    GENITIVE
+    ACCUSATIVE
+    REFLEXIVE
+    DEMONYM


 cpdef enum en_tenspect_t:

@@ -37,23 +46,12 @@ cpdef enum en_tenspect_t:
     MODAL


-cpdef enum en_case_t:
-    NO_CASE
-    NOMINATIVE
-    ACCUSATIVE
-    GENITIVE
-    DEMONYM


 cpdef enum misc_t:
     NO_MISC
     COMPARATIVE
     SUPERLATIVE
     RELATIVE
     NAME
     URL
     EMAIL
     EMOTICON


 # Flags
spacy/en.pyx (25 lines changed)

@@ -38,6 +38,8 @@ import orth
 from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
 from .tagger cimport X, PUNCT, EOL

+from .tokens cimport Morphology


 POS_TAGS = {
     'NULL': (NO_TAG, {}),

@@ -152,7 +154,8 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context)
             #self.morphalyser.set_token(&t[i])
+            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
+            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)

     def train_pos(self, Tokens tokens, golds):
         cdef int i

@@ -162,11 +165,27 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context, [golds[i]])
             t[i].morph = self.pos_tagger.tags[t[i].pos].morph
             #self.analyse_morph(&t[i].lemma, &t[i].morph, t[i].pos, t[i].lex)
+            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
+            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
             c += t[i].pos == golds[i]
         return c


+cdef int _merge_morph(Morphology* tok_morph, const Morphology* pos_morph) except -1:
+    if tok_morph.number == 0:
+        tok_morph.number = pos_morph.number
+    if tok_morph.tenspect == 0:
+        tok_morph.tenspect = pos_morph.tenspect
+    if tok_morph.mood == 0:
+        tok_morph.mood = pos_morph.mood
+    if tok_morph.gender == 0:
+        tok_morph.gender = pos_morph.gender
+    if tok_morph.person == 0:
+        tok_morph.person = pos_morph.person
+    if tok_morph.case == 0:
+        tok_morph.case = pos_morph.case
+    if tok_morph.misc == 0:
+        tok_morph.misc = pos_morph.misc


 EN = English('en')
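
As a reading aid, here is a plain-Python sketch of the merge rule _merge_morph applies above (the Morphology dataclass and merge_morph name are stand-ins for the C struct and the Cython function; not part of the commit): fields the token already has set, for example from a special-case rule, are kept, and only zero-valued ("unset") fields are filled from the predicted tag's morphology.

from dataclasses import dataclass

@dataclass
class Morphology:
    # Mirrors the fields _merge_morph touches on the C struct.
    number: int = 0
    tenspect: int = 0
    mood: int = 0
    gender: int = 0
    person: int = 0
    case: int = 0
    misc: int = 0

def merge_morph(tok_morph: Morphology, pos_morph: Morphology) -> None:
    # A zero value means "unset": only those fields are taken from the tag.
    for field in ("number", "tenspect", "mood", "gender", "person", "case", "misc"):
        if getattr(tok_morph, field) == 0:
            setattr(tok_morph, field, getattr(pos_morph, field))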
spacy/lang.pxd

@@ -9,7 +9,7 @@ from .typedefs cimport hash_t
 from .tokens cimport Tokens, TokenC
 from .lexeme cimport Lexeme
 from .tagger cimport Tagger
-from .tagger cimport PosTag
+from .tagger cimport univ_tag_t
 from .utf8string cimport StringStore, UniStr

@@ -38,11 +38,12 @@ cdef class Language:
     cdef object _suffix_re
     cdef object _infix_re

-    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1

     cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)

+    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
     cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
     cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
                                 vector[Lexeme*] *suffixes) except NULL
spacy/lang.pyx (112 lines changed)

@@ -28,6 +28,7 @@ from .util import read_lang_data
 from .tokens import Tokens

 from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS
+from .tokens cimport Morphology


 cdef class Language:

@@ -53,27 +54,27 @@
         if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
             self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))

-    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1:
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
         if self.lemmatizer is None:
             return lex.sic
-        if pos.pos != NOUN and pos.pos != VERB and pos.pos != ADJ:
+        if pos != NOUN and pos != VERB and pos != ADJ:
             return lex.sic
-        cdef int lemma = <int><size_t>self._lemmas.get(pos.pos, lex.sic)
+        cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
         if lemma != 0:
             return lemma
         cdef bytes py_string = self.lexicon.strings[lex.sic]
         cdef set lemma_strings
         cdef bytes lemma_string
-        if pos.pos == NOUN:
+        if pos == NOUN:
             lemma_strings = self.lemmatizer.noun(py_string)
-        elif pos.pos == VERB:
+        elif pos == VERB:
             lemma_strings = self.lemmatizer.verb(py_string)
         else:
-            assert pos.pos == ADJ
+            assert pos == ADJ
             lemma_strings = self.lemmatizer.adj(py_string)
         lemma_string = sorted(lemma_strings)[0]
         lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i
-        self._lemmas.set(pos.pos, lex.sic, <void*>lemma)
+        self._lemmas.set(pos, lex.sic, <void*>lemma)
         return lemma

     cpdef Tokens tokens_from_list(self, list strings):

@@ -111,6 +112,7 @@
             return tokens
         cdef int i = 0
         cdef int start = 0
+        cdef bint cache_hit
         cdef Py_UNICODE* chars = string
         cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
         cdef UniStr span

@@ -118,10 +120,8 @@
             if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
                 if start < i:
                     slice_unicode(&span, chars, start, i)
-                    lexemes = <const Lexeme* const*>self._cache.get(span.key)
-                    if lexemes != NULL:
-                        tokens.extend(start, lexemes, 0)
-                    else:
+                    cache_hit = self._try_cache(start, span.key, tokens)
+                    if not cache_hit:
                         self._tokenize(tokens, &span, start, i)
                 in_ws = not in_ws
                 start = i

@@ -130,13 +130,32 @@
             i += 1
         if start < i:
             slice_unicode(&span, chars, start, i)
-            lexemes = <const Lexeme* const*>self._cache.get(span.key)
-            if lexemes != NULL:
-                tokens.extend(start, lexemes, 0)
-            else:
+            cache_hit = self._try_cache(start, span.key, tokens)
+            if not cache_hit:
                 self._tokenize(tokens, &span, start, i)
         return tokens

+    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
+        cdef int i
+        specials = <TokenC*>self._specials.get(key)
+        if specials != NULL:
+            i = 0
+            while specials[i].lex != NULL:
+                tokens.push_back(idx, specials[i].lex)
+                tokens.data[tokens.length - 1].pos = specials[i].pos
+                tokens.data[tokens.length - 1].morph = specials[i].morph
+                tokens.data[tokens.length - 1].lemma = specials[i].lemma
+                tokens.data[tokens.length - 1].sense = specials[i].sense
+                i += 1
+            return True
+        else:
+            cached = <const Lexeme* const*>self._cache.get(key)
+            if cached != NULL:
+                tokens.extend(i, cached, 0)
+                return True
+            else:
+                return False

     cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
         cdef vector[Lexeme*] prefixes
         cdef vector[Lexeme*] suffixes

@@ -190,10 +209,10 @@
                 break
         return string

-    cdef int _attach_tokens(self, Tokens tokens,
-                            int idx, UniStr* string,
+    cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
                             vector[const Lexeme*] *prefixes,
                             vector[const Lexeme*] *suffixes) except -1:
+        cdef bint cache_hit
         cdef int split
         cdef const Lexeme* const* lexemes
         cdef Lexeme* lexeme

@@ -201,10 +220,9 @@
         if prefixes.size():
            idx = tokens.extend(idx, prefixes.data(), prefixes.size())
         if string.n != 0:
-            lexemes = <const Lexeme* const*>self._cache.get(string.key)
-            if lexemes != NULL:
-                idx = tokens.extend(idx, lexemes, 0)
+            cache_hit = self._try_cache(idx, string.key, tokens)
+            if cache_hit:
+                idx = tokens.data[tokens.length - 1].idx + 1
             else:
                 split = self._find_infix(string.chars, string.n)
                 if split == 0 or split == -1:

@@ -247,30 +265,42 @@
         match = self._suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0

-    def _load_special_tokenization(self, token_rules):
-        '''Load special-case tokenization rules.
-
-        Loads special-case tokenization rules into the Language._cache cache,
-        read from data/<lang>/tokenization . The special cases are loaded before
-        any language data is tokenized, giving these priority. For instance,
-        the English tokenization rules map "ain't" to ["are", "not"].
-
-        Args:
-            token_rules (list): A list of (chunk, tokens) pairs, where chunk is
-                a string and tokens is a list of strings.
+    def _load_special_tokenization(self, object rules):
+        '''Add a special-case tokenization rule.
         '''
         cdef int i
         cdef unicode chunk
         cdef list substrings
+        cdef unicode form
+        cdef unicode lemma
+        cdef dict props
-        cdef Lexeme** lexemes
         cdef hash_t hashed
         cdef UniStr string
-        for uni_string, substrings in token_rules:
-            lexemes = <Lexeme**>self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
-            for i, substring in enumerate(substrings):
-                slice_unicode(&string, substring, 0, len(substring))
-                lexemes[i] = <Lexeme*>self.lexicon.get(self.lexicon.mem, &string)
-            lexemes[i + 1] = NULL
-            slice_unicode(&string, uni_string, 0, len(uni_string))
-            self._specials.set(string.key, lexemes)
-            self._cache.set(string.key, lexemes)
+        for chunk, substrings in sorted(rules.items()):
+            tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
+            for i, props in enumerate(substrings):
+                form = props['F']
+                lemma = props.get("L", None)
+                slice_unicode(&string, form, 0, len(form))
+                tokens[i].lex = <Lexeme*>self.lexicon.get(self.lexicon.mem, &string)
+                if lemma:
+                    tokens[i].lemma = self.lexicon.strings[lemma]
+                set_morph_from_dict(&tokens[i].morph, props)
+            # Null-terminated array
+            tokens[i+1].lex = NULL
+            slice_unicode(&string, chunk, 0, len(chunk))
+            self._specials.set(string.key, tokens)


+cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
+    morph.number = props.get('number', 0)
+    morph.tenspect = props.get('tenspect', 0)
+    morph.mood = props.get('mood', 0)
+    morph.gender = props.get('gender', 0)
+    morph.person = props.get('person', 0)
+    morph.case = props.get('case', 0)
+    morph.misc = props.get('misc', 0)


 cdef class Lexicon:
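
The lookup order _try_cache implements above can be sketched in plain Python (hypothetical helper; dictionaries stand in for the PreshMap tables): special-case entries, which carry per-token pos, morphology, lemma, and sense, are consulted before the plain lexeme cache, so the JSON-specified lemmas win whenever a chunk has a special-case rule.

def try_cache(key, specials, cache, out_tokens):
    # Sketch only: 'specials' maps a chunk key to fully specified tokens
    # (surface form plus pos/morph/lemma/sense), while 'cache' only knows
    # the lexeme sequence for previously tokenized chunks.
    if key in specials:
        out_tokens.extend(specials[key])
        return True
    if key in cache:
        out_tokens.extend(cache[key])
        return True
    return False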
spacy/tokens.pxd

@@ -21,7 +21,6 @@ cdef struct Morphology:
     uint8_t misc


 cdef struct TokenC:
     const Lexeme* lex
     Morphology morph
spacy/util.py

@@ -13,7 +13,8 @@ def utf8open(loc, mode='r'):

 def read_lang_data(name):
     data_dir = path.join(DATA_DIR, name)
-    tokenization = read_tokenization(name)
+    with open(path.join(data_dir, 'specials.json')) as file_:
+        tokenization = ujson.load(file_)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
     infix = read_infix(data_dir)

@@ -26,12 +27,17 @@ def read_prefix(data_dir):
     expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
     return expression


 def read_suffix(data_dir):
     with utf8open(path.join(data_dir, 'suffix')) as file_:
         entries = file_.read().split('\n')
     expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
+    # TODO: Fix this hack!
+    expression += r'|(?<=[a-z0-9])\.$'
+    expression += r'|(?<=[0-9])km$'
     return expression


 def read_infix(data_dir):
     with utf8open(path.join(data_dir, 'infix')) as file_:
         entries = file_.read().split('\n')
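
The two hard-coded patterns added to read_suffix above are what let the "10km." case in the tokenizer tests below split off "km" and the trailing period. A small, self-contained illustration (the entries list here is made up; the real one comes from data/<lang>/suffix):

import re

# Hypothetical miniature of the expression read_suffix builds, including the
# two hard-coded "hack" patterns from the diff above.
entries = ["'s", ")", '"']
expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
expression += r'|(?<=[a-z0-9])\.$'
expression += r'|(?<=[0-9])km$'
suffix_re = re.compile(expression)

assert suffix_re.search("10km").group() == "km"   # '(?<=[0-9])km$' fires
assert suffix_re.search("came.").group() == "."   # '(?<=[a-z0-9])\.$' fires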
@@ -20,15 +20,18 @@ def test_apostrophe():
 def test_LL():
     tokens = EN.tokenize("we'll")
     assert len(tokens) == 2
-    assert tokens[1].string == "will"
+    assert tokens[1].string == "'ll"
+    assert tokens[1].lemma == "will"
     assert tokens[0].string == "we"


 def test_aint():
     tokens = EN.tokenize("ain't")
     assert len(tokens) == 2
-    assert tokens[0].string == "are"
-    assert tokens[1].string == "not"
+    assert tokens[0].string == "ai"
+    assert tokens[0].lemma == "be"
+    assert tokens[1].string == "n't"
+    assert tokens[1].lemma == "not"


 def test_capitalized():

@@ -38,7 +41,8 @@ def test_capitalized():
     assert len(tokens) == 2
     tokens = EN.tokenize("Ain't")
     assert len(tokens) == 2
-    assert tokens[0].string == "Are"
+    assert tokens[0].string == "Ai"
+    assert tokens[0].lemma == "be"


 def test_punct():
@@ -34,7 +34,7 @@ def test_digits():
 def test_contraction():
     tokens = EN.tokenize("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].sic == EN.lexicon["not"]['sic']
+    assert tokens[1].sic == EN.lexicon["n't"]['sic']
     tokens = EN.tokenize("i said don't!")
     assert len(tokens) == 5
     assert tokens[4].sic == EN.lexicon['!']['sic']

@@ -71,30 +71,39 @@ def test_cnts1():
     tokens = EN.tokenize(text)
     assert len(tokens) == 8


 def test_cnts2():
     text = u"""U.N. regulations are not a part of their concern."""
     tokens = EN.tokenize(text)
     assert len(tokens) == 10


 def test_cnts3():
     text = u"“Isn't it?”"
     tokens = EN.tokenize(text)
     assert len(tokens) == 6
+    words = [t.string for t in tokens]
+    assert len(words) == 6


 def test_cnts4():
     text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
     tokens = EN.tokenize(text)
     assert len(tokens) == 15
+    words = [t.string for t in tokens]
+    assert len(words) == 15


 def test_cnts5():
     text = """'Me too!', Mr. P. Delaware cried. """
     tokens = EN.tokenize(text)
     assert len(tokens) == 11


 def test_cnts6():
     text = u'They ran about 10km.'
     tokens = EN.tokenize(text)
     assert len(tokens) == 6
+    words = [t.string for t in tokens]
+    assert len(words) == 6


 #def test_cnts7():
 #    text = 'But then the 6,000-year ice age came...'