diff --git a/spacy/en.pyx b/spacy/en.pyx
index 98f96610a..ebfbff8d2 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -45,8 +45,71 @@ cimport lang
 from spacy import orth
+
+
+TAG_THRESH = 0.5
+UPPER_THRESH = 0.2
+LOWER_THRESH = 0.5
+TITLE_THRESH = 0.7
+
+NR_FLAGS = 0
+
+OFT_UPPER = NR_FLAGS; NR_FLAGS += 1
+OFT_LOWER = NR_FLAGS; NR_FLAGS += 1
+OFT_TITLE = NR_FLAGS; NR_FLAGS += 1
+
+IS_ALPHA = NR_FLAGS; NR_FLAGS += 1
+IS_DIGIT = NR_FLAGS; NR_FLAGS += 1
+IS_PUNCT = NR_FLAGS; NR_FLAGS += 1
+IS_SPACE = NR_FLAGS; NR_FLAGS += 1
+IS_ASCII = NR_FLAGS; NR_FLAGS += 1
+IS_TITLE = NR_FLAGS; NR_FLAGS += 1
+IS_LOWER = NR_FLAGS; NR_FLAGS += 1
+IS_UPPER = NR_FLAGS; NR_FLAGS += 1
+
+CAN_PUNCT = NR_FLAGS; NR_FLAGS += 1
+CAN_CONJ = NR_FLAGS; NR_FLAGS += 1
+CAN_NUM = NR_FLAGS; NR_FLAGS += 1
+CAN_DET = NR_FLAGS; NR_FLAGS += 1
+CAN_ADP = NR_FLAGS; NR_FLAGS += 1
+CAN_ADJ = NR_FLAGS; NR_FLAGS += 1
+CAN_ADV = NR_FLAGS; NR_FLAGS += 1
+CAN_VERB = NR_FLAGS; NR_FLAGS += 1
+CAN_NOUN = NR_FLAGS; NR_FLAGS += 1
+CAN_PDT = NR_FLAGS; NR_FLAGS += 1
+CAN_POS = NR_FLAGS; NR_FLAGS += 1
+CAN_PRON = NR_FLAGS; NR_FLAGS += 1
+CAN_PRT = NR_FLAGS; NR_FLAGS += 1
 
 
 cdef class English(Language):
+    def __cinit__(self, name):
+        flag_funcs = [0 for _ in range(NR_FLAGS)]
+
+        flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
+        flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
+        flag_funcs[OFT_TITLE] = orth.oft_case('title', TITLE_THRESH)
+
+        flag_funcs[IS_ALPHA] = orth.is_alpha
+        flag_funcs[IS_DIGIT] = orth.is_digit
+        flag_funcs[IS_PUNCT] = orth.is_punct
+        flag_funcs[IS_SPACE] = orth.is_space
+        flag_funcs[IS_TITLE] = orth.is_title
+        flag_funcs[IS_LOWER] = orth.is_lower
+        flag_funcs[IS_UPPER] = orth.is_upper
+
+        flag_funcs[CAN_PUNCT] = orth.can_tag('PUNCT', TAG_THRESH)
+        flag_funcs[CAN_CONJ] = orth.can_tag('CONJ', TAG_THRESH)
+        flag_funcs[CAN_NUM] = orth.can_tag('NUM', TAG_THRESH)
+        flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
+        flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
+        flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
+        flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
+        flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
+        flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
+        flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
+        flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)
+
+        Language.__init__(self, name, flag_funcs)
+
     cpdef int _split_one(self, unicode word):
         cdef size_t length = len(word)
         cdef int i = 0
@@ -81,48 +144,3 @@ cdef bint _check_punct(unicode word, size_t i, size_t length):
 
 
 EN = English('en')
-
-
-# Thresholds for frequency related flags
-cdef double TAG_THRESH = 0.5
-cdef double LOWER_THRESH = 0.5
-cdef double UPPER_THRESH = 0.3
-cdef double TITLE_THRESH = 0.9
-
-
-# Python-readable flag constants --- can't read an enum from Python
-ALPHA = EN.lexicon.add_flag(orth.is_alpha)
-DIGIT = EN.lexicon.add_flag(orth.is_digit)
-PUNCT = EN.lexicon.add_flag(orth.is_punct)
-SPACE = EN.lexicon.add_flag(orth.is_space)
-PUNCT = EN.lexicon.add_flag(orth.is_punct)
-ASCII = EN.lexicon.add_flag(orth.is_ascii)
-TITLE = EN.lexicon.add_flag(orth.is_title)
-LOWER = EN.lexicon.add_flag(orth.is_lower)
-UPPER = EN.lexicon.add_flag(orth.is_upper)
-
-OFT_LOWER = EN.lexicon.add_flag(orth.case_trend('lower', LOWER_THRESH))
-OFT_UPPER = EN.lexicon.add_flag(orth.case_trend('upper', UPPER_THRESH))
-OFT_TITLE = EN.lexicon.add_flag(orth.case_trend('title', TITLE_THRESH))
-
-CAN_PUNCT = EN.lexicon.add_flag(orth.can_tag("PUNCT", TAG_THRESH))
-CAN_CONJ = EN.lexicon.add_flag(orth.can_tag("CONJ", TAG_THRESH))
-CAN_NUM = EN.lexicon.add_flag(orth.can_tag("NUM", TAG_THRESH))
-CAN_N = EN.lexicon.add_flag(orth.can_tag("N", TAG_THRESH))
-CAN_DET = EN.lexicon.add_flag(orth.can_tag("DET", TAG_THRESH))
-CAN_ADP = EN.lexicon.add_flag(orth.can_tag("ADP", TAG_THRESH))
-CAN_ADJ = EN.lexicon.add_flag(orth.can_tag("ADJ", TAG_THRESH))
-CAN_ADV = EN.lexicon.add_flag(orth.can_tag("ADV", TAG_THRESH))
-CAN_VERB = EN.lexicon.add_flag(orth.can_tag("VERB", TAG_THRESH))
-CAN_NOUN = EN.lexicon.add_flag(orth.can_tag("NOUN", TAG_THRESH))
-CAN_PDT = EN.lexicon.add_flag(orth.can_tag("PDT", TAG_THRESH))
-CAN_POS = EN.lexicon.add_flag(orth.can_tag("POS", TAG_THRESH))
-CAN_PRON = EN.lexicon.add_flag(orth.can_tag("PRON", TAG_THRESH))
-CAN_PRT = EN.lexicon.add_flag(orth.can_tag("PRT", TAG_THRESH))
-
-
-# These are the name of string transforms
-SIC = EN.lexicon.add_transform(orth.sic_string)
-CANON_CASED = EN.lexicon.add_transform(orth.canon_case)
-SHAPE = EN.lexicon.add_transform(orth.word_shape)
-NON_SPARSE = EN.lexicon.add_transform(orth.non_sparse)
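The `__cinit__` added above replaces the old module-level `add_flag` calls: flag IDs become plain integer constants counted off `NR_FLAGS`, and an ordered list of boolean feature functions is handed to `Language.__init__`, which passes it on to the `Lexicon`. A minimal pure-Python sketch of that pattern follows; the `ToyLexicon` class, its bit-packing, and the toy `can_tag` statistics are illustrative assumptions, not the actual Cython implementation.

```python
# Sketch of the indexed flag-function pattern used in en.pyx above.
# Names mirror the diff; the internals are assumptions for illustration.

IS_ALPHA = 0
IS_DIGIT = 1
CAN_NOUN = 2
NR_FLAGS = 3


def can_tag(tag, thresh, tag_stats=None):
    """Return a closure testing whether `string` takes `tag` often enough.

    `tag_stats` (string -> {tag: probability}) stands in for the
    distributional counts spaCy loads from its data files.
    """
    tag_stats = tag_stats or {}

    def check(string):
        return tag_stats.get(string, {}).get(tag, 0.0) >= thresh

    return check


class ToyLexicon:
    def __init__(self, flag_funcs):
        self.flag_funcs = flag_funcs
        self._flags = {}

    def lookup(self, string):
        # Evaluate every flag function once and pack the results into an int,
        # so later checks are a single bit test.
        if string not in self._flags:
            bits = 0
            for i, func in enumerate(self.flag_funcs):
                if func and func(string):
                    bits |= 1 << i
            self._flags[string] = bits
        return self._flags[string]

    def check_flag(self, string, flag_id):
        return bool(self.lookup(string) & (1 << flag_id))


flag_funcs = [0] * NR_FLAGS
flag_funcs[IS_ALPHA] = str.isalpha
flag_funcs[IS_DIGIT] = str.isdigit
flag_funcs[CAN_NOUN] = can_tag('NOUN', 0.5, {'run': {'NOUN': 0.6, 'VERB': 0.4}})

lexicon = ToyLexicon(flag_funcs)
assert lexicon.check_flag('run', CAN_NOUN)
assert not lexicon.check_flag('1984', IS_ALPHA)
```

Packing the flag values at lookup time means the threshold functions run once per distinct string, after which every feature check is a constant-time bit test.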
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index e86fc926e..43e21577b 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -4,14 +4,10 @@ from spacy.word cimport Lexeme
 
 
 cdef class Lexicon:
-    cdef public dict probs
-    cdef public dict clusters
-    cdef public dict case_stats
-    cdef public dict tag_stats
-    cdef public list flag_checkers
-    cdef public list string_transformers
+    cdef list string_features
+    cdef list flag_features
 
-    cdef dict lexicon
+    cdef dict _dict
 
     cpdef Lexeme lookup(self, unicode string)
 
@@ -26,4 +22,3 @@ cdef class Language:
     cdef list _tokenize(self, unicode string)
     cpdef list _split(self, unicode string)
     cpdef int _split_one(self, unicode word)
-
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 05dd390d4..221e25b6e 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -10,17 +10,38 @@ from __future__ import unicode_literals
 
 from libc.stdlib cimport calloc, free
 
-from . import util
 import json
 from os import path
 
+from .util import read_lang_data
+
 
 cdef class Language:
-    def __cinit__(self, name):
+    """Base class for language-specific tokenizers.
+
+    Most subclasses will override the _split or _split_one methods, which take
+    a string of non-whitespace characters and output a list of strings. This
+    function is called by _tokenize, which sits behind a cache and turns the
+    list of strings into Lexeme objects via the Lexicon. Most languages will not
+    need to override _tokenize or tokenize.
+
+    The language is supplied a list of boolean functions, used to compute flag
+    features. These are passed to the language's Lexicon object.
+
+    The language's name is used to look up default data-files, found in data/hash(self.views[i])
-
-    cpdef int add_view(self, unicode view) except -1:
-        self.nr_views += 1
-        self.views.append(view)
-
     cpdef bint check_flag(self, size_t flag_id) except *:
         """Access the value of one of the pre-computed boolean distribution features.
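The new `Language` docstring spells out the intended control flow: `tokenize` splits on whitespace, each chunk is looked up in a cache, and cache misses are broken into substrings by `_split`/`_split_one` before being turned into lexemes by the `Lexicon`. The sketch below illustrates that flow in plain Python; the `ToyLanguage`/`ToyEnglish` classes and their punctuation rules are simplified stand-ins for the Cython code, not the real implementation.

```python
# Sketch of the tokenize -> cache -> _split -> _split_one flow described in
# the Language docstring above. All class and method bodies are assumptions.

class ToyLanguage:
    def __init__(self, name):
        self.name = name
        self._cache = {}          # chunk -> list of token strings

    def tokenize(self, string):
        tokens = []
        for chunk in string.split():          # whitespace-delimited chunks
            if chunk not in self._cache:
                self._cache[chunk] = self._split(chunk)
            tokens.extend(self._cache[chunk])
        return tokens

    def _split(self, chunk):
        # Repeatedly peel off the prefix that _split_one identifies.
        out = []
        while chunk:
            length = self._split_one(chunk)
            out.append(chunk[:length])
            chunk = chunk[length:]
        return out

    def _split_one(self, word):
        # Default: take the whole chunk. Subclasses override this.
        return len(word)


class ToyEnglish(ToyLanguage):
    def _split_one(self, word):
        # Split off punctuation, loosely in the spirit of en.pyx.
        if word[0] in ',.!?;:':
            return 1
        for i, c in enumerate(word):
            if c in ',.!?;:':
                return i
        return len(word)


EN = ToyEnglish('en')
assert EN.tokenize('hello, possums.') == ['hello', ',', 'possums', '.']
```

Because the cache is keyed on whole whitespace-delimited chunks, the per-language splitting logic only runs the first time a given chunk is seen.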
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index c99d387ce..4b0dde524 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -1,42 +1,41 @@
 from __future__ import unicode_literals
 
-from spacy.en import tokenize
-from spacy.en import lookup
+from spacy.en import EN
 
 
 def test_single_word():
-    lex_ids = tokenize(u'hello')
-    assert lex_ids[0] == lookup(u'hello')
+    lex_ids = EN.tokenize(u'hello')
+    assert lex_ids[0] == EN.lexicon.lookup(u'hello')
 
 
 def test_two_words():
-    words = tokenize('hello possums')
+    words = EN.tokenize('hello possums')
     assert len(words) == 2
-    assert words[0] == lookup('hello')
+    assert words[0] == EN.lexicon.lookup('hello')
     assert words[0] != words[1]
 
 
 def test_punct():
-    tokens = tokenize('hello, possums.')
+    tokens = EN.tokenize('hello, possums.')
     assert len(tokens) == 4
-    assert tokens[0].lex == lookup('hello').lex
-    assert tokens[1].lex == lookup(',').lex
-    assert tokens[2].lex == lookup('possums').lex
-    assert tokens[1].lex != lookup('hello').lex
+    assert tokens[0].string == EN.lexicon.lookup('hello').string
+    assert tokens[1].string == EN.lexicon.lookup(',').string
+    assert tokens[2].string == EN.lexicon.lookup('possums').string
+    assert tokens[1].string != EN.lexicon.lookup('hello').string
 
 
 def test_digits():
-    lex_ids = tokenize('The year: 1984.')
+    lex_ids = EN.tokenize('The year: 1984.')
     assert len(lex_ids) == 5
-    assert lex_ids[0].lex == lookup('The').lex
-    assert lex_ids[3].lex == lookup('1984').lex
-    assert lex_ids[4].lex == lookup('.').lex
+    assert lex_ids[0].string == EN.lexicon.lookup('The').string
+    assert lex_ids[3].string == EN.lexicon.lookup('1984').string
+    assert lex_ids[4].string == EN.lexicon.lookup('.').string
 
 
 def test_contraction():
-    lex_ids = tokenize("don't giggle")
+    lex_ids = EN.tokenize("don't giggle")
     assert len(lex_ids) == 3
-    assert lex_ids[1].lex == lookup("not").lex
-    lex_ids = tokenize("i said don't!")
+    assert lex_ids[1].string == EN.lexicon.lookup("not").string
+    lex_ids = EN.tokenize("i said don't!")
     assert len(lex_ids) == 4
-    assert lex_ids[3].lex == lookup('!').lex
+    assert lex_ids[3].string == EN.lexicon.lookup('!').string
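Taken together, the updated tests show the new public surface: a module-level `EN` instance whose `tokenize` returns token objects exposing `.string`, with single-string lookups going through `EN.lexicon.lookup`. The snippet below sketches how the flag constants from `en.pyx` might be combined with that surface; `check_flag(flag_id)` is taken from the hunk above, but importing the flag constants and calling it on a looked-up lexeme is an assumption about this commit, not a documented API.

```python
# Hypothetical end-to-end usage mirroring the tests above.
# Assumes the lexemes returned by lookup() expose check_flag(flag_id)
# and that the flag constants are importable from spacy.en at this commit.

from spacy.en import EN, IS_DIGIT, CAN_NOUN

tokens = EN.tokenize(u'The year: 1984.')

year = EN.lexicon.lookup(u'1984')
assert tokens[3].string == year.string

# Flags are addressed by the module-level integer constants defined in en.pyx.
if year.check_flag(IS_DIGIT):
    print('1984 is written with digits')

if not year.check_flag(CAN_NOUN):
    print('1984 is rarely tagged NOUN in the distributional counts')
```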