* Redesign proceeding

Matthew Honnibal 2014-08-28 19:45:09 +02:00
parent fd4e61e58b
commit c282e6d5fb
6 changed files with 137 additions and 178 deletions

View File

@@ -45,8 +45,71 @@ cimport lang
from spacy import orth
TAG_THRESH = 0.5
UPPER_THRESH = 0.2
LOWER_THRESH = 0.5
TITLE_THRESH = 0.7
NR_FLAGS = 0
OFT_UPPER = NR_FLAGS; NR_FLAGS += 1
OFT_LOWER = NR_FLAGS; NR_FLAGS += 1
OFT_TITLE = NR_FLAGS; NR_FLAGS += 1
IS_ALPHA = NR_FLAGS; NR_FLAGS += 1
IS_DIGIT = NR_FLAGS; NR_FLAGS += 1
IS_PUNCT = NR_FLAGS; NR_FLAGS += 1
IS_SPACE = NR_FLAGS; NR_FLAGS += 1
IS_ASCII = NR_FLAGS; NR_FLAGS += 1
IS_TITLE = NR_FLAGS; NR_FLAGS += 1
IS_LOWER = NR_FLAGS; NR_FLAGS += 1
IS_UPPER = NR_FLAGS; NR_FLAGS += 1
CAN_PUNCT = NR_FLAGS; NR_FLAGS += 1
CAN_CONJ = NR_FLAGS; NR_FLAGS += 1
CAN_NUM = NR_FLAGS; NR_FLAGS += 1
CAN_DET = NR_FLAGS; NR_FLAGS += 1
CAN_ADP = NR_FLAGS; NR_FLAGS += 1
CAN_ADJ = NR_FLAGS; NR_FLAGS += 1
CAN_ADV = NR_FLAGS; NR_FLAGS += 1
CAN_VERB = NR_FLAGS; NR_FLAGS += 1
CAN_NOUN = NR_FLAGS; NR_FLAGS += 1
CAN_PDT = NR_FLAGS; NR_FLAGS += 1
CAN_POS = NR_FLAGS; NR_FLAGS += 1
CAN_PRON = NR_FLAGS; NR_FLAGS += 1
CAN_PRT = NR_FLAGS; NR_FLAGS += 1
cdef class English(Language):
def __cinit__(self, name):
flag_funcs = [0 for _ in range(NR_FLAGS)]
flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
flag_funcs[OFT_TITLE] = orth.oft_case('title', TITLE_THRESH)
flag_funcs[IS_ALPHA] = orth.is_alpha
flag_funcs[IS_DIGIT] = orth.is_digit
flag_funcs[IS_PUNCT] = orth.is_punct
flag_funcs[IS_SPACE] = orth.is_space
flag_funcs[IS_ASCII] = orth.is_ascii
flag_funcs[IS_TITLE] = orth.is_title
flag_funcs[IS_LOWER] = orth.is_lower
flag_funcs[IS_UPPER] = orth.is_upper
flag_funcs[CAN_PUNCT] = orth.can_tag('PUNCT', TAG_THRESH)
flag_funcs[CAN_CONJ] = orth.can_tag('CONJ', TAG_THRESH)
flag_funcs[CAN_NUM] = orth.can_tag('NUM', TAG_THRESH)
flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
flag_funcs[CAN_ADV] = orth.can_tag('ADV', TAG_THRESH)
flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
flag_funcs[CAN_PRON] = orth.can_tag('PRON', TAG_THRESH)
flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)
Language.__init__(self, name, flag_features=flag_funcs)
cpdef int _split_one(self, unicode word):
cdef size_t length = len(word)
cdef int i = 0
@@ -81,48 +144,3 @@ cdef bint _check_punct(unicode word, size_t i, size_t length):
EN = English('en')
# Thresholds for frequency related flags
cdef double TAG_THRESH = 0.5
cdef double LOWER_THRESH = 0.5
cdef double UPPER_THRESH = 0.3
cdef double TITLE_THRESH = 0.9
# Python-readable flag constants --- can't read an enum from Python
ALPHA = EN.lexicon.add_flag(orth.is_alpha)
DIGIT = EN.lexicon.add_flag(orth.is_digit)
PUNCT = EN.lexicon.add_flag(orth.is_punct)
SPACE = EN.lexicon.add_flag(orth.is_space)
ASCII = EN.lexicon.add_flag(orth.is_ascii)
TITLE = EN.lexicon.add_flag(orth.is_title)
LOWER = EN.lexicon.add_flag(orth.is_lower)
UPPER = EN.lexicon.add_flag(orth.is_upper)
OFT_LOWER = EN.lexicon.add_flag(orth.case_trend('lower', LOWER_THRESH))
OFT_UPPER = EN.lexicon.add_flag(orth.case_trend('upper', UPPER_THRESH))
OFT_TITLE = EN.lexicon.add_flag(orth.case_trend('title', TITLE_THRESH))
CAN_PUNCT = EN.lexicon.add_flag(orth.can_tag("PUNCT", TAG_THRESH))
CAN_CONJ = EN.lexicon.add_flag(orth.can_tag("CONJ", TAG_THRESH))
CAN_NUM = EN.lexicon.add_flag(orth.can_tag("NUM", TAG_THRESH))
CAN_N = EN.lexicon.add_flag(orth.can_tag("N", TAG_THRESH))
CAN_DET = EN.lexicon.add_flag(orth.can_tag("DET", TAG_THRESH))
CAN_ADP = EN.lexicon.add_flag(orth.can_tag("ADP", TAG_THRESH))
CAN_ADJ = EN.lexicon.add_flag(orth.can_tag("ADJ", TAG_THRESH))
CAN_ADV = EN.lexicon.add_flag(orth.can_tag("ADV", TAG_THRESH))
CAN_VERB = EN.lexicon.add_flag(orth.can_tag("VERB", TAG_THRESH))
CAN_NOUN = EN.lexicon.add_flag(orth.can_tag("NOUN", TAG_THRESH))
CAN_PDT = EN.lexicon.add_flag(orth.can_tag("PDT", TAG_THRESH))
CAN_POS = EN.lexicon.add_flag(orth.can_tag("POS", TAG_THRESH))
CAN_PRON = EN.lexicon.add_flag(orth.can_tag("PRON", TAG_THRESH))
CAN_PRT = EN.lexicon.add_flag(orth.can_tag("PRT", TAG_THRESH))
# These are the names of the string transforms
SIC = EN.lexicon.add_transform(orth.sic_string)
CANON_CASED = EN.lexicon.add_transform(orth.canon_case)
SHAPE = EN.lexicon.add_transform(orth.word_shape)
NON_SPARSE = EN.lexicon.add_transform(orth.non_sparse)

View File

@@ -4,14 +4,10 @@ from spacy.word cimport Lexeme
cdef class Lexicon:
cdef public dict probs
cdef public dict clusters
cdef public dict case_stats
cdef public dict tag_stats
cdef public list flag_checkers
cdef public list string_transformers
cdef list string_features
cdef list flag_features
cdef dict lexicon
cdef dict _dict
cpdef Lexeme lookup(self, unicode string)
@@ -26,4 +22,3 @@ cdef class Language:
cdef list _tokenize(self, unicode string)
cpdef list _split(self, unicode string)
cpdef int _split_one(self, unicode word)

View File

@@ -10,17 +10,38 @@ from __future__ import unicode_literals
from libc.stdlib cimport calloc, free
from . import util
import json
from os import path
from .util import read_lang_data
cdef class Language:
def __cinit__(self, name):
"""Base class for language-specific tokenizers.
Most subclasses will override the _split or _split_one methods, which take
a string of non-whitespace characters and output a list of strings. _split
is called by _tokenize, which sits behind a cache and turns the list of
strings into Lexeme objects via the Lexicon. Most languages will not need
to override _tokenize or tokenize.
The language is supplied a list of boolean functions, used to compute flag
features; these are passed on to the language's Lexicon object.
The language's name is used to look up default data files, found in data/<name>.
"""
def __cinit__(self, name, string_features=None, flag_features=None):
if flag_features is None:
flag_features = []
if string_features is None:
string_features = []
self.name = name
self.cache = {}
self.lexicon = Lexicon()
self.load_special_tokenization(util.read_tokenization(name))
lang_data = read_lang_data(name)
rules, words, probs, clusters, case_stats, tag_stats = lang_data
self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
string_features, flag_features)
self.load_special_tokenization(rules)
cpdef list tokenize(self, unicode string):
"""Tokenize a string.
@@ -37,6 +58,8 @@ cdef class Language:
Returns:
tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
"""
if not string:
return []
cdef list tokens = []
cdef size_t length = len(string)
cdef size_t start = 0
@@ -107,85 +130,32 @@ cdef class Language:
cdef class Lexicon:
def __cinit__(self):
self.flag_checkers = []
self.string_transformers = []
self.probs = {}
self.clusters = {}
self.case_stats = {}
self.tag_stats = {}
self.lexicon = {}
def __cinit__(self, words, probs, clusters, case_stats, tag_stats,
string_features, flag_features):
self.flag_features = flag_features
self.string_features = string_features
self._dict = {}
cdef Lexeme word
for string in words:
word = Lexeme(string, probs.get(string, 0.0), clusters.get(string, 0),
case_stats.get(string, {}), tag_stats.get(string, {}),
self.string_features, self.flag_features)
self._dict[string] = word
cpdef Lexeme lookup(self, unicode string):
"""Retrieve (or create, if not found) a Lexeme for a string, and return it.
Args:
string (unicode): The string to be looked up. Must be unicode, not bytes.
Returns:
lexeme (Lexeme): A reference to a lexical type.
"""
assert len(string) != 0
if string in self.lexicon:
return self.lexicon[string]
if string in self._dict:
return self._dict[string]
prob = _pop_default(self.probs, string, 0.0)
cluster = _pop_default(self.clusters, string, 0)
case_stats = _pop_default(self.case_stats, string, {})
tag_stats = _pop_default(self.tag_stats, string, {})
cdef Lexeme word = Lexeme(string, prob, cluster, case_stats, tag_stats,
self.flag_checkers, self.string_transformers)
self.lexicon[string] = word
cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self.string_features,
self.flag_features)
self._dict[string] = word
return word
def add_flag(self, flag_checker):
cdef unicode string
cdef Lexeme word
flag_id = len(self.flag_checkers)
for string, word in self.lexicon.items():
if flag_checker(string, word.prob, {}, {}):
word.set_flag(flag_id)
self.flag_checkers.append(flag_checker)
return flag_id
def add_transform(self, string_transform):
self.string_transformers.append(string_transform)
for string, word in self.lexicon.items():
word.add_view(string_transform(string, word.prob, {}, {}))
return len(self.string_transformers) - 1
def load_probs(self, location):
"""Load unigram probabilities.
"""
# Dict mapping words to floats
self.probs = json.load(location)
cdef Lexeme word
cdef unicode string
for string, word in self.lexicon.items():
prob = _pop_default(self.probs, string, 0.0)
word.prob = prob
def load_clusters(self, location):
# TODO: Find out endianness
# Dict mapping words to ??-endian ints
self.clusters = json.load(location)
cdef Lexeme word
cdef unicode string
for string, word in self.lexicon.items():
cluster = _pop_default(self.clusters, string, 0)
word.cluster = cluster
def load_stats(self, location):
"""Load distributional stats.
"""
# Dict mapping string to dict of arbitrary stuff.
raise NotImplementedError
def _pop_default(dict d, key, default):
return d.pop(key) if key in d else default

View File

@@ -7,19 +7,14 @@ DEF MAX_FLAG = 64
cdef class Lexeme:
# NB: the readonly keyword refers to _Python_ access. The attributes are
# writeable from Cython.
cpdef readonly id_t id
cpdef readonly size_t length
cpdef readonly double prob
cpdef readonly size_t cluster
cdef list views
cdef size_t nr_views
cpdef readonly string
cpdef readonly list views
cdef readonly flag_t flags
cpdef bint check_flag(self, size_t flag_id) except *
cpdef int set_flag(self, size_t flag_id) except -1
cpdef unicode get_view_string(self, size_t i)
cpdef id_t get_view_id(self, size_t i) except 0
cpdef int add_view(self, unicode view) except -1

View File

@@ -49,42 +49,24 @@ cdef class Lexeme:
while "dapple" is totally different. On the other hand, "scalable" receives
the same cluster ID as "pineapple", which is not what we'd like.
"""
def __cinit__(self, unicode string, prob, cluster, case_stats,
tag_stats, flag_checkers, string_transformers):
def __cinit__(self, unicode string, double prob, int cluster, dict case_stats,
dict tag_stats, list string_features, list flag_features):
self.prob = prob
self.cluster = cluster
self.length = len(string)
self.id = hash(string)
self.string = string
self.nr_views = len(string_transformers)
self.views = []
cdef unicode view
for i, string_transformer in enumerate(string_transformers):
view = string_transformer(string, prob, case_stats, tag_stats)
for string_feature in string_features:
view = string_feature(string, prob, cluster, case_stats, tag_stats)
self.views.append(view)
for i, flag_checker in enumerate(flag_checkers):
if flag_checker(string, prob, case_stats, tag_stats):
for i, flag_feature in enumerate(flag_features):
if flag_feature(string, prob, case_stats, tag_stats):
self.set_flag(i)
def __dealloc__(self):
pass
property string:
def __get__(self):
return self.views[0]
cpdef unicode get_view_string(self, size_t i):
assert i < self.nr_views
return self.views[i]
cpdef id_t get_view_id(self, size_t i) except 0:
return <id_t>hash(self.views[i])
cpdef int add_view(self, unicode view) except -1:
self.nr_views += 1
self.views.append(view)
cpdef bint check_flag(self, size_t flag_id) except *:
"""Access the value of one of the pre-computed boolean distribution features.

View File

@@ -1,42 +1,41 @@
from __future__ import unicode_literals
from spacy.en import tokenize
from spacy.en import lookup
from spacy.en import EN
def test_single_word():
lex_ids = tokenize(u'hello')
assert lex_ids[0] == lookup(u'hello')
lex_ids = EN.tokenize(u'hello')
assert lex_ids[0] == EN.lexicon.lookup(u'hello')
def test_two_words():
words = tokenize('hello possums')
words = EN.tokenize('hello possums')
assert len(words) == 2
assert words[0] == lookup('hello')
assert words[0] == EN.lexicon.lookup('hello')
assert words[0] != words[1]
def test_punct():
tokens = tokenize('hello, possums.')
tokens = EN.tokenize('hello, possums.')
assert len(tokens) == 4
assert tokens[0].lex == lookup('hello').lex
assert tokens[1].lex == lookup(',').lex
assert tokens[2].lex == lookup('possums').lex
assert tokens[1].lex != lookup('hello').lex
assert tokens[0].string == EN.lexicon.lookup('hello').string
assert tokens[1].string == EN.lexicon.lookup(',').string
assert tokens[2].string == EN.lexicon.lookup('possums').string
assert tokens[1].string != EN.lexicon.lookup('hello').string
def test_digits():
lex_ids = tokenize('The year: 1984.')
lex_ids = EN.tokenize('The year: 1984.')
assert len(lex_ids) == 5
assert lex_ids[0].lex == lookup('The').lex
assert lex_ids[3].lex == lookup('1984').lex
assert lex_ids[4].lex == lookup('.').lex
assert lex_ids[0].string == EN.lexicon.lookup('The').string
assert lex_ids[3].string == EN.lexicon.lookup('1984').string
assert lex_ids[4].string == EN.lexicon.lookup('.').string
def test_contraction():
lex_ids = tokenize("don't giggle")
lex_ids = EN.tokenize("don't giggle")
assert len(lex_ids) == 3
assert lex_ids[1].lex == lookup("not").lex
lex_ids = tokenize("i said don't!")
assert lex_ids[1].string == EN.lexicon.lookup("not").string
lex_ids = EN.tokenize("i said don't!")
assert len(lex_ids) == 4
assert lex_ids[3].lex == lookup('!').lex
assert lex_ids[3].string == EN.lexicon.lookup('!').string