From fdaf24604a2496e0666a53232e997df05560c2e1 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 27 Aug 2014 19:38:57 +0200
Subject: [PATCH] * Basic punct tests updated and passing

---
 spacy/__init__.py            | 14 -------------
 spacy/en.pxd                 | 37 ---------------------------------
 spacy/en.pyx                 |  8 ++++----
 spacy/lang.pxd               |  4 ++++
 spacy/lang.pyx               | 18 +++++++++++-----
 spacy/word.pxd               |  2 +-
 spacy/word.pyx               | 40 +++++++++++++++++++++---------------
 tests/test_post_punct.py     | 24 ++++++++++------------
 tests/test_pre_punct.py      | 28 ++++++++++++------------
 tests/test_surround_punct.py | 26 +++++++++++------------
 10 files changed, 81 insertions(+), 120 deletions(-)

diff --git a/spacy/__init__.py b/spacy/__init__.py
index 16d71aec6..e69de29bb 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -1,14 +0,0 @@
-from .lexeme import lex_of
-from .lexeme import length_of
-
-from .tokens import Tokens
-
-# Don't know how to get the enum Python visible :(
-
-LEX = 0
-NORM = 1
-SHAPE = 2
-LAST3 = 3
-LENGTH = 4
-
-__all__ = [Tokens, lex_of, length_of, LEX, NORM, SHAPE, LAST3, LENGTH]
diff --git a/spacy/en.pxd b/spacy/en.pxd
index 2c9f4c718..1a08834ec 100644
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@@ -3,42 +3,5 @@ from spacy.word cimport Lexeme
 
 cimport cython
 
-cpdef size_t ALPHA
-cpdef size_t DIGIT
-cpdef size_t PUNCT
-cpdef size_t SPACE
-cpdef size_t LOWER
-cpdef size_t UPPER
-cpdef size_t TITLE
-cpdef size_t ASCII
-
-cpdef size_t OFT_LOWER
-cpdef size_t OFT_TITLE
-cpdef size_t OFT_UPPER
-
-cpdef size_t PUNCT
-cpdef size_t CONJ
-cpdef size_t NUM
-cpdef size_t N
-cpdef size_t DET
-cpdef size_t ADP
-cpdef size_t ADJ
-cpdef size_t ADV
-cpdef size_t VERB
-cpdef size_t NOUN
-cpdef size_t PDT
-cpdef size_t POS
-cpdef size_t PRON
-cpdef size_t PRT
-
-cpdef size_t SIC
-cpdef size_t CANON_CASED
-cpdef size_t SHAPE
-cpdef size_t NON_SPARSE
-
-
 cdef class English(Language):
     cpdef int _split_one(self, unicode word)
-
-
-cpdef English EN
diff --git a/spacy/en.pyx b/spacy/en.pyx
index c4185968f..98f96610a 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -84,10 +84,10 @@ EN = English('en')
 
 
 # Thresholds for frequency related flags
-TAG_THRESH = 0.5
-LOWER_THRESH = 0.5
-UPPER_THRESH = 0.3
-TITLE_THRESH = 0.9
+cdef double TAG_THRESH = 0.5
+cdef double LOWER_THRESH = 0.5
+cdef double UPPER_THRESH = 0.3
+cdef double TITLE_THRESH = 0.9
 
 
 # Python-readable flag constants --- can't read an enum from Python
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index 841e18818..e86fc926e 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -4,6 +4,10 @@ from spacy.word cimport Lexeme
 
 
 cdef class Lexicon:
+    cdef public dict probs
+    cdef public dict clusters
+    cdef public dict case_stats
+    cdef public dict tag_stats
     cdef public list flag_checkers
     cdef public list string_transformers
 
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 3713e9320..8e64ca828 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -20,7 +20,7 @@ cdef class Language:
         self.name = name
         self.cache = {}
         self.lexicon = Lexicon()
-        self.load_tokenization(util.read_tokenization(name))
+        #self.load_special_tokenization(util.read_tokenization(name))
 
     cpdef list tokenize(self, unicode string):
         """Tokenize a string.
@@ -57,7 +57,7 @@ cdef class Language:
         cdef list lexemes = []
         substrings = self._split(string)
         for i, substring in enumerate(substrings):
-            lexemes.append(self.lookup(substring))
+            lexemes.append(self.lexicon.lookup(substring))
         self.cache[string] = lexemes
         return lexemes
 
@@ -108,7 +108,11 @@ cdef class Language:
 cdef class Lexicon:
     def __cinit__(self):
         self.flag_checkers = []
-        self.string_transforms = []
+        self.string_transformers = []
+        self.probs = {}
+        self.clusters = {}
+        self.case_stats = {}
+        self.tag_stats = {}
         self.lexicon = {}
 
     cpdef Lexeme lookup(self, unicode string):
@@ -151,6 +155,7 @@ cdef class Lexicon:
     def load_probs(self, location):
         """Load unigram probabilities.
         """
+        # Dict mapping words to floats
        self.probs = json.load(location)
 
         cdef Lexeme word
@@ -161,18 +166,21 @@ cdef class Lexicon:
             word.prob = prob
 
     def load_clusters(self, location):
-        self.probs = json.load(location)
+        # TODO: Find out endianness
+        # Dict mapping words to ??-endian ints
+        self.clusters = json.load(location)
 
         cdef Lexeme word
         cdef unicode string
 
         for string, word in self.lexicon.items():
-            cluster = _pop_default(self.cluster, string, 0)
+            cluster = _pop_default(self.clusters, string, 0)
             word.cluster = cluster
 
     def load_stats(self, location):
         """Load distributional stats.
         """
+        # Dict mapping string to dict of arbitrary stuff.
         raise NotImplementedError
 
 
diff --git a/spacy/word.pxd b/spacy/word.pxd
index 4e9d416fa..bdddfd53e 100644
--- a/spacy/word.pxd
+++ b/spacy/word.pxd
@@ -12,7 +12,7 @@ cdef class Lexeme:
     cpdef readonly double prob
     cpdef readonly size_t cluster
 
-    cdef utf8_t* views
+    cdef list views
     cdef size_t nr_views
 
     cdef readonly flag_t flags
diff --git a/spacy/word.pyx b/spacy/word.pyx
index 99c0845a3..d411e96c8 100644
--- a/spacy/word.pyx
+++ b/spacy/word.pyx
@@ -49,35 +49,41 @@ cdef class Lexeme:
     while "dapple" is totally different. On the other hand, "scalable" receives
     the same cluster ID as "pineapple", which is not what we'd like.
     """
-    def __cinit__(self, utf8_t string, size_t length, list views, prob=0.0,
-                  flags=0):
-        self.id = &string
-        self.length = length
-        self.nr_strings = 0
-        self.add_views(views)
+    def __cinit__(self, unicode string, prob, cluster, case_stats,
+                  tag_stats, flag_checkers, string_transformers):
+        self.prob = prob
+        self.cluster = cluster
+        self.length = len(string)
+        self.id = hash(string)
+
+        self.nr_views = len(string_transformers)
+        self.views = []
+        cdef unicode view
+        for i, string_transformer in enumerate(string_transformers):
+            view = string_transformer(string, prob, case_stats, tag_stats)
+            self.views.append(view)
+
+        for i, flag_checker in enumerate(flag_checkers):
+            if flag_checker(string, prob, case_stats, tag_stats):
+                self.set_flag(i)
 
     def __dealloc__(self):
-        free(self.views)
+        pass
 
     property string:
         def __get__(self):
-            return self.strings[0].decode('utf8')
+            return self.views[0]
 
     cpdef unicode get_view_string(self, size_t i):
-        assert i < self.nr_strings
-        return self.strings[i].decode('utf8')
+        assert i < self.nr_views
+        return self.views[i]
 
     cpdef id_t get_view_id(self, size_t i) except 0:
-        assert i < self.nr_strings
-        return &self.views[i]
+        return hash(self.views[i])
 
     cpdef int add_view(self, unicode view) except -1:
         self.nr_views += 1
-        self.views = realloc(self.views, self.nr_views * sizeof(utf8_t))
-        cdef bytes utf8_string = view.encode('utf8')
-        # Intern strings, allowing pointer comparison
-        utf8_string = intern(utf8_string)
-        self.views[self.nr_views - 1] = utf8_string
+        self.views.append(view)
 
     cpdef bint check_flag(self, size_t flag_id) except *:
         """Access the value of one of the pre-computed boolean distribution features.
diff --git a/tests/test_post_punct.py b/tests/test_post_punct.py
index e5d2d0705..5ee0eb066 100644
--- a/tests/test_post_punct.py
+++ b/tests/test_post_punct.py
@@ -1,8 +1,6 @@
 from __future__ import unicode_literals
 
-from spacy.en import lookup
-from spacy.en import tokenize
-from spacy.en import unhash
+from spacy.en import EN
 
 import pytest
 
@@ -16,28 +14,28 @@ def test_close(close_puncts):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 2
-        assert unhash(tokens[1].lex) == p
-        assert unhash(tokens[0].lex) == word_str
+        assert tokens[1].string == p
+        assert tokens[0].string == word_str
 
 
 def test_two_different_close(close_puncts):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + "'"
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 3
-        assert unhash(tokens[0].lex) == word_str
-        assert unhash(tokens[1].lex) == p
-        assert unhash(tokens[2].lex) == "'"
+        assert tokens[0].string == word_str
+        assert tokens[1].string == p
+        assert tokens[2].string == "'"
 
 
 def test_three_same_close(close_puncts):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + p + p
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 4
-        assert unhash(tokens[0].lex) == word_str
-        assert unhash(tokens[1].lex) == p
+        assert tokens[0].string == word_str
+        assert tokens[1].string == p
diff --git a/tests/test_pre_punct.py b/tests/test_pre_punct.py
index 83e743c44..557655330 100644
--- a/tests/test_pre_punct.py
+++ b/tests/test_pre_punct.py
@@ -1,8 +1,6 @@
 from __future__ import unicode_literals
 
-from spacy.en import lookup
-from spacy.en import tokenize
-from spacy.en import unhash
+from spacy.en import EN
 
 import pytest
 
@@ -16,35 +14,35 @@ def test_open(open_puncts):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + word_str
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 2
-        assert unhash(tokens[0].lex) == p
-        assert unhash(tokens[1].lex) == word_str
+        assert tokens[0].string == p
+        assert tokens[1].string == word_str
 
 
 def test_two_different_open(open_puncts):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + "`" + word_str
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 3
-        assert unhash(tokens[0].lex) == p
-        assert unhash(tokens[1].lex) == "`"
-        assert unhash(tokens[2].lex) == word_str
+        assert tokens[0].string == p
+        assert tokens[1].string == "`"
+        assert tokens[2].string == word_str
 
 
 def test_three_same_open(open_puncts):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + p + p + word_str
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 4
-        assert unhash(tokens[0].lex) == p
-        assert unhash(tokens[3].lex) == word_str
+        assert tokens[0].string == p
+        assert tokens[3].string == word_str
 
 
 def test_open_appostrophe():
     string = "'The"
-    tokens = tokenize(string)
+    tokens = EN.tokenize(string)
     assert len(tokens) == 2
-    assert unhash(tokens[0].lex) == "'"
+    assert tokens[0].string == "'"
diff --git a/tests/test_surround_punct.py b/tests/test_surround_punct.py
index 686d8cfc2..b7be782f2 100644
--- a/tests/test_surround_punct.py
+++ b/tests/test_surround_punct.py
@@ -1,8 +1,6 @@
 from __future__ import unicode_literals
 
-from spacy.en import tokenize
-from spacy.en import lookup
-from spacy.en import unhash
+from spacy.en import EN
 
 import pytest
 
@@ -16,22 +14,22 @@ def test_token(paired_puncts):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = open_ + word_str + close_
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 3
-        assert unhash(tokens[0].lex) == open_
-        assert unhash(tokens[1].lex) == word_str
-        assert unhash(tokens[2].lex) == close_
+        assert tokens[0].string == open_
+        assert tokens[1].string == word_str
+        assert tokens[2].string == close_
 
 
 def test_two_different(paired_puncts):
     word_str = 'Hello'
    for open_, close_ in paired_puncts:
         string = "`" + open_ + word_str + close_ + "'"
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 5
-        assert unhash(tokens[0].lex) == "`"
-        assert unhash(tokens[1].lex) == open_
-        assert unhash(tokens[2].lex) == word_str
-        assert unhash(tokens[2].lex) == word_str
-        assert unhash(tokens[3].lex) == close_
-        assert unhash(tokens[4].lex) == "'"
+        assert tokens[0].string == "`"
+        assert tokens[1].string == open_
+        assert tokens[2].string == word_str
+        assert tokens[2].string == word_str
+        assert tokens[3].string == close_
+        assert tokens[4].string == "'"
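
A rough usage sketch of the tokenizer API the updated tests now target: the module-level
tokenize()/unhash() helpers are gone, and callers go through the shared EN instance and read
each token's .string view instead. The input text and the ")" punctuation mark below are
illustrative only, not taken from the test fixtures:

    from spacy.en import EN

    # "Hello)" should come back as two tokens, with the trailing punctuation split off
    tokens = EN.tokenize("Hello)")
    assert len(tokens) == 2
    assert tokens[0].string == "Hello"
    assert tokens[1].string == ")"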