* Basic punct tests updated and passing

Matthew Honnibal 2014-08-27 19:38:57 +02:00
parent 8d20617dfd
commit fdaf24604a
10 changed files with 81 additions and 120 deletions

View File

@@ -1,14 +0,0 @@
-from .lexeme import lex_of
-from .lexeme import length_of
-from .tokens import Tokens
-# Don't know how to get the enum Python visible :(
-LEX = 0
-NORM = 1
-SHAPE = 2
-LAST3 = 3
-LENGTH = 4
-__all__ = [Tokens, lex_of, length_of, LEX, NORM, SHAPE, LAST3, LENGTH]

View File

@@ -3,42 +3,5 @@ from spacy.word cimport Lexeme
 cimport cython
-cpdef size_t ALPHA
-cpdef size_t DIGIT
-cpdef size_t PUNCT
-cpdef size_t SPACE
-cpdef size_t LOWER
-cpdef size_t UPPER
-cpdef size_t TITLE
-cpdef size_t ASCII
-cpdef size_t OFT_LOWER
-cpdef size_t OFT_TITLE
-cpdef size_t OFT_UPPER
-cpdef size_t PUNCT
-cpdef size_t CONJ
-cpdef size_t NUM
-cpdef size_t N
-cpdef size_t DET
-cpdef size_t ADP
-cpdef size_t ADJ
-cpdef size_t ADV
-cpdef size_t VERB
-cpdef size_t NOUN
-cpdef size_t PDT
-cpdef size_t POS
-cpdef size_t PRON
-cpdef size_t PRT
-cpdef size_t SIC
-cpdef size_t CANON_CASED
-cpdef size_t SHAPE
-cpdef size_t NON_SPARSE
 cdef class English(Language):
     cpdef int _split_one(self, unicode word)
 cpdef English EN

View File

@@ -84,10 +84,10 @@ EN = English('en')
 # Thresholds for frequency related flags
-TAG_THRESH = 0.5
-LOWER_THRESH = 0.5
-UPPER_THRESH = 0.3
-TITLE_THRESH = 0.9
+cdef double TAG_THRESH = 0.5
+cdef double LOWER_THRESH = 0.5
+cdef double UPPER_THRESH = 0.3
+cdef double TITLE_THRESH = 0.9
 # Python-readable flag constants --- can't read an enum from Python
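
As a rough illustration (not part of this commit) of how a threshold like UPPER_THRESH could feed one of the new flag checkers, which are called as checker(string, prob, case_stats, tag_stats) when a Lexeme is built; the checker name and the 'upper' key in case_stats are hypothetical:

    def oft_upper(string, prob, case_stats, tag_stats, thresh=0.3):
        """True if the word is written in all-caps more often than thresh (cf. UPPER_THRESH)."""
        total = float(sum(case_stats.values())) or 1.0
        return case_stats.get('upper', 0.0) / total >= thresh

    # e.g. oft_upper("USA", -9.1, {'upper': 95, 'lower': 3, 'title': 2}, {})  -> True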

View File

@@ -4,6 +4,10 @@ from spacy.word cimport Lexeme
 cdef class Lexicon:
+    cdef public dict probs
+    cdef public dict clusters
+    cdef public dict case_stats
+    cdef public dict tag_stats
     cdef public list flag_checkers
     cdef public list string_transformers

View File

@@ -20,7 +20,7 @@ cdef class Language:
         self.name = name
         self.cache = {}
         self.lexicon = Lexicon()
-        self.load_tokenization(util.read_tokenization(name))
+        #self.load_special_tokenization(util.read_tokenization(name))
     cpdef list tokenize(self, unicode string):
         """Tokenize a string.
@@ -57,7 +57,7 @@ cdef class Language:
         cdef list lexemes = []
         substrings = self._split(string)
         for i, substring in enumerate(substrings):
-            lexemes.append(self.lookup(substring))
+            lexemes.append(self.lexicon.lookup(substring))
         self.cache[string] = lexemes
         return lexemes
@@ -108,7 +108,11 @@ cdef class Language:
 cdef class Lexicon:
     def __cinit__(self):
         self.flag_checkers = []
-        self.string_transforms = []
+        self.string_transformers = []
+        self.probs = {}
+        self.clusters = {}
+        self.case_stats = {}
+        self.tag_stats = {}
         self.lexicon = {}
     cpdef Lexeme lookup(self, unicode string):
@@ -151,6 +155,7 @@ cdef class Lexicon:
     def load_probs(self, location):
         """Load unigram probabilities.
         """
+        # Dict mapping words to floats
        self.probs = json.load(location)
         cdef Lexeme word
@@ -161,18 +166,21 @@ cdef class Lexicon:
             word.prob = prob
     def load_clusters(self, location):
-        self.probs = json.load(location)
+        # TODO: Find out endianness
+        # Dict mapping words to ??-endian ints
+        self.clusters = json.load(location)
         cdef Lexeme word
         cdef unicode string
         for string, word in self.lexicon.items():
-            cluster = _pop_default(self.cluster, string, 0)
+            cluster = _pop_default(self.clusters, string, 0)
             word.cluster = cluster
     def load_stats(self, location):
         """Load distributional stats.
         """
+        # Dict mapping string to dict of arbitrary stuff.
         raise NotImplementedError
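
For orientation, a rough pure-Python sketch (not the Cython code above; the file name, values, and SimpleNamespace stand-ins are invented) of the data shapes load_probs expects and how it back-fills lexemes already in the lexicon; load_clusters follows the same pattern with integer cluster ids:

    import json
    from types import SimpleNamespace

    # probs.json:    {"the": -2.87, "pineapple": -11.32}    word -> log probability
    # clusters.json: {"the": 2, "pineapple": 813}           word -> Brown cluster id
    #                (the integer bit order is the endianness TODO noted above)

    def load_probs(lexicon, path, default=0.0):
        with open(path) as f:
            probs = json.load(f)
        for string, word in lexicon.items():
            word.prob = probs.get(string, default)   # like _pop_default(probs, string, 0)

    lexicon = {"the": SimpleNamespace(prob=0.0), "sparkly": SimpleNamespace(prob=0.0)}
    # load_probs(lexicon, "probs.json")  -> "sparkly" keeps the default 0.0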

View File

@@ -12,7 +12,7 @@ cdef class Lexeme:
     cpdef readonly double prob
     cpdef readonly size_t cluster
-    cdef utf8_t* views
+    cdef list views
     cdef size_t nr_views
     cdef readonly flag_t flags

View File

@@ -49,35 +49,41 @@ cdef class Lexeme:
     while "dapple" is totally different. On the other hand, "scalable" receives
     the same cluster ID as "pineapple", which is not what we'd like.
     """
-    def __cinit__(self, utf8_t string, size_t length, list views, prob=0.0,
-                  flags=0):
-        self.id = <id_t>&string
-        self.length = length
-        self.nr_strings = 0
-        self.add_views(views)
+    def __cinit__(self, unicode string, prob, cluster, case_stats,
+                  tag_stats, flag_checkers, string_transformers):
+        self.prob = prob
+        self.cluster = cluster
+        self.length = len(string)
+        self.id = hash(string)
+        self.nr_views = len(string_transformers)
+        self.views = []
+        cdef unicode view
+        for i, string_transformer in enumerate(string_transformers):
+            view = string_transformer(string, prob, case_stats, tag_stats)
+            self.views.append(view)
+        for i, flag_checker in enumerate(flag_checkers):
+            if flag_checker(string, prob, case_stats, tag_stats):
+                self.set_flag(i)
     def __dealloc__(self):
-        free(self.views)
+        pass
     property string:
         def __get__(self):
-            return self.strings[0].decode('utf8')
+            return self.views[0]
     cpdef unicode get_view_string(self, size_t i):
-        assert i < self.nr_strings
-        return self.strings[i].decode('utf8')
+        assert i < self.nr_views
+        return self.views[i]
     cpdef id_t get_view_id(self, size_t i) except 0:
-        assert i < self.nr_strings
-        return <id_t>&self.views[i]
+        return <id_t>hash(self.views[i])
     cpdef int add_view(self, unicode view) except -1:
         self.nr_views += 1
-        self.views = <char**>realloc(self.views, self.nr_views * sizeof(utf8_t))
-        cdef bytes utf8_string = view.encode('utf8')
-        # Intern strings, allowing pointer comparison
-        utf8_string = intern(utf8_string)
-        self.views[self.nr_views - 1] = utf8_string
+        self.views.append(view)
     cpdef bint check_flag(self, size_t flag_id) except *:
         """Access the value of one of the pre-computed boolean distribution features.

View File

@@ -1,8 +1,6 @@
 from __future__ import unicode_literals
-from spacy.en import lookup
-from spacy.en import tokenize
-from spacy.en import unhash
+from spacy.en import EN
 import pytest
@@ -16,28 +14,28 @@ def test_close(close_puncts):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 2
-        assert unhash(tokens[1].lex) == p
-        assert unhash(tokens[0].lex) == word_str
+        assert tokens[1].string == p
+        assert tokens[0].string == word_str
 def test_two_different_close(close_puncts):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + "'"
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 3
-        assert unhash(tokens[0].lex) == word_str
-        assert unhash(tokens[1].lex) == p
-        assert unhash(tokens[2].lex) == "'"
+        assert tokens[0].string == word_str
+        assert tokens[1].string == p
+        assert tokens[2].string == "'"
 def test_three_same_close(close_puncts):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + p + p
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 4
-        assert unhash(tokens[0].lex) == word_str
-        assert unhash(tokens[1].lex) == p
+        assert tokens[0].string == word_str
+        assert tokens[1].string == p

View File

@@ -1,8 +1,6 @@
 from __future__ import unicode_literals
-from spacy.en import lookup
-from spacy.en import tokenize
-from spacy.en import unhash
+from spacy.en import EN
 import pytest
@@ -16,35 +14,35 @@ def test_open(open_puncts):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + word_str
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 2
-        assert unhash(tokens[0].lex) == p
-        assert unhash(tokens[1].lex) == word_str
+        assert tokens[0].string == p
+        assert tokens[1].string == word_str
 def test_two_different_open(open_puncts):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + "`" + word_str
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 3
-        assert unhash(tokens[0].lex) == p
-        assert unhash(tokens[1].lex) == "`"
-        assert unhash(tokens[2].lex) == word_str
+        assert tokens[0].string == p
+        assert tokens[1].string == "`"
+        assert tokens[2].string == word_str
 def test_three_same_open(open_puncts):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + p + p + word_str
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 4
-        assert unhash(tokens[0].lex) == p
-        assert unhash(tokens[3].lex) == word_str
+        assert tokens[0].string == p
+        assert tokens[3].string == word_str
 def test_open_appostrophe():
     string = "'The"
-    tokens = tokenize(string)
+    tokens = EN.tokenize(string)
     assert len(tokens) == 2
-    assert unhash(tokens[0].lex) == "'"
+    assert tokens[0].string == "'"

View File

@@ -1,8 +1,6 @@
 from __future__ import unicode_literals
-from spacy.en import tokenize
-from spacy.en import lookup
-from spacy.en import unhash
+from spacy.en import EN
 import pytest
@@ -16,22 +14,22 @@ def test_token(paired_puncts):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = open_ + word_str + close_
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 3
-        assert unhash(tokens[0].lex) == open_
-        assert unhash(tokens[1].lex) == word_str
-        assert unhash(tokens[2].lex) == close_
+        assert tokens[0].string == open_
+        assert tokens[1].string == word_str
+        assert tokens[2].string == close_
 def test_two_different(paired_puncts):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = "`" + open_ + word_str + close_ + "'"
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 5
-        assert unhash(tokens[0].lex) == "`"
-        assert unhash(tokens[1].lex) == open_
-        assert unhash(tokens[2].lex) == word_str
-        assert unhash(tokens[2].lex) == word_str
-        assert unhash(tokens[3].lex) == close_
-        assert unhash(tokens[4].lex) == "'"
+        assert tokens[0].string == "`"
+        assert tokens[1].string == open_
+        assert tokens[2].string == word_str
+        assert tokens[2].string == word_str
+        assert tokens[3].string == close_
+        assert tokens[4].string == "'"
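
Taken together, the test updates boil down to this usage pattern (a minimal sketch, assuming spacy.en.EN is importable and that "(" / ")" split the same way as the paired-punctuation fixtures):

    from spacy.en import EN

    tokens = EN.tokenize("(Hello)")
    assert len(tokens) == 3
    # Each token now exposes its text directly, replacing the old unhash(token.lex) round-trip.
    assert [t.string for t in tokens] == ["(", "Hello", ")"]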