Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-26 01:46:28 +03:00)

* Basic punct tests updated and passing

commit fdaf24604a (parent 8d20617dfd)
@@ -1,14 +0,0 @@
-from .lexeme import lex_of
-from .lexeme import length_of
-
-from .tokens import Tokens
-
-# Don't know how to get the enum Python visible :(
-
-LEX = 0
-NORM = 1
-SHAPE = 2
-LAST3 = 3
-LENGTH = 4
-
-__all__ = [Tokens, lex_of, length_of, LEX, NORM, SHAPE, LAST3, LENGTH]
spacy/en.pxd (37 changes)
@@ -3,42 +3,5 @@ from spacy.word cimport Lexeme
 cimport cython


-cpdef size_t ALPHA
-cpdef size_t DIGIT
-cpdef size_t PUNCT
-cpdef size_t SPACE
-cpdef size_t LOWER
-cpdef size_t UPPER
-cpdef size_t TITLE
-cpdef size_t ASCII
-
-cpdef size_t OFT_LOWER
-cpdef size_t OFT_TITLE
-cpdef size_t OFT_UPPER
-
-cpdef size_t PUNCT
-cpdef size_t CONJ
-cpdef size_t NUM
-cpdef size_t N
-cpdef size_t DET
-cpdef size_t ADP
-cpdef size_t ADJ
-cpdef size_t ADV
-cpdef size_t VERB
-cpdef size_t NOUN
-cpdef size_t PDT
-cpdef size_t POS
-cpdef size_t PRON
-cpdef size_t PRT
-
-cpdef size_t SIC
-cpdef size_t CANON_CASED
-cpdef size_t SHAPE
-cpdef size_t NON_SPARSE
-
-
 cdef class English(Language):
     cpdef int _split_one(self, unicode word)
-
-
-cpdef English EN
@@ -84,10 +84,10 @@ EN = English('en')


 # Thresholds for frequency related flags
-TAG_THRESH = 0.5
-LOWER_THRESH = 0.5
-UPPER_THRESH = 0.3
-TITLE_THRESH = 0.9
+cdef double TAG_THRESH = 0.5
+cdef double LOWER_THRESH = 0.5
+cdef double UPPER_THRESH = 0.3
+cdef double TITLE_THRESH = 0.9


 # Python-readable flag constants --- can't read an enum from Python
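Note: these thresholds feed the frequency-related flags (OFT_LOWER, OFT_TITLE, OFT_UPPER) that were dropped from en.pxd above. A minimal sketch of what one such checker could look like under the flag_checker(string, prob, case_stats, tag_stats) calling convention introduced for Lexeme further down; the case_stats key name is an assumption, not something this commit specifies.

# Hypothetical sketch, not spaCy's implementation: a frequency flag
# checker with the (string, prob, case_stats, tag_stats) signature that
# Lexeme.__cinit__ calls in this commit.
LOWER_THRESH = 0.5   # mirrors the cdef double above

def oft_lower(string, prob, case_stats, tag_stats):
    # Assumed: case_stats maps case forms to relative frequencies.
    return case_stats.get('lower', 0.0) >= LOWER_THRESH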
@@ -4,6 +4,10 @@ from spacy.word cimport Lexeme


 cdef class Lexicon:
+    cdef public dict probs
+    cdef public dict clusters
+    cdef public dict case_stats
+    cdef public dict tag_stats
     cdef public list flag_checkers
     cdef public list string_transformers

@@ -20,7 +20,7 @@ cdef class Language:
         self.name = name
         self.cache = {}
         self.lexicon = Lexicon()
-        self.load_tokenization(util.read_tokenization(name))
+        #self.load_special_tokenization(util.read_tokenization(name))

     cpdef list tokenize(self, unicode string):
         """Tokenize a string.
@@ -57,7 +57,7 @@ cdef class Language:
         cdef list lexemes = []
         substrings = self._split(string)
         for i, substring in enumerate(substrings):
-            lexemes.append(self.lookup(substring))
+            lexemes.append(self.lexicon.lookup(substring))
         self.cache[string] = lexemes
         return lexemes

@@ -108,7 +108,11 @@ cdef class Language:
 cdef class Lexicon:
     def __cinit__(self):
         self.flag_checkers = []
-        self.string_transforms = []
+        self.string_transformers = []
+        self.probs = {}
+        self.clusters = {}
+        self.case_stats = {}
+        self.tag_stats = {}
         self.lexicon = {}

     cpdef Lexeme lookup(self, unicode string):
@@ -151,6 +155,7 @@ cdef class Lexicon:
     def load_probs(self, location):
         """Load unigram probabilities.
         """
+        # Dict mapping words to floats
         self.probs = json.load(location)

         cdef Lexeme word
@@ -161,18 +166,21 @@ cdef class Lexicon:
             word.prob = prob

     def load_clusters(self, location):
-        self.probs = json.load(location)
+        # TODO: Find out endianness
+        # Dict mapping words to ??-endian ints
+        self.clusters = json.load(location)

         cdef Lexeme word
         cdef unicode string

         for string, word in self.lexicon.items():
-            cluster = _pop_default(self.cluster, string, 0)
+            cluster = _pop_default(self.clusters, string, 0)
             word.cluster = cluster

     def load_stats(self, location):
         """Load distributional stats.
         """
+        # Dict mapping string to dict of arbitrary stuff.
         raise NotImplementedError


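For orientation, a rough usage sketch of the loaders above (paths, values, and the Python-level EN.lexicon access are assumptions invented for the example): load_probs expects a JSON object mapping words to floats, load_clusters one mapping words to integer cluster IDs, both read from an open file object via json.load.

# Hypothetical usage sketch; file names and numbers are invented, and it
# assumes Language.lexicon is readable from Python on the EN instance.
from spacy.en import EN

with open('en_probs.json') as f:      # e.g. {"the": -2.1, "hello": -9.3}
    EN.lexicon.load_probs(f)
with open('en_clusters.json') as f:   # e.g. {"the": 473, "hello": 1502}
    EN.lexicon.load_clusters(f)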
@@ -12,7 +12,7 @@ cdef class Lexeme:
     cpdef readonly double prob
     cpdef readonly size_t cluster

-    cdef utf8_t* views
+    cdef list views
     cdef size_t nr_views

     cdef readonly flag_t flags
@@ -49,35 +49,41 @@ cdef class Lexeme:
     while "dapple" is totally different. On the other hand, "scalable" receives
     the same cluster ID as "pineapple", which is not what we'd like.
     """
-    def __cinit__(self, utf8_t string, size_t length, list views, prob=0.0,
-                  flags=0):
-        self.id = <id_t>&string
-        self.length = length
-        self.nr_strings = 0
-        self.add_views(views)
+    def __cinit__(self, unicode string, prob, cluster, case_stats,
+                  tag_stats, flag_checkers, string_transformers):
+        self.prob = prob
+        self.cluster = cluster
+        self.length = len(string)
+        self.id = hash(string)

+        self.nr_views = len(string_transformers)
+        self.views = []
+        cdef unicode view
+        for i, string_transformer in enumerate(string_transformers):
+            view = string_transformer(string, prob, case_stats, tag_stats)
+            self.views.append(view)
+
+        for i, flag_checker in enumerate(flag_checkers):
+            if flag_checker(string, prob, case_stats, tag_stats):
+                self.set_flag(i)
+
     def __dealloc__(self):
-        free(self.views)
+        pass

     property string:
         def __get__(self):
-            return self.strings[0].decode('utf8')
+            return self.views[0]

     cpdef unicode get_view_string(self, size_t i):
-        assert i < self.nr_strings
-        return self.strings[i].decode('utf8')
+        assert i < self.nr_views
+        return self.views[i]

     cpdef id_t get_view_id(self, size_t i) except 0:
-        assert i < self.nr_strings
-        return <id_t>&self.views[i]
+        return <id_t>hash(self.views[i])

     cpdef int add_view(self, unicode view) except -1:
         self.nr_views += 1
-        self.views = <char**>realloc(self.views, self.nr_views * sizeof(utf8_t))
-        cdef bytes utf8_string = view.encode('utf8')
-        # Intern strings, allowing pointer comparison
-        utf8_string = intern(utf8_string)
-        self.views[self.nr_views - 1] = utf8_string
+        self.views.append(view)

     cpdef bint check_flag(self, size_t flag_id) except *:
         """Access the value of one of the pre-computed boolean distribution features.
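To make the new construction path concrete: each string_transformer yields one view and each flag_checker sets one flag bit, in list order. The toy transformers, checkers, and constructor values below are illustrative assumptions; only the argument order comes from the diff.

# Illustrative only: builds a Lexeme the way Lexicon.lookup might, with
# toy transformers/checkers (the real lists live on Lexicon per the
# .pxd change above).
from spacy.word import Lexeme

string_transformers = [
    lambda s, prob, case, tags: s,            # view 0: the literal string
    lambda s, prob, case, tags: s.lower(),    # view 1: lower-cased view
]
flag_checkers = [
    lambda s, prob, case, tags: s.isalpha(),  # flag 0: alphabetic word
]

word = Lexeme('Hello', -9.3, 1502, {}, {}, flag_checkers, string_transformers)
assert word.string == 'Hello'              # property string -> views[0]
assert word.get_view_string(1) == 'hello'
assert word.check_flag(0)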
@@ -1,8 +1,6 @@
 from __future__ import unicode_literals

-from spacy.en import lookup
-from spacy.en import tokenize
-from spacy.en import unhash
+from spacy.en import EN

 import pytest

@@ -16,28 +14,28 @@ def test_close(close_puncts)
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 2
-        assert unhash(tokens[1].lex) == p
-        assert unhash(tokens[0].lex) == word_str
+        assert tokens[1].string == p
+        assert tokens[0].string == word_str


 def test_two_different_close(close_puncts):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + "'"
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 3
-        assert unhash(tokens[0].lex) == word_str
-        assert unhash(tokens[1].lex) == p
-        assert unhash(tokens[2].lex) == "'"
+        assert tokens[0].string == word_str
+        assert tokens[1].string == p
+        assert tokens[2].string == "'"


 def test_three_same_close(close_puncts):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + p + p
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 4
-        assert unhash(tokens[0].lex) == word_str
-        assert unhash(tokens[1].lex) == p
+        assert tokens[0].string == word_str
+        assert tokens[1].string == p
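The same mechanical rewrite is applied to all three punctuation test modules: the module-level tokenize/lookup/unhash helpers are replaced by the EN instance, and hash comparisons by the token's string view. Condensed, the new idiom is as below; the literal ')' is only an assumed member of the close_puncts fixture, which this diff does not show.

# Condensed restatement of the updated assertion style, not new coverage.
from spacy.en import EN

def test_close_punct_idiom():
    tokens = EN.tokenize("Hello)")   # assumes ')' behaves like the fixture's close_puncts
    assert len(tokens) == 2
    assert tokens[0].string == "Hello"
    assert tokens[1].string == ")"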
@@ -1,8 +1,6 @@
 from __future__ import unicode_literals

-from spacy.en import lookup
-from spacy.en import tokenize
-from spacy.en import unhash
+from spacy.en import EN

 import pytest

@@ -16,35 +14,35 @@ def test_open(open_puncts)
     word_str = 'Hello'
     for p in open_puncts:
         string = p + word_str
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 2
-        assert unhash(tokens[0].lex) == p
-        assert unhash(tokens[1].lex) == word_str
+        assert tokens[0].string == p
+        assert tokens[1].string == word_str


 def test_two_different_open(open_puncts):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + "`" + word_str
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 3
-        assert unhash(tokens[0].lex) == p
-        assert unhash(tokens[1].lex) == "`"
-        assert unhash(tokens[2].lex) == word_str
+        assert tokens[0].string == p
+        assert tokens[1].string == "`"
+        assert tokens[2].string == word_str


 def test_three_same_open(open_puncts):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + p + p + word_str
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 4
-        assert unhash(tokens[0].lex) == p
-        assert unhash(tokens[3].lex) == word_str
+        assert tokens[0].string == p
+        assert tokens[3].string == word_str


 def test_open_appostrophe():
     string = "'The"
-    tokens = tokenize(string)
+    tokens = EN.tokenize(string)
     assert len(tokens) == 2
-    assert unhash(tokens[0].lex) == "'"
+    assert tokens[0].string == "'"
@@ -1,8 +1,6 @@
 from __future__ import unicode_literals

-from spacy.en import tokenize
-from spacy.en import lookup
-from spacy.en import unhash
+from spacy.en import EN

 import pytest

@@ -16,22 +14,22 @@ def test_token(paired_puncts)
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = open_ + word_str + close_
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 3
-        assert unhash(tokens[0].lex) == open_
-        assert unhash(tokens[1].lex) == word_str
-        assert unhash(tokens[2].lex) == close_
+        assert tokens[0].string == open_
+        assert tokens[1].string == word_str
+        assert tokens[2].string == close_


 def test_two_different(paired_puncts):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = "`" + open_ + word_str + close_ + "'"
-        tokens = tokenize(string)
+        tokens = EN.tokenize(string)
         assert len(tokens) == 5
-        assert unhash(tokens[0].lex) == "`"
-        assert unhash(tokens[1].lex) == open_
-        assert unhash(tokens[2].lex) == word_str
-        assert unhash(tokens[2].lex) == word_str
-        assert unhash(tokens[3].lex) == close_
-        assert unhash(tokens[4].lex) == "'"
+        assert tokens[0].string == "`"
+        assert tokens[1].string == open_
+        assert tokens[2].string == word_str
+        assert tokens[2].string == word_str
+        assert tokens[3].string == close_
+        assert tokens[4].string == "'"