* Initial commit. Tests passing for punctuation handling. Need contractions, file transport, tokenize function, etc.

Matthew Honnibal 2014-07-05 20:51:42 +02:00
parent 5c1705d5be
commit 556f6a18ca
17 changed files with 9704 additions and 0 deletions

spacy/__init__.py (new file)

@@ -0,0 +1,28 @@
from .lexeme import lex_of
from .lexeme import sic_of

__all__ = ['lex_of', 'sic_of']


"""
from .tokens import ids_from_string
from .tokens import group_by

from .lex import sic_of
from .lex import lex_of
from .lex import normed_of
from .lex import first_of
from .lex import last_three_of

from .lex import cluster_of
from .lex import prob_of

from .lex import is_oft_upper
from .lex import is_oft_title

from .lex import can_noun
from .lex import can_verb
from .lex import can_adj
from .lex import can_adv
"""

spacy/en.cpp (new file, 4529 lines)

File diff suppressed because it is too large.

spacy/en.pxd (new file)

@@ -0,0 +1,17 @@
from ext.sparsehash cimport dense_hash_map

from spacy.lexeme cimport StringHash
from spacy.lexeme cimport Lexeme

ctypedef Py_UNICODE* string_ptr
ctypedef size_t Lexeme_addr  # For python interop
ctypedef Lexeme* Lexeme_ptr

cdef dense_hash_map[StringHash, Lexeme_ptr] LEXEMES

cpdef Lexeme_addr lookup(unicode word) except 0
cpdef Lexeme_addr lookup_chunk(unicode chunk, int start, int end) except 0

cdef StringHash hash_string(unicode s, size_t length) except 0

cpdef unicode unhash(StringHash hash_value)

spacy/en.pyx (new file)

@@ -0,0 +1,165 @@
'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
so that strings can be retrieved from hashes. Use 64-bit hash values and
boldly assume no collisions.
'''
from __future__ import unicode_literals

from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t

from spacy.lexeme cimport Lexeme

from ext.murmurhash cimport MurmurHash64A
from ext.murmurhash cimport MurmurHash64B


STRINGS = {}

LEXEMES = dense_hash_map[StringHash, Lexeme_ptr]()
LEXEMES.set_empty_key(0)

cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)


cpdef Lexeme_addr lookup(unicode string) except 0:
    '''Fetch a Lexeme representing a word string. If the word has not been seen,
    construct one, splitting off any attached punctuation or clitics. A
    reference to BLANK_WORD is returned for the empty string.

    To specify the boundaries of the word if it has not been seen, use lookup_chunk.
    '''
    if string == '':
        return <Lexeme_addr>&BLANK_WORD
    cdef size_t length = len(string)
    cdef StringHash hashed = hash_string(string, length)
    cdef Lexeme* word_ptr = LEXEMES[hashed]
    cdef size_t n
    if word_ptr == NULL:
        word_ptr = _add(hashed, string, _find_split(string, length), length)
    return <Lexeme_addr>word_ptr


cpdef Lexeme_addr lookup_chunk(unicode string, int start, int end) except 0:
    '''Fetch a Lexeme representing a word string. If the word has not been seen,
    construct one, given the specified start and end indices. A negative index
    signifies 0 for start, and the string length for end --- i.e. the string
    will not be sliced if start == -1 and end == -1.

    A reference to BLANK_WORD is returned for the empty string.
    '''
    if string == '':
        return <Lexeme_addr>&BLANK_WORD
    cdef size_t length = len(string)
    cdef StringHash hashed = hash_string(string, length)
    cdef Lexeme* chunk_ptr = LEXEMES[hashed]
    if chunk_ptr == NULL:
        chunk_ptr = _add(hashed, string, start, length)
    return <Lexeme_addr>chunk_ptr


cdef StringHash hash_string(unicode s, size_t length) except 0:
    '''Hash unicode with MurmurHash64A'''
    assert length
    return MurmurHash64A(<string_ptr>s, length * sizeof(Py_UNICODE), 0)


cpdef unicode unhash(StringHash hash_value):
    '''Fetch a string from the reverse index, given its hash value.'''
    cdef string_ptr string = STRINGS[hash_value]
    if string == NULL:
        raise ValueError(hash_value)
    return string


cdef unicode normalize_word_string(unicode word):
    '''Return a normalized version of the word, mapping:
    - 4 digit strings into !YEAR
    - Other digit strings into !DIGITS
    - All other strings into lower-case
    '''
    cdef unicode s
    if word.isdigit() and len(word) == 4:
        return '!YEAR'
    elif word[0].isdigit():
        return '!DIGITS'
    else:
        return word.lower()


cpdef unicode _substr(unicode string, int start, int end, size_t length):
    if end >= length:
        end = -1
    if start >= length:
        start = 0
    if start <= 0 and end < 0:
        return string
    elif start < 0:
        start = 0
    elif end < 0:
        end = length
    return string[start:end]


cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL:
    assert string
    assert split <= length
    word = _init_lexeme(string, hashed, split, length)
    LEXEMES[hashed] = word
    STRINGS[hashed] = string
    return word


cdef Lexeme* _init_lexeme(unicode string, StringHash hashed,
                          int split, size_t length) except NULL:
    assert split <= length
    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))

    word.first = <Py_UNICODE>(string[0] if string else 0)
    word.sic = hashed

    cdef unicode tail_string
    cdef unicode lex
    if split != 0 and split < length:
        lex = _substr(string, 0, split, length)
        tail_string = _substr(string, split, length, length)
    else:
        lex = string
        tail_string = ''
    assert lex

    cdef unicode normed = normalize_word_string(lex)
    cdef unicode last3 = _substr(string, length - 3, length, length)

    assert normed
    assert len(normed)

    word.lex = hash_string(lex, len(lex))
    word.normed = hash_string(normed, len(normed))
    word.last3 = hash_string(last3, len(last3))

    STRINGS[word.lex] = lex
    STRINGS[word.normed] = normed
    STRINGS[word.last3] = last3

    # These are loaded later
    word.prob = 0
    word.cluster = 0
    word.oft_upper = False
    word.oft_title = False

    # Now recurse, and deal with the tail
    if tail_string:
        word.tail = <Lexeme*>lookup(tail_string)

    return word


cdef size_t _find_split(unicode word, size_t length):
    cdef size_t i = 0
    if word[0].isalnum():
        while i < length and word[i].isalnum():
            i += 1
    else:
        # Split off a punctuation character, or a sequence of the same punctuation character
        while i < length and not word[i].isalnum() and (i == 0 or word[i-1] == word[i]):
            i += 1
    return i
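A reading aid, not part of the commit: the chunk-splitting behaviour of _find_split, combined with the tail recursion in _init_lexeme, can be sketched in plain Python as below. The helper names find_split and split_chunk are illustrative only.

def find_split(word):
    # Mirrors _find_split: take an alphanumeric prefix, or else a run of one
    # repeated punctuation character.
    i = 0
    length = len(word)
    if word[0].isalnum():
        while i < length and word[i].isalnum():
            i += 1
    else:
        while i < length and not word[i].isalnum() and (i == 0 or word[i - 1] == word[i]):
            i += 1
    return i

def split_chunk(chunk):
    # Mirrors the head/tail linked list built by _init_lexeme: peel one piece
    # off the front, then recurse on the remainder.
    if not chunk:
        return []
    split = find_split(chunk)
    return [chunk[:split]] + split_chunk(chunk[split:])

print(split_chunk(u'Hello)))'))    # [u'Hello', u')))']
print(split_chunk(u"(Hello!'"))    # [u'(', u'Hello', u'!', u"'"]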

spacy/lexeme.cpp (new file, 2433 lines)

File diff suppressed because it is too large.

spacy/lexeme.pxd (new file)

@@ -0,0 +1,35 @@
from libc.stdint cimport uint64_t


ctypedef int ClusterID
ctypedef uint64_t StringHash


cdef struct Lexeme:
    StringHash sic       # Hash of the original string
    StringHash lex       # Hash of the word, with punctuation and clitics split off
    StringHash normed    # Hash of the normalized version of lex
    StringHash last3     # Last 3 characters of the token
    Py_UNICODE first     # First character of the token

    double prob          # What is the log probability of the lex value?
    ClusterID cluster    # Brown cluster of the token
    bint oft_upper       # Is the lowered version of the lex value often in all caps?
    bint oft_title       # Is the lowered version of the lex value often title-cased?
    Lexeme* tail         # Lexemes are linked lists, to deal with sub-tokens


# Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
# has a conditional to pick out the correct item. This allows safe iteration
# over the Lexeme, via:
# for field in range(LexAttr.n): get_attr(Lexeme*, field)
cdef enum HashFields:
    sic
    lex
    normed
    cluster
    n


#cdef uint64_t get_attr(Lexeme* word, HashFields attr)
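The comment above describes accessing Lexeme fields by an enum value, so that code can loop over a lexeme's hash-valued fields. get_attr itself is not implemented in this commit; a rough Python analogue of the idea, with illustrative names, would be:

# Python analogue of the get_attr idea sketched in the comment above.
SIC, LEX, NORMED, CLUSTER, N_FIELDS = range(5)

def get_attr(word, attr):
    # One conditional picks out the requested field.
    if attr == SIC:
        return word['sic']
    elif attr == LEX:
        return word['lex']
    elif attr == NORMED:
        return word['normed']
    elif attr == CLUSTER:
        return word['cluster']
    else:
        raise ValueError(attr)

word = {'sic': 123, 'lex': 456, 'normed': 789, 'cluster': 42}
fields = [get_attr(word, f) for f in range(N_FIELDS)]   # [123, 456, 789, 42]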

spacy/lexeme.pyx (new file)

@@ -0,0 +1,114 @@
'''Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*.
Mostly useful from Python-space. From Cython-space, you can just cast to
Lexeme* yourself.
'''


cpdef StringHash sic_of(size_t lex_id) except 0:
    '''Access the `sic' field of the Lexeme pointed to by lex_id.

    The sic field stores the hash of the whitespace-delimited string-chunk used to
    construct the Lexeme.

    >>> [unhash(sic_of(lex_id)) for lex_id in from_string(u'Hi! world')]
    [u'Hi!', u'', u'world']
    '''
    return (<Lexeme*>lex_id).sic


cpdef StringHash lex_of(size_t lex_id) except 0:
    '''Access the `lex' field of the Lexeme pointed to by lex_id.

    The lex field is the hash of the string you would expect to get back from
    a standard tokenizer, i.e. the word with punctuation and other non-whitespace
    delimited tokens split off. The other fields refer to properties of the
    string that the lex field stores a hash of, except sic and tail.

    >>> [unhash(lex_of(lex_id)) for lex_id in from_string(u'Hi! world')]
    [u'Hi', u'!', u'world']
    '''
    return (<Lexeme*>lex_id).lex


cpdef ClusterID cluster_of(size_t lex_id):
    '''Access the `cluster' field of the Lexeme pointed to by lex_id, which
    gives an integer representation of the cluster ID of the word,
    which should be understood as a binary address:

    >>> strings = (u'pineapple', u'apple', u'dapple', u'scalable')
    >>> token_ids = [lookup(s) for s in strings]
    >>> clusters = [cluster_of(t) for t in token_ids]
    >>> print ["{0:b}".format(cluster_of(t)) for t in token_ids]
    ["100111110110", "100111100100", "01010111011001", "100111110110"]

    The clusterings are unideal, but often slightly useful.
    "pineapple" and "apple" share a long prefix, indicating a similar meaning,
    while "dapple" is totally different. On the other hand, "scalable" receives
    the same cluster ID as "pineapple", which is not what we'd like.
    '''
    return (<Lexeme*>lex_id).cluster


cpdef Py_UNICODE first_of(size_t lex_id):
    '''Access the `first' field of the Lexeme pointed to by lex_id, which
    stores the first character of the lex string of the word.

    >>> lex_id = lookup(u'Hello')
    >>> first_of(lex_id)
    u'H'
    '''
    return (<Lexeme*>lex_id).first


cpdef double prob_of(size_t lex_id):
    '''Access the `prob' field of the Lexeme pointed to by lex_id, which stores
    the smoothed unigram log probability of the word, as estimated from a large
    text corpus. By default, probabilities are based on counts from Gigaword,
    smoothed using Kneser-Ney; but any probabilities file can be supplied to
    load_probs.

    >>> prob_of(lookup(u'world'))
    -20.10340371976182
    '''
    pass


cpdef StringHash last3_of(size_t lex_id):
    '''Access the `last3' field of the Lexeme pointed to by lex_id, which stores
    the hash of the last three characters of the word:

    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
    [u'llo', u'!']
    '''
    return (<Lexeme*>lex_id).last3


cpdef bint is_oft_upper(size_t lex_id):
    '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
    stores whether the lowered version of the string hashed by `lex' is found
    in all-upper case frequently in a large sample of text. Users are free
    to load different data, by default we use a sample from Wikipedia, with
    a threshold of 0.95, picked to maximize mutual information for POS tagging.

    >>> is_oft_upper(lookup(u'abc'))
    True
    >>> is_oft_upper(lookup(u'aBc'))  # This must get the same answer
    True
    '''
    return (<Lexeme*>lex_id).oft_upper


cpdef bint is_oft_title(size_t lex_id):
    '''Access the `oft_title' field of the Lexeme pointed to by lex_id, which
    stores whether the lowered version of the string hashed by `lex' is found
    title-cased frequently in a large sample of text. Users are free
    to load different data, by default we use a sample from Wikipedia, with
    a threshold of 0.3, picked to maximize mutual information for POS tagging.

    >>> is_oft_title(lookup(u'marcus'))
    True
    >>> is_oft_title(lookup(u'MARCUS'))  # This must get the same value
    True
    '''
    return (<Lexeme*>lex_id).oft_title
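A hedged usage sketch of the accessors above, assuming the extension modules are compiled and the data loaded; expected values follow the docstrings and the tests in this commit:

from spacy.en import lookup, unhash
from spacy.lexeme import lex_of, last3_of, first_of

lex_id = lookup(u'Hello')
assert unhash(lex_of(lex_id)) == u'Hello'   # round-trip through the reverse index
assert unhash(last3_of(lex_id)) == u'llo'   # hash of the last three characters
assert first_of(lex_id) == u'H'             # first character, stored directly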

spacy/spacy.cpp (new file, 2064 lines)

File diff suppressed because it is too large.

spacy/spacy.pxd (new file)

@@ -0,0 +1,5 @@
from libcpp.vector cimport vector

from spacy.lexeme cimport Lexeme

cpdef vector[size_t] expand_chunk(size_t addr) except *

spacy/spacy.pyx (new file)

@@ -0,0 +1,72 @@
from __future__ import unicode_literals

from spacy.lexeme cimport Lexeme


cpdef vector[size_t] expand_chunk(size_t addr) except *:
    cdef vector[size_t] tokens = vector[size_t]()
    word = <Lexeme*>addr
    while word is not NULL:
        tokens.push_back(<size_t>word)
        word = word.tail
    return tokens


"""
cpdef vector[size_t] ids_from_text(unicode text) except *:
    cdef size_t length = len(text)
    cdef Py_UNICODE* characters = <Py_UNICODE*>text

    cdef size_t i
    cdef Py_UNICODE c

    cdef vector[size_t] tokens = vector[size_t]()
    cdef unicode current = u''
    cdef Lexeme* token
    cdef int alnum_end = -1
    cdef size_t alnum_start = 0
    cdef bint seen_alnum = False
    for i in range(length):
        c = characters[i]
        if is_whitespace(c):
            token = <Lexeme*>lookup(current)
            tokens.push_back(<size_t>token)
            clitic = 0
            while token.clitics[clitic]:
                tokens.push_back(token.clitics[clitic])
                clitic += 1
            current = u''
            alnum_start = 0
            alnum_end = -1
            seen_alnum = False
        else:
            if not seen_alnum and c.isalnum():
                alnum_start = i
                seen_alnum = True
            elif seen_alnum and alnum_end == -1 and not c.isalnum():
                alnum_end = i
            current += c
    if current:
        token = <Lexeme*>lookup(current)
        tokens.push_back(<size_t>token)
        clitic = 0
        while token.clitics[clitic]:
            tokens.push_back(token.clitics[clitic])
            clitic += 1
    return tokens
"""

#cdef vector[Tokens] group_by(Tokens tokens, LexAttr field) except *:
#    pass


cdef inline bint is_whitespace(Py_UNICODE c):
    # TODO: Support other unicode spaces
    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
    if c == u' ':
        return True
    elif c == u'\n':
        return True
    elif c == u'\t':
        return True
    else:
        return False
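expand_chunk simply walks the tail pointers set up in en.pyx. A short usage sketch, mirroring tests/test_post_punct.py (assuming the modules are compiled):

from spacy import lex_of
from spacy.spacy import expand_chunk
from spacy.en import lookup, unhash

tokens = expand_chunk(lookup(u'Hello)'))
assert [unhash(lex_of(t)) for t in tokens] == [u'Hello', u')']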

spacy/util.py (new file)

@@ -0,0 +1,75 @@
from os import path
import codecs


def utf8open(loc, mode='r'):
    return codecs.open(loc, mode, 'utf8')


def load_case_stats(data_dir):
    case_loc = path.join(data_dir, 'english.case')
    case_stats = {}
    with utf8open(case_loc) as cases_file:
        for line in cases_file:
            word, upper, title = line.split()
            case_stats[word] = (float(upper), float(title))
    return case_stats


def load_clitics(data_dir):
    clitics_loc = path.join(data_dir, 'clitics.txt')
    entries = []
    seen = set()
    with utf8open(clitics_loc) as clitics_file:
        for line in clitics_file:
            line = line.strip()
            if line.startswith('#'):
                continue
            if not line:
                continue
            clitics = line.split()
            word = clitics.pop(0)
            norm_form = clitics.pop(0)
            assert word not in seen, word
            seen.add(word)
            entries.append((word, norm_form, clitics))
    return entries


"""
def load_browns(self, data_dir):
    cdef Lexeme* w
    case_stats = load_case_stats(data_dir)
    brown_loc = path.join(data_dir, 'bllip-clusters')
    assert path.exists(brown_loc)
    cdef size_t start
    cdef int end
    with utf8open(brown_loc) as browns_file:
        for i, line in enumerate(browns_file):
            cluster_str, word, freq_str = line.split()
            # Decode as a little-endian string, so that we can do & 15 to get
            # the first 4 bits. See redshift._parse_features.pyx
            cluster = int(cluster_str[::-1], 2)
            upper_pc, title_pc = case_stats.get(word.lower(), (0.0, 0.0))
            start = 0
            end = -1
            find_slice(&start, &end, word)
            print "Load", repr(word), start, end
            w = <Lexeme*>init_word(word, start, end, cluster,
                                   upper_pc, title_pc, int(freq_str))
            self.words[_hash_str(word)] = <size_t>w
            self.strings[<size_t>w] = word

def load_clitics(self, data_dir):
    cdef unicode orig_str
    cdef unicode clitic
    for orig_str, norm_form, clitic_strs in util.load_clitics(data_dir):
        w = init_clitic(orig_str, <Lexeme*>self.lookup_slice(norm_form, 0, -1))
        self.words[w.orig] = <size_t>w
        self.strings[<size_t>w] = orig_str
        assert len(clitic_strs) < MAX_CLITICS
        assert clitic_strs
        for i, clitic in enumerate(clitic_strs):
            # If we write punctuation here, assume we want to keep it,
            # so tell it the slice boundaries (the full string)
            w.clitics[i] = self.lookup_slice(clitic, 0, -1)
        # Ensure we null terminate
        w.clitics[i+1] = 0
"""

Binary file not shown.

tests/my_test.py (new file, empty)

tests/test_post_punct.py (new file)

@@ -0,0 +1,48 @@
from __future__ import unicode_literals

from spacy import lex_of
from spacy.spacy import expand_chunk
from spacy.en import lookup
from spacy.en import unhash

import pytest


@pytest.fixture
def close_puncts():
    return [')', ']', '}', '*']


def test_close(close_puncts):
    word_str = 'Hello'
    for p in close_puncts:
        string = word_str + p
        token = lookup(string)
        assert unhash(lex_of(token)) == word_str
        tokens = expand_chunk(token)
        assert len(tokens) == 2
        assert unhash(lex_of(tokens[0])) == word_str
        assert unhash(lex_of(tokens[1])) == p


def test_two_different_close(close_puncts):
    word_str = 'Hello'
    for p in close_puncts:
        string = word_str + p + "'"
        token = lookup(string)
        assert unhash(lex_of(token)) == word_str
        tokens = expand_chunk(token)
        assert len(tokens) == 3
        assert unhash(lex_of(tokens[0])) == word_str
        assert unhash(lex_of(tokens[1])) == p
        assert unhash(lex_of(tokens[2])) == "'"


def test_three_same_close(close_puncts):
    word_str = 'Hello'
    for p in close_puncts:
        string = word_str + p + p + p
        tokens = expand_chunk(lookup(string))
        assert len(tokens) == 2
        assert unhash(lex_of(tokens[0])) == word_str
        assert unhash(lex_of(tokens[1])) == p + p + p

tests/test_pre_punct.py (new file)

@@ -0,0 +1,50 @@
from __future__ import unicode_literals

from spacy import lex_of
from spacy.spacy import expand_chunk
from spacy.en import lookup
from spacy.en import unhash

import pytest


@pytest.fixture
def open_puncts():
    return ['(', '[', '{', '*']


def test_open(open_puncts):
    word_str = 'Hello'
    for p in open_puncts:
        string = p + word_str
        token = lookup(string)
        assert unhash(lex_of(token)) == p
        tokens = expand_chunk(token)
        assert len(tokens) == 2
        assert unhash(lex_of(tokens[0])) == p
        assert unhash(lex_of(tokens[1])) == word_str


def test_two_different_open(open_puncts):
    word_str = 'Hello'
    for p in open_puncts:
        string = p + "`" + word_str
        token = lookup(string)
        assert unhash(lex_of(token)) == p
        tokens = expand_chunk(token)
        assert len(tokens) == 3
        assert unhash(lex_of(tokens[0])) == p
        assert unhash(lex_of(tokens[1])) == "`"
        assert unhash(lex_of(tokens[2])) == word_str


def test_three_same_open(open_puncts):
    word_str = 'Hello'
    for p in open_puncts:
        string = p + p + p + word_str
        token = lookup(string)
        assert unhash(lex_of(token)) == p + p + p
        tokens = expand_chunk(token)
        assert len(tokens) == 2
        assert unhash(lex_of(tokens[0])) == p + p + p
        assert unhash(lex_of(tokens[1])) == word_str


@@ -0,0 +1,39 @@
from __future__ import unicode_literals

from spacy import lex_of, sic_of
from spacy.spacy import expand_chunk
from spacy.en import lookup
from spacy.en import unhash

import pytest


@pytest.fixture
def paired_puncts():
    return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]


def test_token(paired_puncts):
    word_str = 'Hello'
    for open_, close_ in paired_puncts:
        string = open_ + word_str + close_
        tokens = expand_chunk(lookup(string))
        assert len(tokens) == 3
        assert unhash(lex_of(tokens[0])) == open_
        assert unhash(lex_of(tokens[1])) == word_str
        assert unhash(lex_of(tokens[2])) == close_
        assert unhash(sic_of(tokens[0])) == string


def test_two_different(paired_puncts):
    word_str = 'Hello'
    for open_, close_ in paired_puncts:
        string = "`" + open_ + word_str + close_ + "'"
        tokens = expand_chunk(lookup(string))
        assert len(tokens) == 5
        assert unhash(lex_of(tokens[0])) == "`"
        assert unhash(lex_of(tokens[1])) == open_
        assert unhash(lex_of(tokens[2])) == word_str
        assert unhash(lex_of(tokens[3])) == close_
        assert unhash(lex_of(tokens[4])) == "'"

tests/test_vocab.py (new file)

@@ -0,0 +1,30 @@
from __future__ import unicode_literals

from spacy import lex_of
from spacy.en import lookup
from spacy.en import unhash


def test_neq():
    addr = lookup('Hello')
    assert lookup('bye') != addr


def test_eq():
    addr = lookup('Hello')
    assert lookup('Hello') == addr


def test_round_trip():
    hello = lookup('Hello')
    assert unhash(lex_of(hello)) == 'Hello'


def test_case_neq():
    addr = lookup('Hello')
    assert lookup('hello') != addr


def test_punct_neq():
    addr = lookup('Hello')
    assert lookup('Hello,') != addr