Merge branch 'feature/contractions' into develop

This commit is contained in:
Matthew Honnibal 2014-07-07 05:11:43 +02:00
commit aaae66114c
36 changed files with 926535 additions and 12119 deletions

4
.gitignore vendored
View File

@ -1,6 +1,10 @@
# Vim # Vim
*.swp *.swp
spacy/*.cpp
ext/murmurhash.cpp
ext/sparsehash.cpp
_build/ _build/
.env/ .env/

146129
data/en/case Normal file

File diff suppressed because it is too large Load Diff

316709
data/en/clusters Normal file

File diff suppressed because it is too large Load Diff

93
data/en/tokenization Normal file
View File

@ -0,0 +1,93 @@
# https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
# 21:09, 25 June 2014
#*--* --
#*---* ---
#*'s 's
ain't are not
aren't are not
can't can not
could've could have
couldn't could not
couldn't've could not have
didn't did not
doesn't does not
don't do not
hadn't had not
hadn't've had not have
hasn't has not
haven't have not
he'd he would
he'd've he would have
he'll he will
he's he 's
how'd how would
how'll how will
how's how 's
I'd I would
I'd've I would have
I'll I will
I'm I am
I've I have
isn't is not
it'd it would
it'd've it would have
it'll it will
it's it 's
let's let 's
mightn't might not
mightn't've might not have
might've might have
mustn't must not
must've must have
needn't need not
not've not have
shan't shall not
she'd she would
she'd've she would have
she'll she will
she's she 's
should've should have
shouldn't should not
shouldn't've should not have
that's that 's
there'd there would
there'd've there would have
there's there is
they'd they would
they'd've they would have
they'll they will
they're they are
they've they have
wasn't was not
we'd we would
we'd've we would have
we'll we will
we're we are
we've we have
weren't were not
what'll what will
what're what are
what's what 's
what've what have
when's when 's
where'd where would
where's where 's
where've where have
who'd who would
who'll who will
who're who are
who's who 's
who've who have
why'll why will
why're why are
why's why 's
won't will not
would've would have
wouldn't would not
wouldn't've would not have
you'd you would
you'd've you would have
you'll you will
you're you are
you've you have

146129
data/en_ptb/case Normal file

File diff suppressed because it is too large Load Diff

316709
data/en_ptb/clusters Normal file

File diff suppressed because it is too large Load Diff

104
data/en_ptb/tokenization Normal file
View File

@ -0,0 +1,104 @@
# https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
# 21:09, 25 June 2014
#*--* --
#*---* ---
#*'s 's
cannot can not
d'ye d' ye
gimme gim me
gonna gon na
lemme lem me
more'n more 'n
'tis 't is
'twas 't was
wanna wan na
whaddya wha dd ya
whatcha wha t cha
ain't ai n't
aren't are n't
can't ca n't
could've could 've
couldn't could n't
couldn't've could n't 've
didn't did n't
doesn't does n't
don't do n't
hadn't had n't
hadn't've had n't 've
hasn't has n't
haven't have n't
he'd he 'd
he'd've he 'd 've
he'll he 'll
he's he 's
how'd how 'd
how'll how 'll
how's how 's
I'd I 'd
I'd've I 'd 've
I'll I 'll
I'm I 'm
I've I 've
isn't is n't
it'd it 'd
it'd've it 'd 've
it'll it 'll
it's it 's
let's let 's
mightn't might n't
mightn't've might n't 've
might've might 've
mustn't must n't
must've must 've
needn't need n't
not've not 've
shan't sha n't
she'd she 'd
she'd've she 'd 've
she'll she 'll
she's she 's
should've should 've
shouldn't should n't
shouldn't've should n't 've
that's that 's
there'd there 'd
there'd've there 'd 've
there's there 's
they'd they 'd
they'd've they 'd 've
they'll they 'll
they're they 're
they've they 've
wasn't was n't
we'd we 'd
we'd've we 'd 've
we'll we 'll
we're we 're
we've we 've
weren't were n't
what'll what 'll
what're what 're
what's what 's
what've what 've
when's when 's
where'd where 'd
where's where 's
where've where 've
who'd who 'd
who'll who 'll
who're who 're
who's who 's
who've who 've
why'll why 'll
why're why 're
why's why 's
won't wo n't
would've would 've
wouldn't would n't
wouldn't've would n't 've
you'd you 'd
you'd've you 'd 've
you'll you 'll
you're you 're
you've you 've

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -45,8 +45,18 @@ exts = [
["spacy/en.pyx", "ext/MurmurHash3.cpp", "ext/MurmurHash2.cpp"], ["spacy/en.pyx", "ext/MurmurHash3.cpp", "ext/MurmurHash2.cpp"],
language="c++", language="c++",
include_dirs=[path.join(HERE, 'ext')]), include_dirs=[path.join(HERE, 'ext')]),
Extension("spacy.en_ptb",
["spacy/en_ptb.pyx", "ext/MurmurHash3.cpp", "ext/MurmurHash2.cpp"],
language="c++",
include_dirs=[path.join(HERE, 'ext')]),
Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes), Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes),
Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes), Extension("spacy.spacy",
["spacy/spacy.pyx", "ext/MurmurHash3.cpp", "ext/MurmurHash2.cpp"],
language="c++", include_dirs=includes),
Extension("spacy.string_tools",
["spacy/string_tools.pyx", "ext/MurmurHash3.cpp", "ext/MurmurHash2.cpp"],
language="c++", include_dirs=includes),
] ]

File diff suppressed because it is too large Load Diff

View File

@ -1,17 +1,15 @@
from ext.sparsehash cimport dense_hash_map from libcpp.vector cimport vector
from spacy.lexeme cimport StringHash
from spacy.spacy cimport StringHash
from spacy.spacy cimport Vocab
from spacy.lexeme cimport Lexeme from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Lexeme_addr
ctypedef Py_UNICODE* string_ptr cdef Vocab VOCAB
ctypedef size_t Lexeme_addr # For python interop cdef dict BACOV
ctypedef Lexeme* Lexeme_ptr
cdef dense_hash_map[StringHash, Lexeme_ptr] LEXEMES
cpdef Lexeme_addr lookup(unicode word) except 0 cpdef Lexeme_addr lookup(unicode word) except 0
cpdef Lexeme_addr lookup_chunk(unicode chunk, int start, int end) except 0 cpdef vector[Lexeme_addr] tokenize(unicode string) except *
cdef StringHash hash_string(unicode s, size_t length) except 0
cpdef unicode unhash(StringHash hash_value) cpdef unicode unhash(StringHash hash_value)

View File

@ -6,160 +6,69 @@ from __future__ import unicode_literals
from libc.stdlib cimport malloc, calloc, free from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
from spacy.lexeme cimport Lexeme from spacy.lexeme cimport Lexeme
from ext.murmurhash cimport MurmurHash64A from spacy.string_tools cimport substr
from ext.murmurhash cimport MurmurHash64B from . import util
cimport spacy
BACOV = {}
VOCAB = Vocab()
VOCAB.set_empty_key(0)
STRINGS = {} spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en'))
LEXEMES = dense_hash_map[StringHash, Lexeme_ptr]()
LEXEMES.set_empty_key(0)
cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL) cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
return spacy.tokenize(VOCAB, BACOV, find_split, string)
cpdef Lexeme_addr lookup(unicode string) except 0: cpdef Lexeme_addr lookup(unicode string) except 0:
'''.. function:: enumerate(sequence[, start=0]) return spacy.lookup(VOCAB, BACOV, find_split, -1, string)
Fetch a Lexeme representing a word string. If the word has not been seen,
construct one, splitting off any attached punctuation or clitics. A
reference to BLANK_WORD is returned for the empty string.
To specify the boundaries of the word if it has not been seen, use lookup_chunk.
'''
if string == '':
return <Lexeme_addr>&BLANK_WORD
cdef size_t length = len(string)
cdef StringHash hashed = hash_string(string, length)
cdef Lexeme* word_ptr = LEXEMES[hashed]
cdef size_t n
if word_ptr == NULL:
word_ptr = _add(hashed, string, _find_split(string, length), length)
return <Lexeme_addr>word_ptr
cpdef Lexeme_addr lookup_chunk(unicode string, int start, int end) except 0:
'''Fetch a Lexeme representing a word string. If the word has not been seen,
construct one, given the specified start and end indices. A negative index
signifies 0 for start, and the string length for end --- i.e. the string
will not be sliced if start == -1 and end == -1.
A reference to BLANK_WORD is returned for the empty string.
'''
if string == '':
return <Lexeme_addr>&BLANK_WORD
cdef size_t length = len(string)
cdef StringHash hashed = hash_string(string, length)
cdef Lexeme* chunk_ptr = LEXEMES[hashed]
if chunk_ptr == NULL:
chunk_ptr = _add(hashed, string, start, length)
return <Lexeme_addr>chunk_ptr
cdef StringHash hash_string(unicode s, size_t length) except 0:
'''Hash unicode with MurmurHash64A'''
assert length
return MurmurHash64A(<string_ptr>s, length * sizeof(Py_UNICODE), 0)
cpdef unicode unhash(StringHash hash_value): cpdef unicode unhash(StringHash hash_value):
'''Fetch a string from the reverse index, given its hash value.''' return spacy.unhash(BACOV, hash_value)
cdef string_ptr string = STRINGS[hash_value]
if string == NULL:
raise ValueError(hash_value)
return string
cdef unicode normalize_word_string(unicode word): cdef vector[StringHash] make_string_views(unicode word):
'''Return a normalized version of the word, mapping:
- 4 digit strings into !YEAR
- Other digit strings into !DIGITS
- All other strings into lower-case
'''
cdef unicode s cdef unicode s
if word.isdigit() and len(word) == 4: return vector[StringHash]()
return '!YEAR' #if word.isdigit() and len(word) == 4:
elif word[0].isdigit(): # return '!YEAR'
return '!DIGITS' #elif word[0].isdigit():
else: # return '!DIGITS'
return word.lower() #else:
# return word.lower()
cpdef unicode _substr(unicode string, int start, int end, size_t length):
if end >= length:
end = -1
if start >= length:
start = 0
if start <= 0 and end < 0:
return string
elif start < 0:
start = 0
elif end < 0:
end = length
return string[start:end]
cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL: cdef int find_split(unicode word, size_t length):
assert string cdef int i = 0
assert split <= length # Contractions
word = _init_lexeme(string, hashed, split, length) if word.endswith("'s"):
LEXEMES[hashed] = word return length - 2
STRINGS[hashed] = string # Leading punctuation
return word if is_punct(word, 0, length):
return 1
elif length >= 1:
cdef Lexeme* _init_lexeme(unicode string, StringHash hashed, # Split off all trailing punctuation characters
int split, size_t length) except NULL: i = 0
assert split <= length while i < length and not is_punct(word, i, length):
cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
word.first = <Py_UNICODE>(string[0] if string else 0)
word.sic = hashed
cdef unicode tail_string
cdef unicode lex
if split != 0 and split < length:
lex = _substr(string, 0, split, length)
tail_string = _substr(string, split, length, length)
else:
lex = string
tail_string = ''
assert lex
cdef unicode normed = normalize_word_string(lex)
cdef unicode last3 = _substr(string, length - 3, length, length)
assert normed
assert len(normed)
word.lex = hash_string(lex, len(lex))
word.normed = hash_string(normed, len(normed))
word.last3 = hash_string(last3, len(last3))
STRINGS[word.lex] = lex
STRINGS[word.normed] = normed
STRINGS[word.last3] = last3
# These are loaded later
word.prob = 0
word.cluster = 0
word.oft_upper = False
word.oft_title = False
# Now recurse, and deal with the tail
if tail_string:
word.tail = <Lexeme*>lookup(tail_string)
return word
cdef size_t _find_split(unicode word, size_t length):
cdef size_t i = 0
if word[0].isalnum():
while i < length and word[i].isalnum():
i += 1
else:
# Split off a punctuation character, or a sequence of the same punctuation character
while i < length and not word[i].isalnum() and (i == 0 or word[i-1] == word[i]):
i += 1 i += 1
return i return i
cdef bint is_punct(unicode word, size_t i, size_t length):
# Don't count appostrophes as punct if the next char is a letter
if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
return False
# Don't count commas as punct if the next char is a number
if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
return False
# Don't count periods as punct if the next char is a number
if word[i] == "." and i < (length - 1) and word[i+1].isdigit():
return False
return not word[i].isalnum()

15
spacy/en_ptb.pxd Normal file
View File

@ -0,0 +1,15 @@
from libcpp.vector cimport vector
from spacy.spacy cimport StringHash
from spacy.spacy cimport Vocab
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Lexeme_addr
cdef Vocab VOCAB
cdef dict BACOV
cpdef Lexeme_addr lookup(unicode word) except 0
cpdef vector[Lexeme_addr] tokenize(unicode string) except *
cpdef unicode unhash(StringHash hash_value)

74
spacy/en_ptb.pyx Normal file
View File

@ -0,0 +1,74 @@
'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
so that strings can be retrieved from hashes. Use 64-bit hash values and
boldly assume no collisions.
'''
from __future__ import unicode_literals
from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
from spacy.lexeme cimport Lexeme
from spacy.string_tools cimport substr
from . import util
cimport spacy
BACOV = {}
VOCAB = Vocab()
VOCAB.set_empty_key(0)
spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en_ptb'))
cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
return spacy.tokenize(VOCAB, BACOV, find_split, string)
cpdef Lexeme_addr lookup(unicode string) except 0:
return spacy.lookup(VOCAB, BACOV, find_split, -1, string)
cpdef unicode unhash(StringHash hash_value):
return spacy.unhash(BACOV, hash_value)
cdef vector[StringHash] make_string_views(unicode word):
cdef unicode s
return vector[StringHash]()
#if word.isdigit() and len(word) == 4:
# return '!YEAR'
#elif word[0].isdigit():
# return '!DIGITS'
#else:
# return word.lower()
cdef int find_split(unicode word, size_t length):
cdef int i = 0
# Contractions
if word.endswith("'s"):
return length - 2
# Leading punctuation
if is_punct(word, 0, length):
return 1
elif length >= 1:
# Split off all trailing punctuation characters
i = 0
while i < length and not is_punct(word, i, length):
i += 1
return i
cdef bint is_punct(unicode word, size_t i, size_t length):
    # Return True if word[i] should be treated as punctuation for the
    # PTB-style splitter.
    is_final = i == (length - 1)
    # Periods are never split in this scheme (sentence-final periods are
    # handled by the data rules, not here).
    if word[i] == '.':
        return False
    # An em-dash written as "--" counts as punctuation.
    if not is_final and word[i] == '-' and word[i+1] == '-':
        return True
    # Don't count apostrophes as punct if the next char is a letter
    # (they begin a clitic such as "'s" or "n't").
    if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
        return False
    # Membership test on a string literal: same character set as the
    # original (',;:' + '@#$%&' + '!?' + '[({' + '})]'), but avoids
    # constructing a fresh set object on every call.
    return word[i] in ',;:@#$%&!?[({})]'

File diff suppressed because it is too large Load Diff

View File

@ -1,9 +1,12 @@
from libc.stdint cimport uint64_t from libc.stdint cimport uint64_t
# Put these above import to avoid circular import problem
ctypedef int ClusterID ctypedef int ClusterID
ctypedef uint64_t StringHash ctypedef uint64_t StringHash
ctypedef size_t Lexeme_addr
from spacy.spacy cimport Vocab
from spacy.spacy cimport Splitter
cdef struct Lexeme: cdef struct Lexeme:
StringHash sic # Hash of the original string StringHash sic # Hash of the original string
@ -20,6 +23,12 @@ cdef struct Lexeme:
Lexeme* tail # Lexemes are linked lists, to deal with sub-tokens Lexeme* tail # Lexemes are linked lists, to deal with sub-tokens
cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
unicode string, StringHash hashed,
int split, size_t length) except NULL
# Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which # Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
# has a conditional to pick out the correct item. This allows safe iteration # has a conditional to pick out the correct item. This allows safe iteration
# over the Lexeme, via: # over the Lexeme, via:

View File

@ -2,6 +2,60 @@
Mostly useful from Python-space. From Cython-space, you can just cast to Mostly useful from Python-space. From Cython-space, you can just cast to
Lexeme* yourself. Lexeme* yourself.
''' '''
from __future__ import unicode_literals
from spacy.string_tools cimport substr
from spacy.spacy cimport hash_string
from spacy.spacy cimport lookup
from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
unicode string, StringHash hashed,
int split, size_t length) except NULL:
assert split <= length
cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
word.first = <Py_UNICODE>(string[0] if string else 0)
word.sic = hashed
cdef unicode tail_string
cdef unicode lex
if split != 0 and split < length:
lex = substr(string, 0, split, length)
tail_string = substr(string, split, length, length)
else:
lex = string
tail_string = ''
assert lex
#cdef unicode normed = normalize_word_string(lex)
cdef unicode normed = '?'
cdef unicode last3 = substr(string, length - 3, length, length)
assert normed
assert len(normed)
word.lex = hash_string(lex, len(lex))
word.normed = hash_string(normed, len(normed))
word.last3 = hash_string(last3, len(last3))
bacov[word.lex] = lex
bacov[word.normed] = normed
bacov[word.last3] = last3
# These are loaded later
word.prob = 0
word.cluster = 0
word.oft_upper = False
word.oft_title = False
# Now recurse, and deal with the tail
if tail_string:
word.tail = <Lexeme*>lookup(vocab, bacov, find_split, -1, tail_string)
return word
cpdef StringHash sic_of(size_t lex_id) except 0: cpdef StringHash sic_of(size_t lex_id) except 0:

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,24 @@
from libcpp.vector cimport vector from libcpp.vector cimport vector
from libc.stdint cimport uint64_t
from ext.sparsehash cimport dense_hash_map
# Circular import problems here
ctypedef size_t Lexeme_addr
ctypedef uint64_t StringHash
ctypedef dense_hash_map[StringHash, Lexeme_addr] Vocab
ctypedef int (*Splitter)(unicode word, size_t length)
from spacy.lexeme cimport Lexeme from spacy.lexeme cimport Lexeme
cdef load_tokenization(Vocab& vocab, dict bacov, token_rules)
cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
unicode string) except *
cdef Lexeme_addr lookup(Vocab& vocab, dict bacov, Splitter splitter, int start,
unicode string) except 0
cdef StringHash hash_string(unicode s, size_t length) except 0
cdef unicode unhash(dict bacov, StringHash hash_value)
cpdef vector[size_t] expand_chunk(size_t addr) except * cpdef vector[size_t] expand_chunk(size_t addr) except *

View File

@ -1,5 +1,78 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from spacy.lexeme cimport Lexeme
from ext.murmurhash cimport MurmurHash64A
from ext.murmurhash cimport MurmurHash64B
from spacy.lexeme cimport init_lexeme
from spacy.lexeme cimport BLANK_WORD
from spacy.string_tools cimport is_whitespace
from . import util
cdef load_tokenization(Vocab& vocab, dict bacov, token_rules):
cdef Lexeme* word
cdef StringHash hashed
for chunk, lex, tokens in token_rules:
hashed = hash_string(chunk, len(chunk))
assert vocab[hashed] == 0, chunk
word = _add(vocab, bacov, <Splitter>NULL, hashed, lex, len(lex), len(lex))
for i, lex in enumerate(tokens):
token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
length = len(token_string)
hashed = hash_string(token_string, length)
word.tail = _add(vocab, bacov, <Splitter>NULL, hashed, lex, 0, len(lex))
word = word.tail
cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
unicode string) except *:
cdef size_t length = len(string)
cdef Py_UNICODE* characters = <Py_UNICODE*>string
cdef size_t i
cdef Py_UNICODE c
cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
cdef unicode current = u''
cdef Lexeme* token
for i in range(length):
c = characters[i]
if is_whitespace(c):
if current:
token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current)
while token != NULL:
tokens.push_back(<Lexeme_addr>token)
token = token.tail
current = u''
else:
current += c
if current:
token = <Lexeme*>lookup(vocab, bacov, splitter, -1, current)
while token != NULL:
tokens.push_back(<Lexeme_addr>token)
token = token.tail
return tokens
cdef Lexeme_addr lookup(Vocab& vocab, dict bacov, Splitter find_split, int start,
unicode string) except 0:
'''Fetch a Lexeme representing a word string. If the word has not been seen,
construct one, splitting off any attached punctuation or clitics. A
reference to BLANK_WORD is returned for the empty string.
To specify the boundaries of the word if it has not been seen, use lookup_chunk.
'''
if string == '':
return <Lexeme_addr>&BLANK_WORD
cdef size_t length = len(string)
cdef StringHash hashed = hash_string(string, length)
cdef Lexeme* word_ptr = <Lexeme*>vocab[hashed]
if word_ptr == NULL:
start = find_split(string, length) if start == -1 else start
word_ptr = _add(vocab, bacov, find_split, hashed, string, start, length)
return <Lexeme_addr>word_ptr
cpdef vector[size_t] expand_chunk(size_t addr) except *: cpdef vector[size_t] expand_chunk(size_t addr) except *:
@ -11,62 +84,22 @@ cpdef vector[size_t] expand_chunk(size_t addr) except *:
return tokens return tokens
""" cdef StringHash hash_string(unicode s, size_t length) except 0:
cpdef vector[size_t] ids_from_text(unicode text) except *: '''Hash unicode with MurmurHash64A'''
cdef size_t length = len(text) assert length
cdef Py_UNICODE* characters = <Py_UNICODE*>text return MurmurHash64A(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
cdef size_t i
cdef Py_UNICODE c
cdef vector[size_t] tokens = vector[size_t]()
cdef unicode current = u''
cdef Lexeme* token
cdef int alnum_end = -1
cdef size_t alnum_start = 0
cdef bint seen_alnum = False
for i in range(length):
c = characters[i]
if is_whitespace(c):
token = <Lexeme*>lookup(current)
tokens.push_back(<size_t>token)
clitic = 0
while token.clitics[clitic]:
tokens.push_back(token.clitics[clitic])
clitic += 1
current = u''
alnum_start = 0
alnum_end = -1
seen_alnum = False
else:
if not seen_alnum and c.isalnum():
alnum_start = i
seen_alnum = True
elif seen_alnum and alnum_end == -1 and not c.isalnum():
alnum_end = i
current += c
if current:
token = <Lexeme*>lookup(current)
tokens.push_back(<size_t>token)
clitic = 0
while token.clitics[clitic]:
tokens.push_back(token.clitics[clitic])
clitic += 1
return tokens
"""
#cdef vector[Tokens] group_by(Tokens tokens, LexAttr field) except *:
# pass
cdef inline bint is_whitespace(Py_UNICODE c): cdef unicode unhash(dict bacov, StringHash hash_value):
# TODO: Support other unicode spaces '''Fetch a string from the reverse index, given its hash value.'''
# https://www.cs.tut.fi/~jkorpela/chars/spaces.html return bacov[hash_value]
if c == u' ':
return True
elif c == u'\n': cdef Lexeme* _add(Vocab& vocab, dict bacov, Splitter find_split, StringHash hashed,
return True unicode string, int split, size_t length) except NULL:
elif c == u'\t': assert string
return True assert split <= length
else: word = init_lexeme(vocab, bacov, find_split, string, hashed, split, length)
return False vocab[hashed] = <Lexeme_addr>word
bacov[hashed] = string
return word

3
spacy/string_tools.pxd Normal file
View File

@ -0,0 +1,3 @@
cpdef unicode substr(unicode string, int start, int end, size_t length)
cdef bint is_whitespace(Py_UNICODE c)

25
spacy/string_tools.pyx Normal file
View File

@ -0,0 +1,25 @@
cpdef unicode substr(unicode string, int start, int end, size_t length):
    """Return string[start:end], where `length` is len(string) and indices
    at or beyond the string (or negative) mean "unbounded" on that side.
    When both sides are unbounded the original object is returned unsliced.
    """
    cdef int s = start
    cdef int e = end
    # Normalize out-of-range bounds: an end at/past the string means "to the
    # end" (encoded as -1); a start at/past the string means "from the start".
    if e >= length:
        e = -1
    if s >= length:
        s = 0
    if s <= 0 and e < 0:
        # Both sides unbounded -- no slice needed.
        return string
    if s < 0:
        s = 0
    elif e < 0:
        e = length
    return string[s:e]
cdef bint is_whitespace(Py_UNICODE c):
    """Return True for the whitespace characters the tokenizer splits on."""
    # TODO: Support other unicode spaces
    # https://www.cs.tut.fi/~jkorpela/chars/spaces.html
    return c == u' ' or c == u'\n' or c == u'\t'

View File

@ -1,3 +1,10 @@
import os
from os import path
import codecs
DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
def utf8open(loc, mode='r'): def utf8open(loc, mode='r'):
return codecs.open(loc, mode, 'utf8') return codecs.open(loc, mode, 'utf8')
@ -12,23 +19,28 @@ def load_case_stats(data_dir):
return case_stats return case_stats
def load_clitics(data_dir): def read_tokenization(lang):
clitics_loc = path.join(data_dir, 'clitics.txt') loc = path.join(DATA_DIR, lang, 'tokenization')
entries = [] entries = []
seen = set() seen = set()
with utf8open(clitics_loc) as clitics_file: with utf8open(loc) as file_:
for line in clitics_file: for line in file_:
line = line.strip() line = line.strip()
if line.startswith('#'): if line.startswith('#'):
continue continue
if not line: if not line:
continue continue
clitics = line.split() pieces = line.split()
word = clitics.pop(0) chunk = pieces.pop(0)
norm_form = clitics.pop(0) lex = pieces.pop(0)
assert word not in seen, word assert chunk not in seen, chunk
seen.add(word) seen.add(chunk)
entries.append((word, norm_form, clitics)) entries.append((chunk, lex, pieces))
if chunk[0].isalpha() and chunk[0].islower():
chunk = chunk[0].title() + chunk[1:]
lex = lex[0].title() + lex[1:]
seen.add(chunk)
entries.append((chunk, lex, pieces))
return entries return entries

Binary file not shown.

4
tests/sun.tokens Normal file
View File

@ -0,0 +1,4 @@
The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields. [ 12 ] [ 13 ] It has a diameter of about 1 , 392 , 684 km ( 865 , 374 mi ) , [ 5 ] around 109 times that of Earth , and its mass ( 1.989×1030 kilograms , approximately 330 , 000 times the mass of Earth ) accounts for about 99.86 % of the total mass of the Solar System. [ 14 ] Chemically , about three quarters of the Sun 's mass consists of hydrogen , while the rest is mostly helium. The remaining 1.69 % ( equal to 5 , 600 times the mass of Earth ) consists of heavier elements , including oxygen , carbon , neon and iron , among others. [ 15 ]
The Sun formed about 4.567 billion [ a ] [ 16 ] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center , while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense , eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star ( G2V ) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum , and although it is actually white in color , from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light. [ 17 ] In the spectral class label , G2 indicates its surface temperature , of approximately 5778 K ( 5505 °C ) , and V indicates that the Sun , like most stars , is a main-sequence star , and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core , the Sun fuses about 620 million metric tons of hydrogen each second. [ 18 ] [ 19 ]
Once regarded by astronomers as a small and relatively insignificant star , the Sun is now thought to be brighter than about 85 % of the stars in the Milky Way , most of which are red dwarfs. [ 20 ] [ 21 ] The absolute magnitude of the Sun is +4.83 ; however , as the star closest to Earth , the Sun is by far the brightest object in the sky with an apparent magnitude of 26.74. [ 22 ] [ 23 ] This is about 13 billion times brighter than the next brightest star , Sirius , with an apparent magnitude of 1.46. The Sun 's hot corona continuously expands in space creating the solar wind , a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind , the heliosphere , is the largest continuous structure in the Solar System. [ 24 ] [ 25 ]

4
tests/sun.txt Normal file
View File

@ -0,0 +1,4 @@
The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields.[12][13] It has a diameter of about 1,392,684 km (865,374 mi),[5] around 109 times that of Earth, and its mass (1.989×1030 kilograms, approximately 330,000 times the mass of Earth) accounts for about 99.86% of the total mass of the Solar System.[14] Chemically, about three quarters of the Sun's mass consists of hydrogen, while the rest is mostly helium. The remaining 1.69% (equal to 5,600 times the mass of Earth) consists of heavier elements, including oxygen, carbon, neon and iron, among others.[15]
The Sun formed about 4.567 billion[a][16] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center, while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense, eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star (G2V) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum, and although it is actually white in color, from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light.[17] In the spectral class label, G2 indicates its surface temperature, of approximately 5778 K (5505 °C), and V indicates that the Sun, like most stars, is a main-sequence star, and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core, the Sun fuses about 620 million metric tons of hydrogen each second.[18][19]
Once regarded by astronomers as a small and relatively insignificant star, the Sun is now thought to be brighter than about 85% of the stars in the Milky Way, most of which are red dwarfs.[20][21] The absolute magnitude of the Sun is +4.83; however, as the star closest to Earth, the Sun is by far the brightest object in the sky with an apparent magnitude of 26.74.[22][23] This is about 13 billion times brighter than the next brightest star, Sirius, with an apparent magnitude of 1.46. The Sun's hot corona continuously expands in space creating the solar wind, a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind, the heliosphere, is the largest continuous structure in the Solar System.[24][25]

View File

@ -0,0 +1,44 @@
from __future__ import unicode_literals
from spacy.spacy import expand_chunk
from spacy.en import lookup, unhash
from spacy import lex_of
def test_possess():
tokens = expand_chunk(lookup("Mike's"))
assert len(tokens) == 2
assert unhash(lex_of(tokens[0])) == "Mike"
assert unhash(lex_of(tokens[1])) == "'s"
def test_apostrophe():
tokens = expand_chunk(lookup("schools'"))
assert len(tokens) == 2
assert unhash(lex_of(tokens[1])) == "'"
assert unhash(lex_of(tokens[0])) == "schools"
def test_LL():
tokens = expand_chunk(lookup("we'll"))
assert len(tokens) == 2
assert unhash(lex_of(tokens[1])) == "will"
assert unhash(lex_of(tokens[0])) == "we"
def test_aint():
tokens = expand_chunk(lookup("ain't"))
assert len(tokens) == 2
assert unhash(lex_of(tokens[0])) == "are"
assert unhash(lex_of(tokens[1])) == "not"
def test_capitalized():
tokens = expand_chunk(lookup("can't"))
assert len(tokens) == 2
tokens = expand_chunk(lookup("Can't"))
assert len(tokens) == 2
tokens = expand_chunk(lookup("Ain't"))
assert len(tokens) == 2
assert unhash(lex_of(tokens[0])) == "Are"

View File

@ -18,11 +18,10 @@ def test_close(close_puncts):
for p in close_puncts: for p in close_puncts:
string = word_str + p string = word_str + p
token = lookup(string) token = lookup(string)
assert unhash(lex_of(token)) == word_str
tokens = expand_chunk(token) tokens = expand_chunk(token)
assert len(tokens) == 2 assert len(tokens) == 2
assert unhash(lex_of(tokens[0])) == word_str
assert unhash(lex_of(tokens[1])) == p assert unhash(lex_of(tokens[1])) == p
assert unhash(lex_of(tokens[0])) == word_str
def test_two_different_close(close_puncts): def test_two_different_close(close_puncts):
@ -43,6 +42,6 @@ def test_three_same_close(close_puncts):
for p in close_puncts: for p in close_puncts:
string = word_str + p + p + p string = word_str + p + p + p
tokens = expand_chunk(lookup(string)) tokens = expand_chunk(lookup(string))
assert len(tokens) == 2 assert len(tokens) == 4
assert unhash(lex_of(tokens[0])) == word_str assert unhash(lex_of(tokens[0])) == word_str
assert unhash(lex_of(tokens[1])) == p + p + p assert unhash(lex_of(tokens[1])) == p

View File

@ -43,8 +43,8 @@ def test_three_same_open(open_puncts):
for p in open_puncts: for p in open_puncts:
string = p + p + p + word_str string = p + p + p + word_str
token = lookup(string) token = lookup(string)
assert unhash(lex_of(token)) == p + p + p assert unhash(lex_of(token)) == p
tokens = expand_chunk(token) tokens = expand_chunk(token)
assert len(tokens) == 2 assert len(tokens) == 4
assert unhash(lex_of(tokens[0])) == p + p + p assert unhash(lex_of(tokens[0])) == p
assert unhash(lex_of(tokens[1])) == word_str assert unhash(lex_of(tokens[3])) == word_str

View File

@ -0,0 +1,46 @@
from __future__ import unicode_literals
from spacy.en import unhash
from spacy import lex_of
from spacy.util import utf8open
from spacy.en_ptb import tokenize, lookup, unhash
import pytest
import os
from os import path
HERE = path.dirname(__file__)
@pytest.fixture
def sun_txt():
    """Raw UTF-8 text of the bundled Sun article fixture file."""
    return utf8open(path.join(HERE, 'sun.txt')).read()
@pytest.fixture
def my_tokens(sun_txt):
    """Tokenize the article with the PTB-mode tokenizer; yield token strings."""
    assert sun_txt
    return [unhash(lex_of(tok)) for tok in tokenize(sun_txt)]
@pytest.fixture
def sed_tokens():
    """Reference tokenization produced by the PTB tokenizer.sed script."""
    text = utf8open(path.join(HERE, 'sun.tokens')).read()
    return text.split()
def test_compare_tokens(my_tokens, sed_tokens):
me = my_tokens
sed = sed_tokens
i = 0
while i < len(me) and i < len(sed):
assert me[i] == sed[i]
i += 1
assert len(me) == len(sed)

11
tests/test_rules.py Normal file
View File

@ -0,0 +1,11 @@
from spacy import util
def test_load_en():
    """The English tokenization rules load, and include the "ain't" rule."""
    rules = util.read_tokenization('en')
    assert len(rules) != 0
    matches = [rule for rule in rules if rule[0] == "ain't"]
    chunk, lex, pieces = matches[0]
    # Rule format: (surface chunk, first-token form, remaining pieces).
    assert (chunk, lex, pieces) == ("ain't", "are", ["not"])

47
tests/test_tokenizer.py Normal file
View File

@ -0,0 +1,47 @@
from __future__ import unicode_literals
from spacy.en import tokenize
from spacy.en import lookup
from spacy.lexeme import lex_of
def test_single_word():
    """Tokenizing a single word yields that word's lexeme id."""
    lex_ids = tokenize(u'hello')
    hello = lookup(u'hello')
    assert lex_ids[0] == hello
def test_two_words():
    """Two whitespace-separated words produce two distinct tokens."""
    lex_ids = tokenize(u'hello possums')
    assert len(lex_ids) == 2
    first, second = lex_ids
    assert first == lookup(u'hello')
    assert first != second
def test_punct():
    """Punctuation splits off, but the word's lexeme is shared with the bare form."""
    lex_ids = tokenize('hello, possums.')
    assert len(lex_ids) == 4
    hello = lookup('hello')
    # 'hello,' gets its own id, yet shares its lexeme with plain 'hello'.
    assert lex_ids[0] != hello
    assert lex_of(lex_ids[0]) == lex_of(hello)
    assert lex_ids[2] == lookup('possums.')
    assert lex_of(lex_ids[2]) == lex_of(lookup('possums.'))
    assert lex_of(lex_ids[2]) == lex_of(lookup('possums'))
    # The comma token is not the word token.
    assert lex_of(lex_ids[1]) != lex_of(hello)
    assert lex_ids[0] != lookup('hello.')
def test_digits():
    """Digits tokenize like words; adjacent punctuation is split off."""
    lex_ids = tokenize('The year: 1984.')
    assert len(lex_ids) == 5
    for idx, word in ((0, 'The'), (3, '1984'), (4, '.')):
        assert lex_of(lex_ids[idx]) == lex_of(lookup(word))
def test_contraction():
    """Contractions expand in context, and still interact with punctuation."""
    ids = tokenize("don't giggle")
    assert len(ids) == 3
    assert lex_of(ids[1]) == lex_of(lookup("not"))
    ids = tokenize("i said don't!")
    assert len(ids) == 4
    assert lex_of(ids[-1]) == lex_of(lookup('!'))

View File

@ -28,3 +28,10 @@ def test_case_neq():
def test_punct_neq(): def test_punct_neq():
addr = lookup('Hello') addr = lookup('Hello')
assert lookup('Hello,') != addr assert lookup('Hello,') != addr
def test_short():
    """Very short words survive the lookup/unhash round-trip unchanged."""
    for word in ('I', 'not'):
        assert unhash(lex_of(lookup(word))) == word

25
tests/test_wiki_sun.py Normal file
View File

@ -0,0 +1,25 @@
from __future__ import unicode_literals
from spacy.en import unhash
from spacy import lex_of
from spacy import en
from spacy.util import utf8open
import pytest
import os
from os import path
HERE = path.dirname(__file__)
@pytest.fixture
def sun_txt():
    """Raw UTF-8 text of the bundled Sun article fixture file."""
    return utf8open(path.join(HERE, 'sun.txt')).read()
def test_tokenize(sun_txt):
    """Smoke test: tokenize a full real-world article without crashing.

    The previous version ended in ``assert True``, which verified nothing
    beyond "tokenize() did not raise"; assert on the result instead so a
    tokenizer that silently returns nothing is caught.
    """
    assert len(sun_txt) != 0
    tokens = en.tokenize(sun_txt)
    # A multi-paragraph article must yield at least one token.
    assert len(tokens) != 0

82
tests/tokenizer.sed Normal file
View File

@ -0,0 +1,82 @@
#!/bin/sed -f
# Sed script to produce Penn Treebank tokenization on arbitrary raw text.
# Yeah, sure.
# expected input: raw text with ONE SENTENCE TOKEN PER LINE
# by Robert MacIntyre, University of Pennsylvania, late 1995.
# If this wasn't such a trivial program, I'd include all that stuff about
# no warrantee, free use, etc. from the GNU General Public License. If you
# want to be picky, assume that all of its terms apply. Okay?
# NOTE: '=' is used as the s-command delimiter throughout, so '/' need not
# be escaped inside patterns.
# attempt to get correct directional quotes
s=^"=`` =g
s=\([ ([{<]\)"=\1 `` =g
# close quotes handled at end
# split ellipses and most clause-internal punctuation into their own tokens
s=\.\.\.= ... =g
s=[,;:@#$%&]= & =g
# Assume sentence tokenization has been done first, so split FINAL periods
# only.
s=\([^.]\)\([.]\)\([])}>"']*\)[ ]*$=\1 \2\3 =g
# however, we may as well split ALL question marks and exclamation points,
# since they shouldn't have the abbrev.-marker ambiguity problem
s=[?!]= & =g
# parentheses, brackets, etc.
s=[][(){}<>]= & =g
# Some taggers, such as Adwait Ratnaparkhi's MXPOST, use the parsed-file
# version of these symbols.
# UNCOMMENT THE FOLLOWING 6 LINES if you're using MXPOST.
# s/(/-LRB-/g
# s/)/-RRB-/g
# s/\[/-LSB-/g
# s/\]/-RSB-/g
# s/{/-LCB-/g
# s/}/-RCB-/g
s=--= -- =g
# NOTE THAT SPLIT WORDS ARE NOT MARKED. Obviously this isn't great, since
# you might someday want to know how the words originally fit together --
# but it's too late to make a better system now, given the millions of
# words we've already done "wrong".
# First off, add a space to the beginning and end of each line, to reduce
# necessary number of regexps.
# NOTE(review): despite "First off", these padding lines run after several
# substitutions above; order preserved as in the MacIntyre original.
s=$= =
s=^= =
# remaining double quotes are close-quotes
s="= '' =g
# possessive or close-single-quote
s=\([^']\)' =\1 ' =g
# as in it's, I'm, we'd
s='\([sSmMdD]\) = '\1 =g
# split the other clitics ('ll, 're, 've, n't) plus upper-case variants
s='ll = 'll =g
s='re = 're =g
s='ve = 've =g
s=n't = n't =g
s='LL = 'LL =g
s='RE = 'RE =g
s='VE = 'VE =g
s=N'T = N'T =g
# irregular multi-word splits (cannot -> can not, gonna -> gon na, etc.);
# the [Cc]/[Gg]/... groups preserve the original capitalisation
s= \([Cc]\)annot = \1an not =g
s= \([Dd]\)'ye = \1' ye =g
s= \([Gg]\)imme = \1im me =g
s= \([Gg]\)onna = \1on na =g
s= \([Gg]\)otta = \1ot ta =g
s= \([Ll]\)emme = \1em me =g
s= \([Mm]\)ore'n = \1ore 'n =g
s= '\([Tt]\)is = '\1 is =g
s= '\([Tt]\)was = '\1 was =g
s= \([Ww]\)anna = \1an na =g
# s= \([Ww]\)haddya = \1ha dd ya =g
# s= \([Ww]\)hatcha = \1ha t cha =g
# clean out extra spaces
# NOTE(review): the upstream PTB script reads `s=  *= =g` (two spaces before
# the star, collapsing runs of spaces); the double space looks collapsed in
# this rendering — verify against the original before relying on this line.
s= *= =g
s=^ *==g