* Begin work on full PTB-compatible English tokenization

Matthew Honnibal 2014-07-07 04:29:24 +02:00
parent 0c1be7effe
commit df0458001d
5 changed files with 463020 additions and 0 deletions

data/en_ptb/case (new file, 146129 lines)

File diff suppressed because it is too large.

data/en_ptb/clusters (new file, 316709 lines)

File diff suppressed because it is too large.

data/en_ptb/tokenization (new file, 93 lines)

@@ -0,0 +1,93 @@
# https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
# 21:09, 25 June 2014
#*--* --
#*---* ---
#*'s 's
ain't are not
aren't are not
can't can not
could've could have
couldn't could not
couldn't've could not have
didn't did not
doesn't does not
don't do not
hadn't had not
hadn't've had not have
hasn't has not
haven't have not
he'd he would
he'd've he would have
he'll he will
he's he 's
how'd how would
how'll how will
how's how 's
I'd I would
I'd've I would have
I'll I will
I'm I am
I've I have
isn't is not
it'd it would
it'd've it would have
it'll it will
it's it 's
let's let 's
mightn't might not
mightn't've might not have
might've might have
mustn't must not
must've must have
needn't need not
not've not have
shan't shall not
she'd she would
she'd've she would have
she'll she will
she's she 's
should've should have
shouldn't should not
shouldn't've should not have
that's that 's
there'd there would
there'd've there would have
there's there is
they'd they would
they'd've they would have
they'll they will
they're they are
they've they have
wasn't was not
we'd we would
we'd've we would have
we'll we will
we're we are
we've we have
weren't were not
what'll what will
what're what are
what's what 's
what've what have
when's when 's
where'd where would
where's where 's
where've where have
who'd who would
who'll who will
who're who are
who's who 's
who've who have
why'll why will
why're why are
why's why is
won't will not
would've would have
wouldn't would not
wouldn't've would not have
you'd you would
you'd've you would have
you'll you will
you're you are
you've you have

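Each non-comment line in the rule file maps a raw chunk to the whitespace-separated token pieces it should be rewritten as; lines starting with # are comments or disabled rules. As a rough illustration of how such a file could be loaded (the commit's real loader is util.read_tokenization, which is not shown here; the function name and path below are only assumptions), a minimal parser might look like this:

# Hypothetical loader for a rule file in the format shown above.
from __future__ import unicode_literals
import io

def read_rules(path):
    rules = {}
    with io.open(path, encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue  # skip blanks, comments and disabled rules
            pieces = line.split()
            rules[pieces[0]] = pieces[1:]   # chunk -> token pieces
    return rules

rules = read_rules('data/en_ptb/tokenization')
assert rules["can't"] == ['can', 'not']
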
spacy/en_ptb.pxd (new file, 15 lines)

@@ -0,0 +1,15 @@
from libcpp.vector cimport vector

from spacy.spacy cimport StringHash
from spacy.spacy cimport Vocab
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Lexeme_addr


cdef Vocab VOCAB
cdef dict BACOV


cpdef Lexeme_addr lookup(unicode word) except 0
cpdef vector[Lexeme_addr] tokenize(unicode string) except *
cpdef unicode unhash(StringHash hash_value)

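Because lookup, tokenize and unhash are declared cpdef, they are callable from Python once the extension is compiled. A rough usage sketch, assuming the module builds as spacy.en_ptb and that the returned vector of addresses converts to a Python list (the setup here is an assumption, not part of this commit):

# Hypothetical usage of the API declared in spacy/en_ptb.pxd.
from __future__ import unicode_literals
from spacy import en_ptb

addr = en_ptb.lookup("isn't")         # address of the Lexeme struct for this chunk
assert addr != 0
addrs = en_ptb.tokenize("I can't.")   # one Lexeme address per PTB-style token
assert all(a != 0 for a in addrs)
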
spacy/en_ptb.pyx (new file, 74 lines)

@@ -0,0 +1,74 @@
'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
so that strings can be retrieved from hashes. Use 64-bit hash values and
boldly assume no collisions.
'''
from __future__ import unicode_literals

from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector

from spacy.lexeme cimport Lexeme
from spacy.string_tools cimport substr

from . import util

cimport spacy


BACOV = {}
VOCAB = Vocab()
VOCAB.set_empty_key(0)

spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en'))


cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
    return spacy.tokenize(VOCAB, BACOV, find_split, string)


cpdef Lexeme_addr lookup(unicode string) except 0:
    return spacy.lookup(VOCAB, BACOV, find_split, -1, string)


cpdef unicode unhash(StringHash hash_value):
    return spacy.unhash(BACOV, hash_value)


cdef vector[StringHash] make_string_views(unicode word):
    cdef unicode s
    return vector[StringHash]()
    #if word.isdigit() and len(word) == 4:
    #    return '!YEAR'
    #elif word[0].isdigit():
    #    return '!DIGITS'
    #else:
    #    return word.lower()

cdef int find_split(unicode word, size_t length):
    cdef int i = 0
    # Contractions
    if word.endswith("'s"):
        return length - 2
    # Leading punctuation
    if is_punct(word, 0, length):
        return 1
    elif length >= 1:
        # Split off all trailing punctuation characters
        i = 0
        while i < length and not is_punct(word, i, length):
            i += 1
    return i

cdef bint is_punct(unicode word, size_t i, size_t length):
    # Don't count apostrophes as punct if the next char is a letter
    if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
        return False
    # Don't count commas as punct if the next char is a number
    if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
        return False
    # Don't count periods as punct if the next char is a number
    if word[i] == "." and i < (length - 1) and word[i+1].isdigit():
        return False
    return not word[i].isalnum()
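
The module docstring above describes the storage scheme: each string is reduced to a 64-bit hash, and the reverse index BACOV (the name is VOCAB reversed) maps hashes back to strings, which is what unhash relies on. A pure-Python sketch of that idea follows; the stand-in hash function and helper names are illustrative only, not the commit's actual implementation, which goes through the Vocab/Lexeme machinery in spacy.spacy:

# Illustrative sketch of the hash + reverse-index idea from the docstring.
from __future__ import unicode_literals
import hashlib

BACOV = {}  # StringHash -> unicode string (the reverse index)

def string_hash(s):
    # Stand-in 64-bit hash; like the module, we boldly assume no collisions.
    return int(hashlib.sha1(s.encode('utf8')).hexdigest()[:16], 16)

def intern_string(s):
    h = string_hash(s)
    BACOV[h] = s
    return h

def unhash(h):
    return BACOV[h]

h = intern_string(u"tokenization")
assert unhash(h) == u"tokenization"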