mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
* Begin work on full PTB-compatible English tokenization
This commit is contained in:
parent
0c1be7effe
commit
df0458001d
146129
data/en_ptb/case
Normal file
146129
data/en_ptb/case
Normal file
File diff suppressed because it is too large
Load Diff
316709
data/en_ptb/clusters
Normal file
316709
data/en_ptb/clusters
Normal file
File diff suppressed because it is too large
Load Diff
93
data/en_ptb/tokenization
Normal file
93
data/en_ptb/tokenization
Normal file
|
@ -0,0 +1,93 @@
|
|||
# https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
|
||||
# 21:09, 25 June 2014
|
||||
#*--* --
|
||||
#*---* ---
|
||||
#*'s 's
|
||||
|
||||
ain't are not
|
||||
aren't are not
|
||||
can't can not
|
||||
could've could have
|
||||
couldn't could not
|
||||
couldn't've could not have
|
||||
didn't did not
|
||||
doesn't does not
|
||||
don't do not
|
||||
hadn't had not
|
||||
hadn't've had not have
|
||||
hasn't has not
|
||||
haven't have not
|
||||
he'd he would
|
||||
he'd've he would have
|
||||
he'll he will
|
||||
he's he 's
|
||||
how'd how would
|
||||
how'll how will
|
||||
how's how 's
|
||||
I'd I would
|
||||
I'd've I would have
|
||||
I'll I will
|
||||
I'm I am
|
||||
I've I have
|
||||
isn't is not
|
||||
it'd it would
|
||||
it'd've it would have
|
||||
it'll it will
|
||||
it's it 's
|
||||
let's let 's
|
||||
mightn't might not
|
||||
mightn't've might not have
|
||||
might've might have
|
||||
mustn't must not
|
||||
must've must have
|
||||
needn't need not
|
||||
not've not have
|
||||
shan't shall not
|
||||
she'd she would
|
||||
she'd've she would have
|
||||
she'll she will
|
||||
she's she 's
|
||||
should've should have
|
||||
shouldn't should not
|
||||
shouldn't've should not have
|
||||
that's that 's
|
||||
there'd there would
|
||||
there'd've there would have
|
||||
there's there is
|
||||
they'd they would
|
||||
they'd've they would have
|
||||
they'll they will
|
||||
they're they are
|
||||
they've they have
|
||||
wasn't was not
|
||||
we'd we would
|
||||
we'd've we would have
|
||||
we'll we will
|
||||
we're we are
|
||||
we've we have
|
||||
weren't were not
|
||||
what'll what will
|
||||
what're what are
|
||||
what's what 's
|
||||
what've what have
|
||||
when's when 's
|
||||
where'd where would
|
||||
where's where 's
|
||||
where've where have
|
||||
who'd who would
|
||||
who'll who will
|
||||
who're who are
|
||||
who's who 's
|
||||
who've who have
|
||||
why'll why will
|
||||
why're why are
|
||||
why's why is
|
||||
won't will not
|
||||
would've would have
|
||||
wouldn't would not
|
||||
wouldn't've would not have
|
||||
you'd you would
|
||||
you'd've you would have
|
||||
you'll you will
|
||||
you're you are
|
||||
you've you have
|
15
spacy/en_ptb.pxd
Normal file
15
spacy/en_ptb.pxd
Normal file
|
@ -0,0 +1,15 @@
|
|||
# spacy/en_ptb.pxd — Cython declarations for the PTB-compatible English
# tokenizer. See en_ptb.pyx for the implementations.
from libcpp.vector cimport vector

from spacy.spacy cimport StringHash
from spacy.spacy cimport Vocab
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Lexeme_addr


# Module-level vocabulary: maps strings to Lexeme structs.
cdef Vocab VOCAB
# Reverse index: StringHash -> original unicode string, so that strings
# can be recovered from 64-bit hashes (see unhash below).
cdef dict BACOV


# 0 is the error return for lookup (`except 0`); tokenize/unhash use the
# generic `except *` / no-sentinel forms.
cpdef Lexeme_addr lookup(unicode word) except 0
cpdef vector[Lexeme_addr] tokenize(unicode string) except *
cpdef unicode unhash(StringHash hash_value)
|
74
spacy/en_ptb.pyx
Normal file
74
spacy/en_ptb.pyx
Normal file
|
@ -0,0 +1,74 @@
|
|||
'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
so that strings can be retrieved from hashes. Use 64-bit hash values and
boldly assume no collisions.
'''
from __future__ import unicode_literals

from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector

from spacy.lexeme cimport Lexeme
from spacy.string_tools cimport substr
from . import util

cimport spacy


# Reverse index: hash value -> original string (see module docstring).
BACOV = {}
# String -> Lexeme mapping. set_empty_key(0) presumably reserves 0 as the
# map's empty-slot sentinel (dense-hash-map style API) — confirm against
# the Vocab declaration; it matches lookup()'s `except 0` error value.
VOCAB = Vocab()
VOCAB.set_empty_key(0)


# Import-time side effect: populate VOCAB/BACOV with the English
# special-case tokenization rules read from data (data/en*/tokenization),
# e.g. contraction splits such as "don't" -> "do not".
spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en'))
|
||||
|
||||
|
||||
cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
    """Tokenize `string`, returning a vector of Lexeme addresses.

    Thin wrapper: delegates to the generic spacy.tokenize, binding this
    module's vocabulary (VOCAB), reverse index (BACOV) and the
    English-specific find_split rule.
    """
    return spacy.tokenize(VOCAB, BACOV, find_split, string)
|
||||
|
||||
|
||||
cpdef Lexeme_addr lookup(unicode string) except 0:
    """Return the address of the Lexeme struct for `string`.

    Delegates to spacy.lookup with the English find_split rule; the
    meaning of the -1 argument is defined by spacy.lookup (not visible
    in this file — TODO confirm). `except 0` marks 0 as the error return.
    """
    return spacy.lookup(VOCAB, BACOV, find_split, -1, string)
|
||||
|
||||
|
||||
cpdef unicode unhash(StringHash hash_value):
    """Recover the original string for a previously seen hash value.

    Consults the module-level reverse index BACOV (see module docstring);
    behavior for an unseen hash is whatever spacy.unhash does on a miss.
    """
    return spacy.unhash(BACOV, hash_value)
|
||||
|
||||
|
||||
cdef vector[StringHash] make_string_views(unicode word):
    """Return alternative string "views" of `word` (normalized forms).

    Currently a stub: always returns an empty vector, regardless of
    input. The commented-out sketch below records the intended
    normalization scheme.

    (Removed an unused local `cdef unicode s` that was dead code.)
    """
    return vector[StringHash]()
    # TODO: intended behavior, per the original sketch:
    #if word.isdigit() and len(word) == 4:
    #    return '!YEAR'
    #elif word[0].isdigit():
    #    return '!DIGITS'
    #else:
    #    return word.lower()
|
||||
|
||||
|
||||
cdef int find_split(unicode word, size_t length):
    """Return the index at which `word` should be split into two tokens.

    English-specific split rule handed to the generic tokenizer:
      * words ending in "'s" split before the apostrophe;
      * a leading punctuation character is split off on its own;
      * otherwise, split at the first punctuation character found
        (a return value equal to `length` means no split is needed).

    NOTE(review): assumes length >= 1 — is_punct(word, 0, length) on an
    empty string would index word[0]; confirm callers guarantee this.
    NOTE(review): if length == 0 the function falls off the end without
    an explicit return — confirm this is unreachable.
    """
    cdef int i = 0
    # Contractions: e.g. "dog's" -> split at length-2, before the "'s"
    if word.endswith("'s"):
        return length - 2
    # Leading punctuation: split off the first character by itself
    if is_punct(word, 0, length):
        return 1
    elif length >= 1:
        # Scan forward for the first punctuation character; everything
        # before it is the head token. (Despite the original "trailing
        # punctuation" comment, the scan starts at 0, so any interior
        # punctuation character also triggers a split here.)
        i = 0
        while i < length and not is_punct(word, i, length):
            i += 1
        return i
|
||||
|
||||
|
||||
cdef bint is_punct(unicode word, size_t i, size_t length):
    """True if the character at word[i] should count as punctuation.

    Three context-sensitive exceptions keep in-word punctuation intact:
      * an apostrophe followed by a letter (contractions, e.g. "don't");
      * a comma followed by a digit (numeric grouping, e.g. "10,000");
      * a period followed by a digit (decimals, e.g. "3.14").
    Everything else that is not alphanumeric is punctuation.
    """
    c = word[i]
    has_next = i < (length - 1)
    # Apostrophe before a letter is part of a contraction, not punct
    if c == "'" and has_next and word[i + 1].isalpha():
        return False
    # Comma or period before a digit is a numeric separator, not punct
    if (c == "," or c == ".") and has_next and word[i + 1].isdigit():
        return False
    return not c.isalnum()
|
Loading…
Reference in New Issue
Block a user