mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
* Begin work on full PTB-compatible English tokenization
This commit is contained in:
parent
0c1be7effe
commit
df0458001d
146129
data/en_ptb/case
Normal file
146129
data/en_ptb/case
Normal file
File diff suppressed because it is too large
Load Diff
316709
data/en_ptb/clusters
Normal file
316709
data/en_ptb/clusters
Normal file
File diff suppressed because it is too large
Load Diff
93
data/en_ptb/tokenization
Normal file
93
data/en_ptb/tokenization
Normal file
|
@ -0,0 +1,93 @@
|
||||||
|
# https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
|
||||||
|
# 21:09, 25 June 2014
|
||||||
|
#*--* --
|
||||||
|
#*---* ---
|
||||||
|
#*'s 's
|
||||||
|
|
||||||
|
ain't are not
|
||||||
|
aren't are not
|
||||||
|
can't can not
|
||||||
|
could've could have
|
||||||
|
couldn't could not
|
||||||
|
couldn't've could not have
|
||||||
|
didn't did not
|
||||||
|
doesn't does not
|
||||||
|
don't do not
|
||||||
|
hadn't had not
|
||||||
|
hadn't've had not have
|
||||||
|
hasn't has not
|
||||||
|
haven't have not
|
||||||
|
he'd he would
|
||||||
|
he'd've he would have
|
||||||
|
he'll he will
|
||||||
|
he's he 's
|
||||||
|
how'd how would
|
||||||
|
how'll how will
|
||||||
|
how's how 's
|
||||||
|
I'd I would
|
||||||
|
I'd've I would have
|
||||||
|
I'll I will
|
||||||
|
I'm I am
|
||||||
|
I've I have
|
||||||
|
isn't is not
|
||||||
|
it'd it would
|
||||||
|
it'd've it would have
|
||||||
|
it'll it will
|
||||||
|
it's it 's
|
||||||
|
let's let 's
|
||||||
|
mightn't might not
|
||||||
|
mightn't've might not have
|
||||||
|
might've might have
|
||||||
|
mustn't must not
|
||||||
|
must've must have
|
||||||
|
needn't need not
|
||||||
|
not've not have
|
||||||
|
shan't shall not
|
||||||
|
she'd she would
|
||||||
|
she'd've she would have
|
||||||
|
she'll she will
|
||||||
|
she's she 's
|
||||||
|
should've should have
|
||||||
|
shouldn't should not
|
||||||
|
shouldn't've should not have
|
||||||
|
that's that 's
|
||||||
|
there'd there would
|
||||||
|
there'd've there would have
|
||||||
|
there's there 's
|
||||||
|
they'd they would
|
||||||
|
they'd've they would have
|
||||||
|
they'll they will
|
||||||
|
they're they are
|
||||||
|
they've they have
|
||||||
|
wasn't was not
|
||||||
|
we'd we would
|
||||||
|
we'd've we would have
|
||||||
|
we'll we will
|
||||||
|
we're we are
|
||||||
|
we've we have
|
||||||
|
weren't were not
|
||||||
|
what'll what will
|
||||||
|
what're what are
|
||||||
|
what's what 's
|
||||||
|
what've what have
|
||||||
|
when's when 's
|
||||||
|
where'd where would
|
||||||
|
where's where 's
|
||||||
|
where've where have
|
||||||
|
who'd who would
|
||||||
|
who'll who will
|
||||||
|
who're who are
|
||||||
|
who's who 's
|
||||||
|
who've who have
|
||||||
|
why'll why will
|
||||||
|
why're why are
|
||||||
|
why's why 's
|
||||||
|
won't will not
|
||||||
|
would've would have
|
||||||
|
wouldn't would not
|
||||||
|
wouldn't've would not have
|
||||||
|
you'd you would
|
||||||
|
you'd've you would have
|
||||||
|
you'll you will
|
||||||
|
you're you are
|
||||||
|
you've you have
|
15
spacy/en_ptb.pxd
Normal file
15
spacy/en_ptb.pxd
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
from libcpp.vector cimport vector
|
||||||
|
|
||||||
|
from spacy.spacy cimport StringHash
|
||||||
|
from spacy.spacy cimport Vocab
|
||||||
|
from spacy.lexeme cimport Lexeme
|
||||||
|
from spacy.lexeme cimport Lexeme_addr
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level state used by the functions declared below; both objects are
# created in en_ptb.pyx.
cdef Vocab VOCAB
|
||||||
|
cdef dict BACOV
|
||||||
|

|
||||||
|

|
||||||
|
# Entry points implemented in en_ptb.pyx.
# NOTE(review): en_ptb.pyx names lookup's parameter `string`, not `word` --
# confirm which name is intended and align the two files.
cpdef Lexeme_addr lookup(unicode word) except 0
|
||||||
|
cpdef vector[Lexeme_addr] tokenize(unicode string) except *
|
||||||
|
cpdef unicode unhash(StringHash hash_value)
|
74
spacy/en_ptb.pyx
Normal file
74
spacy/en_ptb.pyx
Normal file
|
@ -0,0 +1,74 @@
|
||||||
|
'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
|
||||||
|
so that strings can be retrieved from hashes. Use 64-bit hash values and
|
||||||
|
boldly assume no collisions.
|
||||||
|
'''
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from libc.stdlib cimport malloc, calloc, free
|
||||||
|
from libc.stdint cimport uint64_t
|
||||||
|
from libcpp.vector cimport vector
|
||||||
|
|
||||||
|
from spacy.lexeme cimport Lexeme
|
||||||
|
from spacy.string_tools cimport substr
|
||||||
|
from . import util
|
||||||
|
|
||||||
|
cimport spacy
|
||||||
|
|
||||||
|
# BACOV is handed to spacy.unhash() to turn hash values back into strings;
# VOCAB is the table handed to spacy.lookup()/spacy.tokenize().
BACOV = {}
VOCAB = Vocab()
# Key 0 is registered as the table's "empty" key.
VOCAB.set_empty_key(0)


# Load the rules that belong to this tokenizer (data/en_ptb/tokenization)
# rather than the plain 'en' ones.
# NOTE(review): assumes read_tokenization() takes the data directory name --
# confirm against spacy.util.
spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en_ptb'))
|
||||||
|
|
||||||
|
|
||||||
|
cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
    '''Tokenize `string` with this module's vocabulary and split rule.

    Delegates to spacy.tokenize, passing the module-level VOCAB and BACOV
    together with find_split, and returns the resulting vector of
    Lexeme_addr values.
    '''
    cdef vector[Lexeme_addr] tokens
    tokens = spacy.tokenize(VOCAB, BACOV, find_split, string)
    return tokens
|
||||||
|
|
||||||
|
|
||||||
|
cpdef Lexeme_addr lookup(unicode string) except 0:
    '''Return the Lexeme_addr for `string`.

    Delegates to spacy.lookup with the module-level VOCAB and BACOV and with
    find_split as the splitting rule. The start argument is passed as -1
    (presumably "no precomputed split point" -- confirm against spacy.lookup).
    '''
    cdef Lexeme_addr address
    address = spacy.lookup(VOCAB, BACOV, find_split, -1, string)
    return address
|
||||||
|
|
||||||
|
|
||||||
|
cpdef unicode unhash(StringHash hash_value):
    '''Return the string that spacy.unhash finds for `hash_value` in BACOV.'''
    cdef unicode recovered
    recovered = spacy.unhash(BACOV, hash_value)
    return recovered
|
||||||
|
|
||||||
|
|
||||||
|
cdef vector[StringHash] make_string_views(unicode word):
    '''Return the string-view hashes for `word`.

    Not implemented yet: an empty vector is returned for every word.
    '''
    # TODO: decide which views to emit. An earlier sketch mapped 4-digit
    # numbers to '!YEAR', other words starting with a digit to '!DIGITS', and
    # everything else to its lower-cased form.
    return vector[StringHash]()
|
||||||
|
|
||||||
|
|
||||||
|
cdef int find_split(unicode word, size_t length):
    '''Return the index at which `word` should be split.

    Rules are tried in order:

    1. A word ending in "'s" is split just before that suffix.
    2. A word whose first character is punctuation (per is_punct) has that
       single character split off.
    3. Otherwise the split point is the index of the first punctuation
       character, or `length` when the word contains none.

    An empty word yields 0.
    '''
    cdef int i = 0
    # Guard the empty word explicitly: is_punct() would index word[0].
    if length == 0:
        return 0
    # Contractions
    # NOTE(review): for the bare token "'s" this yields 0 -- confirm callers
    # handle a split point of 0.
    if word.endswith("'s"):
        return length - 2
    # Leading punctuation
    if is_punct(word, 0, length):
        return 1
    # Advance to the first punctuation character so that it, and everything
    # after it, is split off from the leading run of word characters.
    while i < length and not is_punct(word, i, length):
        i += 1
    return i
|
||||||
|
|
||||||
|
|
||||||
|
cdef bint is_punct(unicode word, size_t i, size_t length):
    '''Report whether the character at index `i` of `word` counts as punctuation.

    Any non-alphanumeric character is punctuation, with three exceptions that
    apply only when the character is not the last one in the word:

    * an apostrophe followed by a letter,
    * a comma followed by a digit,
    * a period followed by a digit.
    '''
    cdef bint has_next = i < (length - 1)
    if has_next:
        # Don't count apostrophes as punct if the next char is a letter
        if word[i] == "'" and word[i+1].isalpha():
            return False
        # Don't count commas or periods as punct if the next char is a number
        if word[i] in (",", ".") and word[i+1].isdigit():
            return False
    return not word[i].isalnum()
|
Loading…
Reference in New Issue
Block a user