* Begin work on full PTB-compatible English tokenization

Matthew Honnibal 2014-07-07 04:29:24 +02:00
parent 0c1be7effe
commit df0458001d
5 changed files with 463020 additions and 0 deletions

data/en_ptb/case (new file, 146129 lines)

File diff suppressed because it is too large.

data/en_ptb/clusters (new file, 316709 lines)

File diff suppressed because it is too large.

data/en_ptb/tokenization (new file, 93 lines)

@@ -0,0 +1,93 @@
# https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
# 21:09, 25 June 2014
#*--* --
#*---* ---
#*'s 's
ain't are not
aren't are not
can't can not
could've could have
couldn't could not
couldn't've could not have
didn't did not
doesn't does not
don't do not
hadn't had not
hadn't've had not have
hasn't has not
haven't have not
he'd he would
he'd've he would have
he'll he will
he's he 's
how'd how would
how'll how will
how's how 's
I'd I would
I'd've I would have
I'll I will
I'm I am
I've I have
isn't is not
it'd it would
it'd've it would have
it'll it will
it's it 's
let's let 's
mightn't might not
mightn't've might not have
might've might have
mustn't must not
must've must have
needn't need not
not've not have
shan't shall not
she'd she would
she'd've she would have
she'll she will
she's she 's
should've should have
shouldn't should not
shouldn't've should not have
that's that 's
there'd there would
there'd've there would have
there's there is
they'd they would
they'd've they would have
they'll they will
they're they are
they've they have
wasn't was not
we'd we would
we'd've we would have
we'll we will
we're we are
we've we have
weren't were not
what'll what will
what're what are
what's what 's
what've what have
when's when 's
where'd where would
where's where 's
where've where have
who'd who would
who'll who will
who're who are
who's who 's
who've who have
why'll why will
why're why are
why's why is
won't will not
would've would have
wouldn't would not
wouldn't've would not have
you'd you would
you'd've you would have
you'll you will
you're you are
you've you have

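Each non-comment line in the rule file maps a raw chunk to the whitespace-separated token pieces it should be rewritten as; lines starting with # are comments or disabled rules. As a rough illustration of how such a file could be loaded (the commit's real loader is util.read_tokenization, which is not shown here; the function name and path below are only assumptions), a minimal parser might look like this:

# Hypothetical loader for a rule file in the format shown above.
from __future__ import unicode_literals
import io

def read_rules(path):
    rules = {}
    with io.open(path, encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue  # skip blanks, comments and disabled rules
            pieces = line.split()
            rules[pieces[0]] = pieces[1:]   # chunk -> token pieces
    return rules

rules = read_rules('data/en_ptb/tokenization')
assert rules["can't"] == ['can', 'not']
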
spacy/en_ptb.pxd (new file, 15 lines)

@@ -0,0 +1,15 @@
from libcpp.vector cimport vector

from spacy.spacy cimport StringHash
from spacy.spacy cimport Vocab
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Lexeme_addr


cdef Vocab VOCAB
cdef dict BACOV


cpdef Lexeme_addr lookup(unicode word) except 0
cpdef vector[Lexeme_addr] tokenize(unicode string) except *
cpdef unicode unhash(StringHash hash_value)

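Because lookup, tokenize and unhash are declared cpdef, they are callable from Python once the extension is compiled. A rough usage sketch, assuming the module builds as spacy.en_ptb and that the returned vector of addresses converts to a Python list (the setup here is an assumption, not part of this commit):

# Hypothetical usage of the API declared in spacy/en_ptb.pxd.
from __future__ import unicode_literals
from spacy import en_ptb

addr = en_ptb.lookup("isn't")         # address of the Lexeme struct for this chunk
assert addr != 0
addrs = en_ptb.tokenize("I can't.")   # one Lexeme address per PTB-style token
assert all(a != 0 for a in addrs)
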
spacy/en_ptb.pyx (new file, 74 lines)

@@ -0,0 +1,74 @@
'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
so that strings can be retrieved from hashes. Use 64-bit hash values and
boldly assume no collisions.
'''
from __future__ import unicode_literals

from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector

from spacy.lexeme cimport Lexeme
from spacy.string_tools cimport substr

from . import util

cimport spacy


BACOV = {}
VOCAB = Vocab()
VOCAB.set_empty_key(0)

spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en'))


cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
    return spacy.tokenize(VOCAB, BACOV, find_split, string)


cpdef Lexeme_addr lookup(unicode string) except 0:
    return spacy.lookup(VOCAB, BACOV, find_split, -1, string)


cpdef unicode unhash(StringHash hash_value):
    return spacy.unhash(BACOV, hash_value)


cdef vector[StringHash] make_string_views(unicode word):
    cdef unicode s
    return vector[StringHash]()
    #if word.isdigit() and len(word) == 4:
    #    return '!YEAR'
    #elif word[0].isdigit():
    #    return '!DIGITS'
    #else:
    #    return word.lower()

cdef int find_split(unicode word, size_t length):
    cdef int i = 0
    # Contractions
    if word.endswith("'s"):
        return length - 2
    # Leading punctuation
    if is_punct(word, 0, length):
        return 1
    elif length >= 1:
        # Split off all trailing punctuation characters
        i = 0
        while i < length and not is_punct(word, i, length):
            i += 1
    return i

cdef bint is_punct(unicode word, size_t i, size_t length):
    # Don't count apostrophes as punct if the next char is a letter
    if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
        return False
    # Don't count commas as punct if the next char is a number
    if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
        return False
    # Don't count periods as punct if the next char is a number
    if word[i] == "." and i < (length - 1) and word[i+1].isdigit():
        return False
    return not word[i].isalnum()
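
The module docstring above describes the storage scheme: each string is reduced to a 64-bit hash, and the reverse index BACOV (the name is VOCAB reversed) maps hashes back to strings, which is what unhash relies on. A pure-Python sketch of that idea follows; the stand-in hash function and helper names are illustrative only, not the commit's actual implementation, which goes through the Vocab/Lexeme machinery in spacy.spacy:

# Illustrative sketch of the hash + reverse-index idea from the docstring.
from __future__ import unicode_literals
import hashlib

BACOV = {}  # StringHash -> unicode string (the reverse index)

def string_hash(s):
    # Stand-in 64-bit hash; like the module, we boldly assume no collisions.
    return int(hashlib.sha1(s.encode('utf8')).hexdigest()[:16], 16)

def intern_string(s):
    h = string_hash(s)
    BACOV[h] = s
    return h

def unhash(h):
    return BACOV[h]

h = intern_string(u"tokenization")
assert unhash(h) == u"tokenization"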