Mirror of https://github.com/explosion/spaCy.git

* Begin work on full PTB-compatible English tokenization

commit df0458001d (parent 0c1be7effe)
							
								
								
									
data/en_ptb/case (new file, 146129 lines): diff suppressed because it is too large
							
								
								
									
data/en_ptb/clusters (new file, 316709 lines): diff suppressed because it is too large
							
								
								
									
data/en_ptb/tokenization (new file, 93 lines):

@@ -0,0 +1,93 @@
# https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
#  21:09, 25 June 2014
#*--*  --
#*---* ---
#*'s  's

ain't   are not
aren't  are not
can't   can not
could've    could have
couldn't    could not
couldn't've could not have
didn't  did not
doesn't does not
don't   do not
hadn't  had not
hadn't've   had not have
hasn't  has not
haven't have not
he'd    he would
he'd've he would have
he'll   he will
he's    he 's
how'd   how would
how'll  how will
how's   how 's
I'd I would
I'd've  I would have
I'll    I will
I'm I am
I've    I have
isn't   is not
it'd    it would
it'd've it would have
it'll   it will
it's    it 's
let's   let 's
mightn't    might not
mightn't've might not have
might've    might have
mustn't must not
must've must have
needn't need not
not've  not have
shan't  shall not
she'd   she would
she'd've    she would have
she'll  she will
she's   she 's
should've   should have
shouldn't   should not
shouldn't've    should not have
that's  that 's
there'd there would
there'd've  there would have
there's there is
they'd  they would
they'd've   they would have
they'll they will
they're they are
they've they have
wasn't  was not
we'd    we would
we'd've we would have
we'll   we will
we're   we are
we've   we have
weren't were not
what'll what will
what're what are
what's  what 's
what've what have
when's  when 's
where'd where would
where's where 's
where've    where have
who'd   who would
who'll  who will
who're  who are
who's   who 's
who've  who have
why'll  why will
why're  why are
why's   why is
won't   will not
would've    would have
wouldn't    would not
wouldn't've would not have
you'd   you would
you'd've    you would have
you'll  you will
you're  you are
you've  you have
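
For orientation, here is a minimal sketch of how a rules file in this format can be read. The real loader is util.read_tokenization (imported in en_ptb.pyx below); this parser and the name read_tokenization_rules are assumptions, not code from this commit.

# Hypothetical parser for the whitespace-separated rules format above;
# the repo's actual util.read_tokenization may differ in detail.
from __future__ import unicode_literals
import io


def read_tokenization_rules(path):
    '''Return (chunk, tokens) pairs, e.g. ("don't", ["do", "not"]).'''
    entries = []
    seen = set()
    with io.open(path, encoding='utf8') as file_:
        for line in file_:
            line = line.strip()
            if not line or line.startswith('#'):
                continue  # skip blank lines and commented-out rules
            pieces = line.split()
            chunk, tokens = pieces[0], pieces[1:]
            assert chunk not in seen, chunk  # rule keys must be unique
            seen.add(chunk)
            entries.append((chunk, tokens))
    return entries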
							
								
								
									
spacy/en_ptb.pxd (new file, 15 lines):

@@ -0,0 +1,15 @@
from libcpp.vector cimport vector

from spacy.spacy cimport StringHash
from spacy.spacy cimport Vocab
from spacy.lexeme cimport Lexeme
from spacy.lexeme cimport Lexeme_addr


cdef Vocab VOCAB
cdef dict BACOV


cpdef Lexeme_addr lookup(unicode word) except 0
cpdef vector[Lexeme_addr] tokenize(unicode string) except *
cpdef unicode unhash(StringHash hash_value)
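
Assuming the extension builds, these cpdef functions are callable from plain Python; Cython converts vector[Lexeme_addr] to a list and Lexeme_addr to an int automatically. A hedged usage sketch (the import path follows from the file location, but nothing here is from the commit itself):

# Usage sketch of the compiled module, not code from this commit.
from spacy import en_ptb

addrs = en_ptb.tokenize("isn't it?")  # vector[Lexeme_addr] -> list of ints
first = en_ptb.lookup("isn't")        # raw address of the Lexeme struct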
							
								
								
									
spacy/en_ptb.pyx (new file, 74 lines):

@@ -0,0 +1,74 @@
'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
so that strings can be retrieved from hashes.  Use 64-bit hash values and
boldly assume no collisions.
'''
from __future__ import unicode_literals

from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector

from spacy.lexeme cimport Lexeme
from spacy.string_tools cimport substr
from . import util

cimport spacy

BACOV = {}
VOCAB = Vocab()
VOCAB.set_empty_key(0)


spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en_ptb'))


cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
    return spacy.tokenize(VOCAB, BACOV, find_split, string)


cpdef Lexeme_addr lookup(unicode string) except 0:
    return spacy.lookup(VOCAB, BACOV, find_split, -1, string)


cpdef unicode unhash(StringHash hash_value):
    return spacy.unhash(BACOV, hash_value)


cdef vector[StringHash] make_string_views(unicode word):
    cdef unicode s
    # Stub: no alternative string views yet; return an empty vector.
    return vector[StringHash]()
    #if word.isdigit() and len(word) == 4:
    #    return '!YEAR'
    #elif word[0].isdigit():
    #    return '!DIGITS'
    #else:
    #    return word.lower()


cdef int find_split(unicode word, size_t length):
    cdef int i = 0
    # Contractions: split off a trailing 's
    if word.endswith("'s"):
        return length - 2
    # Leading punctuation comes off as a single character
    if is_punct(word, 0, length):
        return 1
    elif length >= 1:
        # Split at the first punctuation character, so trailing
        # punctuation comes off
        i = 0
        while i < length and not is_punct(word, i, length):
            i += 1
    return i


cdef bint is_punct(unicode word, size_t i, size_t length):
    # Don't count apostrophes as punct if the next char is a letter
    if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
        return False
    # Don't count commas as punct if the next char is a number
    if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
        return False
    # Don't count periods as punct if the next char is a number
    if word[i] == "." and i < (length - 1) and word[i+1].isdigit():
        return False
    return not word[i].isalnum()
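
To make the split behaviour concrete, here is a pure-Python transcription of find_split/is_punct with a few worked cases. It mirrors the Cython code above for experimentation but is not part of the commit.

# Pure-Python mirror of the Cython find_split/is_punct above.

def is_punct(word, i, length):
    if word[i] == "'" and i < (length - 1) and word[i + 1].isalpha():
        return False
    if word[i] == "," and i < (length - 1) and word[i + 1].isdigit():
        return False
    if word[i] == "." and i < (length - 1) and word[i + 1].isdigit():
        return False
    return not word[i].isalnum()


def find_split(word, length):
    if word.endswith("'s"):
        return length - 2
    if is_punct(word, 0, length):
        return 1
    i = 0
    while i < length and not is_punct(word, i, length):
        i += 1
    return i


assert find_split("dog's", 5) == 3    # split off the 's
assert find_split("(hello", 6) == 1   # leading punctuation comes off alone
assert find_split("hello!!", 7) == 5  # split before trailing punctuation
assert find_split("3,000", 5) == 5    # comma inside a number is kept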