From e244739dfef40bd1f1d32f14f3276aa2e15ef132 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 7 Jul 2014 05:10:09 +0200 Subject: [PATCH] * Fix ptb tokenization --- spacy/en_ptb.pyx | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/spacy/en_ptb.pyx b/spacy/en_ptb.pyx index df8d30ff9..2ad8f96b2 100644 --- a/spacy/en_ptb.pyx +++ b/spacy/en_ptb.pyx @@ -19,7 +19,7 @@ VOCAB = Vocab() VOCAB.set_empty_key(0) -spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en')) +spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en_ptb')) cpdef vector[Lexeme_addr] tokenize(unicode string) except *: @@ -62,13 +62,13 @@ cdef int find_split(unicode word, size_t length): cdef bint is_punct(unicode word, size_t i, size_t length): + is_final = i == (length - 1) + if word[i] == '.': + return False + if not is_final and word[i] == '-' and word[i+1] == '-': + return True # Don't count appostrophes as punct if the next char is a letter if word[i] == "'" and i < (length - 1) and word[i+1].isalpha(): return False - # Don't count commas as punct if the next char is a number - if word[i] == "," and i < (length - 1) and word[i+1].isdigit(): - return False - # Don't count periods as punct if the next char is a number - if word[i] == "." and i < (length - 1) and word[i+1].isdigit(): - return False - return not word[i].isalnum() + punct_chars = set(',;:' + '@#$%&' + '!?' + '[({' + '})]') + return word[i] in punct_chars