* Fix ptb tokenization

Matthew Honnibal 2014-07-07 05:10:09 +02:00
parent cc3971ce5c
commit e244739dfe


@@ -19,7 +19,7 @@ VOCAB = Vocab()
VOCAB.set_empty_key(0)
spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en'))
spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en_ptb'))
cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
@@ -62,13 +62,13 @@ cdef int find_split(unicode word, size_t length):
cdef bint is_punct(unicode word, size_t i, size_t length):
    is_final = i == (length - 1)
    if word[i] == '.':
        return False
    if not is_final and word[i] == '-' and word[i+1] == '-':
        return True
    # Don't count apostrophes as punct if the next char is a letter
    if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
        return False
    # Don't count commas as punct if the next char is a number
    if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
        return False
    # Don't count periods as punct if the next char is a number
    if word[i] == "." and i < (length - 1) and word[i+1].isdigit():
        return False
    return not word[i].isalnum()
    punct_chars = set(',;:' + '@#$%&' + '!?' + '[({' + '})]')
    return word[i] in punct_chars
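
For reference, the punctuation heuristics in this hunk can be read as the plain-Python sketch below. This is an illustrative restatement, not part of the commit: the standalone function and the example calls are hypothetical, the isalnum fallback is one of the two return paths visible in the diff (the punct_chars set is the other), and the actual implementation is the Cython shown above.

# Hypothetical plain-Python restatement of the is_punct heuristics, for illustration only.
def is_punct(word, i, length):
    is_final = i == (length - 1)
    # Periods never count as punct in this version.
    if word[i] == '.':
        return False
    # A double dash ("--") does count as punct, so it gets split off.
    if not is_final and word[i] == '-' and word[i + 1] == '-':
        return True
    # Apostrophes followed by a letter stay attached ("don't", "o'clock").
    if word[i] == "'" and i < (length - 1) and word[i + 1].isalpha():
        return False
    # Commas followed by a digit stay attached ("10,000").
    if word[i] == "," and i < (length - 1) and word[i + 1].isdigit():
        return False
    # Periods followed by a digit stay attached ("3.14"); already covered by the '.' check above.
    if word[i] == "." and i < (length - 1) and word[i + 1].isdigit():
        return False
    # Anything else that is not alphanumeric counts as punct.
    return not word[i].isalnum()

print(is_punct("10,000", 2, len("10,000")))  # False: comma inside a number stays attached
print(is_punct("end,", 3, len("end,")))      # True: trailing comma is split off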