* Fix ptb3 module

Matthew Honnibal 2014-08-22 16:35:48 +02:00
parent a22101404a
commit e289896603
5 changed files with 114 additions and 60 deletions

View File

@@ -3,7 +3,7 @@
'''Tokenize English text, using a scheme that differs from the Penn Treebank 3
scheme in several important respects:
* Whitespace added as tokens, except for single spaces. e.g.,
* Whitespace is added as tokens, except for single spaces. e.g.,
>>> tokenize(u'\\nHello \\tThere').strings
[u'\\n', u'Hello', u' ', u'\\t', u'There']
@@ -18,13 +18,15 @@ scheme in several important respects:
>>> tokenize(u'New York-based').strings
[u'New', u'York', u'-', u'based']
Other improvements:
* Full unicode support
* Email addresses, URLs, European-formatted dates and other numeric entities not
found in the PTB are tokenized correctly
* Heuristic handling of word-final periods (PTB expects sentence boundary detection
as a pre-process before tokenization.)
Take care to ensure you training and run-time data is tokenized according to the
Take care to ensure your training and run-time data is tokenized according to the
same scheme. Tokenization problems are a major cause of poor performance for
NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
provides a fully Penn Treebank 3-compliant tokenizer.
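
For reference, the same behaviour exercised from Python (a sketch assuming the extension builds as spacy.en; expected outputs are copied from the doctests above):

>>> from spacy.en import tokenize
>>> tokenize(u'\nHello \tThere').strings
[u'\n', u'Hello', u' ', u'\t', u'There']
>>> tokenize(u'New York-based').strings
[u'New', u'York', u'-', u'based']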
@@ -49,7 +51,6 @@ from .orthography.latin import *
from .lexeme import *
cdef class English(spacy.Language):
# How to ensure the order here aligns with orthography.latin?
view_funcs = [
@@ -101,7 +102,7 @@ cpdef Tokens tokenize(unicode string):
The tokenization rules are defined in two places:
* The data/en/tokenization table, which handles special cases like contractions;
* The `spacy.en.English.find_split` function, which is used to split off punctuation etc.
* The :py:meth:`spacy.en.English.find_split` function, which is used to split off punctuation etc.
Args:
string (unicode): The string to be tokenized.
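
A rough pure-Python sketch of that two-stage scheme, as described above (illustrative only: the table entries and the punctuation heuristic are simplified placeholders, not the module's actual data or logic):

# Stage 1: whole-chunk special cases, analogous to the data/en/tokenization table.
SPECIAL_CASES = {u"don't": [u'do', u"n't"]}

def find_split(chunk):
    # Stage 2 placeholder: report the index where trailing punctuation begins.
    i = len(chunk)
    while i > 1 and chunk[i - 1] in u',.;:!?':
        i -= 1
    return i

def tokenize_chunk(chunk):
    if chunk in SPECIAL_CASES:
        return list(SPECIAL_CASES[chunk])
    i = find_split(chunk)
    if i == len(chunk):
        return [chunk]
    return [chunk[:i]] + [c for c in chunk[i:]]

print(tokenize_chunk(u"don't"))   # [u'do', u"n't"]
print(tokenize_chunk(u'Hello,'))  # [u'Hello', u',']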
@@ -113,9 +114,10 @@ cpdef Tokens tokenize(unicode string):
cpdef LexID lookup(unicode string) except 0:
"""Retrieve (or create, if not found) a Lexeme ID for a string.
"""Retrieve (or create, if not found) a Lexeme for a string, and return its ID.
The LexID is really a memory address, making dereferencing it essentially free.
Properties of the Lexeme are accessed by passing LexID to the accessor methods.
Access is cheap/free, as the LexID is the memory address of the Lexeme.
Args:
string (unicode): The string to be looked up. Must be unicode, not bytes.
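
A usage sketch of this LexID-centred API (assuming lookup is exposed by spacy.en and the accessors by spacy.lexeme, as the headers below suggest; printed values are illustrative):

from spacy.en import lookup
from spacy.lexeme import length_of, prob_of, is_often_titled

lex_id = lookup(u'London')         # LexID: the address of the Lexeme struct
print(length_of(lex_id))           # 6, the length of the string
print(prob_of(lex_id))             # smoothed unigram log probability, <= 0.0
print(is_often_titled(lex_id))     # whether 'london' usually appears title-cased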

View File

@@ -25,10 +25,15 @@ cdef struct Lexeme:
cpdef StringHash lex_of(LexID lex_id) except 0
cpdef char first_of(LexID lex_id) except 0
cpdef size_t length_of(LexID lex_id) except 0
cpdef double prob_of(LexID lex_id) except 0
cpdef double prob_of(LexID lex_id) except 1
cpdef ClusterID cluster_of(LexID lex_id) except 0
cpdef bint check_tag_flag(LexID lex, TagFlags flag) except *
cpdef bint is_often_titled(size_t lex_id)
cpdef bint is_often_uppered(size_t lex_id)
cpdef bint can_tag(LexID lex, TagFlags flag) except *
cpdef bint check_dist_flag(LexID lex, DistFlags flag) except *
cpdef bint check_orth_flag(LexID lex, OrthFlags flag) except *
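
One detail worth noting above: prob_of's error value changes from `except 0` to `except 1`. Unigram log probabilities are always <= 0.0, so 0.0 could be legitimate data, whereas 1.0 can never be a real log probability and is safe to reserve as Cython's exception sentinel. A minimal standalone sketch of that convention (not the module's code):

cpdef double log_prob_or_raise(dict probs, unicode word) except 1:
    # `except 1` tells Cython that a return value of exactly 1.0 means a
    # Python exception is pending; any value <= 0.0 is a real log probability.
    if word not in probs:
        raise KeyError(word)
    return probs[word]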

View File

@@ -11,6 +11,21 @@ from libc.stdint cimport uint64_t
from spacy.spacy cimport StringHash
# Python-visible enum for POS tags
PUNCT = 0
CONJ = 1
NUM = 2
X = 3
DET = 4
ADP = 5
ADJ = 6
ADV = 7
VERB = 8
NOUN = 9
PDT = 10
POS = 11
PRON = 12
PRT = 13
cpdef int set_flags(LexID lex_id, object active_flags) except *:
"""Set orthographic bit flags for a Lexeme.
@@ -75,7 +90,7 @@ cpdef size_t length_of(size_t lex_id) except 0:
return word.length
cpdef double prob_of(size_t lex_id) except 0:
cpdef double prob_of(size_t lex_id) except 1:
'''Access an estimate of the word's unigram log probability.
Probabilities are calculated from a large text corpus, and smoothed using
@@ -90,7 +105,7 @@ cpdef double prob_of(size_t lex_id) except 0:
DEF OFT_UPPER = 1
DEF OFT_TITLE = 2
cpdef bint is_oft_upper(size_t lex_id):
cpdef bint is_often_uppered(size_t lex_id):
'''Check the OFT_UPPER distributional flag for the word.
The OFT_UPPER flag records whether a lower-cased version of the word
@@ -101,15 +116,15 @@ cpdef bint is_oft_upper(size_t lex_id):
Case statistics are estimated from a large text corpus. Estimates are read
from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
>>> is_oft_upper(lookup(u'nato'))
>>> is_often_uppered(lookup(u'nato'))
True
>>> is_oft_upper(lookup(u'the'))
>>> is_often_uppered(lookup(u'the'))
False
'''
return (<Lexeme*>lex_id).dist_flags & (1 << OFT_UPPER)
cpdef bint is_oft_title(size_t lex_id):
cpdef bint is_often_titled(size_t lex_id):
'''Check the OFT_TITLE distributional flag for the word.
The OFT_TITLE flag records whether a lower-cased version of the word
@@ -127,6 +142,7 @@ cpdef bint is_oft_title(size_t lex_id):
'''
return (<Lexeme*>lex_id).dist_flags & (1 << OFT_TITLE)
cpdef bint check_orth_flag(size_t lex_id, OrthFlags flag) except *:
return (<Lexeme*>lex_id).orth_flags & (1 << flag)
@@ -135,5 +151,5 @@ cpdef bint check_dist_flag(size_t lex_id, DistFlags flag) except *:
return (<Lexeme*>lex_id).dist_flags & (1 << flag)
cpdef bint check_tag_flag(LexID lex_id, TagFlags flag) except *:
cpdef bint can_tag(LexID lex_id, TagFlags flag) except *:
return (<Lexeme*>lex_id).possible_tags & (1 << flag)
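
All of these accessors share the same idiom: each property is one bit position in a flags field, and a check is a shift-and-mask test. A tiny pure-Python illustration (standalone, with made-up flag positions):

OFT_UPPER = 1      # bit positions, in the spirit of the DEFs above
OFT_TITLE = 2

def check_flag(flags, flag):
    # `flag` is a bit position, not a mask, hence the shift.
    return bool(flags & (1 << flag))

dist_flags = 1 << OFT_UPPER                # word often appears fully upper-cased
print(check_flag(dist_flags, OFT_UPPER))   # True
print(check_flag(dist_flags, OFT_TITLE))   # False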

View File

@@ -1,18 +1,15 @@
from libcpp.vector cimport vector
from spacy.spacy cimport StringHash
from spacy.spacy cimport Language
from spacy.spacy cimport Lexeme
from spacy.spacy cimport Lexeme_addr
from spacy.lexeme cimport LexID
from spacy.tokens cimport Tokens
from spacy.lexeme cimport StringHash
cdef class EnglishPTB(Language):
cdef int find_split(self, unicode word)
cdef class PennTreebank3(Language):
cpdef list find_substrings(self, unicode word)
cdef EnglishPTB EN_PTB
cdef PennTreebank3 PTB3
cpdef Lexeme_addr lookup(unicode word) except 0
cpdef LexID lookup(unicode word) except 0
cpdef Tokens tokenize(unicode string)
cpdef unicode unhash(StringHash hash_value)

View File

@@ -7,55 +7,89 @@ from __future__ import unicode_literals
from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
from spacy.string_tools cimport substr
from spacy.spacy cimport Language
from . import util
cimport spacy
import re
cdef class EnglishPTB(Language):
cdef int find_split(self, unicode word):
length = len(word)
cdef int i = 0
# Contractions
if word.endswith("'s"):
return length - 2
# Leading punctuation
if is_punct(word, 0, length):
return 1
elif length >= 1:
# Split off all trailing punctuation characters
i = 0
while i < length and not is_punct(word, i, length):
i += 1
return i
# List of contractions adapted from Robert MacIntyre's tokenizer.
CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),
re.compile(r"(?i)\b(d)('ye)\b"),
re.compile(r"(?i)\b(gim)(me)\b"),
re.compile(r"(?i)\b(gon)(na)\b"),
re.compile(r"(?i)\b(got)(ta)\b"),
re.compile(r"(?i)\b(lem)(me)\b"),
re.compile(r"(?i)\b(mor)('n)\b"),
re.compile(r"(?i)\b(wan)(na) ")]
CONTRACTIONS3 = [re.compile(r"(?i) ('t)(is)\b"),
re.compile(r"(?i) ('t)(was)\b")]
CONTRACTIONS4 = [re.compile(r"(?i)\b(whad)(dd)(ya)\b"),
re.compile(r"(?i)\b(wha)(t)(cha)\b")]
def nltk_regex_tokenize(text):
# Implementation taken from NLTK 3.0, based on tokenizer.sed
#starting quotes
text = re.sub(r'^\"', r'``', text)
text = re.sub(r'(``)', r' \1 ', text)
text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)
#punctuation
text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
text = re.sub(r'\.\.\.', r' ... ', text)
text = re.sub(r'[;@#$%&]', r' \g<0> ', text)
text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)
text = re.sub(r'[?!]', r' \g<0> ', text)
text = re.sub(r"([^'])' ", r"\1 ' ", text)
#parens, brackets, etc.
text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
text = re.sub(r'--', r' -- ', text)
#add extra space to make things easier
text = " " + text + " "
#ending quotes
text = re.sub(r'"', " '' ", text)
text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)
text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)
text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ",
text)
for regexp in CONTRACTIONS2:
text = regexp.sub(r' \1 \2 ', text)
for regexp in CONTRACTIONS3:
text = regexp.sub(r' \1 \2 ', text)
# We are not using CONTRACTIONS4 since
# they are also commented out in the SED scripts
# for regexp in self.CONTRACTIONS4:
# text = regexp.sub(r' \1 \2 \3 ', text)
return text.split()
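
Since the function above is just a chain of deterministic regex substitutions followed by a split, its behaviour is easy to check directly, e.g.:

>>> nltk_regex_tokenize(u"I can't go.")
[u'I', u'ca', u"n't", u'go', u'.']
>>> nltk_regex_tokenize(u"You cannot do that.")
[u'You', u'can', u'not', u'do', u'that', u'.']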
cdef bint is_punct(unicode word, size_t i, size_t length):
is_final = i == (length - 1)
if word[i] == '.':
return False
if not is_final and word[i] == '-' and word[i+1] == '-':
return True
# Don't count apostrophes as punct if the next char is a letter
if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
return False
punct_chars = set(',;:' + '@#$%&' + '!?' + '[({' + '})]')
return word[i] in punct_chars
cdef class PennTreebank3(Language):
cpdef list find_substrings(self, unicode chunk):
strings = nltk_regex_tokenize(chunk)
assert strings
return strings
cdef EnglishPTB EN_PTB = EnglishPTB('en_ptb')
cdef PennTreebank3 PTB3 = PennTreebank3('ptb3')
cpdef Tokens tokenize(unicode string):
return EN_PTB.tokenize(string)
return PTB3.tokenize(string)
cpdef Lexeme_addr lookup(unicode string) except 0:
return <Lexeme_addr>EN_PTB.lookup(string)
cpdef LexID lookup(unicode string) except 0:
return <LexID>PTB3.lookup(string)
cpdef unicode unhash(StringHash hash_value):
return EN_PTB.unhash(hash_value)
return PTB3.unhash(hash_value)
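
Putting it together, a sketch of the module's public entry points after this change (assuming the extension builds as spacy.ptb3, that Tokens exposes .strings as in spacy.en, and that the base Language.tokenize hands whitespace-delimited chunks to find_substrings):

from spacy import ptb3
from spacy.lexeme import lex_of

toks = ptb3.tokenize(u"I can't go.")
print(toks.strings)                  # expected: [u'I', u'ca', u"n't", u'go', u'.']

lex_id = ptb3.lookup(u'go')          # LexID for 'go' (created if unseen)
print(ptb3.unhash(lex_of(lex_id)))   # u'go': round-trips through the string hash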