mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 09:57:26 +03:00)

commit e289896603 (parent: a22101404a)

    * Fix ptb3 module
spacy/en.pyx (14 changed lines)
@@ -3,7 +3,7 @@
 '''Tokenize English text, using a scheme that differs from the Penn Treebank 3
 scheme in several important respects:
 
-* Whitespace added as tokens, except for single spaces. e.g.,
+* Whitespace is added as tokens, except for single spaces. e.g.,
 
     >>> tokenize(u'\\nHello  \\tThere').strings
     [u'\\n', u'Hello', u' ', u'\\t', u'There']
@@ -18,13 +18,15 @@ scheme in several important respects:
     >>> tokenize(u'New York-based').strings
     [u'New', u'York', u'-', u'based']
 
+Other improvements:
+
 * Full unicode support
 * Email addresses, URLs, European-formatted dates and other numeric entities not
   found in the PTB are tokenized correctly
 * Heuristic handling of word-final periods (PTB expects sentence boundary detection
   as a pre-process before tokenization.)
 
-Take care to ensure you training and run-time data is tokenized according to the
+Take care to ensure your training and run-time data is tokenized according to the
 same scheme. Tokenization problems are a major cause of poor performance for
 NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
 provides a fully Penn Treebank 3-compliant tokenizer.
@@ -49,7 +51,6 @@ from .orthography.latin import *
 from .lexeme import *
 
 
-
 cdef class English(spacy.Language):
     # How to ensure the order here aligns with orthography.latin?
     view_funcs = [
@@ -101,7 +102,7 @@ cpdef Tokens tokenize(unicode string):
     The tokenization rules are defined in two places:
 
     * The data/en/tokenization table, which handles special cases like contractions;
-    * The `spacy.en.English.find_split` function, which is used to split off punctuation etc.
+    * The :py:meth:`spacy.en.English.find_split` function, which is used to split off punctuation etc.
 
     Args:
         string (unicode): The string to be tokenized.
@@ -113,9 +114,10 @@ cpdef Tokens tokenize(unicode string):
 
 
 cpdef LexID lookup(unicode string) except 0:
-    """Retrieve (or create, if not found) a Lexeme ID for a string.
+    """Retrieve (or create, if not found) a Lexeme for a string, and return its ID.
 
-    The LexID is really a memory address, making dereferencing it essentially free.
+    Properties of the Lexeme are accessed by passing LexID to the accessor methods.
+    Access is cheap/free, as the LexID is the memory address of the Lexeme.
     
     Args:
         string (unicode):  The string to be looked up. Must be unicode, not bytes.
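Note: a usage sketch of the LexID pattern this docstring describes (illustrative only; it assumes `lookup` is importable from `spacy.en` and the accessors from `spacy.lexeme`, as the declarations elsewhere in this commit suggest):

    # Illustrative sketch, not part of the commit.
    from spacy.en import lookup
    from spacy.lexeme import length_of, prob_of

    word = lookup(u'Hello')   # LexID: the memory address of the Lexeme
    print(length_of(word))    # accessors just dereference the pointer
    print(prob_of(word))      # smoothed unigram log probability (<= 0.0)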
spacy/lexeme.pxd

@@ -25,10 +25,15 @@ cdef struct Lexeme:
 cpdef StringHash lex_of(LexID lex_id) except 0
 cpdef char first_of(LexID lex_id) except 0
 cpdef size_t length_of(LexID lex_id) except 0
-cpdef double prob_of(LexID lex_id) except 0
+cpdef double prob_of(LexID lex_id) except 1
 cpdef ClusterID cluster_of(LexID lex_id) except 0
 
-cpdef bint check_tag_flag(LexID lex, TagFlags flag) except *
+cpdef bint is_often_titled(size_t lex_id)
+
+cpdef bint is_often_uppered(size_t lex_id)
+
+
+cpdef bint can_tag(LexID lex, TagFlags flag) except *
 cpdef bint check_dist_flag(LexID lex, DistFlags flag) except *
 cpdef bint check_orth_flag(LexID lex, OrthFlags flag) except *
 
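Note on the `except 0` -> `except 1` change to `prob_of`: the function returns a log probability, which is always <= 0.0, so 0.0 is a legitimate return value and cannot double as Cython's error sentinel, whereas 1.0 can never be a genuine log probability. A minimal sketch of the pattern (the struct field name `prob` is assumed here for illustration):

    cpdef double prob_of(LexID lex_id) except 1:
        # Cython reserves the return value 1.0 to mean "a Python exception
        # was raised inside this function". Log probabilities are <= 0.0,
        # so 1.0 is unreachable as a real result, while 0.0 is not.
        return (<Lexeme*>lex_id).prob  # field name assumed for this sketch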
spacy/lexeme.pyx

@@ -11,6 +11,21 @@ from libc.stdint cimport uint64_t
 
 from spacy.spacy cimport StringHash
 
+# Python-visible enum for POS tags
+PUNCT = 0
+CONJ = 1
+NUM = 2
+X = 3
+DET = 4
+ADP = 5
+ADJ = 6
+ADV = 7
+VERB = 8
+NOUN = 9
+PDT = 10
+POS = 11
+PRON = 12
+PRT = 13
 
 cpdef int set_flags(LexID lex_id, object active_flags) except *:
     """Set orthographic bit flags for a Lexeme.
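Note: the new module-level constants make the POS tag ids visible from Python. A hedged sketch of how they might combine with the renamed `can_tag` accessor (import paths and the example word's behavior are assumptions, not taken from the commit):

    # Illustrative sketch, not part of the commit.
    from spacy.en import lookup
    from spacy.lexeme import can_tag, VERB, NOUN

    word = lookup(u'run')
    print(bool(can_tag(word, VERB)))  # is the VERB bit set in possible_tags?
    print(bool(can_tag(word, NOUN)))  # 'run' can plausibly be either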
@@ -75,7 +90,7 @@ cpdef size_t length_of(size_t lex_id) except 0:
     return word.length
 
 
-cpdef double prob_of(size_t lex_id) except 0:
+cpdef double prob_of(size_t lex_id) except 1:
     '''Access an estimate of the word's unigram log probability.
 
     Probabilities are calculated from a large text corpus, and smoothed using
@@ -90,7 +105,7 @@ cpdef double prob_of(size_t lex_id) except 0:
 DEF OFT_UPPER = 1
 DEF OFT_TITLE = 2
 
-cpdef bint is_oft_upper(size_t lex_id):
+cpdef bint is_often_uppered(size_t lex_id):
     '''Check the OFT_UPPER distributional flag for the word.
     
     The OFT_UPPER flag records whether a lower-cased version of the word
@@ -101,15 +116,15 @@ cpdef bint is_oft_upper(size_t lex_id):
     Case statistics are estimated from a large text corpus. Estimates are read
     from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
     
-    >>> is_oft_upper(lookup(u'nato'))
+    >>> is_often_uppered(lookup(u'nato'))
     True
-    >>> is_oft_upper(lookup(u'the'))
+    >>> is_often_uppered(lookup(u'the'))
     False
     '''
     return (<Lexeme*>lex_id).dist_flags & (1 << OFT_UPPER)
 
 
-cpdef bint is_oft_title(size_t lex_id):
+cpdef bint is_often_titled(size_t lex_id):
     '''Check the OFT_TITLE distributional flag for the word.
     
     The OFT_TITLE flag records whether a lower-cased version of the word
@@ -127,6 +142,7 @@ cpdef bint is_oft_title(size_t lex_id):
     '''
     return (<Lexeme*>lex_id).dist_flags & (1 << OFT_TITLE)
 
+
 cpdef bint check_orth_flag(size_t lex_id, OrthFlags flag) except *:
     return (<Lexeme*>lex_id).orth_flags & (1 << flag)
 
@@ -135,5 +151,5 @@ cpdef bint check_dist_flag(size_t lex_id, DistFlags flag) except *:
     return (<Lexeme*>lex_id).dist_flags & (1 << flag)
 
 
-cpdef bint check_tag_flag(LexID lex_id, TagFlags flag) except *:
+cpdef bint can_tag(LexID lex_id, TagFlags flag) except *:
     return (<Lexeme*>lex_id).possible_tags & (1 << flag)
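Note: these predicates all share one pattern: each Boolean property is a bit position in a packed integer field, tested with `flags & (1 << flag)`. A self-contained Python illustration of the same packing, reusing the OFT_* positions defined above:

    # Pure-Python illustration of the bit-flag scheme used by the accessors.
    OFT_UPPER = 1
    OFT_TITLE = 2

    dist_flags = 0
    dist_flags |= (1 << OFT_UPPER)              # set the "often upper-cased" bit

    print(bool(dist_flags & (1 << OFT_UPPER)))  # True
    print(bool(dist_flags & (1 << OFT_TITLE)))  # False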
spacy/ptb3.pxd

@@ -1,18 +1,15 @@
-from libcpp.vector cimport vector
-
-from spacy.spacy cimport StringHash
 from spacy.spacy cimport Language
-from spacy.spacy cimport Lexeme
-from spacy.spacy cimport Lexeme_addr
+from spacy.lexeme cimport LexID
 from spacy.tokens cimport Tokens
+from spacy.lexeme cimport StringHash
 
 
-cdef class EnglishPTB(Language):
-    cdef int find_split(self, unicode word)
+cdef class PennTreebank3(Language):
+    cpdef list find_substrings(self, unicode word)
     
 
-cdef EnglishPTB EN_PTB
+cdef PennTreebank3 PTB3
 
-cpdef Lexeme_addr lookup(unicode word) except 0
+cpdef LexID lookup(unicode word) except 0
 cpdef Tokens tokenize(unicode string)
 cpdef unicode unhash(StringHash hash_value)
spacy/ptb3.pyx (106 changed lines)
@@ -7,55 +7,89 @@ from __future__ import unicode_literals
 
 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
-from libcpp.vector cimport vector
 
-from spacy.string_tools cimport substr
-from spacy.spacy cimport Language
-from . import util
 
 cimport spacy
 
+import re
+
+# List of contractions adapted from Robert MacIntyre's tokenizer.
+CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),
+                 re.compile(r"(?i)\b(d)('ye)\b"),
+                 re.compile(r"(?i)\b(gim)(me)\b"),
+                 re.compile(r"(?i)\b(gon)(na)\b"),
+                 re.compile(r"(?i)\b(got)(ta)\b"),
+                 re.compile(r"(?i)\b(lem)(me)\b"),
+                 re.compile(r"(?i)\b(mor)('n)\b"),
+                 re.compile(r"(?i)\b(wan)(na) ")]
+
+CONTRACTIONS3 = [re.compile(r"(?i) ('t)(is)\b"),
+                 re.compile(r"(?i) ('t)(was)\b")]
+
+CONTRACTIONS4 = [re.compile(r"(?i)\b(whad)(dd)(ya)\b"),
+                 re.compile(r"(?i)\b(wha)(t)(cha)\b")]
+
+
+def nltk_regex_tokenize(text):
+    # Implementation taken from NLTK 3.0, based on tokenizer.sed
+
+    #starting quotes
+    text = re.sub(r'^\"', r'``', text)
+    text = re.sub(r'(``)', r' \1 ', text)
+    text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)
+
+    #punctuation
+    text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
+    text = re.sub(r'\.\.\.', r' ... ', text)
+    text = re.sub(r'[;@#$%&]', r' \g<0> ', text)
+    text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)
+    text = re.sub(r'[?!]', r' \g<0> ', text)
+
+    text = re.sub(r"([^'])' ", r"\1 ' ", text)
+
+    #parens, brackets, etc.
+    text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
+    text = re.sub(r'--', r' -- ', text)
+
+    #add extra space to make things easier
+    text = " " + text + " "
+
+    #ending quotes
+    text = re.sub(r'"', " '' ", text)
+    text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)
+
+    text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)
+    text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ",
+                  text)
+
+    for regexp in CONTRACTIONS2:
+        text = regexp.sub(r' \1 \2 ', text)
+    for regexp in CONTRACTIONS3:
+        text = regexp.sub(r' \1 \2 ', text)
+
+    # We are not using CONTRACTIONS4 since
+    # they are also commented out in the SED scripts
+    # for regexp in self.CONTRACTIONS4:
+    #     text = regexp.sub(r' \1 \2 \3 ', text)
+
+    return text.split()
+
+
-cdef class EnglishPTB(Language):
-    cdef int find_split(self, unicode word):
-        length = len(word)
-        cdef int i = 0
-        # Contractions
-        if word.endswith("'s"):
-            return length - 2
-        # Leading punctuation
-        if is_punct(word, 0, length):
-            return 1
-        elif length >= 1:
-            # Split off all trailing punctuation characters
-            i = 0
-            while i < length and not is_punct(word, i, length):
-                i += 1
-        return i
-
-
-cdef bint is_punct(unicode word, size_t i, size_t length):
-    is_final = i == (length - 1)
-    if word[i] == '.':
-        return False
-    if not is_final and word[i] == '-' and word[i+1] == '-':
-        return True
-    # Don't count appostrophes as punct if the next char is a letter
-    if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
-        return False
-    punct_chars = set(',;:' + '@#$%&' + '!?' + '[({' + '})]')
-    return word[i] in punct_chars
+cdef class PennTreebank3(Language):
+    cpdef list find_substrings(self, unicode chunk):
+        strings = nltk_regex_tokenize(chunk)
+        assert strings
+        return strings
     
 
-cdef EnglishPTB EN_PTB = EnglishPTB('en_ptb')
+cdef PennTreebank3 PTB3 = PennTreebank3('ptb3')
 
 cpdef Tokens tokenize(unicode string):
-    return EN_PTB.tokenize(string)
+    return PTB3.tokenize(string)
 
 
-cpdef Lexeme_addr lookup(unicode string) except 0:
-    return <Lexeme_addr>EN_PTB.lookup(string)
+cpdef LexID lookup(unicode string) except 0:
+    return <LexID>PTB3.lookup(string)
 
 
 cpdef unicode unhash(StringHash hash_value):
-    return EN_PTB.unhash(hash_value)
+    return PTB3.unhash(hash_value)
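Note: `nltk_regex_tokenize` is pure Python, so the new tokenization rules can be checked directly. A quick illustrative run (the expected output is reasoned from the regexes above, not taken from the commit):

    print(nltk_regex_tokenize(u"I can't believe it's not butter."))
    # ['I', 'ca', "n't", 'believe', 'it', "'s", 'not', 'butter', '.']

The `'s` and `n't` splits come from the possessive/clitic rules, and the final period is split off by the end-of-string rule, matching PTB conventions.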