* Fix ptb3 module

Matthew Honnibal 2014-08-22 16:35:48 +02:00
parent a22101404a
commit e289896603
5 changed files with 114 additions and 60 deletions

View File

@@ -3,7 +3,7 @@
'''Tokenize English text, using a scheme that differs from the Penn Treebank 3
scheme in several important respects:
* Whitespace added as tokens, except for single spaces. e.g.,
* Whitespace is added as tokens, except for single spaces. e.g.,
>>> tokenize(u'\\nHello \\tThere').strings
[u'\\n', u'Hello', u' ', u'\\t', u'There']
@@ -18,13 +18,15 @@ scheme in several important respects:
>>> tokenize(u'New York-based').strings
[u'New', u'York', u'-', u'based']
Other improvements:
* Full unicode support
* Email addresses, URLs, European-formatted dates and other numeric entities not
found in the PTB are tokenized correctly
* Heuristic handling of word-final periods (PTB expects sentence boundary detection
as a pre-process before tokenization.)
Take care to ensure you training and run-time data is tokenized according to the
Take care to ensure your training and run-time data is tokenized according to the
same scheme. Tokenization problems are a major cause of poor performance for
NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
provides a fully Penn Treebank 3-compliant tokenizer.
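
For reference, the same behaviour exercised from Python (a sketch assuming the extension builds as spacy.en; expected outputs are copied from the doctests above):

>>> from spacy.en import tokenize
>>> tokenize(u'\nHello \tThere').strings
[u'\n', u'Hello', u' ', u'\t', u'There']
>>> tokenize(u'New York-based').strings
[u'New', u'York', u'-', u'based']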
@@ -49,7 +51,6 @@ from .orthography.latin import *
from .lexeme import *
cdef class English(spacy.Language):
# How to ensure the order here aligns with orthography.latin?
view_funcs = [
@@ -101,7 +102,7 @@ cpdef Tokens tokenize(unicode string):
The tokenization rules are defined in two places:
* The data/en/tokenization table, which handles special cases like contractions;
* The `spacy.en.English.find_split` function, which is used to split off punctuation etc.
* The :py:meth:`spacy.en.English.find_split` function, which is used to split off punctuation etc.
Args:
string (unicode): The string to be tokenized.
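
A rough pure-Python sketch of that two-stage scheme, as described above (illustrative only: the table entries and the punctuation heuristic are simplified placeholders, not the module's actual data or logic):

# Stage 1: whole-chunk special cases, analogous to the data/en/tokenization table.
SPECIAL_CASES = {u"don't": [u'do', u"n't"]}

def find_split(chunk):
    # Stage 2 placeholder: report the index where trailing punctuation begins.
    i = len(chunk)
    while i > 1 and chunk[i - 1] in u',.;:!?':
        i -= 1
    return i

def tokenize_chunk(chunk):
    if chunk in SPECIAL_CASES:
        return list(SPECIAL_CASES[chunk])
    i = find_split(chunk)
    if i == len(chunk):
        return [chunk]
    return [chunk[:i]] + [c for c in chunk[i:]]

print(tokenize_chunk(u"don't"))   # [u'do', u"n't"]
print(tokenize_chunk(u'Hello,'))  # [u'Hello', u',']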
@@ -113,9 +114,10 @@ cpdef Tokens tokenize(unicode string):
cpdef LexID lookup(unicode string) except 0:
"""Retrieve (or create, if not found) a Lexeme ID for a string.
"""Retrieve (or create, if not found) a Lexeme for a string, and return its ID.
The LexID is really a memory address, making dereferencing it essentially free.
Properties of the Lexeme are accessed by passing LexID to the accessor methods.
Access is cheap/free, as the LexID is the memory address of the Lexeme.
Args:
string (unicode): The string to be looked up. Must be unicode, not bytes.
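
A usage sketch of this LexID-centred API (assuming lookup is exposed by spacy.en and the accessors by spacy.lexeme, as the headers below suggest; printed values are illustrative):

from spacy.en import lookup
from spacy.lexeme import length_of, prob_of, is_often_titled

lex_id = lookup(u'London')         # LexID: the address of the Lexeme struct
print(length_of(lex_id))           # 6, the length of the string
print(prob_of(lex_id))             # smoothed unigram log probability, <= 0.0
print(is_often_titled(lex_id))     # whether 'london' usually appears title-cased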

View File

@@ -25,10 +25,15 @@ cdef struct Lexeme:
cpdef StringHash lex_of(LexID lex_id) except 0
cpdef char first_of(LexID lex_id) except 0
cpdef size_t length_of(LexID lex_id) except 0
cpdef double prob_of(LexID lex_id) except 0
cpdef double prob_of(LexID lex_id) except 1
cpdef ClusterID cluster_of(LexID lex_id) except 0
cpdef bint check_tag_flag(LexID lex, TagFlags flag) except *
cpdef bint is_often_titled(size_t lex_id)
cpdef bint is_often_uppered(size_t lex_id)
cpdef bint can_tag(LexID lex, TagFlags flag) except *
cpdef bint check_dist_flag(LexID lex, DistFlags flag) except *
cpdef bint check_orth_flag(LexID lex, OrthFlags flag) except *
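
One detail worth noting above: prob_of's error value changes from `except 0` to `except 1`. Unigram log probabilities are always <= 0.0, so 0.0 could be legitimate data, whereas 1.0 can never be a real log probability and is safe to reserve as Cython's exception sentinel. A minimal standalone sketch of that convention (not the module's code):

cpdef double log_prob_or_raise(dict probs, unicode word) except 1:
    # `except 1` tells Cython that a return value of exactly 1.0 means a
    # Python exception is pending; any value <= 0.0 is a real log probability.
    if word not in probs:
        raise KeyError(word)
    return probs[word]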

View File

@@ -11,6 +11,21 @@ from libc.stdint cimport uint64_t
from spacy.spacy cimport StringHash
# Python-visible enum for POS tags
PUNCT = 0
CONJ = 1
NUM = 2
X = 3
DET = 4
ADP = 5
ADJ = 6
ADV = 7
VERB = 8
NOUN = 9
PDT = 10
POS = 11
PRON = 12
PRT = 13
cpdef int set_flags(LexID lex_id, object active_flags) except *:
"""Set orthographic bit flags for a Lexeme.
@@ -75,7 +90,7 @@ cpdef size_t length_of(size_t lex_id) except 0:
return word.length
cpdef double prob_of(size_t lex_id) except 0:
cpdef double prob_of(size_t lex_id) except 1:
'''Access an estimate of the word's unigram log probability.
Probabilities are calculated from a large text corpus, and smoothed using
@@ -90,7 +105,7 @@ cpdef double prob_of(size_t lex_id) except 0:
DEF OFT_UPPER = 1
DEF OFT_TITLE = 2
cpdef bint is_oft_upper(size_t lex_id):
cpdef bint is_often_uppered(size_t lex_id):
'''Check the OFT_UPPER distributional flag for the word.
The OFT_UPPER flag records whether a lower-cased version of the word
@@ -101,15 +116,15 @@ cpdef bint is_oft_upper(size_t lex_id):
Case statistics are estimated from a large text corpus. Estimates are read
from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
>>> is_oft_upper(lookup(u'nato'))
>>> is_often_uppered(lookup(u'nato'))
True
>>> is_oft_upper(lookup(u'the'))
>>> is_often_uppered(lookup(u'the'))
False
'''
return (<Lexeme*>lex_id).dist_flags & (1 << OFT_UPPER)
cpdef bint is_oft_title(size_t lex_id):
cpdef bint is_often_titled(size_t lex_id):
'''Check the OFT_TITLE distributional flag for the word.
The OFT_TITLE flag records whether a lower-cased version of the word
@@ -127,6 +142,7 @@ cpdef bint is_oft_title(size_t lex_id):
'''
return (<Lexeme*>lex_id).dist_flags & (1 << OFT_TITLE)
cpdef bint check_orth_flag(size_t lex_id, OrthFlags flag) except *:
return (<Lexeme*>lex_id).orth_flags & (1 << flag)
@@ -135,5 +151,5 @@ cpdef bint check_dist_flag(size_t lex_id, DistFlags flag) except *:
return (<Lexeme*>lex_id).dist_flags & (1 << flag)
cpdef bint check_tag_flag(LexID lex_id, TagFlags flag) except *:
cpdef bint can_tag(LexID lex_id, TagFlags flag) except *:
return (<Lexeme*>lex_id).possible_tags & (1 << flag)
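
All of these accessors share the same idiom: each property is one bit position in a flags field, and a check is a shift-and-mask test. A tiny pure-Python illustration (standalone, with made-up flag positions):

OFT_UPPER = 1      # bit positions, in the spirit of the DEFs above
OFT_TITLE = 2

def check_flag(flags, flag):
    # `flag` is a bit position, not a mask, hence the shift.
    return bool(flags & (1 << flag))

dist_flags = 1 << OFT_UPPER                # word often appears fully upper-cased
print(check_flag(dist_flags, OFT_UPPER))   # True
print(check_flag(dist_flags, OFT_TITLE))   # False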

View File

@@ -1,18 +1,15 @@
from libcpp.vector cimport vector
from spacy.spacy cimport StringHash
from spacy.spacy cimport Language
from spacy.spacy cimport Lexeme
from spacy.spacy cimport Lexeme_addr
from spacy.lexeme cimport LexID
from spacy.tokens cimport Tokens
from spacy.lexeme cimport StringHash
cdef class EnglishPTB(Language):
cdef int find_split(self, unicode word)
cdef class PennTreebank3(Language):
cpdef list find_substrings(self, unicode word)
cdef EnglishPTB EN_PTB
cdef PennTreebank3 PTB3
cpdef Lexeme_addr lookup(unicode word) except 0
cpdef LexID lookup(unicode word) except 0
cpdef Tokens tokenize(unicode string)
cpdef unicode unhash(StringHash hash_value)

View File

@@ -7,55 +7,89 @@ from __future__ import unicode_literals
from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t
from libcpp.vector cimport vector
from spacy.string_tools cimport substr
from spacy.spacy cimport Language
from . import util
cimport spacy
import re
cdef class EnglishPTB(Language):
cdef int find_split(self, unicode word):
length = len(word)
cdef int i = 0
# Contractions
if word.endswith("'s"):
return length - 2
# Leading punctuation
if is_punct(word, 0, length):
return 1
elif length >= 1:
# Split off all trailing punctuation characters
i = 0
while i < length and not is_punct(word, i, length):
i += 1
return i
# List of contractions adapted from Robert MacIntyre's tokenizer.
CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),
re.compile(r"(?i)\b(d)('ye)\b"),
re.compile(r"(?i)\b(gim)(me)\b"),
re.compile(r"(?i)\b(gon)(na)\b"),
re.compile(r"(?i)\b(got)(ta)\b"),
re.compile(r"(?i)\b(lem)(me)\b"),
re.compile(r"(?i)\b(mor)('n)\b"),
re.compile(r"(?i)\b(wan)(na) ")]
CONTRACTIONS3 = [re.compile(r"(?i) ('t)(is)\b"),
re.compile(r"(?i) ('t)(was)\b")]
CONTRACTIONS4 = [re.compile(r"(?i)\b(whad)(dd)(ya)\b"),
re.compile(r"(?i)\b(wha)(t)(cha)\b")]
def nltk_regex_tokenize(text):
# Implementation taken from NLTK 3.0, based on tokenizer.sed
#starting quotes
text = re.sub(r'^\"', r'``', text)
text = re.sub(r'(``)', r' \1 ', text)
text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)
#punctuation
text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
text = re.sub(r'\.\.\.', r' ... ', text)
text = re.sub(r'[;@#$%&]', r' \g<0> ', text)
text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)
text = re.sub(r'[?!]', r' \g<0> ', text)
text = re.sub(r"([^'])' ", r"\1 ' ", text)
#parens, brackets, etc.
text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
text = re.sub(r'--', r' -- ', text)
#add extra space to make things easier
text = " " + text + " "
#ending quotes
text = re.sub(r'"', " '' ", text)
text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)
text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)
text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ",
text)
for regexp in CONTRACTIONS2:
text = regexp.sub(r' \1 \2 ', text)
for regexp in CONTRACTIONS3:
text = regexp.sub(r' \1 \2 ', text)
# We are not using CONTRACTIONS4 since
# they are also commented out in the SED scripts
# for regexp in self.CONTRACTIONS4:
# text = regexp.sub(r' \1 \2 \3 ', text)
return text.split()
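
Since the function above is just a chain of deterministic regex substitutions followed by a split, its behaviour is easy to check directly, e.g.:

>>> nltk_regex_tokenize(u"I can't go.")
[u'I', u'ca', u"n't", u'go', u'.']
>>> nltk_regex_tokenize(u"You cannot do that.")
[u'You', u'can', u'not', u'do', u'that', u'.']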
cdef bint is_punct(unicode word, size_t i, size_t length):
is_final = i == (length - 1)
if word[i] == '.':
return False
if not is_final and word[i] == '-' and word[i+1] == '-':
return True
# Don't count apostrophes as punct if the next char is a letter
if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
return False
punct_chars = set(',;:' + '@#$%&' + '!?' + '[({' + '})]')
return word[i] in punct_chars
cdef class PennTreebank3(Language):
cpdef list find_substrings(self, unicode chunk):
strings = nltk_regex_tokenize(chunk)
assert strings
return strings
cdef EnglishPTB EN_PTB = EnglishPTB('en_ptb')
cdef PennTreebank3 PTB3 = PennTreebank3('ptb3')
cpdef Tokens tokenize(unicode string):
return EN_PTB.tokenize(string)
return PTB3.tokenize(string)
cpdef Lexeme_addr lookup(unicode string) except 0:
return <Lexeme_addr>EN_PTB.lookup(string)
cpdef LexID lookup(unicode string) except 0:
return <LexID>PTB3.lookup(string)
cpdef unicode unhash(StringHash hash_value):
return EN_PTB.unhash(hash_value)
return PTB3.unhash(hash_value)
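
Putting it together, a sketch of the module's public entry points after this change (assuming the extension builds as spacy.ptb3, that Tokens exposes .strings as in spacy.en, and that the base Language.tokenize hands whitespace-delimited chunks to find_substrings):

from spacy import ptb3
from spacy.lexeme import lex_of

toks = ptb3.tokenize(u"I can't go.")
print(toks.strings)                  # expected: [u'I', u'ca', u"n't", u'go', u'.']

lex_id = ptb3.lookup(u'go')          # LexID for 'go' (created if unseen)
print(ptb3.unhash(lex_of(lex_id)))   # u'go': round-trips through the string hash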