mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
* Fix ptb3 module
This commit is contained in:
parent
a22101404a
commit
e289896603
14
spacy/en.pyx
14
spacy/en.pyx
|
@ -3,7 +3,7 @@
|
|||
'''Tokenize English text, using a scheme that differs from the Penn Treebank 3
|
||||
scheme in several important respects:
|
||||
|
||||
* Whitespace added as tokens, except for single spaces. e.g.,
|
||||
* Whitespace is added as tokens, except for single spaces. e.g.,
|
||||
|
||||
>>> tokenize(u'\\nHello \\tThere').strings
|
||||
[u'\\n', u'Hello', u' ', u'\\t', u'There']
|
||||
|
@ -18,13 +18,15 @@ scheme in several important respects:
|
|||
>>> tokenize(u'New York-based').strings
|
||||
[u'New', u'York', u'-', u'based']
|
||||
|
||||
Other improvements:
|
||||
|
||||
* Full unicode support
|
||||
* Email addresses, URLs, European-formatted dates and other numeric entities not
|
||||
found in the PTB are tokenized correctly
|
||||
* Heuristic handling of word-final periods (PTB expects sentence boundary detection
|
||||
as a pre-process before tokenization.)
|
||||
|
||||
Take care to ensure you training and run-time data is tokenized according to the
|
||||
Take care to ensure your training and run-time data is tokenized according to the
|
||||
same scheme. Tokenization problems are a major cause of poor performance for
|
||||
NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
|
||||
provides a fully Penn Treebank 3-compliant tokenizer.
|
||||
|
@ -49,7 +51,6 @@ from .orthography.latin import *
|
|||
from .lexeme import *
|
||||
|
||||
|
||||
|
||||
cdef class English(spacy.Language):
|
||||
# How to ensure the order here aligns with orthography.latin?
|
||||
view_funcs = [
|
||||
|
@ -101,7 +102,7 @@ cpdef Tokens tokenize(unicode string):
|
|||
The tokenization rules are defined in two places:
|
||||
|
||||
* The data/en/tokenization table, which handles special cases like contractions;
|
||||
* The `spacy.en.English.find_split` function, which is used to split off punctuation etc.
|
||||
* The :py:meth:`spacy.en.English.find_split` function, which is used to split off punctuation etc.
|
||||
|
||||
Args:
|
||||
string (unicode): The string to be tokenized.
|
||||
|
@ -113,9 +114,10 @@ cpdef Tokens tokenize(unicode string):
|
|||
|
||||
|
||||
cpdef LexID lookup(unicode string) except 0:
|
||||
"""Retrieve (or create, if not found) a Lexeme ID for a string.
|
||||
"""Retrieve (or create, if not found) a Lexeme for a string, and return its ID.
|
||||
|
||||
The LexID is really a memory address, making dereferencing it essentially free.
|
||||
Properties of the Lexeme are accessed by passing LexID to the accessor methods.
|
||||
Access is cheap/free, as the LexID is the memory address of the Lexeme.
|
||||
|
||||
Args:
|
||||
string (unicode): The string to be looked up. Must be unicode, not bytes.
|
||||
|
|
|
@ -25,10 +25,15 @@ cdef struct Lexeme:
|
|||
cpdef StringHash lex_of(LexID lex_id) except 0
|
||||
cpdef char first_of(LexID lex_id) except 0
|
||||
cpdef size_t length_of(LexID lex_id) except 0
|
||||
cpdef double prob_of(LexID lex_id) except 0
|
||||
cpdef double prob_of(LexID lex_id) except 1
|
||||
cpdef ClusterID cluster_of(LexID lex_id) except 0
|
||||
|
||||
cpdef bint check_tag_flag(LexID lex, TagFlags flag) except *
|
||||
|
||||
cpdef bint is_often_titled(size_t lex_id)
|
||||
cpdef bint is_often_uppered(size_t lex_id)
|
||||
|
||||
|
||||
cpdef bint can_tag(LexID lex, TagFlags flag) except *
|
||||
cpdef bint check_dist_flag(LexID lex, DistFlags flag) except *
|
||||
cpdef bint check_orth_flag(LexID lex, OrthFlags flag) except *
|
||||
|
||||
|
|
|
@ -11,6 +11,21 @@ from libc.stdint cimport uint64_t
|
|||
|
||||
from spacy.spacy cimport StringHash
|
||||
|
||||
# Python-visible enum for POS tags
|
||||
PUNCT = 0
|
||||
CONJ = 1
|
||||
NUM = 2
|
||||
X = 3
|
||||
DET = 4
|
||||
ADP = 5
|
||||
ADJ = 6
|
||||
ADV = 7
|
||||
VERB = 8
|
||||
NOUN = 9
|
||||
PDT = 10
|
||||
POS = 11
|
||||
PRON = 12
|
||||
PRT = 13
|
||||
|
||||
cpdef int set_flags(LexID lex_id, object active_flags) except *:
|
||||
"""Set orthographic bit flags for a Lexeme.
|
||||
|
@ -75,7 +90,7 @@ cpdef size_t length_of(size_t lex_id) except 0:
|
|||
return word.length
|
||||
|
||||
|
||||
cpdef double prob_of(size_t lex_id) except 0:
|
||||
cpdef double prob_of(size_t lex_id) except 1:
|
||||
'''Access an estimate of the word's unigram log probability.
|
||||
|
||||
Probabilities are calculated from a large text corpus, and smoothed using
|
||||
|
@ -90,7 +105,7 @@ cpdef double prob_of(size_t lex_id) except 0:
|
|||
DEF OFT_UPPER = 1
|
||||
DEF OFT_TITLE = 2
|
||||
|
||||
cpdef bint is_oft_upper(size_t lex_id):
|
||||
cpdef bint is_often_uppered(size_t lex_id):
|
||||
'''Check the OFT_UPPER distributional flag for the word.
|
||||
|
||||
The OFT_UPPER flag records whether a lower-cased version of the word
|
||||
|
@ -101,15 +116,15 @@ cpdef bint is_oft_upper(size_t lex_id):
|
|||
Case statistics are estimated from a large text corpus. Estimates are read
|
||||
from data/en/case_stats, and can be replaced using spacy.en.load_case_stats.
|
||||
|
||||
>>> is_oft_upper(lookup(u'nato'))
|
||||
>>> is_often_uppered(lookup(u'nato'))
|
||||
True
|
||||
>>> is_oft_upper(lookup(u'the'))
|
||||
>>> is_often_uppered(lookup(u'the'))
|
||||
False
|
||||
'''
|
||||
return (<Lexeme*>lex_id).dist_flags & (1 << OFT_UPPER)
|
||||
|
||||
|
||||
cpdef bint is_oft_title(size_t lex_id):
|
||||
cpdef bint is_often_titled(size_t lex_id):
|
||||
'''Check the OFT_TITLE distributional flag for the word.
|
||||
|
||||
The OFT_TITLE flag records whether a lower-cased version of the word
|
||||
|
@ -127,6 +142,7 @@ cpdef bint is_oft_title(size_t lex_id):
|
|||
'''
|
||||
return (<Lexeme*>lex_id).dist_flags & (1 << OFT_TITLE)
|
||||
|
||||
|
||||
cpdef bint check_orth_flag(size_t lex_id, OrthFlags flag) except *:
|
||||
return (<Lexeme*>lex_id).orth_flags & (1 << flag)
|
||||
|
||||
|
@ -135,5 +151,5 @@ cpdef bint check_dist_flag(size_t lex_id, DistFlags flag) except *:
|
|||
return (<Lexeme*>lex_id).dist_flags & (1 << flag)
|
||||
|
||||
|
||||
cpdef bint check_tag_flag(LexID lex_id, TagFlags flag) except *:
|
||||
cpdef bint can_tag(LexID lex_id, TagFlags flag) except *:
|
||||
return (<Lexeme*>lex_id).possible_tags & (1 << flag)
|
||||
|
|
|
@ -1,18 +1,15 @@
|
|||
from libcpp.vector cimport vector
|
||||
|
||||
from spacy.spacy cimport StringHash
|
||||
from spacy.spacy cimport Language
|
||||
from spacy.spacy cimport Lexeme
|
||||
from spacy.spacy cimport Lexeme_addr
|
||||
from spacy.lexeme cimport LexID
|
||||
from spacy.tokens cimport Tokens
|
||||
from spacy.lexeme cimport StringHash
|
||||
|
||||
|
||||
cdef class EnglishPTB(Language):
|
||||
cdef int find_split(self, unicode word)
|
||||
cdef class PennTreebank3(Language):
|
||||
cpdef list find_substrings(self, unicode word)
|
||||
|
||||
|
||||
cdef EnglishPTB EN_PTB
|
||||
cdef PennTreebank3 PTB3
|
||||
|
||||
cpdef Lexeme_addr lookup(unicode word) except 0
|
||||
cpdef LexID lookup(unicode word) except 0
|
||||
cpdef Tokens tokenize(unicode string)
|
||||
cpdef unicode unhash(StringHash hash_value)
|
||||
|
|
108
spacy/ptb3.pyx
108
spacy/ptb3.pyx
|
@ -7,55 +7,89 @@ from __future__ import unicode_literals
|
|||
|
||||
from libc.stdlib cimport malloc, calloc, free
|
||||
from libc.stdint cimport uint64_t
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
from spacy.string_tools cimport substr
|
||||
from spacy.spacy cimport Language
|
||||
from . import util
|
||||
|
||||
cimport spacy
|
||||
|
||||
import re
|
||||
|
||||
cdef class EnglishPTB(Language):
|
||||
cdef int find_split(self, unicode word):
|
||||
length = len(word)
|
||||
cdef int i = 0
|
||||
# Contractions
|
||||
if word.endswith("'s"):
|
||||
return length - 2
|
||||
# Leading punctuation
|
||||
if is_punct(word, 0, length):
|
||||
return 1
|
||||
elif length >= 1:
|
||||
# Split off all trailing punctuation characters
|
||||
i = 0
|
||||
while i < length and not is_punct(word, i, length):
|
||||
i += 1
|
||||
return i
|
||||
# List of contractions adapted from Robert MacIntyre's tokenizer.
|
||||
CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),
|
||||
re.compile(r"(?i)\b(d)('ye)\b"),
|
||||
re.compile(r"(?i)\b(gim)(me)\b"),
|
||||
re.compile(r"(?i)\b(gon)(na)\b"),
|
||||
re.compile(r"(?i)\b(got)(ta)\b"),
|
||||
re.compile(r"(?i)\b(lem)(me)\b"),
|
||||
re.compile(r"(?i)\b(mor)('n)\b"),
|
||||
re.compile(r"(?i)\b(wan)(na) ")]
|
||||
|
||||
CONTRACTIONS3 = [re.compile(r"(?i) ('t)(is)\b"),
|
||||
re.compile(r"(?i) ('t)(was)\b")]
|
||||
|
||||
CONTRACTIONS4 = [re.compile(r"(?i)\b(whad)(dd)(ya)\b"),
|
||||
re.compile(r"(?i)\b(wha)(t)(cha)\b")]
|
||||
|
||||
def nltk_regex_tokenize(text):
|
||||
# Implementation taken from NLTK 3.0, based on tokenizer.sed
|
||||
|
||||
#starting quotes
|
||||
text = re.sub(r'^\"', r'``', text)
|
||||
text = re.sub(r'(``)', r' \1 ', text)
|
||||
text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)
|
||||
|
||||
#punctuation
|
||||
text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
|
||||
text = re.sub(r'\.\.\.', r' ... ', text)
|
||||
text = re.sub(r'[;@#$%&]', r' \g<0> ', text)
|
||||
text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)
|
||||
text = re.sub(r'[?!]', r' \g<0> ', text)
|
||||
|
||||
text = re.sub(r"([^'])' ", r"\1 ' ", text)
|
||||
|
||||
#parens, brackets, etc.
|
||||
text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
|
||||
text = re.sub(r'--', r' -- ', text)
|
||||
|
||||
#add extra space to make things easier
|
||||
text = " " + text + " "
|
||||
|
||||
#ending quotes
|
||||
text = re.sub(r'"', " '' ", text)
|
||||
text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)
|
||||
|
||||
text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)
|
||||
text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ",
|
||||
text)
|
||||
|
||||
for regexp in CONTRACTIONS2:
|
||||
text = regexp.sub(r' \1 \2 ', text)
|
||||
for regexp in CONTRACTIONS3:
|
||||
text = regexp.sub(r' \1 \2 ', text)
|
||||
|
||||
# We are not using CONTRACTIONS4 since
|
||||
# they are also commented out in the SED scripts
|
||||
# for regexp in CONTRACTIONS4:
|
||||
# text = regexp.sub(r' \1 \2 \3 ', text)
|
||||
|
||||
return text.split()
|
||||
|
||||
|
||||
cdef bint is_punct(unicode word, size_t i, size_t length):
|
||||
is_final = i == (length - 1)
|
||||
if word[i] == '.':
|
||||
return False
|
||||
if not is_final and word[i] == '-' and word[i+1] == '-':
|
||||
return True
|
||||
# Don't count apostrophes as punct if the next char is a letter
|
||||
if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
|
||||
return False
|
||||
punct_chars = set(',;:' + '@#$%&' + '!?' + '[({' + '})]')
|
||||
return word[i] in punct_chars
|
||||
cdef class PennTreebank3(Language):
|
||||
cpdef list find_substrings(self, unicode chunk):
|
||||
strings = nltk_regex_tokenize(chunk)
|
||||
assert strings
|
||||
return strings
|
||||
|
||||
|
||||
|
||||
cdef EnglishPTB EN_PTB = EnglishPTB('en_ptb')
|
||||
cdef PennTreebank3 PTB3 = PennTreebank3('ptb3')
|
||||
|
||||
cpdef Tokens tokenize(unicode string):
|
||||
return EN_PTB.tokenize(string)
|
||||
return PTB3.tokenize(string)
|
||||
|
||||
|
||||
cpdef Lexeme_addr lookup(unicode string) except 0:
|
||||
return <Lexeme_addr>EN_PTB.lookup(string)
|
||||
cpdef LexID lookup(unicode string) except 0:
|
||||
return <LexID>PTB3.lookup(string)
|
||||
|
||||
|
||||
cpdef unicode unhash(StringHash hash_value):
|
||||
return EN_PTB.unhash(hash_value)
|
||||
return PTB3.unhash(hash_value)
|
||||
|
|
Loading…
Reference in New Issue
Block a user