2014-07-07 06:29:24 +04:00
|
|
|
'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
|
|
|
|
so that strings can be retrieved from hashes. Use 64-bit hash values and
|
|
|
|
boldly assume no collisions.
|
|
|
|
'''
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
2014-07-07 18:58:48 +04:00
|
|
|
|
2014-07-07 06:29:24 +04:00
|
|
|
from libc.stdlib cimport malloc, calloc, free
|
|
|
|
from libc.stdint cimport uint64_t
|
|
|
|
|
|
|
|
|
|
|
|
cimport spacy
|
|
|
|
|
2014-08-22 18:35:48 +04:00
|
|
|
import re
|
2014-07-07 06:29:24 +04:00
|
|
|
|
2014-08-29 04:30:27 +04:00
|
|
|
from spacy import orth
|
|
|
|
|
|
|
|
# Thresholds passed to the orth predicate factories (orth.can_tag and
# orth.oft_case) when the flag functions are built.
TAG_THRESH = 0.5
UPPER_THRESH = 0.2
LOWER_THRESH = 0.5
TITLE_THRESH = 0.7

# Total number of flags; each name below is the index of that flag in the
# flag-function list, numbered consecutively from 0 in the order listed.
NR_FLAGS = 24

(
    # Case-frequency flags (built with orth.oft_case).
    OFT_UPPER,
    OFT_LOWER,
    OFT_TITLE,
    # Orthographic flags.
    IS_ALPHA,
    IS_DIGIT,
    IS_PUNCT,
    IS_SPACE,
    IS_ASCII,
    IS_TITLE,
    IS_LOWER,
    IS_UPPER,
    # Tag-possibility flags (built with orth.can_tag).
    CAN_PUNCT,
    CAN_CONJ,
    CAN_NUM,
    CAN_DET,
    CAN_ADP,
    CAN_ADJ,
    CAN_ADV,
    CAN_VERB,
    CAN_NOUN,
    CAN_PDT,
    CAN_POS,
    CAN_PRON,
    CAN_PRT,
) = range(NR_FLAGS)
|
|
|
|
|
|
|
|
|
2014-08-22 18:35:48 +04:00
|
|
|
# List of contractions adapted from Robert MacIntyre's tokenizer.
# Every pattern captures the pieces of a fused word in separate groups so the
# substitution can re-emit them separated by spaces.
CONTRACTIONS2 = [
    re.compile(r"(?i)\b(can)(not)\b"),
    re.compile(r"(?i)\b(d)('ye)\b"),
    re.compile(r"(?i)\b(gim)(me)\b"),
    re.compile(r"(?i)\b(gon)(na)\b"),
    re.compile(r"(?i)\b(got)(ta)\b"),
    re.compile(r"(?i)\b(lem)(me)\b"),
    re.compile(r"(?i)\b(mor)('n)\b"),
    re.compile(r"(?i)\b(wan)(na) "),
]

CONTRACTIONS3 = [
    re.compile(r"(?i) ('t)(is)\b"),
    re.compile(r"(?i) ('t)(was)\b"),
]

# Three-way splits. Not applied by nltk_regex_tokenize (see the note there).
# The first pattern splits "whaddya" as wha + dd + ya.
CONTRACTIONS4 = [
    re.compile(r"(?i)\b(wha)(dd)(ya)\b"),
    re.compile(r"(?i)\b(wha)(t)(cha)\b"),
]

# Ordered (pattern, replacement) rewrites applied to the raw text. Order
# matters: later rules see the output of earlier ones.
_RULES_BEFORE_PADDING = [
    # starting quotes
    (re.compile(r'^\"'), r'``'),
    (re.compile(r'(``)'), r' \1 '),
    (re.compile(r'([ (\[{<])"'), r'\1 `` '),
    # punctuation
    (re.compile(r'([:,])([^\d])'), r' \1 \2'),
    (re.compile(r'\.\.\.'), r' ... '),
    (re.compile(r'[;@#$%&]'), r' \g<0> '),
    # only a period at the very end of the text is split off
    (re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'), r'\1 \2\3 '),
    (re.compile(r'[?!]'), r' \g<0> '),
    (re.compile(r"([^'])' "), r"\1 ' "),
    # parens, brackets, etc.
    (re.compile(r'[\]\[\(\)\{\}\<\>]'), r' \g<0> '),
    (re.compile(r'--'), r' -- '),
]

# Ordered rewrites applied after the text has been padded with one space on
# each side; several of these patterns require a trailing space to match.
_RULES_AFTER_PADDING = [
    # ending quotes
    (re.compile(r'"'), " '' "),
    (re.compile(r'(\S)(\'\')'), r'\1 \2 '),
    # clitics: 's 'm 'd and a bare apostrophe, then 'll 're 've n't
    (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
    (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
]


def nltk_regex_tokenize(text):
    """Split text into Penn Treebank style tokens using regex rewrites.

    Implementation taken from NLTK 3.0, based on tokenizer.sed. The text is
    rewritten by a fixed sequence of substitutions that insert spaces around
    quotes, punctuation, brackets, clitics and the contractions listed in
    CONTRACTIONS2 and CONTRACTIONS3, and is then split on whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings; empty when text contains no non-whitespace
        characters.
    """
    for pattern, replacement in _RULES_BEFORE_PADDING:
        text = pattern.sub(replacement, text)

    # add extra space to make things easier
    text = " " + text + " "

    for pattern, replacement in _RULES_AFTER_PADDING:
        text = pattern.sub(replacement, text)

    for regexp in CONTRACTIONS2:
        text = regexp.sub(r' \1 \2 ', text)
    for regexp in CONTRACTIONS3:
        text = regexp.sub(r' \1 \2 ', text)

    # CONTRACTIONS4 is deliberately not applied: the corresponding rules are
    # also commented out in the SED scripts.

    return text.split()
|
|
|
|
|
|
|
|
|
|
|
|
cdef class PennTreebank3(Language):
    """Fully PTB compatible English tokenizer, tightly coupled to lexicon.

    Attributes:
        name (unicode): The two letter code used by Wikipedia for the language.
        lexicon (Lexicon): The lexicon. Exposes the lookup method.
    """

    def __cinit__(self, name):
        """Build the flag-function table and pass it to Language.__init__.

        Args:
            name: Forwarded unchanged to Language.__init__.
        """
        # One slot per flag index; 0 is the placeholder for "no predicate".
        flag_funcs = [0] * NR_FLAGS

        flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
        flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
        flag_funcs[OFT_TITLE] = orth.oft_case('title', TITLE_THRESH)

        flag_funcs[IS_ALPHA] = orth.is_alpha
        flag_funcs[IS_DIGIT] = orth.is_digit
        flag_funcs[IS_PUNCT] = orth.is_punct
        flag_funcs[IS_SPACE] = orth.is_space
        flag_funcs[IS_TITLE] = orth.is_title
        flag_funcs[IS_LOWER] = orth.is_lower
        flag_funcs[IS_UPPER] = orth.is_upper

        flag_funcs[CAN_PUNCT] = orth.can_tag('PUNCT', TAG_THRESH)
        flag_funcs[CAN_CONJ] = orth.can_tag('CONJ', TAG_THRESH)
        flag_funcs[CAN_NUM] = orth.can_tag('NUM', TAG_THRESH)
        flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
        flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
        flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
        flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
        flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
        flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
        flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
        flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)

        # NOTE(review): IS_ASCII, CAN_ADV and CAN_PRON are never assigned and
        # keep the placeholder 0 -- confirm Language tolerates non-callable
        # entries, or register predicates for them.

        Language.__init__(self, name, flag_funcs)

    cdef list _split(self, unicode chunk):
        """Tokenize chunk and re-attach a final standalone period.

        The chunk is run through nltk_regex_tokenize. When the last token is
        a lone '.', it is removed and appended to the token before it.

        Args:
            chunk: The text to split.

        Returns:
            The list of token strings.
        """
        strings = nltk_regex_tokenize(chunk)
        # Check before indexing, so an empty result fails here rather than
        # with an IndexError on strings[-1].
        assert strings
        # Require a preceding token to merge into: a chunk that tokenizes to
        # just ['.'] is returned unchanged instead of being popped to empty.
        if len(strings) >= 2 and strings[-1] == '.':
            strings.pop()
            strings[-1] += '.'
        return strings
|
|
|
|
|
|
|
|
|
2014-08-29 04:30:27 +04:00
|
|
|
# Module-level PennTreebank3 instance, constructed at import time with the
# name 'ptb3'.
PTB3 = PennTreebank3('ptb3')
|