2014-07-07 06:29:24 +04:00
|
|
|
'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
|
|
|
|
so that strings can be retrieved from hashes. Use 64-bit hash values and
|
|
|
|
boldly assume no collisions.
|
|
|
|
'''
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
2014-07-07 18:58:48 +04:00
|
|
|
|
2014-07-07 06:29:24 +04:00
|
|
|
from libc.stdlib cimport malloc, calloc, free
|
|
|
|
from libc.stdint cimport uint64_t
|
|
|
|
|
|
|
|
|
|
|
|
cimport spacy
|
|
|
|
|
2014-08-22 18:35:48 +04:00
|
|
|
import re
|
2014-07-07 06:29:24 +04:00
|
|
|
|
2014-08-22 18:35:48 +04:00
|
|
|
# List of contractions adapted from Robert MacIntyre's tokenizer.
# Each pattern captures the pieces of a fused word as separate groups so a
# substitution can re-emit them with spaces in between.

# Two-part contractions matched on word boundaries (e.g. "cannot", "gonna").
CONTRACTIONS2 = list(map(re.compile, (
    r"(?i)\b(can)(not)\b",
    r"(?i)\b(d)('ye)\b",
    r"(?i)\b(gim)(me)\b",
    r"(?i)\b(gon)(na)\b",
    r"(?i)\b(got)(ta)\b",
    r"(?i)\b(lem)(me)\b",
    r"(?i)\b(mor)('n)\b",
    r"(?i)\b(wan)(na) ",
)))

# Two-part contractions that start with an apostrophe and must follow a space.
CONTRACTIONS3 = list(map(re.compile, (
    r"(?i) ('t)(is)\b",
    r"(?i) ('t)(was)\b",
)))

# Three-part contractions.
CONTRACTIONS4 = list(map(re.compile, (
    r"(?i)\b(whad)(dd)(ya)\b",
    r"(?i)\b(wha)(t)(cha)\b",
)))
|
|
|
|
|
|
|
|
def nltk_regex_tokenize(text):
    """Split `text` into Penn Treebank style tokens and return them as a list.

    Implementation taken from NLTK 3.0, based on tokenizer.sed. The text is
    rewritten by an ordered series of regex substitutions that insert spaces
    around quotes, punctuation, brackets and contractions, and the result is
    split on whitespace. The order of the rules matters: later rules operate
    on the output of earlier ones.
    """
    # Rules applied to the raw text, in order.
    rules_before_padding = (
        # starting quotes
        (r'^\"', r'``'),
        (r'(``)', r' \1 '),
        (r'([ (\[{<])"', r'\1 `` '),
        # punctuation
        (r'([:,])([^\d])', r' \1 \2'),
        (r'\.\.\.', r' ... '),
        (r'[;@#$%&]', r' \g<0> '),
        (r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 '),
        (r'[?!]', r' \g<0> '),
        (r"([^'])' ", r"\1 ' "),
        # parens, brackets, etc.
        (r'[\]\[\(\)\{\}\<\>]', r' \g<0> '),
        (r'--', r' -- '),
    )
    # Rules applied after the text has been padded with a space on each side.
    rules_after_padding = (
        # ending quotes
        (r'"', " '' "),
        (r'(\S)(\'\')', r'\1 \2 '),
        # clitics
        (r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 "),
        (r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 "),
    )

    for pattern, replacement in rules_before_padding:
        text = re.sub(pattern, replacement, text)

    # add extra space to make things easier
    text = " " + text + " "

    for pattern, replacement in rules_after_padding:
        text = re.sub(pattern, replacement, text)

    # CONTRACTIONS4 is deliberately not applied: the corresponding rules are
    # also commented out in the SED scripts.
    for contraction in CONTRACTIONS2 + CONTRACTIONS3:
        text = contraction.sub(r' \1 \2 ', text)

    return text.split()
|
|
|
|
|
|
|
|
|
|
|
|
cdef class PennTreebank3(Language):
    """Language subclass whose substrings come from nltk_regex_tokenize."""

    cpdef list find_substrings(self, unicode chunk):
        """Split `chunk` into token strings using nltk_regex_tokenize.

        When the tokenizer emits a trailing '.' as its own token, that period
        is glued back onto the preceding token, so the last element ends with
        the period instead of the period standing alone.

        A chunk that tokenizes to just ['.'] is returned unchanged: there is
        no preceding token to attach the period to. Previously this case
        popped the only element and then indexed the empty list, raising
        IndexError.

        Returns the non-empty list of token strings. An AssertionError is
        raised if the tokenizer produced no tokens at all.
        """
        strings = nltk_regex_tokenize(chunk)
        # Only re-attach the period when there is a token in front of it;
        # the length check also keeps strings[-1] from being evaluated on an
        # empty list.
        if len(strings) >= 2 and strings[-1] == '.':
            strings.pop()
            strings[-1] += '.'
        assert strings
        return strings
|
|
|
|
|
|
|
|
|
|
|
|
# Module-level PennTreebank3 instance, constructed with the name 'ptb3'.
# The module-level tokenize/lookup/unhash functions delegate to it.
cdef PennTreebank3 PTB3 = PennTreebank3('ptb3')
|
2014-07-07 14:47:21 +04:00
|
|
|
|
2014-08-23 21:55:06 +04:00
|
|
|
cpdef list tokenize(unicode string):
    """Return the list produced by the module-level PTB3 instance's tokenize()
    for `string`.
    """
    cdef list tokens = PTB3.tokenize(string)
    return tokens
|
2014-07-07 14:47:21 +04:00
|
|
|
|
|
|
|
|
2014-08-23 21:55:06 +04:00
|
|
|
cpdef Word lookup(unicode string):
    """Return the Word that the module-level PTB3 instance's lookup() gives
    for `string`.
    """
    cdef Word word = PTB3.lookup(string)
    return word
|
2014-07-07 14:47:21 +04:00
|
|
|
|
|
|
|
|
|
|
|
cpdef unicode unhash(StringHash hash_value):
    """Return the unicode string that the module-level PTB3 instance's
    unhash() gives for `hash_value`.
    """
    cdef unicode string = PTB3.unhash(hash_value)
    return string
|