spaCy/spacy/ptb3.pyx

'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
so that strings can be retrieved from hashes.  Use 64-bit hash values and
boldly assume no collisions.
'''
from __future__ import unicode_literals


from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t


cimport spacy

import re

# Contraction-splitting patterns, adapted from Robert MacIntyre's tokenizer.
# Every pattern is case-insensitive ((?i)) and captures each piece of the
# contraction in its own group so the pieces can be re-emitted separately.

# Two-piece contractions, e.g. "cannot" -> (can)(not).
# Note the last pattern ends in a literal space rather than \b.
CONTRACTIONS2 = list(map(re.compile, (
    r"(?i)\b(can)(not)\b",
    r"(?i)\b(d)('ye)\b",
    r"(?i)\b(gim)(me)\b",
    r"(?i)\b(gon)(na)\b",
    r"(?i)\b(got)(ta)\b",
    r"(?i)\b(lem)(me)\b",
    r"(?i)\b(mor)('n)\b",
    r"(?i)\b(wan)(na) ",
)))

# Two-piece contractions that must be preceded by a space: 'tis, 'twas.
CONTRACTIONS3 = list(map(re.compile, (
    r"(?i) ('t)(is)\b",
    r"(?i) ('t)(was)\b",
)))

# Three-piece contractions.  nltk_regex_tokenize does not apply these (the
# loop over them is commented out there).
CONTRACTIONS4 = list(map(re.compile, (
    r"(?i)\b(whad)(dd)(ya)\b",
    r"(?i)\b(wha)(t)(cha)\b",
)))

def nltk_regex_tokenize(text):
    '''Split text into a list of token strings with regex substitutions.

    Implementation taken from NLTK 3.0, based on tokenizer.sed.  Each rule
    inserts spaces around the material it recognises; the final result is
    the whitespace split of the rewritten text.  The rules are applied
    strictly in the order listed, since later rules see the output of
    earlier ones.
    '''
    # Rules run on the raw text, before it is padded with spaces.
    unpadded_rules = (
        # starting quotes
        (r'^\"', r'``'),
        (r'(``)', r' \1 '),
        (r'([ (\[{<])"', r'\1 `` '),
        # punctuation
        (r'([:,])([^\d])', r' \1 \2'),
        (r'\.\.\.', r' ... '),
        (r'[;@#$%&]', r' \g<0> '),
        (r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 '),
        (r'[?!]', r' \g<0> '),
        (r"([^'])' ", r"\1 ' "),
        # parens, brackets, etc.
        (r'[\]\[\(\)\{\}\<\>]', r' \g<0> '),
        (r'--', r' -- '),
    )
    for pattern, replacement in unpadded_rules:
        text = re.sub(pattern, replacement, text)

    # Pad both ends with a space so the remaining rules can rely on a
    # space following (or preceding) every token.
    text = " " + text + " "

    padded_rules = (
        # ending quotes
        (r'"', " '' "),
        (r'(\S)(\'\')', r'\1 \2 '),
        # clitics: 's 'm 'd ' and 'll 're 've n't
        (r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 "),
        (r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 "),
    )
    for pattern, replacement in padded_rules:
        text = re.sub(pattern, replacement, text)

    # Two-piece contractions.  CONTRACTIONS4 is deliberately not applied:
    # the corresponding rules are also commented out in the SED scripts.
    for contraction_group in (CONTRACTIONS2, CONTRACTIONS3):
        for splitter in contraction_group:
            text = splitter.sub(r' \1 \2 ', text)

    return text.split()


cdef class PennTreebank3(Language):
    '''Language subclass whose find_substrings uses nltk_regex_tokenize.'''
    cpdef list find_substrings(self, unicode chunk):
        '''Split chunk into a list of substrings via nltk_regex_tokenize.

        If the final substring is a lone '.', it is removed and appended to
        the substring before it, so a trailing period stays attached to the
        preceding piece.  When the period is the only substring (e.g. the
        chunk is "."), it is returned as-is.

        Raises AssertionError if the chunk produces no substrings at all.
        '''
        strings = nltk_regex_tokenize(chunk)
        # Only merge the period when there is a preceding substring to merge
        # it into.  Without the length check, an empty result or a result of
        # just ['.'] raised IndexError before the assertion below could run.
        if len(strings) >= 2 and strings[-1] == '.':
            strings.pop()
            strings[-1] += '.'
        assert strings
        return strings
    

# Module-level PennTreebank3 instance, constructed with the name 'ptb3'.  The
# module-level tokenize/lookup/unhash functions forward to it.
cdef PennTreebank3 PTB3 = PennTreebank3('ptb3')

cpdef list tokenize(unicode string):
    '''Return PTB3.tokenize(string), using the module-level PTB3 instance.

    NOTE(review): tokenize is not defined on PennTreebank3 in this file;
    presumably it is inherited from Language -- confirm there.
    '''
    return PTB3.tokenize(string)


cpdef Word lookup(unicode string):
    '''Return PTB3.lookup(string), using the module-level PTB3 instance.

    NOTE(review): lookup is not defined on PennTreebank3 in this file;
    presumably it is inherited from Language -- confirm there.
    '''
    return PTB3.lookup(string)


cpdef unicode unhash(StringHash hash_value):
    '''Return PTB3.unhash(hash_value), using the module-level PTB3 instance.

    NOTE(review): unhash is not defined on PennTreebank3 in this file;
    presumably it is inherited from Language -- confirm there.
    '''
    return PTB3.unhash(hash_value)
* Begin work on full PTB-compatible English tokenization 2014-07-07 06:29:24 +04:00			`'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,`
			`so that strings can be retrieved from hashes. Use 64-bit hash values and`
			`boldly assume no collisions.`
			`'''`
			`from __future__ import unicode_literals`

* Refactor for string view features. Working on setting up flags and enums. 2014-07-07 18:58:48 +04:00
* Begin work on full PTB-compatible English tokenization 2014-07-07 06:29:24 +04:00			`from libc.stdlib cimport malloc, calloc, free`
			`from libc.stdint cimport uint64_t`


			`cimport spacy`

* Fix ptb3 module 2014-08-22 18:35:48 +04:00			`import re`
* Begin work on full PTB-compatible English tokenization 2014-07-07 06:29:24 +04:00
* Fix ptb3 module 2014-08-22 18:35:48 +04:00			`# List of contractions adapted from Robert MacIntyre's tokenizer.`
			`CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),`
			`re.compile(r"(?i)\b(d)('ye)\b"),`
			`re.compile(r"(?i)\b(gim)(me)\b"),`
			`re.compile(r"(?i)\b(gon)(na)\b"),`
			`re.compile(r"(?i)\b(got)(ta)\b"),`
			`re.compile(r"(?i)\b(lem)(me)\b"),`
			`re.compile(r"(?i)\b(mor)('n)\b"),`
			`re.compile(r"(?i)\b(wan)(na) ")]`

			`CONTRACTIONS3 = [re.compile(r"(?i) ('t)(is)\b"),`
			`re.compile(r"(?i) ('t)(was)\b")]`

			`CONTRACTIONS4 = [re.compile(r"(?i)\b(whad)(dd)(ya)\b"),`
			`re.compile(r"(?i)\b(wha)(t)(cha)\b")]`

			`def nltk_regex_tokenize(text):`
			`# Implementation taken from NLTK 3.0, based on tokenizer.sed`

			`#starting quotes`
			text = re.sub(r'^\"', r'``', text)
			text = re.sub(r'(``)', r' \1 ', text)
			text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)

			`#punctuation`
			`text = re.sub(r'([:,])([^\d])', r' \1 \2', text)`
			`text = re.sub(r'\.\.\.', r' ... ', text)`
			`text = re.sub(r'[;@#$%&]', r' \g<0> ', text)`
			`text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)`
			`text = re.sub(r'[?!]', r' \g<0> ', text)`

			`text = re.sub(r"([^'])' ", r"\1 ' ", text)`

			`#parens, brackets, etc.`
			`text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)`
			`text = re.sub(r'--', r' -- ', text)`

			`#add extra space to make things easier`
			`text = " " + text + " "`

			`#ending quotes`
			`text = re.sub(r'"', " '' ", text)`
			`text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)`

			`text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)`
			`text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ",`
			`text)`

			`for regexp in CONTRACTIONS2:`
			`text = regexp.sub(r' \1 \2 ', text)`
			`for regexp in CONTRACTIONS3:`
			`text = regexp.sub(r' \1 \2 ', text)`

			`# We are not using CONTRACTIONS4 since`
			`# they are also commented out in the SED scripts`
			`# for regexp in self.CONTRACTIONS4:`
			`# text = regexp.sub(r' \1 \2 \3 ', text)`

			`return text.split()`


			`cdef class PennTreebank3(Language):`
			`cpdef list find_substrings(self, unicode chunk):`
			`strings = nltk_regex_tokenize(chunk)`
* Refactor around Word objects, adapting tests. Tests passing, except for string views. 2014-08-23 21:55:06 +04:00			`if strings[-1] == '.':`
			`strings.pop()`
			`strings[-1] += '.'`
* Fix ptb3 module 2014-08-22 18:35:48 +04:00			`assert strings`
			`return strings`


			`cdef PennTreebank3 PTB3 = PennTreebank3('ptb3')`
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 14:47:21 +04:00
* Refactor around Word objects, adapting tests. Tests passing, except for string views. 2014-08-23 21:55:06 +04:00			`cpdef list tokenize(unicode string):`
* Fix ptb3 module 2014-08-22 18:35:48 +04:00			`return PTB3.tokenize(string)`
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 14:47:21 +04:00

* Refactor around Word objects, adapting tests. Tests passing, except for string views. 2014-08-23 21:55:06 +04:00			`cpdef Word lookup(unicode string):`
			`return PTB3.lookup(string)`
* Reorganized code to accomodate Tokens class. Need string views before group_by and count_by can be done well. 2014-07-07 14:47:21 +04:00

			`cpdef unicode unhash(StringHash hash_value):`
* Fix ptb3 module 2014-08-22 18:35:48 +04:00			`return PTB3.unhash(hash_value)`