spaCy/spacy/ptb3.pyx

'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
so that strings can be retrieved from hashes.  Use 64-bit hash values and
boldly assume no collisions.
'''
from __future__ import unicode_literals


from libc.stdlib cimport malloc, calloc, free
from libc.stdint cimport uint64_t


cimport spacy

import re

from spacy import orth

# Thresholds handed to the orth flag factories in PennTreebank3.__cinit__:
# TAG_THRESH goes to orth.can_tag, the other three to orth.oft_case for the
# 'upper', 'lower' and 'title' cases respectively.
TAG_THRESH = 0.5
UPPER_THRESH = 0.2
LOWER_THRESH = 0.5
TITLE_THRESH = 0.7

# Total number of flag ids. Ids are consecutive integers starting at 0 and are
# used as indices into the flag_funcs list built in PennTreebank3.__cinit__.
# The unpacking below fails at import time if the name list and the count
# ever get out of step.
NR_FLAGS = 24

(
    # Slots filled from orth.oft_case
    OFT_UPPER,
    OFT_LOWER,
    OFT_TITLE,

    # Slots for the orth.is_* predicates
    IS_ALPHA,
    IS_DIGIT,
    IS_PUNCT,
    IS_SPACE,
    IS_ASCII,
    IS_TITLE,
    IS_LOWER,
    IS_UPPER,

    # Slots filled from orth.can_tag
    CAN_PUNCT,
    CAN_CONJ,
    CAN_NUM,
    CAN_DET,
    CAN_ADP,
    CAN_ADJ,
    CAN_ADV,
    CAN_VERB,
    CAN_NOUN,
    CAN_PDT,
    CAN_POS,
    CAN_PRON,
    CAN_PRT,
) = range(NR_FLAGS)

# List of contractions adapted from Robert MacIntyre's tokenizer.
# Every pattern captures the pieces of a fused word form as separate groups,
# so the tokenizer can re-emit them with spaces in between.
CONTRACTIONS2 = list(map(re.compile, (
    r"(?i)\b(can)(not)\b",
    r"(?i)\b(d)('ye)\b",
    r"(?i)\b(gim)(me)\b",
    r"(?i)\b(gon)(na)\b",
    r"(?i)\b(got)(ta)\b",
    r"(?i)\b(lem)(me)\b",
    r"(?i)\b(mor)('n)\b",
    r"(?i)\b(wan)(na) ",
)))

CONTRACTIONS3 = list(map(re.compile, (
    r"(?i) ('t)(is)\b",
    r"(?i) ('t)(was)\b",
)))

# Three-piece contractions. Defined for completeness only; the tokenizer
# below does not apply them.
CONTRACTIONS4 = list(map(re.compile, (
    r"(?i)\b(whad)(dd)(ya)\b",
    r"(?i)\b(wha)(t)(cha)\b",
)))


def nltk_regex_tokenize(text):
    """Split text into Penn Treebank style tokens.

    Implementation taken from NLTK 3.0, based on tokenizer.sed. A fixed,
    ordered series of regex substitutions inserts spaces at token
    boundaries, and the rewritten text is finally split on whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        list: The tokens in order of appearance. A double quote at the start
            of the text or right after a space or opening bracket comes back
            as ``; any other double quote comes back as ''.
    """
    # Rules are order-sensitive: later patterns rely on the spacing that
    # earlier ones introduce.
    rules_before_padding = (
        # starting quotes
        (r'^\"', r'``'),
        (r'(``)', r' \1 '),
        (r'([ (\[{<])"', r'\1 `` '),
        # punctuation
        (r'([:,])([^\d])', r' \1 \2'),
        (r'\.\.\.', r' ... '),
        (r'[;@#$%&]', r' \g<0> '),
        (r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 '),
        (r'[?!]', r' \g<0> '),
        (r"([^'])' ", r"\1 ' "),
        # parens, brackets, etc.
        (r'[\]\[\(\)\{\}\<\>]', r' \g<0> '),
        (r'--', r' -- '),
    )
    rules_after_padding = (
        # ending quotes
        (r'"', " '' "),
        (r'(\S)(\'\')', r'\1 \2 '),
        # clitics such as 's, 'm, 'd, 'll, 're, 've and n't
        (r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 "),
        (r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 "),
    )

    for pattern, replacement in rules_before_padding:
        text = re.sub(pattern, replacement, text)

    # Surround the text with spaces so the remaining rules can anchor on a
    # space at either end.
    text = " " + text + " "

    for pattern, replacement in rules_after_padding:
        text = re.sub(pattern, replacement, text)

    # CONTRACTIONS4 is deliberately left out: the SED scripts this is based
    # on have those rules commented out as well.
    for contraction in CONTRACTIONS2 + CONTRACTIONS3:
        text = contraction.sub(r' \1 \2 ', text)

    return text.split()


cdef class PennTreebank3(Language):
    """Fully PTB compatible English tokenizer, tightly coupled to lexicon.

    Chunks are split with nltk_regex_tokenize; a sentence-final '.' token is
    then re-attached to the token before it (see _split).

    Attributes:
        name (unicode): Identifier forwarded to Language.__init__. The
            module-level instance is created with 'ptb3'.
        lexicon (Lexicon): The lexicon. Exposes the lookup method.
            NOTE(review): not assigned in this class; presumably provided by
            Language -- confirm.
    """

    def __cinit__(self, name):
        # One slot per flag id, indexed by the module-level flag constants.
        # 0 is the placeholder for a slot with no function registered.
        flag_funcs = [0] * NR_FLAGS

        # NOTE(review): IS_ASCII, CAN_ADV and CAN_PRON are never assigned
        # below, so their slots keep the placeholder 0. Confirm whether that
        # is intentional and how Language treats a 0 entry.

        flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
        flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
        flag_funcs[OFT_TITLE] = orth.oft_case('title', TITLE_THRESH)

        flag_funcs[IS_ALPHA] = orth.is_alpha
        flag_funcs[IS_DIGIT] = orth.is_digit
        flag_funcs[IS_PUNCT] = orth.is_punct
        flag_funcs[IS_SPACE] = orth.is_space
        flag_funcs[IS_TITLE] = orth.is_title
        flag_funcs[IS_LOWER] = orth.is_lower
        flag_funcs[IS_UPPER] = orth.is_upper

        flag_funcs[CAN_PUNCT] = orth.can_tag('PUNCT', TAG_THRESH)
        flag_funcs[CAN_CONJ] = orth.can_tag('CONJ', TAG_THRESH)
        flag_funcs[CAN_NUM] = orth.can_tag('NUM', TAG_THRESH)
        flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
        flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
        flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
        flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
        flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
        flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
        flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
        flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)

        Language.__init__(self, name, flag_funcs)


    cdef list _split(self, unicode chunk):
        """Tokenize one chunk of text into a list of token strings.

        Args:
            chunk (unicode): The text to split. It must produce at least one
                token.

        Returns:
            list: The tokens from nltk_regex_tokenize, except that a final
                '.' token is merged back onto the token preceding it. A chunk
                whose only token is '.' is returned as ['.'].

        Raises:
            AssertionError: If the chunk yields no tokens at all.
        """
        strings = nltk_regex_tokenize(chunk)
        # Check the invariant before indexing, so an empty result fails here
        # rather than with an IndexError on strings[-1].
        assert strings
        # Re-attach a trailing period to the previous token. This needs at
        # least two tokens: with a lone '.', popping would empty the list and
        # the following strings[-1] would raise IndexError.
        if len(strings) >= 2 and strings[-1] == '.':
            strings.pop()
            strings[-1] += '.'
        return strings
    

# Module-level tokenizer instance, constructed at import time with the
# name 'ptb3'.
PTB3 = PennTreebank3('ptb3')
* Begin work on full PTB-compatible English tokenization 2014-07-07 06:29:24 +04:00			`'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,`
			`so that strings can be retrieved from hashes. Use 64-bit hash values and`
			`boldly assume no collisions.`
			`'''`
			`from __future__ import unicode_literals`

* Refactor for string view features. Working on setting up flags and enums. 2014-07-07 18:58:48 +04:00
* Begin work on full PTB-compatible English tokenization 2014-07-07 06:29:24 +04:00			`from libc.stdlib cimport malloc, calloc, free`
			`from libc.stdint cimport uint64_t`


			`cimport spacy`

* Fix ptb3 module 2014-08-22 18:35:48 +04:00			`import re`
* Begin work on full PTB-compatible English tokenization 2014-07-07 06:29:24 +04:00
* Adding PTB3 tokenizer back in, so can understand how much boilerplate is in the docs for multiple tokenizers 2014-08-29 04:30:27 +04:00			`from spacy import orth`

			`TAG_THRESH = 0.5`
			`UPPER_THRESH = 0.2`
			`LOWER_THRESH = 0.5`
			`TITLE_THRESH = 0.7`

			`NR_FLAGS = 0`

			`OFT_UPPER = NR_FLAGS; NR_FLAGS += 1`
			`OFT_LOWER = NR_FLAGS; NR_FLAGS += 1`
			`OFT_TITLE = NR_FLAGS; NR_FLAGS += 1`

			`IS_ALPHA = NR_FLAGS; NR_FLAGS += 1`
			`IS_DIGIT = NR_FLAGS; NR_FLAGS += 1`
			`IS_PUNCT = NR_FLAGS; NR_FLAGS += 1`
			`IS_SPACE = NR_FLAGS; NR_FLAGS += 1`
			`IS_ASCII = NR_FLAGS; NR_FLAGS += 1`
			`IS_TITLE = NR_FLAGS; NR_FLAGS += 1`
			`IS_LOWER = NR_FLAGS; NR_FLAGS += 1`
			`IS_UPPER = NR_FLAGS; NR_FLAGS += 1`

			`CAN_PUNCT = NR_FLAGS; NR_FLAGS += 1`
			`CAN_CONJ = NR_FLAGS; NR_FLAGS += 1`
			`CAN_NUM = NR_FLAGS; NR_FLAGS += 1`
			`CAN_DET = NR_FLAGS; NR_FLAGS += 1`
			`CAN_ADP = NR_FLAGS; NR_FLAGS += 1`
			`CAN_ADJ = NR_FLAGS; NR_FLAGS += 1`
			`CAN_ADV = NR_FLAGS; NR_FLAGS += 1`
			`CAN_VERB = NR_FLAGS; NR_FLAGS += 1`
			`CAN_NOUN = NR_FLAGS; NR_FLAGS += 1`
			`CAN_PDT = NR_FLAGS; NR_FLAGS += 1`
			`CAN_POS = NR_FLAGS; NR_FLAGS += 1`
			`CAN_PRON = NR_FLAGS; NR_FLAGS += 1`
			`CAN_PRT = NR_FLAGS; NR_FLAGS += 1`


* Fix ptb3 module 2014-08-22 18:35:48 +04:00			`# List of contractions adapted from Robert MacIntyre's tokenizer.`
			`CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),`
			`re.compile(r"(?i)\b(d)('ye)\b"),`
			`re.compile(r"(?i)\b(gim)(me)\b"),`
			`re.compile(r"(?i)\b(gon)(na)\b"),`
			`re.compile(r"(?i)\b(got)(ta)\b"),`
			`re.compile(r"(?i)\b(lem)(me)\b"),`
			`re.compile(r"(?i)\b(mor)('n)\b"),`
			`re.compile(r"(?i)\b(wan)(na) ")]`

			`CONTRACTIONS3 = [re.compile(r"(?i) ('t)(is)\b"),`
			`re.compile(r"(?i) ('t)(was)\b")]`

			`CONTRACTIONS4 = [re.compile(r"(?i)\b(whad)(dd)(ya)\b"),`
			`re.compile(r"(?i)\b(wha)(t)(cha)\b")]`

			`def nltk_regex_tokenize(text):`
			`# Implementation taken from NLTK 3.0, based on tokenizer.sed`

			`#starting quotes`
			text = re.sub(r'^\"', r'``', text)
			text = re.sub(r'(``)', r' \1 ', text)
			text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)

			`#punctuation`
			`text = re.sub(r'([:,])([^\d])', r' \1 \2', text)`
			`text = re.sub(r'\.\.\.', r' ... ', text)`
			`text = re.sub(r'[;@#$%&]', r' \g<0> ', text)`
			`text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)`
			`text = re.sub(r'[?!]', r' \g<0> ', text)`

			`text = re.sub(r"([^'])' ", r"\1 ' ", text)`

			`#parens, brackets, etc.`
			`text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)`
			`text = re.sub(r'--', r' -- ', text)`

			`#add extra space to make things easier`
			`text = " " + text + " "`

			`#ending quotes`
			`text = re.sub(r'"', " '' ", text)`
			`text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)`

			`text = re.sub(r"([^' ])('[sS]\|'[mM]\|'[dD]\|') ", r"\1 \2 ", text)`
			`text = re.sub(r"([^' ])('ll\|'LL\|'re\|'RE\|'ve\|'VE\|n't\|N'T) ", r"\1 \2 ",`
			`text)`

			`for regexp in CONTRACTIONS2:`
			`text = regexp.sub(r' \1 \2 ', text)`
			`for regexp in CONTRACTIONS3:`
			`text = regexp.sub(r' \1 \2 ', text)`

			`# We are not using CONTRACTIONS4 since`
			`# they are also commented out in the SED scripts`
			`# for regexp in self.CONTRACTIONS4:`
			`# text = regexp.sub(r' \1 \2 \3 ', text)`

			`return text.split()`


			`cdef class PennTreebank3(Language):`
* Adding PTB3 tokenizer back in, so can understand how much boilerplate is in the docs for multiple tokenizers 2014-08-29 04:30:27 +04:00			`"""Fully PTB compatible English tokenizer, tightly coupled to lexicon.`

			`Attributes:`
			`name (unicode): The two letter code used by Wikipedia for the language.`
			`lexicon (Lexicon): The lexicon. Exposes the lookup method.`
			`"""`


			`def __cinit__(self, name):`
			`flag_funcs = [0 for _ in range(NR_FLAGS)]`

			`flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)`
			`flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)`
			`flag_funcs[OFT_TITLE] = orth.oft_case('title', TITLE_THRESH)`

			`flag_funcs[IS_ALPHA] = orth.is_alpha`
			`flag_funcs[IS_DIGIT] = orth.is_digit`
			`flag_funcs[IS_PUNCT] = orth.is_punct`
			`flag_funcs[IS_SPACE] = orth.is_space`
			`flag_funcs[IS_TITLE] = orth.is_title`
			`flag_funcs[IS_LOWER] = orth.is_lower`
			`flag_funcs[IS_UPPER] = orth.is_upper`

			`flag_funcs[CAN_PUNCT] = orth.can_tag('PUNCT', TAG_THRESH)`
			`flag_funcs[CAN_CONJ] = orth.can_tag('CONJ', TAG_THRESH)`
			`flag_funcs[CAN_NUM] = orth.can_tag('NUM', TAG_THRESH)`
			`flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)`
			`flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)`
			`flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)`
			`flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)`
			`flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)`
			`flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)`
			`flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)`
			`flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)`

			`Language.__init__(self, name, flag_funcs)`


			`cdef list _split(self, unicode chunk):`
* Fix ptb3 module 2014-08-22 18:35:48 +04:00			`strings = nltk_regex_tokenize(chunk)`
* Refactor around Word objects, adapting tests. Tests passing, except for string views. 2014-08-23 21:55:06 +04:00			`if strings[-1] == '.':`
			`strings.pop()`
			`strings[-1] += '.'`
* Fix ptb3 module 2014-08-22 18:35:48 +04:00			`assert strings`
			`return strings`


* Adding PTB3 tokenizer back in, so can understand how much boilerplate is in the docs for multiple tokenizers 2014-08-29 04:30:27 +04:00			`PTB3 = PennTreebank3('ptb3')`