mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
* Adding the PTB3 tokenizer back in, so we can understand how much boilerplate is in the docs for multiple tokenizers
This commit is contained in:
parent
45a22d6b2c
commit
5233f110c4
|
@ -1,14 +1,5 @@
|
|||
from spacy.spacy cimport Language
|
||||
from spacy.lexeme cimport StringHash
|
||||
from spacy.word cimport Word
|
||||
from spacy.lang cimport Language
|
||||
|
||||
|
||||
cdef class PennTreebank3(Language):
|
||||
cpdef list find_substrings(self, unicode word)
|
||||
|
||||
|
||||
cdef PennTreebank3 PTB3
|
||||
|
||||
cpdef Word lookup(unicode word)
|
||||
cpdef list tokenize(unicode string)
|
||||
cpdef unicode unhash(StringHash hash_value)
|
||||
cdef list _split(self, unicode split)
|
||||
|
|
|
@ -13,6 +13,43 @@ cimport spacy
|
|||
|
||||
import re
|
||||
|
||||
from spacy import orth
|
||||
|
||||
TAG_THRESH = 0.5
|
||||
UPPER_THRESH = 0.2
|
||||
LOWER_THRESH = 0.5
|
||||
TITLE_THRESH = 0.7
|
||||
|
||||
NR_FLAGS = 0
|
||||
|
||||
OFT_UPPER = NR_FLAGS; NR_FLAGS += 1
|
||||
OFT_LOWER = NR_FLAGS; NR_FLAGS += 1
|
||||
OFT_TITLE = NR_FLAGS; NR_FLAGS += 1
|
||||
|
||||
IS_ALPHA = NR_FLAGS; NR_FLAGS += 1
|
||||
IS_DIGIT = NR_FLAGS; NR_FLAGS += 1
|
||||
IS_PUNCT = NR_FLAGS; NR_FLAGS += 1
|
||||
IS_SPACE = NR_FLAGS; NR_FLAGS += 1
|
||||
IS_ASCII = NR_FLAGS; NR_FLAGS += 1
|
||||
IS_TITLE = NR_FLAGS; NR_FLAGS += 1
|
||||
IS_LOWER = NR_FLAGS; NR_FLAGS += 1
|
||||
IS_UPPER = NR_FLAGS; NR_FLAGS += 1
|
||||
|
||||
CAN_PUNCT = NR_FLAGS; NR_FLAGS += 1
|
||||
CAN_CONJ = NR_FLAGS; NR_FLAGS += 1
|
||||
CAN_NUM = NR_FLAGS; NR_FLAGS += 1
|
||||
CAN_DET = NR_FLAGS; NR_FLAGS += 1
|
||||
CAN_ADP = NR_FLAGS; NR_FLAGS += 1
|
||||
CAN_ADJ = NR_FLAGS; NR_FLAGS += 1
|
||||
CAN_ADV = NR_FLAGS; NR_FLAGS += 1
|
||||
CAN_VERB = NR_FLAGS; NR_FLAGS += 1
|
||||
CAN_NOUN = NR_FLAGS; NR_FLAGS += 1
|
||||
CAN_PDT = NR_FLAGS; NR_FLAGS += 1
|
||||
CAN_POS = NR_FLAGS; NR_FLAGS += 1
|
||||
CAN_PRON = NR_FLAGS; NR_FLAGS += 1
|
||||
CAN_PRT = NR_FLAGS; NR_FLAGS += 1
|
||||
|
||||
|
||||
# List of contractions adapted from Robert MacIntyre's tokenizer.
|
||||
CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),
|
||||
re.compile(r"(?i)\b(d)('ye)\b"),
|
||||
|
@ -75,7 +112,45 @@ def nltk_regex_tokenize(text):
|
|||
|
||||
|
||||
cdef class PennTreebank3(Language):
|
||||
cpdef list find_substrings(self, unicode chunk):
|
||||
"""Fully PTB compatible English tokenizer, tightly coupled to lexicon.
|
||||
|
||||
Attributes:
|
||||
name (unicode): The two letter code used by Wikipedia for the language.
|
||||
lexicon (Lexicon): The lexicon. Exposes the lookup method.
|
||||
"""
|
||||
|
||||
|
||||
def __cinit__(self, name):
|
||||
flag_funcs = [0 for _ in range(NR_FLAGS)]
|
||||
|
||||
flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
|
||||
flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
|
||||
flag_funcs[OFT_TITLE] = orth.oft_case('title', TITLE_THRESH)
|
||||
|
||||
flag_funcs[IS_ALPHA] = orth.is_alpha
|
||||
flag_funcs[IS_DIGIT] = orth.is_digit
|
||||
flag_funcs[IS_PUNCT] = orth.is_punct
|
||||
flag_funcs[IS_SPACE] = orth.is_space
|
||||
flag_funcs[IS_TITLE] = orth.is_title
|
||||
flag_funcs[IS_LOWER] = orth.is_lower
|
||||
flag_funcs[IS_UPPER] = orth.is_upper
|
||||
|
||||
flag_funcs[CAN_PUNCT] = orth.can_tag('PUNCT', TAG_THRESH)
|
||||
flag_funcs[CAN_CONJ] = orth.can_tag('CONJ', TAG_THRESH)
|
||||
flag_funcs[CAN_NUM] = orth.can_tag('NUM', TAG_THRESH)
|
||||
flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
|
||||
flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
|
||||
flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
|
||||
flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
|
||||
flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
|
||||
flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
|
||||
flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
|
||||
flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)
|
||||
|
||||
Language.__init__(self, name, flag_funcs)
|
||||
|
||||
|
||||
cdef list _split(self, unicode chunk):
|
||||
strings = nltk_regex_tokenize(chunk)
|
||||
if strings[-1] == '.':
|
||||
strings.pop()
|
||||
|
@ -84,15 +159,4 @@ cdef class PennTreebank3(Language):
|
|||
return strings
|
||||
|
||||
|
||||
cdef PennTreebank3 PTB3 = PennTreebank3('ptb3')
|
||||
|
||||
cpdef list tokenize(unicode string):
|
||||
return PTB3.tokenize(string)
|
||||
|
||||
|
||||
cpdef Word lookup(unicode string):
|
||||
return PTB3.lookup(string)
|
||||
|
||||
|
||||
cpdef unicode unhash(StringHash hash_value):
|
||||
return PTB3.unhash(hash_value)
|
||||
PTB3 = PennTreebank3('ptb3')
|
||||
|
|
Loading…
Reference in New Issue
Block a user