mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
* Adding PTB3 tokenizer back in, so can understand how much boilerplate is in the docs for multiple tokenizers
This commit is contained in:
parent
45a22d6b2c
commit
5233f110c4
|
@ -1,14 +1,5 @@
|
||||||
from spacy.spacy cimport Language
|
from spacy.lang cimport Language
|
||||||
from spacy.lexeme cimport StringHash
|
|
||||||
from spacy.word cimport Word
|
|
||||||
|
|
||||||
|
|
||||||
cdef class PennTreebank3(Language):
|
cdef class PennTreebank3(Language):
|
||||||
cpdef list find_substrings(self, unicode word)
|
cdef list _split(self, unicode split)
|
||||||
|
|
||||||
|
|
||||||
cdef PennTreebank3 PTB3
|
|
||||||
|
|
||||||
cpdef Word lookup(unicode word)
|
|
||||||
cpdef list tokenize(unicode string)
|
|
||||||
cpdef unicode unhash(StringHash hash_value)
|
|
||||||
|
|
|
@ -13,6 +13,43 @@ cimport spacy
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
from spacy import orth
|
||||||
|
|
||||||
|
TAG_THRESH = 0.5
|
||||||
|
UPPER_THRESH = 0.2
|
||||||
|
LOWER_THRESH = 0.5
|
||||||
|
TITLE_THRESH = 0.7
|
||||||
|
|
||||||
|
NR_FLAGS = 0
|
||||||
|
|
||||||
|
OFT_UPPER = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
OFT_LOWER = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
OFT_TITLE = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
|
||||||
|
IS_ALPHA = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
IS_DIGIT = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
IS_PUNCT = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
IS_SPACE = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
IS_ASCII = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
IS_TITLE = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
IS_LOWER = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
IS_UPPER = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
|
||||||
|
CAN_PUNCT = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
CAN_CONJ = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
CAN_NUM = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
CAN_DET = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
CAN_ADP = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
CAN_ADJ = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
CAN_ADV = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
CAN_VERB = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
CAN_NOUN = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
CAN_PDT = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
CAN_POS = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
CAN_PRON = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
CAN_PRT = NR_FLAGS; NR_FLAGS += 1
|
||||||
|
|
||||||
|
|
||||||
# List of contractions adapted from Robert MacIntyre's tokenizer.
|
# List of contractions adapted from Robert MacIntyre's tokenizer.
|
||||||
CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),
|
CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),
|
||||||
re.compile(r"(?i)\b(d)('ye)\b"),
|
re.compile(r"(?i)\b(d)('ye)\b"),
|
||||||
|
@ -75,7 +112,45 @@ def nltk_regex_tokenize(text):
|
||||||
|
|
||||||
|
|
||||||
cdef class PennTreebank3(Language):
|
cdef class PennTreebank3(Language):
|
||||||
cpdef list find_substrings(self, unicode chunk):
|
"""Fully PTB compatible English tokenizer, tightly coupled to lexicon.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
name (unicode): The two letter code used by Wikipedia for the language.
|
||||||
|
lexicon (Lexicon): The lexicon. Exposes the lookup method.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def __cinit__(self, name):
|
||||||
|
flag_funcs = [0 for _ in range(NR_FLAGS)]
|
||||||
|
|
||||||
|
flag_funcs[OFT_UPPER] = orth.oft_case('upper', UPPER_THRESH)
|
||||||
|
flag_funcs[OFT_LOWER] = orth.oft_case('lower', LOWER_THRESH)
|
||||||
|
flag_funcs[OFT_TITLE] = orth.oft_case('title', TITLE_THRESH)
|
||||||
|
|
||||||
|
flag_funcs[IS_ALPHA] = orth.is_alpha
|
||||||
|
flag_funcs[IS_DIGIT] = orth.is_digit
|
||||||
|
flag_funcs[IS_PUNCT] = orth.is_punct
|
||||||
|
flag_funcs[IS_SPACE] = orth.is_space
|
||||||
|
flag_funcs[IS_TITLE] = orth.is_title
|
||||||
|
flag_funcs[IS_LOWER] = orth.is_lower
|
||||||
|
flag_funcs[IS_UPPER] = orth.is_upper
|
||||||
|
|
||||||
|
flag_funcs[CAN_PUNCT] = orth.can_tag('PUNCT', TAG_THRESH)
|
||||||
|
flag_funcs[CAN_CONJ] = orth.can_tag('CONJ', TAG_THRESH)
|
||||||
|
flag_funcs[CAN_NUM] = orth.can_tag('NUM', TAG_THRESH)
|
||||||
|
flag_funcs[CAN_DET] = orth.can_tag('DET', TAG_THRESH)
|
||||||
|
flag_funcs[CAN_ADP] = orth.can_tag('ADP', TAG_THRESH)
|
||||||
|
flag_funcs[CAN_ADJ] = orth.can_tag('ADJ', TAG_THRESH)
|
||||||
|
flag_funcs[CAN_VERB] = orth.can_tag('VERB', TAG_THRESH)
|
||||||
|
flag_funcs[CAN_NOUN] = orth.can_tag('NOUN', TAG_THRESH)
|
||||||
|
flag_funcs[CAN_PDT] = orth.can_tag('PDT', TAG_THRESH)
|
||||||
|
flag_funcs[CAN_POS] = orth.can_tag('POS', TAG_THRESH)
|
||||||
|
flag_funcs[CAN_PRT] = orth.can_tag('PRT', TAG_THRESH)
|
||||||
|
|
||||||
|
Language.__init__(self, name, flag_funcs)
|
||||||
|
|
||||||
|
|
||||||
|
cdef list _split(self, unicode chunk):
|
||||||
strings = nltk_regex_tokenize(chunk)
|
strings = nltk_regex_tokenize(chunk)
|
||||||
if strings[-1] == '.':
|
if strings[-1] == '.':
|
||||||
strings.pop()
|
strings.pop()
|
||||||
|
@ -84,15 +159,4 @@ cdef class PennTreebank3(Language):
|
||||||
return strings
|
return strings
|
||||||
|
|
||||||
|
|
||||||
cdef PennTreebank3 PTB3 = PennTreebank3('ptb3')
|
PTB3 = PennTreebank3('ptb3')
|
||||||
|
|
||||||
cpdef list tokenize(unicode string):
|
|
||||||
return PTB3.tokenize(string)
|
|
||||||
|
|
||||||
|
|
||||||
cpdef Word lookup(unicode string):
|
|
||||||
return PTB3.lookup(string)
|
|
||||||
|
|
||||||
|
|
||||||
cpdef unicode unhash(StringHash hash_value):
|
|
||||||
return PTB3.unhash(hash_value)
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user