Merge branch 'master' of github.com:honnibal/spaCy

This commit is contained in:
Henning Peters 2016-02-04 17:37:33 +01:00
commit fc19a4a153
11 changed files with 106 additions and 5 deletions

View File

@ -85,3 +85,11 @@ cpdef enum attr_id_t:
HEAD
SPACY
PROB
# Move these up to FLAG14--FLAG17 once we finish the functionality and
# are ready to regenerate the model
#IS_BRACKET
#IS_QUOTE
#IS_LEFT_PUNCT
#IS_RIGHT_PUNCT

View File

@ -13,7 +13,6 @@ IDS = {
"LIKE_EMAIL": LIKE_EMAIL,
"IS_STOP": IS_STOP,
"IS_OOV": IS_OOV,
"FLAG14": FLAG14,
"FLAG15": FLAG15,
"FLAG16": FLAG16,

View File

@ -82,6 +82,22 @@ class Language(object):
def is_title(string):
return orth.is_title(string)
# Thin wrapper: passes `string` straight to orth.is_bracket and returns its result.
# This class maps attrs.FLAG14 to this function.
@staticmethod
def is_bracket(string):
return orth.is_bracket(string)
# Thin wrapper: passes `string` straight to orth.is_quote and returns its result.
# This class maps attrs.FLAG15 to this function.
@staticmethod
def is_quote(string):
return orth.is_quote(string)
# Thin wrapper: passes `string` straight to orth.is_left_punct and returns its result.
# This class maps attrs.FLAG16 to this function.
@staticmethod
def is_left_punct(string):
return orth.is_left_punct(string)
# Thin wrapper: passes `string` straight to orth.is_right_punct and returns its result.
# This class maps attrs.FLAG17 to this function.
@staticmethod
def is_right_punct(string):
return orth.is_right_punct(string)
# Thin wrapper: passes `string` straight to orth.is_upper and returns its result.
# This class maps attrs.IS_UPPER to this function.
@staticmethod
def is_upper(string):
return orth.is_upper(string)
@ -121,6 +137,10 @@ class Language(object):
attrs.IS_SPACE: cls.is_space,
attrs.IS_TITLE: cls.is_title,
attrs.IS_UPPER: cls.is_upper,
attrs.FLAG14: cls.is_bracket,
attrs.FLAG15: cls.is_quote,
attrs.FLAG16: cls.is_left_punct,
attrs.FLAG17: cls.is_right_punct,
attrs.LIKE_URL: cls.like_url,
attrs.LIKE_NUM: cls.like_num,
attrs.LIKE_EMAIL: cls.like_email,

View File

@ -18,6 +18,10 @@ import numpy
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport FLAG14 as IS_BRACKET
from .attrs cimport FLAG15 as IS_QUOTE
from .attrs cimport FLAG16 as IS_LEFT_PUNCT
from .attrs cimport FLAG17 as IS_RIGHT_PUNCT
from .attrs cimport IS_OOV
@ -183,6 +187,23 @@ cdef class Lexeme:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_SPACE)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_SPACE, x)
# Read/write flag on the lexeme. IS_BRACKET is this module's cimport alias for attrs.FLAG14.
# The getter checks the flag via Lexeme.c_check_flag; the setter writes it via Lexeme.c_set_flag.
property is_bracket:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_BRACKET)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_BRACKET, x)
# Read/write flag on the lexeme. IS_QUOTE is this module's cimport alias for attrs.FLAG15.
# The getter checks the flag via Lexeme.c_check_flag; the setter writes it via Lexeme.c_set_flag.
property is_quote:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_QUOTE)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_QUOTE, x)
# Read/write flag on the lexeme. IS_LEFT_PUNCT is this module's cimport alias for attrs.FLAG16.
# The getter checks the flag via Lexeme.c_check_flag; the setter writes it via Lexeme.c_set_flag.
property is_left_punct:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
# Read/write flag on the lexeme. IS_RIGHT_PUNCT is this module's cimport alias for attrs.FLAG17.
# The getter checks the flag via Lexeme.c_check_flag; the setter writes it via Lexeme.c_set_flag.
property is_right_punct:
def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
# Read/write LIKE_URL flag on the lexeme.
# The getter checks the flag via Lexeme.c_check_flag; the setter writes it via Lexeme.c_set_flag.
property like_url:
def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL)
def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x)

View File

@ -15,7 +15,6 @@ from libcpp.vector cimport vector
from murmurhash.mrmr cimport hash64
from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
from .tokens.doc cimport get_token_attr
from .tokens.doc cimport Doc
from .vocab cimport Vocab

View File

@ -1,4 +1,5 @@
# -*- coding: utf8 -*-
# cython: infer_types=True
from __future__ import unicode_literals
import unicodedata
@ -48,6 +49,21 @@ cpdef bint is_ascii(unicode string):
else:
return True
cpdef bint is_bracket(unicode string):
    """Return True if ``string`` is exactly one ASCII bracket character.

    The recognised brackets are the round, square, curly and angle pairs:
    ``( ) [ ] { } < >``.  Any other input, including the empty string and
    multi-character strings such as ``"()"``, yields False.
    """
    # A tuple membership test compares whole strings, so only single
    # bracket characters match; substrings of longer text do not.
    return string in ('(', ')', '[', ']', '{', '}', '<', '>')
cpdef bint is_quote(unicode string):
    """Return True when ``string`` is exactly ``"`` or ``'``, else False."""
    return string in ('"', "'")
cpdef bint is_left_punct(unicode string):
    """Return True if ``string`` is exactly one opening punctuation mark.

    Opening marks are the left-hand brackets ``( [ { <`` plus the ASCII
    quote characters ``"`` and ``'``.  The ASCII quotes have no distinct
    left and right forms, so they are accepted here as well as by the
    closing-side check.  Any other input, including the empty string,
    yields False.
    """
    return string in ('(', '[', '{', '<', '"', "'")
cpdef bint is_right_punct(unicode string):
    """Return True if ``string`` is exactly one closing punctuation mark.

    Closing marks are the right-hand brackets ``) ] } >`` plus the ASCII
    quote characters ``"`` and ``'``.  The ASCII quotes have no distinct
    left and right forms, so they are accepted here as well as by the
    opening-side check.  Any other input, including the empty string,
    yields False.
    """
    return string in (')', ']', '}', '>', '"', "'")
# Returns the result of the string's own istitle() method, coerced to bint.
cpdef bint is_title(unicode string):
return string.istitle()

View File

@ -14,7 +14,7 @@ cpdef enum symbol_t:
IS_STOP
IS_OOV
FLAG14
FLAG14 = 14
FLAG15
FLAG16
FLAG17
@ -419,3 +419,10 @@ cpdef enum symbol_t:
rcmod
root
xcomp
# Move these up to FLAG14--FLAG17 once we finish the functionality
# and are ready to regenerate the model.
#IS_BRACKET
#IS_QUOTE
#IS_LEFT_PUNCT
#IS_RIGHT_PUNCT

View File

@ -13,7 +13,6 @@ IDS = {
"LIKE_EMAIL": LIKE_EMAIL,
"IS_STOP": IS_STOP,
"IS_OOV": IS_OOV,
"FLAG14": FLAG14,
"FLAG15": FLAG15,
"FLAG16": FLAG16,

View File

@ -41,3 +41,18 @@ def test_is_digit(words):
assert not is_digit(words[7])
assert not is_digit(words[8])
assert not is_digit(words[9])
# Placeholder test: the body is empty, so it passes without checking anything.
# TODO: add assertions for quote detection using the `words` fixture.
def test_is_quote(words):
pass
# Placeholder test: the body is empty, so it passes without checking anything.
# TODO: add assertions for bracket detection using the `words` fixture.
def test_is_bracket(words):
pass
# Placeholder test: the body is empty, so it passes without checking anything.
# NOTE(review): presumably meant to cover the `is_left_punct` predicate; the
# name says "left_bracket" instead. Confirm and rename when the test is written.
def test_is_left_bracket(words):
pass
# Placeholder test: the body is empty, so it passes without checking anything.
# NOTE(review): presumably meant to cover the `is_right_punct` predicate; the
# name says "right_bracket" instead. Confirm and rename when the test is written.
def test_is_right_bracket(words):
pass

View File

@ -45,6 +45,7 @@ def test_symbols(en_vocab):
assert en_vocab.strings['PROB'] == PROB
@pytest.mark.skip
def test_pickle_vocab(en_vocab):
file_ = io.BytesIO()
cloudpickle.dump(en_vocab, file_)

View File

@ -18,6 +18,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP
from ..parts_of_speech cimport CONJ, PUNCT
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport FLAG14 as IS_BRACKET
from ..attrs cimport FLAG15 as IS_QUOTE
from ..attrs cimport FLAG16 as IS_LEFT_PUNCT
from ..attrs cimport FLAG17 as IS_RIGHT_PUNCT
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from ..attrs cimport IS_OOV
@ -363,6 +367,18 @@ cdef class Token:
# Getter-only property: checks the IS_SPACE flag on the token's lexeme (self.c.lex).
property is_space:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
# Getter-only property: checks IS_BRACKET on the token's lexeme (self.c.lex).
# IS_BRACKET is this module's cimport alias for attrs.FLAG14.
property is_bracket:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
# Getter-only property: checks IS_QUOTE on the token's lexeme (self.c.lex).
# IS_QUOTE is this module's cimport alias for attrs.FLAG15.
property is_quote:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
# Getter-only property: checks IS_LEFT_PUNCT on the token's lexeme (self.c.lex).
# IS_LEFT_PUNCT is this module's cimport alias for attrs.FLAG16.
property is_left_punct:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
# Getter-only property: checks IS_RIGHT_PUNCT on the token's lexeme (self.c.lex).
# IS_RIGHT_PUNCT is this module's cimport alias for attrs.FLAG17.
property is_right_punct:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
property like_url:
def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL)