Merge branch 'master' of github.com:honnibal/spaCy

2025-11-01 08:27:44 +03:00 · 2016-02-04 17:37:33 +01:00 · 2016-02-04 17:37:33 +01:00 · fc19a4a153
commit fc19a4a153
parent e7ec06cea2 48ce09687d
11 changed files with 106 additions and 5 deletions
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@ -85,3 +85,11 @@ cpdef enum attr_id_t:
    HEAD
    SPACY
    PROB
+    
# Move these up to FLAG14--FLAG17 once we finish the functionality and
# are ready to regenerate the model
+#IS_BRACKET
+#IS_QUOTE
+#IS_LEFT_PUNCT
+#IS_RIGHT_PUNCT
+ 
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@ -13,7 +13,6 @@ IDS = {
    "LIKE_EMAIL": LIKE_EMAIL,
    "IS_STOP": IS_STOP,
    "IS_OOV": IS_OOV,
-
    "FLAG14": FLAG14,
    "FLAG15": FLAG15,
    "FLAG16": FLAG16,
--- a/spacy/language.py
+++ b/spacy/language.py
@ -82,6 +82,22 @@ class Language(object):
    def is_title(string):
        return orth.is_title(string)

+    @staticmethod
+    def is_bracket(string):  # Thin wrapper: returns orth.is_bracket(string) unchanged; keyed by attrs.FLAG14 in this class's getter table.
+        return orth.is_bracket(string)
+
+    @staticmethod
+    def is_quote(string):  # Thin wrapper: returns orth.is_quote(string) unchanged; keyed by attrs.FLAG15 in this class's getter table.
+        return orth.is_quote(string)
+
+    @staticmethod
+    def is_left_punct(string):  # Thin wrapper: returns orth.is_left_punct(string) unchanged; keyed by attrs.FLAG16 in this class's getter table.
+        return orth.is_left_punct(string)
+
+    @staticmethod
+    def is_right_punct(string):  # Thin wrapper: returns orth.is_right_punct(string) unchanged; keyed by attrs.FLAG17 in this class's getter table.
+        return orth.is_right_punct(string)
+
    @staticmethod
    def is_upper(string):
        return orth.is_upper(string)
@ -121,6 +137,10 @@ class Language(object):
            attrs.IS_SPACE: cls.is_space,
            attrs.IS_TITLE: cls.is_title,
            attrs.IS_UPPER: cls.is_upper,
+            attrs.FLAG14: cls.is_bracket,
+            attrs.FLAG15: cls.is_quote,
+            attrs.FLAG16: cls.is_left_punct,
+            attrs.FLAG17: cls.is_right_punct,
            attrs.LIKE_URL: cls.like_url,
            attrs.LIKE_NUM: cls.like_num,
            attrs.LIKE_EMAIL: cls.like_email,
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@ -18,6 +18,10 @@ import numpy

 from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
+from .attrs cimport FLAG14 as IS_BRACKET
+from .attrs cimport FLAG15 as IS_QUOTE
+from .attrs cimport FLAG16 as IS_LEFT_PUNCT
+from .attrs cimport FLAG17 as IS_RIGHT_PUNCT
 from .attrs cimport IS_OOV


@ -183,6 +187,23 @@ cdef class Lexeme:
        def __get__(self): return Lexeme.c_check_flag(self.c, IS_SPACE)
        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_SPACE, x)

+    property is_bracket:  # Read/write boolean flag on the lexeme; IS_BRACKET is the cimported alias of attrs.FLAG14.
+        def __get__(self): return Lexeme.c_check_flag(self.c, IS_BRACKET)
+        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_BRACKET, x)
+
+    property is_quote:  # Read/write boolean flag on the lexeme; IS_QUOTE is the cimported alias of attrs.FLAG15.
+        def __get__(self): return Lexeme.c_check_flag(self.c, IS_QUOTE)
+        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_QUOTE, x)
+
+    property is_left_punct:  # Read/write boolean flag on the lexeme; IS_LEFT_PUNCT is the cimported alias of attrs.FLAG16.
+        def __get__(self): return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
+        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
+
+    property is_right_punct:  # Read/write boolean flag on the lexeme; IS_RIGHT_PUNCT is the cimported alias of attrs.FLAG17.
+        def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
+        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
+
+
    property like_url:
        def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL)
        def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x)
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@ -15,7 +15,6 @@ from libcpp.vector cimport vector
 from murmurhash.mrmr cimport hash64

 from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
-from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
 from .tokens.doc cimport get_token_attr
 from .tokens.doc cimport Doc
 from .vocab cimport Vocab
--- a/spacy/orth.pyx
+++ b/spacy/orth.pyx
@ -1,4 +1,5 @@
 # -*- coding: utf8 -*-
+# cython: infer_types=True
 from __future__ import unicode_literals
 import unicodedata

@ -48,6 +49,21 @@ cpdef bint is_ascii(unicode string):
    else:
        return True

+cpdef bint is_bracket(unicode string):
+    return False  # Stub: unconditionally False for every input; no bracket detection is implemented yet.
+
+cpdef bint is_quote(unicode string):
+    # True only when the whole string is exactly one of the two ASCII
+    # quote characters, '"' or "'"; anything else (including '') is False.
+    return string in ('"', "'")
+
+cpdef bint is_left_punct(unicode string):
+    return False  # Stub: unconditionally False for every input; no left-punctuation detection is implemented yet.
+
+cpdef bint is_right_punct(unicode string):
+    return False  # Stub: unconditionally False for every input; no right-punctuation detection is implemented yet.
+

 cpdef bint is_title(unicode string):
    return string.istitle()
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@ -14,7 +14,7 @@ cpdef enum symbol_t:
    IS_STOP
    IS_OOV
    
-    FLAG14
+    FLAG14 = 14
    FLAG15
    FLAG16
    FLAG17
@ -419,3 +419,10 @@ cpdef enum symbol_t:
    rcmod
    root
    xcomp
+
# Move these up to FLAG14--FLAG17 once we finish the functionality
# and are ready to regenerate the model.
+#IS_BRACKET
+#IS_QUOTE
+#IS_LEFT_PUNCT
+#IS_RIGHT_PUNCT
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@ -13,7 +13,6 @@ IDS = {
    "LIKE_EMAIL": LIKE_EMAIL,
    "IS_STOP": IS_STOP,
    "IS_OOV": IS_OOV,
-    
    "FLAG14": FLAG14,
    "FLAG15": FLAG15,
    "FLAG16": FLAG16,
--- a/spacy/tests/vocab/test_flag_features.py
+++ b/spacy/tests/vocab/test_flag_features.py
@ -41,3 +41,18 @@ def test_is_digit(words):
    assert not is_digit(words[7])
    assert not is_digit(words[8])
    assert not is_digit(words[9])
+
+
+def test_is_quote(words):
+    pass  # TODO: placeholder, no assertions yet, so this test always passes.
+
+
+def test_is_bracket(words):
+    pass  # TODO: placeholder, no assertions yet, so this test always passes.
+
+
+def test_is_left_bracket(words):  # NOTE(review): presumably meant to cover is_left_punct; confirm the intended name.
+    pass  # TODO: placeholder, no assertions yet, so this test always passes.
+
+def test_is_right_bracket(words):  # NOTE(review): presumably meant to cover is_right_punct; confirm the intended name.
+    pass  # TODO: placeholder, no assertions yet, so this test always passes.
--- a/spacy/tests/vocab/test_vocab.py
+++ b/spacy/tests/vocab/test_vocab.py
@ -45,6 +45,7 @@ def test_symbols(en_vocab):
    assert en_vocab.strings['PROB'] == PROB
    

+@pytest.mark.skip
 def test_pickle_vocab(en_vocab):
    file_ = io.BytesIO()
    cloudpickle.dump(en_vocab, file_)
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -18,6 +18,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP
 from ..parts_of_speech cimport CONJ, PUNCT

 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
+from ..attrs cimport FLAG14 as IS_BRACKET
+from ..attrs cimport FLAG15 as IS_QUOTE
+from ..attrs cimport FLAG16 as IS_LEFT_PUNCT
+from ..attrs cimport FLAG17 as IS_RIGHT_PUNCT
 from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from ..attrs cimport IS_OOV

@ -363,6 +367,18 @@ cdef class Token:
    property is_space: 
        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
    
+    property is_bracket:  # Read-only: checks IS_BRACKET (cimported alias of attrs.FLAG14) on the token's lexeme, self.c.lex.
+        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
+
+    property is_quote:  # Read-only: checks IS_QUOTE (cimported alias of attrs.FLAG15) on the token's lexeme, self.c.lex.
+        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
+
+    property is_left_punct:  # Read-only: checks IS_LEFT_PUNCT (cimported alias of attrs.FLAG16) on the token's lexeme, self.c.lex.
+        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
+
+    property is_right_punct:  # Read-only: checks IS_RIGHT_PUNCT (cimported alias of attrs.FLAG17) on the token's lexeme, self.c.lex.
+        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
+
    property like_url:
        def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL)