Merge branch 'master' into develop

2025-12-10 11:44:35 +03:00 · 2018-05-20 16:49:40 +02:00 · 2018-05-20 16:49:40 +02:00 · 5401c55c75
commit 5401c55c75
parent 7431e9c87f b59e3b157f
4 changed files with 52 additions and 10 deletions
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -245,6 +245,8 @@ class Errors(object):
            "the meta.json. Vector names are required to avoid issue #1660.")
    E093 = ("token.ent_iob values make invalid sequence: I without B\n{seq}")
    E094 = ("Error reading line {line_num} in vectors file {loc}.")
    E095 = ("Can't write to frozen dictionary. This is likely an internal "
            "error. Are you writing to a default function argument?")
@add_codes
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@ -4,6 +4,7 @@ from __future__ import unicode_literals
 from ..util import get_doc
 from ...tokens import Doc
 from ...vocab import Vocab
 from ...attrs import LEMMA
 import pytest
 import numpy
@ -178,6 +179,26 @@ def test_doc_api_merge_hang(en_tokenizer):
    doc.merge(8, 32, tag='', lemma='', ent_type='ORG')
 def test_doc_api_retokenizer(en_tokenizer):
    """Merging doc[4:7] inside doc.retokenize() collapses it to one token."""
    text = "WKRO played songs by the beach boys all night"
    doc = en_tokenizer(text)
    with doc.retokenize() as retok:
        # The span is sliced inside the context, right before it is queued.
        retok.merge(doc[4:7])
    # The merge is checked only after the context manager has exited.
    assert len(doc) == 7
    merged = doc[4]
    assert merged.text == 'the beach boys'
 def test_doc_api_retokenizer_attrs(en_tokenizer):
    """Attrs passed to retokenizer.merge() show up on the merged token."""
    doc = en_tokenizer("WKRO played songs by the beach boys all night")
    # Deliberately mix the two spellings: an imported attr constant paired
    # with a string value, and a string attr name paired with the value
    # looked up in the vocab's string store.
    org_value = doc.vocab.strings['ORG']
    merge_attrs = {LEMMA: 'boys', 'ENT_TYPE': org_value}
    with doc.retokenize() as retok:
        retok.merge(doc[4:7], attrs=merge_attrs)
    assert len(doc) == 7
    merged = doc[4]
    assert merged.text == 'the beach boys'
    assert merged.lemma_ == 'boys'
    assert merged.ent_type_ == 'ORG'
 def test_doc_api_sents_empty_string(en_tokenizer):
    doc = en_tokenizer("")
    doc.is_parsed = True
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@ -11,11 +11,13 @@ from .span cimport Span
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..structs cimport LexemeC, TokenC
-from ..attrs cimport *
+from ..attrs cimport TAG
 from ..attrs import intify_attrs
 from ..util import SimpleFrozenDict
 cdef class Retokenizer:
-    '''Helper class for doc.retokenize() context manager.'''
+    """Helper class for doc.retokenize() context manager."""
    cdef Doc doc
    cdef list merges
    cdef list splits
@ -24,14 +26,18 @@ cdef class Retokenizer:
        self.merges = []
        self.splits = []
-    def merge(self, Span span, attrs=None):
+    def merge(self, Span span, attrs=SimpleFrozenDict()):
-        '''Mark a span for merging. The attrs will be applied to the resulting
+        """Mark a span for merging. The attrs will be applied to the resulting
-        token.'''
+        token.
        """
        attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
        self.merges.append((span.start_char, span.end_char, attrs))
-    def split(self, Token token, orths, attrs=None):
+    def split(self, Token token, orths, attrs=SimpleFrozenDict()):
-        '''Mark a Token for splitting, into the specified orths. The attrs
+        """Mark a Token for splitting, into the specified orths. The attrs
-        will be applied to each subtoken.'''
+        will be applied to each subtoken.
        """
        attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
        self.splits.append((token.start_char, orths, attrs))
    def __enter__(self):
@ -125,5 +131,3 @@ def _merge(Doc doc, int start, int end, attributes):
    # Clear the cached Python objects
    # Return the merged Python object
    return doc[start]
--- a/spacy/util.py
+++ b/spacy/util.py
@ -635,3 +635,18 @@ def use_gpu(gpu_id):
 def fix_random_seed(seed=0):
    """Seed the stdlib `random` module and `numpy.random` with one value.
    seed: Value handed to both seeding functions. Defaults to 0.
    """
    # Same order as before: stdlib generator first, then numpy's.
    for seed_fn in (random.seed, numpy.random.seed):
        seed_fn(seed)
 class SimpleFrozenDict(dict):
    """Simplified implementation of a frozen dict, mainly used as default
    function or method argument (for arguments that should default to empty
    dictionary). Every in-place mutation entry point of `dict` is overridden
    to raise NotImplementedError(Errors.E095), so a shared default instance
    cannot be changed by accident. Reading (`d[key]`, `get`, iteration,
    `len`, ...) is inherited from `dict` unchanged.
    """
    def __setitem__(self, key, value):
        raise NotImplementedError(Errors.E095)
    def __delitem__(self, key):
        # Deleting is a write too; plain dict would drop the key silently.
        raise NotImplementedError(Errors.E095)
    def __ior__(self, other):
        # `d |= other` updates a plain dict in place on Python 3.9+.
        raise NotImplementedError(Errors.E095)
    def pop(self, key, default=None):
        raise NotImplementedError(Errors.E095)
    def popitem(self):
        raise NotImplementedError(Errors.E095)
    def clear(self):
        raise NotImplementedError(Errors.E095)
    def setdefault(self, key, default=None):
        # dict.setdefault inserts without going through __setitem__, so it
        # has to be blocked explicitly.
        raise NotImplementedError(Errors.E095)
    def update(self, other=None, **kwargs):
        # Accept the full dict.update() calling convention so keyword-style
        # calls hit the same error instead of a signature TypeError.
        raise NotImplementedError(Errors.E095)