Merge pull request #5533 from svlandeg/bugfix/minibatch-oversize

add oversize examples before StopIteration returns
2026-03-06 21:01:34 +03:00 · 2020-06-02 22:54:38 +02:00 · 2020-06-02 22:54:38 +02:00 · f74784575c
commit f74784575c
parent b5ae2edcba c5ac382f0a
3 changed files with 124 additions and 26 deletions
--- a/spacy/tests/test_util.py
+++ b/spacy/tests/test_util.py
@ -0,0 +1,59 @@
+import pytest
+from spacy.gold import Example
+
+from .util import get_random_doc
+
+from spacy.util import minibatch_by_words
+
+
+@pytest.mark.parametrize(
+    "doc_sizes, expected_batches",
+    [
+        ([400, 400, 199], [3]),
+        ([400, 400, 199, 3], [4]),
+        ([400, 400, 199, 3, 200], [3, 2]),
+        ([400, 400, 199, 3, 1], [5]),
+        ([400, 400, 199, 3, 1, 1500], [5]),    # 1500 will be discarded
+        ([400, 400, 199, 3, 1, 200], [3, 3]),
+        ([400, 400, 199, 3, 1, 999], [3, 3]),
+        ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
+        ([1, 2, 999], [3]),
+        ([1, 2, 999, 1], [4]),
+        ([1, 200, 999, 1], [2, 2]),
+        ([1, 999, 200, 1], [2, 2]),
+    ],
+)
+def test_util_minibatch(doc_sizes, expected_batches):
+    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
+    examples = [Example(doc=doc) for doc in docs]
+    tol = 0.2
+    batch_size = 1000
+    batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=True))
+    assert [len(batch) for batch in batches] == expected_batches
+
+    max_size = batch_size + batch_size * tol
+    for batch in batches:
+        assert sum([len(example.doc) for example in batch]) < max_size
+
+
+@pytest.mark.parametrize(
+    "doc_sizes, expected_batches",
+    [
+        ([400, 4000, 199], [1, 2]),
+        ([400, 400, 199, 3000, 200], [1, 4]),
+        ([400, 400, 199, 3, 1, 1500], [1, 5]),
+        ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]),
+        ([1, 2, 9999], [1, 2]),
+        ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]),
+    ],
+)
+def test_util_minibatch_oversize(doc_sizes, expected_batches):
+    """ Test that oversized documents are returned in their own batch"""
+    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
+    examples = [Example(doc=doc) for doc in docs]
+    tol = 0.2
+    batch_size = 1000
+    batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=False))
+    assert [len(batch) for batch in batches] == expected_batches
+
+
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@ -92,6 +92,13 @@ def get_batch(batch_size):
    return docs


+def get_random_doc(n_words):
+    vocab = Vocab()
+    # Make the words numbers, so that they're easy to track.
+    numbers = [str(i) for i in range(0, n_words)]
+    return Doc(vocab, words=numbers)
+
+
 def apply_transition_sequence(parser, doc, sequence):
    """Perform a series of pre-specified transitions, to put the parser in a
    desired state."""
--- a/spacy/util.py
+++ b/spacy/util.py
@ -656,42 +656,74 @@ def decaying(start, stop, decay):
        curr -= decay


-def minibatch_by_words(examples, size, tuples=True, count_words=len, tolerance=0.2):
+def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_oversize=False):
    """Create minibatches of roughly a given number of words. If any examples
    are longer than the specified batch length, they will appear in a batch by
-    themselves."""
+    themselves, or be discarded if discard_oversize=True."""
    if isinstance(size, int):
        size_ = itertools.repeat(size)
    elif isinstance(size, List):
        size_ = iter(size)
    else:
        size_ = size
-    examples = iter(examples)
-    oversize = []
-    while True:
-        batch_size = next(size_)
-        tol_size = batch_size * 0.2
-        batch = []
-        if oversize:
-            example = oversize.pop(0)
-            n_words = count_words(example.doc)
+
+    target_size = next(size_)
+    tol_size = target_size * tolerance
+    batch = []
+    overflow = []
+    batch_size = 0
+    overflow_size = 0
+
+    for example in examples:
+        n_words = count_words(example.doc)
+        # if the current example alone exceeds the maximum batch size (target size plus
+        # tolerance), it is yielded in a batch by itself, or dropped if discard_oversize=True.
+        if n_words > target_size + tol_size:
+            if not discard_oversize:
+                yield [example]
+
+        # add the example to the current batch if there's no overflow yet and it still fits
+        elif overflow_size == 0 and (batch_size + n_words) <= target_size:
            batch.append(example)
-            batch_size -= n_words
-        while batch_size >= 1:
-            try:
-                example = next(examples)
-            except StopIteration:
-                if batch:
-                    yield batch
-                return
-            n_words = count_words(example.doc)
-            if n_words < (batch_size + tol_size):
-                batch_size -= n_words
-                batch.append(example)
-            else:
-                oversize.append(example)
-        if batch:
+            batch_size += n_words
+
+        # add the example to the overflow buffer if it fits in the tolerance margin
+        elif (batch_size + overflow_size + n_words) <= (target_size + tol_size):
+            overflow.append(example)
+            overflow_size += n_words
+
+        # yield the previous batch and start a new one. The new one gets the overflow examples.
+        else:
            yield batch
+            target_size = next(size_)
+            tol_size = target_size * tolerance
+            batch = overflow
+            batch_size = overflow_size
+            overflow = []
+            overflow_size = 0
+
+            # this example still fits
+            if (batch_size + n_words) <= target_size:
+                batch.append(example)
+                batch_size += n_words
+
+            # this example fits in overflow
+            elif (batch_size + n_words) <= (target_size + tol_size):
+                overflow.append(example)
+                overflow_size += n_words
+
+            # this example does not fit with the previous overflow: start another new batch
+            else:
+                yield batch
+                target_size = next(size_)
+                tol_size = target_size * tolerance
+                batch = [example]
+                batch_size = n_words
+
+    # yield the final batch together with any overflow; if batch is empty, nothing is yielded
+    if batch:
+        batch.extend(overflow)
+        yield batch


 def itershuffle(iterable, bufsize=1000):