diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py index 07fba47c6..9be7eb5f5 100644 --- a/examples/training/train_textcat.py +++ b/examples/training/train_textcat.py @@ -18,8 +18,7 @@ from pathlib import Path import thinc.extra.datasets import spacy -from spacy.gold import minibatch -from spacy.util import compounding +from spacy.util import minibatch, compounding @plac.annotations( diff --git a/spacy/util.py b/spacy/util.py index d15e33cca..d3dc391e1 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -392,7 +392,7 @@ def minibatch(items, size=8): so that batch-size can vary on each step. """ if isinstance(size, int): - size_ = itertools.repeat(8) + size_ = itertools.repeat(size) else: size_ = size items = iter(items) diff --git a/website/api/_top-level/_util.jade b/website/api/_top-level/_util.jade index 90b4a7b4b..a08cf1276 100644 --- a/website/api/_top-level/_util.jade +++ b/website/api/_top-level/_util.jade @@ -320,3 +320,137 @@ p | #[code title] is rendered as coloured headline. #[code exits] | performs system exit after printing, using the value of the | argument as the exit code, e.g. #[code exits=1]. + + ++h(3, "util.minibatch") util.minibatch + +tag function + +tag-new(2) + +p + | Iterate over batches of items. #[code size] may be an iterator, so that + | batch-size can vary on each step. + ++aside-code("Example"). + batches = minibatch(train_data) + for batch in batches: + texts, annotations = zip(*batch) + nlp.update(texts, annotations) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code items] + +cell iterable + +cell The items to batch up. + + +row + +cell #[code size] + +cell int / iterable + +cell + | The batch size(s). Use + | #[+api("top-level#util.compounding") #[code util.compounding]] or + | #[+api("top-level#util.decaying") #[code util.decaying]] + | for an infinite series of compounding or decaying values. + + +row("foot") + +cell yields + +cell list + +cell The batches. 
+ ++h(3, "util.compounding") util.compounding + +tag function + +tag-new(2) + +p + | Yield an infinite series of compounding values. Each time the generator + | is called, a value is produced by multiplying the previous value by the + | compound rate. + ++aside-code("Example"). + sizes = compounding(1., 10., 1.5) + assert next(sizes) == 1. + assert next(sizes) == 1. * 1.5 + assert next(sizes) == 1.5 * 1.5 + ++table(["Name", "Type", "Description"]) + +row + +cell #[code start] + +cell int / float + +cell The first value. + + +row + +cell #[code stop] + +cell int / float + +cell The maximum value. + + +row + +cell #[code compound] + +cell int / float + +cell The compounding factor. + + +row("foot") + +cell yields + +cell int / float + +cell Compounding values. + ++h(3, "util.decaying") util.decaying + +tag function + +tag-new(2) + +p + | Yield an infinite series of linearly decaying values. + ++aside-code("Example"). + sizes = decaying(1., 10., 0.001) + assert next(sizes) == 1. + assert next(sizes) == 1. - 0.001 + assert next(sizes) == 0.999 - 0.001 + ++table(["Name", "Type", "Description"]) + +row + +cell #[code start] + +cell int / float + +cell The first value. + + +row + +cell #[code end] + +cell int / float + +cell The minimum value. + + +row + +cell #[code decay] + +cell int / float + +cell The decaying factor. + + +row("foot") + +cell yields + +cell int / float + +cell The decaying values. + ++h(3, "util.itershuffle") util.itershuffle + +tag function + +tag-new(2) + +p + | Shuffle an iterator. This works by holding #[code bufsize] items back and + | yielding them sometime later. Obviously, this is not unbiased – but + | should be good enough for batching. Larger bufsize means less bias. + ++aside-code("Example"). + values = range(1000) + shuffled = itershuffle(values) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code iterable] + +cell iterable + +cell Iterator to shuffle. + + +row + +cell #[code bufsize] + +cell int + +cell Items to hold back. 
+ + +row("foot") + +cell yields + +cell iterable + +cell The shuffled iterator.