Update API docs with changes in spacy.gold and spacy.language

This commit is contained in:
ines 2017-05-22 12:29:30 +02:00
parent b5fb43fdd8
commit 54f04a9fe0
6 changed files with 87 additions and 29 deletions

View File

@ -142,9 +142,14 @@ def _min_edit_path(cand_words, gold_words):
class GoldCorpus(object):
'''An annotated corpus, using the JSON file format. Manages
annotations for tagging, dependency parsing, NER.'''
"""An annotated corpus, using the JSON file format. Manages
annotations for tagging, dependency parsing and NER."""
def __init__(self, train_path, dev_path):
"""Create a GoldCorpus.
train_path (unicode or Path): File or directory of training data.
dev_path (unicode or Path): File or directory of development data.
"""
self.train_path = util.ensure_path(train_path)
self.dev_path = util.ensure_path(dev_path)
self.train_locs = self.walk_corpus(self.train_path)

View File

@ -236,6 +236,12 @@ class Language(object):
doc.tensor = None
def preprocess_gold(self, docs_golds):
"""Can be called before training to pre-process gold data. By default,
it handles nonprojectivity and adds missing tags to the tag map.
docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
"""
for proc in self.pipeline:
if hasattr(proc, 'preprocess_gold'):
docs_golds = proc.preprocess_gold(docs_golds)

View File

@ -23,7 +23,8 @@
"Lexeme": "lexeme",
"Vocab": "vocab",
"StringStore": "stringstore",
"GoldParse": "goldparse"
"GoldParse": "goldparse",
"GoldCorpus": "goldcorpus"
},
"Other": {
"Annotation Specs": "annotation",
@ -135,6 +136,11 @@
"tag": "class"
},
"goldcorpus": {
"title": "GoldCorpus",
"tag": "class"
},
"annotation": {
"title": "Annotation Specifications"
},

View File

@ -0,0 +1,23 @@
//- 💫 DOCS > API > GOLDCORPUS
include ../../_includes/_mixins
p
| An annotated corpus, using the JSON file format. Manages annotations for
| tagging, dependency parsing and NER.
+h(2, "init") GoldCorpus.__init__
+tag method
p Create a #[code GoldCorpus].
+table(["Name", "Type", "Description"])
+row
+cell #[code train_path]
+cell unicode or #[code Path]
+cell File or directory of training data.
+row
+cell #[code dev_path]
+cell unicode or #[code Path]
+cell File or directory of development data.

View File

@ -7,7 +7,7 @@ p Collection for training annotations.
+h(2, "init") GoldParse.__init__
+tag method
p Create a GoldParse.
p Create a #[code GoldParse].
+table(["Name", "Type", "Description"])
+row

View File

@ -82,6 +82,41 @@ p
+cell #[code Doc]
+cell A container for accessing the annotations.
+h(2, "pipe") Language.pipe
+tag method
p
| Process texts as a stream, and yield #[code Doc] objects in order.
| Supports GIL-free multi-threading.
+aside-code("Example").
texts = [u'One document.', u'...', u'Lots of documents']
for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
assert doc.is_parsed
+table(["Name", "Type", "Description"])
+row
+cell #[code texts]
+cell -
+cell A sequence of unicode objects.
+row
+cell #[code n_threads]
+cell int
+cell
| The number of worker threads to use. If #[code -1], OpenMP will
| decide how many to use at run time. Default is #[code 2].
+row
+cell #[code batch_size]
+cell int
+cell The number of texts to buffer.
+footrow
+cell yields
+cell #[code Doc]
+cell Documents in the order of the original text.
+h(2, "update") Language.update
+tag method
@ -172,40 +207,23 @@ p
+cell -
+cell Config parameters.
+h(2, "pipe") Language.pipe
+tag method
+h(2, "preprocess_gold") Language.preprocess_gold
+tag method
p
| Process texts as a stream, and yield #[code Doc] objects in order.
| Supports GIL-free multi-threading.
| Can be called before training to pre-process gold data. By default, it
| handles nonprojectivity and adds missing tags to the tag map.
+aside-code("Example").
texts = [u'One document.', u'...', u'Lots of documents']
for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
assert doc.is_parsed
+table(["Name", "Type", "Description"])
+row
+cell #[code texts]
+cell -
+cell A sequence of unicode objects.
+row
+cell #[code n_threads]
+cell int
+cell
| The number of worker threads to use. If #[code -1], OpenMP will
| decide how many to use at run time. Default is #[code 2].
+row
+cell #[code batch_size]
+cell int
+cell The number of texts to buffer.
+cell #[code docs_golds]
+cell iterable
+cell Tuples of #[code Doc] and #[code GoldParse] objects.
+footrow
+cell yields
+cell #[code Doc]
+cell Documents in the order of the original text.
+cell tuple
+cell Tuples of preprocessed #[code Doc] and #[code GoldParse] objects.
+h(2, "to_disk") Language.to_disk
+tag method