Mirror of https://github.com/explosion/spaCy.git, synced 2025-06-30 18:03:04 +03:00.
* Read json files recursively from a directory, instead of requiring a single .json file
This commit is contained in:
parent
8f31d3b864
commit
b76bbbd12c
|
@ -138,8 +138,8 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
train_loc=("Location of training json file"),
|
train_loc=("Location of training file or directory"),
|
||||||
dev_loc=("Location of development json file"),
|
dev_loc=("Location of development file or directory"),
|
||||||
corruption_level=("Amount of noise to add to training data", "option", "c", float),
|
corruption_level=("Amount of noise to add to training data", "option", "c", float),
|
||||||
model_dir=("Location of output model directory",),
|
model_dir=("Location of output model directory",),
|
||||||
out_loc=("Out location", "option", "o", str),
|
out_loc=("Out location", "option", "o", str),
|
||||||
|
|
|
@ -4,6 +4,8 @@ import json
|
||||||
import ijson
|
import ijson
|
||||||
import random
|
import random
|
||||||
import re
|
import re
|
||||||
|
import os
|
||||||
|
from os import path
|
||||||
|
|
||||||
from spacy.munge.read_ner import tags_to_entities
|
from spacy.munge.read_ner import tags_to_entities
|
||||||
from libc.string cimport memset
|
from libc.string cimport memset
|
||||||
|
@ -94,28 +96,32 @@ def _min_edit_path(cand_words, gold_words):
|
||||||
|
|
||||||
|
|
||||||
def read_json_file(loc):
    """Yield annotated paragraphs from a JSON file, or recursively from a directory.

    If ``loc`` is a directory, every entry beneath it is visited recursively
    and its contents are yielded; otherwise ``loc`` is opened as a single JSON
    file and stream-parsed with ``ijson`` (one top-level array item at a time,
    so large corpora never need to fit in memory at once).

    Each yielded value is a 3-tuple::

        (raw_text_or_None,
         (ids, words, tags, heads, labels, ner),
         brackets)

    built from one paragraph: ``raw`` and ``brackets`` come straight from the
    paragraph dict (defaulting to ``None`` and ``[]``), and the six parallel
    lists are collected from the paragraph's ``tokens``.
    """
    if path.isdir(loc):
        # Recurse so that a directory of .json files behaves exactly like
        # a single concatenated file.
        for filename in os.listdir(loc):
            yield from read_json_file(path.join(loc, filename))
    else:
        with open(loc) as file_:
            # 'item' selects each element of the top-level JSON array.
            for doc in ijson.items(file_, 'item'):
                for paragraph in doc['paragraphs']:
                    ids = []
                    words = []
                    tags = []
                    heads = []
                    labels = []
                    ner = []
                    for token in paragraph['tokens']:
                        ids.append(token['id'])
                        words.append(token['orth'])
                        tags.append(token['tag'])
                        # A negative head is remapped to the token's own id
                        # (presumably marking the sentence root — confirm
                        # against the corpus format).
                        heads.append(token['head'] if token['head'] >= 0 else token['id'])
                        labels.append(token['dep'])
                        # NER tag is optional; '-' is the "no annotation" default.
                        ner.append(token.get('ner', '-'))
                    yield (
                        paragraph.get('raw', None),
                        (ids, words, tags, heads, labels, ner),
                        paragraph.get('brackets', []))
|
||||||
|
|
||||||
|
|
||||||
def _iob_to_biluo(tags):
|
def _iob_to_biluo(tags):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user