Split CoNLL-X file lines on tabs instead of the default whitespace separators

This commit is contained in:
Raphaël Bournhonesque 2017-03-21 23:00:13 +01:00
parent 81b28ca606
commit 7568cd6bf8

View File

@@ -1,18 +1,13 @@
from __future__ import unicode_literals
import plac
import json
from os import path
import shutil
import os
import random
import io
import pathlib
from spacy.tokens import Doc
from spacy.syntax.nonproj import PseudoProjectivity
from spacy.language import Language
from spacy.gold import GoldParse
from spacy.vocab import Vocab
from spacy.tagger import Tagger
from spacy.pipeline import DependencyParser, BeamDependencyParser
from spacy.syntax.parser import get_templates
@@ -23,7 +18,6 @@ import spacy.attrs
import io
def read_conllx(loc, n=0):
with io.open(loc, 'r', encoding='utf8') as file_:
text = file_.read()
@@ -35,7 +29,8 @@ def read_conllx(loc, n=0):
lines.pop(0)
tokens = []
for line in lines:
id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split()
id_, word, lemma, pos, tag, morph, head, dep, _1, \
_2 = line.split('\t')
if '-' in id_ or '.' in id_:
continue
try: