mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
Split each CoNLL-X line into fields on tab characters instead of the default whitespace separators
This commit is contained in:
parent
81b28ca606
commit
7568cd6bf8
|
@ -1,18 +1,13 @@
|
|||
from __future__ import unicode_literals
|
||||
import plac
|
||||
import json
|
||||
from os import path
|
||||
import shutil
|
||||
import os
|
||||
import random
|
||||
import io
|
||||
import pathlib
|
||||
|
||||
from spacy.tokens import Doc
|
||||
from spacy.syntax.nonproj import PseudoProjectivity
|
||||
from spacy.language import Language
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.tagger import Tagger
|
||||
from spacy.pipeline import DependencyParser, BeamDependencyParser
|
||||
from spacy.syntax.parser import get_templates
|
||||
|
@ -23,7 +18,6 @@ import spacy.attrs
|
|||
import io
|
||||
|
||||
|
||||
|
||||
def read_conllx(loc, n=0):
|
||||
with io.open(loc, 'r', encoding='utf8') as file_:
|
||||
text = file_.read()
|
||||
|
@ -35,7 +29,8 @@ def read_conllx(loc, n=0):
|
|||
lines.pop(0)
|
||||
tokens = []
|
||||
for line in lines:
|
||||
id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split()
|
||||
id_, word, lemma, pos, tag, morph, head, dep, _1, \
|
||||
_2 = line.split('\t')
|
||||
if '-' in id_ or '.' in id_:
|
||||
continue
|
||||
try:
|
||||
|
|
Loading…
Reference in New Issue
Block a user