From 35214053fd2c9684437620db60eb9868fae61946 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 23 May 2016 10:53:00 +0000 Subject: [PATCH] * Work around get_lex_attr bug introduced during German parsing --- bin/parser/train_ud.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/parser/train_ud.py b/bin/parser/train_ud.py index cd938de1f..41cd40989 100644 --- a/bin/parser/train_ud.py +++ b/bin/parser/train_ud.py @@ -14,6 +14,7 @@ from spacy.syntax.parser import Parser from spacy.syntax.arc_eager import ArcEager from spacy.syntax.parser import get_templates from spacy.scorer import Scorer +import spacy.attrs from spacy.language import Language @@ -47,6 +48,7 @@ class TreebankParser(object): @classmethod def from_dir(cls, tag_map, model_dir): vocab = Vocab(tag_map=tag_map, get_lex_attr=Language.default_lex_attrs()) + vocab.get_lex_attr[spacy.attrs.LANG] = lambda _: 0 tokenizer = Tokenizer(vocab, {}, None, None, None) tagger = Tagger.blank(vocab, TAGGER_TEMPLATES) @@ -99,7 +101,7 @@ def read_conllx(loc): for sent in text.strip().split('\n\n'): lines = sent.strip().split('\n') if lines: - if lines[0].startswith('#'): + while lines[0].startswith('#'): lines.pop(0) tokens = [] for line in lines: