From 3a302ae6f2227aaa28328f56eb3462f56b94ab14 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Tue, 17 Feb 2015 22:02:09 -0500
Subject: [PATCH] * Merge train.py

---
 bin/parser/train.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/bin/parser/train.py b/bin/parser/train.py
index deb025ecd..a1d87d059 100755
--- a/bin/parser/train.py
+++ b/bin/parser/train.py
@@ -61,8 +61,8 @@ def read_docparse_gold(file_):
         tags = []
         ids = []
         lines = sent_str.strip().split('\n')
-        raw_text = lines.pop(0)
-        tok_text = lines.pop(0)
+        raw_text = lines.pop(0).strip()
+        tok_text = lines.pop(0).strip()
         for i, line in enumerate(lines):
             id_, word, pos_string, head_idx, label = _parse_line(line)
             if label == 'root':
@@ -234,6 +234,7 @@ def evaluate(Language, dev_loc, model_dir, gold_preproc=False):
     skipped = 0
     loss = 0
     with codecs.open(dev_loc, 'r', 'utf8') as file_:
+        #paragraphs = read_tokenized_gold(file_)
         paragraphs = read_docparse_gold(file_)
     for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer,
                                                      gold_preproc=gold_preproc):
@@ -241,11 +242,7 @@ def evaluate(Language, dev_loc, model_dir, gold_preproc=False):
         nlp.tagger(tokens)
         nlp.parser(tokens)
         for i, token in enumerate(tokens):
-            try:
-                pos_corr += token.tag_ == tag_strs[i]
-            except:
-                print i, token.orth_, token.tag
-                raise
+            pos_corr += token.tag_ == tag_strs[i]
             n_tokens += 1
             if heads[i] is None:
                 skipped += 1