* Fix alignment in prepare_treebank

2025-11-10 04:47:51 +03:00 · 2015-05-06 16:31:00 +02:00 · 2015-05-06 16:31:00 +02:00 · e0ef6b6992
commit e0ef6b6992
parent 0605af6838
1 changed file with 11 additions and 7 deletions
--- a/bin/prepare_treebank.py
+++ b/bin/prepare_treebank.py
@@ -16,6 +16,8 @@ doc: {
            end: int,
            label: string,
            flabel: int}]}]}
+
+Consumes output of spacy/munge/align_raw.py
 """
 import plac
 import json
@@ -39,7 +41,7 @@ def _get_word_indices(raw_sent, word_idx, offset):
            indices[word_idx] = offset + match.start()
            word_idx += 1
        offset += len(piece)
-    return indices, word_idx, offset
+    return indices, word_idx, offset + 1
            

 def format_doc(section, filename, raw_paras, ptb_loc, dep_loc):
@@ -49,25 +51,27 @@ def format_doc(section, filename, raw_paras, ptb_loc, dep_loc):
    assert len(ptb_sents) == len(dep_sents)

    word_idx = 0
-    offset = 0
    i = 0
    doc = {'id': 'wsj_%s%s' % (section, filename), 'paragraphs': []}
    for raw_sents in raw_paras:
        para = {'raw': ' '.join(sent.replace('<SEP>', '') for sent in raw_sents),
-                    'segmented': '<PARA>'.join(raw_sents),
+                    'segmented': '<SENT>'.join(raw_sents),
                    'sents': [],
                    'tokens': [],
                    'brackets': []}
+        offset = 0
        for raw_sent in raw_sents:
+            words = raw_sent.replace('<SEP>', ' ').split()
            para['sents'].append(offset) 
            _, brackets = read_ptb.parse(ptb_sents[i], strip_bad_periods=True)
            _, annot = read_conll.parse(dep_sents[i], strip_bad_periods=True)
            indices, word_idx, offset = _get_word_indices(raw_sent, 0, offset)
-
-            for token in annot:
-                head = indices[token['head']]
+            for j, token in enumerate(annot):
+                head = indices[token['head']] if token['head'] != -1 else -1
                try:
-                    para['tokens'].append({'start': indices[token['id']],
+                    para['tokens'].append({
+                        'start': indices[token['id']],
+                        'orth': words[j],
                        'tag': token['tag'],
                        'head': head,
                        'dep': token['dep']})