# xlh/.agent/scripts/read_docx.py

import zipfile
import xml.etree.ElementTree as ET
import sys
import os

# WordprocessingML namespace map, used as the prefix map for XPath lookups
# inside word/document.xml.
ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

# Fully-qualified ("Clark notation") names, as ElementTree stores them in
# Element.tag / Element.attrib once the namespace prefix has been resolved.
_W = '{%s}' % ns['w']
_TAG_P = _W + 'p'        # paragraph
_TAG_R = _W + 'r'        # run
_TAG_T = _W + 't'        # literal text
_TAG_TAB = _W + 'tab'    # tab character (inside a run)
_TAG_BR = _W + 'br'      # break (inside a run)
_TAG_CR = _W + 'cr'      # carriage return (inside a run)
_ATTR_TYPE = _W + 'type'


def _paragraph_text(paragraph):
    """Return the text of a single ``w:p`` element.

    ``w:t`` contributes its text, a run-level ``w:tab`` becomes ``'\\t'`` and
    a run-level ``w:cr`` or text-wrapping ``w:br`` becomes ``'\\n'``.
    Paragraphs nested inside this one (e.g. text-box content) are skipped,
    because the caller visits every ``w:p`` separately and would otherwise
    emit their text twice.
    """
    parts = []

    def collect(element):
        # w:tab is also the name of a tab-stop *definition* under
        # w:pPr/w:tabs, so tab/break elements only count as content when
        # they are direct children of a run.
        inside_run = element.tag == _TAG_R
        for child in element:
            tag = child.tag
            if tag == _TAG_P:
                continue
            if tag == _TAG_T:
                if child.text:
                    parts.append(child.text)
            elif inside_run and tag == _TAG_TAB:
                parts.append('\t')
            elif inside_run and tag == _TAG_CR:
                parts.append('\n')
            elif inside_run and tag == _TAG_BR:
                # A missing w:type means a plain line break; page and
                # column breaks carry no text of their own.
                if child.get(_ATTR_TYPE, 'textWrapping') == 'textWrapping':
                    parts.append('\n')
            else:
                collect(child)

    collect(paragraph)
    return ''.join(parts)


def extract_text_from_docx(file_path):
    """
    Extracts text from a .docx file by parsing the internal word/document.xml.
    Does not require python-docx library.

    Args:
        file_path: Path to the .docx file to read.

    Returns:
        The document text with one line per paragraph (empty paragraphs are
        kept as empty lines; tabs and line breaks inside a paragraph are
        rendered as ``'\\t'`` and ``'\\n'``), or ``None`` if the file is
        missing, is not a zip archive, has no word/document.xml, or cannot
        be parsed. Failures are reported on stderr rather than raised.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}", file=sys.stderr)
        return None

    try:
        # docx is essentially a zip file
        with zipfile.ZipFile(file_path) as zf:
            try:
                xml_content = zf.read('word/document.xml')
            except KeyError:
                # ZipFile.read raises KeyError for a member that is absent.
                print(f"Invalid docx file (no document.xml): {file_path}",
                      file=sys.stderr)
                return None

        tree = ET.fromstring(xml_content)

        # One output line per paragraph, in document order.
        return '\n'.join(
            _paragraph_text(p) for p in tree.iterfind('.//w:p', ns)
        )

    except zipfile.BadZipFile:
        print(f"Error: {file_path} is not a valid zip/docx file.",
              file=sys.stderr)
    except Exception as e:
        # Best-effort reader: any other failure (I/O, malformed XML, ...) is
        # reported and turned into a None result instead of propagating.
        print(f"Error reading {file_path}: {str(e)}", file=sys.stderr)
    return None

def main(argv=None):
    """Command-line entry point: convert a .docx file to plain text.

    Usage: ``python read_docx.py <input_docx> [output_txt]``. With only the
    input path the text is printed to stdout; with an output path it is
    written there as UTF-8. Arguments after the second are ignored.

    Args:
        argv: Command-line arguments without the program name. Defaults to
            ``sys.argv[1:]``.

    Returns:
        Process exit status: 0 on success, 1 on a usage error, a failed
        extraction, or a failed write.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        print("Usage: python read_docx.py <input_docx> [output_txt]",
              file=sys.stderr)
        return 1

    input_path = args[0]
    output_path = args[1] if len(args) > 1 else None

    if output_path is None:
        # Best effort: make console output UTF-8 so non-ASCII text does not
        # fail on consoles with a narrow default encoding. stdout may lack
        # reconfigure() (replaced stream, old Python) or refuse the change.
        try:
            sys.stdout.reconfigure(encoding='utf-8')
        except (AttributeError, ValueError, OSError):
            pass

    content = extract_text_from_docx(input_path)
    if content is None:
        # The extractor has already reported the reason. Stop here so we
        # neither print "None" nor truncate an existing output file.
        return 1

    if output_path is None:
        print(content)
        return 0

    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(content)
    except OSError as e:
        print(f"Failed to write output: {e}", file=sys.stderr)
        return 1

    print(f"Successfully wrote to {output_path}")
    return 0


if __name__ == "__main__":
    sys.exit(main())