Files
xlh/.agent/scripts/read_docx.py

74 lines
2.4 KiB
Python

import zipfile
import xml.etree.ElementTree as ET
import sys
import os
# WordprocessingML main namespace; the "w:" prefix in the search paths below
# resolves to this URI.
ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}


def _paragraph_text(paragraph):
    """Return the text of one ``w:p`` element, including tabs and line breaks."""
    w = '{' + ns['w'] + '}'
    text_tag = w + 't'
    # Run-level elements that stand for a literal character in the output.
    # Every kind of w:br (line, page, column) is rendered as a newline so the
    # text on either side of it is never fused together.
    literal = {w + 'tab': '\t', w + 'br': '\n', w + 'cr': '\n'}
    # Property containers hold formatting only. w:pPr in particular may hold
    # tab-stop definitions (w:tabs/w:tab) that must not be read as tab
    # characters, so these subtrees are not descended into.
    skipped = {w + 'pPr', w + 'rPr'}

    parts = []
    # Explicit stack instead of recursion; children are pushed in reverse so
    # they are popped, and therefore emitted, in document order.
    pending = [paragraph]
    while pending:
        node = pending.pop()
        tag = node.tag
        if tag in skipped:
            continue
        if tag == text_tag:
            if node.text:
                parts.append(node.text)
        elif tag in literal:
            parts.append(literal[tag])
        else:
            pending.extend(reversed(list(node)))
    return ''.join(parts)


def extract_text_from_docx(file_path):
    """
    Extract the plain text of a .docx file from its internal word/document.xml.

    Only the standard library is used (no python-docx). Each ``w:p`` paragraph
    becomes one output line; paragraphs without text become empty lines so the
    original spacing stays readable. Inside a paragraph, ``w:t`` text is
    concatenated, ``w:tab`` becomes a tab character and ``w:br`` / ``w:cr``
    become a newline.

    Args:
        file_path: Path to the .docx file.

    Returns:
        The extracted text with paragraphs joined by newlines, or None when
        the file is missing, is not a valid zip archive, lacks
        word/document.xml, or cannot be read or parsed. In every failure case
        a message describing the problem is printed and no exception is
        raised.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None
    try:
        # A .docx file is a zip archive; the main body lives in word/document.xml.
        with zipfile.ZipFile(file_path) as zf:
            if 'word/document.xml' not in zf.namelist():
                print(f"Invalid docx file (no document.xml): {file_path}")
                return None
            xml_content = zf.read('word/document.xml')
        root = ET.fromstring(xml_content)
        # NOTE: './/w:p' matches paragraphs at any depth, so a paragraph nested
        # inside another paragraph is visited on its own as well as being part
        # of the outer paragraph's text.
        return '\n'.join(_paragraph_text(p) for p in root.iterfind('.//w:p', ns))
    except zipfile.BadZipFile:
        print(f"Error: {file_path} is not a valid zip/docx file.")
    except Exception as e:
        # Best-effort boundary: any other failure (I/O, malformed XML, ...) is
        # reported to the user rather than propagated.
        print(f"Error reading {file_path}: {str(e)}")
    return None
if __name__ == "__main__":
    # Command-line entry point.
    #   python read_docx.py <input_docx>               -> text printed to stdout
    #   python read_docx.py <input_docx> <output_txt>  -> text written as UTF-8
    # The exit status is 1 on a usage error, a failed extraction or a failed
    # write, so callers can detect failure without parsing the output.
    if len(sys.argv) < 2:
        print("Usage: python read_docx.py <input_docx> [output_txt]")
        sys.exit(1)

    input_path = sys.argv[1]
    output_path = sys.argv[2] if len(sys.argv) > 2 else None

    if output_path is None:
        # Console output: try to force UTF-8 so non-ASCII text can be printed.
        # Best effort only; stdout may be a replaced stream without
        # reconfigure(), or one that rejects the change.
        try:
            sys.stdout.reconfigure(encoding='utf-8')
        except (AttributeError, ValueError, OSError):
            pass

    content = extract_text_from_docx(input_path)
    if content is None:
        # Extraction failed and has already printed the reason. Stop here
        # instead of printing the literal "None" or passing it to write().
        sys.exit(1)

    if output_path is None:
        print(content)
    else:
        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(content)
        except (OSError, UnicodeError) as e:
            print(f"Failed to write output: {e}")
            sys.exit(1)
        print(f"Successfully wrote to {output_path}")