74 lines
2.4 KiB
Python
74 lines
2.4 KiB
Python
import zipfile
|
|
import xml.etree.ElementTree as ET
|
|
import sys
|
|
import os
|
|
|
|
# Microsoft Word (WordprocessingML) XML namespace, as a prefix map usable
# with ElementTree's find()/iterfind().
ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

# ElementTree reports qualified tags in Clark notation: "{namespace-uri}name".
_W_PREFIX = '{' + ns['w'] + '}'
_W_PARAGRAPH = _W_PREFIX + 'p'
_W_RUN = _W_PREFIX + 'r'
_W_TEXT = _W_PREFIX + 't'
_W_TAB = _W_PREFIX + 'tab'
_W_BREAKS = (_W_PREFIX + 'br', _W_PREFIX + 'cr')


def _collect_paragraph_text(element, parts):
    """Append the text content found under *element* to *parts*, in order.

    ``w:t`` contributes its text. ``w:tab`` and ``w:br``/``w:cr`` contribute
    a tab / newline, but only when they are direct children of a run
    (``w:r``): ``w:tab`` is also used for tab-stop *definitions* inside
    paragraph properties, which are not content.

    Nested ``w:p`` elements are not descended into. The caller visits every
    paragraph in the document on its own, so descending here would emit the
    nested text a second time.
    """
    inside_run = element.tag == _W_RUN
    for child in element:
        tag = child.tag
        if tag == _W_PARAGRAPH:
            continue
        if tag == _W_TEXT:
            if child.text:
                parts.append(child.text)
        elif inside_run and tag == _W_TAB:
            parts.append('\t')
        elif inside_run and tag in _W_BREAKS:
            parts.append('\n')
        else:
            _collect_paragraph_text(child, parts)


def extract_text_from_docx(file_path):
    """Extract the plain text of a .docx file from its word/document.xml.

    A .docx file is a zip archive; this reads the ``word/document.xml``
    member and parses it with ElementTree, so the python-docx library is
    not required.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The document text as a single string with one line per ``w:p``
        paragraph (empty paragraphs become empty lines), or ``None`` when
        the file is missing, is not a zip archive, has no
        ``word/document.xml`` member, or cannot be read or parsed. In every
        ``None`` case a diagnostic message is printed.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None

    try:
        # docx is essentially a zip file
        with zipfile.ZipFile(file_path) as zf:
            try:
                xml_content = zf.read('word/document.xml')
            except KeyError:
                # ZipFile.read raises KeyError for a missing member.
                print(f"Invalid docx file (no document.xml): {file_path}")
                return None

        root = ET.fromstring(xml_content)

        lines = []
        for paragraph in root.iterfind('.//w:p', ns):
            parts = []
            _collect_paragraph_text(paragraph, parts)
            # An empty paragraph yields '' and so preserves the blank line.
            lines.append(''.join(parts))

        return '\n'.join(lines)

    except zipfile.BadZipFile:
        print(f"Error: {file_path} is not a valid zip/docx file.")
    except Exception as e:
        # Best-effort reader: report any other failure (I/O error, malformed
        # XML, ...) instead of propagating it to the caller.
        print(f"Error reading {file_path}: {e}")
    return None
|
|
|
|
def main(argv=None):
    """Command-line entry point: extract a .docx to stdout or to a file.

    Usage: ``python read_docx.py <input_docx> [output_txt]``. With only an
    input path the text is printed to stdout; with a second argument it is
    written to that file as UTF-8.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        Process exit status: 0 on success, 1 on a usage error, a failed
        extraction, or a failed write.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 2:
        print("Usage: python read_docx.py <input_docx> [output_txt]")
        return 1

    input_path = argv[1]
    to_stdout = len(argv) < 3

    if to_stdout:
        # Best effort: force UTF-8 on the console so non-ASCII text prints.
        # stdout may be a replaced stream without reconfigure(), so failures
        # are deliberately ignored.
        try:
            sys.stdout.reconfigure(encoding='utf-8')
        except Exception:
            pass

    content = extract_text_from_docx(input_path)
    if content is None:
        # extract_text_from_docx returns None on failure and has already
        # printed the reason. Stop here rather than printing "None" or
        # truncating the output file.
        return 1

    if to_stdout:
        print(content)
        return 0

    # Write to file
    output_path = argv[2]
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(content)
    except Exception as e:
        print(f"Failed to write output: {e}")
        return 1

    print(f"Successfully wrote to {output_path}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
|