"""Extract plain text from a .docx file using only the standard library.

Usage:
    python read_docx.py <input.docx> [output.txt]

With one argument the text is printed to stdout; with two it is written to
the given output file as UTF-8.
"""
import os
import sys
import xml.etree.ElementTree as ET
import zipfile

# Microsoft Word XML Namespace
ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

# Fully qualified (Clark notation) tag names, as ElementTree reports them.
_W = '{%s}' % ns['w']
_TEXT_TAG = _W + 't'
# ``w:tabs`` holds tab-stop *definitions* whose children are also named
# ``w:tab``; they are formatting, not content, so the subtree is skipped.
_TAB_STOPS_TAG = _W + 'tabs'
# Elements that stand for whitespace in the visible text.
_WHITESPACE_TAGS = {
    _W + 'tab': '\t',
    _W + 'br': '\n',
    _W + 'cr': '\n',
}


def _iter_text_pieces(element):
    """Yield the text fragments below *element* in document order.

    ``w:t`` nodes yield their text, ``w:tab``/``w:br``/``w:cr`` yield the
    whitespace they represent, and everything else is descended into.
    """
    for child in element:
        tag = child.tag
        if tag == _TEXT_TAG:
            if child.text:
                yield child.text
        elif tag in _WHITESPACE_TAGS:
            yield _WHITESPACE_TAGS[tag]
        elif tag != _TAB_STOPS_TAG:
            yield from _iter_text_pieces(child)


def extract_text_from_docx(file_path):
    """Extract the text of a .docx file by parsing ``word/document.xml``.

    Does not require the python-docx library.

    Each ``w:p`` paragraph becomes one line of the result; paragraphs
    without text become empty lines so the original spacing stays readable.
    Tabs and explicit line breaks inside a paragraph are rendered as
    ``\\t`` and ``\\n``.

    NOTE: a paragraph nested inside another paragraph (e.g. inside a text
    box) is visited both as part of its parent and on its own, so its text
    appears twice in the output.

    Args:
        file_path: Path to the .docx file.

    Returns:
        The extracted text with paragraphs joined by newlines, or ``None``
        if the file is missing, is not a valid docx/zip archive, or cannot
        be parsed. In those cases an explanatory message is printed.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None

    try:
        # docx is essentially a zip file
        with zipfile.ZipFile(file_path) as zf:
            if 'word/document.xml' not in zf.namelist():
                print(f"Invalid docx file (no document.xml): {file_path}")
                return None
            xml_content = zf.read('word/document.xml')

        tree = ET.fromstring(xml_content)
        paragraphs = [
            ''.join(_iter_text_pieces(p))
            for p in tree.iterfind('.//w:p', ns)
        ]
        return '\n'.join(paragraphs)
    except zipfile.BadZipFile:
        print(f"Error: {file_path} is not a valid zip/docx file.")
    except Exception as e:
        # Top-level boundary for this best-effort reader: report and
        # signal failure through the None return value.
        print(f"Error reading {file_path}: {e}")
    return None


def main(argv=None):
    """Run the command-line interface and return the process exit status.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 on success, 1 on a usage error or when reading/writing fails.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        print("Usage: python read_docx.py <input.docx> [output.txt]")
        return 1

    input_path = args[1]
    output_path = args[2] if len(args) > 2 else None

    if output_path is None:
        # Printing to the console: try to force UTF-8 so non-ASCII text
        # does not fail on consoles with a narrower default encoding.
        try:
            sys.stdout.reconfigure(encoding='utf-8')
        except (AttributeError, OSError, ValueError):
            # stdout was replaced by an object that cannot be reconfigured;
            # fall back to whatever encoding it already uses.
            pass

    content = extract_text_from_docx(input_path)
    if content is None:
        # The reason was already reported; do not print "None" or create
        # an output file for a failed extraction.
        return 1

    if output_path is None:
        print(content)
        return 0

    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(content)
    except (OSError, UnicodeError) as e:
        print(f"Failed to write output: {e}")
        return 1
    print(f"Successfully wrote to {output_path}")
    return 0


if __name__ == "__main__":
    sys.exit(main())