import os import sys from docx import Document from pypdf import PdfReader
# .docx
def read_docx(filepath):
    """Return the text of a .docx file as one newline-separated string.

    The file is opened with ``docx.Document`` and the ``text`` of every
    entry in ``document.paragraphs`` is joined with ``"\\n"``. A document
    with no paragraphs yields an empty string.

    Args:
        filepath: Location of the .docx file, passed straight through to
            ``Document``.

    Returns:
        The paragraph texts joined by newline characters.
    """
    # NOTE(review): only ``paragraphs`` is read here; text held in tables,
    # headers or footers is presumably not included -- confirm against the
    # python-docx documentation if that content matters.
    document = Document(filepath)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)
# .pdf
def read_pdf(filepath) -> str:
    """Return the extracted text of every page of a PDF, concatenated.

    The file is opened with ``pypdf.PdfReader`` and ``extract_text()`` is
    called on each page in ``reader.pages``, in order. The per-page results
    are concatenated with no separator between pages. A PDF with no pages
    yields an empty string.

    Args:
        filepath: Location of the PDF file, passed straight through to
            ``PdfReader``.

    Returns:
        The text of all pages as a single string.
    """
    reader = PdfReader(filepath)
    # Build the result with a single join instead of repeated ``+=``:
    # in-loop string concatenation can be quadratic in the total text size.
    return "".join(page.extract_text() for page in reader.pages)