Hello everyone, today we will demonstrate how to use Python to batch convert Word documents to PDF format.
Without further ado, let's get started!
PyPDF2 is a Python module for reading, writing, and manipulating PDF files. To install the PyPDF2 module, follow these steps:
Make sure you have Python installed. You can check if Python is installed by entering python --version in the terminal or command prompt.
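Install the PyPDF2 module by running pip install PyPDF2 in the terminal or command prompt. If the module is not installed, importing it raises the following error: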
ModuleNotFoundError: No module named 'PyPDF2'
After the installation is complete, you can use the PyPDF2 module in Python to read, write, and manipulate PDF files.
For example, to read the text content of a PDF file, import the PyPDF2 module in your Python script, open the file with the PdfReader class (named PdfFileReader before PyPDF2 3.0.0), and iterate through each page. Here is a simple example:
# The module name is case-sensitive: it is PyPDF2, not pypdf2.
import PyPDF2

# PdfReader, reader.pages and extract_text() are the current API; they replace
# PdfFileReader, getNumPages()/getPage() and extractText(), which were removed
# in PyPDF2 3.0.0.
pdf_file = PyPDF2.PdfReader('example.pdf')
for page in pdf_file.pages:
    print(page.extract_text())
This will print the text content of each page in the PDF file.
Note:
Due to updates in PyPDF2, some classes and functions have been deprecated and removed. Use their replacements instead, for example len(reader.pages) in place of getNumPages().
The two error messages below show which names to replace:
PyPDF2.errors.DeprecationError: PdfFileReader is deprecated and was removed in PyPDF2 3.0.0. Use PdfReader instead.
PyPDF2.errors.DeprecationError: reader.getNumPages is deprecated and was removed in PyPDF2 3.0.0. Use len(reader.pages) instead.
Use Python code to batch convert Word documents to PDF format
The script below also counts the pages of each converted document (code example):
# -*- coding:utf-8 -*-
import os # Import system function module
from win32com.client import Dispatch, DispatchEx # Import functions from the client package of the pywin32 module
from win32com.client import constants # Import the class that saves COM constants from the client package of the pywin32 module
from win32com.client import gencache # Import the gencache function from the client package of the pywin32 module
from PyPDF2 import PdfReader # Get the page count
import re # Import the regular expression module
import pythoncom # Import the module that encapsulates the OLE Automation API, which is a sub-module of pywin32
def getfilenames(filepath='', filelist_out=None, file_ext='all'):
    """Collect the paths of files found under a directory tree.

    Args:
        filepath: Directory to traverse; files in subdirectories are included.
        filelist_out: Optional list to append the results to. When omitted a
            fresh list is created for every call, so results never carry over
            from one call to the next.
        file_ext: Which files to keep. 'all' (the default) keeps every file,
            '.doc' keeps Word documents ('.doc' and '.docx'), and any other
            value keeps files whose extension equals it exactly.

    Returns:
        The sorted list of matching paths. This is the same object as
        filelist_out when one was passed in. In '.doc' mode backslashes in
        the paths are replaced with forward slashes.
    """
    if filelist_out is None:
        filelist_out = []
    # Traverse all files under filepath, including files under subdirectories
    for fpath, dirs, fs in os.walk(filepath):
        for f in fs:
            fi_d = os.path.join(fpath, f)
            ext = os.path.splitext(fi_d)[1]
            if file_ext == '.doc':
                # Word mode accepts both the legacy and the XML-based format.
                if ext in ('.doc', '.docx'):
                    filelist_out.append(fi_d.replace('\\', '/'))
            elif file_ext == 'all' or ext == file_ext:
                filelist_out.append(fi_d)
    filelist_out.sort()
    return filelist_out
# Convert Word to PDF (multiple files)
def wordtopdf(filelist, targetpath):
    """Convert Word documents to PDF with Word itself and count the pages.

    Args:
        filelist: Paths of the Word documents to convert.
        targetpath: Directory the PDF files are written to. Each PDF keeps
            the base name of its source document.

    Returns:
        A tuple (totalPages, valueList) on success, where valueList holds one
        [source path, page count as str] entry per document. Returns False as
        soon as one document produces no PDF, or when a TypeError is raised.
    """
    totalPages = 0  # Running total of pages over all documents
    valueList = []
    # Resolve the output directory once instead of calling os.chdir() for
    # every file: changing the working directory is a process-wide side
    # effect and makes relative input paths resolve against the wrong place
    # from the second file onward.
    target_dir = os.path.abspath(targetpath)
    w = None
    try:
        # Initialize COM for the calling thread; without it Word can fail
        # with "CoInitialize has not been called".
        pythoncom.CoInitialize()
        gencache.EnsureModule('{00020905-0000-0000-C000-000000000046}', 0, 8, 4)
        w = Dispatch("Word.Application")
        for fullfilename in filelist:
            # Word does not share this process's working directory, so both
            # paths handed to it must be absolute.
            source = os.path.abspath(fullfilename)
            stem = os.path.splitext(os.path.basename(fullfilename))[0]
            pdf_name = os.path.join(target_dir, stem + ".pdf")
            document = None
            try:
                document = w.Documents.Open(source, ReadOnly=1)
                document.ExportAsFixedFormat(
                    pdf_name, constants.wdExportFormatPDF,
                    Item=constants.wdExportDocumentWithMarkup,
                    CreateBookmarks=constants.wdExportCreateHeadingBookmarks)
            except Exception as e:
                # Best effort: report the problem; the existence check below
                # decides whether the conversion counts as failed.
                print(e)
            finally:
                # Close each document so it does not stay open (and locked)
                # inside Word for the rest of the batch.
                if document is not None:
                    try:
                        document.Close(constants.wdDoNotSaveChanges)
                    except Exception as e:
                        print(e)
            if os.path.isfile(pdf_name):
                pages = getPdfPageNum(pdf_name)
                valueList.append([fullfilename, str(pages)])
                totalPages += pages
            else:
                print('Conversion failed!')
                return False
        return totalPages, valueList
    except TypeError as e:
        print('Error occurred!')
        print(e)
        return False
    finally:
        # Always shut Word down, including on the early "return False" and
        # on errors; otherwise a hidden WINWORD process is left running.
        if w is not None:
            w.Quit(constants.wdDoNotSaveChanges)
def getPdfPageNum(path):
    """Return the number of pages in the PDF file at the given path.

    path: path of the PDF file; it is opened in binary mode.
    """
    with open(path, "rb") as pdf_stream:
        return len(PdfReader(pdf_stream).pages)
if __name__ == '__main__':
    # Source directory holding the Word documents
    sourcepath = r"C:/Users/Lenovo/Desktop/python代码示例/word/"
    # Target directory the PDF files are saved to
    targetpath = r"C:/Users/Lenovo/Desktop/python代码示例/pdf/"
    filelist = getfilenames(sourcepath, [], '.doc')  # Paths of the Word documents
    valueList = wordtopdf(filelist, targetpath)  # Batch convert Word documents to PDF
    # wordtopdf returns False on failure, so the result must be checked
    # before it is unpacked; indexing it first would raise a TypeError.
    if valueList:
        totalPages, resultList = valueList
        for i in resultList:
            print(i[0], i[1])
        print("Total pages:", str(totalPages))
    else:
        print("No files to count or counting failed!")