[Python Automation in Office] Batch Convert Word Documents to PDF and Count Page Numbers

Hello everyone, today we will demonstrate how to use Python to batch convert Word documents to PDF format.

Without further ado, let's get started!

PyPDF2 is a Python module that can be used to read, write, and manipulate PDF files. Note that the module name is case-sensitive when imported. To install the PyPDF2 module, follow these steps:

Make sure you have Python installed. You can check if Python is installed by entering python --version in the terminal or command prompt.

Install the PyPDF2 module with pip:
pip install PyPDF2
If the module is not installed, importing it fails with the following error: ModuleNotFoundError: No module named 'PyPDF2'

After the installation is complete, you can use the pypdf2 module in Python to read, write, and manipulate PDF files.

For example, to read the text content of a PDF file, you can import the pypdf2 module in your Python script, then use the PdfFileReader class to read the file and iterate through each page. Here is a simple example code:

import PyPDF2  # The module name is case-sensitive; "import pypdf2" raises ModuleNotFoundError

# This example uses the legacy API (PyPDF2 before 3.0.0).
# From PyPDF2 3.0.0 on, PdfFileReader was removed in favour of PdfReader,
# and getNumPages() in favour of len(reader.pages).
pdf_file = PyPDF2.PdfFileReader('example.pdf')
for page_num in range(pdf_file.getNumPages()):
    page = pdf_file.getPage(page_num)  # Fetch the page at this zero-based index
    print(page.extractText())  # Print the text extracted from the page

This will print the text content of each page in the PDF file.

Note:
Due to updates in PyPDF2, some classes and functions have been deprecated and later removed. Use their replacements instead; for example, replace getNumPages() with len(reader.pages).

Here are two error messages you may encounter; replace the class or function as each message indicates:

PyPDF2.errors.DeprecationError: PdfFileReader is deprecated and was removed in PyPDF2 3.0.0. Use PdfReader instead.

PyPDF2.errors.DeprecationError: reader.getNumPages is deprecated and was removed in PyPDF2 3.0.0. Use len(reader.pages) instead.

Next, we use Python to batch convert Word documents to PDF format
and count the pages of the converted documents, as shown in the code example below:

# -*- coding:utf-8 -*-
import os  # Import system function module
from win32com.client import Dispatch, DispatchEx  # Import functions from the client package of the pywin32 module
from win32com.client import constants  # Import the class that saves COM constants from the client package of the pywin32 module
from win32com.client import gencache    # Import the gencache function from the client package of the pywin32 module
from PyPDF2 import  PdfReader  # Get the page count
import re  # Import the regular expression module

import pythoncom  # Import the module that encapsulates the OLE Automation API, which is a sub-module of pywin32


def getfilenames(filepath='', filelist_out=None, file_ext='all'):
    """Collect the paths of files under a directory tree.

    Args:
        filepath: Root directory to traverse; files in subdirectories are
            included.
        filelist_out: Optional list that receives the matches. When given, it
            is extended and sorted in place and also returned. When omitted,
            a fresh list is created for each call.
        file_ext: Which files to collect. 'all' (the default) collects every
            file; '.doc' collects Word documents ('.doc' and '.docx'); any
            other value is compared exactly against the file's extension.

    Returns:
        The sorted list of collected file paths. Paths of Word documents
        collected with file_ext='.doc' have backslashes replaced by forward
        slashes; other paths are returned as produced by os.path.join.
    """
    # A list used as a default value is shared across calls, so results from
    # one call would leak into the next; create the list per call instead.
    if filelist_out is None:
        filelist_out = []

    # Traverse all files under filepath, including files under subdirectories
    for fpath, _dirs, fs in os.walk(filepath):
        for f in fs:
            fi_d = os.path.join(fpath, f)
            ext = os.path.splitext(fi_d)[1]
            if file_ext == '.doc':  # Collect Word documents only
                if ext in ('.doc', '.docx'):
                    filelist_out.append(re.sub(r'\\', '/', fi_d))
            elif file_ext == 'all' or ext == file_ext:
                filelist_out.append(fi_d)

    # Sort once after the walk instead of once per visited directory
    filelist_out.sort()
    return filelist_out  # Return the complete file path list

# Convert Word to PDF (multiple files)
def _export_word_to_pdf(word, source, pdf_name):
    """Open one Word document read-only, export it as a PDF, and close it.

    Errors are printed instead of raised; the caller decides whether the
    conversion succeeded by checking that the PDF file exists afterwards.
    """
    document = None
    try:
        document = word.Documents.Open(source, ReadOnly=1)
        document.ExportAsFixedFormat(
            pdf_name,
            constants.wdExportFormatPDF,
            Item=constants.wdExportDocumentWithMarkup,
            CreateBookmarks=constants.wdExportCreateHeadingBookmarks,
        )
    except Exception as e:  # Handle exceptions
        print(e)
    finally:
        # Close every document that was opened so it does not stay open (and
        # locked) inside Word for the rest of the batch.
        if document is not None:
            try:
                document.Close(constants.wdDoNotSaveChanges)
            except Exception as e:
                print(e)


def wordtopdf(filelist, targetpath):
    """Convert Word documents to PDF and count the pages of each result.

    Args:
        filelist: Paths of the Word documents to convert.
        targetpath: Directory in which the PDF files are saved. It is created
            if it does not exist yet.

    Returns:
        A tuple (totalPages, valueList) on success, where valueList holds one
        [source path, page count as str] entry per document. Returns False as
        soon as one document cannot be converted, or when a TypeError occurs.
    """
    totalPages = 0   # Record the total number of pages
    valueList = []
    # Word does not share this process's current directory, so it must be
    # given absolute paths. Resolve them directly instead of calling
    # os.chdir(), which changed the working directory of the whole process and
    # broke relative input paths from the second document onwards.
    targetdir = os.path.abspath(targetpath)
    os.makedirs(targetdir, exist_ok=True)
    word = None
    com_initialized = False
    try:
        pythoncom.CoInitialize()   # Initialize COM for this thread to avoid the "CoInitialize not called yet" error
        com_initialized = True
        gencache.EnsureModule('{00020905-0000-0000-C000-000000000046}', 0, 8, 4)
        # Start conversion
        word = Dispatch("Word.Application")
        for fullfilename in filelist:
            source = os.path.abspath(fullfilename)
            stem = os.path.splitext(os.path.basename(source))[0]  # File name without its extension
            pdf_name = os.path.join(targetdir, stem + ".pdf")

            _export_word_to_pdf(word, source, pdf_name)

            if os.path.isfile(pdf_name):  # Check if the file exists
                pages = getPdfPageNum(pdf_name)   # Get the page count
                valueList.append([fullfilename, str(pages)])
                totalPages += pages  # Accumulate the page count
            else:
                print('Conversion failed!')
                return False
        return totalPages, valueList  # Return the total number of pages and the page count of each document
    except TypeError as e:
        print('Error occurred!')
        print(e)
        return False
    finally:
        # Always shut Word down, including on the early "return False" path
        # and on unexpected exceptions, so no Word process is left running.
        if word is not None:
            try:
                word.Quit(constants.wdDoNotSaveChanges)  # Exit the Word application
            except Exception as e:
                print(e)
            word = None  # Drop the COM reference before uninitializing COM
        if com_initialized:
            pythoncom.CoUninitialize()  # Balance the CoInitialize call above
def getPdfPageNum(path):
    """Count the number of pages in a PDF document.

    path: absolute path of the PDF file
    Returns the page count as an integer.
    """
    # Open in binary mode and let the reader parse the stream; the file is
    # closed automatically when the with-block exits.
    with open(path, "rb") as pdf_stream:
        return len(PdfReader(pdf_stream).pages)

if __name__ == '__main__':
    sourcepath = r"C:/Users/Lenovo/Desktop/python代码示例/word/"  # Specify the source path (path where the Word documents are located)
    targetpath = r"C:/Users/Lenovo/Desktop/python代码示例/pdf/"  # Specify the target path (PDF save path)
    filelist = getfilenames(sourcepath,[],'.doc')  # Get the paths of the Word documents
    valueList = wordtopdf(filelist,targetpath)  # Batch convert Word documents to PDF
    # wordtopdf returns False on failure, so the result must be checked before
    # it is indexed. An empty per-document list means there was nothing to
    # count, which is reported through the same message.
    if valueList and valueList[1]:
        resultList = valueList[1]  # Get the statistical result
        for i in resultList:
            print(i[0],i[1])  # Source document path and its page count
        totalPages = str(valueList[0]) # Total number of pages
        print("Total pages:",totalPages)
    else:
        print("No files to count or counting failed!")