我目前在使用一个运行良好的 Python 2.7 脚本:它递归遍历一个庞大的目录树,收集所有文件的路径,获取这些文件的 mtime,并与路径和名称相同、但扩展名为 pdf 的对应文件的 mtime 进行比较。在 Python 2.7 脚本中我使用 scandir.walk(),在 Python 3.7 脚本中使用 os.walk()——后者最近也已更新为使用 scandir 算法(不再有额外的 stat() 调用)。
但是,脚本的 Python 3 版本仍然明显更慢!这并不是算法中 scandir/walk 部分造成的,而显然是由 getmtime 部分(不过它在 Python 2 和 Python 3 中是相同的调用)或者对巨大列表的处理(该列表大约有 500,000 个条目)造成的。
有人知道是什么原因导致了这个问题,以及该如何解决吗?
#!/usr/bin/env python3
#
# Imports
#
import sys
import time
from datetime import datetime
import os
import re
#
# MAIN THREAD
#
# Root of the directory tree that is scanned for documents.
SOURCE_DIR = '/path_to_data/'

# Lower-case extensions of the documents that should have a PDF twin.
DOC_EXTENSIONS = ('.msg', '.doc', '.docx', '.xls', '.xlsx')

# Expected document name: seven digits, an optional suffix ("A<n>", "_<n>"
# or "A<n>_<n>", n being one or two digits), then a dot and at least one
# further character.  Only the start of the name is anchored (re.match).
# Compiled once so the per-file loop does not repeat the pattern lookup.
DOC_NAME_PATTERN = re.compile(
    r'[0-9][0-9][0-9][0-9][0-9][0-9][0-9](([Aa][0-9][0-9]?)?|(_[0-9][0-9]?)?|([Aa][0-9][0-9]?_[0-9][0-9]?)?)\...?.?'
)

# Path fragments whose documents are exempt from the 'Not in database'
# report.  Empty by default so the check can never hit an undefined name;
# add fragments here as needed.
ignore_subdirs = ()

# Enable this to force a certain period to be printed.  These values are
# loop-invariant, so they are computed once instead of once per file.
DateBegin = time.mktime(time.strptime('01/02/2017', "%d/%m/%Y"))
DateEnd = time.mktime(time.strptime('01/03/2017', "%d/%m/%Y"))


def _is_candidate(filename):
    """Return True if *filename* is a document that needs a PDF check.

    The extension is compared case-insensitively; names starting with '~'
    are skipped.
    """
    lowered = filename.lower()
    return lowered.endswith(DOC_EXTENSIONS) and not lowered.startswith('~')


def _collect_documents(source_dir):
    """Return the paths of all candidate documents below *source_dir*.

    The list is sorted in reverse lexicographic order.
    """
    documents = [
        os.path.join(root, filename)
        for root, _directories, filenames in os.walk(source_dir)
        for filename in filenames
        if _is_candidate(filename)
    ]
    documents.sort(reverse=True)
    return documents


def _mtime_if_present(path):
    """Return the mtime of *path*, or None if the file does not exist.

    Only FileNotFoundError is handled; any other OSError propagates.
    """
    try:
        return os.path.getmtime(path)
    except FileNotFoundError:
        return None


def _select_pdf(pdf_file_timestamp, PDF_file_timestamp):
    """Choose between the '.pdf' and '.PDF' variant of a document's PDF.

    Each argument is the mtime of the respective variant, or None when
    that variant is missing.  Returns ``(extension, timestamp)``:

    * neither present -> ``('.pdf', 0)``
    * one present     -> that variant
    * both present    -> the newer one; on a tie the lower-case variant.

    The tie case matters on case-insensitive filesystems, where both names
    resolve to the same file and therefore always have equal timestamps.
    """
    if pdf_file_timestamp is None and PDF_file_timestamp is None:
        return '.pdf', 0
    if PDF_file_timestamp is None:
        return '.pdf', pdf_file_timestamp
    if pdf_file_timestamp is None:
        return '.PDF', PDF_file_timestamp
    if pdf_file_timestamp < PDF_file_timestamp:
        return '.PDF', PDF_file_timestamp
    return '.pdf', pdf_file_timestamp


def _needs_printing(pdffile_timestamp, docfile_timestamp):
    """Return True if the PDF is older than the document or missing (0)."""
    return pdffile_timestamp < docfile_timestamp or pdffile_timestamp == 0


def main(source_dir=SOURCE_DIR):
    """Report, for every document below *source_dir*, whether its PDF is stale.

    For each candidate document the path is printed, followed by
    'Not in database' when the file name does not match DOC_NAME_PATTERN
    (unless the path contains one of ``ignore_subdirs``), followed by a
    line stating whether the PDF has to be (re)printed.
    """
    for docfile_abs in _collect_documents(source_dir):
        print('\n' + docfile_abs)

        # Path without extension; the PDF variants share this stem.
        filepathname_abs, _file_extension = os.path.splitext(docfile_abs)

        # Names that do not follow the numbering scheme are reported unless
        # they live in an ignored sub-directory.
        if DOC_NAME_PATTERN.match(os.path.basename(docfile_abs)) is None:
            if not any(expression in docfile_abs
                       for expression in ignore_subdirs):
                print('Not in database')

        # Decide which PDF variant (if any) is the current one.
        _pdf_extension, pdffile_timestamp = _select_pdf(
            _mtime_if_present(filepathname_abs + '.pdf'),
            _mtime_if_present(filepathname_abs + '.PDF'),
        )

        # A document that vanished or cannot be stat'ed counts as mtime 0.
        try:
            docfile_timestamp = os.path.getmtime(docfile_abs)
        except OSError:
            docfile_timestamp = 0

        # Compare timestamps and report.
        if _needs_printing(pdffile_timestamp, docfile_timestamp):
            print('\tPDF should be printe.')
        else:
            print('\tPDF is up to date.')


if __name__ == '__main__':
    main()
    # Exit
    sys.exit(0)