使用python多进程计算一个目录下所有文件的MD5值(接上文)

简单地把上篇博客的脚本由多线程改为多进程,测试后发现两者耗时差别不大:使用多线程时大约花费30秒,使用多进程时大约花费27秒,效率并没有太大的提升。一方面是因为这个程序本来就不是计算密集型的,使用多进程意义不大;另一方面是因为该目录下的文件不多,看不出差别。

 1 #!/usr/bin/env python
 2 #coding: utf-8
 3 
 4 import os
 5 import time
 6 import hashlib
 7 import torndb
 8 from multiprocessing import Process, Queue
 9 
# Shared task queue: main() fills it with file paths followed by "STOP"
# sentinels, and hands it to every worker process as an argument.
queue = Queue()
11 
def md5_checksum(file_path, chunk_size=8192):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is opened in binary mode and read ``chunk_size`` bytes at a
    time, so the whole file is never held in memory at once.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes requested per read (default 8192).

    Returns:
        The MD5 digest as a hexadecimal string.

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(file_path, 'rb') as fh:
        # iter(callable, sentinel) keeps calling fh.read() until it returns
        # the empty bytes object, i.e. until end of file.
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()
22 
def produce_file(dir_path):
    """Return the paths of all files under ``dir_path``, recursively.

    Args:
        dir_path: Root directory to walk.

    Returns:
        A list of file paths, each built by joining the directory reported
        by ``os.walk`` with the file name. Directories themselves are not
        included.
    """
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(dir_path)
        for name in names
    ]
31 
def insert_mysql(file_path, file_md5):
    """Insert one (file_name, file_md5) row into the ``file_info`` table.

    A new database connection is opened for each call and closed again
    before returning, even if the insert fails.

    Args:
        file_path: Value stored in the ``file_name`` column.
        file_md5: Value stored in the ``file_md5`` column.
    """
    # NOTE(review): host, database, user and password are hard-coded here;
    # consider loading them from configuration instead.
    db = torndb.Connection("localhost", "test", "root", "budong")
    try:
        # Values are passed as query parameters, not formatted into the SQL.
        db.execute(
            "insert into file_info(file_name,file_md5) values(%s,%s)",
            file_path, file_md5)
    finally:
        db.close()
35 
def worker(queue):
    """Consume file paths from ``queue`` until the "STOP" sentinel is read.

    For every path taken from the queue, the file's MD5 is computed and the
    (path, md5) pair is inserted into the database. A file that cannot be
    opened or read is reported on stderr and skipped, so one bad file does
    not terminate the worker and leave the rest of the queue unprocessed.

    Args:
        queue: Object whose ``get()`` returns the next file path, or the
            string "STOP" when there is no more work.

    Returns:
        True once the "STOP" sentinel has been consumed.
    """
    import sys  # local import: only needed for error reporting

    # iter(callable, sentinel) calls queue.get() repeatedly and stops as
    # soon as it returns "STOP".
    for file_path in iter(queue.get, "STOP"):
        try:
            file_md5 = md5_checksum(file_path)
        except (IOError, OSError) as exc:
            sys.stderr.write("skipping %s: %s\n" % (file_path, exc))
            continue
        insert_mysql(file_path, file_md5)
    return True
42 
def main(dir_path="/Users/budong/Downloads/", workers=4):
    """Hash every file under ``dir_path`` using a pool of worker processes.

    All file paths are put on the module-level ``queue`` first. Then
    ``workers`` processes running ``worker`` are started, one "STOP"
    sentinel is queued per process, and the function waits for all of them
    to finish.

    Args:
        dir_path: Directory whose files are processed.
        workers: Number of worker processes to start.
    """
    # Fill the queue with the work items.
    for file_path in produce_file(dir_path):
        queue.put(file_path)

    processes = []
    for _ in range(workers):
        p = Process(target=worker, args=(queue,))
        p.start()
        processes.append(p)
        # One sentinel per worker, queued after all file paths, so each
        # worker's loop ends after it reads exactly one "STOP".
        queue.put("STOP")
    for p in processes:
        p.join()
58 
# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

参考资料:

Python多进程并发(multiprocessing):https://www.coder4.com/archives/3352

正确使用 Multiprocessing 的姿势:https://jingsam.github.io/2015/12/31/multiprocessing.html

A Brief Introduction To Multiprocessing:http://toastdriven.com/blog/2008/nov/11/brief-introduction-multiprocessing/

multiprocessing Basics:https://pymotw.com/2/multiprocessing/basics.html

multiprocessing — Process-based “threading” interface:https://docs.python.org/2/library/multiprocessing.html