使用python多线程计算一个目录下所有文件的MD5值

做运维有时会碰到这种需求:计算一个目录下所有文件的MD5。拿到需求后,可以分解成如下三个步骤:1. 列出目录下所有的文件;2. 计算每个文件的MD5;3. 将结果存入数据库。

一:未使用多线程时,脚本如下,耗时40秒。

 1 #!/usr/bin/env python
 2 #coding: utf-8
 3 
 4 import os
 5 import time
 6 import hashlib
 7 import torndb
 8 
 9 def md5_checksum(file_path):
10     '''计算一个文件的MD5值'''
11     with open(file_path, 'rb') as fh:
12         m = hashlib.md5()
13         while True:
14             data = fh.read(8192)
15             if not data:
16                 break
17             m.update(data)
18         return m.hexdigest()
19 
def produce_file(dir_path):
    """Return the paths of all files under ``dir_path``, recursively.

    Args:
        dir_path: directory to walk with ``os.walk``.

    Returns:
        A list with one entry per file name reported by ``os.walk``, each
        joined with the directory it was found in. Directories themselves
        are not included.
    """
    # The loop variable is called ``name`` rather than ``file`` so it does
    # not shadow the Python 2 builtin of that name.
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(dir_path)
        for name in names
    ]
28 
def insert_mysql(file_path, file_md5):
    """Insert one (file name, MD5) row into the ``file_info`` table.

    A new torndb connection is opened for every call and closed again
    before returning, whether or not the insert succeeded.

    Args:
        file_path: value stored in the ``file_name`` column.
        file_md5: value stored in the ``file_md5`` column.
    """
    # NOTE(review): connection settings are hard-coded here; consider
    # moving them to configuration before using this outside a demo.
    db = torndb.Connection("localhost", "test", "root", "budong")
    try:
        # Values are passed as query parameters, not formatted into the SQL.
        db.execute(
            "insert into file_info(file_name,file_md5) values(%s,%s)",
            file_path, file_md5)
    finally:
        # Close explicitly instead of relying on garbage collection, so a
        # long run does not accumulate open connections.
        db.close()
32 
def main(dir_path="/Users/budong/Downloads/"):
    """Hash every file under ``dir_path`` one after another and store it.

    Each path returned by ``produce_file`` is hashed with ``md5_checksum``
    and written with ``insert_mysql``; the elapsed wall-clock time in
    seconds is printed at the end.

    Args:
        dir_path: directory to scan. Defaults to the path that was
            previously hard-coded.
    """
    start = time.time()
    for file_path in produce_file(dir_path):
        insert_mysql(file_path, md5_checksum(file_path))
    # A single parenthesised argument prints the same text with the
    # Python 2 print statement and is also a valid Python 3 call.
    print("花费时间: %s" % (time.time() - start))


# Run the job only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

二:使用多线程时,脚本如下,耗时25秒(文件大小不一,其中有几个大文件计算MD5很耗时,总耗时主要取决于这些大文件,所以提速并不与线程数成正比)。

 1 #!/usr/bin/env python
 2 #coding: utf-8
 3 
 4 import os
 5 import time
 6 import hashlib
 7 import torndb
 8 import Queue
 9 import threading
10 
# Shared work queue: main() puts file paths on it and the ThreadMd5
# workers take them off.
queue = Queue.Queue()
12 
def md5_checksum(file_path, chunk_size=8192):
    """Compute the MD5 digest of one file and return it as a hex string.

    The file is read in binary mode, ``chunk_size`` bytes at a time, so
    large files are hashed without loading them fully into memory.

    Args:
        file_path: path of the file to hash.
        chunk_size: bytes requested per read; defaults to 8192, the size
            that was previously fixed in the code.

    Returns:
        The hexadecimal digest string.

    Raises:
        ValueError: if ``chunk_size`` is not positive.
        IOError / OSError: if the file cannot be opened or read.
    """
    # A size of 0 would end the loop at once and a negative size would
    # read the whole file in one go, so reject both.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    hasher = hashlib.md5()
    with open(file_path, 'rb') as fh:
        # Keep reading until read() returns the empty byte string (EOF).
        for block in iter(lambda: fh.read(chunk_size), b''):
            hasher.update(block)
    return hasher.hexdigest()
23 
def produce_file(dir_path):
    """Collect the path of every file below ``dir_path``.

    Args:
        dir_path: root directory handed to ``os.walk``.

    Returns:
        A list of paths, one per file name yielded by ``os.walk``, built
        by joining the containing directory with the file name.
        Sub-directories are descended into but not listed.
    """
    # ``name`` is used instead of ``file`` to avoid shadowing the
    # Python 2 builtin.
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(dir_path)
        for name in names
    ]
32 
def insert_mysql(file_path, file_md5):
    """Store one file's path and MD5 as a row of the ``file_info`` table.

    Every call opens its own torndb connection and closes it before
    returning, even when the insert raises. Worker threads therefore never
    share a connection object.

    Args:
        file_path: value for the ``file_name`` column.
        file_md5: value for the ``file_md5`` column.
    """
    # NOTE(review): connection settings are hard-coded here; consider
    # moving them to configuration before using this outside a demo.
    db = torndb.Connection("localhost", "test", "root", "budong")
    try:
        # Values travel as query parameters, not as formatted SQL text.
        db.execute(
            "insert into file_info(file_name,file_md5) values(%s,%s)",
            file_path, file_md5)
    finally:
        # Release the connection deterministically rather than waiting
        # for the object to be garbage-collected.
        db.close()
36 
class ThreadMd5(threading.Thread):
    """Worker thread that hashes queued files and stores the results.

    Each worker repeatedly takes a file path from the queue it was given,
    computes the MD5 with ``md5_checksum`` and writes the row with
    ``insert_mysql``.
    """

    def __init__(self, queue):
        """Remember the queue this worker takes file paths from."""
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        """Process file paths from the queue forever.

        ``task_done()`` is called exactly once for every item taken, even
        when hashing or the insert fails. Otherwise a single bad file would
        leave the queue's unfinished-task count above zero and
        ``queue.join()`` would block forever.
        """
        import traceback

        while True:
            # Take a path, hash the file, then insert the row.
            file_path = self.queue.get()
            try:
                file_md5 = md5_checksum(file_path)
                insert_mysql(file_path, file_md5)
            except Exception:
                # Report the failure and keep this worker alive so the
                # remaining queue items still get processed.
                traceback.print_exc()
            finally:
                # Tell the queue this item is finished.
                self.queue.task_done()
51 
def main(dir_path="/Users/budong/Downloads/", thread_count=3):
    """Hash every file under ``dir_path`` using a pool of worker threads.

    Starts ``thread_count`` ``ThreadMd5`` workers on the module-level
    ``queue``, puts every path from ``produce_file`` on it, waits until all
    of them are marked done, then prints the elapsed wall-clock seconds.

    Args:
        dir_path: directory to scan. Defaults to the path that was
            previously hard-coded.
        thread_count: number of worker threads. Defaults to 3, the value
            that was previously hard-coded.

    Raises:
        ValueError: if ``thread_count`` is less than 1.
    """
    # With no workers nothing would ever call task_done() and queue.join()
    # below would never return.
    if thread_count < 1:
        raise ValueError("thread_count must be at least 1")

    start = time.time()

    # Workers loop forever on queue.get(), so they are daemon threads and
    # do not keep the interpreter alive after main() returns.
    for _ in range(thread_count):
        worker = ThreadMd5(queue)
        worker.daemon = True
        worker.start()

    # Fill the queue with the paths to process.
    for file_path in produce_file(dir_path):
        queue.put(file_path)

    # Block until task_done() has been called for every queued path.
    queue.join()
    # A single parenthesised argument prints the same text with the
    # Python 2 print statement and is also a valid Python 3 call.
    print("花费时间: %s" % (time.time() - start))


# Run the job only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

最终生成到数据库的内容如下:

mysql> select count(*) from file_info;
+----------+
| count(*) |
+----------+
|      413 |
+----------+
1 row in set (0.00 sec)

mysql> select * from file_info limit 10;
+------+--------------------------------------------------------------------------------+----------------------------------+
| id   | file_name                                                                      | file_md5                         |
+------+--------------------------------------------------------------------------------+----------------------------------+
| 2096 | /Users/budong/Downloads/.localized                                             | d41d8cd98f00b204e9800998ecf8427e |
| 2097 | /Users/budong/Downloads/.DS_Store                                              | 79c60a9b4f64e3d30e27efcc95863736 |
| 2098 | /Users/budong/Downloads/002aHdDNjx06ZespG4Vq05040100m9960k01.mp4               | 76a204df1bc348eb0e453add232081a1 |
| 2099 | /Users/budong/Downloads/140062197.flv                                          | c11f911e2b943d8f4de5eb94bf0c0cca |
| 2100 | /Users/budong/Downloads/2016092309131313132667286718613811363.pdf              | fc7020de6d1e8e1fbfd8cc27abb4a49d |
| 2101 | /Users/budong/Downloads/1[56UP人生七年][HD-MKV][比特鱼BitFish8.Com].mkv        | a75879ccf39f5df5d6e358280030a715 |
| 2102 | /Users/budong/Downloads/2[56UP人生七年][HD-MKV][比特鱼BitFish8.Com].mkv        | 8f16765f153539fca427169644fe4545 |
| 2103 | /Users/budong/Downloads/537ce825gw1ez07602fpaj20hsbmdhdu.jpg                   | b0bf6c46793d7b31c3e104d4ea41b8af |
| 2104 | /Users/budong/Downloads/64945e3djw1ex8lklec8zj20c33rdars.jpg                   | de5f912683d3881a8bfa74380252af73 |
| 2105 | /Users/budong/Downloads/67dd74e0gw1f1bxdhh4plj20hsb4yx6r.jpg                   | aa9740c8e234427c28bd3ce52813691a |
+------+--------------------------------------------------------------------------------+----------------------------------+
10 rows in set (0.00 sec)

mysql>

参考资料:

使用 Python 进行线程编程:https://www.ibm.com/developerworks/cn/aix/library/au-threadingpython/

Python Multithreaded Programming:https://www.tutorialspoint.com/python/python_multithreading.htm

Python多线程thread与threading实现:http://www.pythonclub.org/python-basic/threading

Python 多线程:http://www.runoob.com/python/python-multithreading.html

Python模块学习:threading 多线程控制和处理:http://python.jobbole.com/81546/

threading – Manage concurrent threads:https://pymotw.com/2/threading/

threading — Higher-level threading interface:https://docs.python.org/2/library/threading.html