如何使用Python计算一个目录的大小?

在我重新发明这个特殊的轮子之前,有没有人有一个很好的用Python计算目录大小的例程?如果该例程能以Mb/Gb等格式格式化大小,那就太好了。

263251 次浏览

这将遍历所有子目录;文件大小总和:

import os


def get_size(start_path='.'):
    """Return the total size in bytes of the files below *start_path*.

    The tree is traversed with ``os.walk``; every file name it yields is
    measured with ``os.path.getsize`` unless the path is a symbolic link.
    """
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            # skip if it is symbolic link
            if not os.path.islink(fp):
                total_size += os.path.getsize(fp)

    return total_size


print(get_size(), 'bytes')

还有一个使用os.listdir (不包括子目录)的线性程序:

import os
# Sum the sizes of the regular files directly inside the current directory.
# The bare names returned by os.listdir('.') are only valid paths because the
# listed directory is also the working directory.
sum(os.path.getsize(f) for f in os.listdir('.') if os.path.isfile(f))

参考:

**更新** 改为使用os.path.getsize,这比使用os.stat().st_size方法更清晰。

感谢ghostdog74指出这一点!

os.stat - st_size给出以字节为单位的大小。也可用于获取文件大小等文件相关信息。

import os


# Total st_size of the entries in the current directory for which
# DirEntry.is_file() is true (sub-directories are not descended into).
nbytes = sum(d.stat().st_size for d in os.scandir('.') if d.is_file())

更新2018

如果你使用的是Python 3.4或更早版本,那么你可以考虑使用第三方scandir包提供的更有效的walk方法。在Python 3.5及以后版本中,此包已被合并到标准库中,并且os.walk的性能得到了相应的提升。

更新2019

最近我越来越多地使用pathlib,这里有一个pathlib解决方案:

from pathlib import Path


# Recursively sum st_size of every path under the current directory that
# is_file() reports as a file.
root_directory = Path('.')
sum(f.stat().st_size for f in root_directory.glob('**/*') if f.is_file())


monknut的答案很好,但遇到损坏的符号链接时会失败,所以你还必须检查该路径是否真的存在:

# Fragment meant for the inner loop of an os.walk-based size function:
# os.path.exists() is False for a broken symbolic link, so such paths are
# skipped instead of making os.stat() raise.
if os.path.exists(fp):
    total_size += os.stat(fp).st_size

要获取一个文件的大小,可以使用os.path.getsize()

>>> import os
>>> os.path.getsize("/path/file")
35L

它以字节为单位报告。

下面是一个递归函数(它递归地累加所有子文件夹及其各自文件的大小),返回的字节数与在linux中运行“du -sb .”时完全相同(其中“.”表示“当前文件夹”):

import os


def getFolderSize(folder):
    """Return the size in bytes of *folder*, recursively.

    The result includes ``os.path.getsize`` of the directory entry itself,
    of every regular file inside it, and (recursively) of every
    sub-directory. Entries that are neither files nor directories are
    ignored.
    """
    total_size = os.path.getsize(folder)
    for item in os.listdir(folder):
        itempath = os.path.join(folder, item)
        if os.path.isfile(itempath):
            total_size += os.path.getsize(itempath)
        elif os.path.isdir(itempath):
            total_size += getFolderSize(itempath)
    return total_size


# Call form of print works on both Python 2 and Python 3 for a single argument.
print("Size: " + str(getFolderSize(".")))

接受的答案不考虑硬链接或软链接,并将这些文件计算两次。您可能希望跟踪已看到的inode,而不是为这些文件添加大小。

import os
def get_size(start_path='.'):
    """Return the total size in bytes under *start_path*, counting each inode once.

    Files that cannot be stat'ed (``OSError``) are skipped. A file reachable
    through several names (hard links, or symlinks resolved by ``os.stat``)
    contributes its size only the first time it is seen.
    """
    total_size = 0
    # Inode numbers are only unique per device, so the device id is part of the key.
    seen = set()
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            try:
                stat = os.stat(fp)
            except OSError:
                continue

            key = (stat.st_dev, stat.st_ino)
            if key in seen:
                continue
            seen.add(key)

            total_size += stat.st_size

    return total_size


# Call form of print works on both Python 2 and Python 3 for a single argument.
print(get_size())

你可以这样做:

# NOTE: the `commands` module exists only in Python 2 (it was removed in Python 3).
import commands
# Run `du -sh` through the shell and keep the first whitespace-separated
# field of whatever it printed.
size = commands.getoutput('du -sh /path/').split()[0]

在这种情况下,我没有在返回之前测试结果,如果你想要,你可以用commands.getstatusoutput检查它。

Chris的回答很好,但可以通过使用set来检查已看到的目录来使其更加惯用,这也避免了对控制流使用异常:

def directory_size(path):
    """Return the total size in bytes under *path*, counting each inode once.

    Files whose ``os.stat`` raises ``OSError`` are skipped.
    """
    total_size = 0
    # Inode numbers are only unique per device, so the device id is part of the key.
    seen = set()

    for dirpath, dirnames, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)

            try:
                stat = os.stat(fp)
            except OSError:
                continue

            key = (stat.st_dev, stat.st_ino)
            if key in seen:
                continue
            seen.add(key)

            total_size += stat.st_size

    return total_size  # size in bytes

递归的一行代码:

def getFolderSize(p):
    """Return the summed size in bytes of all regular files below directory *p*.

    Every entry that is not a regular file is treated as a directory and
    recursed into.
    """
    from functools import partial
    prepend = partial(os.path.join, p)
    # A generator avoids building a throwaway list just to sum it.
    return sum(
        os.path.getsize(f) if os.path.isfile(f) else getFolderSize(f)
        for f in map(prepend, os.listdir(p))
    )
import os


def get_size(path):
    """Return the total size in bytes of the files below *path*.

    Paths for which ``os.path.exists`` is false (e.g. broken symbolic
    links) are skipped.
    """
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(path):
        for f in filenames:
            # The path must be built before it can be tested for existence.
            fp = os.path.join(dirpath, f)
            if os.path.exists(fp):
                total_size += os.path.getsize(fp)

    return total_size   # in bytes

感谢monkut &troex !

这个脚本告诉你哪个文件是CWD中最大的,也告诉你这个文件在哪个文件夹中。这个脚本在win8和python 3.3.3 shell上对我有效。

import os


# Find the largest file below the current working directory and print its
# path followed by its size in bytes.
folder = os.getcwd()  # os.cwd() does not exist; os.getcwd() is the real call

number = 0   # size in bytes of the largest file seen so far
string = ""  # path of that file

for root, dirs, files in os.walk(folder):
    for file in files:
        pathname = os.path.join(root, file)
        file_size = os.path.getsize(pathname)  # stat once per file
        if number < file_size:
            number = file_size
            string = pathname

print(string)
print()
print(number)
print("Number in bytes")

到目前为止,建议的一些方法实现了递归,其他方法使用shell或不会生成格式整齐的结果。当您的代码只是针对Linux平台的一次性代码时,您可以像往常一样用一行程序获得格式化(包括递归)的结果。除了最后一行中的print,它适用于当前版本的python2和python3:

du.py
-----
#!/usr/bin/python3
import subprocess


def du(path):
    """disk usage in human readable format (e.g. '2,1GB')

    Runs the external ``du -sh`` command on *path* and returns the first
    whitespace-separated field of its output, decoded as UTF-8.
    ``subprocess.CalledProcessError`` propagates if the command fails.
    """
    return subprocess.check_output(['du', '-sh', path]).split()[0].decode('utf-8')


if __name__ == "__main__":
    print(du('.'))

简单,高效,将工作于文件和多级目录:

$ chmod 750 du.py
$ ./du.py
2,9M

问题的第二部分

def human(size):
    """Format *size* (a byte count) with the largest fitting unit from B to TB.

    The value is divided by 1024 until it drops below 1024 or the last
    unit (TB) is reached, then rendered as ``"%f %s"``.
    """
    B = "B"
    KB = "KB"
    MB = "MB"
    GB = "GB"
    TB = "TB"
    UNITS = [B, KB, MB, GB, TB]
    HUMANFMT = "%f %s"
    HUMANRADIX = 1024.

    for u in UNITS[:-1]:
        if size < HUMANRADIX:
            return HUMANFMT % (size, u)
        size /= HUMANRADIX

    # Anything still >= 1024 after the loop is expressed in the last unit.
    return HUMANFMT % (size, UNITS[-1])

你说的一句话… 下面是一行代码:

# For every directory yielded by os.walk(path), sum os.path.getsize of its
# files, then sum those per-directory totals. `path` must be defined by the caller.
sum([sum(map(lambda fname: os.path.getsize(os.path.join(directory, fname)), files)) for directory, folders, files in os.walk(path)])

尽管我可能会把它分开,它不执行检查。

要转换为kb,请参阅可重用库获得人类可读版本的文件大小?并将其代入

下面的脚本打印指定目录的所有子目录的目录大小。它还试图(如果可能的话)从缓存递归函数的调用中获益。如果省略一个参数,脚本将在当前目录中工作。输出按目录大小从大到小排序。所以你可以根据自己的需要进行调整。

PS我已经使用配方578019以人类友好的格式显示目录大小(http://code.activestate.com/recipes/578019/)

from __future__ import print_function
import os
import sys
import operator


def null_decorator(ob):
    """Return *ob* unchanged; a no-op stand-in where a decorator is expected."""
    return ob


# Use functools.lru_cache where the interpreter provides it (Python 3.2+);
# otherwise fall back to the no-op decorator.
if sys.version_info >= (3, 2, 0):
    import functools
    my_cache_decorator = functools.lru_cache(maxsize=4096)
else:
    my_cache_decorator = null_decorator


start_dir = os.path.normpath(os.path.abspath(sys.argv[1])) if len(sys.argv) > 1 else '.'


@my_cache_decorator
def get_dir_size(start_path='.'):
    """Return the total size in bytes of the files below *start_path*.

    Uses ``os.scandir`` when the ``os`` module provides it and falls back
    to ``os.listdir`` otherwise. In the scandir branch symbolic links are
    not followed. Results may be memoised by ``my_cache_decorator``, so a
    repeated call with the same path can return a cached value.
    """
    total_size = 0
    if hasattr(os, 'scandir'):
        # using fast 'os.scandir' method (new in version 3.5)
        for entry in os.scandir(start_path):
            if entry.is_dir(follow_symlinks=False):
                total_size += get_dir_size(entry.path)
            elif entry.is_file(follow_symlinks=False):
                total_size += entry.stat().st_size
    else:
        # using slow, but compatible 'os.listdir' method
        for entry in os.listdir(start_path):
            full_path = os.path.abspath(os.path.join(start_path, entry))
            if os.path.isdir(full_path):
                total_size += get_dir_size(full_path)
            elif os.path.isfile(full_path):
                total_size += os.path.getsize(full_path)
    return total_size


def get_dir_size_walk(start_path='.'):
    """Return the total size in bytes of the files below *start_path* via ``os.walk``."""
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size


def bytes2human(n, format='%(value).0f%(symbol)s', symbols='customary'):
    """
    (c) http://code.activestate.com/recipes/578019/

    Convert n bytes into a human readable string based on format.
    symbols can be either "customary", "customary_ext", "iec" or "iec_ext",
    see: http://goo.gl/kTQMs

    *n* is truncated with ``int()``; a negative value raises ``ValueError``.
    *format* is a %-style template that may reference ``value`` and ``symbol``.

    >>> bytes2human(0)
    '0B'
    >>> bytes2human(0.9)
    '0B'
    >>> bytes2human(1)
    '1B'
    >>> bytes2human(1.9)
    '1B'
    >>> bytes2human(1024)
    '1K'
    >>> bytes2human(1048576)
    '1M'
    >>> bytes2human(1099511627776127398123789121)
    '909Y'

    >>> bytes2human(9856, symbols="customary")
    '10K'
    >>> bytes2human(9856, symbols="customary_ext")
    '10kilo'
    >>> bytes2human(9856, symbols="iec")
    '10Ki'
    >>> bytes2human(9856, symbols="iec_ext")
    '10kibi'

    >>> bytes2human(10000, "%(value).1f %(symbol)s/sec")
    '9.8 K/sec'

    >>> # precision can be adjusted by playing with %f operator
    >>> bytes2human(10000, format="%(value).5f %(symbol)s")
    '9.76562 K'
    """
    SYMBOLS = {
        'customary'     : ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'),
        'customary_ext' : ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa',
                           'zetta', 'yotta'),
        'iec'           : ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'),
        'iec_ext'       : ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi',
                           'zebi', 'yobi'),
    }
    n = int(n)
    if n < 0:
        raise ValueError("n < 0")
    symbols = SYMBOLS[symbols]
    # Map every symbol after the first to its power-of-1024 threshold.
    prefix = {}
    for i, s in enumerate(symbols[1:]):
        prefix[s] = 1 << (i + 1) * 10
    # Try the largest unit first so the biggest fitting one wins.
    for symbol in reversed(symbols[1:]):
        if n >= prefix[symbol]:
            value = float(n) / prefix[symbol]
            return format % dict(symbol=symbol, value=value)
    return format % dict(symbol=symbols[0], value=n)


############################################################
###
###  main ()
###
############################################################
if __name__ == '__main__':
    # Compute the size of every sub-directory below start_dir and print them
    # largest first, followed by cache statistics where available.
    dir_tree = {}
    ### version, that uses 'slow' [os.walk method]
    #get_size = get_dir_size_walk
    ### this recursive version can benefit from caching the function calls (functools.lru_cache)
    get_size = get_dir_size

    for root, dirs, files in os.walk(start_dir):
        for d in dirs:
            dir_path = os.path.join(root, d)
            if os.path.isdir(dir_path):
                dir_tree[dir_path] = get_size(dir_path)

    for d, size in sorted(dir_tree.items(), key=operator.itemgetter(1), reverse=True):
        print('%s\t%s' % (bytes2human(size, format='%(value).2f%(symbol)s'), d))

    print('-' * 80)
    if sys.version_info >= (3, 2, 0):
        print(get_dir_size.cache_info())

样例输出:

37.61M  .\subdir_b
2.18M   .\subdir_a
2.17M   .\subdir_a\subdir_a_2
4.41K   .\subdir_a\subdir_a_1
----------------------------------------------------------
CacheInfo(hits=2, misses=4, maxsize=4096, currsize=4)

编辑:根据user2233949的建议,将null_decorator移到上面

Python 3.5使用os.scandir递归文件夹大小

def folder_size(path='.'):
    """Return the total size in bytes of the files below *path* using ``os.scandir``.

    Entries are classified with ``is_file()`` / ``is_dir()`` using their
    default symlink handling; directories are summed recursively.
    """
    total = 0
    for entry in os.scandir(path):
        if entry.is_file():
            total += entry.stat().st_size
        elif entry.is_dir():
            total += folder_size(entry.path)
    return total

有点晚了,但只要你安装了glob2和humanize,一行代码就能完成。注意,在Python 3中,默认的iglob具有递归模式。如何针对Python 3修改代码,留给读者作为简单练习。

>>> import os
>>> from humanize import naturalsize
>>> from glob2 import iglob
>>> naturalsize(sum(os.path.getsize(x) for x in iglob('/var/**')))
'546.2 MB'

不可否认,这有点像黑客,只适用于Unix/Linux。

它匹配du -sb .,因为实际上这是一个运行du -sb .命令的Python bash包装器。

import subprocess


def system_command(cmd):
    """Function executes cmd parameter as a bash command.

    *cmd* is run through the shell (``shell=True``); the function waits
    for it to finish and returns ``(stdout, stderr)`` as captured bytes.
    Only pass trusted command strings.
    """
    p = subprocess.Popen(cmd,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         shell=True)
    stdout, stderr = p.communicate()
    return stdout, stderr


# Take the first whitespace-separated field of the command's stdout and
# convert it to an integer.
size = int(system_command('du -sb . ')[0].split()[0])

我使用python 2.7.13和scandir,这里是我的一行递归函数,以获得文件夹的总大小:

from scandir import scandir
def getTotFldrSize(path):
    """Return the total size in bytes of the files below *path*.

    Symbolic links are not followed. The directory is scanned once and
    each entry is classified as a file or a sub-directory.
    """
    total = 0
    for s in scandir(path):
        if s.is_file(follow_symlinks=False):
            total += s.stat(follow_symlinks=False).st_size
        elif s.is_dir(follow_symlinks=False):
            total += getTotFldrSize(s.path)
    return total


>>> print getTotFldrSize('.')
1203245680

https://pypi.python.org/pypi/scandir

使用sh库:其中的du模块可以完成这项工作:

pip install sh


import sh
print( sh.du("-s", ".") )
91154728        .

如果你想传递asterix,使用glob,如在这里所述。

要把数值转换成人类可读的形式,请使用humanize:

pip install humanize


import humanize
print( humanize.naturalsize( 91157384 ) )
91.2 MB

当计算子目录的大小时,它应该更新其父目录的文件夹大小,这将一直进行下去,直到它到达根父目录。

下面的函数计算文件夹及其所有子文件夹的大小。

import os


def folder_size(path):
    """Return the size of *path* and all its sub-folders, as bytes / 1,000,000.

    Each directory's own file total is recorded and then added to every
    ancestor up to the root, so the root entry ends up holding the grand
    total. ``os.walk`` visits parents before children, which is what makes
    the parent lookups below safe.
    """
    parent = {}  # path to parent path mapper
    sizes = {}  # storing the size of directories (renamed: no longer shadows the function)
    folder = os.path.realpath(path)

    for root, _, filenames in os.walk(folder):
        if root == folder:
            parent[root] = -1  # the root folder will not have any parent
            sizes[root] = 0.0  # initializing the size to 0
        elif root not in parent:
            immediate_parent_path = os.path.dirname(root)  # extract the immediate parent of the subdirectory
            parent[root] = immediate_parent_path  # store the parent of the subdirectory
            sizes[root] = 0.0  # initialize the size to 0

        total_size = 0
        for filename in filenames:
            filepath = os.path.join(root, filename)
            total_size += os.stat(filepath).st_size  # computing the size of the files under the directory
        sizes[root] = total_size  # store the updated size

        temp_path = root  # for subdirectories, we need to update the size of the parent till the root parent
        while parent[temp_path] != -1:
            sizes[parent[temp_path]] += total_size
            temp_path = parent[temp_path]

    return sizes[folder] / 1000000.0

不管怎样……树命令免费完成所有这些工作:

tree -h --du /path/to/dir  # files and dirs
tree -h -d --du /path/to/dir  # dirs only

我喜欢Python,但到目前为止,这个问题最简单的解决方案不需要新的代码。

我在这里有点晚(和新),但我选择使用subprocess模块和Linux中的'du'命令行来检索文件夹大小的准确值,单位为MB。我必须使用if和elif用于根文件夹,否则子进程会由于返回的非零值而引发错误。

import subprocess
import os


#
# get folder size
#
def get_size(self, path):
    """Return the number reported by ``sudo du -s`` for *path*, divided by 1,000,000.

    The root folder is handled separately: ``subprocess.getoutput`` is used
    there because it does not raise on a non-zero exit status, and only the
    last output line is parsed. Raises ``ValueError`` if *path* does not exist.
    NOTE(review): ``du -s`` without ``-b`` usually reports blocks rather than
    bytes, so confirm the unit of the returned value.
    """
    if os.path.exists(path) and path != '/':
        out = subprocess.check_output(['sudo', 'du', '-s', path])
        # Output looks like b"<number>\t<path>\n"; decode instead of parsing repr().
        return float(out.decode().split('\t')[0]) / 1000000
    elif os.path.exists(path) and path == '/':
        lines = subprocess.getoutput(['sudo du -s /']).split('\n')
        # Earlier lines may be error messages; the summary is the last line.
        val = lines[-1].replace('/', '').replace(' ', '')
        return float(val) / 1000000
    else:
        raise ValueError

如果你使用的是Windows操作系统,你可以:

通过以下方式安装pywin32模块:

pip install pywin32

然后编码如下:

import win32com.client as com


def get_folder_size(path):
    """Print the size of folder *path* in MB using the Scripting.FileSystemObject COM object.

    Any exception is caught and printed rather than propagated; nothing is returned.
    """
    try:
        fso = com.Dispatch("Scripting.FileSystemObject")
        folder = fso.GetFolder(path)
        size = str(round(folder.Size / 1048576))  # 1048576 = 1024 * 1024
        print("Size: " + size + " MB")
    except Exception as e:
        print("Error --> " + str(e))

它很方便:

import os
import stat


size = 0
path_ = ""


def calculate(path=None):
    """Store the total size in bytes of the files below *path* in the global ``size``.

    The walked path is remembered in the global ``path_``. When *path* is
    omitted, the ``SYSTEMROOT`` environment variable is used; it is looked
    up at call time so that merely defining the function does not fail on
    systems where the variable is not set.
    """
    global size, path_
    if path is None:
        path = os.environ["SYSTEMROOT"]
    size = 0
    path_ = path

    for x, y, z in os.walk(path):
        for i in z:
            size += os.path.getsize(x + os.sep + i)


def cevir(x):
    """Print byte count *x* in bytes, kilobytes, megabytes and gigabytes.

    Each line is prefixed with the module-level ``path_``.
    """
    print(path_, x, "Byte")
    print(path_, x/1024, "Kilobyte")
    print(path_, x/1048576, "Megabyte")
    print(path_, x/1073741824, "Gigabyte")


# Raw string: in a normal literal "\U" starts a unicode escape and is a
# syntax error on Python 3.
calculate(r"C:\Users\Jundullah\Desktop")
cevir(size)


Output:
C:\Users\Jundullah\Desktop 87874712211 Byte
C:\Users\Jundullah\Desktop 85815148.64355469 Kilobyte
C:\Users\Jundullah\Desktop 83803.85609722137 Megabyte
C:\Users\Jundullah\Desktop 81.83970321994275 Gigabyte

这是一个递归地完成它的一行代码(从Python 3.5开始提供递归选项):

import os
import glob
# Print the summed size of all regular files matched recursively from the
# current directory, divided by 1024*1024.
print(sum(os.path.getsize(f) for f in glob.glob('**', recursive=True) if os.path.isfile(f))/(1024*1024))

获取目录大小

解决方案的性质:

  • 返回两者:表观大小(文件中的字节数)和文件使用的实际磁盘空间。
  • 硬链接文件只计算一次
  • 计数符号链接的方式与du相同
  • 不使用递归
  • 使用st.st_blocks作为所使用的磁盘空间,因此只适用于类unix系统

代码:

import os




def du(path):
    """Return ``(apparent_bytes, disk_bytes)`` for *path*.

    ``apparent_bytes`` is the sum of ``st_size``; ``disk_bytes`` is the sum
    of ``st_blocks * 512``. Symbolic links contribute only their own
    ``st_size`` and are never followed; a hard-linked file is counted once.
    Directories themselves are included in both totals.
    """
    if os.path.islink(path):
        return (os.lstat(path).st_size, 0)
    if os.path.isfile(path):
        st = os.lstat(path)
        return (st.st_size, st.st_blocks * 512)
    apparent_total_bytes = 0
    total_bytes = 0
    have = set()  # inode numbers already counted; a set keeps lookups O(1)
    for dirpath, dirnames, filenames in os.walk(path):
        dir_st = os.lstat(dirpath)
        apparent_total_bytes += dir_st.st_size
        total_bytes += dir_st.st_blocks * 512
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if os.path.islink(fp):
                apparent_total_bytes += os.lstat(fp).st_size
                continue
            st = os.lstat(fp)
            if st.st_ino in have:
                continue  # skip hardlinks which were already counted
            have.add(st.st_ino)
            apparent_total_bytes += st.st_size
            total_bytes += st.st_blocks * 512
        for d in dirnames:
            dp = os.path.join(dirpath, d)
            # os.walk does not descend into symlinked directories, so count
            # the link entry itself here.
            if os.path.islink(dp):
                apparent_total_bytes += os.lstat(dp).st_size
    return (apparent_total_bytes, total_bytes)

使用示例:

>>> du('/lib')
(236425839, 244363264)


$ du -sb /lib
236425839   /lib
$ du -sB1 /lib
244363264   /lib

人类可读的文件大小

解决方案的性质:

代码:

def humanized_size(num, suffix='B', si=False):
    """Format *num* with a unit prefix followed by *suffix*.

    With ``si=True`` the divisor is 1000 and the prefixes are K, M, G, ...;
    otherwise the divisor is 1024 and the prefixes are Ki, Mi, Gi, ....
    Values too large for the listed prefixes use Y / Yi.
    """
    if si:
        units = ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']
        last_unit = 'Y'
        div = 1000.0
    else:
        units = ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']
        last_unit = 'Yi'
        div = 1024.0
    for unit in units:
        if abs(num) < div:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= div
    return "%.1f%s%s" % (num, last_unit, suffix)

使用示例:

>>> humanized_size(236425839)
'225.5MiB'
>>> humanized_size(236425839, si=True)
'236.4MB'
>>> humanized_size(236425839, si=True, suffix='')
'236.4M'

使用pathlib,我想出了这个一行程序来获取文件夹的大小:

# Sum st_size of every path rglob('*') yields under `folder`; there is no
# is_file() filter, so directory entries are included in the total.
sum(file.stat().st_size for file in Path(folder).rglob('*'))

这是我为一个漂亮的格式化输出:

from pathlib import Path




def get_folder_size(folder):
    """Return the summed ``st_size`` of everything below *folder* as a ``ByteSize``.

    Every path yielded by ``rglob('*')`` is counted, directories included.
    """
    return ByteSize(sum(file.stat().st_size for file in Path(folder).rglob('*')))




class ByteSize(int):
    """An ``int`` byte count with unit attributes and human-readable formatting.

    Instances expose ``bytes``/``B``, ``kilobytes``/``KB``, ``megabytes``/``MB``,
    ``gigabytes``/``GB`` and ``terabytes``/``TB`` (powers of 1024), plus
    ``readable``, a ``(suffix, value)`` pair used by ``str()`` and ``format()``.
    Addition, subtraction and multiplication with integers return ``ByteSize``.
    """

    _KB = 1024
    _suffixes = 'B', 'KB', 'MB', 'GB', 'TB'

    def __new__(cls, *args, **kwargs):
        return super().__new__(cls, *args, **kwargs)

    def __init__(self, *args, **kwargs):
        self.bytes = self.B = int(self)
        self.kilobytes = self.KB = self / self._KB**1
        self.megabytes = self.MB = self / self._KB**2
        self.gigabytes = self.GB = self / self._KB**3
        self.terabytes = self.TB = self / self._KB**4
        # NOTE: 1024**4 bytes is a terabyte-scale unit, not a petabyte. These
        # two names are kept with their historical value for backward
        # compatibility; prefer TB / terabytes.
        self.petabytes = self.PB = self / self._KB**4
        *suffixes, last = self._suffixes
        # Pick the first unit in which the magnitude drops below 1024, so
        # that 0, 1 and exact powers of 1024 get a sensible unit too.
        suffix = next((
            suffix
            for suffix in suffixes
            if abs(getattr(self, suffix)) < self._KB
        ), last)
        self.readable = suffix, getattr(self, suffix)

        super().__init__()

    def __str__(self):
        return self.__format__('.2f')

    def __repr__(self):
        return '{}({})'.format(self.__class__.__name__, super().__repr__())

    def __format__(self, format_spec):
        suffix, val = self.readable
        return '{val:{fmt}} {suf}'.format(val=val, fmt=format_spec, suf=suffix)

    def _wrap(self, result):
        # Let Python try the other operand when int arithmetic declines.
        if result is NotImplemented:
            return result
        return self.__class__(result)

    def __sub__(self, other):
        return self._wrap(super().__sub__(other))

    def __add__(self, other):
        return self._wrap(super().__add__(other))

    def __mul__(self, other):
        return self._wrap(super().__mul__(other))

    def __rsub__(self, other):
        # Reflected subtraction is other - self, not self - other.
        return self._wrap(super().__rsub__(other))

    def __radd__(self, other):
        return self._wrap(super().__radd__(other))

    def __rmul__(self, other):
        return self._wrap(super().__rmul__(other))

用法:

>>> size = get_folder_size("c:/users/tdavis/downloads")
>>> print(size)
5.81 GB
>>> size.GB
5.810891855508089
>>> size.gigabytes
5.810891855508089
>>> size.PB
0.005674699077644618
>>> size.MB
5950.353260040283
>>> size
ByteSize(6239397620)

我还遇到了这个问题,它有一些更紧凑,可能更性能的打印文件大小的策略。

使用pathlib在Python 3.6上工作的解决方案。

from pathlib import Path


# Sum st_size of every path matched recursively below "path"; there is no
# is_file() filter, so directory entries are included in the total.
sum([f.stat().st_size for f in Path("path").glob("**/*")])

Python 3.6 +递归文件夹/文件大小使用os.scandir。与@blakev的回答一样强大,但在EAFP python style中更短。

import os


def size(path, *, follow_symlinks=False):
    """Return the total size in bytes of *path* (a file or a directory tree).

    Directories are summed recursively via ``os.scandir``; anything that is
    not a directory is measured with ``os.stat``. With the default
    ``follow_symlinks=False`` a symbolic link contributes only the size of
    the link itself and is never descended into.
    """
    # os.scandir() would otherwise list the target of a symlinked directory
    # (and raise on a broken link) regardless of follow_symlinks.
    if not follow_symlinks and os.path.islink(path):
        return os.lstat(path).st_size
    try:
        with os.scandir(path) as it:
            return sum(size(entry, follow_symlinks=follow_symlinks) for entry in it)
    except NotADirectoryError:
        return os.stat(path, follow_symlinks=follow_symlinks).st_size

python3.5 +

from pathlib import Path


def get_size(folder: str) -> int:
    """Return the summed ``st_size`` of every path below *folder*.

    Every path yielded by ``rglob('*')`` is counted, directories included.
    A folder that does not exist yields nothing, so the result is 0.
    """
    return sum(p.stat().st_size for p in Path(folder).rglob('*'))

用法::

In [6]: get_size('/etc/not-exist-path')
Out[6]: 0
In [7]: get_size('.')
Out[7]: 12038689
In [8]: def filesize(size: int) -> str:
...:     for unit in ("B", "K", "M", "G", "T"):
...:         if size < 1024:
...:             break
...:         size /= 1024
...:     return f"{size:.1f}{unit}"
...:


In [9]: filesize(get_size('.'))
Out[9]: '11.5M'


def recursive_dir_size(path):
    """Return the total size in bytes of everything below directory *path*.

    Every entry that is not a directory is measured with ``os.stat``;
    directories are summed recursively.
    """
    size = 0

    for x in os.listdir(path):
        full_path = os.path.join(path, x)
        if not os.path.isdir(full_path):
            size += os.stat(full_path).st_size
        else:
            size += recursive_dir_size(full_path)

    return size

我写了这个函数,它能给出一个目录准确的总大小。我尝试过其他使用os.walk的for循环解决方案,但不知道为什么最终结果总是小于实际大小(在ubuntu 18环境下)。我一定是哪里做错了,但无所谓,这个函数工作得很完美。

du默认情况下不跟随符号链接。这里没有一个答案使用了follow_symlinks=False。

下面是一个遵循du默认行为的实现:

def du(path) -> int:
    """Return the total size in bytes of the regular files below *path*.

    Symbolic links are never followed: a link is neither counted as a file
    nor descended into as a directory.
    """
    total = 0
    for entry in os.scandir(path):
        if entry.is_file(follow_symlinks=False):
            total += entry.stat().st_size
        elif entry.is_dir(follow_symlinks=False):
            total += du(entry.path)
    return total

测试:

class Test(unittest.TestCase):
    def test_du(self):
        # Build a tree with four 1M files plus a symlink to /tmp; the link
        # must not add to the total.
        root = '/tmp/du_test'
        subprocess.run(['rm', '-rf', root])
        test_utils.mkdir(root)
        test_utils.create_file(root, 'A', '1M')
        test_utils.create_file(root, 'B', '1M')
        sub = '/'.join([root, 'sub'])
        test_utils.mkdir(sub)
        test_utils.create_file(sub, 'C', '1M')
        test_utils.create_file(sub, 'D', '1M')
        subprocess.run(['ln', '-s', '/tmp', '/'.join([root, 'link']), ])
        self.assertEqual(4 << 20, util.du(root))
import os
def get_size(path=None):
    """Print and return the size in bytes of *path* (a directory or a file).

    For a directory, all non-symlink files below it are summed; for a
    regular file its own size is used; anything else counts as 0. When
    *path* is omitted, the working directory at call time is used (not the
    one that was current when the function was defined).
    """
    if path is None:
        path = os.getcwd()
    print("Calculating Size: ", path)
    total_size = 0
    # if path is directory--
    if os.path.isdir(path):
        print("Path type : Directory/Folder")
        for dirpath, dirnames, filenames in os.walk(path):
            for f in filenames:
                fp = os.path.join(dirpath, f)
                # skip if it is symbolic link
                if not os.path.islink(fp):
                    total_size += os.path.getsize(fp)
    # if path is a file---
    elif os.path.isfile(path):
        print("Path type : File")
        total_size = os.path.getsize(path)
    else:
        print("Path Type : Special File (Socket, FIFO, Device File)")
        total_size = 0
    bytesize = total_size
    print(bytesize, 'bytes')
    print(bytesize/(1024), 'kilobytes')
    print(bytesize/(1024*1024), 'megabytes')
    print(bytesize/(1024*1024*1024), 'gigabytes')
    return total_size




x=get_size("/content/examples")

我相信这很有帮助!文件夹和文件!