获取人类可读版本的文件大小?

从字节大小返回人类可读大小的函数:

>>> human_readable(2048)
'2 kilobytes'
>>>

如何做到这一点?

144395 次浏览

其中一个库是hurry.filesize

>>> from hurry.filesize import size, alternative
>>> size(1, system=alternative)
'1 byte'
>>> size(10, system=alternative)
'10 bytes'
>>> size(1024, system=alternative)
'1 KB'

针对上面提到的“这么小的任务不值得引入一个库”的问题,这里给出一个简单的实现(使用 f-string,因此需要 Python 3.6+):

def sizeof_fmt(num, suffix="B"):
    """Render ``num`` with a binary (power-of-1024) prefix.

    The value is divided by 1024 until its magnitude drops below 1024 and
    is then printed with one decimal place, the matching IEC prefix and
    ``suffix``.  Anything still too large after the "Zi" step is reported
    in "Yi".
    """
    prefixes = ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi")
    position = 0
    while position < len(prefixes):
        unit = prefixes[position]
        if abs(num) < 1024.0:
            return f"{num:3.1f}{unit}{suffix}"
        num = num / 1024.0
        position += 1
    # Ran out of named prefixes: everything larger is expressed in Yi.
    return f"{num:.1f}Yi{suffix}"

支持:

  • 所有目前已知的二进制前缀
  • 负数和正数
  • 大于 1000 YiB(yobibyte)的数字
  • 任意单位(也许你喜欢用 gibibit 来计数!)

例子:

>>> sizeof_fmt(168963795964)
'157.4GiB'

Fred Cirera

def human_readable_data_quantity(quantity, multiple=1024):
    """Format ``quantity`` with a decimal or binary unit suffix.

    ``multiple`` selects the unit family: 1000 gives "KB", "MB", ...,
    1024 gives "KiB", "MiB", ...; any other value raises ``KeyError``.
    Plain bytes are printed as an integer, scaled values with one decimal.
    The largest unit ("Y...") absorbs everything that does not fit below it.
    """
    if quantity == 0:
        quantity = +0
    unit_tail = {1000: "B", 1024: "iB"}[multiple]
    labels = ["B"] + [prefix + unit_tail for prefix in "KMGTPEZY"]
    final = len(labels) - 1
    for position, label in enumerate(labels):
        # Keep scaling down while the value is too big and a larger unit exists.
        if not (quantity < multiple) and position != final:
            quantity /= multiple
            continue
        template = "%d%s" if position == 0 else "%.1f%s"
        return template % (quantity, label)

下面的代码段改写自作为 hurry.filesize() 替代方案提供的那个代码段,它会根据所用的前缀给出不同的精度位数。它不像某些片段那样简洁,但我喜欢这样的结果。

def human_size(size_bytes):
    """Format a size in bytes as a 'human' file size (bytes, KB, ... PB).

    Bytes and KB are reported as whole numbers; MB uses one decimal and
    GB and above use two, e.g. ``1 byte``, ``43 bytes``, ``443 KB``,
    ``4.3 MB``, ``4.43 GB``.

    Values of 1024 PB or more stay in PB instead of being divided one
    extra time, and negative sizes are scaled by magnitude.
    """
    if size_bytes == 1:
        # Avoid the unnecessary plural for exactly one byte.
        return "1 byte"

    suffixes_table = [('bytes', 0), ('KB', 0), ('MB', 1), ('GB', 2), ('TB', 2), ('PB', 2)]

    num = float(size_bytes)
    last = len(suffixes_table) - 1
    for index, (suffix, precision) in enumerate(suffixes_table):
        # Stop at the first unit that fits, or at the largest unit we know:
        # dividing again after 'PB' would mislabel the value by 1024x.
        if abs(num) < 1024.0 or index == last:
            break
        num /= 1024.0

    if precision == 0:
        formatted_size = "%d" % num
    else:
        formatted_size = str(round(num, ndigits=precision))

    return "%s %s" % (formatted_size, suffix)

这是我的版本。它不使用for循环。它具有常量复杂度O(1),理论上比这里使用for循环的答案更有效。

from math import log
# (unit label, number of decimals) indexed by the power of 1024.
# Materialised as a list: on Python 3 a bare zip() supports neither len()
# nor indexing, both of which sizeof_fmt needs.
unit_list = list(zip(['bytes', 'kB', 'MB', 'GB', 'TB', 'PB'], [0, 0, 1, 2, 2, 2]))


def sizeof_fmt(num):
    """Return ``num`` (a byte count) as a human friendly file size.

    The unit is found with a logarithm instead of a loop.  Zero and one are
    special-cased for their wording; negative and fractional inputs are
    formatted too instead of silently returning ``None``.
    """
    if num == 0:
        return '0 bytes'
    if num == 1:
        return '1 byte'
    magnitude = abs(num)
    exponent = 0
    if magnitude > 1:
        # Clamp so that anything beyond the last known unit stays in it.
        exponent = min(int(log(magnitude, 1024)), len(unit_list) - 1)
    quotient = float(num) / 1024**exponent
    unit, num_decimals = unit_list[exponent]
    return '{q:.{d}f} {u}'.format(q=quotient, d=num_decimals, u=unit)

为了更清楚地说明发生了什么,我们可以省略字符串格式化的代码。以下是真正起作用的几行代码:

# Truncated base-1024 logarithm of num: which power of 1024 to scale by.
exponent = int(log(num, 1024))
# num rescaled by that power of 1024.
quotient = num / 1024**exponent
# Entry of unit_list selected by the same exponent.
unit_list[exponent]

根据之前所有的答案,以下是我的看法。它是一个以字节为单位以整数形式存储文件大小的对象。但是当你尝试打印对象时,你会自动得到一个人类可读的版本。

class Filesize(object):
    """
    Container for a size in bytes with a human readable representation.

    Use it like this::

        >>> size = Filesize(123123123)
        >>> print(size)
        117.4 MB
    """

    # Step between consecutive units.
    chunk = 1024
    # Unit labels and the number of decimals shown for each, by power of chunk.
    units = ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB']
    precisions = [0, 0, 1, 2, 2, 2]

    def __init__(self, size):
        self.size = size

    def __int__(self):
        # __int__ must return a real int even if a float size was stored.
        return int(self.size)

    def __str__(self):
        """Return the size in the largest unit that keeps the value >= 1."""
        if self.size == 0:
            return '0 bytes'
        from math import log
        # Use the magnitude so negative sizes do not hit log()'s domain error,
        # and clamp so tiny fractions / huge values stay inside the table.
        exponent = int(log(abs(self.size), self.chunk))
        exponent = max(0, min(exponent, len(self.units) - 1))
        return self.format(self.units[exponent])

    def format(self, unit):
        """Return the size expressed in ``unit`` (one of ``self.units``).

        Raises ``ValueError`` (an ``Exception`` subclass) for unknown units.
        """
        if unit not in self.units:
            raise ValueError("Not a valid file size unit: %s" % unit)
        if self.size == 1 and unit == 'bytes':
            return '1 byte'
        exponent = self.units.index(unit)
        quotient = float(self.size) / self.chunk**exponent
        precision = self.precisions[exponent]
        return '{q:.{p}f} {u}'.format(q=quotient, p=precision, u=unit)

humanize是一个具有你正在寻找的所有功能的库。humanize.naturalsize()似乎能做你想要的一切。

示例代码(python 3.10)

import humanize


# Sample byte counts, from a single byte up to the zettabyte range.
disk_sizes_list = [
    1, 100, 999, 1000, 1024, 2000, 2048, 3000, 9999, 10000,
    2048000000, 9990000000, 9000000000000000000000,
]

# One row per sample: default rendering | binary=True rendering | raw value.
for size in disk_sizes_list:
    natural_size = humanize.naturalsize(size)
    binary_size = humanize.naturalsize(size, binary=True)
    print(f" {natural_size} \t| {binary_size}\t|{size}")

输出

 1 Byte     | 1 Byte    |1
100 Bytes  | 100 Bytes |100
999 Bytes  | 999 Bytes |999
1.0 kB     | 1000 Bytes    |1000
1.0 kB     | 1.0 KiB   |1024
2.0 kB     | 2.0 KiB   |2000
2.0 kB     | 2.0 KiB   |2048
3.0 kB     | 2.9 KiB   |3000
10.0 kB    | 9.8 KiB   |9999
10.0 kB    | 9.8 KiB   |10000
2.0 GB     | 1.9 GiB   |2048000000
10.0 GB    | 9.3 GiB   |9990000000
9.0 ZB     | 7.6 ZiB   |9000000000000000000000

我喜欢Senderle的十进制版本的固定精度,所以这里有一种与上面joctee的答案的混合(你知道你可以取非整数基底的对数吗?):

from math import log
def human_readable_bytes(x):
    """Format ``x`` bytes with a fixed number of significant digits.

    The magnitude is measured with a base-10.24 logarithm, so three steps
    of magnitude correspond to roughly one factor of 1024.  Values beyond
    the petabyte range are printed as a whole number of "P".
    """
    # hybrid of https://stackoverflow.com/a/10171475/2595465
    #      with https://stackoverflow.com/a/5414105/2595465
    if x == 0:
        return '0'
    # Clamp at 0 so tiny fractions cannot produce a negative list index below.
    magnitude = max(int(log(abs(x), 10.24)), 0)
    if magnitude > 16:
        format_str = '%iP'
        # 'P' is index 5 of the prefix list; the original left this unset,
        # which raised NameError for every value in this branch.
        illion = 5
    else:
        float_fmt = '%2.1f' if magnitude % 3 == 1 else '%1.2f'
        illion = (magnitude + 1) // 3
        format_str = float_fmt + ['', 'K', 'M', 'G', 'T', 'P'][illion]
    return (format_str % (x * 1.0 / (1024 ** illion))).lstrip('0')

使用1000的幂或kibibytes将更符合标准:

def sizeof_fmt(num, use_kibibyte=True):
    """Format ``num`` bytes using powers of 1024 (default) or of 1000.

    With ``use_kibibyte`` true the units are B, kiB, MiB, ...; otherwise
    B, kB, MB, ...  Values too large for the last unit ("P...") are still
    reported in that unit, correctly scaled.
    """
    base, suffix = (1024., 'iB') if use_kibibyte else (1000., 'B')
    # Built as a list: on Python 3 `['B'] + map(...)` raises TypeError.
    units = ['B'] + [prefix + suffix for prefix in 'kMGTP']
    for unit in units[:-1]:
        if -base < num < base:
            return "%3.1f %s" % (num, unit)
        num /= base
    # Only the last unit is left; do not divide again, or the value would
    # be 'base' times too small for the label it is printed with.
    return "%3.1f %s" % (num, units[-1])

附注:永远不要相信一个以K(大写)后缀打印数千的库。

如果你正在使用Django安装,你也可以尝试filesizeformat:

from django.template.defaultfilters import filesizeformat
# 1073741824 == 1024 ** 3 bytes.
filesizeformat(1073741824)


=>


"1.0 GB"

我最近提出了一个避免循环的版本,使用 log2 来确定大小的数量级,这个数量级同时兼作移位量和后缀列表的索引:

from math import log2


_suffixes = ['bytes', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']


def file_size(size):
    """Format ``size`` bytes with a binary unit, without looping.

    The binary order (number of 10-bit steps) is taken from ``log2`` and is
    used both as the shift amount and as the index into ``_suffixes``.
    Negative sizes are scaled by magnitude, fractions below one byte stay
    in bytes, and anything beyond the last suffix is reported in it.
    """
    magnitude = abs(size)
    # determine binary order in steps of size 10
    # (coerce to int, // still returns a float)
    order = int(log2(magnitude) / 10) if magnitude >= 1 else 0
    # never index past the largest known suffix
    order = min(order, len(_suffixes) - 1)
    # format file size
    # (.4g results in rounded numbers for exact matches and max 3 decimals)
    return '{:.4g} {}'.format(size / (1 << (order * 10)), _suffixes[order])

不过,就可读性而言,它很可能会被认为不够 Pythonic。

简单的两行字怎么样:

def humanizeFileSize(filesize):
    """Return ``filesize`` scaled to a binary unit with three decimals.

    The unit index is floor(log2(filesize) / 10).  No guard is applied, so
    zero and negative inputs raise from ``math.log``.
    """
    labels = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    power = int(math.floor(math.log(filesize, 2) / 10))
    scaled = filesize / math.pow(1024, power)
    return "%.3f%s" % (scaled, labels[power])

下面是它的工作原理:

  1. 计算log2(文件大小)
  2. 除以10得到最接近的单位。(例如,如果size是5000字节,最接近的单位是Kb,所以答案应该是X KiB)
  3. 返回file_size/value_of_closest_unit和unit。

然而,如果filesize为0或负(因为log对于0和-ve数字是未定义的),它就不起作用。你可以为他们添加额外的检查:

def humanizeFileSize(filesize):
    """Return the magnitude of ``filesize`` in a binary unit, two decimals.

    The sign is dropped (``abs``) and zero is special-cased because the
    logarithm is undefined there.  The unit index is clamped to the table,
    so sub-byte fractions stay in "Bytes" and values of 1024 YiB or more
    stay in "YiB" instead of indexing out of range.
    """
    units = ['Bytes', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    filesize = abs(filesize)
    if filesize == 0:
        return "0 Bytes"
    p = int(math.floor(math.log(filesize, 2) / 10))
    # Without the clamp p == -1 silently picks 'YiB' and p >= 9 raises.
    p = max(0, min(p, len(units) - 1))
    return "%0.2f %s" % (filesize / math.pow(1024, p), units[p])

例子:

>>> humanizeFileSize(538244835492574234)
'478.06 PiB'
>>> humanizeFileSize(-924372537)
'881.55 MiB'
>>> humanizeFileSize(0)
'0 Bytes'

请注意 - Kb和KiB之间有区别。KB表示1000字节,而KiB表示1024字节。KB、MB、GB都是1000的倍数,KiB、MiB、GiB等都是1024的倍数。更多信息请点击这里

这将在几乎任何情况下做你需要做的事情,是可选参数自定义的,并且正如你所看到的,是漂亮的自文档化:

from math import log
def pretty_size(n, pow=0, b=1024, u='B', pre=[''] + [p + 'i' for p in 'KMGTPEZY']):
    """Format ``n * b**pow`` units of ``u`` with a prefix taken from ``pre``.

    n   -- quantity, expressed in units of ``b**pow``
    pow -- power of ``b`` that ``n`` is already scaled by (0 = base units)
    b   -- step between consecutive prefixes
    u   -- unit label appended after the prefix
    pre -- prefixes in ascending order; the last one absorbs larger values

    The base unit is printed without decimals, every prefixed unit with one.
    """
    scaled = n * b**pow
    order = min(int(log(max(scaled, 1), b)), len(pre) - 1)
    # 0 % -1 == 0 and k % -(k + 1) == -1 for k >= 1, so this is 0 or 1 decimals.
    decimals = abs(order % (-order - 1))
    template = "%%.%if %%s%%s" % decimals
    return template % (scaled / b**float(order), pre[order], u)

示例输出:

>>> pretty_size(42)
'42 B'


>>> pretty_size(2015)
'2.0 KiB'


>>> pretty_size(987654321)
'941.9 MiB'


>>> pretty_size(9876543210)
'9.2 GiB'


>>> pretty_size(0.5,pow=1)
'512 B'


>>> pretty_size(0)
'0 B'

高级定制:

>>> pretty_size(987654321,b=1000,u='bytes',pre=['','kilo','mega','giga'])
'987.7 megabytes'


>>> pretty_size(9876543210,b=1000,u='bytes',pre=['','kilo','mega','giga'])
'9.9 gigabytes'

此代码同时兼容 Python 2 和 Python 3。使其符合 PEP 8 的工作留给读者作为练习。记住,漂亮的是输出。

更新:

如果你需要千位分隔逗号,只需应用下面这个显而易见的扩展:

def prettier_size(n, pow=0, b=1024, u='B', pre=[''] + [p + 'i' for p in 'KMGTPEZY']):
    """Format ``n * b**pow`` like a prefixed size, with thousands separators.

    Parameters have the same meaning as the prefix table approach above
    them suggests: ``b`` is the step between prefixes, ``u`` the unit label
    and ``pre`` the ascending prefix list whose last entry absorbs anything
    larger.  The base unit gets no decimals, prefixed units get one.
    """
    scaled = n * b**pow
    r = min(int(log(max(scaled, 1), b)), len(pre) - 1)
    # Evaluates to 0 for the base unit and 1 for every prefixed unit.
    decimals = abs(r % (-r - 1))
    template = '{:,.%if} %s%s' % (decimals, pre[r], u)
    return template.format(scaled / b**float(r))

例如:

>>> prettier_size(987654321098765432109876543210)
'816,968.5 YiB'

现代Django有自己的模板标签filesizeformat:

将值格式化为human-readable文件大小(即:“13 KB”,“4.1 MB”,“102字节”等)。

例如:

{{ value|filesizeformat }}

如果值是123456789,输出将是117.7 MB。

更多信息:https://docs.djangoproject.com/en/1.10/ref/templates/builtins/#filesizeformat

以下工作在Python 3.6+中,在我看来,是这里最容易理解的答案,并允许您自定义使用的小数位数。

def human_readable_size(size, decimal_places=2):
    """Format ``size`` bytes in binary units from B up to PiB.

    ``decimal_places`` sets how many digits follow the decimal point.
    Values too large for PiB are still reported in PiB.
    """
    units = ('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB')
    position = 0
    # Scale down while the value does not fit and a larger unit remains.
    while not (size < 1024.0) and units[position] != 'PiB':
        size /= 1024.0
        position += 1
    unit = units[position]
    return f"{size:.{decimal_places}f} {unit}"

总有一个这样的人。今天轮到我了。这是一行代码——如果算上函数签名的话是两行。

def human_size(bytes, units=[' bytes','KB','MB','GB','TB', 'PB', 'EB']):
    """Return a human readable string for an integer byte count.

    The count is shifted right by 10 bits per unit step, so the result is
    always a whole (truncated) number.  Running out of ``units`` raises
    ``IndexError``.
    """
    step = 0
    while not (bytes < 1024):
        bytes = bytes >> 10
        step += 1
    return str(bytes) + units[step]

>>> human_size(123)
123 bytes
>>> human_size(123456789)
117MB

如果你需要大于1艾字节的大小,那就有点麻烦了:

def human_size(bytes, units=[' bytes','KB','MB','GB','TB', 'PB', 'EB']):
    """Return a human readable string for an integer byte count.

    Works like the shift-by-10 variant, but once the last entry of
    ``units`` is exhausted the remaining value is shifted once more and
    labelled "ZB" instead of failing.
    """
    step = 0
    while not (bytes < 1024):
        if step + 1 >= len(units):
            # No larger named unit is left: fall back to zettabytes.
            return f'{bytes>>10}ZB'
        bytes = bytes >> 10
        step += 1
    return str(bytes) + units[step]

你应该用humanize。

>>> humanize.naturalsize(1000000)
'1.0 MB'
>>> humanize.naturalsize(1000000, binary=True)
'976.6 KiB'
>>> humanize.naturalsize(1000000, gnu=True)
'976.6K'

参考:

https://pypi.org/project/humanize/

HumanFriendly项目帮助用这个

import humanfriendly
# Ask humanfriendly to format a size of 1024.
humanfriendly.format_size(1024)
上面的代码将给出 1KB 作为答案。可以在这里找到例子。

参考Sridhar Ratnakumar的回答,更新为:

def formatSize(sizeInBytes, decimalNum=1, isUnitWithI=False, sizeUnitSeperator=""):
    """Format a byte count as a human readable string.

    sizeInBytes       -- the size to format
    decimalNum        -- number of digits after the decimal point
    isUnitWithI       -- use "Ki", "Mi", ... prefixes instead of "K", "M", ...
    sizeUnitSeperator -- text placed between the number and the unit

    The value is divided by 1024 per step; anything beyond the "Z" prefix
    is reported with "Y".
    """
    # K=kilo, M=mega, G=giga, T=tera, P=peta, E=exa, Z=zetta, Y=yotta
    # https://en.wikipedia.org/wiki/Binary_prefix#Specific_units_of_IEC_60027-2_A.2_and_ISO.2FIEC_80000
    prefixes = ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']
    overflowPrefix = 'Y'

    if isUnitWithI:
        # Every prefix except the empty one for plain bytes gets an 'i'.
        prefixes = [name + 'i' if index >= 1 else name
                    for index, name in enumerate(prefixes)]
        overflowPrefix += 'i'

    # e.g. "%.1f%s%s" for the defaults
    template = "%" + "." + str(decimalNum) + "f" + sizeUnitSeperator + "%s%s"

    remaining = sizeInBytes
    for prefix in prefixes:
        if abs(remaining) < 1024.0:
            return template % (remaining, prefix, "B")
        remaining /= 1024.0
    return template % (remaining, overflowPrefix, "B")

示例输出如下:

def _showFormatted(size, **options):
    """Print ``size`` next to the string formatSize produces for it."""
    print("%s -> %s" % (size, formatSize(size, **options)))


def testKb():
    """Demo: kilobyte-range value with default options."""
    _showFormatted(3746)


def testI():
    """Demo: prefixes written with an 'i'."""
    _showFormatted(87533, isUnitWithI=True)


def testSeparator():
    """Demo: a space between number and unit."""
    _showFormatted(98654, sizeUnitSeperator=" ")


def testBytes():
    """Demo: value below 1024, reported in plain bytes."""
    _showFormatted(352)


def testMb():
    """Demo: megabyte-range value with two decimals."""
    _showFormatted(76383285, decimalNum=2)


def testTb():
    """Demo: terabyte-range value with two decimals."""
    _showFormatted(763832854988542, decimalNum=2)


def testPb():
    """Demo: petabyte-range value with four decimals."""
    _showFormatted(763832854988542665, decimalNum=4)


def demoFormatSize():
    """Run every formatSize demo in order."""
    for demo in (testKb, testI, testSeparator, testBytes, testMb, testTb, testPb):
        demo()


# 3746 -> 3.7KB
# 87533 -> 85.5KiB
# 98654 -> 96.3 KB
# 352 -> 352.0B
# 76383285 -> 72.84MB
# 763832854988542 -> 694.70TB
# 763832854988542665 -> 678.4199PB

您将在下面发现的决不是已经发布的解决方案中性能最好或最短的解决方案。相反,它专注于一个特别的问题,许多其他答案错过了。

即当输入类似999_995时的情况:

Python 3.6.1 ...
...
>>> value = 999_995
>>> base = 1000
>>> math.log(value, base)
1.999999276174054

哪个,被截断为最近的整数,并应用回输入给出

>>> order = int(math.log(value, base))
>>> value/base**order
999.995

这似乎正是我们所期望的,直到我们被要求控制输出精度。这就是事情开始变得有点困难的时候。

将精度设置为2位,我们得到:

>>> round(value/base**order, 2)
1000 # K

而不是1M

我们该如何应对呢?

当然,我们可以显式地检查它:

# If rounding pushes the scaled value up to a full `base`, move to the next unit.
reached_next_unit = round(value/base**order, 2) == base
if reached_next_unit:
    order += 1

但我们能做得更好吗?在我们做最后一步之前,我们可以知道order应该被切割的方式吗?

事实证明我们可以。

假设0.5十进制舍入规则,上面的if条件转换为:

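$$\frac{\mathrm{value}}{\mathrm{base}^{\mathrm{order}}} \ge \mathrm{base} - \frac{0.5}{10^{\mathrm{precision}}} \quad\Longleftrightarrow\quad \log_{\mathrm{base}}(\mathrm{value}) - \mathrm{order} \ge \log_{\mathrm{base}}\!\left(\mathrm{base} - \frac{0.5}{10^{\mathrm{precision}}}\right)$$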

导致

def abbreviate(value, base=1000, precision=2, suffixes=None):
    """Abbreviate ``value`` with a magnitude suffix, rounding-aware.

    value     -- number to abbreviate (sign is preserved)
    base      -- step between consecutive suffixes
    precision -- number of decimals the scaled value is rounded to
    suffixes  -- ascending suffix list; defaults to '', K, M, B, T

    The order of magnitude is bumped up front when rounding to
    ``precision`` decimals would otherwise print a full ``base`` in the
    smaller unit (for example 999_999 becomes '1M', not '1,000K').  Values
    smaller than one base unit stay in the first suffix.
    """
    if suffixes is None:
        suffixes = ['', 'K', 'M', 'B', 'T']

    if value == 0:
        return f'{0}{suffixes[0]}'

    order_max = len(suffixes) - 1
    order = log(abs(value), base)
    # True (counts as 1) when the fractional part of the order is so close
    # to the next integer that rounding would reach `base`.
    order_corr = order - int(order) >= log(base - 0.5/10**precision, base)
    # Clamp on both sides: a negative order (|value| < 1/base) would index
    # the suffix list from the end and label tiny values with the largest unit.
    order = max(0, min(int(order) + order_corr, order_max))

    factored = round(value/base**order, precision)

    return f'{factored:,g}{suffixes[order]}'

>>> abbreviate(999_994)
'999.99K'
>>> abbreviate(999_995)
'1M'
>>> abbreviate(999_995, precision=3)
'999.995K'
>>> abbreviate(2042, base=1024)
'1.99K'
>>> abbreviate(2043, base=1024)
'2K'

这个解决方案可能也会吸引你,这取决于你的思维方式:

from pathlib import Path


def get_size(path = Path('.')):
    """Return the size in bytes of a file, or the total size of a directory.

    ``path`` may be a ``Path`` or a string.  For a directory every regular
    file below it is counted, including files without an extension and
    files in subdirectories.

    Raises ``FileNotFoundError`` when ``path`` is neither a file nor a
    directory (previously this surfaced as an unbound-variable error).
    """
    path = Path(path)
    if path.is_file():
        return path.stat().st_size
    if path.is_dir():
        # rglob('*') walks the whole tree; glob('*.*') only saw top-level
        # names containing a dot and also counted matching directories.
        return sum(entry.stat().st_size
                   for entry in path.rglob('*') if entry.is_file())
    raise FileNotFoundError(f"No such file or directory: {path}")


def format_size(path, unit="MB"):
    """Return the size of ``path`` as a string in the requested ``unit``.

    The byte count from ``get_size`` is divided by ``2 ** shift`` for the
    unit and printed as a whole number with thousands separators.  The
    lower-case units use a shift three bits smaller than their upper-case
    counterparts, i.e. values eight times larger.  An unknown ``unit``
    raises ``KeyError``.
    """
    total_bytes = get_size(path)
    shift_by_unit = {
        "B": 0,
        "kb": 7,
        "KB": 10,
        "mb": 17,
        "MB": 20,
        "gb": 27,
        "GB": 30,
        "TB": 40,
    }
    divisor = float(1 << shift_by_unit[unit])
    amount = "{:,.0f}".format(total_bytes / divisor)
    return amount + " " + unit


# Tests and test results
>>> format_size("d:\\media\\bags of fun.avi")
'38 MB'
>>> format_size("d:\\media\\bags of fun.avi","KB")
'38,763 KB'
>>> format_size("d:\\media\\bags of fun.avi","kb")
'310,104 kb'

这个功能在 boltons 中可用(boltons.strutils.bytes2human),对于大多数项目来说这是一个非常方便的库。

>>> bytes2human(128991)
'126K'
>>> bytes2human(100001221)
'95M'
>>> bytes2human(0, 2)
'0.00B'

下面是一个使用while的选项:

def number_format(n):
    """Format ``n`` with three decimals and an SI prefix (k, M or G).

    The value is divided by 1000 while it is at least 1000 and a larger
    prefix exists; values of 1e12 and above therefore stay in 'G' instead
    of running past the end of the prefix table.
    """
    suffixes = ('', ' k', ' M', ' G')
    scaled, index = n, 0
    while scaled >= 1e3 and index < len(suffixes) - 1:
        scaled /= 1e3
        index += 1
    return '%.3f' % scaled + suffixes[index]


s = number_format(9012345678)
print(s == '9.012 G')

https://docs.python.org/reference/compound_stmts.html#while

为了以人类可读的形式获取文件大小,我创建了这个函数:

import os


def get_size(path):
    """Return the size of the file at ``path`` as a human readable string.

    Sizes below 1024 are reported in bytes; larger sizes are divided by the
    matching power of 1024 and rounded to two decimals in KB, MB, GB or TB.
    TB is the largest unit, so very large files are still reported instead
    of falling off the end of the chain and returning ``None``.
    """
    size = os.path.getsize(path)
    if size < 1024:
        return f"{size} bytes"
    units = ("KB", "MB", "GB", "TB")
    for exponent, unit in enumerate(units[:-1], start=1):
        if size < pow(1024, exponent + 1):
            return f"{round(size / pow(1024, exponent), 2)} {unit}"
    return f"{round(size / pow(1024, len(units)), 2)} {units[-1]}"
>>> get_size("a.txt")
'1.4 KB'

这是我为不同的问题写的东西…

这与 xApple 的回答非常相似:该对象总是以人类可读的格式打印。不同之处在于它同时也是一个真正的 int,所以你可以用它做数学运算!它把格式说明符直接传给数字格式化,然后再附加后缀,因此几乎可以肯定,输出会比请求的长度多出两到三个字符。我从来没有实际用到过这段代码,所以也就没有费心去修复这一点!

class ByteSize(int):
    """An ``int`` byte count that prints itself in human readable form.

    Instances behave like integers (``+``, ``-`` and ``*`` return a new
    ``ByteSize``) and expose the value in several units as attributes:
    ``bytes``/``B``, ``kilobytes``/``KB``, ``megabytes``/``MB``,
    ``gigabytes``/``GB`` and ``petabytes``/``PB``.  ``readable`` holds the
    ``(suffix, value)`` pair used by ``str()`` and ``format()``.
    """

    _KB = 1024
    _suffixes = 'B', 'KB', 'MB', 'GB', 'PB'

    def __init__(self, *args, **kwargs):
        self.bytes = self.B = int(self)
        self.kilobytes = self.KB = self / self._KB**1
        self.megabytes = self.MB = self / self._KB**2
        self.gigabytes = self.GB = self / self._KB**3
        # NOTE(review): 1024**4 is really the tera step; the attribute and
        # label are kept as "PB" so existing callers see unchanged values.
        self.petabytes = self.PB = self / self._KB**4
        *suffixes, last = self._suffixes
        # Smallest unit whose value has a magnitude below 1024.  The former
        # strict `1 < value` test sent 0, 1 and exact powers of 1024 to the
        # largest unit.
        suffix = next((
            candidate
            for candidate in suffixes
            if abs(getattr(self, candidate)) < self._KB
        ), last)
        self.readable = suffix, getattr(self, suffix)

        super().__init__()

    def __str__(self):
        return self.__format__('.2f')

    def __repr__(self):
        return '{}({})'.format(self.__class__.__name__, super().__repr__())

    def __format__(self, format_spec):
        # The spec is applied to the number only; the suffix is appended after.
        suffix, val = self.readable
        return '{val:{fmt}} {suf}'.format(val=val, fmt=format_spec, suf=suffix)

    def _wrap(self, result):
        """Re-wrap an int result, passing NotImplemented through untouched."""
        if result is NotImplemented:
            return result
        return self.__class__(result)

    def __sub__(self, other):
        return self._wrap(super().__sub__(other))

    def __add__(self, other):
        return self._wrap(super().__add__(other))

    def __mul__(self, other):
        return self._wrap(super().__mul__(other))

    def __rsub__(self, other):
        # Reflected subtraction is other - self, not self - other.
        return self._wrap(super().__rsub__(other))

    def __radd__(self, other):
        return self._wrap(super().__radd__(other))

    def __rmul__(self, other):
        return self._wrap(super().__rmul__(other))

用法:

>>> size = ByteSize(6239397620)
>>> print(size)
5.81 GB
>>> size.GB
5.810891855508089
>>> size.gigabytes
5.810891855508089
>>> size.PB
0.005674699077644618
>>> size.MB
5950.353260040283
>>> size
ByteSize(6239397620)