如何在 Python 中将文件转换为 utf-8?

我需要在 Python 中将大量文件转换为 utf-8,而且在“转换文件”部分遇到了麻烦。

我想做的相当于:

iconv -t utf-8 $file > converted/$file # this is shell code

谢谢!

151967 次浏览

You can use the codecs module, like this:

import codecs
# Number of characters requested per read; tune to trade memory for speed.
BLOCKSIZE = 1048576  # or some other, desired size in bytes

# Stream the source file into the target in fixed-size chunks so the whole
# file never has to be held in memory at once.
with codecs.open(sourceFileName, "r", "your-source-encoding") as sourceFile, \
        codecs.open(targetFileName, "w", "utf-8") as targetFile:
    contents = sourceFile.read(BLOCKSIZE)
    while contents:
        targetFile.write(contents)
        contents = sourceFile.read(BLOCKSIZE)

EDIT: added BLOCKSIZE parameter to control file chunk size.

This worked for me in a small test:

sourceEncoding = "iso-8859-1"
targetEncoding = "utf-8"

# Re-encode the file "source" into the file "target".
# Both files are opened in binary mode so the only transformation applied is
# the explicit decode/encode pair; the `with` statement closes both handles
# even if decoding fails. bytes.decode()/str.encode() replace the Python 2-only
# unicode() builtin, so this runs on Python 2.7 and Python 3 alike.
with open("source", "rb") as source, open("target", "wb") as target:
    target.write(source.read().decode(sourceEncoding).encode(targetEncoding))

Thanks for the replies, it works!

And since the source files are in mixed formats, I added a list of source formats to be tried in sequence (sourceFormats), and on UnicodeDecodeError I try the next format:

from __future__ import with_statement


import os
import sys
import codecs
from chardet.universaldetector import UniversalDetector


# Encoding used by writeConversion() when it opens the output file.
targetFormat = 'utf-8'
# Directory prefix that writeConversion() prepends to every output file name.
outputDir = 'converted'
# Shared chardet detector; get_encoding_type() resets it before each file.
detector = UniversalDetector()


def get_encoding_type(current_file):
    """Return the encoding the shared chardet detector reports for a file.

    The file is fed to the module-level ``detector`` line by line and reading
    stops as soon as the detector signals that it is done.

    Args:
        current_file: Path of the file to inspect.

    Returns:
        The value stored under ``'encoding'`` in the detector's result.
    """
    detector.reset()
    # Read raw bytes (the detector works on undecoded data) and let `with`
    # close the handle; the former `file()` builtin exists only in Python 2
    # and left the handle open.
    with open(current_file, 'rb') as raw_file:
        for line in raw_file:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result['encoding']


def convertFileBestGuess(filename):
    """Convert a file by trying a fixed list of source encodings in order.

    Each candidate encoding is used to open ``filename``; the first one that
    decodes the whole file without a ``UnicodeDecodeError`` wins.

    Args:
        filename: Path of the file to convert.

    Returns:
        None. Prints ``Done.`` on success, or an error line if every
        candidate encoding failed.
    """
    sourceFormats = ['ascii', 'iso-8859-1']
    for encoding in sourceFormats:
        try:
            # Plain 'r': codecs.open() always reads in binary mode, and the
            # old 'U' flag is rejected by Python 3.11+.
            with codecs.open(filename, 'r', encoding) as sourceFile:
                writeConversion(sourceFile)
                print('Done.')
                return
        except UnicodeDecodeError:
            # Wrong guess: fall through to the next candidate encoding.
            continue

    print("Error: failed to convert '" + filename + "'.")


def convertFileWithDetection(fileName):
    """Convert a file using the encoding reported by get_encoding_type().

    Args:
        fileName: Path of the file to convert.

    Returns:
        None. Prints ``Done.`` on success, or an error line when no usable
        encoding was detected or the file could not be decoded with it.
    """
    print("Converting '" + fileName + "'...")
    encoding = get_encoding_type(fileName)
    # Detection may yield no encoding at all; passing None to codecs.open()
    # would skip decoding entirely, so treat that case as a failure.
    if encoding is not None:
        try:
            # Plain 'r': codecs.open() always reads in binary mode, and the
            # old 'U' flag is rejected by Python 3.11+.
            with codecs.open(fileName, 'r', encoding) as sourceFile:
                writeConversion(sourceFile)
                print('Done.')
                return
        except (UnicodeDecodeError, LookupError):
            # LookupError: the detected encoding name is unknown to Python.
            pass

    print("Error: failed to convert '" + fileName + "'.")




def writeConversion(file, fileName=None):
    """Copy an already-decoded source file into the output directory.

    The output is written to ``outputDir + '/' + fileName`` using the
    ``targetFormat`` encoding.

    Args:
        file: Open, decoding file object to read lines from.
        fileName: Name of the output file relative to ``outputDir``. When
            omitted, the name of the underlying source file (``file.name``)
            is used, so the function no longer depends on a global variable.
    """
    if fileName is None:
        fileName = file.name
    with codecs.open(outputDir + '/' + fileName, 'w', targetFormat) as targetFile:
        for line in file:
            targetFile.write(line)


# Off topic: get the file list and call convertFile on each file
# ...

(EDIT by Rudro Badhon: this incorporates the original try multiple formats until you don't get an exception as well as an alternate approach that uses chardet.universaldetector)

To guess the source encoding, you can use the *nix `file` command.

Example:

$ file --mime jumper.xml


jumper.xml: application/xml; charset=utf-8

This is a Python 3 function that converts any text file to UTF-8 encoding, using only the standard library.

def correctSubtitleEncoding(filename, newFilename, encoding_from, encoding_to='UTF-8'):
    """Re-encode a text file and terminate every line with CRLF.

    Args:
        filename: Path of the file to read.
        newFilename: Path of the file to write.
        encoding_from: Encoding used to decode ``filename``.
        encoding_to: Encoding used to write ``newFilename``.

    Raises:
        UnicodeDecodeError: If ``filename`` is not valid ``encoding_from``.
    """
    with open(filename, 'r', encoding=encoding_from) as fr:
        # newline='' writes the explicit '\r\n' verbatim instead of letting
        # the platform translate the '\n' in it a second time.
        with open(newFilename, 'w', encoding=encoding_to, newline='') as fw:
            for line in fr:
                # Strip only a real line terminator: the final line may have
                # none, and slicing blindly would eat its last character.
                if line.endswith('\n'):
                    line = line[:-1]
                fw.write(line + '\r\n')

You can use it easily in a loop to convert a list of files.

This is my brute force method. It also takes care of mingled \n and \r\n in the input.

    # open the CSV file
# NOTE(review): this excerpt starts mid-method -- the enclosing `def` and the
# `try:` paired with the first `except` below are not visible here.
# Re-encoding pass: read the input as raw bytes, decode each line as UTF-8
# with the 'replace' error handler, and write it to outputfilelocation
# terminated by a single '\n'.
# NOTE(review): inputfile is not closed anywhere in the visible code.
inputfile = open(filelocation, 'rb')
outputfile = open(outputfilelocation, 'w', encoding='utf-8')
for line in inputfile:
# Drop a two-byte line terminator first, then a one-byte one; a line with
# no terminator is decoded whole.
if line[-2:] == b'\r\n' or line[-2:] == b'\n\r':
output = line[:-2].decode('utf-8', 'replace') + '\n'
elif line[-1:] == b'\r' or line[-1:] == b'\n':
output = line[:-1].decode('utf-8', 'replace') + '\n'
else:
output = line.decode('utf-8', 'replace') + '\n'
outputfile.write(output)
outputfile.close()
except BaseException as error:
# Any failure above is logged as Error(18), flagged on the instance, and an
# empty list is returned to the caller.
cfg.log(self.outf, "Error(18): opening CSV-file " + filelocation + " failed: " + str(error))
self.loadedwitherrors = 1
return ([])
try:
# open the CSV-file of this source table
# The reader is built on the re-encoded copy (outputfilelocation), not on the
# original input file.
# NOTE(review): the 'rU' open mode is rejected by Python 3.11+ -- confirm the
# interpreter version this runs on.
csvreader = csv.reader(open(outputfilelocation, "rU"), delimiter=delimitervalue, quoting=quotevalue, dialect=csv.excel_tab)
except BaseException as error:
# Failures while creating the reader are logged as Error(19).
cfg.log(self.outf, "Error(19): reading CSV-file " + filelocation + " failed: " + str(error))

Answer for unknown source encoding type

based on @Sébastien RoccaSerra

python3.6

import os
from chardet import detect


# get file encoding type
def get_encoding_type(file):
    """Return the encoding name that chardet's detect() reports for a file.

    The whole file is read into memory as bytes before detection.
    """
    with open(file, 'rb') as handle:
        detection = detect(handle.read())
    return detection['encoding']


from_codec = get_encoding_type(srcfile)


# Convert srcfile to UTF-8 in place, going through the temporary trgfile.
try:
    # newline='' on both sides keeps the original line endings untouched.
    with open(srcfile, 'r', encoding=from_codec, newline='') as f, \
            open(trgfile, 'w', encoding='utf-8', newline='') as e:
        text = f.read()  # for small files, for big use chunks
        e.write(text)
except UnicodeDecodeError:
    print('Decode Error')
    os.remove(trgfile)  # do not leave a partial conversion behind
except UnicodeEncodeError:
    print('Encode Error')
    os.remove(trgfile)  # do not leave a partial conversion behind
else:
    # One-step swap of the converted file over the original: the source is
    # never deleted before its replacement is in place.
    os.replace(trgfile, srcfile)

You can use this one liner (assuming you want to convert from utf16 to utf8)

    python -c "from pathlib import Path; path = Path('yourfile.txt') ; path.write_text(path.read_text(encoding='utf16'), encoding='utf8')"

Where yourfile.txt is a path to your $file.

For this to work you need python 3.4 or newer (probably nowadays you do).

Below is a more readable version of the code above:

from pathlib import Path
# Read the file as UTF-16, then overwrite the same file with UTF-8 text.
path = Path("yourfile.txt")
text = path.read_text(encoding="utf16")
path.write_text(text, encoding="utf8")

Convert all files in a directory to UTF-8 encoding. It is recursive and can filter files by suffix. Thanks @Sole Sensei.

# pip install -i https://pypi.tuna.tsinghua.edu.cn/simple chardet
import os
import re
from chardet import detect




def get_file_list(d, suffixes=('.c', '.cpp', '.h', '.txt'),
                  exclude_dirs=('venv', 'cmake-build-debug')):
    """Recursively collect the files under a directory that match a suffix.

    Args:
        d: Root directory to walk.
        suffixes: File-name endings to keep; a single string is accepted too.
        exclude_dirs: Directory names that are never descended into.

    Returns:
        A list of paths, each built by joining the walked directory with the
        matching file name.
    """
    if isinstance(suffixes, str):
        suffixes = (suffixes,)
    wanted = tuple(suffixes)
    skipped = set(exclude_dirs)

    result = []
    for root, dirs, files in os.walk(d):
        # Prune in place so os.walk() never enters the excluded directories.
        dirs[:] = [name for name in dirs if name not in skipped]
        result.extend(
            os.path.join(root, filename)
            for filename in files
            if filename.endswith(wanted)
        )
    return result




# get file encoding type
def get_encoding_type(file):
    """Detect a file's encoding by passing its raw bytes to chardet's detect().

    Returns the value stored under the 'encoding' key of the detection result.
    """
    with open(file, 'rb') as stream:
        guess = detect(stream.read())
    return guess['encoding']




if __name__ == "__main__":
    # Convert every matching file below the current directory to UTF-8,
    # in place, going through a temporary '<name>.swp' file.
    file_list = get_file_list('.')
    for src_file in file_list:
        print(src_file)
        trg_file = src_file + '.swp'
        from_codec = get_encoding_type(src_file)
        try:
            # newline='' on both sides keeps the original line endings.
            with open(src_file, 'r', encoding=from_codec, newline='') as f, \
                    open(trg_file, 'w', encoding='utf-8', newline='') as e:
                text = f.read()
                e.write(text)
        except UnicodeDecodeError:
            print('Decode Error')
            os.remove(trg_file)  # do not leave a partial '.swp' behind
        except UnicodeEncodeError:
            print('Encode Error')
            os.remove(trg_file)  # do not leave a partial '.swp' behind
        else:
            # One-step swap: the original is never deleted before its
            # replacement is in place.
            os.replace(trg_file, src_file)