>>> s = "string. With. Punctuation?" # Sample string>>> import string>>> for c in string.punctuation:... s= s.replace(c,"")...>>> s'string With Punctuation'
import re, string, timeit
# Shared fixtures for the benchmark functions below.
s = "string. With. Punctuation"
# Set of punctuation characters, used for membership tests.
exclude = set(string.punctuation)
# Identity translation table (Python 2 ``string.maketrans``); the characters
# to delete are passed to ``translate`` separately.
table = string.maketrans("", "")
# Character class matching any single punctuation character; re.escape keeps
# characters such as ']' , '^' and '-' literal inside the class.
regex = re.compile('[%s]' % re.escape(string.punctuation))
def test_set(s):
    """Return *s* without the characters contained in the global ``exclude``."""
    kept = [ch for ch in s if ch not in exclude]
    return ''.join(kept)
def test_re(s):  # From Vinko's solution, with fix.
    """Return *s* with every match of the global ``regex`` pattern removed."""
    return regex.sub('', s)
def test_trans(s):
    """Return *s* with punctuation deleted by ``translate``.

    Uses the two-argument ``translate(table, deletechars)`` call, i.e. the
    Python 2 byte-string form, together with the global ``table``.
    """
    deletechars = string.punctuation
    return s.translate(table, deletechars)
def test_repl(s): # From S.Lott's solutionfor c in string.punctuation:s=s.replace(c,"")return s
# Time one million calls of each approach on the sample string. The timeit
# setup statement imports from __main__, so this only works when the file is
# run as a script.
for label, func_name in (
    ("sets :", "test_set"),
    ("regex :", "test_re"),
    ("translate :", "test_trans"),
    ("replace :", "test_repl"),
):
    timer = timeit.Timer('f(s)', 'from __main__ import s,%s as f' % func_name)
    # One pre-formatted argument prints "label value" identically under the
    # Python 2 print statement and the Python 3 print function.
    print("%s %s" % (label, timer.timeit(1000000)))
# -*- coding: utf-8 -*-
from unicodedata import category

s = u'String — with - «punctation »...'
# Every Unicode general category starting with 'P' is a punctuation class,
# so this also drops non-ASCII punctuation such as dashes and guillemets.
s = ''.join(ch for ch in s if category(ch)[0] != 'P')
# A single pre-formatted argument prints the same text under the Python 2
# print statement and the Python 3 print function.
print('stripped %s' % s)
您也可以泛化和剥离其他类型的字符:
# Keep only characters whose Unicode general category is neither a symbol
# class ('S*') nor a punctuation class ('P*').
''.join(ch for ch in s if not category(ch).startswith(('S', 'P')))
def stripPunc(wordList):
    """Strips punctuation from list of words.

    Args:
        wordList: list of strings.

    Returns:
        A new list in which every character listed in ``puncList`` has been
        removed from each word. The list passed in is not modified.
    """
    puncList = [".", ";", ":", "!", "?", "/", "\\", ",", "#", "@", "$", "&",
                ")", "(", "\""]
    # One pass over the words per punctuation character is enough: the
    # replacement is idempotent, so repeating it once per word (as an extra
    # loop over wordList would) only multiplies the work.
    for punc in puncList:
        wordList = [word.replace(punc, '') for word in wordList]
    return wordList
import string
s = "string. With. Punctuation?"
# Python 2 API: an identity table from string.maketrans, with the characters
# to delete passed as the second argument of translate().
table = string.maketrans("", "")
new_s = s.translate(table, string.punctuation)  # Output: string without punctuation
python3
import string
s = "string. With. Punctuation?"
# Map every punctuation character to None so that translate() deletes it.
table = str.maketrans(dict.fromkeys(string.punctuation))  # OR {key: None for key in string.punctuation}
new_s = s.translate(table)  # Output: string without punctuation
import string
input_text = "!where??and!!or$$then:)"
# Python 2 API: translation table mapping each punctuation character to a space.
punctuation_replacer = string.maketrans(string.punctuation, ' ' * len(string.punctuation))
# split() + join collapses the resulting runs of spaces into single spaces and
# already yields no leading/trailing whitespace, so no extra strip() is needed.
print(' '.join(input_text.translate(punctuation_replacer).split()))
Output>> where and or then
# FIRST METHOD
# Storing all punctuations in a variable
punctuation = '!?,.:;"\')(_-'
word = raw_input("Enter string: ")
# Build the result with a single join instead of repeated concatenation.
newstring = ''.join(i for i in word if i not in punctuation)
# Format into one string: with two comma-separated values in parentheses the
# Python 2 print statement (this snippet uses raw_input) would print a tuple.
print("The string without punctuation is %s" % newstring)
# SECOND METHOD
word = raw_input("Enter string: ")
punctuation = '!?,.:;"\')(_-'
# Python 2 byte-string form: a None table plus the characters to delete.
newstring = word.translate(None, punctuation)
# Format into one string: with two comma-separated values in parentheses the
# Python 2 print statement would print a tuple.
print("The string without punctuation is %s" % newstring)
# Output for both methods
# Enter string: hello! welcome -to_python(programming.language)??,
# The string without punctuation is: hello welcome topythonprogramminglanguage
print('====THIS IS HOW TO REMOVE STOP WORS====')

# Read the whole input file; the file is closed as soon as the text is loaded.
with open('one.txt', 'r') as myFile:
    str1 = myFile.read()

# Words to drop. Matching is case-sensitive, hence the capitalised variants.
stop_words = (
    "not", "is", "it", "By", "between", "This", "By", "A", "when", "And",
    "up", "Then", "was", "by", "It", "If", "can", "an", "he", "This", "or",
    "And", "a", "i", "it", "am", "at", "on", "in", "of", "to", "is", "so",
    "too", "my", "the", "and", "but", "are", "very", "here", "even", "from",
    "them", "then", "than", "this", "that", "though", "be", "But", "these",
)

# Split on single spaces only, so newlines and tabs stay attached to words.
myList = str1.split(" ")

# Print every remaining word, each preceded by a separator line.
for i in myList:
    if i not in stop_words:
        print("____________")
        print(i, end='\n')
# Translation table that maps every punctuation code point to None, which
# makes str.translate() delete those characters.
table = str.maketrans(dict.fromkeys(string.punctuation))
vs
# Three-argument form of str.maketrans: no character-to-character mappings;
# the third argument lists the characters that translate() should delete.
table = str.maketrans('', '', string.punctuation)
另外,我添加了另一种使用set的方法,我利用交集函数来减少迭代次数。
这是完整的代码:
import re, string, timeit
# Sample input; the timeit setup strings below import it from __main__.
s = "string. With. Punctuation"
def test_set(s):exclude = set(string.punctuation)return ''.join(ch for ch in s if ch not in exclude)
def test_set2(s):_punctuation = set(string.punctuation)for punct in set(s).intersection(_punctuation):s = s.replace(punct, ' ')return ' '.join(s.split())
def test_re(s): # From Vinko's solution, with fix.regex = re.compile('[%s]' % re.escape(string.punctuation))return regex.sub('', s)
def test_trans(s):table = str.maketrans({key: None for key in string.punctuation})return s.translate(table)
def test_trans2(s):table = str.maketrans('', '', string.punctuation)return(s.translate(table))
def test_repl(s): # From S.Lott's solutionfor c in string.punctuation:s=s.replace(c,"")return s
# Time one million calls of each approach on the sample string. The timeit
# setup statement imports from __main__, so this only works when the file is
# run as a script.
for label, func_name in (
    ("sets :", "test_set"),
    ("sets2 :", "test_set2"),
    ("regex :", "test_re"),
    ("translate :", "test_trans"),
    ("translate2 :", "test_trans2"),
    ("replace :", "test_repl"),
):
    timer = timeit.Timer('f(s)', 'from __main__ import s,%s as f' % func_name)
    print(label, timer.timeit(1000000))
import re
# Despite its name, this pattern matches runs of word characters, i.e. the
# text between the punctuation marks.
punct = re.compile(r'(\w+)')

sentence = 'This ! is : a # sample $ sentence.'  # Text with punctuation
# Keep only the word tokens and rejoin them with single spaces.
tokenized = [m.group() for m in punct.finditer(sentence)]
sentence = ' '.join(tokenized)
print(sentence)
# Output: This is a sample sentence
# using lambda
''.join(filter(lambda c: c not in string.punctuation, s))
# using list comprehension (strictly, a generator expression that yields an
# empty string in place of each punctuation character)
''.join('' if c in string.punctuation else c for c in s)
#!/usr/bin/env python3
"""Determination of most efficient way to remove punctuation in Python 3.
Results in Python 3.8.10 on my system using the default arguments:
set       : 51.897
regex     : 17.901
translate :  2.059
replace   : 13.209
"""
import argparse
import re
import string
import timeit

# Command-line options: the text file to strip and the number of timing runs.
parser = argparse.ArgumentParser()
# The default input is the source file of the argparse module itself.
parser.add_argument("--filename", "-f", default=argparse.__file__)
parser.add_argument("--iterations", "-i", type=int, default=10000)
opts = parser.parse_args()

# Load the whole input text once; every benchmark function works on it.
with open(opts.filename) as fp:
    s = fp.read()

# Helpers built once up front so the timed functions only measure the
# stripping itself.
exclude = set(string.punctuation)
table = str.maketrans("", "", string.punctuation)
regex = re.compile(f"[{re.escape(string.punctuation)}]")
def test_set(s):
    """Return *s* without the characters contained in the global ``exclude``."""
    kept = [ch for ch in s if ch not in exclude]
    return "".join(kept)
def test_regex(s):  # From Vinko's solution, with fix.
    """Return *s* with every match of the global ``regex`` pattern removed."""
    return regex.sub("", s)
def test_translate(s):
    """Return *s* translated through the global ``table``."""
    stripped = s.translate(table)
    return stripped
def test_replace(s): # From S.Lott's solutionfor c in string.punctuation:s = s.replace(c, "")return s
# NOTE: this rebinds ``opts`` from the parsed command-line arguments to the
# keyword arguments passed to timeit.timeit().
opts = dict(globals=globals(), number=opts.iterations)
solutions = "set", "regex", "translate", "replace"
# Time test_<solution>(s) for each approach and print an aligned result line.
for solution in solutions:
    elapsed = timeit.timeit(f"test_{solution}(s)", **opts)
    print(f"{solution:<10}: {elapsed:6.3f}")
# NOTE(review): ``regex`` here is presumably the third-party ``regex`` package
# (``import regex``), not a compiled ``re`` pattern — confirm. ``text`` is the
# string to clean.
regex.sub(r'[\p{P}\p{S}]', '', text)  # to remove one by one
regex.sub(r'[\p{P}\p{S}]+', '', text)  # to remove all consecutive punctuation/symbols with one go
regex.sub(r'[[:punct:]]+', '', text)  # Same with a POSIX character class