# Build the stopword set once: calling stopwords.words('english') inside the
# loop reloads the list for every word, and list.remove() rescans the copy.
english_stopwords = set(stopwords.words('english'))
# Keep every word (in its original order) that is not an English stopword.
filtered_word_list = [word for word in word_list if word not in english_stopwords]
from nltk.corpus import stopwords
def remove_stopwords(word_list):
    """Return the lower-cased words of *word_list* that are not English stopwords.

    Args:
        word_list: Iterable of word strings.

    Returns:
        A new list containing each word lower-cased, in the original order,
        with every word found in NLTK's English stopword list left out.
    """
    # Load the stopword list once per call and use a set, instead of
    # reloading and linearly scanning it for every word.
    english_stopwords = set(stopwords.words("english"))
    # Lower-case first, in case the words are not all lower-cased already.
    lowered_words = (word.lower() for word in word_list)
    return [word for word in lowered_words if word not in english_stopwords]
from stop_words import get_stop_words
from nltk.corpus import stopwords
# Merge the English stopwords of the stop_words package and of NLTK.
# (The original notes quoted about 900 and about 150 entries; not verified here.)
stop_words = list(get_stop_words('en'))
nltk_words = list(stopwords.words('english'))
stop_words.extend(nltk_words)
# The merged list contains duplicates and is slow to search, so membership is
# tested against a set; stop_words itself stays a list for existing users.
stop_word_set = set(stop_words)
output = [w for w in word_list if w not in stop_word_set]
import textcleaner as tc
# Build a textcleaner document; <file_name> is a placeholder to be replaced.
data = tc.document(<file_name>)
# A list of sentences can also be passed to the document class constructor.
data.remove_stpwrds() # inplace is False by default (per the original note; confirm in the textcleaner docs)
Arabic
Bulgarian
Catalan
Czech
Danish
Dutch
English
Finnish
French
German
Hungarian
Indonesian
Italian
Norwegian
Polish
Portuguese
Romanian
Russian
Spanish
Swedish
Turkish
Ukrainian
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
example_sent = "This is a sample sentence, showing off the stop words filtration."

# A set gives constant-time membership tests while filtering.
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example_sent)

# Compare lower-cased tokens: the stopword list is lower-case, so a
# case-sensitive test would let capitalised stopwords such as "This" through.
# The tokens themselves keep their original casing. (The original computed
# this list twice, once as a comprehension and once as a loop.)
filtered_sentence = [w for w in word_tokens if w.lower() not in stop_words]

print(word_tokens)
print(filtered_sentence)
from nltk.tokenize import word_tokenize
# Select the 'text' column of twitter_df (defined elsewhere) for processing.
tweetText = twitter_df['text']
然后使用以下方法进行分词:
from nltk.tokenize import word_tokenize
# Apply word_tokenize to every entry, replacing each one with its tokenization.
tweetText = tweetText.apply(word_tokenize)
然后,删除停用词:
import nltk  # needed for nltk.download below; the original never imported it
from nltk.corpus import stopwords

# Fetch the stopwords corpus so that stopwords.words('english') can load it.
nltk.download('stopwords')
# A set gives constant-time membership tests inside the lambda.
stop_words = set(stopwords.words('english'))
# Each entry is a list of tokens; keep only the tokens that are not stopwords.
tweetText = tweetText.apply(lambda x: [word for word in x if word not in stop_words])
tweetText.head()
! pip install textfeatures
import textfeatures as tf
import pandas as pd
例如,假设您有以下一组字符串:
# Three sample sentences used to demonstrate stop-word extraction.
texts = [
    "blue car and blue window",
    "black crow in the window",
    "i see my reflection in the window",
]
# One-column dataframe; the column is named 'text' at construction time.
df = pd.DataFrame({'text': texts})
df
现在,调用 stopwords() 函数并传入所需的参数:
# NOTE(review): the next line reads df["stopwords"], so this call presumably
# adds that column to df in place; confirm against the textfeatures docs.
tf.stopwords(df,"text","stopwords") # extract the stop words of the "text" column
df[["text","stopwords"]].head() # show the first rows of the text and stopwords columns
结果将是:
text stopwords
0 blue car and blue window [and]
1 black crow in the window [in, the]
2 i see my reflection in the window [i, my, in, the]