>>> from nltk import word_tokenize
>>> from nltk.corpus import stopwords
>>> stop = set(stopwords.words('english'))
>>> sentence = "this is a foo bar sentence"
>>> print([i for i in sentence.lower().split() if i not in stop])
['foo', 'bar', 'sentence']
>>> [i for i in word_tokenize(sentence.lower()) if i not in stop]
['foo', 'bar', 'sentence']
import nltk
STOP_TYPES = ['DET', 'CNJ']
text = "some data here "
tokens = nltk.pos_tag(nltk.word_tokenize(text))
good_words = [w for w, wtype in tokens if wtype not in STOP_TYPES]
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}']) # remove it if you need punctuation
for doc in documents:
list_of_words = [i.lower() for i in wordpunct_tokenize(doc) if i.lower() not in stop_words]
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
words = tokenize(text)
wordsWOStopwords = removeStopWords(words)
def tokenize(text):
sents = sent_tokenize(text)
return [word_tokenize(sent) for sent in sents]
def removeStopWords(words):
customStopWords = set(stopwords.words('english')+list(punctuation))
return [word for word in words if word not in customStopWords]