Convert to Lowercase
## CONVERT TO LOWERCASE
def to_lowercase(text):
    text = text.lower()
    return text
df['review'] = df['review'].apply(to_lowercase)
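A quick sanity check on a single string (the sample review is invented for illustration):

print(to_lowercase("This Movie Was GREAT!"))  # -> 'this movie was great!'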
Remove Numbers
## REMOVE NUMBERS
def remove_numbers(text):
    text = re.sub(r'\d+', '', text)
    return text
df['review'] = df['review'].apply(remove_numbers)
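Applied to a standalone string (invented sample), note that deleting digits outright can leave doubled spaces behind, which the whitespace step below cleans up:

print(remove_numbers("I watched it 3 times in 2023"))  # -> 'I watched it  times in '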
Remove Punctuation
## REMOVE PUNCTUATION
def remove_punctuations(text):
    return text.translate(str.maketrans('', '', string.punctuation))
df['review'] = df['review'].apply(remove_punctuations)
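Because str.maketrans('', '', string.punctuation) deletes each punctuation character rather than replacing it with a space, hyphenated words get fused; for example (invented sample):

print(remove_punctuations("A well-acted, over-the-top thriller!"))
# -> 'A wellacted overthetop thriller'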
Remove Special Characters
## REMOVE SPECIAL CHARACTERS
def remove_special_chars(text):
    return re.sub(r'[^a-zA-Z]', ' ', text)
df['review'] = df['review'].apply(remove_special_chars)
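Unlike the punctuation step, this regex substitutes a space for every non-letter, so digits and symbols never fuse neighboring words (invented sample):

print(remove_special_chars("3-star movie!!"))  # -> '  star movie  '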
Remove Unnecessary White Space
## REMOVE UNNECESSARY WHITE SPACE
def remove_whitespace(text):
    return " ".join(text.split())
df['review'] = df['review'].apply(remove_whitespace)
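str.split() with no arguments splits on any run of whitespace (spaces, tabs, newlines) and ignores leading/trailing whitespace, so rejoining on single spaces normalizes everything in one pass:

print(remove_whitespace("  too   many\t spaces \n here  "))  # -> 'too many spaces here'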
Remove Stopwords
## REMOVE STOPWORDS
# create a list of your own words to remove as well
my_stopwords = []
def remove_stopwords(text):
    new_list = []
    words = word_tokenize(text)
    # use a set for faster membership checks
    stop_words = set(stopwords.words('english') + my_stopwords)
    for word in words:
        if word not in stop_words:
            new_list.append(word)
    return ' '.join(new_list)
df['review'] = df['review'].apply(remove_stopwords)
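A standalone run of the filter (requires the NLTK punkt and stopwords resources; the sample sentence is invented, and the exact output depends on NLTK's stopword list):

import nltk
nltk.download('punkt')      # first run only
nltk.download('stopwords')  # first run only
print(remove_stopwords("this is one of the best movies i have ever seen"))
# -> something like 'one best movies ever seen'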
Correct Misspelled Words
## CORRECT MISSPELLED WORDS
# for dictionary path
# https://symspellpy.readthedocs.io/en/latest/examples/dictionary.html
# for parameters
# https://symspellpy.readthedocs.io/en/latest/examples/lookup.html#basic-usage
symsp = SymSpell()
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
symsp.load_dictionary(dictionary_path,
                      term_index=0,
                      count_index=1,
                      separator=' ')
def symspell_corrector(input_term):
    # look up suggestions for multi-word input strings
    suggestions = symsp.lookup_compound(
        phrase=input_term,
        max_edit_distance=2,
        transfer_casing=True,
        ignore_term_with_digits=True,
        ignore_non_words=True,
        split_by_space=True
    )
    # return the top-ranked correction
    return suggestions[0].term
df['review'] = df['review'].apply(symspell_corrector)
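To see the compound lookup on a single string (the misspelled phrase is adapted from the symspellpy docs; the exact output depends on the bundled frequency dictionary):

print(symspell_corrector("whereis th elove"))  # typically -> 'where is the love'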
Stemming
## STEMMING
# note: stems are often not real words, e.g. 'studies' -> 'studi'
stem_df = df.copy()
def perform_stemming(text):
    stemmer = PorterStemmer()
    new_list = []
    words = word_tokenize(text)
    for word in words:
        new_list.append(stemmer.stem(word))
    return " ".join(new_list)
stem_df['review'] = stem_df['review'].apply(perform_stemming)
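Running the Porter stemmer on a few standalone words shows why stems aren't always dictionary words:

stemmer = PorterStemmer()
for w in ['running', 'studies', 'connection']:
    print(w, '->', stemmer.stem(w))
# running -> run, studies -> studi, connection -> connect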
Lemmatization
## LEMMATIZATION
# usually preferred over stemming
# considers context (word part of speech)
# caring -> care
lem_df = df.copy()
# Part of speech tagger function
def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
# Instantiate lemmatizer
lemmatizer = WordNetLemmatizer()
def lemmatize_word(text):
    pos_tagged = nltk.pos_tag(nltk.word_tokenize(text))
    # map NLTK POS tags to WordNet POS tags
    wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tagged))
    lemmatized_review = []
    for word, tag in wordnet_tagged:
        if tag is None:
            # if there is no available tag, append the token as is
            lemmatized_review.append(word)
        else:
            # else use the tag to lemmatize the token
            lemmatized_review.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_review)
lem_df['review'] = lem_df['review'].apply(lemmatize_word)
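A quick illustration of why the POS tag matters (requires the NLTK averaged_perceptron_tagger and wordnet resources):

import nltk
nltk.download('averaged_perceptron_tagger')  # first run only
nltk.download('wordnet')                     # first run only
print(lemmatizer.lemmatize('caring', wordnet.VERB))  # -> 'care'
print(lemmatizer.lemmatize('caring'))                # -> 'caring' (defaults to noun)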
Below is the full code file containing all of the normalization techniques outlined above; it loads an example movie-review dataset (movie.csv) with a 'review' column to run them on.
# -*- coding: utf-8 -*-
"""
Created on Thu May 18 13:05:43 2023
@author: casey
"""
# Walks through basic text normalization techniques
## LOAD LIBRARIES
import pandas as pd
# NLP libraries
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from symspellpy import SymSpell
import pkg_resources
# one-time NLTK downloads (uncomment on first run)
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# ------------------------------------------------------------------------------------------------ #
## LOAD DATA
df = pd.read_csv('C:/Users/casey/OneDrive/Documents/Data_Science/NLP/Data/movie.csv')
# ------------------------------------------------------------------------------------------------ #
## CONVERT TO LOWERCASE
def to_lowercase(text):
    text = text.lower()
    return text
df['review'] = df['review'].apply(to_lowercase)
# ------------------------------------------------------------------------------------------------ #
## REMOVE NUMBERS
def remove_numbers(text):
    text = re.sub(r'\d+', '', text)
    return text
df['review'] = df['review'].apply(remove_numbers)
# ------------------------------------------------------------------------------------------------ #
## REMOVE PUNCTUATION
def remove_punctuations(text):
    return text.translate(str.maketrans('', '', string.punctuation))
df['review'] = df['review'].apply(remove_punctuations)
# ------------------------------------------------------------------------------------------------ #
## REMOVE SPECIAL CHARACTERS
def remove_special_chars(text):
    return re.sub(r'[^a-zA-Z]', ' ', text)
df['review'] = df['review'].apply(remove_special_chars)
# ------------------------------------------------------------------------------------------------ #
## REMOVE UNNECESSARY WHITE SPACE
def remove_whitespace(text):
    return " ".join(text.split())
df['review'] = df['review'].apply(remove_whitespace)
# ------------------------------------------------------------------------------------------------ #
## REMOVE STOPWORDS
# create a list of your own words to remove as well
# ('br' and 'b' are likely leftover HTML tags in the movie reviews)
my_stopwords = ['br', 'b']
def remove_stopwords(text):
    new_list = []
    words = word_tokenize(text)
    # use a set for faster membership checks
    stop_words = set(stopwords.words('english') + my_stopwords)
    for word in words:
        if word not in stop_words:
            new_list.append(word)
    return ' '.join(new_list)
df['review'] = df['review'].apply(remove_stopwords)
# ------------------------------------------------------------------------------------------------ #
## CORRECT MISSPELLED WORDS
# for dictionary path
# https://symspellpy.readthedocs.io/en/latest/examples/dictionary.html
# for parameters
# https://symspellpy.readthedocs.io/en/latest/examples/lookup.html#basic-usage
symsp = SymSpell()
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
symsp.load_dictionary(dictionary_path,
                      term_index=0,
                      count_index=1,
                      separator=' ')
def symspell_corrector(input_term):
    # look up suggestions for multi-word input strings
    suggestions = symsp.lookup_compound(
        phrase=input_term,
        max_edit_distance=2,
        transfer_casing=True,
        ignore_term_with_digits=True,
        ignore_non_words=True,
        split_by_space=True
    )
    # return the top-ranked correction
    return suggestions[0].term
df['review'] = df['review'].apply(symspell_corrector)
# ------------------------------------------------------------------------------------------------ #
## STEMMING
# note: stems are often not real words, e.g. 'studies' -> 'studi'
stem_df = df.copy()
def perform_stemming(text):
    stemmer = PorterStemmer()
    new_list = []
    words = word_tokenize(text)
    for word in words:
        new_list.append(stemmer.stem(word))
    return " ".join(new_list)
stem_df['review'] = stem_df['review'].apply(perform_stemming)
#stem_df.to_csv('C:/Users/casey/OneDrive/Documents/Data_Science/NLP/Data/stem_movie_normalized.csv',
#               index=False)
# ------------------------------------------------------------------------------------------------ #
## LEMMATIZATION
# usually preferred over stemming
# considers context (word part of speech)
# caring -> care
lem_df = df.copy()
# Part of speech tagger function
def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
# Instantiate lemmatizer
lemmatizer = WordNetLemmatizer()
def lemmatize_word(text):
    pos_tagged = nltk.pos_tag(nltk.word_tokenize(text))
    # map NLTK POS tags to WordNet POS tags
    wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tagged))
    lemmatized_review = []
    for word, tag in wordnet_tagged:
        if tag is None:
            # if there is no available tag, append the token as is
            lemmatized_review.append(word)
        else:
            # else use the tag to lemmatize the token
            lemmatized_review.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_review)
lem_df['review'] = lem_df['review'].apply(lemmatize_word)
lem_df.to_csv('C:/Users/casey/OneDrive/Documents/Data_Science/NLP/Data/lem_movie_normalized.csv',
              index=False)