NLP Normalization

Convert to Lowercase

## CONVERT TO LOWERCASE
def to_lowercase(text):
    text = text.lower()
    return text

df['review'] = df['review'].apply(to_lowercase)
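On a quick made-up string, the effect looks like this:

print(to_lowercase("This Movie Was GREAT!"))  # 'this movie was great!'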

Remove Numbers

## REMOVE NUMBERS
def remove_numbers(text):
    text = re.sub(r'\d+', '', text)
    return text
 
df['review'] = df['review'].apply(remove_numbers)
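The pattern r'\d+' matches any run of digits, so numbers disappear wherever they occur (the surrounding spaces are left behind and get cleaned up by the whitespace step below). For example:

print(remove_numbers("10 out of 10, would watch in 2023"))  # ' out of , would watch in '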

Remove Punctuation

## REMOVE PUNCTUATION
def remove_punctuations(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df['review'] = df['review'].apply(remove_punctuations)
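str.maketrans('', '', string.punctuation) builds a translation table that deletes all 32 ASCII punctuation characters; note it does not touch non-ASCII punctuation such as curly quotes. For example:

print(remove_punctuations("well... it wasn't bad!"))  # 'well it wasnt bad'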

Remove Special Characters

## REMOVE SPECIAL CHARACTERS
def remove_special_chars(text):
    return re.sub('[^a-zA-Z]', ' ', text)

df['review'] = df['review'].apply(remove_special_chars)
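Since [^a-zA-Z] replaces every non-letter (digits and punctuation included) with a space, this step subsumes the two before it; they are shown separately so each technique can be used on its own. For example:

print(remove_special_chars("5 stars & a thumbs-up"))  # '  stars   a thumbs up'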

Remove Unnecessary White Space

## REMOVE UNNECESSARY WHITE SPACE
def remove_whitespace(text):
    return  " ".join(text.split())
 
df['review'] = df['review'].apply(remove_whitespace)
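Calling split() with no arguments splits on any run of whitespace (spaces, tabs, newlines) and drops the empty strings, so rejoining with single spaces also trims both ends. For example:

print(remove_whitespace("  too   many \t spaces \n here "))  # 'too many spaces here'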

Remove Stopwords

## REMOVE STOPWORDS

# create list of your own words to also remove
my_stopwords = []

stop_words = set(stopwords.words('english') + my_stopwords)  # build the set once, not per row

def remove_stopwords(text):
    new_list = []
    for word in word_tokenize(text):
        if word not in stop_words:
            new_list.append(word)
    return ' '.join(new_list)

df['review'] = df['review'].apply(remove_stopwords)
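NLTK's English stopword list is all lowercase, so this step relies on the lowercasing done earlier. On a made-up review fragment the result is roughly the following (the exact stopword list varies a little between NLTK versions):

print(remove_stopwords("this movie was not as good as the book"))  # 'movie good book'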

Correct Misspelled Words

## CORRECT MISSPELLED WORDS
# for dictionary path
# https://symspellpy.readthedocs.io/en/latest/examples/dictionary.html
# for parameters
# https://symspellpy.readthedocs.io/en/latest/examples/lookup.html#basic-usage

symsp = SymSpell()
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
symsp.load_dictionary(dictionary_path,
                      term_index=0, 
                      count_index=1, 
                      separator=' ')

def symspell_corrector(input_term):
    # look up suggestions for the multi-word input string
    suggestions = symsp.lookup_compound(
        phrase=input_term,
        max_edit_distance=2,
        transfer_casing=True,
        ignore_term_with_digits=True,
        ignore_non_words=True,
        split_by_space=True
    )
    # lookup_compound ranks suggestions best-first; keep the top correction
    return suggestions[0].term

df['review'] = df['review'].apply(symspell_corrector)
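lookup_compound returns a list of SuggestItem objects ranked best-first, so suggestions[0].term is the top correction of the whole phrase. On a shortened version of the sample phrase from the SymSpell docs, the output is approximately as follows (exact results depend on the loaded dictionary and parameters):

print(symspell_corrector("whereis th elove"))  # roughly: 'where is the love'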

Stemming

## STEMMING
# e.g. studies -> studi (stems are not always real words)

stem_df = df.copy()

stemmer = PorterStemmer()  # instantiate once rather than on every row

def perform_stemming(text):
    new_list = []
    for word in word_tokenize(text):
        new_list.append(stemmer.stem(word))
    return " ".join(new_list)

stem_df['review'] = stem_df['review'].apply(perform_stemming)
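Because stemming only strips suffixes by rule, the stems are not always dictionary words:

print(stemmer.stem("running"))  # 'run'
print(stemmer.stem("studies"))  # 'studi'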

Lemmatization

## LEMMATIZATION
# usually preferred over stemming
# considers context (word part of speech)
# caring -> care

lem_df = df.copy()

# Part of speech tagger function
def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:         
        return None

# Instantiate lemmatizer 
lemmatizer = WordNetLemmatizer()

def lemmatize_word(text):
    pos_tagged = nltk.pos_tag(nltk.word_tokenize(text)) 
    wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tagged))
    
    lemmatized_review = []
    for word, tag in wordnet_tagged:
        if tag is None:
            # if there is no available tag, append the token as is
            lemmatized_review.append(word)
        else:       
            # else use the tag to lemmatize the token
            lemmatized_review.append(lemmatizer.lemmatize(word, tag))
    lemmatized_review = " ".join(lemmatized_review)
    return lemmatized_review
 
lem_df['review'] = lem_df['review'].apply(lemmatize_word)
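Unlike stems, lemmas are real dictionary words, and the POS tag changes the result, which is why the tagging step above matters:

print(lemmatizer.lemmatize("caring"))                # 'caring' (default POS is noun)
print(lemmatizer.lemmatize("caring", wordnet.VERB))  # 'care'
print(lemmatizer.lemmatize("mice", wordnet.NOUN))    # 'mouse'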

Below is the full code file containing all of the normalization techniques outlined above. It runs them against an example movie-review dataset (movie.csv, with the text in a 'review' column); update the file paths to match your own setup.

# -*- coding: utf-8 -*-
"""
Created on Thu May 18 13:05:43 2023

@author: casey
"""

# Walks through basic text normalization techniques

## LOAD LIBRARIES
import pandas as pd

# Import nlp libraries
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from symspellpy import SymSpell
import pkg_resources

# NLTK data used below; download each resource once if you don't already have it
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')


# ------------------------------------------------------------------------------------------------ #
## LOAD DATA
df = pd.read_csv('C:/Users/casey/OneDrive/Documents/Data_Science/NLP/Data/movie.csv')  # update to your local path

# ------------------------------------------------------------------------------------------------ #
## CONVERT TO LOWERCASE
def to_lowercase(text):
    text = text.lower()
    return text

df['review'] = df['review'].apply(to_lowercase)

# ------------------------------------------------------------------------------------------------ #
## REMOVE NUMBERS
def remove_numbers(text):
    text = re.sub(r'\d+', '', text)
    return text
 
df['review'] = df['review'].apply(remove_numbers)

# ------------------------------------------------------------------------------------------------ #
## REMOVE PUNCTUATION
def remove_punctuations(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df['review'] = df['review'].apply(remove_punctuations)

# ------------------------------------------------------------------------------------------------ #
## REMOVE SPECIAL CHARACTERS
def remove_special_chars(text):
    return re.sub('[^a-zA-Z]', ' ', text)

df['review'] = df['review'].apply(remove_special_chars)

# ------------------------------------------------------------------------------------------------ #
## REMOVE UNNECESSARY WHITE SPACE
def remove_whitespace(text):
    return  " ".join(text.split())
 
df['review'] = df['review'].apply(remove_whitespace)

# ------------------------------------------------------------------------------------------------ #
## REMOVE STOPWORDS

# create a list of your own words to also remove
# ('br' and 'b' here are most likely leftover fragments of HTML tags such as <br /> in the reviews)
my_stopwords = ['br', 'b']

stop_words = set(stopwords.words('english') + my_stopwords)  # build the set once, not per row

def remove_stopwords(text):
    new_list = []
    for word in word_tokenize(text):
        if word not in stop_words:
            new_list.append(word)
    return ' '.join(new_list)

df['review'] = df['review'].apply(remove_stopwords)

# ------------------------------------------------------------------------------------------------ #
## CORRECT MISSPELLED WORDS
# for dictionary path
# https://symspellpy.readthedocs.io/en/latest/examples/dictionary.html
# for parameters
# https://symspellpy.readthedocs.io/en/latest/examples/lookup.html#basic-usage

symsp = SymSpell()
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
symsp.load_dictionary(dictionary_path,
                      term_index=0, 
                      count_index=1, 
                      separator=' ')

def symspell_corrector(input_term):
    # look up suggestions for the multi-word input string
    suggestions = symsp.lookup_compound(
        phrase=input_term,
        max_edit_distance=2,
        transfer_casing=True,
        ignore_term_with_digits=True,
        ignore_non_words=True,
        split_by_space=True
    )
    # lookup_compound ranks suggestions best-first; keep the top correction
    return suggestions[0].term


df['review'] = df['review'].apply(symspell_corrector)
    
# ------------------------------------------------------------------------------------------------ #
## STEMMING
# e.g. studies -> studi (stems are not always real words)

stem_df = df.copy()

stemmer = PorterStemmer()  # instantiate once rather than on every row

def perform_stemming(text):
    new_list = []
    for word in word_tokenize(text):
        new_list.append(stemmer.stem(word))
    return " ".join(new_list)

stem_df['review'] = stem_df['review'].apply(perform_stemming)

# optionally save the stemmed data
# stem_df.to_csv('C:/Users/casey/OneDrive/Documents/Data_Science/NLP/Data/stem_movie_normalized.csv',
#                index=False)

# ------------------------------------------------------------------------------------------------ #
## LEMMATIZATION
# usually preferred over stemming
# considers context (word part of speech)
# caring -> care

lem_df = df.copy()

# Part of speech tagger function
def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:         
        return None

# Instantiate lemmatizer 
lemmatizer = WordNetLemmatizer()

def lemmatize_word(text):
    pos_tagged = nltk.pos_tag(nltk.word_tokenize(text)) 
    wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tagged))
    
    lemmatized_review = []
    for word, tag in wordnet_tagged:
        if tag is None:
            # if there is no available tag, append the token as is
            lemmatized_review.append(word)
        else:       
            # else use the tag to lemmatize the token
            lemmatized_review.append(lemmatizer.lemmatize(word, tag))
    lemmatized_review = " ".join(lemmatized_review)
    return lemmatized_review
 
lem_df['review'] = lem_df['review'].apply(lemmatize_word)

lem_df.to_csv('C:/Users/casey/OneDrive/Documents/Data_Science/NLP/Data/lem_movie_normalized.csv',
              index=False)