Goal: Data Visualization Capstone Dashboard

Steps: Preprocessing (Cleansing, Change Data Types, Feature Engineering, Data Aggregation) → Tooltip Setup → Visualization → Interactive Visualization → Web Dashboard Development

Preprocessing / Data Wrangling Stages: - ✅ Select columns → select() - ✅ Filter rows → filter() - ✅ Change data types → mutate() - ✅ Add new columns → mutate() - ✅ Aggregate data → group_by() + summarise() + ungroup() - ✅ Sort data → arrange()

Introduction

# Import Library
library(dplyr)
library(ggplot2) # For visualization
library(ggpubr)
library(scales) # For number formatting (adding commas, etc.)
library(glue)
library(plotly) 
library(lubridate) # For working with date and time
options(scipen = 100) # Prevent scientific notation in outputs

Load Data

We will re-explore the Pakistan Largest Ecommerce Dataset using the dplyr package!

Let’s start by reading the dataset:

# Read Data
data <- read.csv("Pakistan Largest Ecommerce Dataset.csv", stringsAsFactors = TRUE, encoding = "latin1")

# Data Cleansing
data_clean <- data %>% 
  select(
    # Delete Unneeded column
    -c(increment_id, sales_commission_code, Working.Date, BI.Status,MV, Year, Month, Customer.Since, M.Y, FY, X, X.1, X.2, X.3, X.4)) %>% 
  
  mutate(
    # Change Data Types
    created_at= dmy(created_at),
    sku= as.character(sku),
    
    # Add new columns
    month= month(created_at, label = FALSE, abbr = TRUE),
    year= year(created_at)
  ) %>% 
  
  # Rename columns
  rename(
    c(product_name = sku, category_name = category_name_1)) %>% 

  # Filter data
  filter(
    # only take data in the year 2017
    year(created_at) %in% c(2017),
    category_name != "\\N",
    grepl("_", product_name) 
  )

data_clean
# Check Missing Data
data_clean %>% is.na() %>% colSums()
#>         item_id          status      created_at    product_name           price 
#>               0               0               0               0               0 
#>     qty_ordered     grand_total   category_name discount_amount  payment_method 
#>               0               0               0               0               0 
#>     Customer.ID           month            year 
#>               0               0               0

Capstone’s Objective: - The accuracy in selecting appropriate plots to compare and represent the data effectively.

Page 1 (Overview)

PLOT 1: (line plot)

“Monthly Sales Trend in Pakistan for 2017”

# Data Wrangling
sales_trend <- data_clean %>% 
  group_by(month) %>% 
  summarise(total_sales = sum(grand_total)) %>% 
  ungroup() %>% 
  arrange(month)
sales_trend
sales_trend <- sales_trend %>%
  mutate(
    month_name = factor(month, levels = 1:12, labels = month.abb),
    full_month_name = factor(month, levels = 1:12, labels = month.name)
  )
sales_trend
# Tooltip
sales_trend <- sales_trend %>% 
  mutate(
    label = glue(
      "Total Sales: {number(total_sales,  big.mark = '.', decimal.mark =',', accuracy = 1)}
      Month: {full_month_name}"
    )
  )
sales_trend
# Visualization 
#"Monthly Sales Trend in Pakistan for 2017"
plot1 <- ggplot(sales_trend, aes(x=month_name, y= total_sales))+
  
  geom_line(col="darkgreen", group=1) +
  geom_point(aes(text=label), col="black") +
  scale_y_continuous(labels = label_number(big.mark = ".", decimal.mark = ","), 
                     breaks = seq(0, 100000000, 10000000)) +
  labs(
    title = "Monthly Sales Trend",
    x = NULL,
    y = "Total Sales"
  ) +
  theme_minimal() +
  theme(
     plot.title = element_text(hjust = 0.5)
  )

ggplotly(plot1, tooltip = "text")

Summary: This shows the Ecommerce sales growth throughout the year.

Total Sales

data_clean
Total_Sales<- data_clean %>% summarise(Total_sales=number(sum(grand_total),  big.mark = '.', decimal.mark =',', accuracy = 1))

Total_Sales

Total Orders

Total_Orders<- data_clean %>% summarise(Total_orders=n())

Total_Orders

Total Products

Total_product<- data_clean %>% summarise(total_product=number(sum(qty_ordered),  big.mark = '.', decimal.mark =',', accuracy = 1))

Total_product

Total Customer

Total_customer<- data_clean %>% 
  summarise(total_cust=number(length(unique(Customer.ID)),  big.mark = '.', decimal.mark =',', accuracy = 1))

Total_customer

PLOT 2: (lolipop plot)

“Total Orders by Payment Method in Pakistan”

# Data Wrangling 
fav_payment<- data_clean %>% 
  group_by(payment_method) %>% 
  summarise(Payment_type = n()) %>% 
  ungroup() %>% 
  arrange(-Payment_type) 

fav_payment
fav_payment <- fav_payment %>% 
  mutate(
    label = glue(
      "Payment Method: {payment_method}
      Total Orders: {number(Payment_type,  big.mark = '.', decimal.mark =',', accuracy = 1)}"
    )
  )

fav_payment
# Visualization ~> Opt. 1: Lollipop Plot
#"Fav Payment Method"
plot2 <- ggplot(fav_payment, aes(x = Payment_type, 
                    y = reorder(payment_method, Payment_type),
                    text = label)) +
  
  geom_segment(aes(x= 0, xend=Payment_type, yend=reorder(payment_method, Payment_type)), color="darkgreen", size= 1) +
  geom_point(color="black", size=3) +
  scale_x_continuous(labels = comma) +
  labs(title = "Top Payment Method",
       x = "Total Orders",
       y = NULL) +
  theme_minimal() +
  theme(
     plot.title = element_text(hjust = 0.5)
  )

ggplotly(plot2, tooltip = "text")
# Visualization ~> Opt. 2: Bar Plot
#"Fav Payment Method"
plot2 <- ggplot(fav_payment, aes(x = Payment_type, 
                    y = reorder(payment_method, Payment_type),
                    text = label)) +
  
  geom_col(aes(fill=Payment_type)) +
  scale_x_continuous(labels = comma)+
  scale_fill_gradient(low="black", high="darkgreen") +
  labs(title = "Top Payment Method in Pakistan 2017",
       x = "Total Orders",
       y = NULL) +
  theme_minimal()+
  theme(legend.position = "none")

ggplotly(plot2, tooltip = "text")

Summary: This identifies the most popular payment methods in the country. In the end, I decided to use a lollipop plot because, in a bar plot, it was difficult to view the tooltip for the lowest-ranked categories (the bars were too thin due to the small quantities).

Page 2 (Sales Analysis)

PLOT 3: (line plot)

“Monthly total sales in each category”

data_clean
# Data Wrangling
category_sales_trend <- data_clean %>%
  filter(category_name =="Beauty & Grooming") %>% 
  group_by(month) %>% 
  summarise(total_sales_category = sum(grand_total)) %>% 
  ungroup() %>% 
  arrange(month)

category_sales_trend
category_sales_trend <- category_sales_trend %>%
  mutate(
    month_name = factor(month, levels = 1:12, labels = month.abb),
    full_month_name = factor(month, levels = 1:12, labels = month.name)
  )

category_sales_trend
# Tooltip
category_sales_trend <- category_sales_trend %>% 
  mutate(
    label = glue(
      "Total Sales: {number(total_sales_category,  big.mark = '.', decimal.mark =',', accuracy = 1)}
      Month: {full_month_name}"
    )
  )

category_sales_trend
# Visualization ~> Opt. 1: Line Plot
# "Monthly total sales in each category"
plot3 <- ggplot(category_sales_trend, aes(x=month_name, y= total_sales_category))+
  
  geom_line(col="darkgreen", group=1) +
  geom_point(aes(text=label), col="black") +
  scale_y_continuous(labels = label_number(big.mark = '.', decimal.mark =','), 
                     breaks = seq(0, 100000000, 200000)) +
  labs(
    title = "Monthly Sales for Beauty & Grooming",
    x = NULL,
    y = "Total Sales"
  ) +
  theme_minimal() +
  theme(
     plot.title = element_text(hjust = 0.5)
  )

ggplotly(plot3, tooltip = "text")
# Visualization ~> Opt. 2: Lollipop Plot
# "Monthly total sales in each category"
plot3 <- ggplot(category_sales_trend, aes(x=month_name, y= total_sales_category, text=label)) +
  
  geom_segment(aes(xend=month_name, y=0,yend=total_sales_category), color="darkgreen", size= 1) +
  geom_point(color="black", size=3) +
  scale_y_continuous(labels = label_number(,  big.mark = '.', decimal.mark =','),
                     breaks = seq(0, 100000000, 200000)) +
  labs(title = "Monthly Sales for Beauty & Grooming",
       x = NULL,
       y = "Total Sales") +
  theme_minimal() +
  theme(
     plot.title = element_text(hjust = 0.5)
  )

ggplotly(plot3, tooltip = "text")

Summary: This gives a view of which month was has the most sales in Pakistan. In the end, I chose to use a line plot because the data is presented on a monthly basis, making the line plot more suitable for showing trends over time.

PLOT 4: (bar plot)

“Top 10 Best Selling Product in each category”

data_clean
# Data Wrangling
fav_product <- data_clean %>% 
  filter(category_name =="Mobiles & Tablets") %>% 
  group_by(product_name) %>% 
  summarise(total_qty = sum(qty_ordered), total_sold= sum(grand_total)) %>% 
  ungroup() %>% 
  arrange(-total_qty) %>% 
  head(10)

fav_product
process_product_name <- function(product_name) {
  parts <- str_split(product_name, " - ") %>% unlist()  # Divided by ' - '
  if (length(parts) > 2) {
    # if more than 2 words, print first and last word 
    return(paste(parts[1], parts[length(parts)], sep = " - "))
  } else {
    # if less than 2 words, do nothing 
    return(product_name)
  }
}
library(stringr)
fav_product <- fav_product %>%
  mutate(short_name = sapply(product_name, process_product_name)) %>%
  mutate(
    label = glue(
      "Product : {product_name}
      Total Sales : {number(total_sold, big.mark='.', decimal.mark=',', accuracy = 1)}
      Quantity : {number(as.numeric(total_qty), big.mark='.', decimal.mark=',', accuracy = 1)}"
    )
  )

fav_product
# Visualization 
# "Top 10 Best Selling Product in each category"
plot4 <- ggplot(fav_product, aes(x = total_sold,
                    y = reorder(short_name, total_sold),
                    text = label)) +
  
  geom_col(aes(fill = total_sold)) +
  scale_fill_gradient(low="black", high="darkgreen") +
  labs(title = "Top Mobiles & Tablets Products",
       x = "Total Sales",
       y = NULL) +
  scale_x_continuous(labels = label_number(big.mark = '.', decimal.mark =',')) +
  theme_minimal() +
  theme(legend.position = "none",  plot.title = element_text(hjust = 0.5))

ggplotly(plot4, tooltip = "text")

Summary: This shows the most popular product by the number of items sold & contributed to the highest sales in Pakistan.

Page 3 (Customer Preferance)

PLOT 5: (bar plot)

“Quantity Product in each category Ordered over the year 2017”

data_clean
# Data Wrangling
rank_qty_pcategory <- data_clean %>% 
  filter(created_at >= as.Date("2017-01-01") & created_at <= as.Date("2017-01-06")) %>% 
  group_by(category_name) %>% 
  summarise(qty_pcategory = sum(qty_ordered)) %>% 
  ungroup() %>% 
  arrange(-qty_pcategory) 

rank_qty_pcategory
# Tooltip
rank_qty_pcategory <- rank_qty_pcategory %>% 
  mutate(label = glue(
      "Category : {category_name}
      Quantity : {number(qty_pcategory,  big.mark = '.', decimal.mark =',', accuracy = 1)}"
    )
 )

rank_qty_pcategory
# Visualization 
# "Top Categories by Quantity Sold"
plot5 <- ggplot(rank_qty_pcategory, aes(x = qty_pcategory,
                                        y = reorder(category_name, qty_pcategory),
                                        text = label)) +
  geom_col(aes(fill = qty_pcategory)) +
  scale_fill_gradient(low="black", high="darkgreen") +
  labs(title = "Top Categories by Quantity Sold",
       x = "Quantity",
       y = NULL) +
  scale_x_continuous(labels = label_number(big.mark = '.', decimal.mark =',')) +
  theme_minimal() +
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5))

ggplotly(plot5, tooltip = "text")

Summary: This shows the most popular categories based on the quantity sold from a certain range of date.

PLOT 6: (scatter plot)

“Relationship between price and quantity from a certain range of date”

data_clean
# Data Wrangling
rank_qty_pprice <- data_clean %>% 
  filter(created_at >= as.Date("2017-01-01") & created_at <= as.Date("2017-12-31")) %>% 
  group_by(price) %>% 
  summarise(qty_pprice = sum(qty_ordered)) %>% 
  ungroup() %>% 
  arrange(-qty_pprice) 

rank_qty_pprice
# Tooltip
rank_qty_pprice <- rank_qty_pprice %>%
  mutate(label = glue(
      "Price : {number(price, big.mark = '.', decimal.mark =',', accuracy = 1)}
      Quantity : {number(qty_pprice, big.mark = '.', decimal.mark =',', accuracy = 1)}"
    )
 )

rank_qty_pprice
# Visualization 
# "Quantity Product in each category Ordered over the year 2017"
plot6 <- ggplot(rank_qty_pprice, aes(x = qty_pprice,
                    y = price,
                    text = label)) +
  
  geom_jitter(color="darkgreen") +
  labs(title = "Relationship Between Price and Quantity Sold Products",
       x = "Quantity",
       y = "Price") +
  scale_x_continuous(labels = label_number(big.mark = '.', decimal.mark =',')) +
  scale_y_continuous(labels = label_number(big.mark = '.', decimal.mark =','))+
  theme_minimal() +
  theme(
     plot.title = element_text(hjust = 0.5)
  )

ggplotly(plot6, tooltip = "text")

Summary: Shows the relationship between price and quantity. The higher the price, the less it is sold, and the higher it was sold, the lower the price are.

PLOT 7: (line plot)

“Sales/ Trend over date with many categories(top 8)”

data_clean
# Data Wrangling
rank_sales_cat <- data_clean %>% 
  filter(created_at >= as.Date("2017-01-01") & created_at <= as.Date("2017-01-06")) %>% 
  group_by(category_name) %>% 
  summarise(sales = sum(grand_total)) %>% 
  ungroup() %>% 
  arrange(-sales) 

rank_sales_cat
rank_sales_cat <- rank_sales_cat %>% 
  mutate(
    label = glue(
      "Category: {category_name}
      Total Sales: {number(sales,  big.mark = '.', decimal.mark =',', accuracy = 1)}"
    )
  )

rank_sales_cat
# Visualization 
# "Top Categories by Total Sales"
plot7 <- ggplot(rank_sales_cat, aes(x = sales, 
                                    y = reorder(category_name, sales),
                                    text = label)) +
  geom_segment(aes(x= 0, 
                   xend=sales, 
                   yend=reorder(category_name, sales)),
               color="darkgreen", 
               size= 1) +
  geom_point(color="black", size=3) +
  scale_x_continuous(labels = label_number(big.mark = '.', decimal.mark =',')) +
  labs(title = "Top Categories by Total Sales",
       x = "Total Sales",
       y = NULL) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

ggplotly(plot7, tooltip = "text")

Summary: This ranked the highest sales categories from a certain range of date.

DONE