Goal: Data Visualization Capstone Dashboard
Steps: Preprocessing (Cleansing, Change Data Types, Feature Engineering, Data Aggregation) → Tooltip Setup → Visualization → Interactive Visualization → Web Dashboard Development
Preprocessing / Data Wrangling Stages: - ✅ Select columns → select() - ✅ Filter rows → filter() - ✅ Change data types → mutate() - ✅ Add new columns → mutate() - ✅ Aggregate data → group_by() + summarise() + ungroup() - ✅ Sort data → arrange()
We will re-explore the Pakistan Largest Ecommerce Dataset using the dplyr package!
Let’s start by reading the dataset:
# Read Data
data <- read.csv("Pakistan Largest Ecommerce Dataset.csv", stringsAsFactors = TRUE, encoding = "latin1")
# Data Cleansing
data_clean <- data %>%
select(
# Delete Unneeded column
-c(increment_id, sales_commission_code, Working.Date, BI.Status,MV, Year, Month, Customer.Since, M.Y, FY, X, X.1, X.2, X.3, X.4)) %>%
mutate(
# Change Data Types
created_at= dmy(created_at),
sku= as.character(sku),
# Add new columns
month= month(created_at, label = FALSE, abbr = TRUE),
year= year(created_at)
) %>%
# Rename columns
rename(
c(product_name = sku, category_name = category_name_1)) %>%
# Filter data
filter(
# only take data in the year 2017
year(created_at) %in% c(2017),
category_name != "\\N",
grepl("_", product_name)
)
data_clean#> item_id status created_at product_name price
#> 0 0 0 0 0
#> qty_ordered grand_total category_name discount_amount payment_method
#> 0 0 0 0 0
#> Customer.ID month year
#> 0 0 0
Capstone’s Objective: - The accuracy in selecting appropriate plots to compare and represent the data effectively.
“Monthly Sales Trend in Pakistan for 2017”
# Data Wrangling
sales_trend <- data_clean %>%
group_by(month) %>%
summarise(total_sales = sum(grand_total)) %>%
ungroup() %>%
arrange(month)
sales_trendsales_trend <- sales_trend %>%
mutate(
month_name = factor(month, levels = 1:12, labels = month.abb),
full_month_name = factor(month, levels = 1:12, labels = month.name)
)
sales_trend# Tooltip
sales_trend <- sales_trend %>%
mutate(
label = glue(
"Total Sales: {number(total_sales, big.mark = '.', decimal.mark =',', accuracy = 1)}
Month: {full_month_name}"
)
)
sales_trend# Visualization
#"Monthly Sales Trend in Pakistan for 2017"
plot1 <- ggplot(sales_trend, aes(x=month_name, y= total_sales))+
geom_line(col="darkgreen", group=1) +
geom_point(aes(text=label), col="black") +
scale_y_continuous(labels = label_number(big.mark = ".", decimal.mark = ","),
breaks = seq(0, 100000000, 10000000)) +
labs(
title = "Monthly Sales Trend",
x = NULL,
y = "Total Sales"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5)
)
ggplotly(plot1, tooltip = "text")Summary: This shows the Ecommerce sales growth throughout the year.
Total_Sales<- data_clean %>% summarise(Total_sales=number(sum(grand_total), big.mark = '.', decimal.mark =',', accuracy = 1))
Total_SalesTotal_product<- data_clean %>% summarise(total_product=number(sum(qty_ordered), big.mark = '.', decimal.mark =',', accuracy = 1))
Total_productTotal_customer<- data_clean %>%
summarise(total_cust=number(length(unique(Customer.ID)), big.mark = '.', decimal.mark =',', accuracy = 1))
Total_customer“Total Orders by Payment Method in Pakistan”
# Data Wrangling
fav_payment<- data_clean %>%
group_by(payment_method) %>%
summarise(Payment_type = n()) %>%
ungroup() %>%
arrange(-Payment_type)
fav_paymentfav_payment <- fav_payment %>%
mutate(
label = glue(
"Payment Method: {payment_method}
Total Orders: {number(Payment_type, big.mark = '.', decimal.mark =',', accuracy = 1)}"
)
)
fav_payment# Visualization ~> Opt. 1: Lollipop Plot
#"Fav Payment Method"
plot2 <- ggplot(fav_payment, aes(x = Payment_type,
y = reorder(payment_method, Payment_type),
text = label)) +
geom_segment(aes(x= 0, xend=Payment_type, yend=reorder(payment_method, Payment_type)), color="darkgreen", size= 1) +
geom_point(color="black", size=3) +
scale_x_continuous(labels = comma) +
labs(title = "Top Payment Method",
x = "Total Orders",
y = NULL) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5)
)
ggplotly(plot2, tooltip = "text")# Visualization ~> Opt. 2: Bar Plot
#"Fav Payment Method"
plot2 <- ggplot(fav_payment, aes(x = Payment_type,
y = reorder(payment_method, Payment_type),
text = label)) +
geom_col(aes(fill=Payment_type)) +
scale_x_continuous(labels = comma)+
scale_fill_gradient(low="black", high="darkgreen") +
labs(title = "Top Payment Method in Pakistan 2017",
x = "Total Orders",
y = NULL) +
theme_minimal()+
theme(legend.position = "none")
ggplotly(plot2, tooltip = "text")Summary: This identifies the most popular payment methods in the country. In the end, I decided to use a lollipop plot because, in a bar plot, it was difficult to view the tooltip for the lowest-ranked categories (the bars were too thin due to the small quantities).
“Monthly total sales in each category”
# Data Wrangling
category_sales_trend <- data_clean %>%
filter(category_name =="Beauty & Grooming") %>%
group_by(month) %>%
summarise(total_sales_category = sum(grand_total)) %>%
ungroup() %>%
arrange(month)
category_sales_trendcategory_sales_trend <- category_sales_trend %>%
mutate(
month_name = factor(month, levels = 1:12, labels = month.abb),
full_month_name = factor(month, levels = 1:12, labels = month.name)
)
category_sales_trend# Tooltip
category_sales_trend <- category_sales_trend %>%
mutate(
label = glue(
"Total Sales: {number(total_sales_category, big.mark = '.', decimal.mark =',', accuracy = 1)}
Month: {full_month_name}"
)
)
category_sales_trend# Visualization ~> Opt. 1: Line Plot
# "Monthly total sales in each category"
plot3 <- ggplot(category_sales_trend, aes(x=month_name, y= total_sales_category))+
geom_line(col="darkgreen", group=1) +
geom_point(aes(text=label), col="black") +
scale_y_continuous(labels = label_number(big.mark = '.', decimal.mark =','),
breaks = seq(0, 100000000, 200000)) +
labs(
title = "Monthly Sales for Beauty & Grooming",
x = NULL,
y = "Total Sales"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5)
)
ggplotly(plot3, tooltip = "text")# Visualization ~> Opt. 2: Lollipop Plot
# "Monthly total sales in each category"
plot3 <- ggplot(category_sales_trend, aes(x=month_name, y= total_sales_category, text=label)) +
geom_segment(aes(xend=month_name, y=0,yend=total_sales_category), color="darkgreen", size= 1) +
geom_point(color="black", size=3) +
scale_y_continuous(labels = label_number(, big.mark = '.', decimal.mark =','),
breaks = seq(0, 100000000, 200000)) +
labs(title = "Monthly Sales for Beauty & Grooming",
x = NULL,
y = "Total Sales") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5)
)
ggplotly(plot3, tooltip = "text")Summary: This gives a view of which month was has the most sales in Pakistan. In the end, I chose to use a line plot because the data is presented on a monthly basis, making the line plot more suitable for showing trends over time.
“Top 10 Best Selling Product in each category”
# Data Wrangling
fav_product <- data_clean %>%
filter(category_name =="Mobiles & Tablets") %>%
group_by(product_name) %>%
summarise(total_qty = sum(qty_ordered), total_sold= sum(grand_total)) %>%
ungroup() %>%
arrange(-total_qty) %>%
head(10)
fav_productprocess_product_name <- function(product_name) {
parts <- str_split(product_name, " - ") %>% unlist() # Divided by ' - '
if (length(parts) > 2) {
# if more than 2 words, print first and last word
return(paste(parts[1], parts[length(parts)], sep = " - "))
} else {
# if less than 2 words, do nothing
return(product_name)
}
}library(stringr)
fav_product <- fav_product %>%
mutate(short_name = sapply(product_name, process_product_name)) %>%
mutate(
label = glue(
"Product : {product_name}
Total Sales : {number(total_sold, big.mark='.', decimal.mark=',', accuracy = 1)}
Quantity : {number(as.numeric(total_qty), big.mark='.', decimal.mark=',', accuracy = 1)}"
)
)
fav_product# Visualization
# "Top 10 Best Selling Product in each category"
plot4 <- ggplot(fav_product, aes(x = total_sold,
y = reorder(short_name, total_sold),
text = label)) +
geom_col(aes(fill = total_sold)) +
scale_fill_gradient(low="black", high="darkgreen") +
labs(title = "Top Mobiles & Tablets Products",
x = "Total Sales",
y = NULL) +
scale_x_continuous(labels = label_number(big.mark = '.', decimal.mark =',')) +
theme_minimal() +
theme(legend.position = "none", plot.title = element_text(hjust = 0.5))
ggplotly(plot4, tooltip = "text")Summary: This shows the most popular product by the number of items sold & contributed to the highest sales in Pakistan.
“Quantity Product in each category Ordered over the year 2017”
# Data Wrangling
rank_qty_pcategory <- data_clean %>%
filter(created_at >= as.Date("2017-01-01") & created_at <= as.Date("2017-01-06")) %>%
group_by(category_name) %>%
summarise(qty_pcategory = sum(qty_ordered)) %>%
ungroup() %>%
arrange(-qty_pcategory)
rank_qty_pcategory# Tooltip
rank_qty_pcategory <- rank_qty_pcategory %>%
mutate(label = glue(
"Category : {category_name}
Quantity : {number(qty_pcategory, big.mark = '.', decimal.mark =',', accuracy = 1)}"
)
)
rank_qty_pcategory# Visualization
# "Top Categories by Quantity Sold"
plot5 <- ggplot(rank_qty_pcategory, aes(x = qty_pcategory,
y = reorder(category_name, qty_pcategory),
text = label)) +
geom_col(aes(fill = qty_pcategory)) +
scale_fill_gradient(low="black", high="darkgreen") +
labs(title = "Top Categories by Quantity Sold",
x = "Quantity",
y = NULL) +
scale_x_continuous(labels = label_number(big.mark = '.', decimal.mark =',')) +
theme_minimal() +
theme(legend.position = "none", plot.title = element_text(hjust = 0.5))
ggplotly(plot5, tooltip = "text")Summary: This shows the most popular categories based on the quantity sold from a certain range of date.
“Relationship between price and quantity from a certain range of date”
# Data Wrangling
rank_qty_pprice <- data_clean %>%
filter(created_at >= as.Date("2017-01-01") & created_at <= as.Date("2017-12-31")) %>%
group_by(price) %>%
summarise(qty_pprice = sum(qty_ordered)) %>%
ungroup() %>%
arrange(-qty_pprice)
rank_qty_pprice# Tooltip
rank_qty_pprice <- rank_qty_pprice %>%
mutate(label = glue(
"Price : {number(price, big.mark = '.', decimal.mark =',', accuracy = 1)}
Quantity : {number(qty_pprice, big.mark = '.', decimal.mark =',', accuracy = 1)}"
)
)
rank_qty_pprice# Visualization
# "Quantity Product in each category Ordered over the year 2017"
plot6 <- ggplot(rank_qty_pprice, aes(x = qty_pprice,
y = price,
text = label)) +
geom_jitter(color="darkgreen") +
labs(title = "Relationship Between Price and Quantity Sold Products",
x = "Quantity",
y = "Price") +
scale_x_continuous(labels = label_number(big.mark = '.', decimal.mark =',')) +
scale_y_continuous(labels = label_number(big.mark = '.', decimal.mark =','))+
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5)
)
ggplotly(plot6, tooltip = "text")Summary: Shows the relationship between price and quantity. The higher the price, the less it is sold, and the higher it was sold, the lower the price are.
“Sales/ Trend over date with many categories(top 8)”
# Data Wrangling
rank_sales_cat <- data_clean %>%
filter(created_at >= as.Date("2017-01-01") & created_at <= as.Date("2017-01-06")) %>%
group_by(category_name) %>%
summarise(sales = sum(grand_total)) %>%
ungroup() %>%
arrange(-sales)
rank_sales_catrank_sales_cat <- rank_sales_cat %>%
mutate(
label = glue(
"Category: {category_name}
Total Sales: {number(sales, big.mark = '.', decimal.mark =',', accuracy = 1)}"
)
)
rank_sales_cat# Visualization
# "Top Categories by Total Sales"
plot7 <- ggplot(rank_sales_cat, aes(x = sales,
y = reorder(category_name, sales),
text = label)) +
geom_segment(aes(x= 0,
xend=sales,
yend=reorder(category_name, sales)),
color="darkgreen",
size= 1) +
geom_point(color="black", size=3) +
scale_x_continuous(labels = label_number(big.mark = '.', decimal.mark =',')) +
labs(title = "Top Categories by Total Sales",
x = "Total Sales",
y = NULL) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(plot7, tooltip = "text")Summary: This ranked the highest sales categories from a certain range of date.