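# Global setup for the Zasti claims-analytics Shiny app (v4): loads packages,
# pulls case data from MariaDB, derives date-gap features, scores claims with
# PCA + an isolation forest, and prepares the text/NER data frames.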

rm(list = ls())
setwd("~/../../../../../data/Zasti/132419/R/shiny_v4")
options(shiny.autoreload = TRUE)
options(shiny.host = '10.224.11.6')
options(shiny.port = 8000)

# De-duplicated package list; 'solitude' is included because it is loaded below.
all_packages <- c('shinycssloaders','utils','solitude','ggplot2','robustHD','isotree','plyr','ggthemes','plotly',
                  'tidyverse','grid','shinythemes','shinyFiles','shiny','ggridges','wesanderson','ggrepel',
                  'dplyr','widyr','GGally','igraph','ggraph','tidytext','rjson','network','sna','wordcloud',
                  'RMariaDB','shinyjs','lime','vip','h2o','caret','pdp','randomForest','reshape2','gutenbergr',
                  'mallet','topicmodels','htm2txt','timevis','reticulate','data.table','textdata')
not_installed <- all_packages[!(all_packages %in% installed.packages()[, "Package"])]
if (length(not_installed)) install.packages(not_installed, dependencies = TRUE, repos = "https://cloud.r-project.org")



library(shinycssloaders)
library(utils)
library(solitude)
library(ggplot2)
library(robustHD)
library(isotree)
library(plyr)
library(ggthemes)
library(plotly)
library(tidyverse)
library(grid)
library(shinythemes)
library(shinyFiles)
library(shiny)
library(ggridges)
library(wesanderson)
library(ggrepel)
library(dplyr)
library(widyr)
library(GGally)
library(igraph)
library(ggraph)
library(tidytext)
library(rjson)
library(network)
library(sna)
library(wordcloud)
library(RMariaDB)
library(shinyjs)
library(lime)
library(vip)
library(h2o)
library(caret)
library(pdp)
# library(IsolationForest)
library(randomForest)
library(reshape2)
library(gutenbergr)
library(mallet)
library(topicmodels)
library(htm2txt)
library(timevis)
library(reticulate)
library(data.table)
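
# The library() block above could be collapsed into a loop; a minimal sketch,
# equivalent in effect (load order follows the vector, so plyr still loads
# before dplyr):
# invisible(lapply(all_packages, library, character.only = TRUE))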

# jsResetCode <- "shinyjs.reset = function() {history.go(0)}"

# print(memory.size())
db_connect <- function () {
  db_con <- dbConnect(RMariaDB::MariaDB(),
                      user = 'zasti',
                      password = 'Zastipass1@',
                      dbname = 'axa_test',
                      host = 'localhost')
  return(db_con)
}
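
# Credentials are hard-coded above; a minimal sketch of the same connection
# reading them from the environment instead (DB_USER / DB_PASSWORD are
# illustrative variable names, not part of the existing deployment):
# db_connect <- function () {
#   dbConnect(RMariaDB::MariaDB(),
#             user = Sys.getenv('DB_USER'),
#             password = Sys.getenv('DB_PASSWORD'),
#             dbname = 'axa_test',
#             host = 'localhost')
# }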

db_con <- db_connect()
# dbListTables(db_con)

get_tmp_all_data <- function (db_con) {
  tmp_all_data <- dbReadTable(db_con, 'case_details')
  return(tmp_all_data)
}

tmp_all_data <- get_tmp_all_data(db_con)
get_all_data <- function (tmp_all_data) {
  # Exploratory NA checks, kept for reference:
  # sum(is.na(tmp_all_data$city))
  # summary(tmp_all_data$city)
  tmp_all_data_filtered <- tmp_all_data[, c('claim_number', 'language', 'language_confidence', 'summary', 'sentiment',
                                            'snapshot_date', 'policy_number', 'policy_status', 'effective_date',
                                            'expiration_date', 'product_type', 'producing_office', 'coinsurance_share',
                                            'insured_insured', 'insured_street', 'trigger_date', 'incident_date',
                                            'date_received', 'reported_by', 'reported_to', 'loss_type', 'line_of_business',
                                            'method_of_notification', 'file_type', 'loss_description', 'entire_text',
                                            'category')]
  all_data <- tmp_all_data_filtered %>%
    rename(
      case_id = claim_number,
      primary_language = language,
      primary_language_confidence = language_confidence,
      snapshot_datetime = snapshot_date,
      insured = insured_insured,
      street = insured_street,
      text = entire_text
    )
  # Coerce the key dates to Date.
  all_data$effective_date <- as.Date(all_data$effective_date)
  all_data$incident_date <- as.Date(all_data$incident_date)
  all_data$date_received <- as.Date(all_data$date_received)
  all_data$expiration_date <- as.Date(all_data$expiration_date)
  all_data$snapshot_datetime <- as.Date(all_data$snapshot_datetime)

  # Month / week / year roll-ups of each key date, for aggregation and plotting.
  all_data$incident_month <- as.Date(cut(all_data$incident_date, breaks = 'month'))
  all_data$incident_week <- as.Date(cut(all_data$incident_date, breaks = 'week'))
  all_data$incident_year <- as.Date(cut(all_data$incident_date, breaks = 'year'))
  all_data$incident_year_factor <- as.factor(format(all_data$incident_date, '%Y'))
  all_data$incident_month_factor <- as.factor(format(all_data$incident_date, '%m'))

  all_data$effective_month <- as.Date(cut(all_data$effective_date, breaks = 'month'))
  all_data$effective_week <- as.Date(cut(all_data$effective_date, breaks = 'week'))
  all_data$effective_year <- as.Date(cut(all_data$effective_date, breaks = 'year'))
  all_data$effective_year_factor <- as.factor(format(all_data$effective_date, '%Y'))
  all_data$effective_month_factor <- as.factor(format(all_data$effective_date, '%m'))

  all_data$received_month <- as.Date(cut(all_data$date_received, breaks = 'month'))
  all_data$received_week <- as.Date(cut(all_data$date_received, breaks = 'week'))
  all_data$received_year <- as.Date(cut(all_data$date_received, breaks = 'year'))
  all_data$received_year_factor <- as.factor(format(all_data$date_received, '%Y'))
  all_data$received_month_factor <- as.factor(format(all_data$date_received, '%m'))

  # Day gaps between key dates; Date subtraction yields 'difftime', coerced to
  # numeric below so the gaps can feed PCA and the isolation forest directly.
  all_data$incident_effective_days_gap <- all_data$incident_date - all_data$effective_date
  all_data$received_incident_days_gap <- all_data$date_received - all_data$incident_date
  all_data$expiration_received_days_gap <- all_data$expiration_date - all_data$date_received
  all_data$expiration_incident_days_gap <- all_data$expiration_date - all_data$incident_date
  all_data$received_expiration_days_gap <- all_data$date_received - all_data$expiration_date
  all_data$snapshot_received_days_gap <- all_data$snapshot_datetime - all_data$date_received
  all_data$expiration_effective_days_gap <- all_data$expiration_date - all_data$effective_date
  all_data$effective_received_days_gap <- all_data$effective_date - all_data$date_received

  all_data$incident_effective_days_gap <- as.numeric(all_data$incident_effective_days_gap)
  all_data$received_incident_days_gap <- as.numeric(all_data$received_incident_days_gap)
  all_data$expiration_received_days_gap <- as.numeric(all_data$expiration_received_days_gap)
  all_data$expiration_incident_days_gap <- as.numeric(all_data$expiration_incident_days_gap)
  all_data$received_expiration_days_gap <- as.numeric(all_data$received_expiration_days_gap)
  all_data$snapshot_received_days_gap <- as.numeric(all_data$snapshot_received_days_gap)
  all_data$expiration_effective_days_gap <- as.numeric(all_data$expiration_effective_days_gap)
  all_data$effective_received_days_gap <- as.numeric(all_data$effective_received_days_gap)

  all_data$category <- as.factor(all_data$category)
  levels(all_data$category) <- c('None', 'Accident', 'Fire', 'Natural Calamity', 'Temperature Related', 'Theft / Missing', 'Transportation / Delay')


  # str(all_data$number_of_iso_matching_claims)
  # all_data$number_of_iso_matching_claims <- as.numeric(all_data$number_of_iso_matching_claims)
  # map(all_data, ~sum(is.na(.)))
  # all_data[is.na(all_data$number_of_iso_matching_claims), 'number_of_iso_matching_claims'] <- as.numeric(-1)
  # colnames(all_data)
  all_data <- all_data[, -5]   # drop column 5 ('sentiment', given the rename above)
  all_data <- na.omit(all_data)
  all_data$case_id <- as.integer(all_data$case_id)
  # all_data <- all_data[c(1:500), ]
  # Join selected fields from final_data_set.csv onto the case data.
  final <- read.csv('final_data_set.csv')
  final_to_join <- final[, c(2, 13, 31, 177:179)]
  colnames(final_to_join)[1] <- 'case_id'
  all_data <- merge(all_data, final_to_join, by = 'case_id', all.x = TRUE)

  # Heuristic fraud sub-scores; trailing spaces in the level names match the
  # raw values in final_data_set.csv.
  all_data$cause_score <- ifelse(all_data$Cause.of.Loss %in% c('Theft ', 'Fire Damage ', 'Robbery ', 'Arson '), 5, 0.5)
  all_data$loss_effective_score <- ifelse((all_data$incident_effective_days_gap <= 7) & (all_data$incident_effective_days_gap >= 0), 5, 0.5)
  all_data$loss_expiry_score <- ifelse((all_data$expiration_incident_days_gap <= 7) & (all_data$expiration_incident_days_gap >= 0), 5, 0.5)
  all_data$complexity_score <- ifelse(all_data$Complexity == 'High ', 5, ifelse(all_data$Complexity == 'Medium ', 3, ifelse(all_data$Complexity == 'Low ', 1.5, 0)))
  # Weighted blend; the weights (0.3 + 0.3 + 0.3 + 0.1) sum to 1, keeping the
  # combined score on the sub-scores' 0-5 scale.
  all_data$fraud_score <- 0.3 * all_data$cause_score + 0.3 * all_data$loss_effective_score + 0.3 * all_data$loss_expiry_score + 0.1 * all_data$complexity_score



  # colnames(all_data)
  return(all_data)
}


all_data <- get_all_data(tmp_all_data)
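
# Optional sanity check: merge(..., all.x = TRUE) in get_all_data() leaves NA
# in the joined columns for unmatched case_ids; a quick count of those rows:
# sum(is.na(all_data$Cause.of.Loss))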

# dbListTables(db_con)
# dbListFields( db_con, 'all_file_details')
# dbListFields( db_con, 'case_details')
# dbListFields( db_con, 'case_monetary_relationship_details')
# dbListFields( db_con, 'case_ner_details')




# all_data <- read.csv('case_level_master_df.csv')


get_all_data_file <- function (db_con) {
  all_data_file <- dbReadTable(db_con, 'all_file_details')
  all_data_file$file_name <- basename(all_data_file$file_path)
  all_data_file$sentiment <- as.numeric(all_data_file$sentiment)
  all_data_file$summary <- as.character(all_data_file$summary)
  all_data_file$file_name <- as.character(all_data_file$file_name)
  all_data_file <- all_data_file %>%
    rename(
      case_id = claim_id
    )
  return(all_data_file)
}

all_data_file <- get_all_data_file(db_con)

# all_data_file <- read.csv('4_id_file_name_summary_sentiment_top.csv')
# print('3')
# all_ner <- read.csv('4_ner_entities.csv')
get_all_ner <- function (db_con) {
  all_ner <- dbReadTable(db_con, 'case_ner_details')
  all_ner <- all_ner %>%
    rename(
      case_id = claim_id
    )
  return(all_ner)
}

all_ner <- get_all_ner(db_con)
# print('4')
# all_ner_relations <- read.csv('5_ner_relations.csv')
get_all_ner_relations <- function (db_con) {
  all_ner_relations <- dbReadTable(db_con, 'case_monetary_relationship_details')
  all_ner_relations <- all_ner_relations %>%
    rename(
      case_id = claim_id
    )
  return(all_ner_relations)
}

all_ner_relations <- get_all_ner_relations(db_con)
# print('5')

dbDisconnect(db_con)
# rows <- sample(nrow(all_data))
# all_data <- all_data[rows,]
#
# colnames(first_snaps)
# colnames(all_data)
get_first_snaps <- function (all_data) {
  # Columns are addressed by position, so this depends on the column order
  # produced by get_all_data(); 50-51 are the joined Complexity / Cause.of.Loss
  # fields and 59 is fraud_score.
  colnames(all_data)[50] <- 'complexity_label'
  colnames(all_data)[51] <- 'cause_of_loss'
  # colnames(all_data)[59] <- 'fraud_scr'
  first_snaps <- all_data[, c(1, 2, 3, 5:24, 26:49, 50, 51, 59)]
  return(first_snaps)
}

first_snaps <- get_first_snaps(all_data)
# first_snaps = all_data[,c(3,4,5,29:60)]
# data_iso <- all_data[,c(54:60)]

get_data_iso <- function (all_data) {
  # Columns 42:49 hold the eight *_days_gap features derived above.
  data_iso <- all_data[, c(42:49)]
  return(data_iso)
}

data_iso <- get_data_iso(all_data)
do_pca <- function (data_iso, first_snaps) {
  # Note: a standardized copy is computed here, but prcomp() below runs on the
  # raw features (scale. = FALSE), so high-variance gaps dominate the components.
  data_iso_standard <- standardize(data_iso)
  data_iso_pca <- prcomp(data_iso, scale. = FALSE, rank. = 2)
  first_snaps$PC1 <- data_iso_pca$x[, 'PC1']
  first_snaps$PC2 <- data_iso_pca$x[, 'PC2']
  return(first_snaps)
}

first_snaps <- do_pca(data_iso, first_snaps)
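
# Optional check on the two retained components; summary() reports the
# cumulative share of variance they capture (a diagnostic sketch only):
# summary(prcomp(data_iso, scale. = FALSE))$importance['Cumulative Proportion', 1:2]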


get_pred_iso <- function (first_snaps) {
  # Isolation forest on the two principal components; higher scores are more anomalous.
  iso <- isolation.forest(first_snaps[, c('PC1', 'PC2')],
                          ntrees = 100,
                          random_seed = 6)
  pred_iso <- predict(iso, first_snaps[, c('PC1', 'PC2')])
  # first_snaps$outlier_score <- pred_iso
  return(pred_iso)
}

pred_iso <- get_pred_iso(first_snaps)
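
# The 0.73 outlier cut-off used further down is a manual choice; a data-driven
# alternative would take a quantile of the scores, e.g. flagging the top 5%:
# quantile(pred_iso, probs = 0.95)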


get_outlier_score <- function (all_data, pred_iso) {
  all_data$outlier_score <- pred_iso
  return(all_data)
}

all_data <- get_outlier_score(all_data, pred_iso)


get_score <- function (first_snaps, pred_iso) {
  first_snaps$outlier_score <- pred_iso
  return(first_snaps)
}


first_snaps <- get_score(first_snaps, pred_iso)

get_data_rf <- function (first_snaps) {
  final <- read.csv('final_data_set.csv')
  colnames(final)[2] <- 'case_id'
  all_data_join_rf <- merge(first_snaps, final, by = 'case_id', all.x = TRUE)
  # Model features are picked by position from the joined frame.
  data_rf <- all_data_join_rf[, c(1, 9, 10, 11, 12, 21, 29, 40:47, 53, 65, 230, 83)]
  data_rf$product_type <- as.factor(data_rf$product_type)
  data_rf$producing_office <- as.factor(data_rf$producing_office)
  data_rf$coinsurance_share <- as.numeric(data_rf$coinsurance_share)
  data_rf$insured <- as.factor(data_rf$insured)
  data_rf$method_of_notification <- as.factor(data_rf$method_of_notification)
  data_rf$incident_month_factor <- as.factor(data_rf$incident_month_factor)
  # Collapse Complexity into a binary Fast_Track / Normal target (the trailing
  # space in 'Fast Track ' matches the raw value).
  data_rf$Complexity <- ifelse(data_rf$Complexity == 'Fast Track ', 'Fast_Track', 'Normal')
  data_rf$Complexity <- as.factor(data_rf$Complexity)
  data_rf$Indemnity_Payment <- as.numeric(data_rf$Indemnity_Payment)

  data_rf <- na.omit(data_rf)
  data_rf <- droplevels(data_rf)
  # colnames(data_rf)
  print('rf_done')
  return(data_rf)
}

data_rf <- get_data_rf(first_snaps)
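
# The commented-out LIME block below loads a pre-trained 'rf_model.rds' that is
# not built in this script. A sketch of how such a model could be fit from
# data_rf (assuming Complexity is the target, as the 'Fast_Track' label in the
# LIME call suggests; the call is illustrative, not the original training code):
# rf_fit <- caret::train(Complexity ~ ., data = data_rf[, -1], method = 'rf')
# saveRDS(rf_fit, 'rf_model.rds')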








# get_lime_complexity <- function (data_rf) {
# rf_fit <- readRDS('rf_model.rds')
# explainer_rf <- lime(data_rf[,-17], rf_fit, n_bins = 50)
# return(explainer_rf)
# }
#
# lime_exp_com <- get_lime_complexity(data_rf)


get_iso_data <- function (first_snaps) {
  # case_id plus the eight *_days_gap features (columns 40:47 of first_snaps).
  rf_iso_data <- first_snaps[, c(1, 40:47)]
  # Label points above the manually chosen 0.73 score cut-off as outliers.
  rf_iso_data$outlier_label <- ifelse(first_snaps$outlier_score > 0.73, 'outlier', 'normal')
  # table(rf_iso_data$outlier_label)
  rf_iso_data$outlier_label <- as.factor(rf_iso_data$outlier_label)
  return(rf_iso_data)
}
rf_iso_data <- get_iso_data(first_snaps)

# get_out_complexity <- function (rf_iso_data) {
# iso_fit <- readRDS('iso_model.rds')
# # colnames(data_rf)
# explainer_rf <- lime(rf_iso_data[,-10], iso_fit, n_bins = 1000000)
# return(explainer_rf)
# }
#
# lime_exp_out <- get_out_complexity(rf_iso_data)

get_notes <- function () {
  notes <- read.csv('Zasti-ClaimNotes19May2020.csv', as.is = TRUE, flush = TRUE)

  # Keep rows with a valid 7-digit claim number and a 23-character timestamp.
  notes$N_CLAIM_NUMBER <- as.integer(as.character(notes$N_CLAIM_NUMBER))
  notes$claim_no_nchar <- nchar(notes$N_CLAIM_NUMBER)
  notes <- notes[notes$claim_no_nchar == 7, ]
  notes <- notes[!is.na(notes$N_CLAIM_NUMBER), ]
  notes$date_nchar <- nchar(as.character(notes[, 3]))
  notes <- notes[notes$date_nchar == 23, ]
  notes$D_CREATE_TS <- as.Date(notes$D_CREATE_TS)
  notes$end <- NA
  # Collapse newlines to spaces and strip hyphens before converting HTML to text.
  notes$T_FILE_NTE_DESC <- htm2txt(gsub('\n', ' ', gsub('-', '', notes$T_FILE_NTE_DESC)))
  # write.csv(notes, 'notes.csv')
  # str(notes$T_SHRT_DESC)
  notes$T_SHRT_TXT <- as.character(notes$T_SHRT_TXT)
  notes$T_SHRT_DESC <- as.character(notes$T_SHRT_DESC)
  notes$text <- paste0(notes$T_SHRT_DESC, notes$T_SHRT_TXT)
  print('notes doing')
  return(notes)
}


notes <- get_notes()
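
# timevis is loaded above and notes carries D_CREATE_TS plus an empty 'end'
# column, which matches timevis's start/end schema; a minimal rendering sketch
# (the column mapping is an assumption, not taken from the app code):
# timevis(data.frame(id      = seq_len(nrow(notes)),
#                    start   = notes$D_CREATE_TS,
#                    end     = notes$end,
#                    content = notes$T_SHRT_DESC))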

print('notes done')



# first_snaps <- do_iso(first_snaps)
set.seed(6)
# print(.Random.seed)

# pred_iso


# str(first_snaps)
# data_iso_pca$x[,'PC1']
# tmp[,1]
# iso <- isolation.forest(data_iso_standard, ntrees = 100)
# pred <- predict(iso, data_iso_standard)


plot_heights <- '300px'
plot_heights_bigger <- '400px'
wes_p <- 'Darjeeling1'
gg_theme <- theme_clean()

# Corpus
# case_ids <- fromJSON(file = '1_all_case_ids.json')
# file_names <- fromJSON(file = '1_all_file_names.json')
# texts <- fromJSON(file = '1_all_parsed_texts.json')
# texts_df <- data.frame(case_id = case_ids,
# file_name = file_names,
# text = texts)
# colnames(all_data_file)
get_text_df <- function (all_data_file) {
  texts_df <- all_data_file
  texts_df$text <- as.character(texts_df$text)
  texts_df$file_name <- as.character(texts_df$file_name)
  return(texts_df)
}
texts_df <- get_text_df(all_data_file)
# str(texts_df)


graph_pal <- wes_palette(wes_p, type = 'discrete', n = 5)
ner_levels <- c('GPE', 'LOC', 'PERSON', 'MONEY', 'DATE')

# file_name is character (coerced above), so levels() would return NULL.
fnames <- sort(unique(all_data_file$file_name))




# Precomputed LIME explanations, stored as Python pickles via reticulate
# (see the commented-out py_save_object() calls at the end of this file).
explanation_rf <- py_load_object('complexity_lime')
explanation_iso <- py_load_object('outlier_lime')

# Cached token tables; the commented-out pipelines below show how they were built.
text_doc_tidy <- fread('text_doc_tidy.csv')
text_doc_corr_tokens <- fread('text_doc_corr_tokens.csv')


#
#
#
# text_doc_tidy <- texts_df %>%
# unnest_tokens(word, text) %>%
# anti_join(stop_words) %>%
# group_by(case_id) %>%
# dplyr::count(word, sort = T) %>%
# ungroup() %>%
# inner_join(get_sentiments('afinn')) %>%
# mutate(sentiment = scales::rescale(value, c(0,1)))
# fwrite(text_doc_tidy, 'text_doc_tidy.csv')
#
#
#
#
# text_doc_corr_tokens <- tibble(chapter = seq_along(texts_df$file_name),
# texts = texts_df$text,
# case_id = texts_df$case_id,
# fnames = texts_df$file_name) %>%
# unnest_tokens(word,texts) %>%
# filter(!word %in% stop_words$word)
#
# fwrite(text_doc_corr_tokens, 'text_doc_corr_tokens.csv')
#
#
#
#
#
#
#
#
# get_lime_complexity <- function (data_rf) {
# rf_fit <- readRDS('rf_model.rds')
# explainer_rf <- lime(data_rf[,-17], rf_fit, n_bins = 50)
# return(explainer_rf)
# }
#
# lime_exp_com <- get_lime_complexity(data_rf)
# get_out_complexity <- function (rf_iso_data) {
# iso_fit <- readRDS('iso_model.rds')
# # colnames(data_rf)
# explainer_rf <- lime(rf_iso_data[,-10], iso_fit, n_bins = 50)
# return(explainer_rf)
# }
#
# lime_exp_out <- get_out_complexity(rf_iso_data)
#
#
#
#
# data_exp_com <- data_rf[!duplicated(data_rf$case_id),]
# rownames(data_exp_com) <- data_exp_com$case_id
# explanation_rf <- explain(
# x = data_exp_com[ , -17],
# explainer = lime_exp_com,
# n_permutations = 100,
# dist_fun = 'manhattan',
# kernel_width = 3,
# n_features = 7,
# feature_select = 'lasso_path',
# labels = 'Fast_Track',
# # n_labels = 2
# )
# py_save_object(explanation_rf, 'complexity_lime')
#
#
#
# data_exp_out <- rf_iso_data[!duplicated(rf_iso_data$case_id),]
# rownames(data_exp_out) <- data_exp_out$case_id
# explanation_iso <- explain(
# x = data_exp_out[,-10],
# explainer = lime_exp_out,
# n_permutations = 100,
# dist_fun = 'manhattan',
# kernel_width = 3,
# n_features =7,
# feature_select = 'lasso_path',
# labels = 'outlier',
# # n_labels = 2
# )
# py_save_object(explanation_iso, 'outlier_lime')