From f848cc4796f2af6df5b23d198260ea41019646ac Mon Sep 17 00:00:00 2001 From: Peter Gedeck Date: Tue, 7 Jan 2025 12:45:58 -0500 Subject: [PATCH] cleanup --- .Rhistory | 288 ----------------------------------------------------- .gitignore | 1 + 2 files changed, 1 insertion(+), 288 deletions(-) delete mode 100644 .Rhistory diff --git a/.Rhistory b/.Rhistory deleted file mode 100644 index c9bee4d..0000000 --- a/.Rhistory +++ /dev/null @@ -1,288 +0,0 @@ -library(dplyr) -library(tidyr) -library(ggplot2) -library(vioplot) -library(corrplot) -library(gmodels) -library(matrixStats) -PSDS_PATH <- file.path(dirname(dirname(getwd()))) -PSDS_PATH -PSDS_PATH = getwd() -state <- read.csv(file.path(PSDS_PATH, 'data', 'state.csv')) -dfw <- read.csv(file.path(PSDS_PATH, 'data', 'dfw_airline.csv')) -sp500_px <- read.csv(file.path(PSDS_PATH, 'data', 'sp500_data.csv.gz'), row.names=1) -sp500_sym <- read.csv(file.path(PSDS_PATH, 'data', 'sp500_sectors.csv'), stringsAsFactors = FALSE) -kc_tax <- read.csv(file.path(PSDS_PATH, 'data', 'kc_tax.csv.gz')) -lc_loans <- read.csv(file.path(PSDS_PATH, 'data', 'lc_loans.csv')) -airline_stats <- read.csv(file.path(PSDS_PATH, 'data', 'airline_stats.csv'), stringsAsFactors = FALSE) -airline_stats$airline <- ordered(airline_stats$airline, -levels=c('Alaska', 'American', 'Jet Blue', 'Delta', 'United', 'Southwest')) -graph <- ggplot(kc_tax0, (aes(x=SqFtTotLiving, y=TaxAssessedValue))) + -stat_binhex(color='white') + -theme_bw() + -scale_fill_gradient(low='white', high='blue') + -scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) + -labs(x='Finished Square Feet', y='Tax-Assessed Value') -kc_tax0 <- subset(kc_tax, TaxAssessedValue < 750000 & -SqFtTotLiving > 100 & -SqFtTotLiving < 3500) -nrow(kc_tax0) -graph <- ggplot(kc_tax0, (aes(x=SqFtTotLiving, y=TaxAssessedValue))) + -stat_binhex(color='white') + -theme_bw() + -scale_fill_gradient(low='white', high='blue') + -scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) + -labs(x='Finished Square Feet', y='Tax-Assessed Value') -graph -graph <- ggplot(kc_tax0, (aes(x=SqFtTotLiving, y=TaxAssessedValue))) + -stat_binhex(color='white') -graph -graph + scale_fill_gradient(low='white', high='blue') -graph <- ggplot(kc_tax0, (aes(x=SqFtTotLiving, y=TaxAssessedValue))) + -stat_binhex() -graph -ggplot(kc_tax0, (aes(x=SqFtTotLiving, y=TaxAssessedValue))) + -stat_binhex(color='white') + -# theme_bw() + -scale_fill_gradient(low='white', high='blue') -library(dplyr) -library(tidyr) -library(ggplot2) -library(lubridate) -library(ellipse) -library(mclust) -library(cluster) -library(ca) -PSDS_PATH <- file.path(dirname(dirname(getwd()))) -sp500_px <- read.csv(file.path(PSDS_PATH, 'data', 'sp500_data.csv.gz'), row.names=1) -sp500_sym <- read.csv(file.path(PSDS_PATH, 'data', 'sp500_sectors.csv'), stringsAsFactors = FALSE) -loan_data <- read.csv(file.path(PSDS_PATH, 'data', 'loan_data.csv.gz')) -loan_data$outcome <- ordered(loan_data$outcome, levels=c('paid off', 'default')) -housetasks <- read.csv(file.path(PSDS_PATH, 'data', 'housetasks.csv'), row.names=1) -PSDS_PATH -PSDS_PATH <- file.path(dirname(getwd())) -PSDS_PATH -PSDS_PATH <- file.path(getwd()) -sp500_px <- read.csv(file.path(PSDS_PATH, 'data', 'sp500_data.csv.gz'), row.names=1) -sp500_sym <- read.csv(file.path(PSDS_PATH, 'data', 'sp500_sectors.csv'), stringsAsFactors = FALSE) -loan_data <- read.csv(file.path(PSDS_PATH, 'data', 'loan_data.csv.gz')) -loan_data$outcome <- ordered(loan_data$outcome, levels=c('paid off', 'default')) -housetasks <- read.csv(file.path(PSDS_PATH, 'data', 'housetasks.csv'), row.names=1) -oil_px <- sp500_px[, c('CVX', 'XOM')] -pca <- princomp(oil_px) -pca$loadings -loadings <- pca$loadings -graph <- ggplot(data=oil_px, aes(x=CVX, y=XOM)) + -geom_point(alpha=.3) + -scale_shape_manual(values=c(46)) + -stat_ellipse(type='norm', level=.99, color='grey25') + -geom_abline(intercept = 0, slope = loadings[2,1]/loadings[1,1], color='grey25', linetype=2) + -geom_abline(intercept = 0, slope = loadings[2,2]/loadings[1,2], color='grey25', linetype=2) + -scale_x_continuous(expand=c(0,0)) + -scale_y_continuous(expand=c(0,0)) + -coord_cartesian(xlim=c(-3, 3), ylim=c(-3, 3)) + -theme_bw() -graph -syms <- c( 'AAPL', 'MSFT', 'CSCO', 'INTC', 'CVX', 'XOM', 'SLB', 'COP', -'JPM', 'WFC', 'USB', 'AXP', 'WMT', 'TGT', 'HD', 'COST') -top_sp <- sp500_px[row.names(sp500_px)>='2011-01-01', syms] -sp_pca <- princomp(top_sp) -par(mar=c(6,3,0,0)+.1, las=2) -screeplot(sp_pca, main='') -loadings <- sp_pca$loadings[,1:5] -loadings <- as.data.frame(loadings) -loadings$Symbol <- row.names(loadings) -loadings <- gather(loadings, 'Component', 'Weight', -Symbol) -loadings$Color = loadings$Weight > 0 -graph <- ggplot(loadings, aes(x=Symbol, y=Weight, fill=Color)) + -geom_bar(stat='identity', position='identity', width=.75) + -facet_grid(Component ~ ., scales='free_y') + -guides(fill='none') + -ylab('Component Loading') + -theme_bw() + -theme(axis.title.x=element_blank(), -axis.text.x=element_text(angle=90, vjust=0.5)) -graph -ca_analysis <- ca(housetasks) -plot(ca_analysis) -if (!require(ggrepel)) { -install.packages('ggrepel') -library(ggrepel) -} -ca_analysis$sv ** 2 -summary(ca_analysis) -contrib = ca_analysis$sv ** 2 -contrib = contrib / sum(contrib) -colcoord = as.data.frame(ca_analysis$colcoord) -rowcoord = as.data.frame(ca_analysis$rowcoord) -coords = rbind( -cbind(rowcoord, type='rowcoord'), -cbind(colcoord, type='columns') -) -row.names(coords) <- gsub('_', ' ', row.names(coords)) -graph <- ggplot(coords, aes(x=Dim1, y=Dim2, color=type, label=rownames(coords), shape=type)) + -geom_hline(yintercept=0, linetype='dotted', color='#444444') + -geom_vline(xintercept = 0, linetype='dotted', color='#444444') + -geom_point() + -geom_text_repel() + -xlab(sprintf('Dimension 1 (%.1f%%)', 100 * contrib[1])) + -ylab(sprintf('Dimension 2 (%.1f%%)', 100 * contrib[2])) + -scale_color_manual(values = c('blue', 'red')) + -theme_bw() + -theme(legend.position = "none") -graph -set.seed(1010103) -df <- sp500_px[row.names(sp500_px)>='2011-01-01', c('XOM', 'CVX')] -km <- kmeans(df, centers=4, nstart=1) -df$cluster <- factor(km$cluster) -head(df) -centers <- data.frame(cluster=factor(1:4), km$centers) -centers -graph <- ggplot(data=df, aes(x=XOM, y=CVX, color=cluster, shape=cluster)) + -geom_point() + -scale_shape_manual(values = c(1, 3, 2, 4), -guide = guide_legend(override.aes=aes(size=1))) + -geom_point(data=centers, aes(x=XOM, y=CVX), size=2, stroke=2, color='black') + -theme_bw() + -scale_x_continuous(expand=c(0,0)) + -scale_y_continuous(expand=c(0,0)) + -coord_cartesian(xlim=c(-2, 2), ylim=c(-2.5, 2.5)) -graph -syms <- c( 'AAPL', 'MSFT', 'CSCO', 'INTC', 'CVX', 'XOM', 'SLB', 'COP', -'JPM', 'WFC', 'USB', 'AXP', 'WMT', 'TGT', 'HD', 'COST') -df <- sp500_px[row.names(sp500_px) >= '2011-01-01', syms] -set.seed(10010) -km <- kmeans(df, centers=5, nstart=10) -km$size -centers <- as.data.frame(t(km$centers)) -names(centers) <- paste('Cluster', 1:5) -centers$Symbol <- row.names(centers) -centers <- gather(centers, 'Cluster', 'Mean', -Symbol) -centers$Color = centers$Mean > 0 -graph <- ggplot(centers, aes(x=Symbol, y=Mean, fill=Color)) + -geom_bar(stat='identity', position='identity', width=.75) + -facet_grid(Cluster ~ ., scales='free_y') + -guides(fill='none') + -ylab('Component Loading') + -theme_bw() + -theme(axis.title.x=element_blank(), -axis.text.x=element_text(angle=90, vjust=0.5)) -graph -pct_var <- data.frame(pct_var = 0, -num_clusters=2:14) -totalss <- kmeans(df, centers=14, nstart=50, iter.max=100)$totss -for (i in 2:14) { -kmCluster <- kmeans(df, centers=i, nstart=50, iter.max = 100) -pct_var[i-1, 'pct_var'] <- kmCluster$betweenss / totalss -} -graph <- ggplot(pct_var, aes(x=num_clusters, y=pct_var)) + -geom_line() + -geom_point() + -labs(y='% Variance Explained', x='Number of Clusters') + -scale_x_continuous(breaks=seq(2, 14, by=2)) + -theme_bw() -graph -syms1 <- c('GOOGL', 'AMZN', 'AAPL', 'MSFT', 'CSCO', 'INTC', 'CVX', -'XOM', 'SLB', 'COP', 'JPM', 'WFC', 'USB', 'AXP', -'WMT', 'TGT', 'HD', 'COST') -df <- sp500_px[row.names(sp500_px) >= '2011-01-01', syms1] -d <- dist(t(df)) -hcl <- hclust(d) -plot(hcl, ylab='distance', xlab='', sub='', main='') -cutree(hcl, k=4) -cluster_fun <- function(df, method) -{ -d <- dist(df) -hcl <- hclust(d, method=method) -tree <- cutree(hcl, k=4) -df$cluster <- factor(tree) -df$method <- method -return(df) -} -df0 <- sp500_px[row.names(sp500_px) >= '2011-01-01', c('XOM', 'CVX')] -df <- rbind(cluster_fun(df0, method='single'), -cluster_fun(df0, method='average'), -cluster_fun(df0, method='complete'), -cluster_fun(df0, method='ward.D')) -df$method <- ordered(df$method, c('single', 'average', 'complete', 'ward.D')) -graph <- ggplot(data=df, aes(x=XOM, y=CVX, color=cluster, shape=cluster)) + -geom_point(alpha=0.6) + -scale_shape_manual(values=c(1, 3, 4, 2), -guide=guide_legend(override.aes=aes(size=2))) + -facet_wrap( ~ method) + -theme_bw() -graph -mu <- c(.5, -.5) -sigma <- matrix(c(1, 1, 1, 2), nrow=2) -prob <- c(.5, .75, .95, .99) ## or whatever you want -names(prob) <- prob ## to get id column in result -x <- NULL -for (p in prob){ -x <- rbind(x, ellipse(x=sigma, centre=mu, level=p)) -} -df <- data.frame(x, prob=factor(rep(prob, rep(100, length(prob))))) -names(df) <- c('X', 'Y', 'Prob') -## Figure 7-9: Multivariate normal ellipses -dfmu <- data.frame(X=mu[1], Y=mu[2]) -graph <- ggplot(df, aes(X, Y)) + -geom_path(aes(linetype=Prob)) + -geom_point(data=dfmu, aes(X, Y), size=3) + -theme_bw() -graph -df <- sp500_px[row.names(sp500_px)>='2011-01-01', c('XOM', 'CVX')] -mcl <- Mclust(df) -summary(mcl) -cluster <- factor(predict(mcl)$classification) -graph <- ggplot(data=df, aes(x=XOM, y=CVX, color=cluster, shape=cluster)) + -geom_point(alpha=.8) + -theme_bw() + -scale_shape_manual(values = c(1, 3), -guide = guide_legend(override.aes=aes(size=2))) -graph -summary(mcl, parameters=TRUE)$mean -summary(mcl, parameters=TRUE)$variance[,,1] -summary(mcl, parameters=TRUE)$variance[,,2] -plot(mcl, what='BIC', ask=FALSE, cex=.75) -defaults <- loan_data[loan_data$outcome=='default',] -df <- defaults[, c('loan_amnt', 'annual_inc', 'revol_bal', 'open_acc', 'dti', 'revol_util')] -km <- kmeans(df, centers=4, nstart=10) -centers <- data.frame(size=km$size, km$centers) -print(round(centers, digits=2)) -df0 <- scale(df) -km0 <- kmeans(df0, centers=4, nstart=10) -centers0 <- scale(km0$centers, center=FALSE, scale=1/attr(df0, 'scaled:scale')) -centers0 <- scale(centers0, center=-attr(df0, 'scaled:center'), scale=FALSE) -centers0 <- data.frame(size=km0$size, centers0) -print(round(centers0, digits=2)) -km <- kmeans(df, centers=4, nstart=10) -centers <- data.frame(size=km$size, km$centers) -round(centers, digits=2) -syms <- c('GOOGL', 'AMZN', 'AAPL', 'MSFT', 'CSCO', 'INTC', 'CVX', 'XOM', -'SLB', 'COP', 'JPM', 'WFC', 'USB', 'AXP', 'WMT', 'TGT', 'HD', 'COST') -top_15 <- sp500_px[row.names(sp500_px)>='2011-01-01', syms] -sp_pca1 <- princomp(top_15) -screeplot(sp_pca1, main='') -round(sp_pca1$loadings[,1:2], 3) -x <- loan_data[1:5, c('dti', 'payment_inc_ratio', 'home_', 'purpose_')] %>% -mutate(home_=as.factor(home_), purpose_=as.factor(purpose_)) -x -daisy(x, metric='gower') -set.seed(301) -df <- loan_data[sample(nrow(loan_data), 250), -c('dti', 'payment_inc_ratio', 'home_', 'purpose_')] %>% -mutate(home_=as.factor(home_), purpose_=as.factor(purpose_)) -d = daisy(df, metric='gower') -hcl <- hclust(d) -dnd <- as.dendrogram(hcl) -plot(dnd, leaflab='none', ylab='distance') -dnd_cut <- cut(dnd, h=.5) -df[labels(dnd_cut$lower[[4]]),] -df <- model.matrix(~ -1 + dti + payment_inc_ratio + home_ + pub_rec_zero, data=defaults) -df0 <- scale(df) -km0 <- kmeans(df0, centers=4, nstart=10) -centers0 <- scale(km0$centers, center=FALSE, scale=1/attr(df0, 'scaled:scale')) -round(scale(centers0, center=-attr(df0, 'scaled:center'), scale=FALSE), 2) -df -x <- loan_data[1:5, c('dti', 'payment_inc_ratio', 'home_', 'purpose_')] %>% -mutate(home_=as.factor(home_), purpose_=as.factor(purpose_)) -x diff --git a/.gitignore b/.gitignore index 2c2f609..b3482e8 100644 --- a/.gitignore +++ b/.gitignore @@ -130,3 +130,4 @@ dmypy.json .Trash-0 .Rproj.user +.Rhistory \ No newline at end of file