options(stringsAsFactors=FALSE) library(WriteXLS) source("read_fasta.R") bad_s <- "YP_009725307.1" cod <- read.table("cod_to_AA.txt") #fin1 <- "gisaid_delta_05_11.out" #fin2 <- "gisaid_delta_05_11.fasta" #fout1 <- "gisaid_delta_05_11_mutations.xls" #fout2 <- "gisaid_delta_05_11_counts.xls" fin1 <- "new_combined1.out" fin2 <- "new_combined.fasta" fout1 <- "new_combined1_muts.xls" fout2 <- "new_combined1_counts.xls" x <- read.table(fin1) colnames(x) <- unlist(strsplit("qseqid qlen sseqid slen qstart qend sstart send evalue bitscore score length pident nident gapopen gaps sstrand"," ")) x <- x[x$sstart==1,] x <- x[x$gaps == 0,] junk <- paste(x$qseqid,x$sseqid) ind <- tapply(1:nrow(x),junk,function(i) i[which.max(x$score[i])]) x <- x[ind,] ref <- read_fasta("ref_dnaa.fa") gis <- read_fasta(fin2) ss <- unique(x$sseqid) qq <- unique(x$qseqid) tab <- list() for (s in ss) { ref_seq <- unlist(strsplit(ref[s],"")) if (s == bad_s) ref_seq <- c(ref_seq[1:27],ref_seq[-(1:26)]) rp <- matrix(ref_seq,nrow=3) rp <- apply(rp,2, function(x) paste0(x,collapse="")) mm <- match(rp,cod[,1]) ref_prot <- cod[mm,2] ref_prot <- ref_prot[-length(ref_prot)] y <- x[x$sseqid == s,] mat <- matrix("-",nrow=nrow(y),ncol = length(ref_prot)) n <- 0 for (i in 1:nrow(y)) { if (y$pident[i] == 100 & y$length[i] == y$slen[i]) { n <- n+1 mat[n,] <- "I" next } seq <- unlist(strsplit(gis[y$qseqid[i]],"")) seq <- seq[y$qstart[i]:y$qend[i]] if (s == bad_s) seq <- c(seq[1:27],seq[-(1:26)]) if (length(seq) > 3*length(ref_prot)) seq <- seq[1:(3*length(ref_prot))] k <- 3*floor(length(seq)/3) seq <- seq[1:k] rp <- matrix(seq,nrow=3) rp <- apply(rp,2, function(x) paste0(x,collapse="")) mm <- match(rp,cod[,1]) prot <- cod[mm,2] if (length(grep("STOP",prot)) != 0) next n <- n+1 bad <- which(is.na(prot)) prot[bad] <- "-" good <- which(prot == ref_prot[1:length(prot)]) mat[n,good] <- "I" d <- which(prot != ref_prot[1:length(prot)] & prot != "-") mat[n,d] <- paste0(ref_prot[d],d,prot[d]) } tab[[s]] <- apply(mat[1:n,],2,function(x) table(x)) } for (s in names(tab)) { res <- character() for (r in tab[[s]]) if (length(r) > 2) { ind <- which(! (names(r) %in% c("-","I"))) z <- paste(names(r)[ind],r[ind],sep=",") res <- c(res,z) } assign(s,as.data.frame(matrix(res,ncol=1))) } WriteXLS(names(tab),ExcelFileName=fout1,SheetNames=names(tab),row.names=FALSE,col.names=FALSE,AdjWidth=TRUE) for (s in names(tab)) { res <- sapply(tab[[s]],function(x) sum(x[names(x) != "-"])) assign(s,as.data.frame(matrix(res,ncol=1))) } WriteXLS(names(tab),ExcelFileName=fout2,SheetNames=names(tab),row.names=TRUE,col.names=FALSE,AdjWidth=TRUE)