##illustrative scripts for Helix lucorum IEG identification## ##Chuan Xu & Qian Li## ##transcriptome assembly## ##shell envir## ##config_file max_rd_len=200 avg_ins=300 asm_flags=1 rd_len_cutoff=200 SOAPdenovo-Trans-127mer all -K 33 -s config_file -p 8 -o snail ##assembly quality measure## ##shell envir## transrate --assembly assembled_contig_helix_lucorum.fa --reference aplysia.fa --threads 8 --output $PATH/TO/OUT/FOLDER ##mapping and quantification## ##shell envir## rsem-prepare-reference --bowtie -p 10 assembled_contig_helix_lucorum.fa reference_index rsem-calculate-expression -p 10 --sort-bam-by-coordinate $PATH/TO/SAMPLE_FASTQ reference_index $OUT_SAMPLE_NAME ##normalization by wrapper script from Trinity## ##input multiple *.isoforms.results from RSEM## ##output count matrix and normalized count matrix## ##shell envir## $TRINITY_HOME/util/abundance_estimates_to_matrix.pl --est_method RSEM --gene_trans_map none --cross_sample_norm TMM $SAMPLE1.isoforms.results $SAMPLE2.isoforms.results ##filter and MDS## ##input count matrix from abundance_estimates_to_matrix.pl## ##output MDS coordinates## ##R envir## E1_count <- E1_count[rowSums(E1_count)>=10,] E2_count <- E2_count[rowSums(E2_count)>=10,] mds1 <- cmdscale(as.dist(1-cor(E1_count,method='spe')),2) mds2 <- cmdscale(as.dist(1-cor(E2_count,method='spe')),2) ##DE analysis## ##input count matrix from abundance_estimates_to_matrix.pl## ##output edgeR exact test result## ##R envir## dge1 <- DGEList(counts = E1_count, group = condition1) dge2 <- DGEList(counts = E2_count, group = condition2) dge1 <- calcNormFactors(dge1) dge2 <- calcNormFactors(dge2) DE1 <- exactTest(dge1,pair=c('active','control')) DE2 <- exactTest(dge2,pair=c('active','control')) ##protein homology among five species## ##input protein sequences of each species for database formatting## ##output aligning result between pairwise species## ##shell envir## makeblastdb -dbtype prot -parse_seqids -in species.fasta -out species blastp -query species.fa -evalue 1e-5 -num_threads 10 -db species -out out.blastp ##contig aligning to proteins ##shell envir## makeblastdb -dbtype prot -parse_seqids -in species.fasta -out species blastx -query assembled_contig_helix_lucorum.fa -evalue 1e-5 -num_threads 10 -db species -out out.blastp ##contig annotation## ##R envir## #annotation is based on protein sequences from 5 species sps<-c('aplysia','fugu','biom','menogaster','elegans') #load blastx output of contig alignment to protein sequences load('blastx_output_from_species_aplysia') ## contig2gene_exp is a list with names being contigs and elements being aligned proteins res<-as.data.frame(cbind(names(contig2gene_exp),unlist(contig2gene_exp)),stringsAsFactors=F) colnames(res)<-c('contig',sps[1]) for(sp in sps[2:5]){ load('blastx_output_from_the_other_species') ## contig2gene_exp is a list with names being contigs and elements being aligned proteins sres<-as.data.frame(cbind(names(contig2gene_exp),unlist(contig2gene_exp)),stringsAsFactors=F) colnames(sres)<-c('contig',sp) res<-merge(res,sres,by='contig',all=T) } # res is a dataframe with 6 columns. The first column is contigs and the other five columns are aligned proteins from each of the five species sp_pair<-combn(sps,2) #read blastp output of proteins (from one species) alignment to proteins (from the other species) ##homologous proteins among the five species for(i in 1:ncol(sp_pair)){ tmp<-read.table('species1_align_to_species2.blastp.output',sep='\t',as.is=T) assign(paste0(sp_pair[1,i],'_',sp_pair[2,i],'_orth'),tmp) tmp<-read.table('species2_align_to_species1.blastp.output',sep='\t',as.is=T) assign(paste0(sp_pair[2,i],'_',sp_pair[1,i],'_orth'),tmp) } #function used to count the total number of proteins mapped by expressed contigs protein_num<-function(res){ contigs<-res[!is.na(res[,2]),1] proteins<-unique(res[!is.na(res[,2]),2]) for(i in 2:5){ sp<-sps[i] tmp_con<-res[(! res[,1] %in% contigs) & (!is.na(res[,sp])),1] contigs<-c(contigs,tmp_con) tmp_pro<-unique(res[res[,1] %in% tmp_con,sp]) check<-sapply(tmp_pro,function(x) { for(j in 1:(i-1)){ orth<-get(paste0(sp,'_',sps[j],'_orth')) tmp_orth<-unique(orth[orth[,1]==x,2]) if(sum(tmp_orth %in% proteins)>0){ return(T) } } return(F) }) proteins<-c(proteins,tmp_pro[!check]) } proteins } #total number of proteins mapped by expressed contigs protein_number<-protein_num(res) #function used to check the consistence of contig annotations among the five species consis_func<-function(x){ ind<-which(!is.na(x)) for(i in 1:length(ind)){ check<- T p1<-x[ind[i]] for(j in ind[-i]){ p2<-x[j] orth<-get(paste0(sps[ind[i]],'_',sps[j],'_orth')) ps<-unique(orth[orth[,1]==p1,2]) if(! (p2 %in% ps)) {check<- F; break} } if(check) break } check } #contigs mapped to protein sequences from more than one species mul_exp<-res[apply(res[,2:6],1,function(x) sum(!is.na(x))>1),] #contigs consistently mapped to protein sequences from more than one species consis_exp<-mul_exp[apply(mul_exp[,2:6],1,consis_func),] ##annotation focusing on DE contigs load('DE_contigs.Rdata') ## E1_DE_contigs is DE contigs from E1 and E2_DE_contigs is DE contigs from E2 de_contig<-union(E1_DE_contigs,E2_DE_contigs) de_consis<-consis_exp[consis_exp[,'contig'] %in% de_contig,] de_anno_num<-nrow(de_consis) #function used to summarize proteins mapped by expressed contigs protein_fill<-function(res){ for(k in 2:5){ sp<-sps[k] ind<-which(is.na(res[,2]) & !is.na(res[,sp])) ps<-unique(res[!is.na(res[,2]),2]) for(i in ind){ p<-res[i,sp] for(j in 1:(k-1)){ orth<-get(paste0(sp,'_',sps[j],'_orth')) ptmp<-unique(orth[orth[,1]==p,2]) check<-which(ptmp %in% ps) if(length(check)>0){ res[i,2]<-ptmp[check[1]] break } } if(is.na(res[i,2])){ res[i,2]<-p } } } res } #summarize proteins mapped by expressed contigs consis_fill<-protein_fill(consis_exp) #proteins mapped by DE contigs de_protein<-unique(consis_fill[consis_fill[,'contig'] %in% de_contig,2]) #Non-DE contigs that were mapped to proteins from DE contigs non_de_contig<-consis_fill[(!(consis_fill[,1] %in% de_contig)) & consis_fill[,2] %in% de_protein,1] #read log2 fold change of contigs between activated and control samples from E1 and E2 fd1<-read.table('log2_fold_change_from_E1',head=T,sep='\t',as.is=T) fd2<-read.table('log2_fold_change_from_E2',head=T,sep='\t',as.is=T) fd1<-data.frame(contig=rownames(fd1),fd1= fd1[,1],stringsAsFactors=F) fd2<-data.frame(contig=rownames(fd2),fd2= fd2[,1],stringsAsFactors=F) fd<-merge(fd1,fd2,by='contig',all=T) rownames(fd)<-fd[,1] fd<-fd[,-1] #calculate the number of up-regulated contigs among Non-DE contigs that were mapped to proteins from DE contigs non_de_fd<-fd[non_de_contig,] non_de_up<-sum(apply(non_de_fd,1,function(x) any(x>0,na.rm=T))) #1000 times random sampling to calculate the significance of observed up-regulation of non-DE contigs random_non_de_up<-c() all_st<-setdiff(rownames(fd),de_contig) for(i in 1:1000){ random_contig<-sample(all_st,length(non_de_contig)) random_fd<-fd[random_contig,] random_non_de_up<-c(random_non_de_up,sum(apply(random_fd,1,function(x) any(x>0,na.rm=T)))) } pval<-sum(random_non_de_up>=non_de_up)/1000 ##direction change of proteins from DE contigs #function used to calculate the percentage of contigs with consistent direction change among all contigs mapped to the protein. Here only proteins from DE contigs were considered consis_pro<-function(contig){ p<-consis_fill[match(contig,consis_fill[,'contig']),2] sfd<-fd[contig,] #consider contig direction change under both E1 and E2 experiments two<-sapply(split(sfd,p),function(x){ ss<-apply(x,1,function(y) { if(any(is.na(y))) return(sign(y[!is.na(y)])) else{ if(y[1]*y[2]>0) return(sign(y[1])) else return(2) } }) n1<-sum(ss==1) n2<-sum(ss== -1) if(n1>n2) return(n1/length(ss)) else return(n2/length(ss)) }) #consider contig direction change under either experiment one<-sapply(split(sfd,p),function(x){ if(all(is.na(x[,1]))) p1<-0 else p1<-max(table(sign(x[,1][!is.na(x[,1])])))/nrow(x) if(all(is.na(x[,2]))) p2<-0 else p2<-max(table(sign(x[,2][!is.na(x[,2])])))/nrow(x) max(c(p1,p2)) }) return(cbind(two,one)) } consis_pro_all<-consis_pro(c(de_consis[,1],non_de_contig)) ##random sampling to estimate the significance p<-consis_fill[match(c(de_consis[,1],non_de_contig),consis_fill[,1]),2] #consider contig direction change under both E1 and E2 experiments random_two<-function(cq){ random_contig<-sample(rownames(fd),length(p)) sfd<-fd[random_contig,] two<-sapply(split(sfd,p),function(x){ ss<-apply(x,1,function(y) { if(any(is.na(y))) return(sign(y[!is.na(y)])) else{ if(y[1]*y[2]>0) return(sign(y[1])) else return(2) } }) n1<-sum(ss==1) n2<-sum(ss== -1) if(n1>n2) return(n1/length(ss)) else return(n2/length(ss)) }) two } #consider contig direction change under either experiment random_one<-function(cq){ random_contig<-sample(rownames(fd),length(p)) sfd<-fd[random_contig,] one<-sapply(split(sfd,p),function(x){ if(all(is.na(x[,1]))) p1<-0 else p1<-max(table(sign(x[,1][!is.na(x[,1])])))/nrow(x) if(all(is.na(x[,2]))) p2<-0 else p2<-max(table(sign(x[,2][!is.na(x[,2])])))/nrow(x) max(c(p1,p2)) }) one } random_two_res<-replicate(1000,random_two()) random_one_res<-replicate(1000,random_one()) #calculate the significance of percentage of proteins with >=80% of its all contigs showing consistent direction of change among all the proteins mapped by DE contigs rr_two<-apply(random_two_res,2,function(x) sum(x>=0.8)/nrow(consis_pro_all)) rr_one<-apply(random_one_res,2,function(x) sum(x>=0.8)/nrow(consis_pro_all)) rtwo<-sum(consis_pro_all[,1]>=0.8)/nrow(consis_pro_all) rone<-sum(consis_pro_all[,2]>=0.8)/nrow(consis_pro_all) pval_under_two_experiments<-sum(rr_two>=rtwo)/1000 pval_under_either_experiment<-sum(rr_one>=rone)/1000