##illustrative scripts for Helix lucorum IEG identification##
##Chuan Xu & Qian Li##

##transcriptome assembly##
##shell envir##
##config_file max_rd_len=200 avg_ins=300 asm_flags=1 rd_len_cutoff=200 
SOAPdenovo-Trans-127mer all -K 33 -s config_file -p 8 -o snail 


##assembly quality measure##
##shell envir##
transrate --assembly assembled_contig_helix_lucorum.fa --reference aplysia.fa --threads 8 --output $PATH/TO/OUT/FOLDER

##mapping and quantification##
##shell envir##
rsem-prepare-reference --bowtie -p 10 assembled_contig_helix_lucorum.fa reference_index
rsem-calculate-expression -p 10 --sort-bam-by-coordinate $PATH/TO/SAMPLE_FASTQ reference_index $OUT_SAMPLE_NAME

##normalization by wrapper script from Trinity##
##input multiple *.isoforms.results from RSEM##
##output count matrix and normalized count matrix##
##shell envir##
$TRINITY_HOME/util/abundance_estimates_to_matrix.pl --est_method RSEM --gene_trans_map none --cross_sample_norm TMM $SAMPLE1.isoforms.results $SAMPLE2.isoforms.results

##filter and MDS##
##input count matrix from abundance_estimates_to_matrix.pl##
##output MDS coordinates##
##R envir##
E1_count <- E1_count[rowSums(E1_count)>=10,]
E2_count <- E2_count[rowSums(E2_count)>=10,]
mds1 <- cmdscale(as.dist(1-cor(E1_count,method='spe')),2)
mds2 <- cmdscale(as.dist(1-cor(E2_count,method='spe')),2)

##DE analysis##
##input count matrix from abundance_estimates_to_matrix.pl##
##output edgeR exact test result##
##R envir##
dge1 <- DGEList(counts = E1_count, group = condition1)
dge2 <- DGEList(counts = E2_count, group = condition2)
dge1 <- calcNormFactors(dge1)
dge2 <- calcNormFactors(dge2)
DE1 <- exactTest(dge1,pair=c('active','control'))
DE2 <- exactTest(dge2,pair=c('active','control'))

##protein homology among five species##
##input protein sequences of each species for database formatting##
##output aligning result between pairwise species##
##shell envir##
makeblastdb -dbtype prot -parse_seqids -in species.fasta -out species
blastp -query species.fa -evalue 1e-5 -num_threads 10 -db species -out out.blastp 

##contig aligning to proteins
##shell envir##
makeblastdb -dbtype prot -parse_seqids -in species.fasta -out species
blastx -query assembled_contig_helix_lucorum.fa -evalue 1e-5 -num_threads 10 -db species -out out.blastp

##contig annotation##
##R envir##
#annotation is based on protein sequences from 5 species
sps<-c('aplysia','fugu','biom','menogaster','elegans')
#load blastx output of contig alignment to protein sequences 
load('blastx_output_from_species_aplysia') ## contig2gene_exp is a list with names being contigs and elements being aligned proteins
res<-as.data.frame(cbind(names(contig2gene_exp),unlist(contig2gene_exp)),stringsAsFactors=F)
colnames(res)<-c('contig',sps[1])

for(sp in sps[2:5]){
	load('blastx_output_from_the_other_species') ## contig2gene_exp is a list with names being contigs and elements being aligned proteins
	sres<-as.data.frame(cbind(names(contig2gene_exp),unlist(contig2gene_exp)),stringsAsFactors=F)
	colnames(sres)<-c('contig',sp)
	res<-merge(res,sres,by='contig',all=T)
}
# res is a dataframe with 6 columns. The first column is contigs and the other five columns are aligned proteins from each of the five species

sp_pair<-combn(sps,2)
#read blastp output of proteins (from one species) alignment to proteins (from the other species) ##homologous proteins among the five species
for(i in 1:ncol(sp_pair)){
	tmp<-read.table('species1_align_to_species2.blastp.output',sep='\t',as.is=T)
	assign(paste0(sp_pair[1,i],'_',sp_pair[2,i],'_orth'),tmp)
	tmp<-read.table('species2_align_to_species1.blastp.output',sep='\t',as.is=T)
	assign(paste0(sp_pair[2,i],'_',sp_pair[1,i],'_orth'),tmp)
}

#function used to count the total number of proteins mapped by expressed contigs
protein_num<-function(res){
	contigs<-res[!is.na(res[,2]),1]
	proteins<-unique(res[!is.na(res[,2]),2])

	for(i in 2:5){
		sp<-sps[i]
		tmp_con<-res[(! res[,1] %in% contigs) & (!is.na(res[,sp])),1]
		contigs<-c(contigs,tmp_con)

		tmp_pro<-unique(res[res[,1] %in% tmp_con,sp])
		check<-sapply(tmp_pro,function(x) {
						for(j in 1:(i-1)){
							orth<-get(paste0(sp,'_',sps[j],'_orth'))
							tmp_orth<-unique(orth[orth[,1]==x,2])
							if(sum(tmp_orth %in% proteins)>0){
								return(T)
							}
						}
						return(F)
					})
		proteins<-c(proteins,tmp_pro[!check])
	}
	proteins
}
#total number of proteins mapped by expressed contigs
protein_number<-protein_num(res)

#function used to check the consistence of contig annotations among the five species
consis_func<-function(x){
			ind<-which(!is.na(x))
			for(i in 1:length(ind)){
				check<- T
				p1<-x[ind[i]]
				for(j in ind[-i]){
					p2<-x[j]
					orth<-get(paste0(sps[ind[i]],'_',sps[j],'_orth'))
					ps<-unique(orth[orth[,1]==p1,2])
					if(! (p2 %in% ps)) {check<- F; break}
				}
				if(check) break
			}
			check
}

#contigs mapped to protein sequences from more than one species
mul_exp<-res[apply(res[,2:6],1,function(x) sum(!is.na(x))>1),]
#contigs consistently mapped to protein sequences from more than one species
consis_exp<-mul_exp[apply(mul_exp[,2:6],1,consis_func),]


##annotation focusing on DE contigs
load('DE_contigs.Rdata') ## E1_DE_contigs is DE contigs from E1 and E2_DE_contigs is DE contigs from E2
de_contig<-union(E1_DE_contigs,E2_DE_contigs)
de_consis<-consis_exp[consis_exp[,'contig'] %in% de_contig,]
de_anno_num<-nrow(de_consis)

#function used to summarize proteins mapped by expressed contigs
protein_fill<-function(res){
	for(k in 2:5){
		sp<-sps[k]
		ind<-which(is.na(res[,2]) & !is.na(res[,sp]))
		ps<-unique(res[!is.na(res[,2]),2])
		for(i in ind){
			p<-res[i,sp]
			for(j in 1:(k-1)){
				orth<-get(paste0(sp,'_',sps[j],'_orth'))
				ptmp<-unique(orth[orth[,1]==p,2])
				check<-which(ptmp %in% ps)
				if(length(check)>0){
					res[i,2]<-ptmp[check[1]]
					break
				}
			}
			if(is.na(res[i,2])){
				res[i,2]<-p
			}			
		}
	}
	res
	}

#summarize proteins mapped by expressed contigs
consis_fill<-protein_fill(consis_exp)
#proteins mapped by DE contigs
de_protein<-unique(consis_fill[consis_fill[,'contig'] %in% de_contig,2])

#Non-DE contigs that were mapped to proteins from DE contigs
non_de_contig<-consis_fill[(!(consis_fill[,1] %in% de_contig)) & consis_fill[,2] %in% de_protein,1]

#read log2 fold change of contigs between activated and control samples from E1 and E2
fd1<-read.table('log2_fold_change_from_E1',head=T,sep='\t',as.is=T)
fd2<-read.table('log2_fold_change_from_E2',head=T,sep='\t',as.is=T)
fd1<-data.frame(contig=rownames(fd1),fd1= fd1[,1],stringsAsFactors=F)
fd2<-data.frame(contig=rownames(fd2),fd2= fd2[,1],stringsAsFactors=F)
fd<-merge(fd1,fd2,by='contig',all=T)
rownames(fd)<-fd[,1]
fd<-fd[,-1]

#calculate the number of up-regulated contigs among Non-DE contigs that were mapped to proteins from DE contigs
non_de_fd<-fd[non_de_contig,]
non_de_up<-sum(apply(non_de_fd,1,function(x) any(x>0,na.rm=T)))

#1000 times random sampling to calculate the significance of observed up-regulation of non-DE contigs
random_non_de_up<-c()
all_st<-setdiff(rownames(fd),de_contig)

for(i in 1:1000){
	random_contig<-sample(all_st,length(non_de_contig))
	random_fd<-fd[random_contig,]
	random_non_de_up<-c(random_non_de_up,sum(apply(random_fd,1,function(x) any(x>0,na.rm=T))))
}
pval<-sum(random_non_de_up>=non_de_up)/1000


##direction change of proteins from DE contigs
#function used to calculate the percentage of contigs with consistent direction change among all contigs mapped to the protein. Here only proteins from DE contigs were considered
consis_pro<-function(contig){
	p<-consis_fill[match(contig,consis_fill[,'contig']),2]
	sfd<-fd[contig,]
	#consider contig direction change under both E1 and E2 experiments
	two<-sapply(split(sfd,p),function(x){
					ss<-apply(x,1,function(y) {
							  	if(any(is.na(y))) return(sign(y[!is.na(y)]))
								else{
									if(y[1]*y[2]>0) return(sign(y[1]))
									else return(2)
								}
							})
					n1<-sum(ss==1)
					n2<-sum(ss== -1)
					if(n1>n2) return(n1/length(ss))
					else return(n2/length(ss))
				})
	#consider contig direction change under either experiment 
	one<-sapply(split(sfd,p),function(x){
					if(all(is.na(x[,1]))) p1<-0
					else p1<-max(table(sign(x[,1][!is.na(x[,1])])))/nrow(x)
					if(all(is.na(x[,2]))) p2<-0
					else p2<-max(table(sign(x[,2][!is.na(x[,2])])))/nrow(x)
					max(c(p1,p2))
				})
	return(cbind(two,one))
}

consis_pro_all<-consis_pro(c(de_consis[,1],non_de_contig))

##random sampling to estimate the significance
p<-consis_fill[match(c(de_consis[,1],non_de_contig),consis_fill[,1]),2]
#consider contig direction change under both E1 and E2 experiments
random_two<-function(cq){
	random_contig<-sample(rownames(fd),length(p))
	sfd<-fd[random_contig,]
	two<-sapply(split(sfd,p),function(x){
					ss<-apply(x,1,function(y) {
								if(any(is.na(y))) return(sign(y[!is.na(y)]))
								else{
									if(y[1]*y[2]>0) return(sign(y[1]))
									else return(2)
								}
							})
					n1<-sum(ss==1)
					n2<-sum(ss== -1)
					if(n1>n2) return(n1/length(ss))
					else return(n2/length(ss))	
				})
	two
}
#consider contig direction change under either experiment
random_one<-function(cq){
	random_contig<-sample(rownames(fd),length(p))
	sfd<-fd[random_contig,]
	one<-sapply(split(sfd,p),function(x){
					if(all(is.na(x[,1]))) p1<-0
					else p1<-max(table(sign(x[,1][!is.na(x[,1])])))/nrow(x)
					if(all(is.na(x[,2]))) p2<-0
					else p2<-max(table(sign(x[,2][!is.na(x[,2])])))/nrow(x)
					max(c(p1,p2))
				})
	one
}

random_two_res<-replicate(1000,random_two())
random_one_res<-replicate(1000,random_one())

#calculate the significance of percentage of proteins with >=80% of its all contigs showing consistent direction of change among all the proteins mapped by DE contigs
rr_two<-apply(random_two_res,2,function(x) sum(x>=0.8)/nrow(consis_pro_all))
rr_one<-apply(random_one_res,2,function(x) sum(x>=0.8)/nrow(consis_pro_all))
rtwo<-sum(consis_pro_all[,1]>=0.8)/nrow(consis_pro_all)
rone<-sum(consis_pro_all[,2]>=0.8)/nrow(consis_pro_all)
pval_under_two_experiments<-sum(rr_two>=rtwo)/1000
pval_under_either_experiment<-sum(rr_one>=rone)/1000