
/*
 * Coded by : Sael Lee
 * Last Modified: APRIL 05, 2011
 * ADDED : getAtm(*, *);
 * TODO: separate out the DNA  !!!!!!!!
 */

#ifndef PROCESS_PDB__H
#define PROCESS_PDB__H

// SQR(a): square of `a`. The argument is expanded twice, so do not pass an
// expression with side effects. Only defined if no SQR macro exists yet.
#ifndef SQR
#define SQR(a)				((a)*(a))
#endif	

// DIST(...): Euclidean distance between the points (ax,ay,az) and (bx,by,bz).
// Expands to a sqrt() call, so a declaration of sqrt must be visible wherever
// the macro is used. Each argument is expanded twice (through SQR).
#ifndef DIST
#define DIST(ax,ay,az,bx,by,bz)	sqrt(SQR((ax)-(bx))+SQR((ay)-(by))+SQR((az)-(bz)))
#endif																						

// Integer codes naming atom subsets. All five are defined together and the
// whole group is skipped when ALLATOM is already defined.
// NOTE(review): presumably these are the values accepted by the `_type`
// argument of ProcessPDB::getAtm() -- confirm against ProcessPDB.cpp.
#ifndef ALLATOM
#define HETATM 	0
#define	ALLATOM 1
#define	CA			2
#define CACN		3
#define CACNO		4
#endif 

// --- std include ---
#include <vector>
#include <iostream>
#include <fstream>
#include <string>
#include <map>
//#include <cmath>
#include "Common.h"
using namespace std;
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>


// --- local include 

/***********************************************************************
 * pdb file format(http://www.biochem.ucl.ac.uk/~roman/procheck/manual/manappb.html)
 ***********************************************************************
 * Brookhaven PDB file format  
 *  The table below shows the Brookhaven file format for the coordinate records 
 *  (ie ATOM and HETATM) of your PDB file. 
 *  Each record holds the coordinates and other details of a single atom. 
 * ---------------------------------------------------------------------------
 * Field |    Column    | FORTRAN |                                         
 *   No. |     range    | format  | Description                                   
 * ---------------------------------------------------------------------------
 *    1. |    1 -  6    |   A6    | Record ID (eg ATOM, HETATM)       
 *    2. |    7 - 11    |   I5    | Atom serial number                            
 *    -  |   12 - 12    |   1X    | Blank                                         
 *    3. |   13 - 16    |   A4    | Atom name (eg " CA " , " ND1")   
 *    4. |   17 - 17    |   A1    | Alternative location code (if any)            
 *    5. |   18 - 20    |   A3    | Standard 3-letter amino acid code for residue 
 *    -  |   21 - 21    |   1X    | Blank                                         
 *    6. |   22 - 22    |   A1    | Chain identifier code                         
 *    7. |   23 - 26    |   I4    | Residue sequence number                       
 *    8. |   27 - 27    |   A1    | Insertion code (if any)                       
 *    -  |   28 - 30    |   3X    | Blank                                         
 *    9. |   31 - 38    |  F8.3   | Atom's x-coordinate                         
 *   10. |   39 - 46    |  F8.3   | Atom's y-coordinate                         
 *   11. |   47 - 54    |  F8.3   | Atom's z-coordinate                         
 *   12. |   55 - 60    |  F6.2   | Occupancy value for atom                      
 *   13. |   61 - 66    |  F6.2   | B-value (thermal factor)                    
 *    -  |   67 - 67    |   1X    | Blank                                         
 *   14. |   68 - 70    |   I3    | Footnote number
 *   77 - 78        LString(2)    element      Element symbol, right-justified.
 *   79 - 80        LString(2)    charge       Charge  on the atom.
 * ---------------------------------------------------------------------------
 * 
 * Example:-
 *  Four sample records are shown below:- 
 *          1         2         3         4         5         6       0 
 * 12345678901234567890123456789012345678901234567890123456789012345678
 * --------------------------------------------------------------------
 * ATOM   1751  N   GLY C 250      32.286   1.882  43.206  1.00 22.00
 * ATOM   1752  CA  GLY C 250      32.365   1.086  41.969  1.00 21.39
 * ATOM   1753  C   GLY C 250      31.538   1.735  40.864  1.00 20.79
 * ATOM   1754  O   GLY C 250      30.621   2.527  41.152  1.00 21.58
 * 
 * NMR Ensembles
 *  For NMR ensembles the coordinates of each model should be preceded by a MODEL record, 
 *  and terminated by an ENDMDL record. The format of the former is ('MODEL',5X,I4), 
 *  where the I4 holds the model number. For example:- 
 * MODEL        1
 * ^^^^^^^^^^^^^^
 * MODELxxxxxIIII
 * -------------------------------------------------------------------------
 *	
 *	pqr file format (whitespace-separated fields)
 *	
 *  Field_name Atom_number Atom_name Residue_name Residue_number X Y Z Charge Radius
 * 
 * --------------------------------------------------------------------------
 *
 * 													Hydrophobicity Scales
 * 																														Kyte-Doolittle  Hopp-Woods
 * 		Alanine 				Ala 	A 	nonpolar 			neutral 						 1.8						-0.5
 * 		Arginine 				Arg 	R 	polar 				basic (strongly) 	 	-4.5						 3.0
 * 		Asparagine 			Asn 	N 	polar 				neutral 	 				 	-3.5						 0.2	
 * 		Aspartic acid 	Asp 	D 	polar 				acidic 					 		-3.5						 3.0	 
 * 		Cysteine 				Cys 	C 	polar 				neutral							 2.5						-1.0
 * 		Glutamic acid 	Glu 	E 	polar 				acidic 							-3.5						 0.2
 * 		Glutamine 			Gln 	Q 	polar 				neutral 						-3.5						 3.0
 * 		Glycine 				Gly 	G 	nonpolar 			neutral 						-0.4						 0.0
 * 		Histidine 			His 	H 	polar 				basic (weakly) 			-3.2						-0.5
 * 		Isoleucine 			Ile 	I 	nonpolar 			neutral 						 4.5						-1.8
 * 		Leucine 				Leu 	L 	nonpolar 			neutral							 3.8					  -1.8
 * 		Lysine 					Lys 	K 	polar 				basic 							-3.9						 3.0	
 * 		Methionine 			Met 	M 	nonpolar 			neutral							 1.9					  -1.3
 * 		Phenylalanine 	Phe 	F 	nonpolar 			neutral							 2.8						-2.5	
 * 		Proline 				Pro 	P 	nonpolar 			neutral 						-1.6						 0.0
 * 		Serine 					Ser 	S 	polar 				neutral 						-0.8						 0.3
 * 		Threonine 			Thr 	T 	polar 				neutral 						-0.7						-0.4
 * 		Tryptophan 			Trp 	W 	nonpolar 			neutral 						-0.9 						-3.5
 * 		Tyrosine 				Tyr 	Y 	polar 				neutral 						-1.3						-2.3
 *	 	Valine 					Val 	V 	nonpolar 			neutral 						 4.2						-1.5
 *
 * Hopp TP and Woods KR, Prediction of protein antigenic determinants from amino acid sequences. Proc Natl Acad Sci USA 78:3824, 1981.
 * Kyte J and Doolittle RF, A simple method for displaying the hydropathic character of a protein. J Mol Biol 157:105, 1982. 
 **************************************************************************/ 


/**
 * One coordinate record (an ATOM or HETATM line) of a PDB-style file, together
 * with per-atom annotations that come from other inputs (pqr, mol2, XlogP3).
 *
 * The class deliberately has no constructor: every member is indeterminate
 * until init() is called or the fields are assigned one by one.
 */
class ATOMStruct
{
	public:
		//------------------------ PDB format ---------------------------------
		// Column ranges below are 0-based offsets into the record line.
		char		type[7];		//  0 -  5 Record ID (ATOM or HETATM)
		int			aSN;			//  6 - 10 Atom serial number
		char		aName[5];		// 12 - 15 Atom name (eg " CA " , " ND1")
		char		aLC;			// 16 - 16 Alternative location code (if any)
		char		AA[4];			// 17 - 19 Standard 3-letter amino acid code for residue
		char		chain;			// 21 - 21 Chain identifier code
		int			rSN;			// 22 - 25 Residue sequence number
		char		iC;				// 26 - 26 Insertion code (if any)
		float		x;				// 30 - 37 Atom's x-coordinate
		float		y;				// 38 - 45 Atom's y-coordinate
		float		z;				// 46 - 53 Atom's z-coordinate
		float		oV;				// 54 - 59 Occupancy value for atom
		float		bV;				// 60 - 65 B-value (thermal factor)
		char		footN[4];		// 67 - 69 Footnote number
		char		eleT[4];		// 76 - 77 Element type
		char		cha[3];			// 78 - 79 Charge

		//------------------------ additional info -----------------------------
		float		charge;			// 9th element from pqr: charge in electrons
		float		radius;			// 10th element from pqr: atomic radius

		float		KDhydro;		// Kyte-Doolittle hydrophobicity: aa residue based
		float		HWhydro;		// Hopp-Woods hydrophilicity: aa residue based
		float		hbond;			// PLP assignment: donor = +1.0; acceptor = -1.0 for protein; (added by Bingjie Hu, 03/01/2014)
									// donor = -1.0; acceptor = +1.0 for ligand
		float		XlogP;			// Predicted logP value of the atom by XlogP3, used for hydrophobicity in PL-PatchSurfer (W.-H. Shin, 8/25/2014)
		float		arom;			// Aromaticity from XlogP3: 1.0 if the atom is a member of an aromatic ring, otherwise 0.0 (W.-H. Shin, 9/2/2014)

		//------------------------ mol2 format bond information for ligand (by Bingjie Hu) ---
		ATOMStruct	*bond_[6];		// non-owning links to bonded atoms; nothing in this class allocates or frees them
		int			numBonds;
		int			element;
		float		mass;
		int			hyb;

		// ---------------------------------------------------------------------
		// public functions
		// ---------------------------------------------------------------------

		/// Prints the record to stdout in fixed-column PDB layout.
		void ATOMprint(){
			// stdout is never NULL, so this is the same output the FILE* overload produces.
			ATOMprint(stdout);
		};

		/**
		 * Writes the record to pFile in fixed-column PDB layout.
		 * If pFile is NULL nothing is written and a message is sent through perror().
		 */
		void ATOMprint(FILE *pFile)
		{
			if(pFile == NULL)
			{
				perror("Error opeing file");
				return;
			}
			fprintf(pFile, "%-6s%5d %4s%1c%3s %1c%4d%1c   %8.3f%8.3f%8.3f%6.2f%6.2f %3s      %2s%2s\n",
					type,aSN,aName,aLC,AA,chain,rSN,iC,x,y,z,oV,bV,footN,eleT,cha);
		};

		/// Prints the pseudo-PDB form (PDB columns followed by charge, radius, KDhydro, HWhydro) to stdout.
		void ATOMprintV2(){
			ATOMprintV2(stdout);
		};

		/**
		 * Writes the pseudo-PDB form to pFile: the PDB columns up to the footnote,
		 * then charge, radius, KDhydro and HWhydro separated by single spaces.
		 * If pFile is NULL nothing is written and a message is sent through perror().
		 */
		void ATOMprintV2(FILE *pFile)
		{
			if(pFile == NULL)
			{
				perror("Error opeing file");
				return;
			}
			fprintf(pFile, "%-6s%5d %4s%1c%3s %1c%4d%1c   %8.3f%8.3f%8.3f%6.2f%6.2f %3s %f %f %f %f\n",
					type,aSN,aName,aLC,AA,chain,rSN,iC,x,y,z,oV,bV,footN, charge, radius, KDhydro, HWhydro);
		};

		/**
		 * Resets every member: text fields become empty strings, the
		 * single-character codes become blanks, numeric fields become 0
		 * (mass becomes 1.0) and all bond links become NULL.
		 */
		void init()
		{
			// --- PDB record fields ---
			strcpy(type, ""); strcpy(aName,""); strcpy(AA,"");
			strcpy(footN, ""); strcpy(eleT, ""); strcpy(cha,"");
			aSN = 0; rSN = 0;
			aLC = ' '; chain = ' '; iC = ' ';
			x = 0; y = 0; z = 0;
			oV = 0; bV = 0;

			// --- per-atom annotations ---
			charge = 0; radius = 0;
			KDhydro = 0; HWhydro = 0;
			hbond = 0;
			XlogP = 0; arom = 0;	// left indeterminate unless reset here

			// --- mol2 bond information ---
			for(int i = 0; i < 6; ++i)
				bond_[i] = NULL;	// avoid dangling/garbage links after a reset
			numBonds = 0; element = 0; mass = 1.0; hyb = 0;
		};

		/**
		 * Copies the PDB record fields (type through cha) from _from.
		 * The additional annotations (charge, radius, hydrophobicity, hbond,
		 * XlogP, arom) and the mol2 bond data are left untouched.
		 *
		 * @param _from  record to copy from; may be this object itself.
		 * @return always 0.
		 */
		int copyATOMFrom(ATOMStruct &_from)
		{
			// strcpy on overlapping buffers is undefined, so a self-copy is a no-op.
			if(&_from == this) return 0;

			strcpy(type, _from.type);
			strcpy(aName, _from.aName);
			strcpy(AA, _from.AA);
			strcpy(footN, _from.footN);
			strcpy(eleT, _from.eleT);
			strcpy(cha, _from.cha);

			aSN = _from.aSN;
			aLC = _from.aLC;
			chain = _from.chain;
			rSN = _from.rSN;
			iC = _from.iC;

			x = _from.x; y = _from.y; z = _from.z;
			oV = _from.oV;
			bV = _from.bV;
			return 0;
		}


};



/**
 * Lookup tables keyed by upper-case three-letter residue code for the 20
 * standard amino acids:
 *   aa1 - one-letter code
 *   kd  - Kyte-Doolittle hydrophobicity
 *   hw  - Hopp-Woods hydrophilicity
 * The tables are filled by the constructor; they are public and can be
 * modified by callers, and init() restores the 20 standard entries.
 */
class ResidueStruct
{
	public:
		ResidueStruct(){init();};

		std::map<std::string, char>		aa1;
		std::map<std::string, float>	kd;
		std::map<std::string, float>	hw;

		/**
		 * (Re)writes the entries for the 20 standard residues into all three
		 * maps. Existing entries with the same key are overwritten; entries
		 * under other keys are left alone.
		 */
		void init()
		{
			struct Row
			{
				const char	*code;		// three-letter residue code (map key)
				char		letter;		// one-letter code
				float		kdValue;	// Kyte-Doolittle
				float		hwValue;	// Hopp-Woods
			};

			// NOTE(review): the GLU / GLN Hopp-Woods values (0.2 / 3.0) and TRP (-3.5)
			// mirror the table in this file's header comment. The published
			// Hopp & Woods (1981) scale appears to list Glu = 3.0, Gln = 0.2 and
			// Trp = -3.4 -- confirm before changing, because results computed with
			// the current numbers may already be relied upon.
			static const Row rows[] =
			{
				{ "ALA", 'A',  1.8, -0.5 },
				{ "ARG", 'R', -4.5,  3.0 },
				{ "ASN", 'N', -3.5,  0.2 },
				{ "ASP", 'D', -3.5,  3.0 },
				{ "CYS", 'C',  2.5, -1.0 },
				{ "GLU", 'E', -3.5,  0.2 },
				{ "GLN", 'Q', -3.5,  3.0 },
				{ "GLY", 'G', -0.4,  0.0 },
				{ "HIS", 'H', -3.2, -0.5 },
				{ "ILE", 'I',  4.5, -1.8 },
				{ "LEU", 'L',  3.8, -1.8 },
				{ "LYS", 'K', -3.9,  3.0 },
				{ "MET", 'M',  1.9, -1.3 },
				{ "PHE", 'F',  2.8, -2.5 },
				{ "PRO", 'P', -1.6,  0.0 },
				{ "SER", 'S', -0.8,  0.3 },
				{ "THR", 'T', -0.7, -0.4 },
				{ "TRP", 'W', -0.9, -3.5 },
				{ "TYR", 'Y', -1.3, -2.3 },
				{ "VAL", 'V',  4.2, -1.5 }
			};
			const int rowCount = sizeof(rows) / sizeof(rows[0]);

			for(int i = 0; i < rowCount; ++i)
			{
				const std::string key(rows[i].code);
				aa1[key] = rows[i].letter;
				kd[key]  = rows[i].kdValue;
				hw[key]  = rows[i].hwValue;
			}
		};

};



template<class T, class TIn>
class ProcessPDB
{
	private:
		vector<ATOMStruct>		atom_;				// all ATOM 
		vector<ATOMStruct>		hetatm_;			// all HETATM
		vector<ATOMStruct>		dna_;					// DNA residue 
		
		float 								min_[3];
		float									max_[3];
		float 								cog_[3];
		float 								cenAtm_[3];

		vector<char>					AAlst_;				// one letter amino acid list 	
		int 									chainNum_;		// number of chains
		int 									atomNum_;			// number of atoms
		int										bondNum_;			// number of bonds (by Bingjie Hu)
		int 									AANum_;				// number of amino accid
		TIn										cen_[3];					// center of coordinates
		
		ResidueStruct					res_;
	
	public:	
		map<int, ATOMStruct>	atmIDmap_;
		map<int, ATOMStruct>	resIDmap_;
		map<int, ATOMStruct>	hetatmIDmap_;
		
		void makeAAlst();
	
		int ReadPDB(const char* _fname);	
		int ReadPQR(const char* _fname);
		int ReadPQR_detail(const char* _fname);
		vector<ATOMStruct> getATOM(){return atom_;};
		vector<ATOMStruct> getHETATM(){return hetatm_;};
		vector<char> getRes(){return AAlst_;}

		int ATOMsize(){return atom_.size();};
		int HETATMsize(){return hetatm_.size();};
		int size(){return (atom_.size() + hetatm_.size());};
		ATOMStruct getRecord(int _rid);
		int addHydro();	
		int addHB();
		int assign_XlogP_prot();
		//int addHB_ligand();	
	
		//for generating ligand hydrogen bond (by Bingjie Hu)
		
		int ReadMOL2(const char* _fname);
		void AssignLigandProperties();
		int AssignElements();
		int	AssignHybrid();
		int AssignDonorAcceptor();
		int Assign_arom_to_H();
		int Read_XlogP(const char* _fname);
		
		void getMin(float & _minX, float & _minY, float & _minZ)
		{
			_minX = min_[0]; _minY = min_[1]; _minZ = min_[2];
		};
		
		void getMax(float& _maxX, float & _maxY, float & _maxZ)
		{
			_maxX = max_[0]; _maxY = max_[1]; _maxZ = max_[2];
		};
		
		void getCOG(float& _cogX, float & _cogY, float & _cogZ)
		{
			_cogX = cog_[0]; _cogY = cog_[1]; _cogZ = cog_[2];
		};
		
		void getCenAtm(float& _cenX, float & _cenY, float & _cenZ)
		{
			_cenX = cenAtm_[0]; _cenY = cenAtm_[1]; _cenZ = cenAtm_[2];
		};

		int getMinCenMax(vector<float>& _cenX, vector<float>& _cenY,vector<float>& _cenZ);

		int getAtm( vector<ATOMStruct> &_atom, int _type );
	
		void printPDB();
		void printPDB(const char * _pdbFN);
		void printPDB(const char * _pdbFN, vector<ATOMStruct> _atm );
		
		void printPDBetc();
		void printCACN();
		void printCACN(vector<ATOMStruct> _atm );
		void printCACN(const char * _pdbFN, vector<ATOMStruct> _atm );

		int Seperate4LigPlot(const char * _pdbFN);
		void SeperateCACNbyChain(const char* _pdbID);

		int GetClosestChain(const char *pdbFN, vector<ATOMStruct> _atm);	
		int GetClosestHet(const char *pdbFN, vector<ATOMStruct> _atm);	


		int GetCloseRes(const char *pdbFN, vector<ATOMStruct> _atm, double _thres);	
		int GetCloseRes(vector<ATOMStruct> _atm, double _thres);	

		int LigCluster(const char* _pdbID, double _clusT, int _HANum );	

		int	InterfaceSeq(const char * _pdbFN, double _interT); 

		// given raw pdb, with parameters pdbID and chainID
		// output pdb chain and ligand that is bound to the pdbchain in pdbfile format
		int GetCloseLig(const char* _pdbID, const char _chainID, float _percent, int _count);

		int GetFasta( const char* _pdbID, const char _chainID);

	private:
		int searchXYZ(float _x, float _y, float _z, vector<ATOMStruct> _vect);
		
		double atmDist(ATOMStruct _atom1, ATOMStruct _atom2){
			float xx = ((_atom1.x - _atom2.x)* (_atom1.x - _atom2.x));
			float yy = ((_atom1.y - _atom2.y)* (_atom1.y - _atom2.y));
			float zz = ((_atom1.z - _atom2.z)* (_atom1.z - _atom2.z)); 
			return ( sqrt((double)xx + (double)yy + (double)zz) );
		};


};



#include "ProcessPDB.cpp"

#endif


