#include "stdio.h"
#include <algorithm>
#include <ctime>
#include <cstdlib> 

// Wraps a CUDA runtime call: forwards its status to gpuAssert together with the call site.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Launch configuration used for the find kernel: threads per block and number of blocks.
#define THREADS 64
#define BLOCKS 256
// Number of consecutive integers assigned to one find thread: the range [121, 2^32)
// divided by the total thread count THREADS*BLOCKS, rounded up by the trailing +1.
#define _dif (((1ll<<32)-121)/(THREADS*BLOCKS)+1)

// Number of scan passes main() performs; each pass covers HASH_ROW hash buckets.
#define ROUNDS 4
// Number of times main() repeats the whole search for one HASH_SIZE value.
#define ITER 1


// Shorthand for the 64-bit unsigned type used in the modular arithmetic helpers.
typedef unsigned long long ull;

// Reports a failed CUDA runtime call and optionally terminates the program.
//   code  - status returned by the CUDA call being checked
//   file  - source file of the call site (gpuErrchk passes __FILE__)
//   line  - source line of the call site (gpuErrchk passes __LINE__)
//   abort - when true (default) the process exits with `code` as exit status
// `file` is a pointer to const because __FILE__ is a string literal; binding a
// literal to a plain `char *` is not a valid conversion in modern C++.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      // The message goes to stdout (not stderr) so it stays interleaved with the progress output.
      printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

__device__ unsigned int primes[1024]; // table of bases read by prime(): the entry is selected by a hash of the tested number; uploaded from the host array prms in main()
//__device__ unsigned char primes[(1<<28)+1];
__device__ long long n = 1ll<<32; // #unsigned_integers; exclusive upper bound of the range scanned by find()
__device__ ull dev_base; // base currently tested by solve(); advanced by incBase()
__device__ unsigned int dev_hash; // round number; find() only keeps buckets belonging to this round
__device__ unsigned int dev_index; // index of the bucket (row) within the current round that solve() works on

__device__ unsigned int dev_HASH_SIZE; // total number of hash buckets (set from main())
__device__ unsigned int dev_HASH_ROW; // number of buckets handled in one round (set from main())
__device__ unsigned int dev_HASH_COL; // capacity, in elements, of one bucket row in the output buffer (set from main())

// Tabulated hash tables uploaded by main(); the only device-side references to them
// in this file are inside commented-out code.
__device__ unsigned int dev_HASHTABLE_1[65536];
__device__ unsigned int dev_HASHTABLE_2[65536];

time_t curtime; // host-side timestamp of the previous gettime() call




// Maps an integer to a hash bucket in [0, 224) for the inputs used here.
// Two rounds of "xor with the value shifted right by 16, then multiply by
// 0x45d9f3b" are followed by one last xor-shift and a reduction modulo 224.
// (Earlier experiments with other multipliers, moduli and the table-based
// hash built from dev_HASHTABLE_1/2 were removed from this function.)
// NOTE(review): for large inputs the second multiplication can exceed the
// range of long long, so the result relies on the platform's wraparound
// behaviour - confirm this is intended before changing the arithmetic.
__device__ int hashh(long long x) {
    const int mixer = 0x45d9f3b;  // 0x3335b369 was noted as an alternative multiplier
    for (int round = 0; round < 2; ++round) {
        const long long folded = (x >> 16) ^ x;
        x = folded * mixer;
    }
    x ^= (x >> 16);
    return x % 224;
}
// Modular exponentiation: returns (x^e) % n.
// Processes the exponent bit by bit from the least significant end: whenever
// the current bit is set the running result is multiplied by the current
// power of x, and x is squared after every bit.
__device__ ull mulmod(ull x,ull e,ull n) {
    ull result = 1;
    for (; e != 0; e >>= 1) {
        if ((e & 1) != 0) {
            result = (result * x) % n;
        }
        x = (x * x) % n;
    }
    return result;
}

// Strong probable prime test of n to base a.
// n is expected to be ODD. Returns 1 when n passes the test, 0 otherwise.
__device__ int is_SPRP(ull a,ull n) {
    // Factor n-1 as odd_part * 2^twos.
    ull odd_part = n - 1;
    int twos = 0;
    for (; (odd_part & 1) == 0; odd_part >>= 1) {
        ++twos;
    }

    ull x = mulmod(a, odd_part, n);
    if (x == 1) {
        return 1;
    }

    // Look for n-1 among x, x^2, x^4, ... (twos candidates in total).
    int remaining = twos;
    while (remaining-- > 0) {
        if (x == n - 1) {
            return 1;
        }
        x = (x * x) % n;
    }
    return 0;
}


__device__ int prime(long long x) {
    // Choose one of the 1024 tabulated bases from a multiplicative hash of x,
    // then run a single strong-probable-prime test with that base.
    // (A fixed base of 2 was used here in an earlier version.)
    const long long slot = (((long long)0xAFF7B4 * x) >> 7) % 1024;
    const unsigned long long base = (unsigned long long)primes[slot];
    return is_SPRP(base, (unsigned long long)x);
}

/*__device__ int prime(uint x) {
    if(x==2) return 1;
    if(x%2==0) return 0;
    long long pos = x/16;
    long long index = (x&15)>>1;
    return (1<<index)&(~(primes[pos]));
}*/

// Collects the integers that still need a witness base in the current round.
// Every thread walks its own slice of _dif consecutive values starting at 121
// (odd values only, bounded by n) and keeps a value when
//   - it is not divisible by 3, 5 or 7,
//   - its hashh() bucket belongs to the current round (dev_hash), and
//   - prime() does not accept it.
// Kept values are appended to row `bucket` of `out` (row length dev_HASH_COL);
// c[bucket] holds the number of elements reserved in that row so far.
// A thread stops as soon as a reservation would reach dev_HASH_COL; the counter
// is already advanced at that point.



__global__ void find(unsigned int *out,unsigned int *c) {
    // Per-thread staging area: up to 32 pending values for each bucket.
    unsigned int staged[256][32];
    int pending[256];
    for (int row = 0; row < dev_HASH_ROW; ++row) pending[row] = 0;

    long long first = 121 + (threadIdx.x + blockIdx.x * blockDim.x) * _dif;
    const long long last = first + _dif;
    if (first % 2 == 0) ++first;  // only odd candidates are visited

    for (long long v = first; v < last && v < n; v += 2) {
        if (v % 3 == 0 || v % 5 == 0 || v % 7 == 0) continue;

        // Bucket index relative to the first bucket of the current round.
        const int row = hashh(v) - (dev_hash * dev_HASH_ROW);
        if (row < 0 || row >= dev_HASH_ROW) continue;
        if (prime(v)) continue;

        staged[row][pending[row]++] = (unsigned int)v;
        if (pending[row] != 32) continue;

        // The staging row is full: reserve space in the output row and flush it.
        int start = atomicAdd(&c[row], pending[row]);
        if (start + pending[row] >= dev_HASH_COL) {
            return;
        }
        unsigned int *dst = out + row * dev_HASH_COL;
        for (int k = 0; k < pending[row]; ++k) dst[k + start] = staged[row][k];
        pending[row] = 0;
    }

    // Flush whatever is still staged for each bucket.
    for (int row = 0; row < dev_HASH_ROW; ++row) {
        int start = atomicAdd(&c[row], pending[row]);
        if (start + pending[row] >= dev_HASH_COL) return;
        unsigned int *dst = out + row * dev_HASH_COL;
        for (int k = 0; k < pending[row]; ++k) dst[k + start] = staged[row][k];
    }

}
// Checks whether dev_base is a base for which NO element of the current bucket
// (row dev_index of `input`, count[dev_index] elements) is a strong probable prime.
// The caller presets *ans to a non-zero value; any thread that finds an element
// passing is_SPRP() for dev_base stores 0 there, and threads stop early once
// they observe the zero.
__global__ void solve(unsigned int *input, unsigned int *count,unsigned int *ans) {
    const unsigned int total = count[dev_index];
    // Split the row into equally sized slices, one per launched thread.
    const unsigned int slice = total / (blockDim.x * gridDim.x) + 1;
    const unsigned int first = (threadIdx.x + blockIdx.x * blockDim.x) * slice;
    unsigned int last = first + slice;
    if (last > total) last = total;

    const unsigned int *row = input + dev_index * dev_HASH_COL;
    for (unsigned int j = first; j < last && (*ans) != 0; ++j) {
        if (is_SPRP(dev_base, (unsigned long long)row[j]) != 0) {
            // This element is a strong probable prime for dev_base: the base is rejected.
            *ans = 0;
            break;
        }
    }
}

// Advances the base tested by solve() to the next integer.
__global__ void incBase() {
    dev_base += 1;
}

// Moves on to the next round (the next group of dev_HASH_ROW buckets).
__global__ void incHash() {
    dev_hash += 1;
}
// Selects the next bucket within the round; wraps back to 0 after the last one.
__global__ void incIndex() {
    const unsigned int next = dev_index + 1;
    dev_index = next % dev_HASH_ROW;
}

// Debug helper: prints the hash geometry currently stored on the device
// (dev_HASH_ROW, dev_HASH_COL, dev_HASH_SIZE), one value per line.
// The values are unsigned, so they are printed with %u rather than %d.
__global__ void ppp() {
    printf("%u\n%u\n%u\n",dev_HASH_ROW,dev_HASH_COL,dev_HASH_SIZE);
}

// Returns the number of seconds elapsed since the previous call (or since
// curtime was last assigned) and restarts the measurement from now.
int gettime(void) {
    const time_t previous = curtime;
    time(&curtime);
    return curtime - previous;
}


// Host driver.
// For every hash size (currently only 224) and every iteration:
//   1. uploads the hash geometry and freshly generated hash tables,
//   2. runs ROUNDS scans with the find kernel, each filling HASH_ROW buckets,
//   3. for every bucket tries bases 2, 3, ... with the solve kernel until one
//      is found for which no element of the bucket is a strong probable prime,
//   4. writes the bases of all buckets to bases.txt when every bucket succeeded.
// Returns 0; any failing CUDA call terminates the program through gpuErrchk.
int main(void) {
    srand(time(NULL));

    // ---- allocations ----
    // dev_input holds HASH_ROW rows of HASH_COL elements, dev_count one counter
    // per row (at most 256 rows), dev_ans the verdict of one solve launch.
    unsigned int *dev_input,*dev_count,*dev_ans;
    unsigned int ans;
    gpuErrchk(cudaMalloc((void**)&dev_input,sizeof(unsigned int)*260000000));
    gpuErrchk(cudaMalloc((void**)&dev_count,4*256));
    gpuErrchk(cudaMalloc((void**)&dev_ans,4));

    // size_t values must be printed with %zu; %d would misread them on 64-bit hosts.
    size_t free_bytes,total_bytes;
    gpuErrchk(cudaMemGetInfo(&free_bytes,&total_bytes));
    printf("free %zu total %zu\n",free_bytes,total_bytes);

    // Device addresses of the globals that are written directly from the host.
    unsigned int *adr_dev_HASH_SIZE, *adr_dev_HASH_ROW, *adr_dev_HASH_COL;
    unsigned int HASH_ROW,HASH_COL;
    gpuErrchk(cudaGetSymbolAddress((void**)&adr_dev_HASH_SIZE,dev_HASH_SIZE));
    gpuErrchk(cudaGetSymbolAddress((void**)&adr_dev_HASH_ROW,dev_HASH_ROW));
    gpuErrchk(cudaGetSymbolAddress((void**)&adr_dev_HASH_COL,dev_HASH_COL));

    ull *base_adr;
    gpuErrchk(cudaGetSymbolAddress((void**)&base_adr,dev_base));
    unsigned int *hash_adr,*index_adr;
    gpuErrchk(cudaGetSymbolAddress((void**)&hash_adr,dev_hash));
    gpuErrchk(cudaGetSymbolAddress((void**)&index_adr,dev_index));

    /* load the table of bases used by prime() */
    unsigned int prms[] = { 17, 11, 6, 60, 7, 13, 11, 34, 13, 2, 3, 37, 13, 11, 38, 2, 7, 105, 2, 7, 42, 11, 7, 3, 6, 15, 53, 44, 6, 6, 5, 15, 54, 7, 35, 10, 10, 15, 10, 10, 17, 17, 11, 10, 15, 43, 7, 5, 5, 3, 7, 43, 34, 2, 34, 2, 68, 53, 39, 10, 7, 6, 11, 2, 5, 2, 7, 2, 6, 5, 15, 40, 3, 5, 5, 2, 2, 10, 47, 13, 7, 43, 6, 7, 5, 6, 6, 13, 6, 35, 6, 15, 6, 13, 40, 10, 11, 2, 7, 2, 2, 3, 13, 3, 11, 15, 10, 5, 11, 14, 7, 11, 47, 5, 2, 2, 6, 2, 5, 55, 6, 5, 7, 2, 6, 58, 35, 11, 5, 12, 17, 6, 10, 12, 6, 6, 2, 53, 2, 2, 13, 5, 14, 7, 15, 6, 13, 62, 10, 6, 3, 7, 7, 3, 14, 5, 14, 73, 15, 11, 11, 6, 5, 17, 10, 5, 3, 37, 51, 10, 7, 5, 38, 12, 5, 11, 5, 7, 6, 5, 6, 40, 43, 57, 10, 13, 7, 15, 2, 10, 34, 7, 39, 10, 5, 3, 6, 13, 11, 5, 10, 43, 10, 5, 3, 14, 5, 2, 5, 41, 5, 39, 46, 2, 10, 2, 5, 12, 3, 2, 2, 5, 15, 43, 17, 41, 2, 13, 15, 38, 11, 11, 3, 34, 5, 6, 3, 7, 2, 37, 5, 6, 10, 17, 35, 2, 15, 6, 7, 5, 3, 13, 13, 12, 34, 2, 12, 10, 15, 13, 2, 2, 34, 6, 6, 5, 2, 7, 13, 3, 6, 11, 39, 42, 7, 2, 6, 39, 47, 3, 17, 5, 13, 7, 
2, 47, 3, 7, 6, 11, 17, 37, 48, 7, 37, 11, 7, 10, 3, 14, 39, 14, 15, 43, 17, 2, 12, 7, 13, 5, 3, 6, 34, 37, 3, 17, 13, 2, 5, 10, 10, 44, 37, 2, 2, 10, 10, 7, 3, 7, 2, 7, 5, 43, 43, 11, 15, 51, 13, 17, 10, 11, 2, 5, 34, 17, 2, 2, 42, 6, 6, 5, 47, 15, 2, 12, 7, 3, 10, 15, 3, 7, 12, 12, 15, 43, 14, 7, 58, 13, 10, 6, 6, 38, 34, 5, 5, 13, 38, 6, 11, 10, 6, 7, 2, 55, 2, 13, 5, 11, 44, 15, 17, 2, 40, 2, 15, 13, 6, 2, 3, 3, 3, 3, 6, 39, 5, 11, 17, 37, 5, 7, 6, 10, 6, 12, 7, 5, 14, 10, 12, 71, 10, 35, 6, 11, 3, 2, 38, 3, 2, 34, 10, 17, 42, 2, 12, 6, 6, 11, 40, 12, 10, 6, 10, 2, 3, 3, 56, 11, 7, 42, 2, 38, 12, 2, 2, 13, 40, 12, 6, 5, 5, 59, 15, 38, 5, 5, 5, 7, 2, 10, 7, 2, 17, 10, 11, 6, 6, 6, 2, 10, 6, 54, 2, 82, 3, 34, 14, 15, 44, 5, 46, 2, 13, 5, 12, 13, 11, 10, 39, 5, 40, 3, 60, 3, 42, 11, 3, 46, 17, 3, 2, 37, 6, 42, 12, 14, 3, 12, 66, 13, 34, 7, 3, 13, 3, 11, 2, 13, 12, 38, 34, 5, 40, 10, 14, 6, 14, 11, 38, 58, 2, 48, 5, 15, 5, 73, 3, 37, 5, 11, 10, 5, 5, 13, 2, 10, 13, 34, 17, 3, 7, 47, 2, 2, 10, 15, 3, 3, 13, 6,
 34, 13, 10, 13, 3, 6, 41, 10, 6, 2, 6, 2, 6, 2, 6, 6, 37, 10, 44, 35, 13, 51, 2, 7, 53, 5, 40, 5, 2, 37, 11, 15, 11, 13, 2, 5, 2, 6, 10, 17, 15, 43, 39, 17, 2, 12, 10, 15, 17, 7, 13, 3, 7, 15, 37, 5, 15, 7, 6, 10, 51, 2, 2, 40, 61, 2, 13, 13, 11, 2, 5, 34, 5, 5, 7, 2, 2, 2, 11, 3, 6, 13, 6, 17, 11, 10, 7, 46, 15, 7, 14, 35, 11, 7, 10, 6, 11, 40, 11, 2, 39, 7, 6, 66, 5, 3, 6, 5, 11, 10, 2, 10, 7, 13, 2, 45, 34, 6, 35, 2, 11, 5, 59, 75, 10, 17, 14, 17, 17, 17, 2, 11, 7, 10, 6, 11, 6, 56, 34, 35, 11, 14, 12, 41, 40, 17, 40, 3, 11, 7, 37, 14, 7, 13, 7, 5, 2, 10, 6, 39, 2, 7, 37, 35, 10, 5, 15, 2, 7, 38, 34, 11, 17, 5, 6, 10, 3, 6, 7, 7, 43, 14, 2, 43, 3, 2, 47, 7, 35, 7, 3, 53, 2, 10, 10, 10, 60, 10, 6, 2, 6, 10, 5, 7, 57, 53, 13, 3, 35, 38, 15, 42, 3, 3, 12, 2, 10, 3, 38, 54, 13, 10, 11, 7, 13, 7, 2, 12, 39, 10, 54, 2, 12, 38, 10, 12, 12, 5, 15, 6, 10, 13, 5, 15, 10, 13, 6, 41, 40, 14, 12, 10, 11, 40, 5, 11, 10, 2, 5, 2, 13, 6, 2, 13, 5, 2, 10, 15, 5, 5, 10, 34, 13, 2, 5, 14, 5, 6, 5, 13, 3, 43, 6, 13, 11, 50, 
3, 6, 6, 12, 15, 11, 37, 7, 69, 11, 14, 14, 7, 43, 5, 35, 11, 35, 11, 11, 34, 34, 39, 14, 11, 2, 10, 53, 6, 11, 2, 11, 60, 39, 11, 6, 15, 40, 17, 47, 34, 50, 7, 59, 47, 5, 13, 39, 5, 6, 53, 10, 14, 5, 51, 5, 7, 5, 6, 77, 7, 12, 7, 42, 2, 5, 2, 6, 60, 10, 13, 10, 6, 47, 6, 15, 17, 10, 11, 10, 12, 7, 7, 10, 17, 34, 5, 10, 7, 7, 2, 6, 10, 38, 2, 15, 6, 13, 7, 13, 2, 3, 13, 5, 3, 17, 2, 5, 15, 11, 39, 7, 39, 10, 10, 2, 6, 13, 3, 5, 17, 6, 14, 10, 37, 44, 3, 34, 5, 11, 7, 12, 2, 5, 3, 12, 3, 2, 3, 133, 12, 2, 2, 2, 3, 34, 14, 41, 2, 37, 11, 2, 6, 11, 6, 7, 15, 11, 35, 13, 6, 5, 2, 14, 7, 2 };

    // Never read past the end of prms and never write past the 1024-entry device table.
    const size_t primes_bytes = std::min(sizeof(prms),sizeof(unsigned int)*(size_t)1024);
    const cudaError_t primes_status = cudaMemcpyToSymbol(primes,prms,primes_bytes);
    printf("primes_copy: %s\n",cudaGetErrorString(primes_status));
    gpuErrchk(primes_status); // continuing without the table would make every result meaningless

    // Per-hash-size statistics: number of successful iterations and of buffer overflows.
    int answer[100];
    int buffoverflow[100];
    int current_index =0;
    int bb[1024]; // base found for every bucket
    for(int i=0;i<100;++i) {
        answer[i]=0;
        buffoverflow[i]=0;
    }

    printf("all done, iteration=%d, rounds=%d\n",ITER,ROUNDS);

    time_t starttime = time(NULL);
    tm *timeinfo = localtime(&starttime);
    printf("Started: %02d:%02d:%02d\n",timeinfo->tm_hour,timeinfo->tm_min,timeinfo->tm_sec);

    curtime = time(NULL);

    for(unsigned int HASH_SIZE=224;HASH_SIZE<=224;HASH_SIZE+=100) { // 700 - 1000 +50
        /* compute hash_row (buckets per round, rounded up) and hash_col (row capacity) */
        HASH_ROW = (HASH_SIZE/ROUNDS)+(HASH_SIZE%ROUNDS==0?0:1);
        HASH_COL = (1000000000/HASH_SIZE);

        printf("SIZE: %u   ROW: %u   COL: %u\n",HASH_SIZE,HASH_ROW,HASH_COL);

        /* pass info to the device */
        gpuErrchk(cudaMemcpy(adr_dev_HASH_SIZE,&HASH_SIZE,4,cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpy(adr_dev_HASH_COL,&HASH_COL,4,cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpy(adr_dev_HASH_ROW,&HASH_ROW,4,cudaMemcpyHostToDevice));

        for(int iteration=0;iteration<ITER;++iteration) {
            /* generate hash function and copy */
            printf("%d / %d   (%d seconds)\n",iteration,ITER,gettime());
            unsigned int temp[32];
            for(int x=0;x<32;++x) temp[x]=rand()%HASH_SIZE;
            // 2 x 256 KB: kept in static storage instead of on the stack.
            static unsigned int HASHTABLE_1[65536],HASHTABLE_2[65536];
            for(unsigned int x=0;x<65536;++x) {
                HASHTABLE_1[x]=0;
                HASHTABLE_2[x]=0;
            }
            // Entry x is the sum (mod HASH_SIZE) of the random values selected by the set bits of x.
            for(unsigned int x=0;x<65536;++x) {
                for(int y=0;y<16;++y) {
                    if((x&(1<<y))!=0) {
                        HASHTABLE_1[x] = (HASHTABLE_1[x]+temp[y])%HASH_SIZE;
                        HASHTABLE_2[x] = (HASHTABLE_2[x]+temp[y+16])%HASH_SIZE;
                    }
                }
            }
            /* copy hash tables */
            gpuErrchk(cudaMemcpyToSymbol(dev_HASHTABLE_1,HASHTABLE_1,65536*4));
            gpuErrchk(cudaMemcpyToSymbol(dev_HASHTABLE_2,HASHTABLE_2,65536*4));

            /* prepare for finding solution: start at round 0, bucket 0 (all bytes cleared) */
            gpuErrchk(cudaMemset(hash_adr,0,sizeof(unsigned int)));
            gpuErrchk(cudaMemset(index_adr,0,sizeof(unsigned int)));

            /* find solution */
            int global_solution = 1;
            int buf_overflow = 0;
            for(int i=0;i<ROUNDS;++i) { // ROUNDS
                printf("run %d / %d scan\n",i,ROUNDS);
                gpuErrchk(cudaMemset(dev_count,0,4*(HASH_ROW)));
                find<<<BLOCKS,THREADS>>>(dev_input,dev_count);
                gpuErrchk( cudaPeekAtLastError() );
                gpuErrchk( cudaDeviceSynchronize() );

                printf("scan done\n");

                unsigned int count[256];
                gpuErrchk(cudaMemcpy(count,dev_count,4*(HASH_ROW),cudaMemcpyDeviceToHost));
                for(unsigned int j=0;j<HASH_ROW;++j) {
                    if(i*(HASH_ROW)+j==HASH_SIZE) break; // all buckets processed, stop
                    if(count[j]>=HASH_COL) { // the bucket row overflowed
                        global_solution = 0;
                        ++buf_overflow;
                        break;
                    }
                    // Restart the base search at 2, writing all eight bytes of dev_base.
                    const ull first_base = 2;
                    gpuErrchk(cudaMemcpy(base_adr,&first_base,sizeof(first_base),cudaMemcpyHostToDevice));
                    int solution=0;
                    for(int base=2;base<262146;++base) { // <1024
                        // Any non-zero value means "no strong probable prime seen yet".
                        gpuErrchk(cudaMemset(dev_ans,0xFF,4));
                        solve<<<512,128>>>(dev_input,dev_count,dev_ans);
                        gpuErrchk( cudaPeekAtLastError() );
                        gpuErrchk( cudaDeviceSynchronize() );
                        gpuErrchk(cudaMemcpy(&ans,dev_ans,4,cudaMemcpyDeviceToHost));

                        if(ans!=0) {
                            solution=base;
                            break;
                        }
                        incBase<<<1,1>>>();
                        gpuErrchk( cudaPeekAtLastError() );
                        gpuErrchk( cudaDeviceSynchronize() );
                    }
                    bb[i*(HASH_ROW)+j]=solution;

                    printf("%u:   %d      in %d seconds\n",i*HASH_ROW+j,solution,gettime());

                    if(solution==0) {
                        // No base in the searched range works for this bucket.
                        global_solution=0;
                        break;
                    }

                    incIndex<<<1,1>>>();
                    gpuErrchk( cudaPeekAtLastError() );
                    gpuErrchk( cudaDeviceSynchronize() );
                }
                if(buf_overflow==1) {
                    global_solution=0;
                    break;
                }
                if(global_solution==0) break;
                incHash<<<1,1>>>();
                gpuErrchk( cudaPeekAtLastError() );
                gpuErrchk( cudaDeviceSynchronize() );
            }
            answer[current_index]+=global_solution;
            buffoverflow[current_index]+=buf_overflow;

            // print bases: one entry per bucket of the current hash size
            if(global_solution==1) {
                FILE *f = fopen("bases.txt","w");
                if(f==NULL) {
                    perror("bases.txt");
                } else {
                    for(unsigned int k=0;k<HASH_SIZE;++k) fprintf(f,"%d ",bb[k]);
                    fclose(f);
                }
            }

        }
        printf("%u: %d / %d   err: %d    (%d seconds)\n",HASH_SIZE,answer[current_index],ITER,buffoverflow[current_index],gettime());
        ++current_index;
    }

    /* print results */
    for(int i=0;i<current_index;++i) {
        printf("%d / %d   err: %d\n",answer[i],ITER,buffoverflow[i]);
    }

    time_t endtime = time(NULL);
    timeinfo = localtime(&endtime);
    printf("Ended: %02d:%02d:%02d   (total %d seconds)\n",timeinfo->tm_hour,timeinfo->tm_min,timeinfo->tm_sec,(int)(endtime-starttime));

    gpuErrchk(cudaFree(dev_input));
    gpuErrchk(cudaFree(dev_count));
    gpuErrchk(cudaFree(dev_ans));

    return 0;
}
  
  
