#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include "bpe.h"

/*
 * Initialize BPE structure
 */
int8_t bpeInit( struct BPEData* bpe, uint8_t *dictionary, int dictionaryLength )
{
	bpe->dictionary       = dictionary;
	bpe->dictionaryLength = dictionaryLength;
	
	bpe->pairs = (struct BPEPair*)malloc( bpe->dictionaryLength * sizeof(struct BPEPair) );
	if(bpe->pairs == NULL)
	{
		return BPE_ALLOCATION_ERROR;
	}
	bpe->pairCount = 0;
	
	bpe->compressedString = NULL;
	bpe->compressedStringLength = 0;
	
	return BPE_OK;
}

/*
 * Clear BPE structure
 */
void bpeClear( struct BPEData* bpe )
{
	bpe->dictionary       = NULL;
	bpe->dictionaryLength = 0;
	
	if(bpe->pairs != NULL)
	{
		free( bpe->pairs );
		bpe->pairs = NULL;
	}
	bpe->pairCount = 0;
	
	if(bpe->compressedString != NULL)
	{
		free( bpe->compressedString );
		bpe->compressedString = NULL;
	}
	bpe->compressedStringLength = 0;
}

/*
 * Encode string using Byte Pair Encoding
 */
int8_t bpeEncode( const char* source, int sourceSize, uint8_t* ignore, int ignoreLength, struct BPEData* bpe )
{
	int *frequency, *offset;
	int i, j, k, maxFrequency;
	
	frequency = (int*)malloc(2 * sourceSize * sizeof(int));
	if( frequency == NULL )
	{
		return BPE_ALLOCATION_ERROR;
	}
	offset = frequency + sourceSize;
	
	bpe->compressedString = (uint8_t*)malloc(sourceSize * sizeof(uint8_t));
	if(bpe->compressedString == NULL)
	{
		free(frequency);
		return BPE_ALLOCATION_ERROR;
	}
	bpe->compressedStringLength = sourceSize;
	memcpy( bpe->compressedString, source, sourceSize * sizeof(uint8_t) );

	while( bpe->pairCount < bpe->dictionaryLength )
	{
		memset( frequency, 0, 2 * sourceSize * sizeof(int) );
		
		/* update pair frequency */
		for( i=1; i<bpe->compressedStringLength; ++i )
		{
			for( j=0; 
				 (j<ignoreLength) && 
				 ( (bpe->compressedString[i  ] != ignore[j]) && 
				   (bpe->compressedString[i-1] != ignore[j]) );
				 ++j )
			{}
			
			if( j < ignoreLength )
			{
				continue;
			}

			for( j=1; j<i-1; ++j )
			{
				for( k=0; 
					(k<ignoreLength) && 
					( (bpe->compressedString[j  ] != ignore[k]) && 
				      (bpe->compressedString[j-1] != ignore[k]) );
					++k )
				{}
			
				if( k < ignoreLength )
				{
					continue;
				}

				if( ( bpe->compressedString[j-1] == bpe->compressedString[i-1] ) &&
					( bpe->compressedString[j  ] == bpe->compressedString[i  ] ) )
				{
						if( offset[j] != (i-1) )
						{
							++frequency[j];
							offset[i] = offset[j];
							offset[j] = i;
						}
						break;
				}
			}
		}
		
		/* find the most frequent pair */
		j = maxFrequency = 0;
		for( i=1; i<bpe->compressedStringLength; ++i )
		{
			if( frequency[i] > maxFrequency )
			{
				maxFrequency = frequency[i];
				j = i;
			}
		}
		
		/* no valuable pair repetition found */
		if( !maxFrequency )
		{
			break;
		}
		
		/* update pair dictionary */
		bpe->pairs[bpe->pairCount].first  = bpe->compressedString[j-1];
		bpe->pairs[bpe->pairCount].second = bpe->compressedString[j  ];
		
		/* compress string */
		for(i=j; i>0; i=k )
		{
			k = offset[i];
			if(k > i) --k;
			
			bpe->compressedString[i-1] = bpe->dictionary[bpe->pairCount];
			for(j=i; j<bpe->compressedStringLength-1; ++j)
			{
				if(offset[j+1] < i) offset[j] = offset[j+1];
				else  offset[j] = offset[j+1]-1;
				bpe->compressedString[j] = bpe->compressedString[j+1];
			}
			--bpe->compressedStringLength;		
		}
		
		++bpe->pairCount;
	}

	free(frequency);
	return BPE_OK;
}

#ifdef BPE_DEBUG

#define DICTIONARY_LEN 64
#define IGNORE_LEN 2

int main()
{
	//char *str = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus sem elit, tristique id dictum eu, egestas quis leo. Aenean quis tellus eu tellus tristique aliquet. Ut aliquet, ipsum ac dignissim sollicitudin, turpis purus consectetur lorem, nec volutpat enim dui sollicitudin enim. Morbi et diam arcu, facilisis pharetra mauris. Fusce fringilla vehicula venenatis. Donec nibh mauris, commodo quis euismod ac, cursus et metus. In tellus orci, lacinia sit amet suscipit id, lacinia non velit. Ut adipiscing tempus libero, in vehicula metus ullamcorper sit amet. Donec varius risus a sapien vestibulum imperdiet. Vestibulum posuere tortor et risus hendrerit at auctor augue rutrum. Nam suscipit ullamcorper diam, quis ultricies nunc dictum at. Vestibulum aliquam dui tellus. Cras faucibus, urna ac rutrum porttitor, libero felis gravida urna, sagittis bibendum sapien nibh vel augue. Fusce cursus varius imperdiet. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Nullam bibendum lobortis eros eget lacinia.Mauris placerat orci non risus scelerisque dictum. Vivamus cursus scelerisque auctor. Donec iaculis erat sed ipsum consequat consectetur. Quisque aliquam fermentum leo ac volutpat. Aliquam sit amet nisl justo. Donec a ligula in odio pulvinar rhoncus sit amet non sapien. Integer scelerisque, urna eget tincidunt elementum, erat dui suscipit velit, condimentum convallis diam ipsum ac nisl. Vestibulum vitae ipsum in mauris tempus rhoncus ac ut nisi. Nunc dui arcu, convallis sed congue et, feugiat id sapien. Cras vel sapien sit amet diam adipiscing dictum sit amet vitae dolor. Donec at est sed tellus feugiat molestie. Aliquam felis diam, ultrices quis feugiat eget, pulvinar eu nisi.Nunc ut tellus ac tortor blandit vehicula nec eget enim. Nulla vitae magna mauris. Maecenas eu blandit turpis. Etiam commodo, arcu eu euismod scelerisque, purus ligula interdum est, vitae tempus massa enim ac nisl. Duis venenatis pretium odio. Ut ullamcorper nunc eu leo vestibulum in sollicitudin justo bibendum. Donec vestibulum tempor tellus, vel bibendum urna ultricies in. In consectetur fringilla augue, nec venenatis arcu eleifend a. Cras placerat cursus nunc. Vestibulum dapibus metus nec sapien porttitor auctor. In id sapien augue. Nulla venenatis sodales urna, at suscipit erat blandit at. Aenean malesuada justo eros. Proin non lacus sem, non interdum justo. Praesent dignissim molestie enim id euismod. Vestibulum quam ante, laoreet vel varius non, gravida vel augue. Aliquam sit amet turpis turpis, at posuere sapien.Vestibulum vehicula ornare mollis. Pellentesque consectetur, purus at porta sodales, ipsum libero volutpat ante, non dignissim quam nulla eu odio. Praesent sit amet urna nec ligula dapibus varius. Praesent vehicula odio id lacus luctus consequat. Donec tempor erat at dolor interdum vel scelerisque tellus tempus. Aenean in velit ut nisi commodo aliquam eu a massa. Phasellus malesuada diam neque, vitae vehicula erat. Sed ut posuere purus. Maecenas in diam tempus massa elementum aliquam. Pellentesque feugiat odio congue justo fringilla pellentesque. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Curabitur vehicula gravida bibendum.Maecenas vestibulum lobortis purus ut congue. Ut facilisis, lorem sed congue ultrices, enim lectus facilisis enim, et placerat nulla nunc congue arcu. Sed sed tortor risus. Donec semper purus a nunc cursus lacinia. Nullam aliquam risus dui. Vestibulum purus tellus, auctor a viverra at, vestibulum sed eros. Fusce malesuada, neque id auctor elementum, nibh sem molestie elit, sed elementum nisl augue nec sem. Nam augue felis, aliquam nec hendrerit in, vestibulum sed enim. Vivamus fermentum, nunc ut consequat mattis, mauris augue luctus leo, ac hendrerit est ante semper eros. Phasellus auctor adipiscing nunc vitae iaculis. Vivamus blandit odio a magna congue mattis. Aenean ullamcorper, dolor eu tincidunt congue, dui enim porta nulla, vel suscipit mauris odio vel libero. Mauris nec eros est. ";
	char *str = "ABCDEFBECEDFABBEAFDACEBFDABABEDFCFEDCAFBEBDBCABAB";
	
	uint8_t dict[DICTIONARY_LEN], c, ignore[IGNORE_LEN];
	int   len = strlen(str), i, j, k, max;
	int   stack[256], top;
	struct BPEData bpe;

	char *dummy;

	/* allowed chars for pair code range from 'z'+1 to 'z'+65 */
	for(i=0; i<DICTIONARY_LEN; ++i) dict[i] = 'z' + 1 + i;

	/* here we specify the characters that shouldn't be encoded */
	ignore[0] = '@';
	ignore[1] = '/';
	
	/* setup bpe struct */
	bpeInit( &bpe, dict, DICTIONARY_LEN );

	/* encode the Lorem ipsum text */
	bpeEncode( str, len, ignore, IGNORE_LEN, &bpe);

	for(i=0; i<bpe.pairCount; ++i)
	{
		for(j=0; j<IGNORE_LEN; ++j)
		{
			if((bpe.pairs[i].first  == ignore[j]) ||
			   (bpe.pairs[i].second == ignore[j]))
			{
				fprintf(stderr, "ignore failed for %d\n", j);
				break;
			}

		}
	}

	dummy = (char*)malloc(len);
	
	
	max = 0;
	k = 0;
	for(i=0; i<bpe.compressedStringLength; ++i)
	{
		c = bpe.compressedString[i];
		top = 0;
		stack[top++] = c;

		if( top > max ) max = top;

		while(top>0)
		{
			c = stack[--top];
			for(j=0; (j<DICTIONARY_LEN) && (c!=dict[j]); ++j);			
			if(j<DICTIONARY_LEN)
			{
				stack[top++] = bpe.pairs[j].second;
				stack[top++] = bpe.pairs[j].first;

				if( top > max ) max = top;
			}
			else
			{
				if(k >= len)
				{
					fprintf(stderr,"ARG! Generated too much data! %d %d\n", k, len );
					i = bpe.compressedStringLength;
					break;
				}
				dummy[k++] = c;
			}
		}
	}
	
	if( (k = memcmp( dummy, str, len )) != 0 )
	{
		fprintf(stderr, "ARG! Decompressed data doesn't match original!\n");
	}
	
	printf("BPE_COMPRESSED_STRING_LEN=%d\n"
	   "BPE_UNCOMPRESSED_STRING_LEN=%d\n"
	   "BPE_PAIR_COUNT=%d\n"
	   "BPE_STACK=%d ; beta test\n"
	   "ENCODED_CHAR_MIN=$%x\n"
	   "ENCODED_CHAR_MAX=$%x\n",
	   bpe.compressedStringLength, len,
	   bpe.pairCount, max,
	   dict[0], dict[DICTIONARY_LEN-1]);


	printf("\n_bpe_first: \n\t.db ");
	for(i=0; i<bpe.pairCount; ++i)
	{
		printf("$%02x",bpe.pairs[i].first);
		if((i & 7) == 7) printf("\n\t.db ");
		else if( i<(bpe.pairCount-1) ) printf(", ");
	}
	
	printf("\n_bpe_second: \n\t.db ");
	for(i=0; i<bpe.pairCount; ++i)
	{
		printf("$%02x",bpe.pairs[i].second);
		if((i & 7) == 7) printf("\n\t.db ");
		else if( i<(bpe.pairCount-1) ) printf(", ");
	}
	
	printf("\n_bpe_encoded:\n\t.db ");
	for(i=0; i<bpe.compressedStringLength; ++i)
	{
		printf("$%02x",bpe.compressedString[i]);
		if((i & 7) == 7) printf("\n\t.db ");
		else if( i<(bpe.compressedStringLength-1) ) printf(", ");
	}
	
	printf("\n");

	bpeClear( &bpe );
	free(dummy);
	return 0;
}

#endif // BPE_DEBUG
