/*-
 * Copyright (c) 2018 Robert Clausecker. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/* puzzledist.c -- compute the number of puzzles with the given distance */

#ifdef __SSE__
# include <immintrin.h>
#endif

#include <assert.h>
#include <stdalign.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>

/*
 * One configuration of a 24 puzzle.  A 24 puzzle configuration
 * comprises 24 tiles labelled 1 to 24 arranged in a 5x5 grid with one
 * spot remaining empty.  The goal of the puzzle is to arrange the
 * tiles like on the left:
 *
 *     []  1  2  3  4       1  2  3  4  5
 *      5  6  7  8  9       6  7  8  9 10
 *     10 11 12 13 14      11 12 13 14 15
 *     15 16 17 18 19      16 17 18 19 20
 *     20 21 22 23 24      21 22 23 24 []
 *
 * Note that this arrangement is different from the traditional
 * arrangement on the right.  It is however isomorphic to the
 * traditional tile arrangement by changing coordinates and tile
 * numbers.
 *
 * To simplify the algorithms we want to run on them, puzzle
 * configurations are stored in two ways: First, the position of
 * each tile is stored in tiles[], then, the tile on each grid
 * position is stored in grid[] with 0 indicating the empty spot.
 * If viewed as permutations of { 0, ..., 24 }, tiles[] and grid[]
 * are inverse to each other at any given time.
 */
enum { TILE_COUNT = 25, ZERO_TILE = 0 };

struct puzzle {
	alignas(8) unsigned char tiles[TILE_COUNT], grid[TILE_COUNT];
};

/*
 * A solved puzzle configuration.
 */
static struct puzzle solved_puzzle = {
	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 },
	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 },
};

/*
 * List of all possible moves for a given position of the empty square.
 * There are up to four moves from every square, if there are less, the
 * remainder is filled up with -1.
 */
static const signed char movetab[TILE_COUNT][4] = {
	 1,  5, -1, -1,
	 0,  2,  6, -1,
	 1,  3,  7, -1,
	 2,  4,  8, -1,
	 3,  9, -1, -1,

	 0,  6, 10, -1,
	 1,  5,  7, 11,
	 2,  6,  8, 12,
	 3,  7,  9, 13,
	 4,  8, 14, -1,

	 5, 11, 15, -1,
	 6, 10, 12, 16,
	 7, 11, 13, 17,
	 8, 12, 14, 18,
	 9, 13, 19, -1,

	10, 16, 20, -1,
	11, 15, 17, 21,
	12, 16, 18, 22,
	13, 17, 19, 23,
	14, 18, 24, -1,

	15, 21, -1, -1,
	16, 20, 22, -1,
	17, 21, 23, -1,
	18, 22, 24, -1,
	19, 23, -1, -1,
};

/*
 * Return the location of the zero tile in p.
 */
static inline size_t
zero_location(const struct puzzle *p)
{
	return (p->tiles[ZERO_TILE]);
}

/*
 * Move the empty square to dloc, modifying p.  It is not tested whether
 * dest is adjacent to the empty square's current location.  Furtermore,
 * this function assumes 0 <= dloc < 25.
 */
static inline void
move(struct puzzle *p, size_t dloc)
{
	size_t zloc, dtile;

	dtile = p->grid[dloc];
	zloc = zero_location(p);

	p->grid[dloc] = ZERO_TILE;
	p->grid[zloc] = dtile;

	p->tiles[dtile] = zloc;
	p->tiles[ZERO_TILE] = dloc;
}

/*
 * Return the number of moves when the empty square is at z.  It is
 * assumed that 0 <= z < 25.
 */
static inline size_t
move_count(size_t z)
{
	/*
	 * 0xefffee is 01110 11111 11111 11111 01110,
	 * 0x07e9c0 is 00000 01110 01110 01110 00000,
	 * i.e. everything but the corners and everything but the border.
	 */
	return (2 + ((0xefffee & 1 << z) != 0) + ((0x0739c0 & 1 << z) != 0));
}

/*
 * Return the possible moves from square z.  Up to four moves are
 * possible, the exact number can be found using move_count(z).  If
 * less than four moves are possible, the last one or two entries are
 * marked with -1.  It is assumed that 0 <= z < 25.
 */
static inline const signed char *
get_moves(size_t z)
{
	return (movetab[z]);
}

/*
 * To save storage, we store puzzles using a compact representation with
 * five bits per entry, not storing the position of the zero tile.
 * Additionally, four bits are used to store all moves that lead back to
 * the previous generation.  This leads to 24 * 5 + 4 = 124 bits of
 * storage being required in total, split into two 64 bit quantities.
 * lo and hi store 12 tiles @ 5 bits each, lo additionally stores 4 move
 * mask bits in its least significant bits.
 */
struct compact_puzzle {
	unsigned long long lo, hi;
};

#define MOVE_MASK 0xfull

/*
 * An array of struct compact_puzzle with the given length and capacity.
 */
struct cp_slice {
	struct compact_puzzle *data;
	size_t len, cap;
};

/*
 * Translate a struct puzzle into a struct compact_puzzle.
 */
static void
pack_puzzle(struct compact_puzzle *restrict cp, const struct puzzle *restrict p)
{
#ifdef __BMI2__
	/* pext is available iff pdep is available */
	unsigned long long scratch, data;

	data = *(const unsigned long long*)&p->tiles[1];
	scratch = _pext_u64(data, 0x1f1f1f1f1f1f1f1full) << 4;
	data = *(unsigned*)&p->tiles[9];
	scratch |= (unsigned long long)_pext_u32(data, 0x1f1f1f1fu) << 4 + 8 * 5;

	cp->lo = scratch;

	data = *(const unsigned long long*)&p->tiles[13];
	scratch = _pext_u64(data, 0x1f1f1f1f1f1f1f1full);
	data = *(unsigned*)&p->tiles[21];
	scratch |= (unsigned long long)_pext_u32(data, 0x1f1f1f1fu) << 8 * 5;

	cp->hi = scratch;
#else /* !defined(__BMI2__) */
	size_t i;

	cp->lo = 0;
	cp->hi = 0;

	for (i = 1; i <= 12; i++)
		cp->lo |= (unsigned long long)p->tiles[i] << 5 * (i - 1) + 4;

	for (; i < TILE_COUNT; i++)
		cp->hi |= (unsigned long long)p->tiles[i] << 5 * (i - 13);
#endif /* defined(__BMI2__) */
}

/*
 * Pack p into cp, masking out the move that leads to dest.
 */
static void
pack_puzzle_masked(struct compact_puzzle *restrict cp, const struct puzzle *restrict p,
    int dest)
{
	size_t i, n_moves;
	int zloc;
	const signed char *moves;

	pack_puzzle(cp, p);
	zloc = zero_location(p);
	n_moves = move_count(zloc);
	moves = get_moves(zloc);

	for (i = 0; i < n_moves; i++)
		if (moves[i] == dest)
			cp->lo |= 1 << i;
}

/*
 * Translate a struct compact_puzzle back into a struct puzzle.
 */
static void
unpack_puzzle(struct puzzle *restrict p, const struct compact_puzzle *restrict cp)
{
#ifdef __BMI2__
	size_t i;
	unsigned long long data;

	memset(p, 0, sizeof *p);

	data = _pdep_u64(cp->lo >> 4, 0x1f1f1f1f1f1f1f1full);
	*(unsigned long long *)&p->tiles[1] = data;
	data = _pdep_u32(cp->lo >> 4 + 8 * 5, 0x1f1f1f1fu);
	*(unsigned *)&p->tiles[9] = data;

	data = _pdep_u64(cp->hi, 0x1f1f1f1f1f1f1f1full);
	*(unsigned long long *)&p->tiles[13] = data;
	data = _pdep_u32(cp->hi >> 8 * 5, 0x1f1f1f1fu);
	*(unsigned *)&p->tiles[21] = data;

	for (i = 1; i < TILE_COUNT; i++)
		p->grid[p->tiles[i]] = i;
#else /* !defined(__BMI2__) */
	size_t i;
	unsigned long long accum;

	memset(p, 0, sizeof *p);

	accum = cp->lo >> 4;
	for (i = 1; i <= 12; i++) {
		p->tiles[i] = accum & 31;
		p->grid[accum & 31] = i;
		accum >>= 5;
	}

	accum = cp->hi;
	for (; i < TILE_COUNT; i++) {
		p->tiles[i] = accum & 31;
		p->grid[accum & 31] = i;
		accum >>= 5;
	}
#endif /* defined(__BMI2__) */

	/* find the location of the zero tile in grid and set p->tiles[0] */
#ifdef __AVX2__
	__m256i grid, gridmask;

	grid = _mm256_loadu_si256((const __m256i*)&p->grid);
	gridmask = _mm256_cmpeq_epi8(grid, _mm256_setzero_si256());
	p->tiles[0] = __builtin_ctz(_mm256_movemask_epi8(gridmask));
#elif defined(__SSE2__)
	__m128i gridlo, gridhi, gridmasklo, gridmaskhi, zero;

	gridlo = _mm_loadu_si128((const __m128i*)&p->grid + 0);
	gridhi = _mm_loadu_si128((const __m128i*)&p->grid + 1);

	zero = _mm_setzero_si128();

	gridmasklo = _mm_cmpeq_epi8(gridlo, zero);
	gridmaskhi = _mm_cmpeq_epi8(gridhi, zero);

	p->tiles[0] = __builtin_ctz(_mm_movemask_epi8(gridmaskhi) << 16 | _mm_movemask_epi8(gridmasklo));
#else /* !defined(__AVX2__) && !defined(__SSE2__) */
	p->tiles[0] = strlen(p->grid);
#endif
}

/*
 * Append cp to slice cps and resize if required.
 */
static void
cps_append(struct cp_slice *cps, const struct compact_puzzle *cp)
{
	struct compact_puzzle *newdata;
	size_t newcap;

	if (cps->len >= cps->cap) {
		newcap = cps->cap < 64 ? 64 : cps->cap * 13 / 8;
		newdata = realloc(cps->data, newcap * sizeof *cps->data);
		if (newdata == NULL) {
			perror("realloc");
			exit(EXIT_FAILURE);
		}

		cps->data = newdata;
		cps->cap = newcap;
	}

	cps->data[cps->len++] = *cp;
}

/*
 * Initialize the content of cps to an empty slice.
 */
static void
cps_init(struct cp_slice *cps)
{
	cps->data = NULL;
	cps->len = 0;
	cps->cap = 0;
}

/*
 * Release all storage associated with cps.  The content of cps is
 * undefined afterwards.
 */
static void
cps_free(struct cp_slice *cps)
{
	free(cps->data);
}

/*
 * Perform all unmasked moves from cp and add them to cps.
 */
static void
expand(struct cp_slice *cps, const struct compact_puzzle *cp)
{
	struct puzzle p;
	struct compact_puzzle ncp;
	size_t i, n_move;
	int movemask = cp->lo & MOVE_MASK, zloc;
	const signed char *moves;

	unpack_puzzle(&p, cp);
	zloc = zero_location(&p);
	n_move = move_count(zloc);
	moves = get_moves(zloc);

	for (i = 0; i < n_move; i++) {
		if (movemask & 1 << i)
			continue;

		move(&p, moves[i]);
		pack_puzzle_masked(&ncp, &p, zloc);
		move(&p, zloc);

		cps_append(cps, &ncp);
	}
}

/*
 * Given a sorted struct cp_slice cps, coalesce identical puzzles,
 * oring their move masks.  Since the move mask bits are the least
 * significatn bits in lo, puzzles differing only by their move
 * masks end up next to each other.
 */
static void
coalesce(struct cp_slice *cps)
{
	struct compact_puzzle *a, *b, *newdata;
	size_t i, j;

	if (cps->len == 0)
		return;

	/* invariant: i < j */
	for (i = 0, j = 1; j < cps->len; j++) {
		a = &cps->data[i];
		b = &cps->data[j];

		/* are a and b equal, ignoring the move mask? */
		if (a->hi == b->hi && ((a->lo ^ b->lo) & ~MOVE_MASK) == 0)
			a->lo |= b->lo;
		else
			cps->data[++i] = *b;
	}

	cps->len = i + 1;

	/* conserve storage */
	newdata = realloc(cps->data, cps->len * sizeof *cps->data);
	if (newdata != NULL) {
		cps->data = newdata;
		cps->cap = cps->len;
	}
}

/*
 * Compare two struct compact_puzzle in a manner suitable for qsort.
 */
static int
compare_cp(const void *a_arg, const void *b_arg)
{
	const struct compact_puzzle *a = a_arg, *b = b_arg;

	if (a->hi != b->hi)
		return ((a->hi > b->hi) - (a->hi < b->hi));
	else
		return ((a->lo > b->lo) - (a->lo < b->lo));
}

/*
 * Expand vertices in cps and store them in new_cps.  Then sort and
 * coalesce new_cps.
 */
static void
do_round(struct cp_slice *restrict new_cps, const struct cp_slice *restrict cps)
{
	size_t i;

	for (i = 0; i < cps->len; i++)
		expand(new_cps, &cps->data[i]);

	qsort(new_cps->data, new_cps->len, sizeof *new_cps->data, compare_cp);
	coalesce(new_cps);
}

extern int
main()
{
	struct cp_slice old_cps, new_cps;
	struct compact_puzzle cp;

	cps_init(&new_cps);
	pack_puzzle(&cp, &solved_puzzle);
	cps_append(&new_cps, &cp);

	for (;;) {
		printf("%zu\n", new_cps.len);
		fflush(stdout);
		old_cps = new_cps;
		cps_init(&new_cps);
		do_round(&new_cps, &old_cps);
		cps_free(&old_cps);
	}
}