Improving FPS

The mixmode example would be fine for testing. Failing that (I know not everyone has all of the examples), this is what I’ve started with: PokittOS.zip (423.2 KB)

This includes the scanline fix I mentioned in the previous comment and moves scanlineIndex outside the y loop.

Possibly an array out of bounds error.

What does it do on an actual Pokitto?

Half the reason I use PokittoIO is so I don’t have to download the 20-odd examples.

I get an undefined reference to Pokitto::Display::scanType,
which means either scanType is something you’ve added locally or it’s in PokittoLib and not PokittoIO.


I also get a list of warnings:
Problems

The “array subscript is above array bounds” warnings are the most serious ones.

It’s in PokittoLib. It was added a while back with the mixmode example.

This seems to be working perfectly, a little faster than the original.
Do you think moving the palette shifting to the loadpalette functions would make any difference? I don’t see a need to do that every single frame.

// Forward declaration of the mixed-mode refresh worker (defined further down).
// palette8bpp holds raw 16-bit colours that the worker shifts left by 3 itself;
// palette4bpp and palette2bpp hold 32-bit values the caller has already shifted.
void lcdRefreshMixModeImplementation(const uint8_t screenBuffer[], const uint8_t scanTypes[], const uint16_t palette8bpp[], const uint32_t (&palette4bpp)[16], const uint32_t (&palette2bpp)[4]);

/**
 * Refreshes the LCD in mixed mode from a single combined 16-bit palette.
 *
 * The combined palette is read as 256 8bpp entries, followed by 16 4bpp
 * entries, followed by 4 2bpp entries. The 4bpp and 2bpp entries are copied
 * into local 32-bit tables, shifted left by 3, so that the per-pixel decode in
 * lcdRefreshMixModeImplementation can use them without shifting.
 *
 * @param screenBuffer packed pixel data for the whole screen
 * @param palette      combined palette (8bpp, then 4bpp, then 2bpp entries)
 * @param scanTypes    per-line-pair decode mode, forwarded unchanged
 */
void Pokitto::lcdRefreshMixMode(const uint8_t * screenBuffer, const uint16_t * palette, const uint8_t * scanTypes)
{
	constexpr size_t entries8bpp = (1 << 8);
	constexpr size_t entries4bpp = (1 << 4);
	constexpr size_t entries2bpp = (1 << 2);

	// Section start offsets inside the combined palette.
	constexpr size_t start4bpp = entries8bpp;
	constexpr size_t start2bpp = start4bpp + entries4bpp;

	uint32_t shifted4bpp[entries4bpp];
	uint32_t shifted2bpp[entries2bpp];

	for(size_t i = 0; i < entries4bpp; ++i)
	{
		shifted4bpp[i] = static_cast<uint32_t>(palette[start4bpp + i]) << 3;
	}

	for(size_t i = 0; i < entries2bpp; ++i)
	{
		shifted2bpp[i] = static_cast<uint32_t>(palette[start2bpp + i]) << 3;
	}

	// The 8bpp section starts at offset 0, so the combined palette pointer doubles as the 8bpp palette.
	lcdRefreshMixModeImplementation(screenBuffer, scanTypes, palette, shifted4bpp, shifted2bpp);
}

/**
 * Streams one frame (176 scanlines of 220 pixels) to the LCD in mixed mode.
 *
 * Every pair of scanlines (index y >> 1) has an entry in scanTypes that selects
 * how the screen buffer is decoded for that pair:
 *   0 - 8bpp: 110 bytes, each pixel written twice horizontally. The read pointer
 *       is repositioned to 110 * (y >> 1) on every line, so both lines of the
 *       pair show the same data.
 *   1 - 4bpp: 55 bytes per line, each nibble written twice horizontally.
 *   2 - 2bpp: 55 bytes per line, four pixels per byte.
 * For modes 1 and 2 the read pointer is NOT repositioned; decoding continues
 * from wherever the previous line stopped.
 * Any other value decodes nothing, so the previously decoded line (all zeros
 * if nothing has been decoded yet) is sent again.
 *
 * @param screenBuffer packed pixel data
 * @param scanTypes    decode mode per scanline pair (indexed 0..87)
 * @param palette8bpp  raw 16-bit colours; shifted left by 3 during decode
 * @param palette4bpp  16 colours, already shifted by the caller
 * @param palette2bpp  4 colours, already shifted by the caller
 */
void lcdRefreshMixModeImplementation(const uint8_t screenBuffer[], const uint8_t scanTypes[], const uint16_t palette8bpp[], const uint32_t (&palette4bpp)[16], const uint32_t (&palette2bpp)[4])
{
	constexpr size_t screenWidth = 220;
	constexpr size_t screenHeight = 176;
	constexpr size_t bytesPerLinePair = 110;

	write_command(0x03);
	write_data(0x1038);

	// Horizontal DRAM Address
	write_command(0x20);
	write_data(0);

	// Vertical DRAM Address
	write_command(0x21);
	write_data(0);

	// write data to DRAM
	write_command(0x22);
	CLR_CS_SET_CD_RD_WR;
	SET_MASK_P2;

	// One extra element beyond the visible width: the write loop pre-fetches
	// scanline[index + 1] after every pixel, so its final pre-fetch touches
	// scanline[220]. Without the padding that is a read past the end of the
	// array. Zero-initialising also gives a defined value to the padding and
	// to the line sent when the first scan type is not 0, 1 or 2.
	uint32_t scanline[screenWidth + 1] = {};

	// Read position in the screen buffer
	const uint8_t * d = screenBuffer;

	for(size_t y = 0; y < screenHeight; ++y)
	{
		// Decode the colours of one scanline
		const uint8_t scanTypeIndex = (y >> 1);
		uint16_t scanlineIndex = 0;
		switch(scanTypes[scanTypeIndex])
		{
			case 0: // 8bpp
			{
				// Point to the beginning of the line pair in data
				d = &screenBuffer[bytesPerLinePair * scanTypeIndex];

				#define STEP() \
					{ \
						const uint32_t color = static_cast<uint32_t>(palette8bpp[*d++]) << 3; \
						scanline[scanlineIndex++] = color; \
						scanline[scanlineIndex++] = color; \
					}

				// 11 x 10 steps = 110 bytes = 220 pixels
				for(uint8_t x = 0; x < 11; ++x)
				{
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
				}

				#undef STEP

				break;
			}
			case 1: // 4bpp
			{
				// Each nibble is looked up once and written twice.
				#define STEP() \
					{ \
						const uint8_t value = *d++; \
						const uint32_t high = palette4bpp[((value >> 4) & 0x0F)]; \
						const uint32_t low = palette4bpp[((value >> 0) & 0x0F)]; \
						scanline[scanlineIndex++] = high; \
						scanline[scanlineIndex++] = high; \
						scanline[scanlineIndex++] = low; \
						scanline[scanlineIndex++] = low; \
					}

				// 11 x 5 steps = 55 bytes = 220 pixels
				for(uint8_t x = 0; x < 11; ++x)
				{
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
				}

				#undef STEP

				break;
			}
			case 2: // 2bpp
			{
				#define STEP() \
					{ \
						const uint8_t value = *d++; \
						scanline[scanlineIndex++] = palette2bpp[((value >> 6) & 0x03)]; \
						scanline[scanlineIndex++] = palette2bpp[((value >> 4) & 0x03)]; \
						scanline[scanlineIndex++] = palette2bpp[((value >> 2) & 0x03)]; \
						scanline[scanlineIndex++] = palette2bpp[((value >> 0) & 0x03)]; \
					}

				// 11 x 5 steps = 55 bytes = 220 pixels
				for(uint8_t x = 0; x < 11; ++x)
				{
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
				}

				#undef STEP

				break;
			}
		}

		// Sends the current colour, then fetches the next one inside the WR toggle.
		#define MIXMODE_PUSH_PIXEL() \
			*LCD = color; \
			TGL_WR_OP(color = scanline[++writeIndex]);

		#define MIXMODE_PUSH_PIXEL_2() \
			MIXMODE_PUSH_PIXEL(); \
			MIXMODE_PUSH_PIXEL();

		#define MIXMODE_PUSH_PIXEL_10() \
			MIXMODE_PUSH_PIXEL_2(); \
			MIXMODE_PUSH_PIXEL_2(); \
			MIXMODE_PUSH_PIXEL_2(); \
			MIXMODE_PUSH_PIXEL_2(); \
			MIXMODE_PUSH_PIXEL_2();

		#define MIXMODE_PUSH_PIXEL_20() \
			MIXMODE_PUSH_PIXEL_10(); \
			MIXMODE_PUSH_PIXEL_10();

		{
			// NOTE(review): 0xA0002188 is presumably the masked pin register of the
			// GPIO port that carries the LCD data bus - confirm against the MCU manual.
			volatile uint32_t * LCD = reinterpret_cast< volatile uint32_t * >(0xA0002188);
			uint32_t color = scanline[0];
			size_t writeIndex = 0;

			// 11 x 20 = 220 pixels, fully unrolled
			MIXMODE_PUSH_PIXEL_20();
			MIXMODE_PUSH_PIXEL_20();
			MIXMODE_PUSH_PIXEL_20();
			MIXMODE_PUSH_PIXEL_20();
			MIXMODE_PUSH_PIXEL_20();
			MIXMODE_PUSH_PIXEL_20();
			MIXMODE_PUSH_PIXEL_20();
			MIXMODE_PUSH_PIXEL_20();
			MIXMODE_PUSH_PIXEL_20();
			MIXMODE_PUSH_PIXEL_20();
			MIXMODE_PUSH_PIXEL_20();
		}

		#undef MIXMODE_PUSH_PIXEL_20
		#undef MIXMODE_PUSH_PIXEL_10
		#undef MIXMODE_PUSH_PIXEL_2
		#undef MIXMODE_PUSH_PIXEL
	}

	CLR_MASK_P2;
}

I added this to PokittoDisplay.h:

// Combined palette storage for mixed mode, holding colours already shifted left by 3.
// Layout used by the loaders: the 8bpp palette starts at index 0, the 4bpp palette
// at index 256 and the 2bpp palette at index 272 (256 + 16 + 4 = 276 entries).
static uint32_t mixPalette[276];

then load my palette pre-shifted:

// Loads the 8bpp palette into mixPalette[0..255], storing each colour shifted
// left by 3, and points mixpaletteptr at the combined table.
// p is expected to hold at least 256 entries.
void load8bitPalette(const uint16_t* p) {
    // Exclusive upper bound: index 256 is the first 4bpp slot, and reading
    // p[256] would run past a 256-entry source palette.
    for (int i=0; i<256; i++) game.display.mixPalette[i] = p[i]<<3;
    game.display.mixpaletteptr = game.display.mixPalette;
}

// Copies the 16 colours of a 4bpp palette into mixPalette[256..271], each
// shifted left by 3, and points mixpaletteptr at the combined table.
void load4bitPalette(const uint16_t* p) {
    constexpr int firstSlot = 256;
    constexpr int colourCount = 16;

    for (int slot = 0; slot != colourCount; ++slot) {
        game.display.mixPalette[firstSlot + slot] = p[slot] << 3;
    }

    game.display.mixpaletteptr = game.display.mixPalette;
}

// Copies the 4 colours of a 2bpp palette into mixPalette[272..275], each
// shifted left by 3, and points mixpaletteptr at the combined table.
void load2bitPalette(const uint16_t* p) {
    constexpr int firstSlot = 272;
    constexpr int colourCount = 4;

    for (int slot = 0; slot != colourCount; ++slot) {
        game.display.mixPalette[firstSlot + slot] = p[slot] << 3;
    }

    game.display.mixpaletteptr = game.display.mixPalette;
}

and remove the palette shifting from the display update…

/**
 * Streams one frame (176 scanlines of 220 pixels) to the LCD in mixed mode,
 * using a combined palette whose entries are written to the LCD as-is (no
 * shifting is done here).
 *
 * The combined palette is indexed as: 8bpp colours from index 0, 4bpp colours
 * from index 256, 2bpp colours from index 272.
 *
 * Every pair of scanlines (index y >> 1) has an entry in scanTypes that selects
 * how the screen buffer is decoded for that pair:
 *   0 - 8bpp: 110 bytes, each pixel written twice horizontally. The read pointer
 *       is repositioned to 110 * (y >> 1) on every line, so both lines of the
 *       pair show the same data.
 *   1 - 4bpp: 55 bytes per line, each nibble written twice horizontally.
 *   2 - 2bpp: 55 bytes per line, four pixels per byte.
 * For modes 1 and 2 the read pointer is NOT repositioned; decoding continues
 * from wherever the previous line stopped.
 * Any other value decodes nothing, so the previously decoded line (all zeros
 * if nothing has been decoded yet) is sent again.
 *
 * @param screenBuffer packed pixel data
 * @param palette8bpp  combined palette (8bpp + 4bpp + 2bpp sections)
 * @param scanTypes    decode mode per scanline pair (indexed 0..87)
 */
void Pokitto::lcdRefreshMixMode(const uint8_t * screenBuffer, const uint32_t * palette8bpp, const uint8_t * scanTypes)
{
	constexpr size_t screenWidth = 220;
	constexpr size_t screenHeight = 176;
	constexpr size_t bytesPerLinePair = 110;

	// Section start offsets inside the combined palette.
	constexpr size_t palette4bppOffset = 256;
	constexpr size_t palette2bppOffset = 272;

	write_command(0x03);
	write_data(0x1038);

	// Horizontal DRAM Address
	write_command(0x20);
	write_data(0);

	// Vertical DRAM Address
	write_command(0x21);
	write_data(0);

	// write data to DRAM
	write_command(0x22);
	CLR_CS_SET_CD_RD_WR;
	SET_MASK_P2;

	// One extra element beyond the visible width: the write loop pre-fetches
	// scanline[index + 1] after every pixel, so its final pre-fetch touches
	// scanline[220]. Without the padding that is a read past the end of the
	// array. Zero-initialising also gives a defined value to the padding and
	// to the line sent when the first scan type is not 0, 1 or 2.
	uint32_t scanline[screenWidth + 1] = {};

	// Read position in the screen buffer
	const uint8_t * d = screenBuffer;

	for(size_t y = 0; y < screenHeight; ++y)
	{
		// Decode the colours of one scanline
		const uint8_t scanTypeIndex = (y >> 1);
		uint16_t scanlineIndex = 0;
		switch(scanTypes[scanTypeIndex])
		{
			case 0: // 8bpp
			{
				// Point to the beginning of the line pair in data
				d = &screenBuffer[bytesPerLinePair * scanTypeIndex];

				#define STEP() \
					{ \
						const uint32_t color = palette8bpp[*d++]; \
						scanline[scanlineIndex++] = color; \
						scanline[scanlineIndex++] = color; \
					}

				#define STEP11() \
					{ \
						STEP(); STEP(); STEP(); STEP(); STEP(); STEP(); STEP(); STEP(); STEP(); STEP(); STEP(); \
					}

				// 10 x 11 steps = 110 bytes = 220 pixels, fully unrolled
				STEP11(); STEP11(); STEP11(); STEP11(); STEP11();
				STEP11(); STEP11(); STEP11(); STEP11(); STEP11();

				#undef STEP11
				#undef STEP

				break;
			}
			case 1: // 4bpp
			{
				// Each nibble is looked up once and written twice.
				#define STEP() \
					{ \
						const uint8_t value = *d++; \
						const uint32_t high = palette8bpp[((value >> 4) & 0x0F) + palette4bppOffset]; \
						const uint32_t low = palette8bpp[((value >> 0) & 0x0F) + palette4bppOffset]; \
						scanline[scanlineIndex++] = high; \
						scanline[scanlineIndex++] = high; \
						scanline[scanlineIndex++] = low; \
						scanline[scanlineIndex++] = low; \
					}

				#define STEP11() \
					{ \
						STEP(); STEP(); STEP(); STEP(); STEP(); STEP(); STEP(); STEP(); STEP(); STEP(); STEP(); \
					}

				// 5 x 11 steps = 55 bytes = 220 pixels, fully unrolled
				STEP11(); STEP11(); STEP11(); STEP11(); STEP11();

				#undef STEP11
				#undef STEP

				break;
			}
			case 2: // 2bpp
			{
				#define STEP() \
					{ \
						const uint8_t value = *d++; \
						scanline[scanlineIndex++] = palette8bpp[((value >> 6) & 0x03) + palette2bppOffset]; \
						scanline[scanlineIndex++] = palette8bpp[((value >> 4) & 0x03) + palette2bppOffset]; \
						scanline[scanlineIndex++] = palette8bpp[((value >> 2) & 0x03) + palette2bppOffset]; \
						scanline[scanlineIndex++] = palette8bpp[((value >> 0) & 0x03) + palette2bppOffset]; \
					}

				#define STEP11() \
					{ \
						STEP(); STEP(); STEP(); STEP(); STEP(); STEP(); STEP(); STEP(); STEP(); STEP(); STEP(); \
					}

				// 5 x 11 steps = 55 bytes = 220 pixels, fully unrolled
				STEP11(); STEP11(); STEP11(); STEP11(); STEP11();

				#undef STEP11
				#undef STEP

				break;
			}
		}

		// Sends the current colour, then fetches the next one inside the WR toggle.
		#define MIXMODE_PUSH_PIXEL() \
			*LCD = color; \
			TGL_WR_OP(color = scanline[++writeIndex]);

		#define MIXMODE_PUSH_PIXEL_2() \
			MIXMODE_PUSH_PIXEL(); \
			MIXMODE_PUSH_PIXEL();

		#define MIXMODE_PUSH_PIXEL_10() \
			MIXMODE_PUSH_PIXEL_2(); \
			MIXMODE_PUSH_PIXEL_2(); \
			MIXMODE_PUSH_PIXEL_2(); \
			MIXMODE_PUSH_PIXEL_2(); \
			MIXMODE_PUSH_PIXEL_2();

		#define MIXMODE_PUSH_PIXEL_20() \
			MIXMODE_PUSH_PIXEL_10(); \
			MIXMODE_PUSH_PIXEL_10();

		{
			// NOTE(review): 0xA0002188 is presumably the masked pin register of the
			// GPIO port that carries the LCD data bus - confirm against the MCU manual.
			volatile uint32_t * LCD = reinterpret_cast< volatile uint32_t * >(0xA0002188);
			uint32_t color = scanline[0];
			size_t writeIndex = 0;

			// 11 x 20 = 220 pixels, fully unrolled
			MIXMODE_PUSH_PIXEL_20();
			MIXMODE_PUSH_PIXEL_20();
			MIXMODE_PUSH_PIXEL_20();
			MIXMODE_PUSH_PIXEL_20();
			MIXMODE_PUSH_PIXEL_20();
			MIXMODE_PUSH_PIXEL_20();
			MIXMODE_PUSH_PIXEL_20();
			MIXMODE_PUSH_PIXEL_20();
			MIXMODE_PUSH_PIXEL_20();
			MIXMODE_PUSH_PIXEL_20();
			MIXMODE_PUSH_PIXEL_20();
		}

		#undef MIXMODE_PUSH_PIXEL_20
		#undef MIXMODE_PUSH_PIXEL_10
		#undef MIXMODE_PUSH_PIXEL_2
		#undef MIXMODE_PUSH_PIXEL
	}

	CLR_MASK_P2;
}

I can’t help thinking that perhaps displaying more than one line at a time might be faster, especially as I’m splitting the screen into 88 lines for mode switching. However my attempts have resulted in garbled data.
Does anyone think it would help?