Improving FPS

I am using the following to add 220x176x16 to mode13 (yes I know you guys don’t fully approve)

if(subMode==1){ /* 110x176x4bpp */

   // Streams scrbuf to the LCD as 4bpp data: each byte carries two palette
   // indices, high nibble first, and each index is followed by two TGL_WR
   // strobes.
   uint16_t x,y;
   uint8_t *d;

   // NOTE(review): register 0x03 is not explained anywhere in this thread;
   // presumably a display/entry-mode setting - confirm against the datasheet.
   write_command(0x03); write_data(0x1038);
   // 0x20 / 0x21 set the horizontal / vertical DRAM address (both to 0),
   // 0x22 begins writing data to DRAM.
   write_command(0x20); write_data(0);
   write_command(0x21); write_data(0);
   write_command(0x22);
   CLR_CS_SET_CD_RD_WR;
   SET_MASK_P2;

   // Fixed memory-mapped address that the colour values are written to.
   volatile uint32_t *LCD = reinterpret_cast< volatile uint32_t * >(0xA0002188);

   d = scrbuf;// point to beginning of line in data
   for(y=0;y<176;y++){
     // x advances by 2 per source byte, so 55 bytes are consumed per row.
     for(x=0;x<110;){
       *LCD = paletteptr[(*d >> 4) &15]<<3; TGL_WR;TGL_WR; x++;
       *LCD = paletteptr[ *d       &15]<<3; TGL_WR;TGL_WR; x++;
       // The value read by the dereference is discarded; only the pointer
       // increment has an effect here.
       *d++;
     }
   }

} // subMode 1

And it works correctly (although there is noise).

However, the following still does not work for emulating mode1…

if(subMode==2){ /* 220x176x2bpp */

   // Streams scrbuf to the LCD as 2bpp data: each byte carries four palette
   // indices, most significant pair first, and each index is followed by a
   // single TGL_WR strobe.
   uint16_t x,y;
   uint8_t *d;

   // NOTE(review): register 0x03 is not explained anywhere in this thread;
   // presumably a display/entry-mode setting - confirm against the datasheet.
   write_command(0x03); write_data(0x1038);
   // 0x20 / 0x21 set the horizontal / vertical DRAM address (both to 0),
   // 0x22 begins writing data to DRAM.
   write_command(0x20); write_data(0);
   write_command(0x21); write_data(0);
   write_command(0x22);
   CLR_CS_SET_CD_RD_WR;
   SET_MASK_P2;

   // Fixed memory-mapped address that the colour values are written to.
   volatile uint32_t *LCD = reinterpret_cast< volatile uint32_t * >(0xA0002188);

   d = scrbuf;// point to beginning of line in data
   for(y=0;y<176;y++){
     // x advances by 4 per source byte, so 55 bytes are consumed per row.
     for(x=0;x<220;){
       *LCD = paletteptr[(*d >> 6) &3]<<3; TGL_WR; x++;
       *LCD = paletteptr[(*d >> 4) &3]<<3; TGL_WR; x++;
       *LCD = paletteptr[(*d >> 2) &3]<<3; TGL_WR; x++;
       *LCD = paletteptr[ *d       &3]<<3; TGL_WR; x++;
       // The value read by the dereference is discarded; only the pointer
       // increment has an effect here.
       *d++;
     }
   }

} // subMode 2

Can anyone see what it is I’m doing wrong? I’m following exactly the same logic I used for 4bpp, but with 2bpp it seems to be missing a lot of data (about half).

At least I could not spot any errors, except that *d++ should be d++, but that should not cause trouble in this case.

Maybe there is a problem filling the buffer with gfx data? Try to fill the buffer with zeroes or $ff.

Well isn’t that something. You’re right. I never thought to check, drawBitmap checks the size of the image, it’s probably doing less predictable things when loading a lower depth image that says it’s 4 times bigger in 8bit mode…
Filling the buffer with a fixed known value works perfectly.

[edit] so does manually copying the image.

1 Like

Hi. I am not a C expert but I think your code is missing a TGL_WR from every line in the inner loop. That is why half the data is missing.

It turned out that the display loop was OK. The error was in the way I was transferring the image to the buffer before it was displayed.

OK code wizards, is there much that can be done to this to speed it up at all?
At this point I have no clue about improving it.

/**
 * Refreshes the LCD from a mixed-depth screen buffer, one display row at a time.
 *
 * Every pair of display rows (index y>>1) has an entry in scanType that selects
 * how source bytes are decoded into the 220-entry scanline buffer:
 *   0 - one palette index per byte, each colour stored twice (110 bytes per row);
 *   1 - two 4-bit indices per byte, looked up at palette entries 256 and up,
 *       each colour stored twice (55 bytes per row);
 *   2 - four 2-bit indices per byte, looked up at palette entries 272 and up,
 *       each colour stored once (55 bytes per row).
 * The decoded row is then written out to the LCD address with one TGL_WR per pixel.
 *
 * @param scrbuf     source pixel data
 * @param paletteptr colour table; each entry is shifted left by 3 before use
 * @param offset     not referenced anywhere in this function
 * @param scanType   decode mode per pair of rows, indexed by y>>1
 */
void Pokitto::lcdRefreshMixMode(uint8_t * scrbuf, uint16_t* paletteptr, uint8_t offset, uint8_t* scanType){

    uint32_t x,y;
    uint32_t scanline[220]; // decoded colours for the row currently being sent
    uint8_t *d;

    // NOTE(review): register 0x03 is not explained in this thread - confirm
    // its meaning against the LCD controller datasheet.
    write_command(0x03); write_data(0x1038);
    write_command(0x20);  // Horizontal DRAM Address
    write_data(0);  // 0
    write_command(0x21);  // Vertical DRAM Address
    write_data(0);
    write_command(0x22); // write data to DRAM
    CLR_CS_SET_CD_RD_WR;
    SET_MASK_P2;
    volatile uint32_t *LCD = reinterpret_cast< volatile uint32_t * >(0xA0002188);

    d = scrbuf;// point to beginning of line in data
      for(y=0;y<176;y++)
      {

        /** find colours in one scanline **/
        uint8_t st = y>>1; // two consecutive rows share one scanType entry
        uint8_t s=0;       // write position in scanline; the maximum of 220 fits in uint8_t

        if(scanType[st]==0){
            // Mode 0 re-seats d from the row-pair index, so both rows of the
            // pair decode the same 110 source bytes.
            d = scrbuf+(110*st);// point to beginning of line in data
            for(x=0;x<220;){
                uint32_t color = uint32_t(paletteptr[*d++])<<3; scanline[s++]=color; scanline[s++]=color; x+=2;
            }
        }

        // Modes 1 and 2 do not reset d: they keep reading from wherever the
        // previous row left it.
        if(scanType[st]==1){
            for(x=0;x<220;){
                uint8_t t = *d++;
                uint32_t color;
                color = uint32_t(paletteptr[(t>>4)+256])<<3;   scanline[s++]=color; scanline[s++]=color;
                color = uint32_t(paletteptr[(t&0xF)+256])<<3;  scanline[s++]=color; scanline[s++]=color;
                x+=4;
            }
        }
        if(scanType[st]==2){
            for(x=0;x<220;){
                uint8_t t = *d++;
                uint32_t color;
                color = uint32_t(paletteptr[((t >> 6)&3)+272])<<3;   scanline[s++]=color;
                color = uint32_t(paletteptr[((t >> 4)&3)+272])<<3;   scanline[s++]=color;
                color = uint32_t(paletteptr[((t >> 2)&3)+272])<<3;   scanline[s++]=color;
                color = uint32_t(paletteptr[((t     )&3)+272])<<3;   scanline[s++]=color;
                x+=4;
            }
        }


        // Send the decoded row: 11 writes per pass, 20 passes = 220 pixels.
        for (s=0;s<220;) {
          *LCD = (scanline[s]);TGL_WR;s++;
          *LCD = (scanline[s]);TGL_WR;s++;
          *LCD = (scanline[s]);TGL_WR;s++;
          *LCD = (scanline[s]);TGL_WR;s++;
          *LCD = (scanline[s]);TGL_WR;s++;
          *LCD = (scanline[s]);TGL_WR;s++;
          *LCD = (scanline[s]);TGL_WR;s++;
          *LCD = (scanline[s]);TGL_WR;s++;
          *LCD = (scanline[s]);TGL_WR;s++;
          *LCD = (scanline[s]);TGL_WR;s++;
          *LCD = (scanline[s]);TGL_WR;s++;
        }

      }

     CLR_MASK_P2;

}

Remember to put an else with mutually exclusive conditions.
Or better yet, use a switch.

Sometimes the compiler doesn’t realise they’re mutually exclusive or is forced to assume that the value might have changed so it can’t turn the series of mutually exclusive ifs into else ifs.

Also, try to get into the habit of declaring variables as close as possible to where they’re used. It probably won’t speed up your code, but it will make it easier to read and easier to determine the actual scope of the variable.

Sometimes declaring new local variables instead of trying to reuse the same variable can actually cause the compiler to produce better code.


For scan type 3, is it possible to invert the format of t so you can invert the order of the 4 lines?
I’ve got a feeling that if the shifts are increasing in size instead of decreasing then the compiler might be able to optimise how the shifts work, but I could be wrong.


I spent a bit of time trying to make it a bit more readable:

/**
 * Mixed-depth LCD refresh, restructured around a switch with narrowly scoped locals.
 *
 * For each of the 176 display rows the scan type of its row pair (y >> 1)
 * selects how source bytes are expanded into the 220-entry scanline buffer:
 *   0 - one palette index per byte, each colour stored twice;
 *   1 - two 4-bit indices per byte (palette entries 256 and up), each stored twice;
 *   2 - four 2-bit indices per byte (palette entries 272 and up), each stored once.
 * The finished row is then written to the LCD address, one TGL_WR per pixel.
 *
 * @param screenBuffer   source pixel data
 * @param palettePointer colour table; each entry is shifted left by 3 before use
 * @param scanTypes      decode mode per pair of rows
 */
void Pokitto::lcdRefreshMixMode(const uint8_t * screenBuffer, const uint16_t * palettePointer, const uint8_t * scanTypes)
{
	write_command(0x03);
	write_data(0x1038);
	
	// Horizontal DRAM Address
	write_command(0x20);
	write_data(0);
	
	// Vertical DRAM Address
	write_command(0x21);
	write_data(0);
	
	// write data to DRAM
	write_command(0x22);
	CLR_CS_SET_CD_RD_WR;
	SET_MASK_P2;
	
	uint32_t scanline[220];

	// point to beginning of line in data
	const uint8_t * d = screenBuffer; 
	for(uint32_t y = 0; y < 176; ++y)
	{
		// find colours in one scanline
		uint8_t scanTypeIndex = y >> 1;
		uint8_t lineIndex = 0;
		// NOTE(review): the parameter is named scanTypes, but this reads
		// scanType; unless a scanType is declared elsewhere this will not
		// compile as posted.
		switch(scanType[scanTypeIndex])
		{
			case 0:
			{
				// point to beginning of line in data
				d = &screenBuffer[110 * scanTypeIndex];
				for(uint8_t x = 0; x < (220 / 2); ++x)
				{
					uint32_t color = static_cast<uint32_t>(palettePointer[*d]) << 3;
					++d;
					scanline[lineIndex] = color;
					++lineIndex;
					scanline[lineIndex] = color;
					++lineIndex;
				}
				break;
			}
			// Cases 1 and 2 do not reset d: they continue from wherever the
			// previous row stopped reading.
			case 1:
			{
				for(uint8_t x = 0; x < (220 / 4); ++x)
				{
					uint8_t t = *d;
					++d;
					
					uint32_t color1 = static_cast<uint32_t>(palettePointer[256 + (t >> 4)]) << 3;
					scanline[lineIndex] = color1;
					++lineIndex;
					scanline[lineIndex] = color1;
					++lineIndex;
					
					uint32_t color2 = static_cast<uint32_t>(palettePointer[256 + (t & 0xF)]) << 3;
					scanline[lineIndex] = color2;
					++lineIndex;
					scanline[lineIndex] = color2;
					++lineIndex;			
				}
				break;
			}
			case 2:
			{
				for(uint8_t x = 0; x < (220 / 4); ++x)
				{
					uint8_t t = *d;
					++d;
					
					scanline[lineIndex] = static_cast<uint32_t>(palettePointer[272 + ((t >> 6) & 0x03)]) << 3;
					++lineIndex;
					
					scanline[lineIndex] = static_cast<uint32_t>(palettePointer[272 + ((t >> 4) & 0x03)]) << 3;
					++lineIndex;
					
					scanline[lineIndex] = static_cast<uint32_t>(palettePointer[272 + ((t >> 2) & 0x03)]) << 3;
					++lineIndex;
					
					scanline[lineIndex] = static_cast<uint32_t>(palettePointer[272 + ((t >> 0) & 0x03)]) << 3;
					++lineIndex;
				}
				break;
			}
		}
		
		// The macro relies on LCD and i being declared at the point of use below.
		#define WRITE_SCANLINE \
			*LCD = (scanline[i]); \
			TGL_WR; \
			++i;
	
		volatile uint32_t * LCD = reinterpret_cast< volatile uint32_t * >(0xA0002188);
		// 11 writes per pass, 20 passes = 220 pixels; i is advanced inside the macro.
		for (uint8_t i = 0; i < 220;)
		{
			WRITE_SCANLINE			
			WRITE_SCANLINE			
			WRITE_SCANLINE			
			WRITE_SCANLINE			
			WRITE_SCANLINE			
			WRITE_SCANLINE			
			WRITE_SCANLINE			
			WRITE_SCANLINE			
			WRITE_SCANLINE			
			WRITE_SCANLINE			
			WRITE_SCANLINE
		}
		
		#undef WRITE_SCANLINE
	}

	CLR_MASK_P2;
}

Perhaps this will help someone spot a possible way to speed it up.

Have you tried using TGL_WR_OP instead of TGL_WR?
The latter uses a nop to prevent writing to the LCD too fast and ends up slowing down more than is necessary.
The former tries to do something useful instead:
*LCD = color; TGL_WR_OP(color = scanline[++i]);
You just have to load the first value of color before the loop:
uint32_t color = scanline[0];
Technically, doing this is wrong (the last color is read out of bounds and discarded) but it isn’t a problem.

So that gets about 10fps for the demo I posted in the other thread. Seems acceptable for redrawing the whole screen every frame. I’m getting 20+ for my wip game, which uses mostly wide-pixel and a little 4colour, which is OK.

I haven’t looked at this in a while. Is the following as fast as it’s likely to get? Would the ASM tricks for mode1, 13 and 15 help?

/**
 * Mixed-depth LCD refresh with the decode and output loops unrolled by hand.
 *
 * For each of the 176 display rows the scan type of its row pair (y >> 1)
 * selects how source bytes are expanded into the 220-entry scanline buffer:
 *   0 - 8bpp: one palette index per byte, each colour stored twice (110 bytes);
 *   1 - 4bpp: two indices per byte at palette entries 256 and up, each stored
 *       twice (55 bytes);
 *   2 - 2bpp: four indices per byte at palette entries 272 and up, each stored
 *       once (55 bytes).
 * The row is then written to the LCD address using TGL_WR_OP, whose argument
 * loads the next colour (per the discussion above, in place of a nop).
 *
 * @param scrbuf     source pixel data
 * @param paletteptr colour table; each entry is shifted left by 3 before use
 * @param scanType   decode mode per pair of rows, indexed by y >> 1
 */
void Pokitto::lcdRefreshMixMode(const uint8_t * scrbuf, const uint16_t * paletteptr, const uint8_t * scanType)
{
	write_command(0x03);
	write_data(0x1038);

	// Horizontal DRAM Address
	write_command(0x20);
	write_data(0);

	// Vertical DRAM Address
	write_command(0x21);
	write_data(0);

	// write data to DRAM
	write_command(0x22);
	CLR_CS_SET_CD_RD_WR;
	SET_MASK_P2;

	uint32_t scanline[220];

	// point to beginning of line in data
	const uint8_t * d = scrbuf;
	for(uint32_t y = 0; y < 176; ++y)
	{
		// find colours in one scanline
		uint8_t scanTypeIndex = y >> 1;
		uint8_t lineIndex = 0;
		switch(scanType[scanTypeIndex])
		{
			case 0: // 8bpp
			{
				// point to beginning of line in data
				d = &scrbuf[110 * scanTypeIndex];
				uint32_t color;
				// 11 passes x 10 source bytes = 110 bytes, 220 scanline entries.
				for(uint8_t x = 0; x < 11; ++x)
				{
					color = static_cast<uint32_t>(paletteptr[*d]) << 3;
					++d; scanline[lineIndex] = color; ++lineIndex; scanline[lineIndex] = color; ++lineIndex;
					color = static_cast<uint32_t>(paletteptr[*d]) << 3;
					++d; scanline[lineIndex] = color; ++lineIndex; scanline[lineIndex] = color; ++lineIndex;
					color = static_cast<uint32_t>(paletteptr[*d]) << 3;
					++d; scanline[lineIndex] = color; ++lineIndex; scanline[lineIndex] = color; ++lineIndex;
					color = static_cast<uint32_t>(paletteptr[*d]) << 3;
					++d; scanline[lineIndex] = color; ++lineIndex; scanline[lineIndex] = color; ++lineIndex;
					color = static_cast<uint32_t>(paletteptr[*d]) << 3;
					++d; scanline[lineIndex] = color; ++lineIndex; scanline[lineIndex] = color; ++lineIndex;
					color = static_cast<uint32_t>(paletteptr[*d]) << 3;
					++d; scanline[lineIndex] = color; ++lineIndex; scanline[lineIndex] = color; ++lineIndex;
					color = static_cast<uint32_t>(paletteptr[*d]) << 3;
					++d; scanline[lineIndex] = color; ++lineIndex; scanline[lineIndex] = color; ++lineIndex;
					color = static_cast<uint32_t>(paletteptr[*d]) << 3;
					++d; scanline[lineIndex] = color; ++lineIndex; scanline[lineIndex] = color; ++lineIndex;
					color = static_cast<uint32_t>(paletteptr[*d]) << 3;
					++d; scanline[lineIndex] = color; ++lineIndex; scanline[lineIndex] = color; ++lineIndex;
					color = static_cast<uint32_t>(paletteptr[*d]) << 3;
					++d; scanline[lineIndex] = color; ++lineIndex; scanline[lineIndex] = color; ++lineIndex;
				}
				break;
			}
			// Cases 1 and 2 do not reset d: they continue from wherever the
			// previous row stopped reading.
			case 1: // 4bpp
			{
			    uint8_t t;
			    uint32_t color1,color2;
				// 11 passes x 5 source bytes = 55 bytes, 220 scanline entries.
				for(uint8_t x = 0; x < 11; ++x)
				{
					t = *d++;
					color1 = static_cast<uint32_t>(paletteptr[256 + (t >> 4)]) << 3;
					scanline[lineIndex] = color1; ++lineIndex; scanline[lineIndex] = color1; ++lineIndex;
					color2 = static_cast<uint32_t>(paletteptr[256 + (t & 0xF)]) << 3;
					scanline[lineIndex] = color2; ++lineIndex; scanline[lineIndex] = color2; ++lineIndex;
					t = *d++;
					color1 = static_cast<uint32_t>(paletteptr[256 + (t >> 4)]) << 3;
					scanline[lineIndex] = color1; ++lineIndex; scanline[lineIndex] = color1; ++lineIndex;
					color2 = static_cast<uint32_t>(paletteptr[256 + (t & 0xF)]) << 3;
					scanline[lineIndex] = color2; ++lineIndex; scanline[lineIndex] = color2; ++lineIndex;
					t = *d++;
					color1 = static_cast<uint32_t>(paletteptr[256 + (t >> 4)]) << 3;
					scanline[lineIndex] = color1; ++lineIndex; scanline[lineIndex] = color1; ++lineIndex;
					color2 = static_cast<uint32_t>(paletteptr[256 + (t & 0xF)]) << 3;
					scanline[lineIndex] = color2; ++lineIndex; scanline[lineIndex] = color2; ++lineIndex;
					t = *d++;
					color1 = static_cast<uint32_t>(paletteptr[256 + (t >> 4)]) << 3;
					scanline[lineIndex] = color1; ++lineIndex; scanline[lineIndex] = color1; ++lineIndex;
					color2 = static_cast<uint32_t>(paletteptr[256 + (t & 0xF)]) << 3;
					scanline[lineIndex] = color2; ++lineIndex; scanline[lineIndex] = color2; ++lineIndex;
					t = *d++;
					color1 = static_cast<uint32_t>(paletteptr[256 + (t >> 4)]) << 3;
					scanline[lineIndex] = color1; ++lineIndex; scanline[lineIndex] = color1; ++lineIndex;
					color2 = static_cast<uint32_t>(paletteptr[256 + (t & 0xF)]) << 3;
					scanline[lineIndex] = color2; ++lineIndex; scanline[lineIndex] = color2; ++lineIndex;
				}
				break;
			}
			case 2: // 2bpp
			{

				// This initial read does not advance d; t is overwritten at the
				// top of the loop before it is used.
				uint8_t t = *d;
				// 11 passes x 5 source bytes = 55 bytes, 220 scanline entries.
				for(uint8_t x = 0; x < 11; ++x)
				{
					t = *d++;

					scanline[lineIndex] = static_cast<uint32_t>(paletteptr[272 + ((t >> 6) & 0x03)]) << 3;
					++lineIndex;
					scanline[lineIndex] = static_cast<uint32_t>(paletteptr[272 + ((t >> 4) & 0x03)]) << 3;
					++lineIndex;
					scanline[lineIndex] = static_cast<uint32_t>(paletteptr[272 + ((t >> 2) & 0x03)]) << 3;
					++lineIndex;
					scanline[lineIndex] = static_cast<uint32_t>(paletteptr[272 + ((t >> 0) & 0x03)]) << 3;
					++lineIndex;

					t = *d++;
					scanline[lineIndex] = static_cast<uint32_t>(paletteptr[272 + ((t >> 6) & 0x03)]) << 3;
					++lineIndex;
					scanline[lineIndex] = static_cast<uint32_t>(paletteptr[272 + ((t >> 4) & 0x03)]) << 3;
					++lineIndex;
					scanline[lineIndex] = static_cast<uint32_t>(paletteptr[272 + ((t >> 2) & 0x03)]) << 3;
					++lineIndex;
					scanline[lineIndex] = static_cast<uint32_t>(paletteptr[272 + ((t >> 0) & 0x03)]) << 3;
					++lineIndex;

					t = *d++;
					scanline[lineIndex] = static_cast<uint32_t>(paletteptr[272 + ((t >> 6) & 0x03)]) << 3;
					++lineIndex;
					scanline[lineIndex] = static_cast<uint32_t>(paletteptr[272 + ((t >> 4) & 0x03)]) << 3;
					++lineIndex;
					scanline[lineIndex] = static_cast<uint32_t>(paletteptr[272 + ((t >> 2) & 0x03)]) << 3;
					++lineIndex;
					scanline[lineIndex] = static_cast<uint32_t>(paletteptr[272 + ((t >> 0) & 0x03)]) << 3;
					++lineIndex;

					t = *d++;
					scanline[lineIndex] = static_cast<uint32_t>(paletteptr[272 + ((t >> 6) & 0x03)]) << 3;
					++lineIndex;
					scanline[lineIndex] = static_cast<uint32_t>(paletteptr[272 + ((t >> 4) & 0x03)]) << 3;
					++lineIndex;
					scanline[lineIndex] = static_cast<uint32_t>(paletteptr[272 + ((t >> 2) & 0x03)]) << 3;
					++lineIndex;
					scanline[lineIndex] = static_cast<uint32_t>(paletteptr[272 + ((t >> 0) & 0x03)]) << 3;
					++lineIndex;

					t = *d++;
					scanline[lineIndex] = static_cast<uint32_t>(paletteptr[272 + ((t >> 6) & 0x03)]) << 3;
					++lineIndex;
					scanline[lineIndex] = static_cast<uint32_t>(paletteptr[272 + ((t >> 4) & 0x03)]) << 3;
					++lineIndex;
					scanline[lineIndex] = static_cast<uint32_t>(paletteptr[272 + ((t >> 2) & 0x03)]) << 3;
					++lineIndex;
					scanline[lineIndex] = static_cast<uint32_t>(paletteptr[272 + ((t >> 0) & 0x03)]) << 3;
					++lineIndex;

				}

				break;
			}
		}

        // color always holds the next value to send; the macro writes it and
        // then loads the following entry through TGL_WR_OP.
        uint32_t color = scanline[0];
        #define WRITE_SCANLINE *LCD = color; TGL_WR_OP(color = scanline[++i]);

		volatile uint32_t * LCD = reinterpret_cast< volatile uint32_t * >(0xA0002188);
		// The body below contains 220 WRITE_SCANLINE expansions, so i is 220
		// after a single pass and the i < 55 condition ends the loop there.
		// The last expansion indexes scanline[220], one past the end of the
		// array; that value is never written to the LCD.
		for (uint8_t i = 0; i < 55;)
		{
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
            WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE WRITE_SCANLINE
		}

		#undef WRITE_SCANLINE
	}

	CLR_MASK_P2;
}

I’m getting 33fps in the emulator and 25fps on hardware.

This may or may not be faster (but at the very least it’s easier to read):

(Note: completely untested.)

void Pokitto::lcdRefreshMixMode(const uint8_t * screenBuffer, const uint16_t * palette, const uint8_t * scanTypes)
{
	write_command(0x03);
	write_data(0x1038);

	// Horizontal DRAM Address
	write_command(0x20);
	write_data(0);

	// Vertical DRAM Address
	write_command(0x21);
	write_data(0);

	// write data to DRAM
	write_command(0x22);
	CLR_CS_SET_CD_RD_WR;
	SET_MASK_P2;

	uint32_t scanline[220];

	// point to beginning of line in data
	for(size_t y = 0; y < 176; ++y)
	{
		// find colours in one scanline
		const uint8_t scanTypeIndex = (y >> 1);
		switch(scanType[scanTypeIndex])
		{
			case 0: // 8bpp
			{
				// Point to the beginning of the line in data
				const uint8_t * data = &screenBuffer[110 * scanTypeIndex];
				size_t scanlineIndex = 0;
				
				#define STEP() \
					{ \
						uint32_t colour = static_cast<uint32_t>(palette[*data]) << 3; \
						++data; \
						scanline[scanlineIndex] = colour; \
						++scanlineIndex; \
						scanline[scanlineIndex] = colour; \
						++scanlineIndex; \
					}
				
				for(uint8_t x = 0; x < 11; ++x)
				{
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
				}
				break;
				
				#undef STEP
			}
			case 1: // 4bpp
			{
				const uint16_t * palette4bpp = &palette[256];
				size_t screenBufferIndex = 0;
				size_t scanlineIndex = 0;
				
				#define STEP() \
					{ \
						const uint8_t value = screenBuffer[screenBufferIndex]; \
						++screenBufferIndex; \
						\
						const size_t paletteIndex1 = ((value >> 4) & 0x0F); \
						const uint32_t colour1 = static_cast<uint32_t>(palette4bpp[paletteIndex1]) << 3; \
						scanline[scanlineIndex] = colour1; \
						++scanlineIndex; \
						scanline[scanlineIndex] = colour1; \
						++scanlineIndex; \
						\
						const size_t paletteIndex2 = ((value >> 0) & 0x0F); \
						const uint32_t colour2 = static_cast<uint32_t>(palette4bpp[paletteIndex2]) << 3; \
						scanline[scanlineIndex] = colour2; \
						++scanlineIndex; \
						scanline[scanlineIndex] = colour2; \
						++scanlineIndex; \
					}
				
				for(uint8_t x = 0; x < 11; ++x)
				{
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
				}
				break;
				
				#undef STEP
			}
			case 2: // 2bpp
			{
				const uint16_t * palette2bpp = &palette[272];
				uint8_t scanlineIndex = 0;
				
				#define STEP() \				
					const uint8_t value = screenBuffer[screenBufferIndex]; \
					++screenBufferIndex; \
					\
					const size_t paletteIndex0 = ((value >> 6) & 0x03); \
					const size_t paletteIndex1 = ((value >> 4) & 0x03); \
					const size_t paletteIndex2 = ((value >> 2) & 0x03); \
					const size_t paletteIndex3 = ((value >> 0) & 0x03); \
					\
					scanline[scanlineIndex] = (static_cast<uint32_t>(palette2bpp[paletteIndex0]) << 3); \
					++scanlineIndex; \
					scanline[scanlineIndex] = (static_cast<uint32_t>(palette2bpp[paletteIndex1]) << 3); \
					++scanlineIndex; \
					scanline[scanlineIndex] = (static_cast<uint32_t>(palette2bpp[paletteIndex2]) << 3); \
					++scanlineIndex; \
					scanline[scanlineIndex] = (static_cast<uint32_t>(palette2bpp[paletteIndex3]) << 3); \
					++scanlineIndex;

				for(uint8_t x = 0; x < 11; ++x)
				{
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
				}
				
				#undef STEP

				break;
			}
		}

        #define WRITE_SCANLINE() \
			*LCD = colour; \
			colour = scanline[scanlineIndex] \
			++scanlineIndex; \
			TGL_WR_OP(colour);
		
		#define WRITE_SCANLINE_2 \
			WRITE_SCANLINE \
			WRITE_SCANLINE
		
		#define WRITE_SCANLINE_10 \
			WRITE_SCANLINE_2 \
			WRITE_SCANLINE_2 \
			WRITE_SCANLINE_2 \
			WRITE_SCANLINE_2 \
			WRITE_SCANLINE_2
		
		#define WRITE_SCANLINE_20 \
			WRITE_SCANLINE_10 \
			WRITE_SCANLINE_10

		{
			volatile uint32_t * LCD = reinterpret_cast<volatile uint32_t *>(0xA0002188);
			uint32_t colour = scanline[0];
			size_t scanlineIndex = 0;

			WRITE_SCANLINE_20
			WRITE_SCANLINE_20
			WRITE_SCANLINE_20
			WRITE_SCANLINE_20
			WRITE_SCANLINE_20
			WRITE_SCANLINE_20
			WRITE_SCANLINE_20
			WRITE_SCANLINE_20
			WRITE_SCANLINE_20
			WRITE_SCANLINE_20
			WRITE_SCANLINE_20
		}

		#undef WRITE_SCANLINE
	}

	CLR_MASK_P2;
}

I don’t know enough about manipulating the screen to know if there’s something that can be done with the screen manipulation stuff, but I can think of one obvious optimisation.

Instead of having this code full of << 3, you could just store the palette data already shifted.

I notice that there’s over 256 colours in the palette, so the size increase wouldn’t work for the 8bpp mode, but it could be done easily enough with the smaller palettes:


Edit:
Actually, I keep forgetting the Pokitto has more RAM than I’m used to working with.
It might be possible to do the 8bpp palette colours too.
It would mean another 1024 bytes on the stack though.
I’m too lazy (and too busy) to do the calculations to check if it’s viable.


(Again, completely untested)

void lcdRefreshMixModeImplementation(const uint8_t * screenBuffer, const uint8_t * scanTypes, const uint16_t * palette8bpp, const uint32_t * palette4bpp, const uint32_t * palette2bpp);

/**
 * Builds pre-shifted copies of the 4bpp and 2bpp palettes and hands them to
 * lcdRefreshMixModeImplementation, so that the << 3 is applied once per
 * palette entry here rather than once per decoded pixel.
 *
 * The 8bpp palette is passed through unshifted.
 *
 * @param screenBuffer source pixel data, forwarded unchanged
 * @param palette      combined colour table holding all three palettes
 * @param scanType     decode mode per pair of rows, forwarded unchanged
 */
void Pokitto::lcdRefreshMixMode(const uint8_t * screenBuffer, const uint16_t * palette, const uint8_t * scanType)
{
	constexpr size_t palette8bppCount = (1 << 8);
	constexpr size_t palette4bppCount = (1 << 4);
	constexpr size_t palette2bppCount = (1 << 2);
	
	// NOTE(review): as written these evaluate to 0, 16 and 20, whereas the
	// earlier versions in this thread read the 4bpp colours from index 256
	// and the 2bpp colours from index 272. palette8bppCount is never used.
	constexpr size_t palette8bppOffset = 0;
	constexpr size_t palette4bppOffset = (palette8bppOffset + palette4bppCount);
	constexpr size_t palette2bppOffset = (palette4bppOffset + palette2bppCount);

	// Pre-shifted palettes, stored as locals of this function.
	uint32_t palette4bpp[palette4bppCount];
	uint32_t palette2bpp[palette2bppCount];
	
	const uint16_t * palette4bppBase = &palette[palette4bppOffset];	
	for(size_t index = 0; index < palette4bppCount; ++index)
	{
		// Widen to 32 bits before shifting.
		uint32_t colour = palette4bppBase[index];
		palette4bpp[index] = (colour << 3);
	}
	
	const uint16_t * palette2bppBase = &palette[palette2bppOffset];	
	for(size_t index = 0; index < palette2bppCount; ++index)
	{
		uint32_t colour = palette2bppBase[index];
		palette2bpp[index] = (colour << 3);
	}
	
	lcdRefreshMixModeImplementation(screenBuffer, scanType, palette, palette4bpp, palette2bpp);
}

/**
 * Mixed-depth LCD refresh that takes the 4bpp and 2bpp palettes already
 * shifted left by 3, so only the 8bpp branch still shifts per pixel.
 *
 * NOTE(review): posted as untested. As written it has several problems,
 * marked inline below, and does not compile.
 *
 * @param screenBuffer source pixel data
 * @param scanTypes    decode mode per pair of rows
 * @param palette8bpp  unshifted colour table for the 8bpp branch
 * @param palette4bpp  pre-shifted colours for the 4bpp branch
 * @param palette2bpp  pre-shifted colours for the 2bpp branch
 */
void lcdRefreshMixModeImplementation(const uint8_t * screenBuffer, const uint8_t * scanTypes, const uint16_t * palette8bpp, const uint32_t * palette4bpp, const uint32_t * palette2bpp)
{
	write_command(0x03);
	write_data(0x1038);

	// Horizontal DRAM Address
	write_command(0x20);
	write_data(0);

	// Vertical DRAM Address
	write_command(0x21);
	write_data(0);

	// write data to DRAM
	write_command(0x22);
	CLR_CS_SET_CD_RD_WR;
	SET_MASK_P2;

	uint32_t scanline[220];

	// point to beginning of line in data
	for(size_t y = 0; y < 176; ++y)
	{
		// find colours in one scanline
		const uint8_t scanTypeIndex = (y >> 1);
		// NOTE(review): the parameter is named scanTypes, but this reads scanType.
		switch(scanType[scanTypeIndex])
		{
			case 0: // 8bpp
			{
				// Point to the beginning of the line in data
				const uint8_t * data = &screenBuffer[110 * scanTypeIndex];
				size_t scanlineIndex = 0;
				
				#define STEP() \
					{ \
						uint32_t colour = static_cast<uint32_t>(palette8bpp[*data]) << 3; \
						++data; \
						scanline[scanlineIndex] = colour; \
						++scanlineIndex; \
						scanline[scanlineIndex] = colour; \
						++scanlineIndex; \
					}
				
				for(uint8_t x = 0; x < 11; ++x)
				{
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
				}
				break;
				
				#undef STEP
			}
			case 1: // 4bpp
			{
				// NOTE(review): the read index restarts at 0 for every row, so
				// each 4bpp row decodes the first 55 bytes of screenBuffer.
				size_t screenBufferIndex = 0;
				size_t scanlineIndex = 0;
				
				#define STEP() \
					{ \
						const uint8_t value = screenBuffer[screenBufferIndex]; \
						++screenBufferIndex; \
						\
						const size_t paletteIndex1 = ((value >> 4) & 0x0F); \
						const uint32_t colour1 = palette4bpp[paletteIndex1]; \
						scanline[scanlineIndex] = colour1; \
						++scanlineIndex; \
						scanline[scanlineIndex] = colour1; \
						++scanlineIndex; \
						\
						const size_t paletteIndex2 = ((value >> 0) & 0x0F); \
						const uint32_t colour2 = palette4bpp[paletteIndex2]; \
						scanline[scanlineIndex] = colour2; \
						++scanlineIndex; \
						scanline[scanlineIndex] = colour2; \
						++scanlineIndex; \
					}
				
				for(uint8_t x = 0; x < 11; ++x)
				{
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
				}
				break;
				
				#undef STEP
			}
			case 2: // 2bpp
			{
				// NOTE(review): screenBufferIndex is not declared in this case,
				// the STEP body below is not wrapped in a block (so its locals
				// are redeclared on each expansion), and the paletteIndex1 line
				// carries a stray ')'.
				uint8_t scanlineIndex = 0;
				
				#define STEP() \				
					const uint8_t value = screenBuffer[screenBufferIndex]; \
					++screenBufferIndex; \
					\
					const size_t paletteIndex0 = ((value >> 6) & 0x03); \
					const size_t paletteIndex1 = ((value >> 4) & 0x03); \
					const size_t paletteIndex2 = ((value >> 2) & 0x03); \
					const size_t paletteIndex3 = ((value >> 0) & 0x03); \
					\
					scanline[scanlineIndex] = palette2bpp[paletteIndex0]; \
					++scanlineIndex; \
					scanline[scanlineIndex] = palette2bpp[paletteIndex1]); \
					++scanlineIndex; \
					scanline[scanlineIndex] = palette2bpp[paletteIndex2]; \
					++scanlineIndex; \
					scanline[scanlineIndex] = palette2bpp[paletteIndex3]; \
					++scanlineIndex;

				for(uint8_t x = 0; x < 11; ++x)
				{
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
				}
				
				#undef STEP

				break;
			}
		}

        // NOTE(review): WRITE_SCANLINE is defined function-like here but is
        // invoked without parentheses below, and the assignment line in its
        // body has no terminating semicolon.
        #define WRITE_SCANLINE() \
			*LCD = colour; \
			colour = scanline[scanlineIndex] \
			++scanlineIndex; \
			TGL_WR_OP(colour);
		
		#define WRITE_SCANLINE_2 \
			WRITE_SCANLINE \
			WRITE_SCANLINE
		
		#define WRITE_SCANLINE_10 \
			WRITE_SCANLINE_2 \
			WRITE_SCANLINE_2 \
			WRITE_SCANLINE_2 \
			WRITE_SCANLINE_2 \
			WRITE_SCANLINE_2
		
		#define WRITE_SCANLINE_20 \
			WRITE_SCANLINE_10 \
			WRITE_SCANLINE_10

		{
			volatile uint32_t * LCD = reinterpret_cast<volatile uint32_t *>(0xA0002188);
			uint32_t colour = scanline[0];
			size_t scanlineIndex = 0;

			// 11 x 20 expansions = 220 writes per row.
			WRITE_SCANLINE_20
			WRITE_SCANLINE_20
			WRITE_SCANLINE_20
			WRITE_SCANLINE_20
			WRITE_SCANLINE_20
			WRITE_SCANLINE_20
			WRITE_SCANLINE_20
			WRITE_SCANLINE_20
			WRITE_SCANLINE_20
			WRITE_SCANLINE_20
			WRITE_SCANLINE_20
		}

		#undef WRITE_SCANLINE
	}

	CLR_MASK_P2;
}

For a price of just 20 extra shifts (and a bit of extra memory and looping) it eliminates the 110 (2 * 5 * 11) shifts used in the 4bpp branch and the 220 (4 * 5 * 11) shifts used in the 2bpp branch.

So theoretically it’ll slow down the 8bpp branch but significantly speed up the 4bpp and 2bpp branches.

1 Like

haven’t got it to work yet, there are a couple of typos, but also a handful of

Pokitto\POKITTO_HW\HWLCD.cpp|2527|error: redeclaration of ‘const uint8_t value’|

in here -

				#define STEP() \
					const uint8_t value = screenBuffer[screenBufferIndex]; \
					++screenBufferIndex; \
					\
					const size_t paletteIndex0 = ((value >> 6) & 0x03); \
					const size_t paletteIndex1 = ((value >> 4) & 0x03); \
					const size_t paletteIndex2 = ((value >> 2) & 0x03); \
					const size_t paletteIndex3 = ((value >> 0) & 0x03); \
					\
					scanline[scanlineIndex] = palette2bpp[paletteIndex0]; \
					++scanlineIndex; \
					scanline[scanlineIndex] = palette2bpp[paletteIndex1]; \
					++scanlineIndex; \
					scanline[scanlineIndex] = palette2bpp[paletteIndex2]; \
					++scanlineIndex; \
					scanline[scanlineIndex] = palette2bpp[paletteIndex3]; \
					++scanlineIndex;

Which ones?

(I’m not surprised, I threw it together in Notepad++.)

Oh right, I forgot to wrap that in a block.

Replace with:

				#define STEP() \
					{ \
						const uint8_t value = screenBuffer[screenBufferIndex]; \
						++screenBufferIndex; \
						\
						const size_t paletteIndex0 = ((value >> 6) & 0x03); \
						const size_t paletteIndex1 = ((value >> 4) & 0x03); \
						const size_t paletteIndex2 = ((value >> 2) & 0x03); \
						const size_t paletteIndex3 = ((value >> 0) & 0x03); \
						\
						scanline[scanlineIndex] = palette2bpp[paletteIndex0]; \
						++scanlineIndex; \
						scanline[scanlineIndex] = palette2bpp[paletteIndex1]; \
						++scanlineIndex; \
						scanline[scanlineIndex] = palette2bpp[paletteIndex2]; \
						++scanlineIndex; \
						scanline[scanlineIndex] = palette2bpp[paletteIndex3]; \
						++scanlineIndex; \
					}
void lcdRefreshMixModeImplementation(const uint8_t * screenBuffer, const uint8_t * scanTypes, const uint16_t * palette8bpp, const uint32_t * palette4bpp, const uint32_t * palette2bpp)

scanTypes should be scanType

scanline[scanlineIndex] = palette2bpp[paletteIndex1]); \

the ) shouldn’t be there

I think this might be wrong also.

        // As posted: writes the pending colour to *LCD, loads the next scanline
        // entry, advances the index and invokes TGL_WR_OP.
        // The `colour = scanline[scanlineIndex]` line has no terminating semicolon,
        // so the expansion runs straight into `++scanlineIndex`.
        #define WRITE_SCANLINE() \
			*LCD = colour; \
			colour = scanline[scanlineIndex] \
			++scanlineIndex; \
			TGL_WR_OP(colour);

to

        // Suggested correction: the missing semicolon is restored.
        // This form is an object-like macro (no parameter list), so it is used
        // as `WRITE_SCANLINE;` rather than `WRITE_SCANLINE();`.
        #define WRITE_SCANLINE \
			*LCD = colour; \
			colour = scanline[scanlineIndex]; \
			++scanlineIndex; \
			TGL_WR_OP(colour);

There appears to be a slightly better framerate: the 8bit section looks fine, but the 2bit section doesn’t work :frowning:
Image1

should look more like -
Image2

I have no idea what the 4bit image looks like, this project doesn’t use it :stuck_out_tongue:

I purposely changed scanType to scanTypes because it’s supposed to represent an array.

Yep, I missed a line.

Not wrong as such, I just forgot to put () on the other defines.


Ah, I see the problem:

constexpr size_t palette8bppOffset = 0;
constexpr size_t palette4bppOffset = (palette8bppOffset + palette4bppCount);
constexpr size_t palette2bppOffset = (palette4bppOffset + palette2bppCount);

Should be:

constexpr size_t palette8bppOffset = 0;
constexpr size_t palette4bppOffset = (palette8bppOffset + palette8bppCount);
constexpr size_t palette2bppOffset = (palette4bppOffset + palette4bppCount);

Here’s a version with all those things corrected:

void lcdRefreshMixModeImplementation(const uint8_t screenBuffer[], const uint8_t scanTypes[], const uint16_t palette8bpp[], const uint32_t (&palette4bpp)[16], const uint32_t (&palette2bpp)[4]);

// Prepares the per-depth palettes and forwards the frame to
// lcdRefreshMixModeImplementation.
//
// `palette` is read as one combined table: entries 0..255 serve 8bpp data,
// the next 16 serve 4bpp data and the 4 after that serve 2bpp data.
// The 4bpp and 2bpp entries are copied into local 32-bit tables shifted left
// by 3; the 8bpp entries are passed through untouched.
void Pokitto::lcdRefreshMixMode(const uint8_t * screenBuffer, const uint16_t * palette, const uint8_t * scanTypes)
{
	constexpr size_t entries8bpp = 256;
	constexpr size_t entries4bpp = 16;
	constexpr size_t entries2bpp = 4;

	constexpr size_t first4bppEntry = entries8bpp;
	constexpr size_t first2bppEntry = entries8bpp + entries4bpp;

	uint32_t palette4bpp[entries4bpp];
	for(size_t i = 0; i < entries4bpp; ++i)
		palette4bpp[i] = static_cast<uint32_t>(palette[first4bppEntry + i]) << 3;

	uint32_t palette2bpp[entries2bpp];
	for(size_t i = 0; i < entries2bpp; ++i)
		palette2bpp[i] = static_cast<uint32_t>(palette[first2bppEntry + i]) << 3;

	lcdRefreshMixModeImplementation(screenBuffer, scanTypes, palette, palette4bpp, palette2bpp);
}

// Sends a 176-scanline frame to the LCD. For every scanline a 220-entry buffer
// of colours is filled according to scanTypes and then written out entry by
// entry through *LCD.
//
// screenBuffer  packed pixel data
// scanTypes     one entry per two scanlines: 0 = 8bpp, 1 = 4bpp, 2 = 2bpp
// palette8bpp   looked up per source byte and shifted left by 3 here
// palette4bpp   looked up per nibble and stored as-is
// palette2bpp   looked up per 2-bit field and stored as-is
//
// NOTE(review): this revision does not compile as posted; the individual
// problems are marked NOTE(review) below.
// NOTE(review): the ';' that ends the next line turns it into a declaration,
// so the brace block after it has no function header.
void lcdRefreshMixModeImplementation(const uint8_t screenBuffer[], const uint8_t scanTypes[], const uint16_t palette8bpp[], const uint32_t (&palette4bpp)[16], const uint32_t (&palette2bpp)[4]);
{
	write_command(0x03);
	write_data(0x1038);

	// Horizontal DRAM Address
	write_command(0x20);
	write_data(0);

	// Vertical DRAM Address
	write_command(0x21);
	write_data(0);

	// write data to DRAM
	write_command(0x22);
	CLR_CS_SET_CD_RD_WR;
	SET_MASK_P2;

	// Colours for one scanline, in the form that is written to *LCD
	uint32_t scanline[220];

	// point to beginning of line in data
	for(size_t y = 0; y < 176; ++y)
	{
		// find colours in one scanline
		// (y >> 1): each scanTypes entry is shared by two consecutive scanlines
		const uint8_t scanTypeIndex = (y >> 1);
		switch(scanTypes[scanTypeIndex])
		{
			case 0: // 8bpp
			{
				// Point to the beginning of the line in data
				const uint8_t * data = &screenBuffer[110 * scanTypeIndex];
				size_t scanlineIndex = 0;
				
				// One source byte -> one palette8bpp colour, stored in two adjacent entries
				#define STEP() \
					{ \
						uint32_t colour = static_cast<uint32_t>(palette8bpp[*data]) << 3; \
						++data; \
						scanline[scanlineIndex] = colour; \
						++scanlineIndex; \
						scanline[scanlineIndex] = colour; \
						++scanlineIndex; \
					}
				
				// 11 x 10 steps = 110 source bytes = 220 scanline entries
				for(uint8_t x = 0; x < 11; ++x)
				{
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
				}
				break;
				
				// Preprocessor directive: still takes effect although it follows the break
				#undef STEP
			}
			case 1: // 4bpp
			{
				// NOTE(review): starts at 0 on every scanline, so each 4bpp scanline
				// decodes the first 55 bytes of screenBuffer
				size_t screenBufferIndex = 0;
				size_t scanlineIndex = 0;
				
				// One source byte -> high nibble then low nibble, each colour stored twice
				#define STEP() \
					{ \
						const uint8_t value = screenBuffer[screenBufferIndex]; \
						++screenBufferIndex; \
						\
						const size_t paletteIndex1 = ((value >> 4) & 0x0F); \
						const uint32_t colour1 = palette4bpp[paletteIndex1]; \
						scanline[scanlineIndex] = colour1; \
						++scanlineIndex; \
						scanline[scanlineIndex] = colour1; \
						++scanlineIndex; \
						\
						const size_t paletteIndex2 = ((value >> 0) & 0x0F); \
						const uint32_t colour2 = palette4bpp[paletteIndex2]; \
						scanline[scanlineIndex] = colour2; \
						++scanlineIndex; \
						scanline[scanlineIndex] = colour2; \
						++scanlineIndex; \
					}
				
				// 11 x 5 steps = 55 source bytes = 220 scanline entries
				for(uint8_t x = 0; x < 11; ++x)
				{
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
				}
				break;
				
				#undef STEP
			}
			case 2: // 2bpp
			{
				// NOTE(review): screenBufferIndex is used by STEP below but is not
				// declared in this case
				uint8_t scanlineIndex = 0;
				
				// One source byte -> four 2-bit indices, most significant pair first
				#define STEP() \
					{ \
						const uint8_t value = screenBuffer[screenBufferIndex]; \
						++screenBufferIndex; \
						\
						const size_t paletteIndex0 = ((value >> 6) & 0x03); \
						const size_t paletteIndex1 = ((value >> 4) & 0x03); \
						const size_t paletteIndex2 = ((value >> 2) & 0x03); \
						const size_t paletteIndex3 = ((value >> 0) & 0x03); \
						\
						scanline[scanlineIndex] = palette2bpp[paletteIndex0]; \
						++scanlineIndex; \
						scanline[scanlineIndex] = palette2bpp[paletteIndex1]; \
						++scanlineIndex; \
						scanline[scanlineIndex] = palette2bpp[paletteIndex2]; \
						++scanlineIndex; \
						scanline[scanlineIndex] = palette2bpp[paletteIndex3]; \
						++scanlineIndex; \
					}

				// 11 x 5 steps = 55 source bytes = 220 scanline entries
				for(uint8_t x = 0; x < 11; ++x)
				{
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
				}
				
				#undef STEP

				break;
			}
		}

        // NOTE(review): the `colour = scanline[scanlineIndex]` line has no semicolon
        #define WRITE_SCANLINE() \
			*LCD = colour; \
			colour = scanline[scanlineIndex] \
			++scanlineIndex; \
			TGL_WR_OP(colour);
		
		#define WRITE_SCANLINE_2() \
			WRITE_SCANLINE(); \
			WRITE_SCANLINE();
		
		#define WRITE_SCANLINE_10() \
			WRITE_SCANLINE_2(); \
			WRITE_SCANLINE_2(); \
			WRITE_SCANLINE_2(); \
			WRITE_SCANLINE_2(); \
			WRITE_SCANLINE_2();
		
		#define WRITE_SCANLINE_20() \
			WRITE_SCANLINE_10(); \
			WRITE_SCANLINE_10();

		{
			// Address written once per pixel; presumably the LCD data port (confirm against the MCU manual)
			volatile uint32_t * LCD = reinterpret_cast<volatile uint32_t *>(0xA0002188);
			uint32_t colour = scanline[0];
			size_t scanlineIndex = 0;

			// 11 x 20 = 220 expansions of WRITE_SCANLINE, one per scanline entry
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
		}

		// Only the base macro is undefined; the _2, _10 and _20 wrappers stay defined
		#undef WRITE_SCANLINE
	}

	CLR_MASK_P2;
}

Write a test case when you have a chance.

There are still a couple of typos, I think: a missing ; and a variable that is not declared. Here’s what I got after fixing those:

void lcdRefreshMixModeImplementation(const uint8_t screenBuffer[], const uint8_t scanTypes[], const uint16_t palette8bpp[], const uint32_t (&palette4bpp)[16], const uint32_t (&palette2bpp)[4]);

// Prepares the per-depth palettes and forwards the frame to
// lcdRefreshMixModeImplementation.
//
// `palette` is read as one combined table: entries 0..255 serve 8bpp data,
// the next 16 serve 4bpp data and the 4 after that serve 2bpp data.
// The 4bpp and 2bpp entries are copied into local 32-bit tables shifted left
// by 3; the 8bpp entries are passed through untouched.
void Pokitto::lcdRefreshMixMode(const uint8_t * screenBuffer, const uint16_t * palette, const uint8_t * scanTypes)
{
	constexpr size_t entries8bpp = 256;
	constexpr size_t entries4bpp = 16;
	constexpr size_t entries2bpp = 4;

	constexpr size_t first4bppEntry = entries8bpp;
	constexpr size_t first2bppEntry = entries8bpp + entries4bpp;

	uint32_t palette4bpp[entries4bpp];
	for(size_t i = 0; i < entries4bpp; ++i)
		palette4bpp[i] = static_cast<uint32_t>(palette[first4bppEntry + i]) << 3;

	uint32_t palette2bpp[entries2bpp];
	for(size_t i = 0; i < entries2bpp; ++i)
		palette2bpp[i] = static_cast<uint32_t>(palette[first2bppEntry + i]) << 3;

	lcdRefreshMixModeImplementation(screenBuffer, scanTypes, palette, palette4bpp, palette2bpp);
}

// Sends a 176-scanline frame to the LCD. For every scanline a 220-entry buffer
// of colours is filled according to scanTypes and then written out entry by
// entry through *LCD.
//
// screenBuffer  packed pixel data
// scanTypes     one entry per two scanlines: 0 = 8bpp, 1 = 4bpp, 2 = 2bpp
// palette8bpp   looked up per source byte and shifted left by 3 here
// palette4bpp   looked up per nibble and stored as-is
// palette2bpp   looked up per 2-bit field and stored as-is
void lcdRefreshMixModeImplementation(const uint8_t screenBuffer[], const uint8_t scanTypes[], const uint16_t palette8bpp[], const uint32_t (&palette4bpp)[16], const uint32_t (&palette2bpp)[4])
{
	write_command(0x03);
	write_data(0x1038);

	// Horizontal DRAM Address
	write_command(0x20);
	write_data(0);

	// Vertical DRAM Address
	write_command(0x21);
	write_data(0);

	// write data to DRAM
	write_command(0x22);
	CLR_CS_SET_CD_RD_WR;
	SET_MASK_P2;

	// Colours for one scanline, in the form that is written to *LCD
	uint32_t scanline[220];

	// point to beginning of line in data
	for(size_t y = 0; y < 176; ++y)
	{
		// find colours in one scanline
		// (y >> 1): each scanTypes entry is shared by two consecutive scanlines
		const uint8_t scanTypeIndex = (y >> 1);
		switch(scanTypes[scanTypeIndex])
		{
			case 0: // 8bpp
			{
				// Point to the beginning of the line in data
				const uint8_t * data = &screenBuffer[110 * scanTypeIndex];
				size_t scanlineIndex = 0;

				// One source byte -> one palette8bpp colour, stored in two adjacent entries
				#define STEP() \
					{ \
						uint32_t colour = static_cast<uint32_t>(palette8bpp[*data]) << 3; \
						++data; \
						scanline[scanlineIndex] = colour; \
						++scanlineIndex; \
						scanline[scanlineIndex] = colour; \
						++scanlineIndex; \
					}

				// 11 x 10 steps = 110 source bytes = 220 scanline entries
				for(uint8_t x = 0; x < 11; ++x)
				{
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
				}
				break;

				// Preprocessor directive: still takes effect although it follows the break
				#undef STEP
			}
			case 1: // 4bpp
			{
				// NOTE(review): starts at 0 on every scanline, so each 4bpp scanline
				// decodes the first 55 bytes of screenBuffer instead of its own data
				size_t screenBufferIndex = 0;
				size_t scanlineIndex = 0;

				// One source byte -> high nibble then low nibble, each colour stored twice
				#define STEP() \
					{ \
						const uint8_t value = screenBuffer[screenBufferIndex]; \
						++screenBufferIndex; \
						\
						const size_t paletteIndex1 = ((value >> 4) & 0x0F); \
						const uint32_t colour1 = palette4bpp[paletteIndex1]; \
						scanline[scanlineIndex] = colour1; \
						++scanlineIndex; \
						scanline[scanlineIndex] = colour1; \
						++scanlineIndex; \
						\
						const size_t paletteIndex2 = ((value >> 0) & 0x0F); \
						const uint32_t colour2 = palette4bpp[paletteIndex2]; \
						scanline[scanlineIndex] = colour2; \
						++scanlineIndex; \
						scanline[scanlineIndex] = colour2; \
						++scanlineIndex; \
					}

				// 11 x 5 steps = 55 source bytes = 220 scanline entries
				for(uint8_t x = 0; x < 11; ++x)
				{
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
				}
				break;

				#undef STEP
			}
			case 2: // 2bpp
			{
				uint8_t scanlineIndex = 0;
				// NOTE(review): same as the 4bpp case, the read position restarts at
				// screenBuffer[0] for every scanline
				size_t screenBufferIndex = 0;

				// One source byte -> four 2-bit indices, most significant pair first
				#define STEP() \
					{ \
						const uint8_t value = screenBuffer[screenBufferIndex]; \
						++screenBufferIndex; \
						\
						const size_t paletteIndex0 = ((value >> 6) & 0x03); \
						const size_t paletteIndex1 = ((value >> 4) & 0x03); \
						const size_t paletteIndex2 = ((value >> 2) & 0x03); \
						const size_t paletteIndex3 = ((value >> 0) & 0x03); \
						\
						scanline[scanlineIndex] = palette2bpp[paletteIndex0]; \
						++scanlineIndex; \
						scanline[scanlineIndex] = palette2bpp[paletteIndex1]; \
						++scanlineIndex; \
						scanline[scanlineIndex] = palette2bpp[paletteIndex2]; \
						++scanlineIndex; \
						scanline[scanlineIndex] = palette2bpp[paletteIndex3]; \
						++scanlineIndex; \
					}

				// 11 x 5 steps = 55 source bytes = 220 scanline entries
				for(uint8_t x = 0; x < 11; ++x)
				{
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
				}

				#undef STEP

				break;
			}
		}

        // NOTE(review): colour starts as scanline[0] and the first expansion loads
        // scanline[0] again, so entry 0 is written to *LCD twice and entry 219 is
        // never written
        #define WRITE_SCANLINE() \
			*LCD = colour; \
			colour = scanline[scanlineIndex]; \
			++scanlineIndex; \
			TGL_WR_OP(colour);

		#define WRITE_SCANLINE_2() \
			WRITE_SCANLINE(); \
			WRITE_SCANLINE();

		#define WRITE_SCANLINE_10() \
			WRITE_SCANLINE_2(); \
			WRITE_SCANLINE_2(); \
			WRITE_SCANLINE_2(); \
			WRITE_SCANLINE_2(); \
			WRITE_SCANLINE_2();

		#define WRITE_SCANLINE_20() \
			WRITE_SCANLINE_10(); \
			WRITE_SCANLINE_10();

		{
			// Address written once per pixel; presumably the LCD data port (confirm against the MCU manual)
			volatile uint32_t * LCD = reinterpret_cast<volatile uint32_t *>(0xA0002188);
			uint32_t colour = scanline[0];
			size_t scanlineIndex = 0;

			// 11 x 20 = 220 expansions of WRITE_SCANLINE, one per scanline entry
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
		}

		// Only the base macro is undefined; the _2, _10 and _20 wrappers stay defined
		#undef WRITE_SCANLINE
	}

	CLR_MASK_P2;
}

8bit works, 4bit and 2bit don’t. The screenshot should be 4bit on top with 2bit below.
Image3

the 4bit image should be the following ->
leaves

Have you got an example program I can run?
This would be a lot easier if I wasn’t ‘flying blind’.

Also I found a mistake in WRITE_SCANLINE,
I misread color = scanline[++i] as color = scanline[i++] so it should actually be:

// Writes the pending colour to *LCD, then steps to the next scanline entry and
// loads it (pre-increment) before invoking TGL_WR_OP with it.
#define WRITE_SCANLINE() \
	*LCD = colour; \
	colour = scanline[++scanlineIndex]; \
	TGL_WR_OP(colour);

I think I found the problem.
One too many localisations.

Try it now:

void lcdRefreshMixModeImplementation(const uint8_t screenBuffer[], const uint8_t scanTypes[], const uint16_t palette8bpp[], const uint32_t (&palette4bpp)[16], const uint32_t (&palette2bpp)[4]);

// Prepares the per-depth palettes and forwards the frame to
// lcdRefreshMixModeImplementation.
//
// `palette` is read as one combined table: entries 0..255 serve 8bpp data,
// the next 16 serve 4bpp data and the 4 after that serve 2bpp data.
// The 4bpp and 2bpp entries are copied into local 32-bit tables shifted left
// by 3; the 8bpp entries are passed through untouched.
void Pokitto::lcdRefreshMixMode(const uint8_t * screenBuffer, const uint16_t * palette, const uint8_t * scanTypes)
{
	constexpr size_t entries8bpp = 256;
	constexpr size_t entries4bpp = 16;
	constexpr size_t entries2bpp = 4;

	constexpr size_t first4bppEntry = entries8bpp;
	constexpr size_t first2bppEntry = entries8bpp + entries4bpp;

	uint32_t palette4bpp[entries4bpp];
	for(size_t i = 0; i < entries4bpp; ++i)
		palette4bpp[i] = static_cast<uint32_t>(palette[first4bppEntry + i]) << 3;

	uint32_t palette2bpp[entries2bpp];
	for(size_t i = 0; i < entries2bpp; ++i)
		palette2bpp[i] = static_cast<uint32_t>(palette[first2bppEntry + i]) << 3;

	lcdRefreshMixModeImplementation(screenBuffer, scanTypes, palette, palette4bpp, palette2bpp);
}

// Sends a 176-scanline frame to the LCD. For every scanline a 220-entry buffer
// of colours is filled according to scanTypes and then written out entry by
// entry through *LCD.
//
// screenBuffer  packed pixel data; every scanTypes entry owns 110 bytes of it
//               (one 110-byte 8bpp line shown twice, or two 55-byte 4bpp/2bpp lines)
// scanTypes     one entry per two scanlines: 0 = 8bpp, 1 = 4bpp, 2 = 2bpp;
//               any other value leaves the scanline buffer as the previous line left it
// palette8bpp   looked up per source byte and shifted left by 3 here
// palette4bpp   looked up per nibble and stored as-is, so it must arrive pre-shifted
// palette2bpp   looked up per 2-bit field and stored as-is, so it must arrive pre-shifted
void lcdRefreshMixModeImplementation(const uint8_t screenBuffer[], const uint8_t scanTypes[], const uint16_t palette8bpp[], const uint32_t (&palette4bpp)[16], const uint32_t (&palette2bpp)[4])
{
	write_command(0x03);
	write_data(0x1038);

	// Horizontal DRAM Address
	write_command(0x20);
	write_data(0);

	// Vertical DRAM Address
	write_command(0x21);
	write_data(0);

	// write data to DRAM
	write_command(0x22);
	CLR_CS_SET_CD_RD_WR;
	SET_MASK_P2;

	constexpr size_t scanlineWidth = 220;
	constexpr size_t bytesPerScanTypeRow = 110;
	constexpr size_t bytesPerPackedScanline = 55;

	// One spare entry: WRITE_SCANLINE loads the entry after the one it has just
	// written, so the final expansion reads index 220.
	uint32_t scanline[scanlineWidth + 1] = {};

	// Address written once per pixel; presumably the LCD data port (confirm against the MCU manual)
	volatile uint32_t * const LCD = reinterpret_cast<volatile uint32_t *>(0xA0002188);

	for(size_t y = 0; y < 176; ++y)
	{
		// Each scanTypes entry is shared by two consecutive scanlines
		const size_t scanTypeIndex = (y >> 1);
		const uint8_t * const rowData = &screenBuffer[bytesPerScanTypeRow * scanTypeIndex];

		// Must restart at 0 for every scanline, otherwise the fill runs past the buffer
		size_t scanlineIndex = 0;

		// find colours in one scanline
		switch(scanTypes[scanTypeIndex])
		{
			case 0: // 8bpp
			{
				// Both scanlines of the pair show the same 110 bytes
				const uint8_t * data = rowData;

				// One source byte -> one palette8bpp colour, stored in two adjacent entries
				#define STEP() \
					{ \
						const uint32_t colour = static_cast<uint32_t>(palette8bpp[*data]) << 3; \
						++data; \
						scanline[scanlineIndex] = colour; \
						++scanlineIndex; \
						scanline[scanlineIndex] = colour; \
						++scanlineIndex; \
					}

				// 11 x 10 steps = 110 source bytes = 220 scanline entries
				for(uint8_t x = 0; x < 11; ++x)
				{
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
				}

				#undef STEP

				break;
			}
			case 1: // 4bpp
			{
				// The odd scanline of the pair uses the second 55 bytes of the row
				const uint8_t * data = rowData + (bytesPerPackedScanline * (y & 1));

				// One source byte -> high nibble then low nibble, each colour stored twice
				#define STEP() \
					{ \
						const uint8_t value = *data; \
						++data; \
						\
						const uint32_t colour1 = palette4bpp[(value >> 4) & 0x0F]; \
						scanline[scanlineIndex] = colour1; \
						++scanlineIndex; \
						scanline[scanlineIndex] = colour1; \
						++scanlineIndex; \
						\
						const uint32_t colour2 = palette4bpp[(value >> 0) & 0x0F]; \
						scanline[scanlineIndex] = colour2; \
						++scanlineIndex; \
						scanline[scanlineIndex] = colour2; \
						++scanlineIndex; \
					}

				// 11 x 5 steps = 55 source bytes = 220 scanline entries
				for(uint8_t x = 0; x < 11; ++x)
				{
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
				}

				#undef STEP

				break;
			}
			case 2: // 2bpp
			{
				// The odd scanline of the pair uses the second 55 bytes of the row
				const uint8_t * data = rowData + (bytesPerPackedScanline * (y & 1));

				// One source byte -> four 2-bit indices, most significant pair first
				#define STEP() \
					{ \
						const uint8_t value = *data; \
						++data; \
						\
						scanline[scanlineIndex] = palette2bpp[(value >> 6) & 0x03]; \
						++scanlineIndex; \
						scanline[scanlineIndex] = palette2bpp[(value >> 4) & 0x03]; \
						++scanlineIndex; \
						scanline[scanlineIndex] = palette2bpp[(value >> 2) & 0x03]; \
						++scanlineIndex; \
						scanline[scanlineIndex] = palette2bpp[(value >> 0) & 0x03]; \
						++scanlineIndex; \
					}

				// 11 x 5 steps = 55 source bytes = 220 scanline entries
				for(uint8_t x = 0; x < 11; ++x)
				{
					STEP();
					STEP();
					STEP();
					STEP();
					STEP();
				}

				#undef STEP

				break;
			}
		}

		// Writes the pending colour, then loads the following entry before TGL_WR_OP
		#define WRITE_SCANLINE() \
			*LCD = colour; \
			++scanlineIndex; \
			colour = scanline[scanlineIndex]; \
			TGL_WR_OP(colour);

		#define WRITE_SCANLINE_2() \
			WRITE_SCANLINE(); \
			WRITE_SCANLINE();

		#define WRITE_SCANLINE_10() \
			WRITE_SCANLINE_2(); \
			WRITE_SCANLINE_2(); \
			WRITE_SCANLINE_2(); \
			WRITE_SCANLINE_2(); \
			WRITE_SCANLINE_2();

		#define WRITE_SCANLINE_20() \
			WRITE_SCANLINE_10(); \
			WRITE_SCANLINE_10();

		{
			// Reuse the index for reading the scanline back out, starting again at entry 0
			scanlineIndex = 0;
			uint32_t colour = scanline[0];

			// 11 x 20 = 220 expansions of WRITE_SCANLINE, one per scanline entry
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
			WRITE_SCANLINE_20();
		}

		#undef WRITE_SCANLINE_20
		#undef WRITE_SCANLINE_10
		#undef WRITE_SCANLINE_2
		#undef WRITE_SCANLINE
	}

	CLR_MASK_P2;
}

I don’t know what’s different about that, but it continually resets the emulator :stuck_out_tongue: