This may or may not be faster (but at the very least it’s easier to read):
(Note: completely untested.)
void Pokitto::lcdRefreshMixMode(const uint8_t * screenBuffer, const uint16_t * palette, const uint8_t * scanTypes)
{
write_command(0x03);
write_data(0x1038);
// Horizontal DRAM Address
write_command(0x20);
write_data(0);
// Vertical DRAM Address
write_command(0x21);
write_data(0);
// write data to DRAM
write_command(0x22);
CLR_CS_SET_CD_RD_WR;
SET_MASK_P2;
uint32_t scanline[220];
// point to beginning of line in data
for(size_t y = 0; y < 176; ++y)
{
// find colours in one scanline
const uint8_t scanTypeIndex = (y >> 1);
switch(scanType[scanTypeIndex])
{
case 0: // 8bpp
{
// Point to the beginning of the line in data
const uint8_t * data = &screenBuffer[110 * scanTypeIndex];
size_t scanlineIndex = 0;
#define STEP() \
{ \
uint32_t colour = static_cast<uint32_t>(palette[*data]) << 3; \
++data; \
scanline[scanlineIndex] = colour; \
++scanlineIndex; \
scanline[scanlineIndex] = colour; \
++scanlineIndex; \
}
for(uint8_t x = 0; x < 11; ++x)
{
STEP();
STEP();
STEP();
STEP();
STEP();
STEP();
STEP();
STEP();
STEP();
STEP();
}
break;
#undef STEP
}
case 1: // 4bpp
{
const uint16_t * palette4bpp = &palette[256];
size_t screenBufferIndex = 0;
size_t scanlineIndex = 0;
#define STEP() \
{ \
const uint8_t value = screenBuffer[screenBufferIndex]; \
++screenBufferIndex; \
\
const size_t paletteIndex1 = ((value >> 4) & 0x0F); \
const uint32_t colour1 = static_cast<uint32_t>(palette4bpp[paletteIndex1]) << 3; \
scanline[scanlineIndex] = colour1; \
++scanlineIndex; \
scanline[scanlineIndex] = colour1; \
++scanlineIndex; \
\
const size_t paletteIndex2 = ((value >> 0) & 0x0F); \
const uint32_t colour2 = static_cast<uint32_t>(palette4bpp[paletteIndex2]) << 3; \
scanline[scanlineIndex] = colour2; \
++scanlineIndex; \
scanline[scanlineIndex] = colour2; \
++scanlineIndex; \
}
for(uint8_t x = 0; x < 11; ++x)
{
STEP();
STEP();
STEP();
STEP();
STEP();
}
break;
#undef STEP
}
case 2: // 2bpp
{
const uint16_t * palette2bpp = &palette[272];
uint8_t scanlineIndex = 0;
#define STEP() \
const uint8_t value = screenBuffer[screenBufferIndex]; \
++screenBufferIndex; \
\
const size_t paletteIndex0 = ((value >> 6) & 0x03); \
const size_t paletteIndex1 = ((value >> 4) & 0x03); \
const size_t paletteIndex2 = ((value >> 2) & 0x03); \
const size_t paletteIndex3 = ((value >> 0) & 0x03); \
\
scanline[scanlineIndex] = (static_cast<uint32_t>(palette2bpp[paletteIndex0]) << 3); \
++scanlineIndex; \
scanline[scanlineIndex] = (static_cast<uint32_t>(palette2bpp[paletteIndex1]) << 3); \
++scanlineIndex; \
scanline[scanlineIndex] = (static_cast<uint32_t>(palette2bpp[paletteIndex2]) << 3); \
++scanlineIndex; \
scanline[scanlineIndex] = (static_cast<uint32_t>(palette2bpp[paletteIndex3]) << 3); \
++scanlineIndex;
for(uint8_t x = 0; x < 11; ++x)
{
STEP();
STEP();
STEP();
STEP();
STEP();
}
#undef STEP
break;
}
}
#define WRITE_SCANLINE() \
*LCD = colour; \
colour = scanline[scanlineIndex] \
++scanlineIndex; \
TGL_WR_OP(colour);
#define WRITE_SCANLINE_2 \
WRITE_SCANLINE \
WRITE_SCANLINE
#define WRITE_SCANLINE_10 \
WRITE_SCANLINE_2 \
WRITE_SCANLINE_2 \
WRITE_SCANLINE_2 \
WRITE_SCANLINE_2 \
WRITE_SCANLINE_2
#define WRITE_SCANLINE_20 \
WRITE_SCANLINE_10 \
WRITE_SCANLINE_10
{
volatile uint32_t * LCD = reinterpret_cast<volatile uint32_t *>(0xA0002188);
uint32_t colour = scanline[0];
size_t scanlineIndex = 0;
WRITE_SCANLINE_20
WRITE_SCANLINE_20
WRITE_SCANLINE_20
WRITE_SCANLINE_20
WRITE_SCANLINE_20
WRITE_SCANLINE_20
WRITE_SCANLINE_20
WRITE_SCANLINE_20
WRITE_SCANLINE_20
WRITE_SCANLINE_20
WRITE_SCANLINE_20
}
#undef WRITE_SCANLINE
}
CLR_MASK_P2;
}
I don’t know enough about manipulating the screen to know if there’s something that can be done with the screen manipulation stuff, but I can think of one obvious optimisation.
Instead of having this code full of `<< 3`, you could just store the palette data already shifted.
I notice that there are over 256 colours in the palette, so the size increase wouldn’t work for the 8bpp mode, but it could be done easily enough with the smaller palettes:
Edit:
Actually, I keep forgetting the Pokitto has more RAM than I’m used to working with.
It might be possible to do the 8bpp palette colours too.
It would mean another 1024 bytes on the stack though.
I’m too lazy (and too busy) to do the calculations to check if it’s viable.
(Again, completely untested)
void lcdRefreshMixModeImplementation(const uint8_t * screenBuffer, const uint8_t * scanTypes, const uint16_t * palette8bpp, const uint32_t * palette4bpp, const uint32_t * palette2bpp);

/// Builds pre-shifted copies of the 4bpp and 2bpp palettes, then delegates the
/// actual refresh to lcdRefreshMixModeImplementation.
///
/// The combined palette is laid out as 256 8bpp entries, followed by 16 4bpp
/// entries, followed by 4 2bpp entries. The two small palettes are copied into
/// local 32-bit tables with each colour already shifted left by 3; the 8bpp
/// palette is passed through unchanged.
///
/// @param screenBuffer Packed pixel data, forwarded unchanged.
/// @param palette      Combined colour table as described above.
/// @param scanType     Per-line-pair decoder selector, forwarded unchanged.
void Pokitto::lcdRefreshMixMode(const uint8_t * screenBuffer, const uint16_t * palette, const uint8_t * scanType)
{
    constexpr size_t palette8bppCount = (1 << 8);
    constexpr size_t palette4bppCount = (1 << 4);
    constexpr size_t palette2bppCount = (1 << 2);

    // Each sub-palette begins where the previous one ends: 0, 256 and 272.
    constexpr size_t palette8bppOffset = 0;
    constexpr size_t palette4bppOffset = (palette8bppOffset + palette8bppCount);
    constexpr size_t palette2bppOffset = (palette4bppOffset + palette4bppCount);

    uint32_t palette4bpp[palette4bppCount];
    uint32_t palette2bpp[palette2bppCount];

    // Widen to 32 bits before shifting so the shifted colour is stored in full.
    const uint16_t * palette4bppBase = &palette[palette4bppOffset];
    for(size_t index = 0; index < palette4bppCount; ++index)
    {
        const uint32_t colour = palette4bppBase[index];
        palette4bpp[index] = (colour << 3);
    }

    const uint16_t * palette2bppBase = &palette[palette2bppOffset];
    for(size_t index = 0; index < palette2bppCount; ++index)
    {
        const uint32_t colour = palette2bppBase[index];
        palette2bpp[index] = (colour << 3);
    }

    lcdRefreshMixModeImplementation(screenBuffer, scanType, &palette[palette8bppOffset], palette4bpp, palette2bpp);
}
void lcdRefreshMixModeImplementation(const uint8_t * screenBuffer, const uint8_t * scanTypes, const uint16_t * palette8bpp, const uint32_t * palette4bpp, const uint32_t * palette2bpp)
{
write_command(0x03);
write_data(0x1038);
// Horizontal DRAM Address
write_command(0x20);
write_data(0);
// Vertical DRAM Address
write_command(0x21);
write_data(0);
// write data to DRAM
write_command(0x22);
CLR_CS_SET_CD_RD_WR;
SET_MASK_P2;
uint32_t scanline[220];
// point to beginning of line in data
for(size_t y = 0; y < 176; ++y)
{
// find colours in one scanline
const uint8_t scanTypeIndex = (y >> 1);
switch(scanType[scanTypeIndex])
{
case 0: // 8bpp
{
// Point to the beginning of the line in data
const uint8_t * data = &screenBuffer[110 * scanTypeIndex];
size_t scanlineIndex = 0;
#define STEP() \
{ \
uint32_t colour = static_cast<uint32_t>(palette8bpp[*data]) << 3; \
++data; \
scanline[scanlineIndex] = colour; \
++scanlineIndex; \
scanline[scanlineIndex] = colour; \
++scanlineIndex; \
}
for(uint8_t x = 0; x < 11; ++x)
{
STEP();
STEP();
STEP();
STEP();
STEP();
STEP();
STEP();
STEP();
STEP();
STEP();
}
break;
#undef STEP
}
case 1: // 4bpp
{
size_t screenBufferIndex = 0;
size_t scanlineIndex = 0;
#define STEP() \
{ \
const uint8_t value = screenBuffer[screenBufferIndex]; \
++screenBufferIndex; \
\
const size_t paletteIndex1 = ((value >> 4) & 0x0F); \
const uint32_t colour1 = palette4bpp[paletteIndex1]; \
scanline[scanlineIndex] = colour1; \
++scanlineIndex; \
scanline[scanlineIndex] = colour1; \
++scanlineIndex; \
\
const size_t paletteIndex2 = ((value >> 0) & 0x0F); \
const uint32_t colour2 = palette4bpp[paletteIndex2]; \
scanline[scanlineIndex] = colour2; \
++scanlineIndex; \
scanline[scanlineIndex] = colour2; \
++scanlineIndex; \
}
for(uint8_t x = 0; x < 11; ++x)
{
STEP();
STEP();
STEP();
STEP();
STEP();
}
break;
#undef STEP
}
case 2: // 2bpp
{
uint8_t scanlineIndex = 0;
#define STEP() \
const uint8_t value = screenBuffer[screenBufferIndex]; \
++screenBufferIndex; \
\
const size_t paletteIndex0 = ((value >> 6) & 0x03); \
const size_t paletteIndex1 = ((value >> 4) & 0x03); \
const size_t paletteIndex2 = ((value >> 2) & 0x03); \
const size_t paletteIndex3 = ((value >> 0) & 0x03); \
\
scanline[scanlineIndex] = palette2bpp[paletteIndex0]; \
++scanlineIndex; \
scanline[scanlineIndex] = palette2bpp[paletteIndex1]); \
++scanlineIndex; \
scanline[scanlineIndex] = palette2bpp[paletteIndex2]; \
++scanlineIndex; \
scanline[scanlineIndex] = palette2bpp[paletteIndex3]; \
++scanlineIndex;
for(uint8_t x = 0; x < 11; ++x)
{
STEP();
STEP();
STEP();
STEP();
STEP();
}
#undef STEP
break;
}
}
#define WRITE_SCANLINE() \
*LCD = colour; \
colour = scanline[scanlineIndex] \
++scanlineIndex; \
TGL_WR_OP(colour);
#define WRITE_SCANLINE_2 \
WRITE_SCANLINE \
WRITE_SCANLINE
#define WRITE_SCANLINE_10 \
WRITE_SCANLINE_2 \
WRITE_SCANLINE_2 \
WRITE_SCANLINE_2 \
WRITE_SCANLINE_2 \
WRITE_SCANLINE_2
#define WRITE_SCANLINE_20 \
WRITE_SCANLINE_10 \
WRITE_SCANLINE_10
{
volatile uint32_t * LCD = reinterpret_cast<volatile uint32_t *>(0xA0002188);
uint32_t colour = scanline[0];
size_t scanlineIndex = 0;
WRITE_SCANLINE_20
WRITE_SCANLINE_20
WRITE_SCANLINE_20
WRITE_SCANLINE_20
WRITE_SCANLINE_20
WRITE_SCANLINE_20
WRITE_SCANLINE_20
WRITE_SCANLINE_20
WRITE_SCANLINE_20
WRITE_SCANLINE_20
WRITE_SCANLINE_20
}
#undef WRITE_SCANLINE
}
CLR_MASK_P2;
}
For the price of just 20 extra shifts per refresh (and a bit of extra memory and looping), it eliminates the 110 (2 * 5 * 11) shifts used per scanline in the 4bpp branch and the 220 (4 * 5 * 11) shifts used per scanline in the 2bpp branch.
So theoretically it’ll slow down the 8bpp branch but significantly speed up the 4bpp and 2bpp branches.