; AES-256 implementation for 6502 (ACME assembler syntax)
; Robert Oestling
; http://www.robos.org/

!to "aes.prg",cbm               ; output file name, written in cbm format
!convtab scr                    ; text emitted with !tx is converted to screen codes

; BASIC stub "10 SYS2061" so that the program can be started with RUN.
; Bytes from $0801:
;   $0b,$08          link to the next BASIC line ($080b)
;   $0a,$00          line number 10
;   $9e              SYS token
;   $32,$30,$36,$31  the digits "2061"
;   $00              end of line (this byte sits at $080a)
;   $00,$00          zero link at $080b = end of BASIC program
; The stub occupies $0801-$080c, so the code that follows starts at
; $080d = 2061, which is the address the SYS jumps to.
* = $0801
!byte $0b,$08,$0a,$00,$9e,$32,$30,$36,$31,$00,$00,$00

; Memory map.  The helper tables that gentabs builds first (exptab, logtab,
; invtab) share their pages with the tables the cipher uses afterwards
; (sbox, ssm0, ssm3), so the helpers are overwritten as gentabs proceeds.
data_area   = $2000
expkey      = data_area + $0000     ; expanded key, $f0 bytes
aesblock    = data_area + $00f0     ; data block, $10 bytes
exptab      = data_area + $0100     ; exptab[0] = 1, exptab[i] = 3 * exptab[i-1] in GF(2^8)
logtab      = data_area + $0200     ; logtab[exptab[i]] = i for i = 1..255
invtab      = data_area + $0300     ; invtab[i] = exptab[$ff - logtab[i]]
sbox        = data_area + $0100     ; S-box (same page as exptab)
ssm0        = data_area + $0200     ; 2 * sbox[i] (same page as logtab)
ssm1        = sbox                  ; 1 * sbox[i]: plain alias of sbox
ssm2        = sbox                  ; 1 * sbox[i]: plain alias of sbox
ssm3        = data_area + $0300     ; 3 * sbox[i] (same page as invtab)

tmpblock    = $e0                   ; zero page: 16-byte working copy of the block (encrypt)
tmp         = $f0                   ; zero page: scratch, tmp+0 and tmp+1 are used

!zone main
; main: program entry point (the BASIC stub does a SYS to this address).
;
; Builds the lookup tables, installs a test key (bytes $00..$1f) and a test
; block (bytes $00..$0f), expands the key, runs encrypt on the block and
; prints the 16 bytes of aesblock as 32 hex digits starting at $0400.
; Never returns: interrupts stay disabled and the routine ends in an
; endless loop.
main

    sei
    cld                     ; gentabs (sbc) and encrypt (adc) need binary arithmetic

    jsr gentabs

    ; Test key: expkey[i] = i for i = 0..$1f (first 32 bytes of the key area).
    ldx #$1f
.setkey
    txa
    sta expkey,x
    dex
    bpl .setkey

    ; Test block: aesblock[i] = i for i = 0..$0f.
    ldx #$0f
.setdata
    txa
    sta aesblock,x
    dex
    bpl .setdata

    jsr expand

    ; NOTE(review): encrypt runs eight times back to back, each pass
    ; re-encrypting the output of the previous one, so the digits shown are
    ; not the ciphertext of a single encryption - presumably kept for
    ; timing measurements; confirm before relying on the displayed value.
    jsr encrypt
    jsr encrypt
    jsr encrypt
    jsr encrypt
    jsr encrypt
    jsr encrypt
    jsr encrypt
    jsr encrypt

    ; Y = screen offset (printbyte advances it by two per byte),
    ; X = Y / 2 = index of the byte to show.  printbyte overwrites X, so the
    ; index is recomputed from Y on every pass.
    ldy #0
.print
    tya
    lsr
    tax
    lda aesblock,x
    jsr printbyte
    cpy #$20                ; 16 bytes * 2 digits
    bne .print

    jmp *                   ; done: loop forever

; printbyte: write one byte as two hex digits into screen memory.
;   In:   A = byte to show
;         Y = offset from .screen at which the first digit goes
;   Out:  Y = Y + 2
;   Uses: A, X (both overwritten), one byte of stack
.screen = $0400                 ; destination of the digits

printbyte
    pha                         ; keep the byte for the second digit
    lsr                         ; move the high nibble ...
    lsr
    lsr
    lsr                         ; ... down into bits 0-3
    tax
    lda .hexdigits,x
    sta .screen,y               ; first digit: high nibble
    iny
    pla
    and #%00001111              ; isolate the low nibble
    tax
    lda .hexdigits,x
    sta .screen,y               ; second digit: low nibble
    iny
    rts

; Digit characters indexed by nibble value (converted by the active !convtab).
.hexdigits
    !tx "0123456789abcdef"


!zone gentabs
; gentabs: build every lookup table needed by expand and encrypt.
;
; Works in five passes over a 256-entry index (Y wraps from $ff to 0 to end
; each pass, so Y is 0 again when the next pass starts):
;   1. exptab  - successive powers of 3 in GF(2^8)
;   2. logtab  - inverse mapping of exptab
;   3. invtab  - exptab[$ff - logtab[i]]
;   4. sbox    - built from invtab; overwrites exptab entry by entry
;   5. ssm0 / ssm3 - 2*sbox and 3*sbox; overwrite logtab and invtab
;
;   In:   nothing
;   Out:  sbox, ssm0, ssm3 filled (ssm1 and ssm2 are aliases of sbox)
;   Uses: A, X, Y, tmp+0, tmp+1
gentabs
    ; ---- pass 1: exptab[0] = 1, exptab[i] = 3 * exptab[i-1] ----
    lda #$01
    sta exptab
    tay                         ; Y = 1: first entry still to compute
.nextpower
    asl                         ; A = 2 * previous entry
    bcc .power_reduced
    eor #$1b                    ; bit 7 fell out: fold it back in
.power_reduced
    eor exptab-1,y              ; 2*p xor p = 3*p
    sta exptab,y
    iny
    bne .nextpower

    ; ---- pass 2: logtab[0] = 0, logtab[exptab[i]] = i for i = 1..255 ----
    sty logtab                  ; Y = 0 here
    iny
.nextlog
    ldx exptab,y
    tya
    sta logtab,x
    iny
    bne .nextlog

    ; ---- pass 3: invtab[0] = 0, invtab[i] = exptab[$ff - logtab[i]] ----
    sty invtab                  ; Y = 0 here
    iny
.nextinv
    lda #$ff
    sec
    sbc logtab,y
    tax
    lda exptab,x
    sta invtab,y
    iny
    bne .nextinv

    ; ---- pass 4: sbox[i] = v ^ rol(v,1) ^ rol(v,2) ^ rol(v,3) ^ rol(v,4) ^ $63
    ;      with v = invtab[i].  tmp+0 holds the rotating value, tmp+1 the
    ;      running xor.
.nextsbox
    lda invtab,y
    sta tmp+0
    eor #$63
    sta tmp+1
    ldx #4
.rotate
    lda tmp+0
    cmp #$80                    ; carry = bit 7, so rol is an 8-bit rotate
    rol
    sta tmp+0
    eor tmp+1
    sta tmp+1
    dex
    bne .rotate
    sta sbox,y                  ; A still holds the finished xor
    iny
    bne .nextsbox

    ; ---- pass 5: ssm0[i] = 2 * sbox[i], ssm3[i] = 3 * sbox[i] ----
.nextmul
    lda sbox,y
    asl
    bcc .mul_reduced
    eor #$1b
.mul_reduced
    sta ssm0,y
    eor sbox,y
    sta ssm3,y
    iny
    bne .nextmul

    rts

!zone expand
; expand: extend the 32-byte key at expkey to the full $f0-byte key schedule.
;
; Y is the byte offset of the 4-byte word being read at .old; the word that
; gets written is always $20 bytes further on.  Three cases, chosen by
; Y and $1f:
;   $00   new = old ^ sbox[rotated previous word] ^ round constant
;   $10   new = old ^ sbox[previous word]
;   else  new = old ^ previous word
;
;   In:   expkey+$00..$1f = key, sbox valid (see gentabs)
;   Out:  expkey+$20..$ef filled
;   Uses: A, X, Y, tmp+0 (round constant), tmp+1 (byte counter)
.old    = expkey                ; word $20 bytes before the new one
.prev   = expkey + $1c          ; word directly before the new one
.new    = expkey + $20          ; word being generated
.rcon   = tmp + 0
.count  = tmp + 1
.keyend = $f0 - $20             ; Y once the last word has been written

expand
    lda #$01
    sta .rcon
    ldy #$00
.word
    tya
    and #$1f
    bne .inside

    ; Start of a 32-byte group: previous word rotated by one byte, passed
    ; through sbox, round constant added to byte 0.
    ldx .prev+0,y
    lda sbox,x
    eor .old+3,y
    sta .new+3,y
    ldx .prev+3,y
    lda sbox,x
    eor .old+2,y
    sta .new+2,y
    ldx .prev+2,y
    lda sbox,x
    eor .old+1,y
    sta .new+1,y
    ldx .prev+1,y
    lda sbox,x
    eor .rcon
    eor .old+0,y
    sta .new+0,y
    iny
    iny
    iny
    iny
    asl .rcon                   ; next round constant
    bne .check                  ; taken every time: only 7 shifts happen, $01 -> $80

.inside
    cmp #$10                    ; A = Y and $1f
    bne .plain

    ; Middle of a 32-byte group: previous word through sbox, no rotation.
    lda #4
    sta .count
.subbyte
    ldx .prev,y
    lda sbox,x
    eor .old,y
    sta .new,y
    iny
    dec .count
    bne .subbyte
    beq .check                  ; always taken

    ; Any other word: plain xor with the previous word.
.plain
    ldx #4
.xorbyte
    lda .old,y
    eor .prev,y
    sta .new,y
    iny
    dex
    bne .xorbyte

.check
    cpy #.keyend
    beq .finished
    jmp .word
.finished
    rts

; ~10318 cycles without jsr/rts (~645 per byte)
!zone encrypt
; encrypt: encrypt the 16-byte block at aesblock in place with the expanded
; key at expkey.
;
; Structure:
;   .addfirst   tmpblock = aesblock ^ expkey[$00..$0f]
;   .round      for Y = $10, $20, .. $d0 (13 passes): every output byte is
;               expkey[Y+n] xored with four table lookups, written to
;               aesblock, then the block is copied back to tmpblock
;   .lastround  aesblock[n] = sbox[tmpblock[.subtab[n]]] ^ expkey[$e0+n]
;
; Table roles in .round: ssm0 = 2*sbox, ssm3 = 3*sbox, ssm1 = ssm2 = sbox
; (see gentabs), so one group of four lookups yields one byte of a column
; after byte substitution, row shifting and column mixing.
;
;   In:   aesblock = input block, expkey = expanded key, tables built
;   Out:  aesblock = output block
;   Uses: A, X, Y, tmpblock (16 zero-page bytes)
encrypt
    ; Initial key addition, two bytes per pass (X = 7..0).
    ldx #$07
.addfirst
    lda aesblock+0,x    ; 4
    eor expkey+0,x      ; 8
    sta tmpblock+0,x    ; 13
    lda aesblock+8,x    ; 17
    eor expkey+8,x      ; 21
    sta tmpblock+8,x    ; 26
    dex                 ; 28
    bpl .addfirst       ; 31

    ; Y = offset of the current round key inside expkey.
    ldy #$10
.round
    ; ---- output bytes $00-$03: inputs tmpblock[0], [5], [10], [15] ----
    lda expkey+$00,y    ; 4
    ldx tmpblock+4*0+0  ; 7
    eor ssm0,x          ; 11
    ldx tmpblock+4*1+1  ; 14
    eor ssm3,x          ; 18
    ldx tmpblock+4*2+2  ; 21
    eor ssm2,x          ; 25
    ldx tmpblock+4*3+3  ; 28
    eor ssm1,x          ; 32
    sta aesblock+$00    ; 36

    lda expkey+$01,y
    ldx tmpblock+4*0+0
    eor ssm1,x
    ldx tmpblock+4*1+1
    eor ssm0,x
    ldx tmpblock+4*2+2
    eor ssm3,x
    ldx tmpblock+4*3+3
    eor ssm2,x
    sta aesblock+$01

    lda expkey+$02,y
    ldx tmpblock+4*0+0
    eor ssm2,x
    ldx tmpblock+4*1+1
    eor ssm1,x
    ldx tmpblock+4*2+2
    eor ssm0,x
    ldx tmpblock+4*3+3
    eor ssm3,x
    sta aesblock+$02

    lda expkey+$03,y
    ldx tmpblock+4*0+0
    eor ssm3,x
    ldx tmpblock+4*1+1
    eor ssm2,x
    ldx tmpblock+4*2+2
    eor ssm1,x
    ldx tmpblock+4*3+3
    eor ssm0,x
    sta aesblock+$03

    ; ---- output bytes $04-$07: inputs tmpblock[4], [9], [14], [3] ----
    lda expkey+$04,y
    ldx tmpblock+4*1+0
    eor ssm0,x
    ldx tmpblock+4*2+1
    eor ssm3,x
    ldx tmpblock+4*3+2
    eor ssm2,x
    ldx tmpblock+4*0+3
    eor ssm1,x
    sta aesblock+$04

    lda expkey+$05,y
    ldx tmpblock+4*1+0
    eor ssm1,x
    ldx tmpblock+4*2+1
    eor ssm0,x
    ldx tmpblock+4*3+2
    eor ssm3,x
    ldx tmpblock+4*0+3
    eor ssm2,x
    sta aesblock+$05

    lda expkey+$06,y
    ldx tmpblock+4*1+0
    eor ssm2,x
    ldx tmpblock+4*2+1
    eor ssm1,x
    ldx tmpblock+4*3+2
    eor ssm0,x
    ldx tmpblock+4*0+3
    eor ssm3,x
    sta aesblock+$06

    lda expkey+$07,y
    ldx tmpblock+4*1+0
    eor ssm3,x
    ldx tmpblock+4*2+1
    eor ssm2,x
    ldx tmpblock+4*3+2
    eor ssm1,x
    ldx tmpblock+4*0+3
    eor ssm0,x
    sta aesblock+$07

    ; ---- output bytes $08-$0b: inputs tmpblock[8], [13], [2], [7] ----
    lda expkey+$08,y
    ldx tmpblock+4*2+0
    eor ssm0,x
    ldx tmpblock+4*3+1
    eor ssm3,x
    ldx tmpblock+4*0+2
    eor ssm2,x
    ldx tmpblock+4*1+3
    eor ssm1,x
    sta aesblock+$08

    lda expkey+$09,y
    ldx tmpblock+4*2+0
    eor ssm1,x
    ldx tmpblock+4*3+1
    eor ssm0,x
    ldx tmpblock+4*0+2
    eor ssm3,x
    ldx tmpblock+4*1+3
    eor ssm2,x
    sta aesblock+$09

    lda expkey+$0a,y
    ldx tmpblock+4*2+0
    eor ssm2,x
    ldx tmpblock+4*3+1
    eor ssm1,x
    ldx tmpblock+4*0+2
    eor ssm0,x
    ldx tmpblock+4*1+3
    eor ssm3,x
    sta aesblock+$0a

    lda expkey+$0b,y
    ldx tmpblock+4*2+0
    eor ssm3,x
    ldx tmpblock+4*3+1
    eor ssm2,x
    ldx tmpblock+4*0+2
    eor ssm1,x
    ldx tmpblock+4*1+3
    eor ssm0,x
    sta aesblock+$0b

    ; ---- output bytes $0c-$0f: inputs tmpblock[12], [1], [6], [11] ----
    lda expkey+$0c,y
    ldx tmpblock+4*3+0
    eor ssm0,x
    ldx tmpblock+4*0+1
    eor ssm3,x
    ldx tmpblock+4*1+2
    eor ssm2,x
    ldx tmpblock+4*2+3
    eor ssm1,x
    sta aesblock+$0c

    lda expkey+$0d,y
    ldx tmpblock+4*3+0
    eor ssm1,x
    ldx tmpblock+4*0+1
    eor ssm0,x
    ldx tmpblock+4*1+2
    eor ssm3,x
    ldx tmpblock+4*2+3
    eor ssm2,x
    sta aesblock+$0d

    lda expkey+$0e,y
    ldx tmpblock+4*3+0
    eor ssm2,x
    ldx tmpblock+4*0+1
    eor ssm1,x
    ldx tmpblock+4*1+2
    eor ssm0,x
    ldx tmpblock+4*2+3
    eor ssm3,x
    sta aesblock+$0e

    lda expkey+$0f,y
    ldx tmpblock+4*3+0
    eor ssm3,x
    ldx tmpblock+4*0+1
    eor ssm2,x
    ldx tmpblock+4*1+2
    eor ssm1,x
    ldx tmpblock+4*2+3
    eor ssm0,x
    sta aesblock+$0f

    ; Copy the finished round output back to the zero-page working block,
    ; four bytes per pass (X = 3..0).
    ldx #$03
.copyblock
; Reducing the unrolling by half saves 10 bytes, at the cost of 260 cycles per
; block.
    lda aesblock+0,x    ; 4
    sta tmpblock+0,x    ; 8
    lda aesblock+4,x    ; 12
    sta tmpblock+4,x    ; 16
    lda aesblock+8,x    ; 20
    sta tmpblock+8,x    ; 24
    lda aesblock+12,x   ; 28
    sta tmpblock+12,x   ; 32
    dex                 ; 34
    bpl .copyblock      ; 37

    ; Advance Y to the next 16-byte round key.
    clc
    tya
    adc #$10
    tay

    ; The round key at $e0 belongs to the final round, handled separately.
    cpy #$e0
    beq .lastround
    jmp .round

    ; Final round: table-driven byte selection and sbox lookup, then the
    ; last round key; no ssm0/ssm3 lookups here.  Y = 15..0 is the output index.
.lastround
    ldy #$0f
.ssa
; Computing x directly as 5*y mod 16 saves 8 bytes, but requires another 224
; cycles per block
;
;    tya                 ; 2
;    sta tmp+0           ; 5
;    asl                 ; 7
;    asl                 ; 9
;    clc                 ; 11
;    adc tmp+0           ; 14
;    and #$0f            ; 16
;    tax                 ; 18
    ldx .subtab,y       ; 4
    lda tmpblock,x      ; 8
    tax                 ; 10
    lda sbox,x          ; 14
    eor expkey+$e0,y    ; 18
    sta aesblock,y      ; 23
    dey                 ; 25
    bpl .ssa            ; 28

    rts

; Source index in tmpblock for each output byte of the final round
; (.subtab[n] = 5*n mod 16, the same byte selection .round has hard-coded).
.subtab
    !byte 4*0+0, 4*1+1, 4*2+2, 4*3+3
    !byte 4*1+0, 4*2+1, 4*3+2, 4*0+3
    !byte 4*2+0, 4*3+1, 4*0+2, 4*1+3
    !byte 4*3+0, 4*0+1, 4*1+2, 4*2+3


