; Prot386 bootloader
; (C) 2006-2007 Peter Ambroz
; This code is free software and is distributed under the terms of GNU GPL
; Some parts were inspired by linux bootloader sources

; This code will be thrown into memory by the almighty BIOS
; Almost certainly, we are at linear address 0x7C00
; Only first sector (512B) was loaded. Tasks of this code are:
; - load remaining sectors (system code and data) into memory at 0x10000
; - switch into protected mode 386+
; - jump to the loaded code
;
; The code has to fit into 510 bytes (+2B for the magic sequence)

	.text
entry start

; start - boot sector entry point (as86 syntax, 16-bit real mode)
; Runs from where the BIOS left us (normally 0000:7C00), then:
;   1. sets up a stack and the data segments
;   2. records the BIOS memory map (getmem)
;   3. loads the system image to linear 0x10000 (load_system)
;   4. enables A20, remaps the 8259 PICs, builds GDT/IDT (setup_dt)
;   5. sets CR0.PE and far-jumps into the loaded 32-bit code
; Never returns.
start:
	xor ax, ax
	mov ss, ax
	mov sp, #0x7C00		; stack grows down from just below this sector

	mov ax, #0x03
	int 0x10		; ah=0, al=3: set video mode 3 (clears the screen)

	mov ax, #0x07C0
	mov ds, ax		; ds=0x07C0: offset 0 is the first byte of this sector
	mov es, ax

	; store the memory map: entry count at linear 0xFC00,
	; entries from linear 0xFC10 (see getmem)
	call getmem

	mov bp, #hello		; es:bp -> message; same address lea gave, 1 byte less
	call print		; print out the loading.. message

; The BIOS loaded only the first sector for us. All remaining sectors
; have to be loaded by ourselves, at our own risk. load_system reads
; them to linear address 0x10000 (above the first 64k). This works as
; long as the system part does not grow beyond 9*64k, because video RAM
; starts at 0xA0000. Should the system ever exceed 576k, the options are:
; 1. somehow get rid of the reserved memory - still at most 1 MB total
; 2. load directly above the 1st MB - only possible in protected mode,
;    so BIOS services such as int 0x13 are gone (the real work starts)
; 3. let lilo / grub do the dirty work, the way the linux kernel does

	call load_system

; stop floppy drive motor because it might stay spinning after entering
; protected mode. obviously, there is no timer that could stop it once
; interrupts are disabled (which happens right below).

	mov dx,#0x3f2
	xor al, al
	outb			; out dx,al - floppy digital output register = 0

; al is still 0 here, so reset the coprocessor now; doing it at this point
; saves reloading al later (the boot sector is only 510 bytes).
	out	#0xF1, al		; reset coprocessor

	cli		; we don't want interrupts while switching to pmode

; next we have to enable A20, so our memory requests >1MB won't get trashed
; this is done through the keyboard controller 8042 chip:
; command 0xD1 ("write output port") goes to the command port 0x64, the
; new output port value goes to the DATA port 0x60. The controller must
; have consumed the command before the data byte is written.

	call	empty_8042
	mov	al,#0xD1		; command: write output port
	out	#0x64,al
	call	empty_8042		; wait until the command was accepted
	mov	al,#0xDF		; output port value, A20 gate bit set
	out	#0x60,al		; data port (not the command port)
	call	empty_8042

; now it's time to reprogram interrupts. Otherwise there would be
; a conflict between IBM original int's (IRQ0-IRQ7 -> int08-int0f) and
; INTEL CPU event int's (int00-int1f). This will redirect HW IRQ00-IRQ0f
; to SW int20-int2f. It's done through 8259-1/2 interrupt controllers.
; After every out, we have to wait a few cycles so that controller
; gets ready. Finally all IRQ's except the keyboard (IRQ1) are masked.

	mov	al,#0x11		; initialization sequence
	out	#0x20,al		; send it to 8259A-1
	call jumper			; jmp $+2, jmp $+2
	out	#0xA0,al		; and to 8259A-2
	call jumper
	mov	al,#0x20		; start of hardware int's (0x20)
	out	#0x21,al
	call jumper
	mov	al,#0x28		; start of hardware int's 2 (0x28)
	out	#0xA1,al
	call jumper
	mov	al,#0x04		; 8259-1 is master
	out	#0x21,al
	call jumper
	mov	al,#0x02		; 8259-2 is slave
	out	#0xA1,al
	call jumper
	mov	al,#0x01		; 8086 mode for both
	out	#0x21,al
	call jumper
	out	#0xA1,al
	call jumper
	mov	al,#0xFD		; master mask: only keyboard (IRQ 1) enabled
	out	#0x21,al
	call jumper
	mov	al,#0xFF		; slave mask: everything disabled
	out	#0xA1,al

; Time to switch into protected mode. Say goodbye to BIOS routines.
	call setup_dt		; setup descriptor tables (gdt, idt); sets ds=0x07C0

	lidt idt_48		; load idt
	lgdt gdt_48		; load gdt

	mov eax, cr0
	or al, #0x21	; NE=1 (FPU errors via int 0x10), PE=1
;	and eax, #0x7FFFFFFF	; PG=0 (paging is not supported yet)
	mov cr0, eax

; NOTE(review): the offset 0x10000 does not fit into 16 bits. This relies on
; the assembler emitting an operand-size prefix with a 32-bit offset for this
; jmpi - confirm in the listing (otherwise encode 0x66,0xEA + dword + word).
	jmpi 0x10000,0x08	; far jump to protected mode, 8=CS selector

; OK, we're done. At this time, 32bit protected mode code is now running

; halt - stop the CPU for good; also the target of fatal disk errors.
halt:	
	cli
	hlt		; cpu will stop right here..
	jmp halt	; ..but it might get woken up by an NMI

; empty_8042 - wait until the 8042 keyboard controller can take another byte
; Polls status port 0x64 until bit 1 (input buffer full) is clear.
; in:  nothing
; out: al = last status byte read; flags modified
; Loops forever if the controller never clears the bit (no timeout).
empty_8042:
	call jumper		; short I/O delay before each status read
	in	al,#0x64	; 8042 status port
	test	al,#2		; is input buffer full?
	jnz	empty_8042	; yes - loop
	ret

; jumper - tiny delay used between port accesses
; The two words are hand-encoded "jmp $+2" instructions (opcode 0xEB,
; displacement 0): each one just jumps to the next instruction.
; Preserves all registers and flags.
jumper:
	.word	0x00eb,0x00eb	; jmp $+2, jmp $+2
	ret

; print - write a single character at the current cursor position
; Uses BIOS video services: int 0x10/ah=3 to fetch the cursor position,
; then int 0x10/ax=0x1301 (write string, advance cursor) with length 1.
; in:  es:bp -> character to print
; out: ax, bx, cx, dx are not preserved
print:
	xor bx, bx		; bh = 0: video page 0
	mov ah, #3		; function 3: read cursor position
	int 0x10		; -> dh = row, dl = column
	mov ax, #0x1301		; function 0x13, mode 1: write string, move cursor
	mov bl, #0x07		; attribute: light grey on black
	xor cx,cx
	inc cx			; cx = 1: string length is one character
	int 0x10
	ret

BOOTSEG = 0x07C0		; segment this boot sector runs in
GDTSEG  = 0x0800		; segment of the descriptor tables (linear 0x8000)
DTWORDS = 0x800			; 0x800 words = 4 KiB cleared at GDTSEG:0

; setup_dt - prepare the descriptor table area for protected mode
; Zero-fills DTWORDS words at GDTSEG:0 (the gdt_48 and idt_48 bases,
; 0x8000 and 0x8800, both fall inside this range), then copies the
; gdt_size words of the template at label gdt to GDTSEG:0.
; in:  nothing
; out: ds = BOOTSEG, es = GDTSEG, ax = 0, cx = 0, si/di advanced,
;      direction flag cleared
setup_dt:
	cld			; string operations count upwards
	mov ax, #GDTSEG
	mov es, ax		; es:di = destination for both string ops
	mov ax, #BOOTSEG
	mov ds, ax		; ds:si = source, also used to read gdt_size

	; wipe the whole table area
	xor ax, ax
	mov cx, #DTWORDS
	xor di, di
	rep
	stosw

	; copy the GDT template to the start of the wiped area
	mov cx, gdt_size	; number of words in the template
	mov si, #gdt
	xor di, di
	rep
	movsw

	ret

SMAP = 0x534d4150		; 'SMAP' signature expected in edx by int 0x15/E820

; getmem - store the BIOS memory map (int 0x15, eax=0xE820)
; Entries are written to es:0x8010 onwards in 32-byte slots; the number of
; valid entries is stored as a word at es:0x8000. With es=0x07C0 (as set
; by the caller) that is linear 0xFC10 and 0xFC00.
; in:  es = segment of the map buffer
; out: es:[0x8000] = entry count (0 if the BIOS does not support E820)
;      eax, ebx, ecx, edx, edi are not preserved
; There is no upper bound on the number of entries accepted.
getmem:
	xor ebx, ebx		; continuation value 0 = start of the map
	xor edi, edi
	; es = 0x7C0, es:edi = 0xFBF0 (one slot before the first entry)
	mov di, #0x7FF0
_rep_gm:
	; load all of eax on every call: the BIOS returns 'SMAP' in eax, and
	; "mov ax" alone would leave 0x534D in the upper half of the function code
	mov eax, #0xE820
	xor ecx, ecx
	mov cl, #0x20		; buffer size offered to the BIOS = slot size
	add di, cx		; advance to the slot for this call
	mov edx, #SMAP
	int 0x15
	jc _end_gm		; CF=1: unsupported or end of list, slot not valid
	or ebx, ebx		; ebx=0: that was the last entry (also clears CF)
	jnz _rep_gm
_end_gm:
	; CF is 1 here only when the last call failed (jc taken); on the normal
	; exit "or" has cleared it. sbb therefore subtracts one extra byte in
	; the failure case, which makes the shift below drop the slot of the
	; failed call: count = (di - 0x7FF0 - CF) / 32.
	mov ax, di		; mov leaves CF untouched
	sbb ax, #0x7FF0
	shr ax, #5
	.byte 0x26		; es: segment override for the next instruction
	mov [0x8000], ax
	ret

; some initial values
DRIVE = 0		; BIOS drive number used for reading
SREAD = 1		; sectors already read on the first track (the boot sector)
HEAD = 0
TRACK = 0
RETRY = 0
RETRIES = 3		; read attempts per read_track call
SPT = 18		; sectors per track (1.44 Mb floppy = 18)
SYSSEG = 0x1000		; base for loading system
SYSSIZE = (65520+15) / 16	; image size in paragraphs (= 0x0FFF)
ENDSEG = SYSSEG + SYSSIZE	; loading stops once es >= ENDSEG

; load_system - read the system image from disk to SYSSEG:0
; Reads as many sectors per BIOS call as fit both into the rest of the
; current track and into the rest of the current 64k segment, advancing
; sread/head/track and es:bx after every call. es moves up by 0x1000 each
; time bx wraps; with the constants above the loop ends after the first
; 64k segment has been filled. One '#' is printed per loop pass.
; in:  ds = segment of this boot sector
; out: es = first segment not loaded, bx = 0; ax, cx are not preserved
; Does not return on a disk error (read_track jumps to halt).
load_system:
	mov sread, #SREAD	; initialize variables
	mov head, #HEAD
	mov track, #TRACK
	mov retry, #RETRY

	mov ax, #SYSSEG
	mov es, ax

	xor bx,bx		; es:bx = load address
rep_read:
	; progress marker: print needs es:bp, so point es at our data
	; segment for the call and restore the load segment afterwards
	push es
	push ds
	pop es
	lea bp, debug
	pusha
	call print
	popa
	pop es

	mov ax,es
	cmp ax,#ENDSEG		; everything loaded?
	jb ok1_read

	ret
ok1_read:
	mov ax,#SPT
	sub ax,sread		; ax = sectors left on the current track
	mov cx,ax
	shl cx,#9		; cx = the same amount in bytes (512 per sector)
	add cx,bx		; would the read run past the end of the segment?
	jnc ok2_read		; no carry: it fits
	je ok2_read		; carry with zero result: it ends exactly at 64k
	xor ax,ax
	sub ax,bx		; ax = bytes left in this segment (0x10000 - bx)
	shr ax,#9		; ax = sectors that still fit
ok2_read:
	call read_track		; read ax sectors to es:bx (registers preserved)
	mov cx,ax		; cx = sectors just read
	add ax,sread
	cmp ax,#SPT		; track finished?
	jne ok3_read
	mov ax,#1
	sub ax,head		; ax = the other head (1 - head)
	jne ok4_read		; head was 0: stay on this track, switch to head 1
	inc track		; head was 1: go on to the next track
ok4_read:
	mov head,ax
	xor ax,ax		; start at the first sector again
ok3_read:
	mov sread,ax
	shl cx,#9		; bytes just read
	add bx,cx
	jnc rep_read		; still inside the current 64k segment
	mov ax,es
	add ax,#0x1000		; segment full: move on to the next 64k
	mov es,ax
	xor bx,bx
	jmp rep_read

; read sectors from media

; read_track - read sectors from the current position into es:bx
; Position comes from the variables track, head and sread (sectors already
; read on this track; the BIOS sector number is sread+1).
; in:  al = number of sectors to read, es:bx = destination buffer
; out: all registers preserved
; On a read error the drive is reset and the read is repeated, up to
; RETRIES attempts in total; after that the machine is halted.
read_track:
	pusha
	mov retry, #RETRIES
read_again:
	mov ah, #2		; BIOS function 2: read sectors (al = count)
	mov cx, sread
	inc cx			; cl = first sector to read (1-based)
	mov dx, track
	mov ch, dl		; ch = track
	mov dx, head
	mov dh, dl		; dh = head
	mov dl, #DRIVE
	and dx, #0x0100		; keep head bit 0 only; this also forces dl to 0
	int 0x13
	jc read_err
	popa
	ret
read_err:
	xor ax, ax		; BIOS function 0: reset disk system
	mov dl, #DRIVE
	int 0x13
	; The failed call left its error status / transferred count in ax, so
	; ax must not be reused for the retry. Reload the caller's registers
	; (al = requested sector count) from the pusha frame and save them
	; again for the final popa.
	popa
	pusha
	dec retry
	jnz read_again

	jmp NEAR halt		; out of retries: give up


; ---- variables used by load_system / read_track ----
sread:  .word 1		; sectors already read on the current track
head:	.word 0		; current head
track:	.word 0		; current track
retry:	.word 0		; read retries left

; ---- one-character messages for print ----
hello:	.ascii ":"	; printed once before loading starts
debug:	.ascii "#"	; printed on every pass of the load loop

; ---- operands for lidt / lgdt: 16-bit limit, then 32-bit linear base ----
idt_48:
	.word	0x7FF		; idt limit=2047, 256 IDT vectors
	.word	0x8800,0x0	; idt base

gdt_48:
	.word	0x7FF		; gdt limit=2047, 256 GDT entries
	.word	0x8000,0x0	; gdt base

; ---- GDT template, copied to linear 0x8000 by setup_dt ----
gdt:
	.word	0,0,0,0		; dummy

	;CODE SEGMENT DESCRIPTOR, selector=8
	.word	0xFFFF		; Limit = 1MB (1MB*1B granular) [15-0]
	.word	0x0000		; base address=0 [15-0]
	.byte   0x00		; base address[23-16]
	.byte	0x9A		; present,dpl=0,non-system,code,non-conform,read
	.byte	0x4F		; granularity=1B,32bit,limit[19-16]
	.byte	0x00		; base address[31-24]

	;DATA SEGMENT DESCRIPTOR, selector=16
	.word	0xFFFF		; Limit = 4GB (1MB*4kB granular) [15-0]
	.word	0x0000		; base address=0 [15-0]
	.byte   0x00		; base address[23-16]
	.byte	0x92		; present,dpl=0,non-system,data,expand-up,write
	.byte	0xCF		; granularity=4kB,32bit,limit[19-16]
	.byte	0x00		; base address[31-24]
	
	;DATA SEGMENT DESCRIPTOR, selector=24 (extra segment for video output)
	.word	0x7FFF		; Limit = 32kB [15-0]
	.word	0x8000		; base address=B8000 [15-0]
	.byte   0x0B		; base address[23-16]
	.byte	0xF2		; present,dpl=3,non-system,data,expand-up,write
	.byte	0x40		; granularity=1B,32bit,limit[19-16]
	.byte	0x00		; base address[31-24]

	;OLD STACK SEGMENT DESCRIPTOR (disabled; it used selector 24, which
	;now belongs to the video descriptor above)
;	.word	0xFFF8		; Limit = -0x7000 (FFFF8FFF) [15-0]
;	.word	0x7C00		; base address=0x7C00 [15-0]
;	.byte   0x00		; base address[23-16]
;	.byte	0x96		; present,dpl=0,non-system,data,expand-down,write
;	.byte	0xCF		; granularity=4kB,32bit,limit[19-16]
;	.byte	0x00		; base address[31-24]

; C calling convention relies on the same base & limit for DS and SS
; So the SS selector is equal to DS selector

	;TSS DESCRIPTOR
	;Implemented in tss.s
	;NOTE(review): this was labelled selector=24, but 24 is the video
	;descriptor; presumably the TSS uses the next free slot (32) - confirm
	;against tss.s
	
gdt_size: .word (* - gdt)/2	; template length in words (setup_dt copies words)
	
	.org 510		; pad up to the boot signature
	.word 0xAA55		; magic sequence in the last two bytes of the sector