Also, arnezami, do you think you can also post the code, not just the patch for the binary, for printing out the SC calls in the rebooter?
Thanks a lot Ramzi

.
Sure I can post the code itself (and explain a little).
This is the routine that handles the SC table. It is called from the code at 0xC00, which is where a CPU automatically jumps when it executes an SC (system call) instruction:
// System-call (SC) dispatcher.
// %r0 is range-checked against 0x61 and then used as an index into a table of
// 32-bit entries at offset 0x1F68; the selected entry is called through LR
// with blrl. On return the scratch registers are zeroed and rfid is executed.
000013D8 cmplwi %r0, 0x61
// NOTE(review): BO=6 looks like "branch when the lt bit is clear", i.e. go to
// loc_D7C when %r0 >= 0x61 — confirm against the raw opcode.
000013DC bc 6, lt, loc_D7C
000013E0 mtocrf cr4, %r4
// %sp is used as a scratch register here: it receives the incoming LR, and
// both %r4 and that LR are stored in the area addressed by %r13.
000013E4 mflr %sp
000013E8 std %r4, 0x28(%r13)
000013EC std %sp, 0x20(%r13)
// %sp = %r0 << 2: byte offset of the table entry (entries are loaded with lwz).
000013F0 rldicr %sp, %r0, 2,61
000013F4 lwz %r4, 0x1F68(%sp) // start of SC table
// The table entry becomes the call target for the blrl at 0x1414.
000013F8 mtlr %r4
000013FC ld %r4, 0x48(%r13)
// Stack pointer for the handler = SPR 0x131 + 0x1F00.
00001400 mfspr %sp, 0x131
00001404 addi %sp, %sp, 0x1F00
// Patched instruction: plain branch to the logging code at 0xECC, which
// branches back to 0x140C. A `b` (not `bl`) is used so LR, already loaded with
// the handler address, is left untouched by the branch itself.
00001408 b loc_ECC // patch
// %rtoc = 2 << 32 = 0x0000000200000000.
0000140C li %rtoc, 2
00001410 rldicr %rtoc, %rtoc, 32,31
// Call the selected table entry.
00001414 blrl
// Zero %r0 and %r4-%r12 before returning from the call.
00001418 li %r0, 0
0000141C li %r4, 0
00001420 li %r5, 0
00001424 li %r6, 0
00001428 li %r7, 0
0000142C li %r8, 0
00001430 li %r9, 0
00001434 li %r10, 0
00001438 li %r11, 0
0000143C li %r12, 0
// Reload %rtoc from the top of the stack (the code at 0xF14 pushes it with
// stdu before returning to 0x140C), then restore LR from 0x20(%r13), %sp from
// 0x38(%r13) and %r13 from SPRG0.
00001440 ld %rtoc, 0(%sp)
00001444 ld %sp, 0x20(%r13)
00001448 mtlr %sp
0000144C ld %sp, 0x38(%r13)
00001450 mfsprg0 %r13
00001454 rfid
As you can see, it's patched at 0x1408 to branch to my code at 0xECC (using b, not bl, so I'm not fiddling with the Link Register, which is used at 0x1414). At that address I first save some registers on the stack so that I can use them later on. Then I call my get-semaphore (sub_68C), serial-output (sub_1530) and release-semaphore (sub_3C0) functions with 'S' as input:
// Logging hook for the SC dispatcher, entered by the `b loc_ECC` at 0x1408.
// Prints a line tagged 'S' with the low byte of %r0 (copied to %r11), then
// resumes the dispatcher at 0x140C.
//
// Save %r11, %r12, %r4 and %r3 below the current stack pointer, then open a
// 0x90-byte frame so the called routines' own saves land below these slots.
00000ECC std %r11, -8(%sp)
00000ED0 std %r12, -0x30(%sp)
00000ED4 std %r4, -0x18(%sp)
00000ED8 std %r3, -0x10(%sp)
00000EDC stdu %sp, -0x90(%sp)
// Keep LR (the handler address set at 0x13F8) in %r4 across the three calls;
// sub_68C, sub_1530 and sub_3C0 each store and reload %r4.
00000EE0 mflr %r4
// %r11 = value to print; sub_1530 prints the low byte of %r11 as two hex digits.
00000EE4 mr %r11, %r0
00000EE8 li %r3, 0x53 # 'S'
// Acquire the serial semaphore, print the line, release the semaphore.
00000EEC bl sub_68C
00000EF0 bl sub_1530
00000EF4 bl sub_3C0
00000EF8 nop
// Put the handler address back in LR, drop the frame and restore the four
// saved registers.
00000EFC mtlr %r4
00000F00 addi %sp, %sp, 0x90
00000F04 ld %r3, -0x10(%sp)
00000F08 ld %r4, -0x18(%sp)
00000F0C ld %r11, -8(%sp)
00000F10 ld %r12, -0x30(%sp)
// Push %rtoc (store at %sp-8 and update %sp); the dispatcher reloads it with
// `ld %rtoc, 0(%sp)` at 0x1440.
// NOTE(review): presumably this is the original instruction displaced from
// 0x1408 by the patch — confirm against an unpatched image.
00000F14 stdu %rtoc, -8(%sp)
00000F18 b loc_140C
Similarly, right after a CPU has entered the HV and its stack pointer has been initialized, I patched the code (at 0x1D88):
// HV entry path, shown around the patched instruction at 0x1D88.
00001D84 addi %sp, %r4, 0x1F00 // initialize stack pointer
// Patched instruction: branch to the logging code at 0x17B0. That code ends
// with `li %rtoc, 2` followed by `b loc_1D8C`.
// NOTE(review): presumably `li %rtoc, 2` is the original instruction that was
// displaced from this address — confirm against an unpatched image.
00001D88 b loc_17B0 // patch
// Execution resumes here after the hook: shift %rtoc left by 32 bits.
00001D8C rldicr %rtoc, %rtoc, 32,31
// When %r21 == 0, call Execute_59_and_5A.
00001D90 cmplwi %r21, 0
00001D94 beql Execute_59_and_5A
00001D98 li %r5, 0
00001D9C slbia
Here I put some code at 0x17B0, which is shown below. (In this case I didn't have to make sure my registers weren't destroyed, because some of them are reset anyway in the function that does POST 0x59 and 0x5A.) This again calls the three functions:
// Logging hook for the HV entry path, entered by the `b loc_17B0` at 0x1D88.
// Prints a line tagged 'C' with the low byte of %r21, then resumes at 0x1D8C.
// No registers are saved by this hook itself.
//
// %r11 = value to print; sub_1530 prints the low byte of %r11 as two hex digits.
000017B0 mr %r11, %r21
// Acquire the serial semaphore first; the tag character is loaded afterwards.
000017B4 bl sub_68C
000017B8 li %r3, 0x43 # 0x80000102 # 'C'
000017BC nop
000017C0 nop
// Print the line, then release the semaphore.
000017C4 bl sub_1530
000017C8 bl sub_3C0
// Set %rtoc = 2 before rejoining the patched code path at 0x1D8C, where it is
// shifted left by 32 bits.
000017CC li %rtoc, 2
000017D0 b loc_1D8C
And similarly for capturing output to the POST bus. This is the function that does that:
// POST-bus output function. Takes the POST value in %r3, writes it to the POST
// bus register, and (through the patch at 0x2D94) also logs it to serial.
// Returns 0 in %r3.
//
// Prologue: save LR (via %r12) and %r31 below %sp, then open a 0x60-byte frame.
00002D68 mflr %r12
00002D6C std %r12, -8(%sp)
00002D70 std %r31, -0x10(%sp)
00002D74 stdu %sp, -0x60(%sp)
// Copy the input value to %r11, since %r3 is reused as the argument to sub_B38.
00002D78 mr %r11, %r3
00002D7C lis %r3, loc_61000@h
// %r31 = %r11 rotated left by 56 with only the top 8 bits kept, i.e. the low
// byte of the input moved into the most significant byte.
00002D80 rldicr %r31, %r11, 56,7
00002D84 ori %r3, %r3, loc_61000@l
// sub_B38 is called with %r3 = loc_61000; its result in %r3 is used as the
// base address for the store below.
00002D88 bl sub_B38
00002D8C std %r31, ((qword_61010-0x1000)@l)(%r3) // output to POST bus
00002D90 eieio
// Patched instruction: branch to the logging code at 0x17D8, which branches
// back to 0x2D98.
// NOTE(review): the hook relies on %r11 for the value to print; whether
// sub_B38 preserves %r11 is not visible here — verify.
00002D94 b loc_17D8 // patch
// Return value 0, then epilogue: drop the frame, restore LR and %r31.
00002D98 li %r3, 0
00002D9C addi %sp, %sp, 0x60
00002DA0 ld %r12, -8(%sp)
00002DA4 mtlr %r12
00002DA8 ld %r31, -0x10(%sp)
00002DAC blr
And then my code at 0x17D8:
// Logging hook for the POST output function. Prints a line tagged 'P' via
// sub_1530, then resumes at 0x2D98.
// NOTE(review): the patch at 0x2D94 branches to 0x17D8, but this listing starts
// at 0x17DC; the instruction at 0x17D8 is not shown here.
//
// Acquire the serial semaphore first; the tag character is loaded afterwards.
000017DC bl sub_68C
000017E0 li %r3, 0x50 # 'P'
000017E4 nop
000017E8 nop
// Print the line, then release the semaphore.
000017EC bl sub_1530
000017F0 bl sub_3C0
000017F4 nop
// Rejoin the POST output function just after the patched instruction.
000017F8 b loc_2D98
As for the semaphore stuff: this makes sure a CPU waits for another CPU if the serial output is in use (I had to split the function because there wasn't enough room in the HV):
// sub_68C — acquire the serial-output semaphore.
// Spins until the 32-bit word at 0x800001020001BC74 reads 7 ("released"), then
// stores this CPU's PIR value there and re-reads it to check the store held.
// The routine is split in two fragments: 0x68C-0x6FC and 0x8B8-0x8F8.
// %r3-%r6 and %r11 are saved and restored. %r12 is not preserved: on exit it
// holds the return address (assuming IDA's var_XX/arg_XX names equal their
// numeric offsets, so that -0x90+arg_60 is the same slot as var_30).
//
// Prologue: save LR (via %r12), %r11 and %r3-%r6 below %sp, then open the frame.
0000068C mflr %r12
00000690 std %r12, var_30(%sp)
00000694 std %r11, var_8(%sp)
00000698 std %r3, var_10(%sp)
0000069C std %r4, var_18(%sp)
000006A0 std %r5, var_20(%sp)
000006A4 std %r6, var_28(%sp)
000006A8 stdu %sp, var_90(%sp)
// Build the 64-bit semaphore address in %r6: 0x80000102 is shifted into the
// upper word, then 0x0001BC74 is OR-ed into the lower word.
000006AC lis %r6, 0x8000
000006B0 ori %r6, %r6, 0x102
000006B4 rldicr %r6, %r6, 32,31
000006B8 oris %r6, %r6, 1
000006BC ori %r6, %r6, 0xBC74 // semaphore at address: 0x800001020001BC74
// %r11 = this CPU's processor identification register.
000006C0 mfspr %r11, pir
// NOTE(review): the load / compare / store sequence below is not an atomic
// read-modify-write (no lwarx/stwcx. pair is used), so two CPUs could both
// observe 7 before either stores. The re-read at 0x8CC narrows that window;
// whether it closes it completely should be confirmed.
000006C4 lwz %r4, 0(%r6)
000006C8 cmpwi %r4, 7
000006CC bne loc_6C4 // wait until released (= 0x7)
000006D0 stw %r11, 0(%r6) // store own pir value at semaphore
// Set up the dcbst loop: %r5 = semaphore address AND the mask in %r5 (start
// address), %r4 = 0x80 + (address - start) (loop counter), %r3 = 0 (the other
// dcbst operand).
000006D4 mr %r3, %r6
000006D8 li %r4, 0x80
000006DC lis %r5, 0xFFFF
000006E0 ori %r5, %r5, 0xFFF1
000006E4 and %r5, %r5, %r3
000006E8 subf %r3, %r5, %r3
000006EC add %r4, %r4, %r3
000006F0 li %r3, 0
000006F4 nop
000006F8 nop
// Continue in the second fragment.
000006FC b loc_8B8
// Second fragment: dcbst on successive addresses 0x80 bytes apart, starting at
// %r5, until the counter in %r4 goes negative.
000008B8 dcbst %r3, %r5 // flush the cache
000008BC addic %r5, %r5, 0x80
000008C0 subic. %r4, %r4, 0x80
000008C4 bge loc_8B8
000008C8 isync
// Re-read the semaphore; if it no longer holds this CPU's PIR value, start
// over from the wait loop.
000008CC lwz %r4, 0(%r6) // make sure your pir is really stored (otherwise try again).
000008D0 cmpld %r4, %r11
000008D4 bne loc_6C4
// Epilogue: drop the frame, restore the saved registers and return.
000008D8 addi %sp, %sp, 0x90
000008DC ld %r11, -0x90+arg_88(%sp)
000008E0 ld %r3, -0x90+arg_80(%sp)
000008E4 ld %r4, -0x90+arg_78(%sp)
000008E8 ld %r5, -0x90+arg_70(%sp)
000008EC ld %r6, -0x90+arg_68(%sp)
000008F0 ld %r12, -0x90+arg_60(%sp)
000008F4 mtlr %r12
000008F8 blr
As you can see, I use address 0x800001020001BC74 (0x1BC74 in the HV, i.e. in IDA when you load it at 0x00000000) to store the semaphore value. This is right after all the functions in the HV and just before all the data. I am not sure if this is the right place, but it seems to work fine.
Releasing the semaphore (again split because of not enough room):
// sub_3C0 — release the serial-output semaphore by storing 7 to the 32-bit
// word at 0x800001020001BC74.
// The routine is split in two fragments: 0x3C0-0x3FC and 0x468-0x470.
// %r4 and %r6 are saved and restored. %r12 is not preserved: on exit it holds
// the return address (assuming IDA's var_XX/arg_XX names equal their numeric
// offsets).
// NOTE(review): unlike the acquire routine at 0x68C, no dcbst/isync follows
// the store here — confirm that this is intentional.
//
// Prologue: save LR (via %r12), %r4 and %r6 below %sp, then open the frame.
000003C0 mflr %r12
000003C4 std %r12, var_30(%sp)
000003C8 std %r4, var_18(%sp)
000003CC std %r6, var_28(%sp)
000003D0 stdu %sp, var_90(%sp)
// Build the 64-bit semaphore address in %r6.
000003D4 lis %r6, 0x8000 # 0x80000102
000003D8 ori %r6, %r6, 0x102 # 0x80000102
000003DC rldicr %r6, %r6, 32,31
000003E0 oris %r6, %r6, 1
000003E4 ori %r6, %r6, 0xBC74
// 7 is the "released" value that the acquire loop at 0x6C4 waits for.
000003E8 li %r4, 7
000003EC stw %r4, 0(%r6) // store 0x7 at 0x800001020001BC74
// Epilogue: drop the frame and restore %r4 and %r6.
000003F0 addi %sp, %sp, 0x90
000003F4 ld %r4, -0x90+arg_78(%sp)
000003F8 ld %r6, -0x90+arg_68(%sp)
// Continue in the second fragment.
000003FC b loc_468
// Second fragment: restore LR and return.
00000468 ld %r12, -0x90+arg_60(%sp)
0000046C mtlr %r12
00000470 blr
The serial output is done at 0x1530 (this is split too but only by one instruction at 0x1600).
// sub_1530 — write one log line to the serial port.
// In:  %r3  = tag character (e.g. 'S', 'C', 'P')
//      %r11 = value; only its low byte is printed
// Output format: <PIR as one hex digit> ':' <tag> ' ' <high nibble> <low nibble> CR LF
// %r3-%r6 and %r11 are saved and restored. %r12 is not preserved: on exit it
// holds the return address (assuming IDA's var_XX/arg_XX names equal their
// numeric offsets).
//
// Prologue: save LR (via %r12), %r11 and %r3-%r6 below %sp, then open the frame.
00001530 mflr %r12
00001534 std %r12, var_30(%sp)
00001538 std %r11, var_8(%sp)
0000153C std %r3, var_10(%sp)
00001540 std %r4, var_18(%sp)
00001544 std %r5, var_20(%sp)
00001548 std %r6, var_28(%sp)
0000154C stdu %sp, var_90(%sp)
// Keep the tag character in %r6, because sub_16C0 overwrites %r3, %r4 and %r5.
00001550 mr %r6, %r3
00001554 nop
00001558 nop
0000155C nop
00001560 nop
// Print this CPU's PIR value as a single hex digit, followed by ':'.
00001564 mfspr %r3, pir
00001568 bl sub_1790
0000156C li %r3, 0x3A # ':'
00001570 bl sub_16C0
// Print the tag character, followed by a space.
00001574 mr %r3, %r6
00001578 bl sub_16C0
0000157C li %r3, 0x20 # ' '
00001580 bl sub_16C0
// Print bits 7..4 of %r11 as one hex digit.
00001584 andi. %r3, %r11, 0xF0
00001588 srwi %r3, %r3, 4
0000158C bl sub_1790
// Print bits 3..0 of %r11 as one hex digit.
00001590 andi. %r3, %r11, 0xF
00001594 bl sub_1790
// End the line with CR, LF.
00001598 li %r3, 0xD
0000159C bl sub_16C0
000015A0 li %r3, 0xA
000015A4 bl sub_16C0
// Padding up to the branch at 0x15FC.
000015A8 nop
000015AC nop
000015B0 nop
000015B4 nop
000015B8 nop
000015BC nop
000015C0 nop
000015C4 nop
000015C8 nop
000015CC nop
000015D0 nop
000015D4 nop
000015D8 nop
000015DC nop
000015E0 nop
000015E4 nop
000015E8 nop
000015EC nop
000015F0 nop
000015F4 nop
000015F8 nop
// Skip over the single instruction at 0x1600, which is not part of this
// routine's control flow.
000015FC b loc_1604
00001600 # ---------------------------------------------------------------------------
00001600 b sub_204
00001604 # ---------------------------------------------------------------------------
// Padding up to the epilogue at 0x1644.
00001604 nop
00001608 nop
0000160C nop
00001610 nop
00001614 nop
00001618 nop
0000161C nop
00001620 nop
00001624 nop
00001628 nop
0000162C nop
00001630 nop
00001634 nop
00001638 nop
0000163C nop
00001640 nop
// Epilogue: drop the frame, restore the saved registers and return.
00001644 addi %sp, %sp, 0x90
00001648 ld %r11, -0x90+arg_88(%sp)
0000164C ld %r3, -0x90+arg_80(%sp)
00001650 ld %r4, -0x90+arg_78(%sp)
00001654 ld %r5, -0x90+arg_70(%sp)
00001658 ld %r6, -0x90+arg_68(%sp)
0000165C ld %r12, -0x90+arg_60(%sp)
00001660 mtlr %r12
00001664 blr
Sorry about the nops. I'm planning to extend this...
The code at 0x1790 outputs the value in %r3 (expected to be in the range 0x0-0xF) as a single hexadecimal digit:
// sub_1790 — print the value in %r3 as one ASCII hex digit via sub_16C0.
// The callers shown pass values in the range 0x0-0xF.
// No stack frame is used: the return address is kept in %r12 across the call
// to sub_16C0, so %r12 is clobbered.
00001790 mflr %r12
00001794 addi %r3, %r3, 0x30 // 0x30 = '0'
// Results below 0x3A are already '0'..'9'; anything else gets 7 added so that
// a value of 0xA becomes 'A'.
00001798 cmpwi %r3, 0x3A
0000179C blt loc_17A4
000017A0 addi %r3, %r3, 7 // 0x37 + 0xA = 'A'
000017A4 bl sub_16C0
000017A8 mtlr %r12
000017AC blr
And of course the serial output itself:
// sub_16C0 — write the character in %r3 to the serial port.
// Leaf routine with no stack frame; clobbers %r3, %r4 and %r5.
//
// Build the base address 0x80000200EA000000 in %r4: 0x80000200 is shifted into
// the upper word, then 0xEA00 is OR-ed into bits 31..16 of the lower word.
000016C0 lis %r4, 0x8000
000016C4 ori %r4, %r4, 0x200
000016C8 rldicr %r4, %r4, 32,31
000016CC oris %r4, %r4, 0xEA00
// Keep the character in %r5 while %r3 is used for polling.
000016D0 mr %r5, %r3
// Poll the 32-bit word at base+0x1018 until the bit selected by the rlwinm
// mask (bit 6 in IBM numbering, i.e. 0x02000000) is set.
// NOTE(review): presumably a "transmitter ready" status flag — confirm.
000016D4 lwz %r3, 0x1018(%r4)
000016D8 rlwinm. %r3, %r3, 0,6,6
000016DC beq loc_16D4
// Move the character into the top byte of the word and store it at base+0x1014.
000016E0 mr %r3, %r5
000016E4 slwi %r3, %r3, 24
000016E8 stw %r3, 0x1014(%r4)
000016EC blr
All of the above is just to dump stuff and get an idea of the dynamics of booting the kernel/HV. For me it's just a tool to figure out what it doesn't do compared to a normal boot, or where certain CPUs are stuck, etc.
I will keep sharing my results.

Regards,
arnezami
PS. The reason why programming in the HV/kernel is a pain is that all of the above has been done with a hex editor. While some of it I can first compile and then copy-paste, other stuff (like the branches) has to be done manually. And there is also the room issue and potential checksum(?) problems. So far I have been able to avoid or cope with these issues.