diff --git a/executor/dsdt.asl b/executor/dsdt.asl new file mode 100644 index 00000000..701c08ee --- /dev/null +++ b/executor/dsdt.asl @@ -0,0 +1,322 @@ +// ASL Example +DefinitionBlock ( + "dsdt.aml", // Output Filename + "DSDT", // Signature + 0x00, // DSDT Compliance Revision + "BAMM", // OEMID + "JONGE", // TABLE ID + 0x1 // OEM Revision + ) +{ + Scope(\_SB) { + Device(PCI0) { + // The following magic code stands for "PCI Host Bridge" + Name(_HID, EisaId("PNP0A03")) + Name(_ADR, 0) + Name(_UID, 0) + + // Hot Plug Parameters. Optional. + // Linux will complain and use standard parameters, + // if not given. + Name(_HPP, Package(){ + 0x08, // Cache line size in dwords + 0x40, // Latency timer in PCI clocks + 0x01, // Enable SERR line + 0x00 // Enable PERR line + }) + + // PCI Routing Table + // When defining as much ACPI information as + // needed for hotplug, we also have to define + // stuff like the following. + // Otherwise, Linux would complain. + Name(_PRT, Package() { + Package() { 0x1ffff, 0, LNKA, 0 }, + Package() { 0x1ffff, 1, LNKB, 0 }, + Package() { 0x1ffff, 2, LNKC, 0 }, + Package() { 0x1ffff, 3, LNKD, 0 }, + + Package() { 0x2ffff, 0, LNKA, 0 }, + Package() { 0x2ffff, 1, LNKB, 0 }, + Package() { 0x2ffff, 2, LNKC, 0 }, + Package() { 0x2ffff, 3, LNKD, 0 }, + + Package() { 0x3ffff, 0, LNKA, 0 }, + Package() { 0x3ffff, 1, LNKB, 0 }, + Package() { 0x3ffff, 2, LNKC, 0 }, + Package() { 0x3ffff, 3, LNKD, 0 }, + + Package() { 0x4ffff, 0, LNKA, 0 }, + Package() { 0x4ffff, 1, LNKB, 0 }, + Package() { 0x4ffff, 2, LNKC, 0 }, + Package() { 0x4ffff, 3, LNKD, 0 }, + }) + + // At boot, Linux will either scan the system for + // possible resources used by PCI cards or read + // ACPI tables to obtain this information. + // When providing as much ACPI data as needed + // for hotplugging, then this is not optional any longer. + // Linux would complain if all this was not provided here. 
+ Name (_CRS, ResourceTemplate () { + // Bus enumeration from _MIN to _MAX + WordBusNumber ( + ResourceProducer, + MinFixed, // _MIF + MaxFixed, // _MAF + , + 0x00, // _GRA + 0x00, // _MIN + 0xFF, // _MAX + 0x00, // _TRA + 0x100) // _LEN + // IO ports usable by PCI from _MIN to _MAX + WordIO ( + ResourceProducer, + MinFixed, // _MIF + MaxFixed, // _MAF + PosDecode, + EntireRange, + 0x0000, // _GRA + 0x0000, // _MIN + 0x7FFF, // _MAX + 0x00, // _TRA + 0x8000) // _LEN + // System memory for mapping BAR areas from _MIN to _MAX + // BAR = Base Address Register, every PCI card will + // usually have 2 of those. + DWordMemory ( + ResourceProducer, + PosDecode, + MinFixed, // _MIF + MaxFixed, // _MAF + NonCacheable, // _MEM + ReadWrite, // _RW + 0x00000000, // _GRA + 0xE0000000, // _MIN + 0xE0FFFFFF, // _MAX + 0x00, // _TRA + 0x01000000) // _LEN + }) + + // This introduces three named dword fields in IO space. + // The hotplug controller knows these IO ports. + // During hot plug/unplug, the guest and the host's hotplug + // controller will communicate over these. + OperationRegion(PCST, SystemIO, 0xae00, 12) + Field (PCST, DWordAcc, NoLock, WriteAsZeros) + { + PCIU, 32, // IO port 0xae00 + PCID, 32, // IO port 0xae04 + B0EJ, 32, // IO port 0xae08 + } + + // Status method. Statically returns "Everything is up and working" + // because the PCI root bus will always be there. + Method (_STA, 0) { Return (0xf) } + } + + // All this interrupt routing information is necessary. + // This defines the interrupts A, B, C, D, considered legacy + // nowadays. + // Hotplugging etc. will work without this anyway if the PCI device uses + // MSI for interrupting, but the kernel would complain with + // ugly error messages. + // These device definitions are kept as minimal as possible. 
+ Device(LNKA){ + Name(_HID, EISAID("PNP0C0F")) // PCI interrupt link + Name(_UID, 1) + Method (_STA, 0, NotSerialized) + { + Return (0x0B) + } + Method (_CRS, 0, NotSerialized) + { + Name (BUFF, ResourceTemplate () + { + IRQ (Level, ActiveLow, Shared) {5} + }) + Return (BUFF) + } + Method (_PRS, 0, NotSerialized) + { + Name (BUFF, ResourceTemplate () + { + IRQ (Level, ActiveLow, Shared) {5,9,10} + }) + Return (BUFF) + } + Method (_SRS, 1, NotSerialized) {} + Method (_DIS, 0, NotSerialized) {} + } + Device(LNKB){ + Name(_HID, EISAID("PNP0C0F")) // PCI interrupt link + Name(_UID, 2) + Method (_STA, 0, NotSerialized) + { + Return (0x0B) + } + Method (_CRS, 0, NotSerialized) + { + Name (BUFF, ResourceTemplate () + { + IRQ (Level, ActiveLow, Shared) {10} + }) + Return (BUFF) + } + Method (_PRS, 0, NotSerialized) + { + Name (BUFF, ResourceTemplate () + { + IRQ (Level, ActiveLow, Shared) {5,9,10} + }) + Return (BUFF) + } + Method (_SRS, 1, NotSerialized) {} + Method (_DIS, 0, NotSerialized) {} + } + Device(LNKC){ + Name(_HID, EISAID("PNP0C0F")) // PCI interrupt link + Name(_UID, 3) + Method (_STA, 0, NotSerialized) + { + Return (0x0B) + } + Method (_CRS, 0, NotSerialized) + { + Name (BUFF, ResourceTemplate () + { + IRQ (Level, ActiveLow, Shared) {9} + }) + Return (BUFF) + } + Method (_PRS, 0, NotSerialized) + { + Name (BUFF, ResourceTemplate () + { + IRQ (Level, ActiveLow, Shared) {5,9,10} + }) + Return (BUFF) + } + Method (_SRS, 1, NotSerialized) {} + Method (_DIS, 0, NotSerialized) {} + } + Device(LNKD){ + Name(_HID, EISAID("PNP0C0F")) // PCI interrupt link + Name(_UID, 4) + Method (_STA, 0, NotSerialized) + { + Return (0x0B) + } + Method (_CRS, 0, NotSerialized) + { + Name (BUFF, ResourceTemplate () + { + IRQ (Level, ActiveLow, Shared) {5} + }) + Return (BUFF) + } + Method (_PRS, 0, NotSerialized) + { + Name (BUFF, ResourceTemplate () + { + IRQ (Level, ActiveLow, Shared) {5,9,10} + }) + Return (BUFF) + } + Method (_SRS, 1, NotSerialized) {} + Method (_DIS, 0, 
NotSerialized) {} + } + + } + + Scope(\_SB.PCI0) { + // These are PCI slot definitions. + // They are necessary because every PCI card + // which shall be ejectable, needs an _EJ0 method. + Device (S01) { + Name (_ADR, 0x10000) + Name (_SUN, 0x01) // SUN: Slot User Number + + // This method is called by the operating system + // after unloading the device driver etc. + // _EJ0 = eject callback + Method (_EJ0, 1) { PCEJ(0x01) } + } + + Device (S02) { + Name (_ADR, 0x20000) + Name (_SUN, 0x02) + Method (_EJ0, 1) { PCEJ(0x02) } + } + + Device (S03) { + Name (_ADR, 0x30000) + Name (_SUN, 0x03) + Method (_EJ0, 1) { PCEJ(0x03) } + } + + Device (S04) { + Name (_ADR, 0x40000) + Name (_SUN, 0x04) + Method (_EJ0, 1) { PCEJ(0x04) } + } + + // Called by some PCI card's _EJ0 method, + // This tells the hypervisor to turn off the + // PCI device by writing (1 << PCI_ID) to the + // IO port associated with the B0EJ symbol. + Method (PCEJ, 1, NotSerialized) { + Store(ShiftLeft(1, Arg0), B0EJ) + Return (0x0) + } + + // PCNT = PCi NoTify + // PCNT(<slot number 1..4>, <1 = check for inserted device / 3 = eject requested>) + // The values 1 and 3 are defined in the ACPI spec + Method(PCNT, 2) { + If (LEqual(Arg0, 0x01)) { Notify(S01, Arg1) } + If (LEqual(Arg0, 0x02)) { Notify(S02, Arg1) } + If (LEqual(Arg0, 0x03)) { Notify(S03, Arg1) } + If (LEqual(Arg0, 0x04)) { Notify(S04, Arg1) } + } + + /* PCI hotplug notify method */ + Method(PCNF, 0) { + // Local0 = iterator + Store (Zero, Local0) + + // These two fields contain bits mapped + // to PCI devices, like in the GPE bitmap. 
+ + // bit (1 << N) set here --> Device N was inserted + Store (PCIU, Local1) + // bit (1 << N) set here --> Device N has to be removed + Store (PCID, Local2) + + While (LLess(Local0, 4)) { + Increment(Local0) + If (And(Local1, ShiftLeft(1, Local0))) { + PCNT(Local0, 1) // 1 => DEVICE CHECK + } + If (And(Local2, ShiftLeft(1, Local0))) { + PCNT(Local0, 3) // 3 => EJECT REQUEST + } + } + Return(One) + } + } + + Scope (\_GPE) + { + Name(_HID, "ACPI0006") + + // These methods are wired to the according bits in the GPE bitmap. + // The hypervisor will raise bits and then send an interrupt 9. + // The ACPI code in the guest kernel will then dispatch one of these methods. + Method(_E01) { + \_SB.PCI0.PCNF() // PCI hotplug event + } + } + +} // end of definition block diff --git a/executor/dsdt.h b/executor/dsdt.h new file mode 100644 index 00000000..f8ee3611 --- /dev/null +++ b/executor/dsdt.h @@ -0,0 +1,179 @@ +/* + * To generate this file, download the iASL compiler from + * https://acpica.org/downloads (or install the "iasl" packet, + * if available for your distro) and then run: + * iasl -tc dsdt.asl && mv dsdt.hex dsdt.h + */ + +/* + * + * Intel ACPI Component Architecture + * ASL Optimizing Compiler version 20130418-64 [May 8 2013] + * Copyright (c) 2000 - 2013 Intel Corporation + * + * Compilation of "dsdt.asl" - Thu Jun 20 15:28:32 2013 + * + * C source code output + * AML code block contains 0x4E8 bytes + * + */ +unsigned char AmlCode[] = +{ + 0x44,0x53,0x44,0x54,0xE8,0x04,0x00,0x00, /* 00000000 "DSDT...." */ + 0x00,0x31,0x42,0x41,0x4D,0x4D,0x00,0x00, /* 00000008 ".1BAMM.." */ + 0x4A,0x4F,0x4E,0x47,0x45,0x00,0x00,0x00, /* 00000010 "JONGE..." */ + 0x01,0x00,0x00,0x00,0x49,0x4E,0x54,0x4C, /* 00000018 "....INTL" */ + 0x18,0x04,0x13,0x20,0x10,0x40,0x33,0x5F, /* 00000020 "... 
.@3_" */ + 0x53,0x42,0x5F,0x5B,0x82,0x4D,0x18,0x50, /* 00000028 "SB_[.M.P" */ + 0x43,0x49,0x30,0x08,0x5F,0x48,0x49,0x44, /* 00000030 "CI0._HID" */ + 0x0C,0x41,0xD0,0x0A,0x03,0x08,0x5F,0x41, /* 00000038 ".A...._A" */ + 0x44,0x52,0x00,0x08,0x5F,0x55,0x49,0x44, /* 00000040 "DR.._UID" */ + 0x00,0x08,0x5F,0x48,0x50,0x50,0x12,0x08, /* 00000048 ".._HPP.." */ + 0x04,0x0A,0x08,0x0A,0x40,0x01,0x00,0x08, /* 00000050 "....@..." */ + 0x5F,0x50,0x52,0x54,0x12,0x4B,0x0E,0x10, /* 00000058 "_PRT.K.." */ + 0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x01,0x00, /* 00000060 "........" */ + 0x00,0x4C,0x4E,0x4B,0x41,0x00,0x12,0x0D, /* 00000068 ".LNKA..." */ + 0x04,0x0C,0xFF,0xFF,0x01,0x00,0x01,0x4C, /* 00000070 ".......L" */ + 0x4E,0x4B,0x42,0x00,0x12,0x0E,0x04,0x0C, /* 00000078 "NKB....." */ + 0xFF,0xFF,0x01,0x00,0x0A,0x02,0x4C,0x4E, /* 00000080 "......LN" */ + 0x4B,0x43,0x00,0x12,0x0E,0x04,0x0C,0xFF, /* 00000088 "KC......" */ + 0xFF,0x01,0x00,0x0A,0x03,0x4C,0x4E,0x4B, /* 00000090 ".....LNK" */ + 0x44,0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF, /* 00000098 "D......." */ + 0x02,0x00,0x00,0x4C,0x4E,0x4B,0x41,0x00, /* 000000A0 "...LNKA." */ + 0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x02,0x00, /* 000000A8 "........" */ + 0x01,0x4C,0x4E,0x4B,0x42,0x00,0x12,0x0E, /* 000000B0 ".LNKB..." */ + 0x04,0x0C,0xFF,0xFF,0x02,0x00,0x0A,0x02, /* 000000B8 "........" */ + 0x4C,0x4E,0x4B,0x43,0x00,0x12,0x0E,0x04, /* 000000C0 "LNKC...." */ + 0x0C,0xFF,0xFF,0x02,0x00,0x0A,0x03,0x4C, /* 000000C8 ".......L" */ + 0x4E,0x4B,0x44,0x00,0x12,0x0D,0x04,0x0C, /* 000000D0 "NKD....." */ + 0xFF,0xFF,0x03,0x00,0x00,0x4C,0x4E,0x4B, /* 000000D8 ".....LNK" */ + 0x41,0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF, /* 000000E0 "A......." */ + 0x03,0x00,0x01,0x4C,0x4E,0x4B,0x42,0x00, /* 000000E8 "...LNKB." */ + 0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x03,0x00, /* 000000F0 "........" */ + 0x0A,0x02,0x4C,0x4E,0x4B,0x43,0x00,0x12, /* 000000F8 "..LNKC.." */ + 0x0E,0x04,0x0C,0xFF,0xFF,0x03,0x00,0x0A, /* 00000100 "........" 
*/ + 0x03,0x4C,0x4E,0x4B,0x44,0x00,0x12,0x0D, /* 00000108 ".LNKD..." */ + 0x04,0x0C,0xFF,0xFF,0x04,0x00,0x00,0x4C, /* 00000110 ".......L" */ + 0x4E,0x4B,0x41,0x00,0x12,0x0D,0x04,0x0C, /* 00000118 "NKA....." */ + 0xFF,0xFF,0x04,0x00,0x01,0x4C,0x4E,0x4B, /* 00000120 ".....LNK" */ + 0x42,0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF, /* 00000128 "B......." */ + 0x04,0x00,0x0A,0x02,0x4C,0x4E,0x4B,0x43, /* 00000130 "....LNKC" */ + 0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x04, /* 00000138 "........" */ + 0x00,0x0A,0x03,0x4C,0x4E,0x4B,0x44,0x00, /* 00000140 "...LNKD." */ + 0x08,0x5F,0x43,0x52,0x53,0x11,0x3F,0x0A, /* 00000148 "._CRS.?." */ + 0x3C,0x88,0x0D,0x00,0x02,0x0C,0x00,0x00, /* 00000150 "<......." */ + 0x00,0x00,0x00,0xFF,0x00,0x00,0x00,0x00, /* 00000158 "........" */ + 0x01,0x88,0x0D,0x00,0x01,0x0C,0x03,0x00, /* 00000160 "........" */ + 0x00,0x00,0x00,0xFF,0x7F,0x00,0x00,0x00, /* 00000168 "........" */ + 0x80,0x87,0x17,0x00,0x00,0x0C,0x01,0x00, /* 00000170 "........" */ + 0x00,0x00,0x00,0x00,0x00,0x00,0xE0,0xFF, /* 00000178 "........" */ + 0xFF,0xFF,0xE0,0x00,0x00,0x00,0x00,0x00, /* 00000180 "........" */ + 0x00,0x00,0x01,0x79,0x00,0x5B,0x80,0x50, /* 00000188 "...y.[.P" */ + 0x43,0x53,0x54,0x01,0x0B,0x00,0xAE,0x0A, /* 00000190 "CST....." */ + 0x0C,0x5B,0x81,0x15,0x50,0x43,0x53,0x54, /* 00000198 ".[..PCST" */ + 0x43,0x50,0x43,0x49,0x55,0x20,0x50,0x43, /* 000001A0 "CPCIU PC" */ + 0x49,0x44,0x20,0x42,0x30,0x45,0x4A,0x20, /* 000001A8 "ID B0EJ " */ + 0x14,0x09,0x5F,0x53,0x54,0x41,0x00,0xA4, /* 000001B0 ".._STA.." */ + 0x0A,0x0F,0x5B,0x82,0x44,0x06,0x4C,0x4E, /* 000001B8 "..[.D.LN" */ + 0x4B,0x41,0x08,0x5F,0x48,0x49,0x44,0x0C, /* 000001C0 "KA._HID." 
*/ + 0x41,0xD0,0x0C,0x0F,0x08,0x5F,0x55,0x49, /* 000001C8 "A...._UI" */ + 0x44,0x01,0x14,0x09,0x5F,0x53,0x54,0x41, /* 000001D0 "D..._STA" */ + 0x00,0xA4,0x0A,0x0B,0x14,0x1A,0x5F,0x43, /* 000001D8 "......_C" */ + 0x52,0x53,0x00,0x08,0x42,0x55,0x46,0x46, /* 000001E0 "RS..BUFF" */ + 0x11,0x09,0x0A,0x06,0x23,0x20,0x00,0x18, /* 000001E8 "....# .." */ + 0x79,0x00,0xA4,0x42,0x55,0x46,0x46,0x14, /* 000001F0 "y..BUFF." */ + 0x1A,0x5F,0x50,0x52,0x53,0x00,0x08,0x42, /* 000001F8 "._PRS..B" */ + 0x55,0x46,0x46,0x11,0x09,0x0A,0x06,0x23, /* 00000200 "UFF....#" */ + 0x20,0x06,0x18,0x79,0x00,0xA4,0x42,0x55, /* 00000208 " ..y..BU" */ + 0x46,0x46,0x14,0x06,0x5F,0x53,0x52,0x53, /* 00000210 "FF.._SRS" */ + 0x01,0x14,0x06,0x5F,0x44,0x49,0x53,0x00, /* 00000218 "..._DIS." */ + 0x5B,0x82,0x45,0x06,0x4C,0x4E,0x4B,0x42, /* 00000220 "[.E.LNKB" */ + 0x08,0x5F,0x48,0x49,0x44,0x0C,0x41,0xD0, /* 00000228 "._HID.A." */ + 0x0C,0x0F,0x08,0x5F,0x55,0x49,0x44,0x0A, /* 00000230 "..._UID." */ + 0x02,0x14,0x09,0x5F,0x53,0x54,0x41,0x00, /* 00000238 "..._STA." */ + 0xA4,0x0A,0x0B,0x14,0x1A,0x5F,0x43,0x52, /* 00000240 "....._CR" */ + 0x53,0x00,0x08,0x42,0x55,0x46,0x46,0x11, /* 00000248 "S..BUFF." */ + 0x09,0x0A,0x06,0x23,0x00,0x04,0x18,0x79, /* 00000250 "...#...y" */ + 0x00,0xA4,0x42,0x55,0x46,0x46,0x14,0x1A, /* 00000258 "..BUFF.." */ + 0x5F,0x50,0x52,0x53,0x00,0x08,0x42,0x55, /* 00000260 "_PRS..BU" */ + 0x46,0x46,0x11,0x09,0x0A,0x06,0x23,0x20, /* 00000268 "FF....# " */ + 0x06,0x18,0x79,0x00,0xA4,0x42,0x55,0x46, /* 00000270 "..y..BUF" */ + 0x46,0x14,0x06,0x5F,0x53,0x52,0x53,0x01, /* 00000278 "F.._SRS." */ + 0x14,0x06,0x5F,0x44,0x49,0x53,0x00,0x5B, /* 00000280 ".._DIS.[" */ + 0x82,0x45,0x06,0x4C,0x4E,0x4B,0x43,0x08, /* 00000288 ".E.LNKC." */ + 0x5F,0x48,0x49,0x44,0x0C,0x41,0xD0,0x0C, /* 00000290 "_HID.A.." */ + 0x0F,0x08,0x5F,0x55,0x49,0x44,0x0A,0x03, /* 00000298 ".._UID.." */ + 0x14,0x09,0x5F,0x53,0x54,0x41,0x00,0xA4, /* 000002A0 ".._STA.." 
*/ + 0x0A,0x0B,0x14,0x1A,0x5F,0x43,0x52,0x53, /* 000002A8 "...._CRS" */ + 0x00,0x08,0x42,0x55,0x46,0x46,0x11,0x09, /* 000002B0 "..BUFF.." */ + 0x0A,0x06,0x23,0x00,0x02,0x18,0x79,0x00, /* 000002B8 "..#...y." */ + 0xA4,0x42,0x55,0x46,0x46,0x14,0x1A,0x5F, /* 000002C0 ".BUFF.._" */ + 0x50,0x52,0x53,0x00,0x08,0x42,0x55,0x46, /* 000002C8 "PRS..BUF" */ + 0x46,0x11,0x09,0x0A,0x06,0x23,0x20,0x06, /* 000002D0 "F....# ." */ + 0x18,0x79,0x00,0xA4,0x42,0x55,0x46,0x46, /* 000002D8 ".y..BUFF" */ + 0x14,0x06,0x5F,0x53,0x52,0x53,0x01,0x14, /* 000002E0 ".._SRS.." */ + 0x06,0x5F,0x44,0x49,0x53,0x00,0x5B,0x82, /* 000002E8 "._DIS.[." */ + 0x45,0x06,0x4C,0x4E,0x4B,0x44,0x08,0x5F, /* 000002F0 "E.LNKD._" */ + 0x48,0x49,0x44,0x0C,0x41,0xD0,0x0C,0x0F, /* 000002F8 "HID.A..." */ + 0x08,0x5F,0x55,0x49,0x44,0x0A,0x04,0x14, /* 00000300 "._UID..." */ + 0x09,0x5F,0x53,0x54,0x41,0x00,0xA4,0x0A, /* 00000308 "._STA..." */ + 0x0B,0x14,0x1A,0x5F,0x43,0x52,0x53,0x00, /* 00000310 "..._CRS." */ + 0x08,0x42,0x55,0x46,0x46,0x11,0x09,0x0A, /* 00000318 ".BUFF..." */ + 0x06,0x23,0x20,0x00,0x18,0x79,0x00,0xA4, /* 00000320 ".# ..y.." */ + 0x42,0x55,0x46,0x46,0x14,0x1A,0x5F,0x50, /* 00000328 "BUFF.._P" */ + 0x52,0x53,0x00,0x08,0x42,0x55,0x46,0x46, /* 00000330 "RS..BUFF" */ + 0x11,0x09,0x0A,0x06,0x23,0x20,0x06,0x18, /* 00000338 "....# .." */ + 0x79,0x00,0xA4,0x42,0x55,0x46,0x46,0x14, /* 00000340 "y..BUFF." */ + 0x06,0x5F,0x53,0x52,0x53,0x01,0x14,0x06, /* 00000348 "._SRS..." */ + 0x5F,0x44,0x49,0x53,0x00,0x10,0x44,0x12, /* 00000350 "_DIS..D." */ + 0x2E,0x5F,0x53,0x42,0x5F,0x50,0x43,0x49, /* 00000358 "._SB_PCI" */ + 0x30,0x5B,0x82,0x21,0x53,0x30,0x31,0x5F, /* 00000360 "0[.!S01_" */ + 0x08,0x5F,0x41,0x44,0x52,0x0C,0x00,0x00, /* 00000368 "._ADR..." */ + 0x01,0x00,0x08,0x5F,0x53,0x55,0x4E,0x01, /* 00000370 "..._SUN." 
*/ + 0x14,0x0B,0x5F,0x45,0x4A,0x30,0x01,0x50, /* 00000378 ".._EJ0.P" */ + 0x43,0x45,0x4A,0x01,0x5B,0x82,0x23,0x53, /* 00000380 "CEJ.[.#S" */ + 0x30,0x32,0x5F,0x08,0x5F,0x41,0x44,0x52, /* 00000388 "02_._ADR" */ + 0x0C,0x00,0x00,0x02,0x00,0x08,0x5F,0x53, /* 00000390 "......_S" */ + 0x55,0x4E,0x0A,0x02,0x14,0x0C,0x5F,0x45, /* 00000398 "UN...._E" */ + 0x4A,0x30,0x01,0x50,0x43,0x45,0x4A,0x0A, /* 000003A0 "J0.PCEJ." */ + 0x02,0x5B,0x82,0x23,0x53,0x30,0x33,0x5F, /* 000003A8 ".[.#S03_" */ + 0x08,0x5F,0x41,0x44,0x52,0x0C,0x00,0x00, /* 000003B0 "._ADR..." */ + 0x03,0x00,0x08,0x5F,0x53,0x55,0x4E,0x0A, /* 000003B8 "..._SUN." */ + 0x03,0x14,0x0C,0x5F,0x45,0x4A,0x30,0x01, /* 000003C0 "..._EJ0." */ + 0x50,0x43,0x45,0x4A,0x0A,0x03,0x5B,0x82, /* 000003C8 "PCEJ..[." */ + 0x23,0x53,0x30,0x34,0x5F,0x08,0x5F,0x41, /* 000003D0 "#S04_._A" */ + 0x44,0x52,0x0C,0x00,0x00,0x04,0x00,0x08, /* 000003D8 "DR......" */ + 0x5F,0x53,0x55,0x4E,0x0A,0x04,0x14,0x0C, /* 000003E0 "_SUN...." */ + 0x5F,0x45,0x4A,0x30,0x01,0x50,0x43,0x45, /* 000003E8 "_EJ0.PCE" */ + 0x4A,0x0A,0x04,0x14,0x11,0x50,0x43,0x45, /* 000003F0 "J....PCE" */ + 0x4A,0x01,0x70,0x79,0x01,0x68,0x00,0x42, /* 000003F8 "J.py.h.B" */ + 0x30,0x45,0x4A,0xA4,0x00,0x14,0x35,0x50, /* 00000400 "0EJ...5P" */ + 0x43,0x4E,0x54,0x02,0xA0,0x0A,0x93,0x68, /* 00000408 "CNT....h" */ + 0x01,0x86,0x53,0x30,0x31,0x5F,0x69,0xA0, /* 00000410 "..S01_i." */ + 0x0B,0x93,0x68,0x0A,0x02,0x86,0x53,0x30, /* 00000418 "..h...S0" */ + 0x32,0x5F,0x69,0xA0,0x0B,0x93,0x68,0x0A, /* 00000420 "2_i...h." */ + 0x03,0x86,0x53,0x30,0x33,0x5F,0x69,0xA0, /* 00000428 "..S03_i." 
*/ + 0x0B,0x93,0x68,0x0A,0x04,0x86,0x53,0x30, /* 00000430 "..h...S0" */ + 0x34,0x5F,0x69,0x14,0x3E,0x50,0x43,0x4E, /* 00000438 "4_i.>PCN" */ + 0x46,0x00,0x70,0x00,0x60,0x70,0x50,0x43, /* 00000440 "F.p.`pPC" */ + 0x49,0x55,0x61,0x70,0x50,0x43,0x49,0x44, /* 00000448 "IUapPCID" */ + 0x62,0xA2,0x26,0x95,0x60,0x0A,0x04,0x75, /* 00000450 "b.&.`..u" */ + 0x60,0xA0,0x0E,0x7B,0x61,0x79,0x01,0x60, /* 00000458 "`..{ay.`" */ + 0x00,0x00,0x50,0x43,0x4E,0x54,0x60,0x01, /* 00000460 "..PCNT`." */ + 0xA0,0x0F,0x7B,0x62,0x79,0x01,0x60,0x00, /* 00000468 "..{by.`." */ + 0x00,0x50,0x43,0x4E,0x54,0x60,0x0A,0x03, /* 00000470 ".PCNT`.." */ + 0xA4,0x01,0x10,0x4D,0x06,0x5F,0x47,0x50, /* 00000478 "...M._GP" */ + 0x45,0x08,0x5F,0x48,0x49,0x44,0x0D,0x41, /* 00000480 "E._HID.A" */ + 0x43,0x50,0x49,0x30,0x30,0x30,0x36,0x00, /* 00000488 "CPI0006." */ + 0x14,0x15,0x5F,0x45,0x30,0x31,0x00,0x5C, /* 00000490 ".._E01.\" */ + 0x2F,0x03,0x5F,0x53,0x42,0x5F,0x50,0x43, /* 00000498 "/._SB_PC" */ + 0x49,0x30,0x50,0x43,0x4E,0x46,0x14,0x15, /* 000004A0 "I0PCNF.." */ + 0x5F,0x45,0x30,0x32,0x00,0x5C,0x2F,0x03, /* 000004A8 "_E02.\/." */ + 0x5F,0x53,0x42,0x5F,0x50,0x43,0x49,0x30, /* 000004B0 "_SB_PCI0" */ + 0x50,0x43,0x4E,0x46,0x14,0x15,0x5F,0x45, /* 000004B8 "PCNF.._E" */ + 0x30,0x33,0x00,0x5C,0x2F,0x03,0x5F,0x53, /* 000004C0 "03.\/._S" */ + 0x42,0x5F,0x50,0x43,0x49,0x30,0x50,0x43, /* 000004C8 "B_PCI0PC" */ + 0x4E,0x46,0x14,0x15,0x5F,0x45,0x30,0x34, /* 000004D0 "NF.._E04" */ + 0x00,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F, /* 000004D8 ".\/._SB_" */ + 0x50,0x43,0x49,0x30,0x50,0x43,0x4E,0x46 /* 000004E0 "PCI0PCNF" */ +}; diff --git a/executor/vbios_reset.cc b/executor/vbios_reset.cc index 0a875b10..e8e778d4 100644 --- a/executor/vbios_reset.cc +++ b/executor/vbios_reset.cc @@ -4,6 +4,8 @@ * Copyright (C) 2009-2010, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * * This file is part of Vancouver. 
* * Vancouver is free software: you can redistribute it and/or modify @@ -19,6 +21,11 @@ #include "nul/motherboard.h" #include "executor/bios.h" +/* This file contains the AML code of the DSDT in form + * of a string, which is available under the symbol name + * "AmlCode" */ +#include "dsdt.h" + bool use_x2apic_mode; PARAM_HANDLER(x2apic_mode, "x2apic_mode - enable x2apic mode in the LAPICs") @@ -144,6 +151,15 @@ class VirtualBiosReset : public StaticReceiver, public BiosCom // the ACPI IRQ is 9 discovery_write_dw("FACP", 46, 9, 2); + /* Initialize DSDT table. + * Its content is defined as AML bytecode in dsdt.h */ + discovery_write_st("DSDT", 0, "DSDT", 4); + + /* Initialize FACS table. + * The table is left empty. Linux demands its existence + * before switching to ACPI mode. */ + discovery_write_st("FACS", 0, "FACS", 4); + // store what remains on memory in KB discovery_write_dw("bda", 0x13, _mem_size >> 10, 2); return jmp_int(msg, 0x19); @@ -220,6 +236,28 @@ class VirtualBiosReset : public StaticReceiver, public BiosCom discovery_write_dw(name, 15, 0, 1); fix_acpi_checksum(_resources + index, 20, 8); } + else if (!strcmp("DSDT", name)) { + unsigned table; + check1(false, !(table = alloc(sizeof(AmlCode), 0x10)), + "allocate ACPI table failed"); + _resources[index] = Resource(name, table, sizeof(AmlCode), true); + + // FADT contains a pointer to the DSDT + discovery_write_dw("FACP", 40, table, 4); + + /* The DSDT is completely defined as AML bytecode in dsdt.h + * which was compiled from ASL by the Intel ASL compiler */ + memcpy(_mem_ptr + table, AmlCode, sizeof(AmlCode)); + } + else if (!strcmp("FACS", name)) { + unsigned table; + check1(false, !(table = alloc(36, 64)), "allocate ACPI table failed"); + _resources[index] = Resource(name, table, 36, true); + init_acpi_table(name); + + // FADT contains a pointer to the FACS + discovery_write_dw("FACP", 36, table, 4); + } else { // we create an ACPI table size_t table; diff --git a/host/migration.cc 
b/host/migration.cc new file mode 100644 index 00000000..85fb2002 --- /dev/null +++ b/host/migration.cc @@ -0,0 +1,764 @@ +/** + * Base migration code + * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * + * This file is part of Seoul. + * + * Seoul is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Seoul is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details. + */ + +/* Activate checksumming for debugging purposes + * of the received range after migrating. + * As this really makes the freeze gap larger, + * this should only be used for testing when + * the migration algorithm is changed. */ +//#define DO_CHECKSUMMING + + +#include // snprintf + +#include +#include + +#include +#include + +Migration::Migration(Motherboard *mb) +: _mb(mb), + _vcpu_utcb(NULL), +#if PORTED_TO_UNIX + _vcpu_blocked_sem(cap, true), + _vcpu_sem(cap+1, true), +#endif + _vcpu_should_block(false), + _socket(NULL), + _sendmem(0), _sendmem_total(0), + _freeze_timer(_mb->clock()) +{ + _vcpu_utcb = new CpuState; +} + +Migration::~Migration() +{ +} + +void Migration::init_memrange_info() +{ + MessageHostOp msg(MessageHostOp::OP_GUEST_MEM, 0UL); + if (!_mb->bus_hostop.send(msg)) + Logging::panic("%s failed to get physical memory\n", + __PRETTY_FUNCTION__); + + _physmem_start = msg.ptr; + _physmem_size = msg.len; + + _dirtman = DirtManager(_physmem_size >> 12); +} + +void Migration::save_guestregs(CpuState *utcb) +{ + /* After Migration::freeze_vcpus() was called, the VCPU will + * arrive in the recall handler and call this method here. + * Its register states are saved and then it hangs in + * our lock. 
+ */ + if (!_vcpu_should_block) return; + + mword vcpu_bytes = reinterpret_cast(&utcb->id+1); + vcpu_bytes -= reinterpret_cast(&utcb->mtd); + + memcpy(&_vcpu_utcb->mtd, &utcb->mtd, vcpu_bytes); + +#if PORTED_TO_UNIX + // Release the migration thread that waits in freeze_vcpus() + _vcpu_blocked_sem.up(); + // Freeze VCPU + _vcpu_sem.downmulti(); +#endif +} + +/* This is used to print messages onto the screen + * just after the VMM has started and waits for incoming + * guest state data. + */ +bool Migration::puts_guestscreen(const char *str, bool reset_screen) +{ + MessageRestore msg(MessageRestore::VGA_DISPLAY_GUEST, + const_cast(str), reset_screen); + return _mb->bus_restore.send(msg, true); +} + +void Migration::print_welcomescreen() +{ + char welcome_msg[255]; + mword ip = IpHelper::instance().get_ip(); + + snprintf(welcome_msg, sizeof(welcome_msg), + " Waiting for guest to migrate. IP: %lu.%lu.%lu.%lu\n\n", + ip & 0xff, (ip >> 8) & 0xff, (ip >> 16) & 0xff, (ip >> 24) & 0xff); + puts_guestscreen(welcome_msg, true); +} + +void Migration::freeze_vcpus() +{ + Logging::printf("Stopping vcpu.\n"); + + _vcpu_should_block = true; + + CpuEvent smsg(VCpu::EVENT_RESUME); + for (VCpu *vcpu = _mb->last_vcpu; vcpu; vcpu=vcpu->get_last()) + vcpu->bus_event.send(smsg); + +#if PORTED_TO_UNIX + _vcpu_blocked_sem.downmulti(); +#endif + + _freeze_timer.start(); +} + +void Migration::unfreeze_vcpus() +{ + _vcpu_should_block = false; +#if PORTED_TO_UNIX + /* After releasing the VCPU it will continue + * through the rest of the recall handler. 
+ */ + _vcpu_sem.up(); +#endif +} + +bool Migration::chksum_page(unsigned page_nr, mword &their_chksum, bool compare) +{ + mword my_chksum = 0; + assert(page_nr < (_physmem_size >> 12)); + + mword *ptr = reinterpret_cast(_physmem_start + (page_nr << 12)); + + for (unsigned i=0; i < 4096 / sizeof(ptr[0]); ++i) + // checksum = sum over (address_i * value_i^2) + my_chksum += reinterpret_cast(ptr+1) * (ptr[i]) * (ptr[i]); + + // Use case one: return true if given memory range is correct + if (compare) return my_chksum == their_chksum; + + // Second use case: Provide a checksum for a given memory range + their_chksum = my_chksum; + return true; +} + +bool Migration::checksums(bool retrieve) +{ + mword pagenr = 0; + mword checksum; + mword magic = 0xfafab0b0; + bool success = true; + + if (retrieve) { + // Receiver. Check the existing checksum list against our memory + mword rec_magic; + + _socket->receive(&rec_magic, sizeof(rec_magic)); + _socket->receive(&pagenr, sizeof(pagenr)); + _socket->receive(&checksum, sizeof(checksum)); + + while (pagenr != ~0ul) { + assert(magic == rec_magic); + MessageMemRegion mmsg(pagenr); + assert(_mb->bus_memregion.send(mmsg, true)); + assert(mmsg.actual_physmem); + + bool area_success = chksum_page(mmsg.start_page, checksum, true); + success &= area_success; + + Logging::printf("Checksum of area [%8lx - %8lx) - %s\n", + reinterpret_cast(mmsg.start_page), + reinterpret_cast(mmsg.start_page + mmsg.count), + area_success ? "OK" : "Error"); + + _socket->receive(&rec_magic, sizeof(rec_magic)); + _socket->receive(&pagenr, sizeof(pagenr)); + _socket->receive(&checksum, sizeof(checksum)); + } + } + else { + // Sender. Make a list of checksums and send it away. + + while (pagenr < _physmem_size) { + MessageMemRegion mmsg(pagenr); + if (!_mb->bus_memregion.send(mmsg, true) || !mmsg.actual_physmem) { + // No one claims this region. do not check. 
+ ++pagenr; + continue; + } + + Logging::printf("Checksumming the area [%8lx - %8lx)\n", + reinterpret_cast(mmsg.start_page), + reinterpret_cast(mmsg.start_page + mmsg.count)); + + chksum_page(pagenr, checksum, false); + success &= _socket->send(&magic, sizeof(magic)); + success &= _socket->send(&pagenr, sizeof(pagenr)); + success &= _socket->send(&checksum, sizeof(checksum)); + + pagenr += mmsg.count; + } + + pagenr = ~0ul; + success &= _socket->send(&magic, sizeof(magic)); + success &= _socket->send(&pagenr, sizeof(pagenr)); + success &= _socket->send(&pagenr, sizeof(pagenr)); + } + + return success; +} + + +/*********************************************************************** + * Guest receiving part + ***********************************************************************/ + +bool Migration::receive_ping() +{ + mword ping_msg = 0; + + _socket->receive(&ping_msg, sizeof(ping_msg)); + + if (ping_msg != 0xc0ffee) { + Logging::printf("Received bad ping message.\n"); + return false; + } + + ping_msg *= 3; + _socket->send(&ping_msg, sizeof(ping_msg)); + + return true; +} + +void Migration::receive_header() +{ + MigrationHeader mig_header; + + Logging::printf("Receiving guest information.\n"); + + _socket->receive(&mig_header, sizeof(mig_header)); + if (!mig_header.magic_string_check()) + Logging::panic("Magic string check failed: MigrationHeader\n"); + + MessageRestore vgamsg(MessageRestore::VGA_VIDEOMODE, NULL, true); + vgamsg.bytes = mig_header.videomode; + _mb->bus_restore.send(vgamsg, true); +} + +void Migration::receive_memory() +{ + StopWatch watch(_mb->clock()); + Logging::printf("Receiving guest memory.\n"); + + Prd current; + unsigned long bytes = 0; + + watch.start(); + while (1) { + _socket->receive(¤t, sizeof(current)); + if (!current.value()) + // Receiving an empty range descriptor means "EOF" + break; + + _socket->receive(current.base() + _physmem_start, current.size()); + bytes += current.size(); + } + watch.stop(); + + Logging::printf("Received 
%lu MB. RX Rate: %u KB/s\n", + bytes / 1024 / 1024, watch.rate(bytes)); +} + +/* Being equipped with a pointer to the stopped VCPU's + * register state structure, its registers will be overwritten + * and devices restored. + */ +bool Migration::receive_guestdevices(CpuState *vcpu_utcb) +{ + Logging::printf("Receiving UTCB.\n"); + + CpuState *buf = new CpuState; + + mword utcb_end = reinterpret_cast(&buf->id+1); + mword utcb_start = reinterpret_cast(&buf->mtd); + mword utcb_bytes = utcb_end - utcb_start; + + _socket->receive(&buf->mtd, utcb_bytes); + + memcpy(&vcpu_utcb->mtd, &buf->mtd, utcb_bytes); + + delete buf; + + Logging::printf("Receiving Devices.\n"); + + // This works quite similar to the device saving procedure + MessageRestore *rmsg = new MessageRestore(MessageRestore::RESTORE_RESTART, + NULL, false); + _mb->bus_restore.send_fifo(*rmsg); + + // no while(someone_responds_true) approach here because we know + // what we want to restore and how many. + bool ret; + while (1) { + _socket->receive(rmsg, sizeof(*rmsg)); + assert(rmsg->magic_string_check()); + + if (rmsg->devtype == 0xdead) + break; + + char *device_buffer = new char[rmsg->bytes]; + _socket->receive(device_buffer, rmsg->bytes); + + rmsg->space = device_buffer; + rmsg->write = false; + ret = _mb->bus_restore.send(*rmsg, true); + if (!ret) Logging::printf("No device replied on restore message!" + " VMM-Configuration mismatch?\n"); + + delete [] device_buffer; + } + + delete rmsg; + + /* Fix TSC offset. + * The guest would freeze for some time or skip some timesteps otherwise. 
+ */ + unsigned long long sender_rdtsc; + _socket->receive(&sender_rdtsc, sizeof(sender_rdtsc)); + + CpuMessage rdtsc_msg(CpuMessage::TYPE_ADD_TSC_OFF, NULL, 0); + rdtsc_msg.current_tsc_off = sender_rdtsc - Cpu::rdtsc(); + + for (VCpu *vcpu = _mb->last_vcpu; vcpu; vcpu=vcpu->get_last()) + vcpu->executor.send(rdtsc_msg); + + return true; +} + +bool Migration::listen(unsigned port, CpuState *vcpu_utcb) +{ + init_memrange_info(); + + print_welcomescreen(); + + _socket = IpHelper::instance().listen(port); + if (_socket == NULL) Logging::panic("Got no TCP receiver.\n"); + + receive_ping(); + + receive_header(); + + receive_memory(); + + receive_guestdevices(vcpu_utcb); + +#ifdef DO_CHECKSUMMING + // Checksumming really makes the migration gap larger + if (!checksums(true)) { + Logging::printf("Error while comparing checksums.\n"); + return false; + } +#endif + + _socket->close(); + + MessageRestore replug_msg(MessageRestore::PCI_PLUG, NULL, true); + _mb->bus_restore.send(replug_msg, false); + + Logging::printf("That's it. Waking up VCPUs.\n"); + unfreeze_vcpus(); + + return true; +} + +/*********************************************************************** + * Guest sending part + ***********************************************************************/ + +unsigned Migration::negotiate_port() +{ + char *cmdline = NULL; + + MessageHostOp msg(MessageHostOp::OP_GET_CONFIG_STRING, 0ul); + if (!_mb->bus_hostop.send(msg)) + return 0; + assert(msg.obj != NULL); + cmdline = reinterpret_cast(msg.obj); + + /* Send the listener service our configuration string. + * It will try to start an identically configured VMM + * instance and then tell us on what port it is waiting + * for state input. 
+     */
+    MigrationInit mig_init(strlen(cmdline));
+    const bool config_sent = _socket->send(&mig_init, sizeof(mig_init)) &&
+                             _socket->send(cmdline, mig_init.cmdlen);
+
+    /* TcpSocket::send() is blocking, so the configuration string is not
+     * referenced by the socket any more once both calls have returned.
+     * Release it right here so that none of the failure paths below can
+     * leak it.
+     */
+    delete [] cmdline;
+    cmdline = NULL;
+
+    if (!config_sent) return 0;
+
+    /* The default-constructed answer already carries a valid magic string
+     * and success == 0, so the result of receive() has to be checked
+     * explicitly to tell "no answer" apart from "configuration rejected".
+     */
+    MigrationAnswer mig_ans;
+    if (!_socket->receive(&mig_ans, sizeof(mig_ans))) {
+        Logging::printf("Receiving MigrationAnswer failed.\n");
+        return 0;
+    }
+    if (!mig_ans.magic_string_check()) {
+        Logging::printf("Magic string check failed: MigrationAnswer\n");
+        return 0;
+    }
+
+    if (!mig_ans.success) {
+        Logging::printf("Configuration is not suitable for target machine.\n");
+        return 0;
+    }
+
+    return mig_ans.port;
+}
+
+/* Tell the listening VMM which video mode the guest currently uses.
+ * Returns the result of the blocking socket send.
+ */
+bool Migration::send_header()
+{
+    /* Sending the listening VMM the video mode setting will allow it
+     * to switch the framebuffer to the right setting before migration.
+     * The screen would flicker and display ugly symbols if the
+     * framebuffer state is restored, but the host doesn't display it
+     * the right way, otherwise.
+     */
+    MessageRestore vgamsg(MessageRestore::VGA_VIDEOMODE, NULL, false);
+    // The bus result is not evaluated: if nobody answers, vgamsg.bytes
+    // keeps its constructor default of 0 and that value is transmitted.
+    _mb->bus_restore.send(vgamsg, true);
+
+    MigrationHeader mig_header(vgamsg.bytes);
+    return _socket->send(&mig_header, sizeof(mig_header));
+}
+
+/* Measure one ping/pong round trip on the migration socket.
+ * Returns the stop watch delta of the round trip, or 0 if the peer did
+ * not answer with three times the ping value.
+ * NOTE(review): a successful round trip whose measured delta is 0 cannot
+ * be distinguished from the error return by the caller - confirm the
+ * resolution of StopWatch::delta().
+ */
+timevalue Migration::send_ping()
+{
+    StopWatch ping_timer(_mb->clock());
+
+    mword ping_msg = 0xc0ffee;
+    mword pong_msg = 0;
+
+    ping_timer.start();
+    _socket->send(&ping_msg, sizeof(ping_msg));
+    _socket->receive(&pong_msg, sizeof(pong_msg));
+    ping_timer.stop();
+
+    // A failed send/receive leaves pong_msg at 0 and is caught here, too.
+    if (pong_msg != 3 * ping_msg) {
+        Logging::printf("Error during latency check\n");
+        return 0;
+    }
+
+    return ping_timer.delta();
+}
+
+/* Ask the host for the next dirty page range and evaluate to the raw
+ * value it wrote into the message. Uses a GNU statement expression.
+ */
+#define NEXT_DIRTY_PAGE() \
+({ \
+    MessageHostOp msg(MessageHostOp::OP_NEXT_DIRTY_PAGE, 0ul); \
+    _mb->bus_hostop.send(msg); \
+    msg.value; \
+})
+
+unsigned Migration::enqueue_all_dirty_pages(longrange_data &async_data)
+{
+    Prd *crds = async_data.crds;
+    unsigned crds_sent=0;
+
+    Prd first_crd, last_crd;
+
+    /* This loop will cycle through the memory space
+     * until it ends up without any new dirty regions
+     * or it has done a full cycle.
+ */ + while (1) { + Prd current(NEXT_DIRTY_PAGE()); + + if (!current.value() || // Nothing dirty + // Next round through the memspace + (first_crd.value() && current.base() == first_crd.base()) || + (last_crd.value() && current.base() == last_crd.base())) + break; + + /* These pages are just _marked_ dirty in another data structure, + * the dirt manager. + * This structure might be able to apply some smart optimizations + * in the future like e.g. "don't resend pages too often which are dirtied + * with high access-frequency to reduce traffic", etc. + */ + _dirtman.mark_dirty(current); + + if (!first_crd.value()) first_crd = current; + last_crd = current; + } + + unsigned pages_enqueued = 0; + while (_dirtman.dirty_pages() > 0 && crds_sent < async_data.crd_count) { + Prd current = crds[crds_sent] = _dirtman.next_dirty(); + if (!current.value()) + // That's it for now. + break; + + _dirtman.mark_clean(current); + + if (!_socket->send_nonblocking(&crds[crds_sent], sizeof(*crds)) || + !_socket->send_nonblocking(current.base() + _physmem_start, + current.size())) + return 0; + + ++crds_sent; + pages_enqueued += 1 << current.order(); + } + + return pages_enqueued; +} + +bool Migration::send_memory(longrange_data &async_data) +{ + StopWatch lap_time(_mb->clock()); + StopWatch last_lap(_mb->clock()); + + unsigned transfer_rate; + unsigned dirtying_rate; + + /* The underlying socket architecture works a little bit different than + * BSD sockets, where you stuff data to be sent into the send buffer + * until it replies with "buffer is full, wait a bit". + * These sockets here asynchronously manage lists of pointers to memory ranges + * and their size and will pick up this data when it is actually needed. + * And because of this we have to preserve all memory ranges to be sent + * until they are ACKed. 
+ */ + + const unsigned page_limit = 1000; + unsigned pages_transferred; + unsigned round = 0; + async_data.crds = new Prd[page_limit]; + async_data.crd_count = page_limit; + + MessageRestore unplug_msg(MessageRestore::PCI_PLUG, NULL, false); + _mb->bus_restore.send(unplug_msg, false); + + do { + last_lap = lap_time; + lap_time.start(); + + if (!(pages_transferred = enqueue_all_dirty_pages(async_data)) || + !_socket->wait_complete()) + return false; + + lap_time.stop(); + + transfer_rate = lap_time.rate(pages_transferred << 12); + dirtying_rate = last_lap.rate(pages_transferred << 12); + Logging::printf("RND %u PAGE_CNT %5u TX %5u KB/s DRT %5u KB/s DELTA" + " %llu START %llu\n", + round, pages_transferred, transfer_rate, dirtying_rate, + lap_time.delta(), lap_time.abs_start()); + + assert(pages_transferred); + + _sendmem_total += pages_transferred << 12; + if (_sendmem == 0) _sendmem = _sendmem_total; + ++round; + } while (transfer_rate >= dirtying_rate); + + // The last transfer round with a frozen guest system will follow now + freeze_vcpus(); + + unsigned freeze_pages = 0; + while ((freeze_pages = enqueue_all_dirty_pages(async_data)) > 0) { + if (!_socket->wait_complete()) return false; + pages_transferred += freeze_pages; + } + + static Prd end_of_crds; + if (!pages_transferred || + !_socket->send_nonblocking(&end_of_crds, sizeof(end_of_crds))) + return false; + + Logging::printf("Enqueued the last %u dirty pages\n", pages_transferred); + return true; +} + +bool Migration::send_devices(longrange_data dat) +{ + // Send VCPU state +#if PORTED_TO_UNIX + unsigned vcpu_bytes = reinterpret_cast(&_vcpu_utcb->id+1); + vcpu_bytes -= reinterpret_cast(&_vcpu_utcb->mtd); + + if (!_socket->send(&_vcpu_utcb->mtd, vcpu_bytes)) + return false; +#endif + + /* There are multiple RESTORE_xxx types of restore messages. + * For each kind of device there is one. + * So we throw messages of each type onto the bus. 
+     */
+    MessageRestore restart_msg(MessageRestore::RESTORE_RESTART, NULL, true);
+    _mb->bus_restore.send_fifo(restart_msg);
+
+    // The RESTART round leaves the total number of bytes to expect in
+    // restart_msg.bytes. One extra header is reserved because the
+    // collection loop below always prepares a descriptor before it learns
+    // that no further device answers.
+    mword restore_bytes = restart_msg.bytes;
+    mword restore_bytes_consumed = 0;
+    dat.restore_buf = new char[restore_bytes + sizeof(MessageRestore)];
+
+    for (int i=MessageRestore::RESTORE_RESTART+1;
+         i < MessageRestore::RESTORE_LAST;
+         i++) {
+        /* A device will receive this message, write its state into it and
+         * return true. If it receives such a message again, it will return
+         * false. That's why we sent this RESTORE_RESTART message before.
+         * After the first time the bus returns false, we know that we saved
+         * all devices of this particular type.
+         */
+        while (1) {
+            // Catch a device set that reports more state than announced
+            // before the descriptor is written behind the buffer.
+            assert(restore_bytes_consumed <= restore_bytes);
+
+            char *msg_addr = dat.restore_buf + restore_bytes_consumed;
+            char *device_space = dat.restore_buf + restore_bytes_consumed
+                + sizeof(MessageRestore);
+
+            MessageRestore *rmsg = reinterpret_cast<MessageRestore *>(msg_addr);
+            memset(rmsg, 0, sizeof(*rmsg));
+
+            rmsg->devtype = i;
+            rmsg->write = true;
+            rmsg->space = device_space;
+            rmsg->magic_string = MessageRestore::MAGIC_STRING_DEVICE_DESC;
+
+            if (!_mb->bus_restore.send(*rmsg, true)) break;
+
+            restore_bytes_consumed += sizeof(*rmsg) + rmsg->bytes;
+        }
+    }
+    assert(restore_bytes == restore_bytes_consumed);
+
+    if (!_socket->send_nonblocking(dat.restore_buf, restore_bytes) ||
+            // Send "end of devices"
+            !_socket->send_nonblocking(&dat.end_of_devices,
+                sizeof(dat.end_of_devices)) ||
+            !_socket->wait_complete()) {
+        // The buffer is deliberately not freed here: the socket may still
+        // hold a pointer to it for the unfinished transfer.
+        Logging::printf("Error sending device states.\n");
+        return false;
+    }
+
+    /* wait_complete() returned successfully, so everything enqueued above
+     * has been ACKed and the buffer is not referenced by the socket any
+     * more. `dat` is a by-value copy of the caller's structure, i.e. the
+     * caller never gets to see this pointer; it has to be released here
+     * or it is lost.
+     */
+    delete [] dat.restore_buf;
+    dat.restore_buf = NULL;
+
+    // Restore current tsc offset at destination
+    dat.rdtsc = Cpu::rdtsc();
+    /* Compensate network latency.
+     * This was tested with cloning a VM displaying animations
+     * which were bound to TSC values. After migration,
+     * they only ran in sync when the following line was applied.
+     */
+    dat.rdtsc += dat.latency * _mb->clock()->freq() / 1000;
+
+    if (!_socket->send(&dat.rdtsc, sizeof(dat.rdtsc))) {
+        Logging::printf("Error sending RDTSC\n");
+        return false;
+    }
+
+    return true;
+}
+
+bool Migration::send(unsigned long addr, unsigned long port)
+{
+    StopWatch migration_timer(_mb->clock());
+    longrange_data async_data;
+
+    init_memrange_info();
+
+    // First connection: negotiate with the listener service on which port
+    // the freshly started target VMM waits for the guest state.
+    Logging::printf("Trying to connect...\n");
+    _socket = IpHelper::instance().connect(addr, port);
+    if (_socket == NULL) {
+        Logging::printf("Quitting: Got no TCP connection.\n");
+        return false;
+    }
+
+    Logging::printf("Established connection.\n");
+
+    unsigned mig_port = negotiate_port();
+
+    _socket->close();
+
+    if (!mig_port) return false;
+
+    // Second connection: the actual migration channel.
+    Logging::printf("Connecting to waiting target VM.\n");
+    _socket = IpHelper::instance().connect(addr, mig_port);
+    if (!_socket) {
+        Logging::printf("Error connecting to target VM.\n");
+        return false;
+    }
+    Logging::printf("OK, starting the actual migration.\n");
+
+    migration_timer.start();
+
+    // NOTE(review): a measured round trip of 0 is treated as failure here.
+    async_data.latency = send_ping();
+    if (!async_data.latency) {
+        Logging::printf("Ping failed.\n");
+        return false;
+    }
+    // Latency = round trip time / 2
+    async_data.latency >>= 1;
+    Logging::printf("Connection has a latency of %lu ms * freq %llu kHz"
+            " = %llu ticks.\n",
+            async_data.latency, _mb->clock()->freq() / 1000,
+            async_data.latency * _mb->clock()->freq() / 1000);
+
+    if (!send_header()) {
+        Logging::printf("Sending header failed.\n");
+        return false;
+    }
+    if (!send_memory(async_data)) {
+        Logging::printf("Sending guest state failed.\n");
+        return false;
+    }
+
+    if (!send_devices(async_data)) {
+        Logging::printf("Sending guest devices failed.\n");
+        return false;
+    }
+
+#ifdef DO_CHECKSUMMING
+    // Checksumming really makes the freeze gap larger
+    if (!checksums(false)) {
+        Logging::printf("Error while sending checksums.\n");
+        return false;
+    }
+#endif
+
+    // Uncomment this to "clone" the VM instead of migrating it away.
+ //unfreeze_vcpus(); + + _freeze_timer.stop(); + + _socket->close(); + + migration_timer.stop(); + + Logging::printf("Done. VM was frozen for %llu ms.\n", _freeze_timer.delta()); + Logging::printf("This migration took %llu seconds.\n", + migration_timer.delta() / 1000); + Logging::printf("%3lu%% (%lu MB) of guest memory resent due to change.\n", + 100u * (_sendmem_total - _sendmem) / _sendmem, + (_sendmem_total - _sendmem) / 1024u / 1024u); + + _dirtman.print_stats(); + + delete [] async_data.crds; + delete [] async_data.restore_buf; +#if PORTED_TO_UNIX + delete _vcpu_utcb; +#endif + + return true; +} + +PARAM_HANDLER(retrieve_guest, + "retrieve_guest: - Start a VMM instance which waits for guest", + " state input over network listening on ") +{ + MessageHostOp msg(MessageHostOp::OP_MIGRATION_RETRIEVE_INIT, argv[0]); + mb.bus_hostop.send(msg); +} diff --git a/include/nul/bus.h b/include/nul/bus.h index c59a5898..85c9eac1 100644 --- a/include/nul/bus.h +++ b/include/nul/bus.h @@ -4,6 +4,8 @@ * Copyright (C) 2007-2009, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * * This file is part of Vancouver. 
* * Vancouver is free software: you can redistribute it and/or modify @@ -17,6 +19,7 @@ */ #pragma once +#include "message.h" #include "service/logging.h" #include "service/string.h" @@ -41,17 +44,29 @@ template class DBus { typedef bool (*ReceiveFunction)(Device *, M&); + typedef bool (*EnqueueFunction)(Device *, M&, MessageIOThread::Mode, MessageIOThread::Sync, unsigned*, VCpu *vcpu); struct Entry { Device *_dev; ReceiveFunction _func; }; + struct EnqEntry + { + Device *_dev; + VCpu *_vcpu; + EnqueueFunction _func; + }; unsigned long _debug_counter; unsigned _list_count; unsigned _list_size; struct Entry *_list; + unsigned _callback_count; + unsigned _callback_size; + struct Entry *_iothread_callback; + struct EnqEntry *_iothread_enqueue; + /** * To avoid bugs we disallow the copy constuctor. */ @@ -65,6 +80,14 @@ class DBus _list = n; _list_size = new_size; }; + void set_callback_size(unsigned new_size) + { + Entry *n = new Entry[new_size]; + memcpy(n, _iothread_callback, _callback_count * sizeof(*_iothread_callback)); + if (_iothread_callback) delete [] _iothread_callback; + _iothread_callback = n; + _callback_size = new_size; + }; public: void add(Device *dev, ReceiveFunction func) @@ -76,13 +99,99 @@ class DBus _list_count++; } + void add_iothread_callback(Device *dev, ReceiveFunction func) + { + if (_callback_count >= _callback_size) + set_callback_size(_callback_size > 0 ? _callback_size * 2 : 1); + _iothread_callback[_callback_count]._dev = dev; + _iothread_callback[_callback_count]._func = func; + _callback_count++; + } + + void set_iothread_enqueue(Device *dev, EnqueueFunction func, VCpu *vcpu=nullptr) + { + if (_iothread_enqueue == nullptr) { + delete [] _iothread_enqueue; + _iothread_enqueue = new EnqEntry; + } + _iothread_enqueue->_dev = dev; + _iothread_enqueue->_vcpu = vcpu; + _iothread_enqueue->_func = func; + } + /** - * Send message LIFO. + * Send message directly. 
*/ - bool send(M &msg, bool earlyout = false) + bool send_direct_fifo(M &msg) + { + _debug_counter++; + bool res = false; + for (unsigned i = 0; i < _list_count; i++) + res |= _list[i]._func(_list[i]._dev, msg); + return res; + } + bool send_direct_rr(M &msg, unsigned *value) { + for (unsigned i = 0; i < _list_count; i++) + if (_list[i]._func(_list[(i + *value) % _list_count]._dev, msg)) { + *value = (i + *value + 1) % _list_count; + return true; + } + return false; + } + bool send_direct(M &msg, MessageIOThread::Mode mode, unsigned *value=nullptr) + { + if (mode == MessageIOThread::MODE_FIFO) return send_direct_fifo(msg); + if (mode == MessageIOThread::MODE_RR) return send_direct_rr(msg, value); + + _debug_counter++; + bool res = false; + bool earlyout = (mode == MessageIOThread::MODE_EARLYOUT); + for (unsigned i = _list_count; i-- && !(earlyout && res);) + res |= _list[i]._func(_list[i]._dev, msg); + return res; + } + + /** + * Send message LIFO synchronously. + */ + bool send_sync(M &msg, bool earlyout = false) { + bool res = false; + if (_iothread_callback) { + for (unsigned i = _callback_count; i-- && !res;) { + res |= _iothread_callback[i]._func(_iothread_callback[i]._dev, msg); + } + } + if (!res && _iothread_enqueue != nullptr) { + // No one wants the message directly, enqueue it. + if (_iothread_enqueue->_func(_iothread_enqueue->_dev, msg, earlyout ? MessageIOThread::MODE_EARLYOUT : MessageIOThread::MODE_NORMAL, MessageIOThread::SYNC_SYNC, nullptr, _iothread_enqueue->_vcpu)) + return true; + } _debug_counter++; + res = false; + for (unsigned i = _list_count; i-- && !(earlyout && res);) + res |= _list[i]._func(_list[i]._dev, msg); + return res; + } + + /** + * Send message LIFO asynchronously. 
+ */ + bool send(M &msg, bool earlyout = false) + { bool res = false; + if (_iothread_callback) { + for (unsigned i = _callback_count; i-- && !res;) { + res |= _iothread_callback[i]._func(_iothread_callback[i]._dev, msg); + } + } + if (!res && _iothread_enqueue != nullptr) { + // No one wants the message directly, enqueue it. + if (_iothread_enqueue->_func(_iothread_enqueue->_dev, msg, earlyout ? MessageIOThread::MODE_EARLYOUT : MessageIOThread::MODE_NORMAL, MessageIOThread::SYNC_ASYNC, nullptr, _iothread_enqueue->_vcpu)) + return true; + } + _debug_counter++; + res = false; for (unsigned i = _list_count; i-- && !(earlyout && res);) res |= _list[i]._func(_list[i]._dev, msg); return res; @@ -93,8 +202,19 @@ class DBus */ bool send_fifo(M &msg) { - _debug_counter++; bool res = false; + if (_iothread_callback) { + for (unsigned i = _callback_count; i-- && !res;) { + res |= _iothread_callback[i]._func(_iothread_callback[i]._dev, msg); + } + } + if (!res && _iothread_enqueue != nullptr) { + // No one wants the message directly, enqueue it. + if (_iothread_enqueue->_func(_iothread_enqueue->_dev, msg, MessageIOThread::MODE_FIFO, MessageIOThread::SYNC_ASYNC, nullptr, _iothread_enqueue->_vcpu)) + return true; + } + _debug_counter++; + res = false; for (unsigned i = 0; i < _list_count; i++) res |= _list[i]._func(_list[i]._dev, msg); return 0; @@ -107,6 +227,17 @@ class DBus */ bool send_rr(M &msg, unsigned &start) { + bool res = false; + if (_iothread_callback) { + for (unsigned i = _callback_count; i-- && !res;) { + res |= _iothread_callback[i]._func(_iothread_callback[i]._dev, msg); + } + } + if (!res && _iothread_enqueue != nullptr) { + // No one wants the message directly, enqueue it. 
+ if (_iothread_enqueue->_func(_iothread_enqueue->_dev, msg, MessageIOThread::MODE_RR, MessageIOThread::SYNC_ASYNC, &start, _iothread_enqueue->_vcpu)) + return true; + } _debug_counter++; for (unsigned i = 0; i < _list_count; i++) if (_list[i]._func(_list[(i + start) % _list_count]._dev, msg)) { @@ -138,5 +269,5 @@ class DBus } /** Default constructor. */ - DBus() : _debug_counter(0), _list_count(0), _list_size(0), _list(nullptr) {} + DBus() : _debug_counter(0), _list_count(0), _list_size(0), _list(nullptr), _callback_count(0), _callback_size(0), _iothread_callback(nullptr), _iothread_enqueue(nullptr) {} }; diff --git a/include/nul/iphelper.h b/include/nul/iphelper.h new file mode 100644 index 00000000..417599ec --- /dev/null +++ b/include/nul/iphelper.h @@ -0,0 +1,146 @@ +/* + * IpHelper class + * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * + * This file is part of Seoul. + * + * Seoul is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Seoul is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details. + * + * This was previously used for network communication in the NUL userland + * when virtualizing with the NOVA microhypervisor. Functionality was not + * ported and rather the interface is described here to ease porting to the + * UNIX socket interface. + */ + +#ifndef __IPHELPER_H +#define __IPHELPER_H + +#include + +#define IP_AS_UL(a, b, c, d) ((((d) & 0xff) << 24) | (((c) & 0xff) << 16) | (((b) & 0xff) << 8) | ((a) & 0xff)) + +class IpHelper; + +class TcpSocket +{ + friend IpHelper; + + private: + + bool _outgoing; + unsigned short _local_port; + unsigned short _remote_port; + + // Indicates if we are connected. 
+ bool _connected; + // A socket can still be "connected" although closed, if there is still data to be sent. + // After sending this data, the socket will finally be marked as "closed" + bool _closed; + + /* ... semaphores used to be initialized here */ + /* ... buffers ... */ + + /* Only to be called by IpHelper */ + TcpSocket(unsigned caps) + : _remote_port(0), _connected(false), _closed(true) + { /* ... */ } + + /* Forbidden and hence not implemented: */ + TcpSocket(TcpSocket const&); + void operator=(TcpSocket const&); + + public: + /* + * Methods for the end user! + */ + + bool block_until_connected() { return false; } + + /* Close this socket. */ + void close() {} + + /* Blocking receive function. Difference to BSD sockets: + * Does _not_ return before it received the expected number of bytes. */ + bool receive(void *data, unsigned bytes) { return false; } + + /* Blocking send function. Difference to BSD sockets: + * Does _not_ return before the user ACKed all bytes. */ + bool send(void *data, unsigned bytes) { return false; } + + /* Nonblocking send function. Returns immediately. + * Call wait_complete after you pushed multiple send_nonblocking() calls. */ + bool send_nonblocking(void *data, unsigned bytes) { return false; } + + /* Wait until the receiver ACKed all packets sent from this socket. */ + bool wait_complete() { return false; } +}; + +class IpHelper +{ + private: + /* ... */ + + unsigned long long _mac; + + mword _ip; + mword _netmask; + mword _gateway; + + TcpSocket *_sockets; + + IpHelper() : _mac(0), _ip(0), _netmask(0), _gateway(0), _sockets(NULL) + {}; + + + /* Forbidden, hence not implemented: */ + IpHelper(IpHelper const&); + void operator=(IpHelper const&); + + public: + /* This is a singleton */ + static IpHelper & instance() + { + static IpHelper instance; + return instance; + } + + /* === These methods are to be used from the network thread === */ + + /* Attach a KernelSemaphore to this and get notified on timeout events. 
+ * You will better attach this to network events, too. */ + unsigned timer_sm() { return 0; /* This used to return a network timer semaphor capability */ } + + /* Call this after the semaphore let you through to reprogram for the next timeout */ + void check_timeout() {} + + /* Call this regularly to let sockets send */ + void sockets_send() {} + + /* Feed this method regularly with new incoming packets from the network. */ + void do_tcpip(unsigned char* data, unsigned size) {} + + /* === These methods are to be used by the actual end user === */ + + /* Call this once at the beginning to initialize everything. */ + bool init(/* ... */) { return false;} + + /* Block-wait until IpHelper gets an IP and return its value. */ + mword get_ip() { return 0; } + + /* Connect to port at given IP and return a working socket. */ + TcpSocket * connect(unsigned addr, unsigned port) { return NULL; } + + /* Make a socket listen on port and return a TcpSocket object when a connection + * was established */ + TcpSocket * listen(unsigned port) { return NULL; } +}; + +#endif /* __IPHELPER_H */ diff --git a/include/nul/message.h b/include/nul/message.h index bf0a31c3..43c28588 100644 --- a/include/nul/message.h +++ b/include/nul/message.h @@ -6,6 +6,9 @@ * Copyright (C) 2009, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * * This file is part of Vancouver. 
* * Vancouver is free software: you can redistribute it and/or modify @@ -22,6 +25,46 @@ #include #include +class VCpu; +struct MessageIOThread +{ + VCpu *vcpu; + enum Type { + TYPE_IOIN, + TYPE_IOOUT, + TYPE_MEM, + TYPE_INPUT, + TYPE_IRQ, + TYPE_IRQLINES, + TYPE_IRQNOTIFY, + TYPE_NETWORK, + TYPE_DISK, + TYPE_DISKCOMMIT, + TYPE_LEGACY, + TYPE_TIME, + TYPE_TIMER, + TYPE_TIMEOUT, + TYPE_PCICFG, + TYPE_HOSTOP, + TYPE_CPU, + } type; + enum Mode { + MODE_NORMAL, + MODE_EARLYOUT, + MODE_FIFO, + MODE_RR + } mode; + enum Sync { + SYNC_SYNC, + SYNC_ASYNC + } sync; + unsigned *value; + void *ptr; + void *sem; + + MessageIOThread(Type _type, Mode _mode, Sync _sync, void *_ptr) : vcpu(nullptr), type(_type), mode(_mode), sync(_sync), value(nullptr), ptr(_ptr), sem(nullptr) {} + MessageIOThread(Type _type, Mode _mode, Sync _sync, unsigned *_value, void *_ptr) : vcpu(nullptr), type(_type), mode(_mode), sync(_sync), value(_value), ptr(_ptr), sem(nullptr) {} +}; /****************************************************/ /* IOIO messages */ @@ -110,7 +153,8 @@ struct MessageMemRegion uintptr_t start_page; unsigned count; char * ptr; - MessageMemRegion(uintptr_t _page) : page(_page), count(0), ptr(0) {} + bool actual_physmem; + MessageMemRegion(uintptr_t _page) : page(_page), count(0), ptr(0), actual_physmem(false) {} }; @@ -254,6 +298,8 @@ struct MessageLegacy INTR, DEASS_INTR, INTA, + UNLOCK, + CHECK_INTR, } type; unsigned value; MessageLegacy(Type _type, unsigned _value=0) : type(_type), value(_value) {} @@ -449,6 +495,10 @@ struct MessageHostOp OP_VCPU_BLOCK, OP_VCPU_RELEASE, OP_WAIT_CHILD, + OP_NEXT_DIRTY_PAGE, + OP_GET_CONFIG_STRING, + OP_MIGRATION_RETRIEVE_INIT, + OP_MIGRATION_START, } type; union { unsigned long value; @@ -560,6 +610,23 @@ struct MessageAcpi MessageAcpi(unsigned _parent_bdf, unsigned _bdf, unsigned char _pin): type(ACPI_GET_IRQ), parent_bdf(_parent_bdf), bdf(_bdf), pin(_pin), gsi(~0u) {} }; +/** + * Virtual ACPI: Fixed and General Purpose Events + * can be 
triggered with these messages + */ +struct MessageAcpiEvent +{ + enum EventType { + ACPI_EVENT_FIXED, + ACPI_EVENT_GP, + ACPI_EVENT_HOT_UNPLUG, + ACPI_EVENT_HOT_REPLUG, + } type; + unsigned num; + + MessageAcpiEvent(EventType _type, unsigned _num) + : type(_type), num(_num) {}; +}; /** * Resource discovery between device models is done by the virtual @@ -748,4 +815,52 @@ struct MessageNetwork MessageNetwork(unsigned type, unsigned client) : type(type), mac(0), client(client) { } }; +struct MessageRestore +{ + enum networkStrings { + MAGIC_STRING_DEVICE_DESC = 0x8D06F00D + }; + + enum restoreTypes { + RESTORE_RESTART = 0, // RESTART is sent over the restore bus for initialization + RESTORE_TIMEOUTLIST, + RESTORE_PIC, + RESTORE_LAPIC, + RESTORE_PIT, + RESTORE_VGA, + RESTORE_NIC, + RESTORE_ACPI, + RESTORE_VCPU, + RESTORE_LAST, + // This one is acutally a restore device type: + // vga.cc will react on this, printing messages on the guest screen. + VGA_DISPLAY_GUEST, + VGA_VIDEOMODE, + // This is for pass-through devices. They will un-/replug themselves + // out of/into the guest before/after live migration + PCI_PLUG, + }; + unsigned long magic_string; + // Use these enums on devtype + unsigned devtype; + // The device will note down how many bytes of this structure it actually uses. + mword bytes; + // Two variables which every device type can use for identification + unsigned id1; + unsigned id2; + // write=true: Writing a device state onto disk. 
false: Reading back from disk + bool write; + + // Space for saving the device state + char *space; + + MessageRestore(unsigned _devtype, char *_space, bool _write) : + magic_string(MAGIC_STRING_DEVICE_DESC), devtype(_devtype), + bytes(0), id1(0), id2(0), write(_write), space(_space) + {} + bool magic_string_check() { return magic_string == MAGIC_STRING_DEVICE_DESC; } +}; + + + /* EOF */ diff --git a/include/nul/migration.h b/include/nul/migration.h new file mode 100644 index 00000000..67ca0cae --- /dev/null +++ b/include/nul/migration.h @@ -0,0 +1,278 @@ +/** + * Base migration code declarations + * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * + * This file is part of Seoul. + * + * Seoul is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Seoul is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details. + */ + +#include +#include +#include +#include + +class Desc +{ + protected: + unsigned _value; + Desc(unsigned v) : _value(v) {} + public: + unsigned value() { return _value; } +}; + +/** + * A page range descriptor; + * Introduced, because NUL provided CRDs for this... 
+ **/
+class Prd
+{
+    protected:
+        /* Encoding as used by the accessors below:
+         * bits 12..31 page offset, bits 7..11 order, bits 0..4 attributes.
+         * A raw value of 0 is what the callers treat as "no range".
+         */
+        unsigned _value;
+
+    public:
+        unsigned order() { return ((_value >> 7) & 0x1f); }
+        unsigned size()  { return 1 << (order() + 12); }
+        unsigned base()  { return _value & ~0xfff; }
+        unsigned attr()  { return _value & 0x1f; }
+        unsigned cap()   { return _value >> 12; }
+        unsigned value() { return _value; }
+
+        explicit Prd(unsigned offset, unsigned order, unsigned attr) : _value((offset << 12) | (order << 7) | attr) { }
+        explicit Prd(unsigned v) : _value(v) {}
+        explicit Prd() : Prd(0) {}
+};
+
+/* The DirtManager is fed with CRDs of dirty page regions.
+ * There's an internal bitmap which can be used for future resend-optimizations
+ * as well as generating resend-statistics.
+ *
+ * _map        one bit per page, set while the page is marked dirty
+ * _cnt        per page: how often it changed from clean to dirty (saturating)
+ * _dirt_count number of bits currently set in _map
+ */
+class DirtManager
+{
+    private:
+        unsigned *_map;
+        unsigned _pages;
+
+        unsigned char *_cnt;
+
+        unsigned _dirt_count;
+
+        /* Number of words allocated for the bitmap. This reserves one word
+         * per sizeof(unsigned) pages, which is more than one bit per page
+         * needs; the size is kept as it was.
+         */
+        static unsigned map_words(unsigned pages)
+        {
+            return (pages + sizeof(unsigned) - 1) / sizeof(unsigned);
+        }
+
+    public:
+        // Mark every page of the range described by `dirty`.
+        void mark_dirty(Prd dirty)
+        {
+            unsigned base = dirty.base() >> 12;
+            unsigned pages = 1 << dirty.order();
+            for (unsigned i=base; i < base + pages; ++i) mark_dirty(i);
+        }
+
+        void mark_dirty(unsigned page)
+        {
+            // _map and _cnt only cover _pages entries.
+            assert(page < _pages);
+
+            if (!Cpu::get_bit(_map, page)) {
+                ++_dirt_count;
+                // Saturate instead of wrapping around to 0.
+                if (_cnt[page] != 0xff) ++_cnt[page];
+            }
+            Cpu::set_bit(_map, page, true);
+        }
+
+        // Clear every page of the range described by `clean`.
+        void mark_clean(Prd clean)
+        {
+            unsigned base = clean.base() >> 12;
+            unsigned pages = 1 << clean.order();
+            for (unsigned i=base; i < base + pages; ++i) mark_clean(i);
+        }
+
+        void mark_clean(unsigned page)
+        {
+            assert(page < _pages);
+
+            // Only pages that are really dirty count; otherwise the
+            // unsigned counter would underflow.
+            if (Cpu::get_bit(_map, page)) --_dirt_count;
+            Cpu::set_bit(_map, page, false);
+        }
+
+        unsigned dirty_pages() { return _dirt_count; }
+
+        /* Return the first run of dirty pages, truncated to a power-of-two
+         * length (order = Cpu::bsr(run length)), or an empty Prd if no
+         * page is dirty.
+         * NOTE(review): a run of exactly one page at page 0 encodes to the
+         * raw value 0, which callers take for "nothing dirty".
+         */
+        Prd next_dirty() {
+            unsigned base = 0, len = 0;
+
+            for (; base < _pages; ++base) {
+                len = 0;
+                // Never look at bits behind the last managed page.
+                while (base + len < _pages && Cpu::get_bit(_map, base + len))
+                    ++len;
+
+                if (len > 0) break;
+            }
+
+            if (len == 0) return Prd();
+
+            return Prd(base, Cpu::bsr(len), 0);
+        }
+
+        /* Maximum of in[pos-size .. pos+size], clipped to [0, limit-1].
+         * Expects limit > 0 and pos < limit.
+         */
+        static inline unsigned char fir_max(unsigned char *in, unsigned limit, unsigned pos, int size)
+        {
+            int beg = pos - size;
+            int end = pos + size;
+            beg = VMM_MAX(beg, static_cast<int>(0));
+            end = VMM_MIN(end, static_cast<int>(limit - 1));
+
+            int width = end - beg;
+            // width is 0 for a one-element input, which is a valid window.
+            assert(width >= 0);
+            assert(width < 2 * size + 1);
+
+            unsigned max = 0;
+            for (int i=beg; i <= end; ++i) max = VMM_MAX(max, in[i]);
+
+            return static_cast<unsigned char>(max);
+        }
+
+        // Print average and variance of the per-page remap counters.
+        void print_stats()
+        {
+            // Nothing to report (and nothing to divide by) without pages.
+            if (_pages == 0 || _cnt == NULL) return;
+
+            const unsigned size = 20;
+            // Histogram of capped fault counts; the cap itself (== size)
+            // is a valid index, hence size + 1 zero-initialized slots.
+            unsigned bucket[size + 1] = { 0 };
+
+            unsigned sx = 0, sqx = 0;
+
+            unsigned char *smooth[3];
+
+            smooth[0] = new unsigned char[_pages];
+            smooth[1] = new unsigned char[_pages];
+            smooth[2] = new unsigned char[_pages];
+
+            for (unsigned i=0; i < _pages; ++i) {
+                unsigned faults = VMM_MIN(_cnt[i], size);
+                ++bucket[faults];
+
+                sx += faults;
+                sqx += faults * faults;
+
+                for (unsigned j=0; j < 3; ++j)
+                    smooth[j][i] = fir_max(_cnt, _pages, i, j*50+1);
+            }
+
+            // E[x] and E[x^2] - E[x]^2, computed in floating point.
+            const float n = static_cast<float>(_pages);
+            float avg = static_cast<float>(sx) / n;
+            float var = static_cast<float>(sqx) / n - avg * avg;
+            // Rounding may push a tiny variance below zero.
+            if (var < 0) var = 0;
+
+            Logging::printf("# avg = %u, var = %u\n",
+                    static_cast<unsigned>(avg), static_cast<unsigned>(var));
+
+#if 0
+            /* This generates a really long list needed for plotting
+             * statistics
+             */
+            Logging::printf("# Remaps per page:\n");
+            for (unsigned i = 0; i < _pages; ++i)
+                Logging::printf("REMAP %#x %u %u %u %u\n",
+                        i, _cnt[i], smooth[0][i], smooth[1][i], smooth[2][i]);
+#endif
+
+            delete [] smooth[0];
+            delete [] smooth[1];
+            delete [] smooth[2];
+        }
+
+        DirtManager() : _map(NULL), _pages(0), _cnt(NULL), _dirt_count(0) {}
+        DirtManager(unsigned pages) : _map(NULL), _pages(pages), _cnt(NULL), _dirt_count(0)
+        {
+            _map = new unsigned[map_words(pages)];
+            // The bitmap has to start out with all pages clean.
+            memset(_map, 0, map_words(pages) * sizeof(*_map));
+            _cnt = new unsigned char[pages];
+            memset(_cnt, 0, pages * sizeof(*_cnt));
+        }
+
+        /* The object owns _map and _cnt, so copies have to be deep;
+         * member-wise copies would free both buffers twice.
+         */
+        DirtManager(const DirtManager &other)
+            : _map(NULL), _pages(other._pages), _cnt(NULL), _dirt_count(other._dirt_count)
+        {
+            if (other._map) {
+                _map = new unsigned[map_words(_pages)];
+                memcpy(_map, other._map, map_words(_pages) * sizeof(*_map));
+            }
+            if (other._cnt) {
+                _cnt = new unsigned char[_pages];
+                memcpy(_cnt, other._cnt, _pages * sizeof(*_cnt));
+            }
+        }
+
+        // Copy-and-swap: `other` is already a private copy and takes the
+        // old buffers with it when it is destroyed.
+        DirtManager &operator=(DirtManager other)
+        {
+            unsigned *map = _map;          _map = other._map;               other._map = map;
+            unsigned char *cnt = _cnt;     _cnt = other._cnt;               other._cnt = cnt;
+            unsigned pages = _pages;       _pages = other._pages;           other._pages = pages;
+            unsigned dirt = _dirt_count;   _dirt_count = other._dirt_count; other._dirt_count = dirt;
+            return *this;
+        }
+
+        ~DirtManager()
+        {
+            if (_map) delete [] _map;
+            if (_cnt) delete [] _cnt;
+        }
+};
+
+class Migration : public StaticReceiver
+{
+    Motherboard *_mb;
+#if PORTED_TO_UNIX
+    Hip *_hip;
+    CapAllocator *_tls;
+#endif
+
+    char *_physmem_start;
+    unsigned long _physmem_size;
+
+    CpuState *_vcpu_utcb;
+#if PORTED_TO_UNIX
+    KernelSemaphore _vcpu_blocked_sem;
+    KernelSemaphore _vcpu_sem;
+#endif
+    bool _vcpu_should_block;
+
+    TcpSocket *_socket;
+
+    unsigned long
_sendmem; + unsigned long _sendmem_total; + + StopWatch _freeze_timer; + + /* Because of asynchronous send operations, all + * data to be send has to be preserved somewhere until + * it is ACKED. That's what this structure is for. + */ + struct longrange_data { + unsigned crd_count; + Prd *crds; + + timevalue rdtsc; + char *restore_buf; + MessageRestore end_of_devices; + + mword latency; + + longrange_data() : + crd_count(0), crds(NULL), + rdtsc(0), restore_buf(NULL), end_of_devices(0xdead, NULL, true), + latency(0) {} + }; + + DirtManager _dirtman; + + void init_memrange_info(); + void print_welcomescreen(); + bool puts_guestscreen(const char *str, bool reset_screen); + + void freeze_vcpus(); + void unfreeze_vcpus(); + + unsigned negotiate_port(); + bool send_header(); + timevalue send_ping(); + bool send_devices(longrange_data dat); + unsigned enqueue_all_dirty_pages(longrange_data &async_data); + bool send_memory(longrange_data &async_data); + + void receive_header(); + bool receive_ping(); + void receive_memory(); + bool receive_guestdevices(CpuState *vcpu_utcb); + + bool chksum_page(unsigned page_nr, mword &their_chksum, bool compare); + bool checksums(bool retrieve); + + public: + enum RestoreModes { + MODE_OFF = 0, + MODE_SEND, + MODE_RECEIVE + }; + + bool listen(unsigned port , CpuState *vcpu_utcb); + bool send(unsigned long addr, unsigned long port); + + // To be called from do_recall + void save_guestregs(CpuState *utcb); + + bool receive(MessageHostOp &msg); + + Migration(Motherboard *mb); + ~Migration(); +}; diff --git a/include/nul/migration_structs.h b/include/nul/migration_structs.h new file mode 100644 index 00000000..86893592 --- /dev/null +++ b/include/nul/migration_structs.h @@ -0,0 +1,115 @@ +/** + * Migration protocol structures + * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * + * This file is part of Seoul. 
+ *
+ * Seoul is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Seoul is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details.
+ */
+
+
+/*
+ * Carries a command length (cmdlen) tagged with MAGIC_STRING_MIGINIT.
+ * magic_string_check() reports whether the tag is intact.
+ */
+struct MigrationInit {
+#define MAGIC_STRING_MIGINIT 0xb00b00
+  mword cmdlen;
+  mword magic_string;
+
+  MigrationInit() : cmdlen(0), magic_string(MAGIC_STRING_MIGINIT) {}
+  MigrationInit(mword _cmdlen) : cmdlen(_cmdlen), magic_string(MAGIC_STRING_MIGINIT) {}
+  bool magic_string_check() { return magic_string == MAGIC_STRING_MIGINIT; }
+};
+
+/*
+ * Carries a success flag and a port, tagged with MAGIC_STRING_MIGANSWER.
+ * The default constructor yields success = 0; constructing from a port
+ * sets success = 1.
+ */
+struct MigrationAnswer {
+#define MAGIC_STRING_MIGANSWER 0xfeeb1ed0
+  mword success;
+  mword port;
+  mword magic_string;
+
+  MigrationAnswer() : success(0), port(0), magic_string(MAGIC_STRING_MIGANSWER) {}
+  MigrationAnswer(unsigned _port) : success(1), port(_port), magic_string(MAGIC_STRING_MIGANSWER) {}
+  bool magic_string_check() { return magic_string == MAGIC_STRING_MIGANSWER; }
+};
+
+/*
+ * This is an index structure telling us how many memory pages and device pages
+ * are saved to the hard disk, enabling us to calculate offsets later.
+ */
+struct RestoreIndex {
+  unsigned mem_pages;
+  unsigned dev_pages;
+  // Pads the structure to 0x1000 bytes.
+  char space[0x1000 - 2*sizeof(unsigned)];
+};
+
+/*
+ * Header tagged with MAGIC_STRING_HEADER, carrying a version and a video
+ * mode. Every field is initialised by both constructors so that no
+ * indeterminate bytes are ever read or transmitted; version defaults to 0
+ * and is expected to be filled in by the user of this struct.
+ */
+struct MigrationHeader {
+#define MAGIC_STRING_HEADER 0xb0015366
+  mword magic_string;
+  mword version;
+  mword videomode;
+
+  MigrationHeader()
+    : magic_string(MAGIC_STRING_HEADER), version(0), videomode(0) {}
+  MigrationHeader(mword _videomode)
+    : magic_string(MAGIC_STRING_HEADER), version(0), videomode(_videomode) {}
+  bool magic_string_check() { return magic_string == MAGIC_STRING_HEADER; }
+};
+
+/*
+ * Page count tagged with MAGIC_STRING_ADDR_SPACE. A default-constructed
+ * object is zeroed, so magic_string_check() fails until it has been
+ * filled with real data.
+ */
+struct AddressSpaceIndex {
+#define MAGIC_STRING_ADDR_SPACE 0xBADB0B
+  unsigned long magic_string;
+  unsigned long num_pages;
+
+  AddressSpaceIndex() : magic_string(0), num_pages(0) {}
+  AddressSpaceIndex(unsigned long pages) : magic_string(MAGIC_STRING_ADDR_SPACE), num_pages(pages) {}
+  bool magic_string_check() { return magic_string == MAGIC_STRING_ADDR_SPACE; }
+};
+
+/*
+ * Descriptor count and total byte count, tagged with
+ * MAGIC_STRING_PAGE_INDEX. The default constructor zeroes both counters.
+ */
+struct PageTransferIndex {
+#define MAGIC_STRING_PAGE_INDEX 0x51CD06
+  unsigned long magic_string;
+  unsigned long desc_num;
+  unsigned long total_bytes;
+
+  PageTransferIndex()
+    : magic_string(MAGIC_STRING_PAGE_INDEX), desc_num(0), total_bytes(0) {}
+  PageTransferIndex(unsigned long descs, unsigned long bytes)
+    : magic_string(MAGIC_STRING_PAGE_INDEX), desc_num(descs), total_bytes(bytes) {}
+  bool magic_string_check() { return magic_string == MAGIC_STRING_PAGE_INDEX; }
+};
+
+/*
+ * Sum of the squares of every unsigned long word in `count` pages of
+ * 0x1000 bytes starting at `offset`. Returns 0 for a null offset and
+ * asserts that offset is 0x1000-aligned.
+ */
+static unsigned long checksum_pages(void *offset, unsigned long count)
+{
+  if (offset == 0) return 0;
+  assert(!
(reinterpret_cast(offset) & 0xfff) );
+
+  unsigned long chksum = 0;
+  unsigned long *ptr = reinterpret_cast(offset);
+
+  // NOTE(review): i is `unsigned` while the bound is `unsigned long`; where
+  // the two differ in width a large enough count would wrap i -- confirm.
+  for (unsigned i=0; i < count * 0x1000 / sizeof(unsigned long); i++)
+    chksum += ptr[i] * ptr[i];
+
+  return chksum;
+}
+
+/*
+ * Describes `count` pages at address `offset` together with their
+ * checksum_pages() value, tagged with MAGIC_STRING_PAGE_DESC.
+ * The default constructor leaves every field uninitialised.
+ */
+struct PageTransferDesc {
+#define MAGIC_STRING_PAGE_DESC 0xDEADC0DE
+  unsigned long magic_string;
+  unsigned long offset;
+  unsigned long count;
+  unsigned long checksum;
+
+  PageTransferDesc() {}
+  PageTransferDesc(unsigned long _offset, unsigned long _count)
+    : magic_string(MAGIC_STRING_PAGE_DESC), offset(_offset), count(_count),
+    checksum(checksum_pages(reinterpret_cast(_offset), _count)) { }
+  // Recomputes the checksum from offset/count, stores it and returns it.
+  unsigned long recalculate_checksums()
+  { return (checksum = checksum_pages(reinterpret_cast(offset), count)); }
+  bool magic_string_check() { return magic_string == MAGIC_STRING_PAGE_DESC; }
+};
+
+#define MAGIC_STRING_PAGE_BORDER 0xC03DD00D
diff --git a/include/nul/motherboard.h b/include/nul/motherboard.h
index f4ce9b7b..6fb2f149 100644
--- a/include/nul/motherboard.h
+++ b/include/nul/motherboard.h
@@ -4,6 +4,8 @@
  * Copyright (C) 2007-2010, Bernhard Kauer
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -48,6 +50,7 @@ class Motherboard
  public:
   DBus bus_acpi;
+  DBus bus_acpi_event;
   DBus bus_ahcicontroller;
   DBus bus_apic;
   DBus bus_bios;
@@ -79,6 +82,8 @@ class Motherboard
   DBus bus_timer; ///< Request for timers
   DBus bus_vesa;
+  DBus bus_restore;
+
   VCpu *last_vcpu;
   Clock *clock() { return _clock; }
   Hip *hip() { return _hip; }
diff --git a/include/nul/templates.h b/include/nul/templates.h
index c883b747..2bea14fd 100644
--- a/include/nul/templates.h
+++ b/include/nul/templates.h
@@ -6,6 +6,8 @@
  *
  * This file is part of Vancouver.
  *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
  * Vancouver is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
@@ -26,6 +28,10 @@ class StaticReceiver : public Device
 public:
   template
   static bool receive_static(Device *o, M& msg) { return static_cast(o)->receive(msg); }
+  // Static trampoline: forwards to the enqueue() member of the object behind o.
+  template
+  static bool enqueue_static(Device *o, M& msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu=nullptr) { return static_cast(o)->enqueue(msg, mode, sync, value, vcpu); }
+  // Static trampoline: forwards to the claim() member of the object behind o.
+  template
+  static bool claim_static(Device *o, M& msg) { return static_cast(o)->claim(msg); }
   StaticReceiver() : Device(__PRETTY_FUNCTION__) {};
 };

diff --git a/include/nul/timer.h b/include/nul/timer.h
index 0f4a821f..530beb4f 100644
--- a/include/nul/timer.h
+++ b/include/nul/timer.h
@@ -4,6 +4,9 @@
  * Copyright (C) 2007-2008, Bernhard Kauer
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -19,6 +22,10 @@
 #include "service/cpu.h"
 #include "service/math.h"
+#include
+#include
+#include
+
 typedef unsigned long long timevalue;
@@ -78,7 +85,7 @@ class Clock
  * Keeping track of the timeouts.
  */
 template
-class TimeoutList
+class TimeoutList : public StaticReceiver>
 {
   class TimeoutEntry
   {
@@ -91,6 +98,8 @@ class TimeoutList
   };
   TimeoutEntry _entries[ENTRIES];
+
+  // Must stay directly after _entries: receive(MessageRestore&) measures the
+  // saved region as the distance from _entries to this member.
+  bool _restore_processed;
 public:
   /**
    * Alloc a new timeout object.
@@ -187,5 +196,64 @@ class TimeoutList
     _entries[0]._timeout = ~0ULL;
   }
-  TimeoutList() { init(); }
+  TimeoutList() : _restore_processed(false) { init(); }
+
+// REL_PTR turns a pointer into one relative to `offset` (ptr - offset);
+// ABS_PTR is the inverse (ptr + offset). Both keep the type of ptr.
+#define REL_PTR(ptr, offset) ( \
+    reinterpret_cast<__typeof__(ptr)>( \
+      reinterpret_cast(ptr) - reinterpret_cast(offset)) \
+)
+#define ABS_PTR(ptr, offset) ( \
+    reinterpret_cast<__typeof__(ptr)>( \
+      reinterpret_cast(ptr) + reinterpret_cast(offset)) \
+)
+
+  /**
+   * Save (msg.write) or restore the timeout entries through msg.space.
+   *
+   * RESTORE_RESTART re-arms this object and adds its space requirement to
+   * msg.bytes, returning false. Any other devtype than
+   * RESTORE_TIMEOUTLIST, or a second call after a successful one, also
+   * returns false.
+   *
+   * The saved copy stores _prev/_next relative to _entries and every
+   * timeout except entry 0 relative to the current TSC; restoring
+   * reverses both conversions.
+   */
+  bool receive(MessageRestore &msg)
+  {
+    // Size of the region from _entries up to _restore_processed.
+    const mword bytes = reinterpret_cast(&_restore_processed)
+      - reinterpret_cast(_entries);
+
+    if (msg.devtype == MessageRestore::RESTORE_RESTART) {
+      _restore_processed = false;
+      msg.bytes += bytes + sizeof(msg);
+      return false;
+    }
+
+    if (msg.devtype != MessageRestore::RESTORE_TIMEOUTLIST || _restore_processed) return false;
+
+    unsigned long long rdtsc = Cpu::rdtsc();
+
+    if (msg.write) {
+      msg.bytes = bytes;
+      memcpy(msg.space, reinterpret_cast(_entries), bytes);
+
+      // Do not mess around with timeout entries of the running guest,
+      // since we may want to let it continue after saving
+      TimeoutEntry *entries = reinterpret_cast(msg.space);
+      for (unsigned i=0; i < ENTRIES; i++) {
+        entries[i]._prev = REL_PTR(entries[i]._prev, _entries);
+        entries[i]._next = REL_PTR(entries[i]._next, _entries);
+
+        if (i == 0) continue;
+
+        // Timeouts already in the past are stored as 0.
+        if (entries[i]._timeout <= rdtsc)
+          entries[i]._timeout = 0;
+        else
+          entries[i]._timeout -= rdtsc;
+      }
+    }
+    else {
+      // NOTE(review): msg.bytes is not compared against `bytes` before the
+      // copy -- confirm the sender guarantees a matching size.
+      memcpy(reinterpret_cast(_entries), msg.space, bytes);
+      for (unsigned i=0; i < ENTRIES; i++) {
+        _entries[i]._prev = ABS_PTR(_entries[i]._prev, _entries);
+        _entries[i]._next = ABS_PTR(_entries[i]._next, _entries);
+
+        if (i == 0) continue;
+        _entries[i]._timeout += rdtsc;
+      }
+    }
+
+    //Logging::printf("%s Timeoutlist\n", msg.write ? "Saved" : "Restored");
+    // Ignore further requests until the next RESTORE_RESTART.
+    _restore_processed = true;
+    return true;
+  }
 };
diff --git a/include/nul/vcpu.h b/include/nul/vcpu.h
index 145a0184..4b7fc057 100644
--- a/include/nul/vcpu.h
+++ b/include/nul/vcpu.h
@@ -4,6 +4,9 @@
  * Copyright (C) 2010, Bernhard Kauer
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -36,7 +39,8 @@ struct CpuMessage {
     TYPE_WBINVD,
     TYPE_CHECK_IRQ,
     TYPE_CALC_IRQWINDOW,
-    TYPE_SINGLE_STEP
+    TYPE_SINGLE_STEP,
+    TYPE_ADD_TSC_OFF,
   } type;
   union {
     struct {
@@ -84,7 +88,8 @@ struct LapicEvent {
   enum Type{
     INTA,
     RESET,
-    INIT
+    INIT,
+    CHECK_INTR
   } type;
   unsigned value;
   LapicEvent(Type _type) : type(_type) { if (type == INTA) value = ~0u; }
@@ -122,7 +127,8 @@ class VCpu
     EVENT_DEBUG = 1 << 17,
     STATE_BLOCK = 1 << 18,
     STATE_WAKEUP = 1 << 19,
-    EVENT_HOST = 1 << 20
+    EVENT_HOST = 1 << 20,
+    EVENT_RESUME = 1 << 21
   };
   unsigned long long inj_count;
diff --git a/include/service/time.h b/include/service/time.h
index ab0239be..5ce605cf 100644
--- a/include/service/time.h
+++ b/include/service/time.h
@@ -4,6 +4,8 @@
  * Copyright (C) 2009, Bernhard Kauer
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
  * This file is part of Vancouver.
*
 * Vancouver is free software: you can redistribute it and/or modify
@@ -97,3 +99,29 @@ static inline void gmtime(timevalue seconds, struct tm_simple *tm)
   tm->mon = m + 1;
   tm->mday = days + 1;
 }
+
+/*
+ * Measures elapsed time between start() and stop() using
+ * Clock::clock(_frequency) readings. Both readings are 0 until the
+ * respective call has been made.
+ */
+class StopWatch
+{
+private:
+  Clock *_clock;
+  unsigned _frequency;
+  timevalue _tic, _toc;
+
+public:
+  void start() { _tic = _clock->clock(_frequency); }
+  // Takes the end reading and returns the elapsed time since start().
+  timevalue stop() { _toc = _clock->clock(_frequency); return delta(); }
+  // Difference of the last stop() and start() readings.
+  timevalue delta() { return _toc - _tic; }
+
+  timevalue abs_start() { return _tic; }
+  timevalue abs_stop() { return _toc; }
+
+  // Returns B/ms, which is actually kB/s (if using default frequency)
+  // Yields 0 when no time has elapsed, avoiding a division by zero.
+  unsigned rate(mword bytes) {
+    if (delta()) return bytes / delta();
+    else return 0;
+  }
+
+  StopWatch(Clock *clock, unsigned frequency = 1000 /* ms */)
+    : _clock(clock), _frequency(frequency), _tic(0), _toc(0)
+  {}
+};
diff --git a/model/acpicontroller.cc b/model/acpicontroller.cc
new file mode 100644
index 00000000..d293a112
--- /dev/null
+++ b/model/acpicontroller.cc
@@ -0,0 +1,346 @@
+/**
+ * ACPI controller model
+ *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
+ * This file is part of Seoul.
+ *
+ * Seoul is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Seoul is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details.
+ */
+
+
+#include
+#include
+
+#include "nul/motherboard.h"
+#include "executor/bios.h"
+
+// Values the guest writes to PORT_SMI_CMD to switch ACPI mode on or off.
+#define CMD_ACPI_ENABLE 0xab
+#define CMD_ACPI_DISABLE 0xba
+
+#define PORT_SMI_CMD 0xaeae
+
+/* The pm1 event register group is somewhat complicated.
+ * port numbers follow a partition rule of the register block.
+ * see ACPI spec 4.7.3.1
+ */
+#define PM1_EVT_LEN 4
+#define PORT_PM1A_EVENT_BLK 0xaea6
+#define PORT_PM1B_EVENT_BLK 0xaeaa
+#define PORT_PM1A_EVENT_STATUS (PORT_PM1A_EVENT_BLK)
+#define PORT_PM1A_EVENT_ENABLE (PORT_PM1A_EVENT_BLK + (PM1_EVT_LEN) / 2) // 0xa6 + 4/2 = 0xa8
+#define PORT_PM1B_EVENT_STATUS (PORT_PM1B_EVENT_BLK)
+#define PORT_PM1B_EVENT_ENABLE (PORT_PM1B_EVENT_BLK + (PM1_EVT_LEN) / 2) // 0xaa + 4/2 = 0xac
+
+#define PM1_CNT_LEN 2
+#define PORT_PM1A_CONTROL 0xaeb0
+#define PORT_PM1B_CONTROL 0xaeb2
+
+#define PORT_GPE0_STATUS 0xaeb4
+#define PORT_GPE1_STATUS 0xaeb5
+#define PORT_GPE0_ENABLE (PORT_GPE0_STATUS + 2)
+#define PORT_GPE1_ENABLE (PORT_GPE1_STATUS + 2)
+
+#define PORT_PCIU 0xae00
+#define PORT_PCID 0xae04
+#define PORT_B0EJ 0xae08
+
+
+class AcpiController : public StaticReceiver, public BiosCommon
+{
+  private:
+    unsigned short _pm1a_status;
+    unsigned short _pm1a_enable;
+    unsigned short _pm1a_control;
+
+    unsigned short _pm1b_status;
+    unsigned short _pm1b_enable;
+    unsigned short _pm1b_control;
+
+    unsigned char _gpe0_sts;
+    unsigned char _gpe0_en;
+    unsigned char _gpe1_sts;
+    unsigned char _gpe1_en;
+
+    unsigned _b0ej; // write-only register
+    unsigned _pciu; // read-only, REFRESH register (card plugged in)
+    unsigned _pcid; // read-only, DETACH register (card to be unplugged)
+
+    bool _processed;
+
+    StopWatch _watch;
+
+  public:
+    /**
+     * Latch general-purpose event `event_nr` and raise the SCI (IRQ 9)
+     * if an enabled event is pending.
+     *
+     * Events 0-7 are recorded in _gpe0_sts, events 8-15 in _gpe1_sts.
+     * Event numbers of 16 and above cannot be represented and are
+     * ignored.
+     */
+    void trigger_gpe(unsigned event_nr)
+    {
+      // Only 16 status bits exist; this also keeps the shifts below in range.
+      if (event_nr >= 16) return;
+
+      // Activate this event in the appropriate register
+      _gpe0_sts |= 0x00ff & (1 << event_nr);
+      _gpe1_sts |= (0xff00 & (1 << event_nr)) >> 8;
+
+      // If this event is masked by the guest, then just ignore it.
+      // The parentheses matter: `0 == sts & en` would compare first and
+      // mask afterwards. The interrupt is only skipped when neither
+      // bank has an enabled event pending.
+      if (!(_gpe0_sts & _gpe0_en) && !(_gpe1_sts & _gpe1_en))
+        return;
+
+      // Send the guest an SCI
+      MessageIrqLines msg(MessageIrq::ASSERT_IRQ, 9);
+      _mb.bus_irqlines.send(msg);
+    }
+
+    /**
+     * Translate an ACPI event message into a GPE. Hot (re)plug and
+     * unplug mark the slot bit in _pciu or _pcid and raise GPE 1.
+     * Fixed events and unknown types return false.
+     */
+    bool receive(MessageAcpiEvent &msg) {
+      switch (msg.type) {
+        case MessageAcpiEvent::ACPI_EVENT_GP:
+          trigger_gpe(msg.num);
+          break;
+        case
MessageAcpiEvent::ACPI_EVENT_HOT_REPLUG:
+          // Mark the slot in the "plugged in" register, then raise GPE 1.
+          _pciu |= (1 << msg.num);
+          trigger_gpe(1);
+          break;
+        case MessageAcpiEvent::ACPI_EVENT_HOT_UNPLUG:
+          // Started here, stopped when the guest writes PORT_B0EJ.
+          _watch.start();
+          _pcid |= (1 << msg.num);
+          trigger_gpe(1);
+          break;
+
+        case MessageAcpiEvent::ACPI_EVENT_FIXED:
+        default:
+          return false;
+      }
+
+      return true;
+    }
+
+    // Publishes this device's I/O ports and block lengths in the "FACP" table.
+    bool receive(MessageDiscovery &msg) {
+      if (msg.type != MessageDiscovery::DISCOVERY) return false;
+
+      /* The following FADT entries will tell the guest kernel
+       * how to interact with the system when receiving
+       * System Control Interrupts (SCI).
+       * Only the GPE part is important for hot plugging, but
+       * all the PM-stuff is mandatory for event management
+       * to work.
+       */
+      discovery_write_dw("FACP", 56, PORT_PM1A_EVENT_BLK);
+      discovery_write_dw("FACP", 60, PORT_PM1B_EVENT_BLK);
+      discovery_write_dw("FACP", 64, PORT_PM1A_CONTROL);
+      discovery_write_dw("FACP", 68, PORT_PM1B_CONTROL);
+      discovery_write_dw("FACP", 88, PM1_EVT_LEN, 1);
+      discovery_write_dw("FACP", 89, PM1_CNT_LEN, 1);
+
+      // NOTE(review): GPE0_BLK (0xaeb4, length 4) and GPE1_BLK (0xaeb5,
+      // length 4) overlap as advertised here -- confirm this is intended.
+      discovery_write_dw("FACP", 80, PORT_GPE0_STATUS, 4); // GPE0_BLK
+      discovery_write_dw("FACP", 84, PORT_GPE1_STATUS, 4); // GPE1_BLK
+
+      discovery_write_dw("FACP", 92, 4, 1); // GPE0_BLK_LEN
+      discovery_write_dw("FACP", 93, 4, 1); // GPE1_BLK_LEN
+      discovery_write_dw("FACP", 94, 16, 1); // GPE1_BASE (offset)
+
+      /* This is used at boot once. Linux will write
+       * CMD_ACPI_ENABLE via system IO using port PORT_SMI_CMD
+       * to tell the mainboard it wants to use ACPI.
+       * If CMD_ACPI_ENABLE was defined as 0x00, the guest kernel
+       * would think that ACPI was always on. Therefore, this is
+       * optional and one could just erase the next three lines.
+       */
+      discovery_write_dw("FACP", 48, PORT_SMI_CMD);
+      discovery_write_dw("FACP", 52, CMD_ACPI_ENABLE, 1);
+      discovery_write_dw("FACP", 53, CMD_ACPI_DISABLE, 1);
+
+      return true;
+    }
+
+    /**
+     * Port read: copy the register mapped at msg.port into msg.value.
+     * Returns true for a port handled here, false otherwise.
+     */
+    bool receive(MessageIOIn &msg) {
+      switch (msg.port) {
+        case PORT_PM1A_EVENT_STATUS:
+          //Logging::printf("In on port pm1a EVENT STATUS: %x len %u\n", _pm1a_status, msg.type);
+          msg.value = _pm1a_status;
+          return true;
+        case PORT_PM1A_EVENT_ENABLE:
+          //Logging::printf("In on port pm1a EVENT ENABLE: %x len %u\n", _pm1a_enable, msg.type);
+          msg.value = _pm1a_enable;
+          return true;
+        case PORT_PM1A_CONTROL:
+          //Logging::printf("In on port pm1a CONTROL %x len %u\n", _pm1a_control, msg.type);
+          msg.value = _pm1a_control;
+          return true;
+
+        case PORT_PM1B_EVENT_STATUS:
+          //Logging::printf("In on port pm1b EVENT STATUS: %x len %u\n", _pm1b_status, msg.type);
+          msg.value = _pm1b_status;
+          return true;
+        case PORT_PM1B_EVENT_ENABLE:
+          //Logging::printf("In on port pm1b EVENT ENABLE: %x len %u\n", _pm1b_enable, msg.type);
+          msg.value = _pm1b_enable;
+          return true;
+        case PORT_PM1B_CONTROL:
+          //Logging::printf("In on port pm1b CONTROL %x len %u\n", _pm1b_control, msg.type);
+          msg.value = _pm1b_control;
+          return true;
+
+
+        case PORT_GPE0_STATUS:
+          //Logging::printf("In on port GPE0 STS: %x\n", _gpe0_sts);
+          msg.value = _gpe0_sts;
+          return true;
+        case PORT_GPE0_ENABLE:
+          //Logging::printf("In on port GPE0 EN %x\n", _gpe0_en);
+          msg.value = _gpe0_en;
+          return true;
+        case PORT_GPE1_STATUS:
+          //Logging::printf("In on port GPE1 STS: %x\n", _gpe1_sts);
+          msg.value = _gpe1_sts;
+          return true;
+        case PORT_GPE1_ENABLE:
+          //Logging::printf("In on port GPE1 EN %x\n", _gpe1_en);
+          msg.value = _gpe1_en;
+          return true;
+
+        case PORT_PCIU:
+          //Logging::printf("--- In on PCIU\n");
+          msg.value = _pciu;
+          return true;
+        case PORT_PCID:
+          //Logging::printf("--- In on PCID\n");
+          msg.value = _pcid;
+          return true;
+        default:;
+      }
+      return false;
+    }
+
+    // Port write handler; each case below handles one register.
+    bool receive(MessageIOOut &msg) {
+      switch (msg.port) {
+
case PORT_SMI_CMD:
+          /* During boot the guest kernel checks PORT_SMI_CMD
+           * in the ACPI FADT table. If SCI_EN is not set,
+           * the system is in legacy mode. Hence it sends the
+           * CMD_ACPI_ENABLE cmd it got from the FADT again to
+           * this port and then polls for SCI_EN until it is set.
+           * ACPI is then officially active. */
+          if (msg.value == CMD_ACPI_ENABLE) {
+            Logging::printf("Enabling ACPI for guest.\n");
+            _pm1a_control |= 1; // Setting SCI_EN bit
+          }
+          else if (msg.value == CMD_ACPI_DISABLE) {
+            Logging::printf("Disabling ACPI for guest.\n");
+            _pm1a_control &= ~1U;
+          }
+          return true;
+
+        // Writes to the PM1 status and control ports are accepted but
+        // leave the modelled registers unchanged.
+        case PORT_PM1A_EVENT_STATUS:
+          //Logging::printf("Out on port pm1a EVENT STATUS: %x len %u\n", msg.value, msg.type);
+          return true;
+        case PORT_PM1A_EVENT_ENABLE:
+          //Logging::printf("Out on port pm1a EVENT ENABLE: %x len %u\n", msg.value, msg.type);
+          _pm1a_enable = static_cast(msg.value);
+          return true;
+        case PORT_PM1A_CONTROL:
+          //Logging::printf("Out on port pm1a CONTROL %x len %u\n", msg.value, msg.type);
+          return true;
+
+
+        case PORT_PM1B_EVENT_STATUS:
+          //Logging::printf("Out on port pm1b EVENT STATUS: %x len %u\n", msg.value, msg.type);
+          return true;
+        case PORT_PM1B_EVENT_ENABLE:
+          //Logging::printf("Out on port pm1b EVENT ENABLE: %x len %u\n", msg.value, msg.type);
+          // The PM1b port updates the PM1b enable register. It used to
+          // overwrite _pm1a_enable, so reads of this port never saw the
+          // value written.
+          _pm1b_enable = static_cast(msg.value);
+          return true;
+        case PORT_PM1B_CONTROL:
+          //Logging::printf("Out on port pm1b CONTROL %x len %u\n", msg.value, msg.type);
+          return true;
+
+        // GPE status bits are cleared by writing 1 to them.
+        case PORT_GPE0_STATUS:
+          //Logging::printf("Out on port GPE0 STS: %x len %u\n", msg.value, msg.type);
+          _gpe0_sts &= ~ static_cast(msg.value);
+          return true;
+        case PORT_GPE0_ENABLE:
+          //Logging::printf("Out on port GPE0 EN %x len %u\n", msg.value, msg.type);
+          _gpe0_en = static_cast(msg.value);
+          return true;
+        case PORT_GPE1_STATUS:
+          //Logging::printf("Out on port GPE1 STS: %x\n", msg.value);
+          _gpe1_sts &= ~ static_cast(msg.value);
+          return true;
+        case PORT_GPE1_ENABLE:
+          //Logging::printf("Out on port GPE1 EN %x\n",
msg.value);
+          _gpe1_en = static_cast(msg.value);
+          return true;
+
+        case PORT_B0EJ:
+          // Guest acknowledges the unplug: stop the timer started on
+          // ACPI_EVENT_HOT_UNPLUG and clear the acknowledged _pcid bits.
+          _watch.stop();
+          Logging::printf("PCI hot-unplug confirmed by guest "
+              "(Output on B0EJ: %x) after %llu ms\n",
+              msg.value, _watch.delta());
+          _pcid &= ~msg.value;
+          //Logging::printf("PCIU: %x, PCID: %x\n", _pciu, _pcid);
+          return true;
+        default:;
+      }
+
+      /* Deassert this IRQ if all enabled events were cleared by the guest.
+       * This interrupt is thrown again otherwise. */
+      // NOTE(review): every handled port returns from inside the switch
+      // above, so this check only runs for ports this device does not
+      // handle -- confirm whether the status/enable cases were meant to
+      // reach it. The local `msg` below also shadows the parameter.
+      if (!(_pm1a_status & _pm1a_enable) &&
+          !(_pm1b_status & _pm1b_enable) &&
+          !(_gpe0_sts & _gpe0_en) &&
+          !(_gpe1_sts & _gpe1_en)) {
+        MessageIrqLines msg(MessageIrq::DEASSERT_IRQ, 9);
+        _mb.bus_irqlines.send(msg);
+      }
+
+      return false;
+    }
+
+    /**
+     * Save (msg.write) or restore the register state through msg.space.
+     * The region copied runs from _pm1a_status up to _processed, so it
+     * depends on the declaration order of the members.
+     * RESTORE_RESTART re-arms the device and accounts for its size.
+     */
+    bool receive(MessageRestore &msg)
+    {
+      const mword bytes = reinterpret_cast(&_processed)
+        -reinterpret_cast(&_pm1a_status);
+
+      if (msg.devtype == MessageRestore::RESTORE_RESTART) {
+        _processed = false;
+        msg.bytes += bytes + sizeof(msg);
+        return false;
+      }
+
+      if (msg.devtype != MessageRestore::RESTORE_ACPI || _processed) return false;
+
+      if (msg.write) {
+        msg.bytes = bytes;
+        memcpy(msg.space, reinterpret_cast(&_pm1a_status), bytes);
+      }
+      else {
+        memcpy(reinterpret_cast(&_pm1a_status), msg.space, bytes);
+      }
+
+      Logging::printf("%s ACPI controller\n", msg.write?"Saved":"Restored");
+
+      _processed = true;
+      return true;
+    }
+
+    // All registers start at zero; the stop watch uses the motherboard clock.
+    AcpiController(Motherboard &mb)
+      : BiosCommon(mb),
+      _pm1a_status(0), _pm1a_enable(0), _pm1a_control(0),
+      _pm1b_status(0), _pm1b_enable(0), _pm1b_control(0),
+      _gpe0_sts(0), _gpe0_en(0), _gpe1_sts(0), _gpe1_en(0),
+      _b0ej(0), _pciu(0), _pcid(0),
+      _processed(false), _watch(mb.clock())
+    { }
+};
+
+// Creates the controller and attaches it to the buses it listens on.
+PARAM_HANDLER(acpimodel,
+    "acpimodel - Capable of issuing ACPI events to the guest.")
+{
+  AcpiController * dev = new AcpiController(mb);
+  mb.bus_discovery .add(dev, AcpiController::receive_static);
+  mb.bus_ioin .add(dev, AcpiController::receive_static);
+  mb.bus_ioout .add(dev, AcpiController::receive_static);
+
mb.bus_acpi_event.add(dev, AcpiController::receive_static);
+  mb.bus_restore .add(dev, AcpiController::receive_static);
+}
diff --git a/model/intel82576vf.cc b/model/intel82576vf.cc
index abe9655e..612356ac 100644
--- a/model/intel82576vf.cc
+++ b/model/intel82576vf.cc
@@ -5,6 +5,8 @@
  * Copyright (C) 2010, Julian Stecklina
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -578,6 +580,40 @@ class Model82576vf : public StaticReceiver
     uint32 raw[3*4];
   } _msix;
+  unsigned _ip_address;
+  EthernetAddr _guest_uses_mac;
+  bool processed;
+
+  /**
+   * Learn the guest's MAC and IP address from an outgoing ARP frame.
+   *
+   * Frames whose type field at offset 12 reads 0x0608 have their sender
+   * hardware address (offset 14 + 8) and sender IP (offset 14 + 14)
+   * recorded in _guest_uses_mac and _ip_address. Other frames, and
+   * frames too short to contain those fields, are ignored.
+   */
+  void update_ip(unsigned char *packet, unsigned packet_len)
+  {
+    // The last byte read below is at offset 14 + 14 + 3; refuse shorter
+    // frames instead of reading past the end of the buffer.
+    if (packet_len < 14 + 14 + 4) return;
+
+    unsigned short packet_type = * reinterpret_cast(packet + 12);
+    if (packet_type == 0x0608) {
+      unsigned char *mac = packet + 14 + 8; // Source MAC address
+      unsigned char *ip = packet + 14 + 14; // Source IP address
+
+      EthernetAddr ethaddr(mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
+
+#if 0
+      Logging::printf("Sending packet type %x from MAC %08llx, IP %x\n",
+          static_cast(packet_type),
+          ethaddr.raw, *reinterpret_cast(ip));
+#endif
+
+      _guest_uses_mac = ethaddr;
+      _ip_address = * reinterpret_cast(ip);
+    }
+  }
+
+
+  // Sends an ARP request or reply that announces the learned MAC/IP pair
+  // to `addr`.
+  void arp_gratuitous(const EthernetAddr &addr, const bool request)
+  {
+    const arp_packet arp(_guest_uses_mac, addr, _ip_address,
+        request ?
0x100 /* ARP_REQUEST */ : 0x200 /* ARP_REPLY */);
+
+    MessageNetwork m(reinterpret_cast(&arp), sizeof(arp), 0);
+    _net.send(m);
+  }
+
   uint32 VTFRTIMER_compute()
   {
     // XXX
@@ -886,6 +922,62 @@ class Model82576vf : public StaticReceiver
     return false;
   }
+  /**
+   * Save (msg.write) or restore the NIC model through msg.space.
+   *
+   * The saved image is the member region from _mac up to `processed`,
+   * followed by the two 0x1000-byte register pages _local_rx_regs and
+   * _local_tx_regs. RESTORE_RESTART re-arms the device and accounts for
+   * its size.
+   */
+  bool receive(MessageRestore &msg)
+  {
+    const mword bytes = reinterpret_cast(&processed)
+      -reinterpret_cast(&_mac);
+
+    if (msg.devtype == MessageRestore::RESTORE_RESTART) {
+      processed = false;
+      msg.bytes += bytes + 2 * 0x1000 + sizeof(msg);
+      return false;
+    }
+
+    if (msg.devtype != MessageRestore::RESTORE_NIC || processed) return false;
+
+    if (msg.write) {
+      msg.bytes = bytes + 2 * 0x1000;
+      memcpy(msg.space, reinterpret_cast(&_mac), bytes);
+      memcpy(msg.space + bytes, reinterpret_cast(_local_rx_regs), 0x1000);
+      memcpy(msg.space + bytes + 0x1000, reinterpret_cast(_local_tx_regs), 0x1000);
+    }
+    else {
+      // The raw copy below overwrites these pointers with the values
+      // saved on the other side, so keep the local ones and put them
+      // back afterwards.
+      uint32 *local_rx_regs = _local_rx_regs;
+      uint32 *local_tx_regs = _local_tx_regs;
+      Clock *clock = _clock;
+
+      memcpy(reinterpret_cast(&_mac), msg.space, bytes);
+
+      _local_rx_regs = local_rx_regs;
+      _local_tx_regs = local_tx_regs;
+      _clock = clock;
+
+      memcpy(_local_rx_regs, msg.space + bytes, 0x1000);
+      memcpy(_local_tx_regs, msg.space + bytes + 0x1000, 0x1000);
+
+      // Re-point the queues at this object and its register pages.
+      _rx_queues[0].parent = this;
+      _rx_queues[0].regs = local_rx_regs;
+      _rx_queues[1].parent = this;
+      _rx_queues[1].regs = local_rx_regs + 0x100/4;
+      _tx_queues[0].parent = this;
+      _tx_queues[0].regs = local_tx_regs;
+      _tx_queues[1].parent = this;
+      _tx_queues[1].regs = local_tx_regs + 0x100/4;
+
+      // If an address was learned before saving, announce it three
+      // times via broadcast ARP requests.
+      if (_ip_address) {
+        Logging::printf("Trying to claim: MAC " MAC_FMT " IP %x\n",
+            MAC_SPLIT((&_guest_uses_mac)), _ip_address);
+        for (int i=0; i < 3; ++i)
+          arp_gratuitous(EthernetAddr(0xffffffffffffull), true);
+      }
+    }
+
+    Logging::printf("%s NIC\n", msg.write?"Saved":"Restored");
+    processed = true;
+    return true;
+  }
+
+
   Model82576vf(uint64 mac, DBus &net, DBus *bus_mem, DBus *bus_memregion,
       Clock *clock, DBus &timer,
@@ -895,7 +987,8 @@ class
Model82576vf : public StaticReceiver
     _clock(clock), _timer(timer), _mem_mmio(mem_mmio), _mem_msix(mem_msix),
     _txpoll_us(txpoll_us), _map_rx(map_rx), _bdf(bdf),
-    _promisc_default(promisc_default)
+    _promisc_default(promisc_default), _ip_address(0), _guest_uses_mac(0),
+    processed(false)
   {
     Logging::printf("Attached 82576VF model at %08x+0x4000, %08x+0x1000\n",
         mem_mmio, mem_msix);
@@ -946,6 +1039,7 @@ PARAM_HANDLER(intel82576vf,
   mb.bus_network. add(dev, &Model82576vf::receive_static);
   mb.bus_timeout. add(dev, &Model82576vf::receive_static);
   mb.bus_legacy. add(dev, &Model82576vf::receive_static);
+  // Lets the model take part in save/restore via receive(MessageRestore&).
+  mb.bus_restore. add(dev, &Model82576vf::receive_static);
 }

diff --git a/model/intel82576vf.h b/model/intel82576vf.h
index b857d378..7da55770 100644
--- a/model/intel82576vf.h
+++ b/model/intel82576vf.h
@@ -5,6 +5,8 @@
  * Copyright (C) 2010, Julian Stecklina
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
  * This file is part of Vancouver.
*
 * Vancouver is free software: you can redistribute it and/or modify
@@ -44,4 +46,38 @@ class Mta {
   Mta() : _bits() { }
 };
+/*
+ * Packed Ethernet + ARP frame used for gratuitous ARP announcements.
+ * The constructor fills both the Ethernet and the ARP sender address
+ * with `src`, zeroes the ARP target hardware address, and uses the same
+ * IP for sender and target. The multi-byte constants are written
+ * byte-swapped.
+ */
+struct arp_packet {
+  unsigned char destination[6];
+  unsigned char source[6];
+  unsigned short eth_type;
+  unsigned short hw_type;
+  unsigned short protocol_type;
+  unsigned char hwaddr_len;
+  unsigned char protocoladdr_len;
+  unsigned short operation;
+  unsigned char sender_hwaddr[6];
+  unsigned sender_ip;
+  unsigned char target_hwaddr[6];
+  unsigned target_ip;
+
+  arp_packet(EthernetAddr src, EthernetAddr dst, unsigned ip_addr,
+      unsigned short _operation)
+    :
+      eth_type(0x608), hw_type(0x100), protocol_type(0x8), hwaddr_len(6),
+      protocoladdr_len(4), operation(_operation),
+      sender_ip(ip_addr), target_ip(ip_addr)
+  {
+    memcpy(destination, dst.byte, 6);
+    memset(target_hwaddr, 0, 6);
+    memcpy(source, src.byte, 6);
+    memcpy(sender_hwaddr, src.byte, 6);
+  }
+
+  /**
+   * True when the frame's Ethernet source address equals `a`.
+   *
+   * Built from the six bytes of `source`. The previous version read
+   * `destination`, and did so through a wider integer cast than the
+   * six-byte field.
+   */
+  bool source_is(const EthernetAddr &a) const
+  {
+    EthernetAddr my_addr(source[0], source[1], source[2],
+        source[3], source[4], source[5]);
+    return my_addr == a;
+  }
+} __attribute__((packed));
+
 // EOF
diff --git a/model/lapic.cc b/model/lapic.cc
index bd89ea30..01367da3 100644
--- a/model/lapic.cc
+++ b/model/lapic.cc
@@ -4,6 +4,9 @@
  * Copyright (C) 2010, Bernhard Kauer
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
  * This file is part of Vancouver.
*
 * Vancouver is free software: you can redistribute it and/or modify
@@ -32,8 +35,11 @@
  */
 class Lapic : public DiscoveryHelper, public StaticReceiver
 {
+  // _regstart and _regend bracket the members generated by model/reg.h;
+  // receive(MessageRestore&) copies everything between the two markers.
+  int _regstart;
 #define VMM_REGBASE "../model/lapic.cc"
 #include "model/reg.h"
+  int _regend;
+
   enum {
     MAX_FREQ = 200000000,
     LVT_MASK_BIT = 16,
@@ -64,6 +70,7 @@ class Lapic : public DiscoveryHelper, public StaticReceiver
   bool _rirr[NUM_LVT];
   unsigned _lowest_rr;
+  bool _restore_processed;
   bool sw_disabled() { return ~_SVR & 0x100; }
   bool hw_disabled() { return ~_msr & 0x800; }
@@ -158,9 +165,6 @@ class Lapic : public DiscoveryHelper, public StaticReceiver
     timevalue delta = (now - _timer_start) >> _timer_dcr_shift;
     if (delta < _ICT) return _ICT - delta;
-    // we need to trigger the timer LVT
-    trigger_lvt(_TIMER_offset - LVT_BASE);
-
     // one shot?
     if (~_TIMER & (1 << 17)) {
       _timer_start = 0;
@@ -519,8 +523,6 @@ class Lapic : public DiscoveryHelper, public StaticReceiver
   {
     if (((_msr & 0xc00) != 0x800) || !in_range(msg.phys, _msr & ~0xfffull, 0x1000)) return false;
     if ((msg.phys & 0xf) || (msg.phys & 0xfff) >= 0x400) return false;
-
-
     if (msg.read)
       register_read((msg.phys >> 4) & 0x3f, *msg.ptr);
     else
@@ -551,7 +553,7 @@ class Lapic : public DiscoveryHelper, public StaticReceiver
     // no need to call update timer here, as the CPU needs to do an
     // EOI first
-    get_ccr(_mb.clock()->time());
+    trigger_lvt(_TIMER_offset - LVT_BASE);
     return true;
   }
@@ -604,6 +606,11 @@ class Lapic : public DiscoveryHelper, public StaticReceiver
       msg.value = _SVR & 0xff;
       update_irqs();
     }
+    // CHECK_INTR: report through msg.value whether prioritize_irq()
+    // currently yields a non-zero result.
+    else if (msg.type == LapicEvent::CHECK_INTR) {
+      unsigned irrv = prioritize_irq();
+      msg.value = (irrv > 0);
+      return true;
+    }
     else if (msg.type == LapicEvent::RESET)
       reset();
     else if (msg.type == LapicEvent::INIT)
@@ -738,8 +745,40 @@ class Lapic : public DiscoveryHelper, public StaticReceiver
     }
   }
+  // Save (msg.write) or restore two member regions through msg.space:
+  // _timer up to _restore_processed, and _regstart up to _regend.
+  bool receive(MessageRestore &msg)
+  {
+    const mword bytes = reinterpret_cast(&_restore_processed)
+      -reinterpret_cast(&_timer);
+
+    const mword bytes2 =
reinterpret_cast(&_regend) - reinterpret_cast(&_regstart);
-  Lapic(Motherboard &mb, VCpu *vcpu, unsigned initial_apic_id, unsigned timer) : _mb(mb), _vcpu(vcpu), _initial_apic_id(initial_apic_id), _timer(timer)
+    // RESTORE_RESTART re-arms the device and adds its size to msg.bytes.
+    if (msg.devtype == MessageRestore::RESTORE_RESTART) {
+      _restore_processed = false;
+      msg.bytes += bytes + bytes2 + sizeof(msg);
+      return false;
+    }
+
+    if (msg.devtype != MessageRestore::RESTORE_LAPIC || _restore_processed) return false;
+
+    // First region: _timer onwards. Second region: the generated registers.
+    if (msg.write) {
+      msg.bytes = bytes + bytes2;
+      memcpy(msg.space, reinterpret_cast(&_timer), bytes);
+      memcpy(msg.space + bytes, reinterpret_cast(&_regstart), bytes2);
+    }
+    else {
+      memcpy(reinterpret_cast(&_timer), msg.space, bytes);
+      memcpy(reinterpret_cast(&_regstart), msg.space + bytes, bytes2);
+    }
+
+    Logging::printf("%s LAPIC\n", msg.write?"Saved":"Restored");
+
+    // Ignore further requests until the next RESTORE_RESTART.
+    _restore_processed = true;
+    return true;
+  }
+
+
+  Lapic(Motherboard &mb, VCpu *vcpu, unsigned initial_apic_id, unsigned timer)
+    : _mb(mb), _vcpu(vcpu), _initial_apic_id(initial_apic_id), _timer(timer), _restore_processed(false)
   {
     // find a FREQ that is not too high
     for (_timer_clock_shift=0; _timer_clock_shift < 32; _timer_clock_shift++)
@@ -762,11 +801,12 @@ class Lapic : public DiscoveryHelper, public StaticReceiver
     mb.bus_apic.add(this, receive_static);
     mb.bus_timeout.add(this, receive_static);
     mb.bus_discovery.add(this,discover);
+    mb.bus_restore.add(this, receive_static);
+
     vcpu->executor.add(this, receive_static);
     vcpu->mem.add(this, receive_static);
     vcpu->memregion.add(this, receive_static);
     vcpu->bus_lapic.add(this, receive_static);
-
   }
 };

diff --git a/model/memorycontroller.cc b/model/memorycontroller.cc
index c61119eb..4bfedd6e 100644
--- a/model/memorycontroller.cc
+++ b/model/memorycontroller.cc
@@ -4,6 +4,8 @@
  * Copyright (C) 2009, Bernhard Kauer
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
  * This file is part of Vancouver.
*
 * Vancouver is free software: you can redistribute it and/or modify
@@ -29,6 +31,10 @@ class MemoryController : public StaticReceiver
   /****************************************************/
   /* Physmem access */
   /****************************************************/
+  // True when msg.phys lies in [_start, _end - 4), the same range that
+  // receive(MessageMem&) accepts.
+  bool claim(MessageMem &msg)
+  {
+    return ((msg.phys >= _start) && (msg.phys < (_end - 4)));
+  }
   bool receive(MessageMem &msg)
   {
     if ((msg.phys < _start) || (msg.phys >= (_end - 4))) return false;
@@ -45,6 +51,7 @@ class MemoryController : public StaticReceiver
     msg.start_page = _start >> 12;
     msg.count = (_end - _start) >> 12;
     msg.ptr = _physmem + _start;
+    msg.actual_physmem = true;
     return true;
   }
@@ -68,5 +75,6 @@ PARAM_HANDLER(mem,
   MemoryController *dev = new MemoryController(msg.ptr, start, end);
   // physmem access
   mb.bus_mem.add(dev, MemoryController::receive_static);
+  // Registers claim() as the I/O-thread callback for this bus.
+  mb.bus_mem.add_iothread_callback(dev, MemoryController::claim_static);
   mb.bus_memregion.add(dev, MemoryController::receive_static);
 }
diff --git a/model/pcidirect.cc b/model/pcidirect.cc
index e8daaeeb..a6ad5ce6 100644
--- a/model/pcidirect.cc
+++ b/model/pcidirect.cc
@@ -4,6 +4,9 @@
  * Copyright (C) 2007-2010, Bernhard Kauer
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
  * This file is part of Vancouver.
* * Vancouver is free software: you can redistribute it and/or modify @@ -266,7 +269,12 @@ class DirectPciDevice : public StaticReceiver, public HostVfPci return true; } + bool claim(MessageIrq &msg) { + for (unsigned i = 0; i < _irq_count; i++) + if (_host_irqs[i] == msg.line) return true; + return false; + } bool receive(MessageIrq &msg) { for (unsigned i = 0; i < _irq_count; i++) @@ -310,8 +318,11 @@ class DirectPciDevice : public StaticReceiver, public HostVfPci return _mb.bus_hostop.send(msg2); } - - + bool claim(MessageMem &msg) + { + unsigned *ptr; + return match_bars(msg.phys, 4, ptr); + } bool receive(MessageMem &msg) { unsigned *ptr; @@ -393,6 +404,19 @@ class DirectPciDevice : public StaticReceiver, public HostVfPci return true; } + bool receive(MessageRestore &msg) { + if (msg.devtype != MessageRestore::PCI_PLUG) return false; + + unsigned slot = (_guestbdf >> 3) & 0x1f; + + MessageAcpiEvent amsg(msg.write ? + MessageAcpiEvent::ACPI_EVENT_HOT_REPLUG : + MessageAcpiEvent::ACPI_EVENT_HOT_UNPLUG, + slot); + + _mb.bus_acpi_event.send(amsg); + return true; + } DirectPciDevice(Motherboard &mb, unsigned hbdf, unsigned guestbdf, bool assign, @@ -457,10 +481,13 @@ class DirectPciDevice : public StaticReceiver, public HostVfPci mb.bus_ioin.add(this, DirectPciDevice::receive_static); mb.bus_ioout.add(this, DirectPciDevice::receive_static); mb.bus_mem.add(this, DirectPciDevice::receive_static); + mb.bus_mem.add_iothread_callback(this, DirectPciDevice::claim_static); mb.bus_legacy.add(this, DirectPciDevice::receive_static); if (map_mode != MAP_MODE_DISABLED) mb.bus_memregion.add(this, DirectPciDevice::receive_static); mb.bus_hostirq.add(this, DirectPciDevice::receive_static); + mb.bus_hostirq.add_iothread_callback(this, DirectPciDevice::claim_static); + mb.bus_restore.add(this, DirectPciDevice::receive_static); //mb.bus_irqnotify.add(this, DirectPciDevice::receive_static); } }; diff --git a/model/pic8259.cc b/model/pic8259.cc index 438bfd44..4e88920f 100644 --- 
a/model/pic8259.cc +++ b/model/pic8259.cc @@ -4,6 +4,9 @@ * Copyright (C) 2007-2009, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * * This file is part of Vancouver. * * Vancouver is free software: you can redistribute it and/or modify @@ -67,10 +70,21 @@ class PicDevice : public StaticReceiver unsigned char _elcr; unsigned char _notify; + bool _restore_processed; + // helper functions bool is_slave() { return (_icw[ICW4] & ICW4_BUF) ? (~_icw[ICW4] & ICW4_MS) : _virq; } void rotate_prios() { _prio_lowest = (_prio_lowest+1) & 7; } - void specific_eoi(unsigned char irq) { _isr &= ~irq; propagate_irq(false); } + void specific_eoi(unsigned char irq) { + // We do the notify here to avoid races + unsigned char notify = __sync_fetch_and_and(&_notify, ~irq); + if (notify & irq) { + MessageIrqNotify msg(_virq, irq); + _bus_notify.send(msg); + } + _isr &= ~irq; + propagate_irq(false); + } void non_specific_eoi() { for (unsigned i=0; i<8; i++) @@ -108,14 +122,6 @@ class PicDevice : public StaticReceiver */ bool prioritize_irq(unsigned char &irq_index, bool int_ack) { - unsigned char tonotify = ~_irr & _notify; - if (tonotify) - { - Cpu::atomic_and(&_notify, ~tonotify); - MessageIrqNotify msg(_virq, tonotify); - _bus_notify.send(msg); - } - unsigned char state = _irr & ~_imr; for (unsigned i=0; i<8; i++) { @@ -131,6 +137,7 @@ class PicDevice : public StaticReceiver _isr |= irq; if (~_elcr & irq) Cpu::atomic_and(&_irr, ~irq); + if (_icw[ICW4] & ICW4_AEOI) { non_specific_eoi(); @@ -182,7 +189,7 @@ class PicDevice : public StaticReceiver } } else { - Logging::printf("PicDevice::%s() spurious IRQ? for irr %x isr %x imr %x %x\n", __func__, _irr, _isr, res, _imr); + Logging::printf("PicDevice::%s() spurious IRQ? 
for irr %x isr %x imr %x %x\n", __func__, _irr, _isr, _imr, res); res = 7; } res += _icw[ICW2]; @@ -196,6 +203,13 @@ class PicDevice : public StaticReceiver */ bool receive(MessageLegacy &msg) { + if (msg.type == MessageLegacy::CHECK_INTR) { + if (_virq) return false; + unsigned char vec; + bool ret = prioritize_irq(vec, false); + msg.value = (ret) ? (0xff << 8) | vec : vec; + return true; + } if (msg.type != MessageLegacy::INTA) return false; unsigned char vec; get_irqvector(vec); @@ -328,8 +342,6 @@ class PicDevice : public StaticReceiver if (in_range(msg.line, _virq, 8)) { unsigned char irq = 1 << (msg.line - _virq); - if (msg.type == MessageIrq::ASSERT_NOTIFY) - Cpu::atomic_or(&_notify, irq); if (msg.type == MessageIrq::DEASSERT_IRQ) { @@ -343,7 +355,11 @@ class PicDevice : public StaticReceiver { if (msg.line == 0) COUNTER_INC("pirq0"); else COUNTER_INC("pirqN"); - Cpu::atomic_or(&_irr, irq); + if (msg.type == MessageIrq::ASSERT_NOTIFY) + Cpu::atomic_or(&_notify, irq); + + Cpu::atomic_or(&_irr, irq); + propagate_irq(false); } return true; @@ -351,11 +367,44 @@ class PicDevice : public StaticReceiver return false; } + bool receive(MessageRestore &msg) + { + const mword bytes = reinterpret_cast(&_restore_processed) + -reinterpret_cast(&_base); + + if (msg.devtype == MessageRestore::RESTORE_RESTART) { + _restore_processed = false; + msg.bytes += bytes + sizeof(msg); + return false; + } + + if (msg.devtype != MessageRestore::RESTORE_PIC || _restore_processed) return false; + + if (msg.write) { + msg.bytes = bytes; + msg.id1 = _base; + msg.id2 = _upstream_irq; + memcpy(msg.space, reinterpret_cast(&_base), bytes); + + } + else { + if (msg.id1 != _base || msg.id2 != _upstream_irq) return false; + + memcpy(reinterpret_cast(&_base), msg.space, bytes); + } + + //Logging::printf("%s PIC (base %x, IRQ %x)\n", msg.write?"Saved":"Restored", msg.id1, msg.id2); + _restore_processed = true; + return true; + } + + + PicDevice(DBus &bus_irq, DBus &bus_pic, DBus &bus_legacy, DBus 
&bus_notify, unsigned short base, unsigned char irq, unsigned short elcr_base, unsigned char virq) : _bus_irq(bus_irq), _bus_pic(bus_pic), _bus_legacy(bus_legacy), _bus_notify(bus_notify), - _base(base), _upstream_irq(irq), _elcr_base(elcr_base), _virq(virq), _icw_mode(OCW1) + _base(base), _upstream_irq(irq), _elcr_base(elcr_base), _virq(virq), _icw_mode(OCW1), _restore_processed(false) { _icw[ICW1] = 0; reset_values(); @@ -384,8 +433,10 @@ PARAM_HANDLER(pic, mb.bus_ioout. add(dev, PicDevice::receive_static); mb.bus_irqlines.add(dev, PicDevice::receive_static); mb.bus_pic. add(dev, PicDevice::receive_static); + mb.bus_restore.add(dev, PicDevice::receive_static); if (!virq) mb.bus_legacy.add(dev, PicDevice::receive_static); virq += 8; + } diff --git a/model/pit8254.cc b/model/pit8254.cc index 5853e77c..b67d11af 100644 --- a/model/pit8254.cc +++ b/model/pit8254.cc @@ -4,6 +4,8 @@ * Copyright (C) 2007-2009, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * * This file is part of Vancouver. 
* * Vancouver is free software: you can redistribute it and/or modify @@ -370,6 +372,8 @@ class PitDevice : public StaticReceiver static const unsigned COUNTER = 3; PitCounter _c[COUNTER]; + bool _restore_processed; + public: bool receive(MessagePit &msg) @@ -421,9 +425,36 @@ class PitDevice : public StaticReceiver return true; } + bool receive(MessageRestore &msg) + { + const mword bytes = reinterpret_cast(&_restore_processed) + -reinterpret_cast(&_base); + + if (msg.devtype == MessageRestore::RESTORE_RESTART) { + _restore_processed = false; + msg.bytes += bytes + sizeof(msg); + return false; + } + + if (msg.devtype != MessageRestore::RESTORE_PIT || _restore_processed) return false; + + if (msg.write) { + msg.bytes = bytes; + memcpy(msg.space, reinterpret_cast(&_base), bytes); + + } + else { + memcpy(reinterpret_cast(&_base), msg.space, bytes); + } + + //Logging::printf("%s PIT\n", msg.write?"Saved":"Restored"); + _restore_processed = true; + return true; + } + PitDevice(Motherboard &mb, unsigned short base, unsigned irq, unsigned pit) - : _base(base), _addr(pit*COUNTER) + : _base(base), _addr(pit*COUNTER), _restore_processed(false) { for (unsigned i=0; i < COUNTER; i++) { @@ -449,4 +480,5 @@ PARAM_HANDLER(pit, mb.bus_ioin.add(dev, PitDevice::receive_static); mb.bus_ioout.add(dev, PitDevice::receive_static); mb.bus_pit.add(dev, PitDevice::receive_static); + mb.bus_restore.add(dev, PitDevice::receive_static); } diff --git a/model/pmtimer.cc b/model/pmtimer.cc index 2e38ea08..2c69a474 100644 --- a/model/pmtimer.cc +++ b/model/pmtimer.cc @@ -4,6 +4,8 @@ * Copyright (C) 2010, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * * This file is part of Vancouver. 
* * Vancouver is free software: you can redistribute it and/or modify @@ -34,6 +36,9 @@ class PmTimer : public DiscoveryHelper, public StaticReceiver unsigned _iobase; enum { FREQ = 3579545 }; public: + bool claim(MessageIOIn &msg) { + return (msg.port == _iobase && msg.type == MessageIOIn::TYPE_INL); + } bool receive(MessageIOIn &msg) { if (msg.port != _iobase || msg.type != MessageIOIn::TYPE_INL) return false; @@ -55,6 +60,7 @@ class PmTimer : public DiscoveryHelper, public StaticReceiver PmTimer(Motherboard &mb, unsigned iobase) : _mb(mb), _iobase(iobase) { _mb.bus_ioin.add(this, receive_static); + _mb.bus_ioin.add_iothread_callback(this, claim_static); _mb.bus_discovery.add(this, discover); } }; diff --git a/model/vcpu.cc b/model/vcpu.cc index c412c7ce..c3a086aa 100644 --- a/model/vcpu.cc +++ b/model/vcpu.cc @@ -4,6 +4,9 @@ * Copyright (C) 2010, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * * This file is part of Vancouver. 
* * Vancouver is free software: you can redistribute it and/or modify @@ -32,6 +35,7 @@ class VirtualCpu : public VCpu, public StaticReceiver volatile unsigned _event; volatile unsigned _sipi; + unsigned long _intr_hint; unsigned char debugioin[8192]; unsigned char debugioout[8192]; @@ -204,6 +208,12 @@ class VirtualCpu : public VCpu, public StaticReceiver msg.mtr_out |= MTD_STATE | MTD_INJ; if (!old_event) return; + + if (old_event & EVENT_RESUME) { + Cpu::atomic_and(&_event, ~(old_event & EVENT_RESUME)); + cpu->actv_state = 0; + } + if (old_event & (EVENT_DEBUG | EVENT_HOST)) { if (old_event & EVENT_DEBUG) dprintf("state %x event %8x eip %8x eax %x ebx %x edx %x esi %x\n", cpu->actv_state, old_event, cpu->eip, cpu->eax, cpu->ebx, cpu->edx, cpu->esi); @@ -267,16 +277,32 @@ class VirtualCpu : public VCpu, public StaticReceiver // if we can not inject interrupts or if we are in shutdown state return if (cpu->intr_state & 0x3 || ~cpu->efl & 0x200 || cpu->actv_state == 2) return; + unsigned long intr = _intr_hint; + LapicEvent msg2(LapicEvent::INTA); if (old_event & EVENT_EXTINT) { // EXTINT IRQ via MSI or IPI: INTA directly from the PIC Cpu::atomic_and(&_event, ~VCpu::EVENT_EXTINT); - receive(msg2); + LapicEvent check(LapicEvent::CHECK_INTR); + check.value = 0; + if (receive(check) && check.value) + receive(msg2); + else { + return; + } } - else if (old_event & EVENT_INTR) { + else if (intr & 1) { // interrupt from the APIC or directly via INTR line - INTA via LAPIC // do not clear EVENT_INTR here, as the PIC or the LAPIC will do this for us - bus_lapic.send(msg2, true); + LapicEvent check(LapicEvent::CHECK_INTR); + check.value = 0; + if (bus_lapic.send(check, true) && check.value) { + bus_lapic.send(msg2, true); + } else { + Cpu::cmpxchg8b(&_intr_hint, intr, (intr + 4) & ~1ULL); + Cpu::atomic_and(&_event, ~EVENT_INTR); + return; + } } else return; cpu->inj_info = msg2.value | 0x80000000; @@ -315,8 +341,12 @@ class VirtualCpu : public VCpu, public StaticReceiver void 
got_event(unsigned value) { COUNTER_INC("EVENT"); - if (value & DEASS_INTR) Cpu::atomic_and(&_event, ~EVENT_INTR); - if (!((~_event & value) & (EVENT_MASK | EVENT_DEBUG | EVENT_HOST))) return; + Cpu::atomic_xadd(&_intr_hint, 4); + if (value & EVENT_INTR) Cpu::atomic_or(&_intr_hint, 1); + + /* Avoid delayed DEASS messages. The event loop clears INTR itself. + if (value & DEASS_INTR) Cpu::atomic_and(&_event, ~EVENT_INTR);*/ + if (!((~(_event & ~EVENT_INTR) & value) & (EVENT_MASK | EVENT_DEBUG | EVENT_HOST))) return; // INIT or AP RESET - go to the wait-for-sipi state if ((value & EVENT_MASK) == EVENT_INIT) @@ -331,7 +361,7 @@ class VirtualCpu : public VCpu, public StaticReceiver */ if (Cpu::cmpxchg4b(&_sipi, 0, value)) return; - Cpu::atomic_or(&_event, STATE_WAKEUP | (value & (EVENT_MASK | EVENT_DEBUG | EVENT_HOST))); + Cpu::atomic_or(&_event, STATE_WAKEUP | (value & (EVENT_MASK | EVENT_DEBUG | EVENT_HOST | EVENT_RESUME))); MessageHostOp msg(MessageHostOp::OP_VCPU_RELEASE, _hostop_id, _event & STATE_BLOCK); @@ -342,6 +372,7 @@ class VirtualCpu : public VCpu, public StaticReceiver /** * Forward MEM requests to the motherboard. 
*/ + bool claim(MessageMem &msg) { /* The entire vCPU subsystem should be bypassing */ return true; } bool receive(MessageMem &msg) { return _mb.bus_mem.send(msg, true); } bool receive(MessageMemRegion &msg) { return _mb.bus_memregion.send(msg, true); } @@ -353,6 +384,11 @@ class VirtualCpu : public VCpu, public StaticReceiver return true; } + if (msg.type == MessageLegacy::UNLOCK) { + got_event(EVENT_RESUME); + return true; + } + // BSP receives only legacy signals if the LAPIC is disabled if (is_ap() || CPUID_EDX1 & (1 << 9)) return false; @@ -379,11 +415,23 @@ class VirtualCpu : public VCpu, public StaticReceiver msg.value = msg2.value; return true; } + if (msg.type == LapicEvent::CHECK_INTR) { + MessageLegacy check(MessageLegacy::CHECK_INTR); + _mb.bus_legacy.send(check); + msg.value = (check.value & 0xff00); + return true; + } return false; } + bool claim(CpuMessage &msg) { /* Entire vCPU subsystem should be bypassing */ return true; } bool receive(CpuMessage &msg) { + if (msg.type == CpuMessage::TYPE_ADD_TSC_OFF) { + _reset_tsc_off += msg.current_tsc_off; + return true; + } + // TSC drift compensation. if (msg.type != CpuMessage::TYPE_CPUID_WRITE && msg.mtr_in & MTD_TSC && ~msg.mtr_out & MTD_TSC) { COUNTER_INC("tsc adoption"); @@ -447,6 +495,7 @@ class VirtualCpu : public VCpu, public StaticReceiver case CpuMessage::TYPE_SINGLE_STEP: case CpuMessage::TYPE_WBINVD: case CpuMessage::TYPE_INVD: + case CpuMessage::TYPE_ADD_TSC_OFF: default: return false; } @@ -469,8 +518,10 @@ class VirtualCpu : public VCpu, public StaticReceiver // add to the busses executor. add(this, VirtualCpu::receive_static); + executor.add_iothread_callback(this, VirtualCpu::claim_static); bus_event.add(this, VirtualCpu::receive_static); mem. 
add(this, VirtualCpu::receive_static); + mem.add_iothread_callback(this, VirtualCpu::claim_static); memregion.add(this, VirtualCpu::receive_static); mb.bus_legacy.add(this, VirtualCpu::receive_static); bus_lapic.add(this, VirtualCpu::receive_static); diff --git a/model/vga.cc b/model/vga.cc index dadfe0ed..a1a18a46 100644 --- a/model/vga.cc +++ b/model/vga.cc @@ -4,6 +4,9 @@ * Copyright (C) 2007-2010, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * * This file is part of Vancouver. * * Vancouver is free software: you can redistribute it and/or modify @@ -45,6 +48,9 @@ class Vga : public StaticReceiver, public BiosCommon unsigned char _crt_index; unsigned _ebda_segment; unsigned _vbe_mode; + mword _last_videomode_request; + + bool _restore_processed; void puts_guest(const char *msg) { unsigned pos = _regs.cursor_pos - TEXT_OFFSET; @@ -174,6 +180,7 @@ class Vga : public StaticReceiver, public BiosCommon case 0x4f02: // set vbemode { ConsoleModeInfo info; + _last_videomode_request = cpu->ebx; unsigned index = get_vesa_mode(cpu->ebx & 0x0fff, &info); if (index != ~0u && info.attr & 1) { @@ -349,6 +356,12 @@ class Vga : public StaticReceiver, public BiosCommon return true; } + void set_videomode(mword videomode) + { + ConsoleModeInfo info; + _regs.mode = get_vesa_mode(videomode & 0x0fff, &info); + } + public: bool receive(MessageBios &msg) @@ -467,7 +480,10 @@ class Vga : public StaticReceiver, public BiosCommon return res; } - + bool claim(MessageMem &msg) + { + return ((in_range(msg.phys, _framebuffer_phys, _framebuffer_size)) || (in_range(msg.phys, LOW_BASE, LOW_SIZE))); + } bool receive(MessageMem &msg) { unsigned *ptr; @@ -522,9 +538,55 @@ class Vga : public StaticReceiver, public BiosCommon return true; } + bool receive(MessageRestore &msg) + { + const mword bytes = reinterpret_cast(&_restore_processed) + 
-reinterpret_cast(&_view); + + if (msg.devtype == MessageRestore::RESTORE_RESTART) { + _restore_processed = false; + msg.bytes += bytes + sizeof(msg); + return false; + } + + if (msg.devtype == MessageRestore::VGA_DISPLAY_GUEST) { + if (msg.write) memset(_framebuffer_ptr, 0, _framebuffer_size); + puts_guest(msg.space); + return true; + } + + if (msg.devtype == MessageRestore::VGA_VIDEOMODE) { + if (msg.write) { + set_videomode(msg.bytes); + MessageConsole cmsg(MessageConsole::TYPE_SWITCH_VIEW); + cmsg.view = _view; + _mb.bus_console.send(cmsg); + } + else + msg.bytes = _last_videomode_request; + return true; + } + + if (msg.devtype != MessageRestore::RESTORE_VGA || _restore_processed) return false; + + if (msg.write) { + msg.bytes = bytes; + memcpy(msg.space, reinterpret_cast(&_view), bytes); + + } + else { + memcpy(reinterpret_cast(&_view), msg.space, bytes); + set_videomode(_last_videomode_request); + } + + //Logging::printf("%s VGA\n", msg.write?"Saved":"Restored"); + _restore_processed = true; + return true; + } + Vga(Motherboard &mb, unsigned short iobase, char *framebuffer_ptr, uintptr_t framebuffer_phys, size_t framebuffer_size) - : BiosCommon(mb), _iobase(iobase), _framebuffer_ptr(framebuffer_ptr), _framebuffer_phys(framebuffer_phys), _framebuffer_size(framebuffer_size), _crt_index(0), _ebda_segment(), _vbe_mode() + : BiosCommon(mb), _iobase(iobase), _framebuffer_ptr(framebuffer_ptr), _framebuffer_phys(framebuffer_phys), _framebuffer_size(framebuffer_size), _crt_index(0), _ebda_segment(), _vbe_mode(), _last_videomode_request(), _restore_processed(false) { assert(!(framebuffer_phys & 0xfff)); assert(!(framebuffer_size & 0xfff)); @@ -574,7 +636,9 @@ PARAM_HANDLER(vga, mb.bus_ioout .add(dev, Vga::receive_static); mb.bus_bios .add(dev, Vga::receive_static); mb.bus_mem .add(dev, Vga::receive_static); + mb.bus_mem.add_iothread_callback(dev, Vga::claim_static); mb.bus_memregion.add(dev, Vga::receive_static); mb.bus_discovery.add(dev, Vga::receive_static); + 
mb.bus_restore.add(dev, Vga::receive_static); } diff --git a/nre/src/IOThread.cc b/nre/src/IOThread.cc new file mode 100644 index 00000000..4d7147c1 --- /dev/null +++ b/nre/src/IOThread.cc @@ -0,0 +1,522 @@ +/** + * I/O Thread + * + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * + * This file is part of Seoul. + * + * Seoul is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Seoul is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details. + */ + +#include "IOThread.h" + +#define IOTHREAD_DEBUG + +#ifdef IOTHREAD_DEBUG +static unsigned long msgcount[20] = {}; +static unsigned long maxqueue=0; +static unsigned long sync=0, async=0; +#endif + +static bool iothread_init = false; + +void IOThread::stats() { +#ifdef IOTHREAD_DEBUG + for (unsigned i=0; i<17; i++) { + Logging::printf("Type %u: Count %lu\n", i, msgcount[i]); + } + Logging::printf("Max queue size: %lu\n", maxqueue); + Logging::printf(" Sync messages: %lu\n", sync); + Logging::printf("ASync messages: %lu\n", async); +#endif +} + +void IOThread::reset() { + stats(); + if (iothread_init) return; + for(VCpu *vcpu = _mb->last_vcpu; vcpu; vcpu = vcpu->get_last()) { + vcpu->mem.set_iothread_enqueue(this, enqueue_static, vcpu); + vcpu->executor.set_iothread_enqueue(this, enqueue_static, vcpu); + } + iothread_init = true; +} + +nre::UserSm *IOThread::get_notify_sem(nre::Utcb *utcb) { + assert(utcb != 0); + for (auto it = _notify.begin(); it != _notify.end(); it++) { + if (it->utcb == utcb) return it->sem; + } + Notify *new_notify = new Notify; + new_notify->utcb = utcb; + new_notify->sem = new nre::UserSm(0); + _notify.append(new_notify); + return new_notify->sem; +} + +template +static void 
sync_msg(MessageIOThreadEle *iomsg) { + // We have to keep the message when it is synchronous. The receiver will delete it. + if (iomsg->sync == MessageIOThread::SYNC_SYNC) { + // Wake enqueuer + assert(iomsg->sem != nullptr); + reinterpret_cast(iomsg->sem)->up(); + } else { + delete (M*) iomsg->ptr; + } +} + +void IOThread::syncify_message(MessageIOThreadEle *msg) { + if (msg->sync == MessageIOThread::SYNC_SYNC) { + msg->sem = this->get_notify_sem(nre::Thread::current()->utcb()); + assert(msg->sem != nullptr); + } +} + +template +void IOThread::sync_message(MessageIOThreadEle *msg, MessageIOThread::Sync sync) { + if (sync == MessageIOThread::SYNC_SYNC) { + reinterpret_cast(msg->sem)->down(); + delete msg; + } +} + +bool IOThread::enq(MessageIOThreadEle *msg) { + nre::ScopedLock lock(&_lock); + +#ifdef IOTHREAD_DEBUG + msgcount[msg->type]++; + if (msg->sync == MessageIOThread::SYNC_SYNC) sync++; + else async++; +#endif + _queue.append(msg); + _block.up(); + return true; +} + +bool IOThread::enqueue(MessageDisk &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) { + if (nre::Thread::current()->utcb() == own_utcb) return false; + // Disk is always sync because of error check + sync = MessageIOThread::SYNC_SYNC; + MessageIOThreadEle *enq = new MessageIOThreadEle(MessageIOThread::TYPE_DISK, mode, sync, value, &msg); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageDiskCommit &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) { + if (nre::Thread::current()->utcb() == own_utcb) return false; + MessageDiskCommit *ptr = new MessageDiskCommit(msg.disknr, msg.usertag, msg.status); + MessageIOThreadEle *enq = new MessageIOThreadEle(MessageIOThread::TYPE_DISKCOMMIT, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageTime &msg, 
MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) { + if (nre::Thread::current()->utcb() == own_utcb) return false; + // Time must be sync + sync = MessageIOThread::SYNC_SYNC; + MessageIOThreadEle *enq = new MessageIOThreadEle(MessageIOThread::TYPE_TIME, mode, sync, value, &msg); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageTimer &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) { + if (nre::Thread::current()->utcb() == own_utcb) return false; + /* + * Timer slot requests are always sync. + * Because they are a result of an earlier message, timeout requests should never be enqueued. + */ + if (msg.type == MessageTimer::TIMER_NEW) sync = MessageIOThread::SYNC_SYNC; + MessageTimer *ptr; + if (sync == MessageIOThread::SYNC_ASYNC) { + ptr = new MessageTimer; + memcpy(ptr, &msg, sizeof(msg)); + } else { + ptr = &msg; + } + MessageIOThreadEle *enq = new MessageIOThreadEle(MessageIOThread::TYPE_TIMER, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageTimeout &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) { + if (nre::Thread::current()->utcb() == own_utcb) return false; + MessageTimeout *ptr; + if (sync == MessageIOThread::SYNC_ASYNC) { + ptr = new MessageTimeout(msg.nr, msg.time); + } else { + ptr = &msg; + } + MessageIOThreadEle *enq = new MessageIOThreadEle(MessageIOThread::TYPE_TIMEOUT, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageIOOut &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) { + if (nre::Thread::current()->utcb() == own_utcb) return false; + MessageIOOut *ptr; + if (sync == MessageIOThread::SYNC_ASYNC) { + ptr = new MessageIOOut(msg.type, msg.port, 
msg.value); + memcpy(ptr, &msg, sizeof(msg)); + } else { + ptr = &msg; + } + MessageIOThreadEle *enq = new MessageIOThreadEle(MessageIOThread::TYPE_IOOUT, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageIOIn &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) { + if (nre::Thread::current()->utcb() == own_utcb) return false; + // I/O port reads are always sync + sync = MessageIOThread::SYNC_SYNC; + MessageIOThreadEle *enq = new MessageIOThreadEle(MessageIOThread::TYPE_IOIN, mode, sync, value, &msg); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageMem &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) { + if (nre::Thread::current()->utcb() == own_utcb) return false; + // Mem reads are always sync + if (msg.read) sync = MessageIOThread::SYNC_SYNC; + MessageMem *ptr; + if (sync == MessageIOThread::SYNC_ASYNC) { + assert(!msg.read); + // We need to save the value pointed to by msg.ptr! 
+ unsigned *val = new unsigned; + *val = *msg.ptr; + ptr = new MessageMem(msg.read, msg.phys, val); + } else { + ptr = &msg; + } + MessageIOThreadEle *enq = new MessageIOThreadEle(MessageIOThread::TYPE_MEM, mode, sync, value, ptr); + enq->vcpu = vcpu; + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} +bool IOThread::enqueue(CpuMessage &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) { + if (nre::Thread::current()->utcb() == own_utcb) return false; + if (msg.type != CpuMessage::TYPE_RDMSR && msg.type != CpuMessage::TYPE_WRMSR && msg.type != CpuMessage::TYPE_CHECK_IRQ) return false; + // These messages are always sync + sync = MessageIOThread::SYNC_SYNC; + CpuMessage *ptr; + ptr = &msg; + MessageIOThreadEle *enq = new MessageIOThreadEle(MessageIOThread::TYPE_CPU, mode, sync, value, ptr); + enq->vcpu = vcpu; + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageInput &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) { + if (nre::Thread::current()->utcb() == own_utcb) return false; + MessageInput *ptr; + if (sync == MessageIOThread::SYNC_ASYNC) { + ptr = new MessageInput(msg.device, msg.data); + } else { + ptr = &msg; + } + MessageIOThreadEle *enq = new MessageIOThreadEle(MessageIOThread::TYPE_INPUT, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageIrqLines &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) { + if (nre::Thread::current()->utcb() == own_utcb) return false; + MessageIrqLines *ptr; + if (sync == MessageIOThread::SYNC_ASYNC) { + ptr = new MessageIrqLines(msg.type, msg.line); + } else { + ptr = &msg; + } + MessageIOThreadEle *enq = new MessageIOThreadEle(MessageIOThread::TYPE_IRQLINES, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); 
+ sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageIrqNotify &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) { + if (nre::Thread::current()->utcb() == own_utcb) return false; + MessageIrqNotify *ptr; + if (sync == MessageIOThread::SYNC_ASYNC) { + ptr = new MessageIrqNotify(msg.baseirq, msg.mask); + } else { + ptr = &msg; + } + MessageIOThreadEle *enq = new MessageIOThreadEle(MessageIOThread::TYPE_IRQNOTIFY, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageIrq &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) { + if (nre::Thread::current()->utcb() == own_utcb) return false; + MessageIrq *ptr; + if (sync == MessageIOThread::SYNC_ASYNC) { + ptr = new MessageIrq(msg.type, msg.line); + } else { + ptr = &msg; + } + MessageIOThreadEle *enq = new MessageIOThreadEle(MessageIOThread::TYPE_IRQ, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageLegacy &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) { + if (nre::Thread::current()->utcb() == own_utcb) return false; + if (msg.type == MessageLegacy::INTA || msg.type == MessageLegacy::DEASS_INTR) sync = MessageIOThread::SYNC_SYNC; + MessageLegacy *ptr; + if (sync == MessageIOThread::SYNC_ASYNC) { + ptr = new MessageLegacy(msg.type, msg.value); + } else { + ptr = &msg; + } + MessageIOThreadEle *enq = new MessageIOThreadEle(MessageIOThread::TYPE_LEGACY, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageNetwork &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) { + if (nre::Thread::current()->utcb() == own_utcb) return false; + if (msg.type == MessageNetwork::QUERY_MAC) sync = 
MessageIOThread::SYNC_SYNC; + MessageNetwork *ptr; + if (sync == MessageIOThread::SYNC_ASYNC) { + ptr = new MessageNetwork(msg.type, msg.client); + memcpy(ptr, &msg, sizeof(msg)); + } else { + ptr = &msg; + } + MessageIOThreadEle *enq = new MessageIOThreadEle(MessageIOThread::TYPE_NETWORK, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessagePciConfig &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) { + if (nre::Thread::current()->utcb() == own_utcb) return false; + // Reads are sync + if (msg.type == MessagePciConfig::TYPE_READ) sync = MessageIOThread::SYNC_SYNC; + MessagePciConfig *ptr; + if (sync == MessageIOThread::SYNC_ASYNC) { + ptr = new MessagePciConfig(msg.bdf); + memcpy(ptr, &msg, sizeof(msg)); + } else { + ptr = &msg; + } + MessageIOThreadEle *enq = new MessageIOThreadEle(MessageIOThread::TYPE_PCICFG, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageHostOp &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) { + if (nre::Thread::current()->utcb() == own_utcb || msg.type != MessageHostOp::OP_VCPU_RELEASE) return false; + MessageHostOp *ptr; + if (sync == MessageIOThread::SYNC_ASYNC) { + ptr = new MessageHostOp(msg.vcpu); + memcpy(ptr, &msg, sizeof(msg)); + } else { + ptr = &msg; + } + MessageIOThreadEle *enq = new MessageIOThreadEle(MessageIOThread::TYPE_HOSTOP, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +void IOThread::worker() { + // Set own UTCB. With this we can detect when sending message ourselves. They must not be enqueued. 
+ own_utcb = nre::Thread::current()->utcb(); + + MessageIOThreadEle *msg; + MessageIOThread::Sync sync; + MessageIOThread::Type type; + while (1) { + _block.down(); + + { + nre::ScopedLock lock(&_lock); + assert(_queue.length() > 0); +#ifdef IOTHREAD_DEBUG + if (_queue.length() > maxqueue) maxqueue = _queue.length(); +#endif + + auto it = _queue.begin(); + msg = &*it; + _queue.remove(msg); + sync = msg->sync; + type = msg->type; + } + + // Send message on appropriate bus + switch (msg->type) { + case MessageIOThread::TYPE_DISK: + { + MessageDisk *msg2 = reinterpret_cast(msg->ptr); + _mb->bus_disk.send_direct(*msg2, msg->mode, msg->value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_DISKCOMMIT: + { + MessageDiskCommit *msg2 = reinterpret_cast(msg->ptr); + _mb->bus_diskcommit.send_direct(*msg2, msg->mode, msg->value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_TIME: + { + MessageTime *msg2 = reinterpret_cast(msg->ptr); + _mb->bus_time.send_direct(*msg2, msg->mode, msg->value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_TIMER: + { + MessageTimer *msg2 = reinterpret_cast(msg->ptr); + _mb->bus_timer.send_direct(*msg2, msg->mode, msg->value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_TIMEOUT: + { + MessageTimeout *msg2 = reinterpret_cast(msg->ptr); + _mb->bus_timeout.send_direct(*msg2, msg->mode, msg->value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_IOOUT: + { + MessageIOOut *msg2 = reinterpret_cast(msg->ptr); + _mb->bus_ioout.send_direct(*msg2, msg->mode, msg->value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_IOIN: + { + MessageIOIn *msg2 = reinterpret_cast(msg->ptr); + _mb->bus_ioin.send_direct(*msg2, msg->mode, msg->value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_MEM: + { + MessageMem *msg2 = reinterpret_cast(msg->ptr); + if (msg->vcpu) + msg->vcpu->mem.send_direct(*msg2, msg->mode, msg->value); + else + _mb->bus_mem.send_direct(*msg2, msg->mode, 
msg->value); + // Special case: delete saved value + if (msg->sync == MessageIOThread::SYNC_ASYNC) delete msg2->ptr; + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_CPU: + { + CpuMessage *msg2 = reinterpret_cast(msg->ptr); + if (msg->vcpu) + msg->vcpu->executor.send_direct(*msg2, msg->mode, msg->value); + else + Logging::panic("TYPE_CPU needs a vcpu pointer!\n"); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_INPUT: + { + MessageInput *msg2 = reinterpret_cast(msg->ptr); + _mb->bus_input.send_direct(*msg2, msg->mode, msg->value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_IRQLINES: + { + MessageIrqLines *msg2 = reinterpret_cast(msg->ptr); + _mb->bus_irqlines.send_direct(*msg2, msg->mode, msg->value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_IRQNOTIFY: + { + MessageIrqNotify *msg2 = reinterpret_cast(msg->ptr); + _mb->bus_irqnotify.send_direct(*msg2, msg->mode, msg->value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_IRQ: + { + MessageIrq *msg2 = reinterpret_cast(msg->ptr); + _mb->bus_hostirq.send_direct(*msg2, msg->mode, msg->value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_LEGACY: + { + MessageLegacy *msg2 = reinterpret_cast(msg->ptr); + _mb->bus_legacy.send_direct(*msg2, msg->mode, msg->value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_NETWORK: + { + MessageNetwork *msg2 = reinterpret_cast(msg->ptr); + _mb->bus_network.send_direct(*msg2, msg->mode, msg->value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_PCICFG: + { + MessagePciConfig *msg2 = reinterpret_cast(msg->ptr); + _mb->bus_pcicfg.send_direct(*msg2, msg->mode, msg->value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_HOSTOP: + { + MessageHostOp *msg2 = reinterpret_cast(msg->ptr); + _mb->bus_hostop.send_direct(*msg2, msg->mode, msg->value); + sync_msg(msg); + } + break; + + default: + Logging::panic("Cannot handle type %x %x (size is %lx)!\n", type, sync, _queue.length()); + 
} + if (sync == MessageIOThread::SYNC_ASYNC) { + delete msg; + } + } +} diff --git a/nre/src/IOThread.h b/nre/src/IOThread.h new file mode 100644 index 00000000..3b8a61b2 --- /dev/null +++ b/nre/src/IOThread.h @@ -0,0 +1,98 @@ +/** + * I/O Thread + * + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * + * This file is part of Seoul. + * + * Seoul is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Seoul is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details. + */ + +#include +#include +#include +#include + +#include +#include +#include + +class MessageIOThreadEle : public MessageIOThread, public nre::SListItem { +public: + MessageIOThreadEle(Type _type, Mode _mode, Sync _sync, void *_ptr) : MessageIOThread(_type, _mode, _sync, _ptr) {} + MessageIOThreadEle(Type _type, Mode _mode, Sync _sync, unsigned *_value, void *_ptr) : MessageIOThread(_type, _mode, _sync, _value, _ptr) {} +}; + +class Notify : public nre::SListItem { +public: + nre::Utcb *utcb; + nre::UserSm *sem; +}; + +class IOThread : public StaticReceiver, public nre::SListItem { +private: + nre::UserSm _lock; + nre::UserSm _block; + bool blocking; + nre::SList _queue; + Motherboard *_mb; + + nre::SList _notify; + nre::Utcb *own_utcb; + +public: + bool enq(MessageIOThreadEle *msg); + void syncify_message(MessageIOThreadEle *msg); + template + void sync_message(MessageIOThreadEle *msg, MessageIOThread::Sync sync); + + void stats(); + + void reset(); + + bool enqueue(MessageDisk &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageDiskCommit &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu 
*vcpu); + bool enqueue(MessageTime &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageTimer &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageTimeout &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageIOOut &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageIOIn &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageMem &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(CpuMessage &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageInput &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageIrqLines &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageIrqNotify &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageIrq &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageLegacy &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageNetwork &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessagePciConfig &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageHostOp &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + + void worker(); + nre::UserSm *get_notify_sem(nre::Utcb *utcb); + + IOThread(Motherboard *mb) : _lock(1), _block(0), blocking(false), _queue(), _mb(mb), _notify() { + 
mb->bus_disk.set_iothread_enqueue(this, enqueue_static); + mb->bus_diskcommit.set_iothread_enqueue(this, enqueue_static); + mb->bus_time.set_iothread_enqueue(this, enqueue_static); + mb->bus_timer.set_iothread_enqueue(this, enqueue_static); + mb->bus_timeout.set_iothread_enqueue(this, enqueue_static); + mb->bus_ioout.set_iothread_enqueue(this, enqueue_static); + mb->bus_ioin.set_iothread_enqueue(this, enqueue_static); + mb->bus_mem.set_iothread_enqueue(this, enqueue_static); + mb->bus_input.set_iothread_enqueue(this, enqueue_static); + mb->bus_irqlines.set_iothread_enqueue(this, enqueue_static); + mb->bus_irqnotify.set_iothread_enqueue(this, enqueue_static); + mb->bus_hostirq.set_iothread_enqueue(this, enqueue_static); + mb->bus_legacy.set_iothread_enqueue(this, enqueue_static); + mb->bus_network.set_iothread_enqueue(this, enqueue_static); + mb->bus_pcicfg.set_iothread_enqueue(this, enqueue_static); + } +}; diff --git a/nre/src/StorageDevice.h b/nre/src/StorageDevice.h index fa248c8a..d25f500a 100644 --- a/nre/src/StorageDevice.h +++ b/nre/src/StorageDevice.h @@ -2,6 +2,8 @@ * Copyright (C) 2012, Nils Asmussen * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * * This file is part of NRE (NOVA runtime environment). 
* * NRE is free software: you can redistribute it and/or modify @@ -34,7 +36,7 @@ class StorageDevice { nre::Reference gt = nre::GlobalThread::create( thread, nre::CPU::current().log_id(), buffer); gt->set_tls(nre::Thread::TLS_PARAM, this); - gt->start(); + gt->start(nre::Qpd(2, 10000)); } MessageDisk::Status get_params(DiskParameter ¶ms) { @@ -78,7 +80,6 @@ class StorageDevice { nre::Storage::Packet *pk = sd->_sess.consumer().get(); // the status isn't used anyway { - nre::ScopedLock guard(&globalsm); MessageDiskCommit msg(sd->_no, pk->tag, MessageDisk::DISK_OK); sd->_bus.send(msg); } diff --git a/nre/src/Timeouts.cc b/nre/src/Timeouts.cc index c6659a83..5d4bfaa6 100644 --- a/nre/src/Timeouts.cc +++ b/nre/src/Timeouts.cc @@ -3,6 +3,8 @@ * Copyright (C) 2007-2009, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * * This file is part of Vancouver. * * Vancouver is free software: you can redistribute it and/or modify @@ -35,7 +37,6 @@ void Timeouts::timer_thread(void*) { } void Timeouts::trigger() { - ScopedLock guard(&globalsm); // TODO it can't be correct to not grab _sm here, because we might access stuff from // different threads here. but if we grab it here, we deadlock ourself because the devices // on the bus might call e.g. alloc(). diff --git a/nre/src/Timeouts.h b/nre/src/Timeouts.h index 3cf59177..8890fcf1 100644 --- a/nre/src/Timeouts.h +++ b/nre/src/Timeouts.h @@ -3,6 +3,8 @@ * Copyright (C) 2007-2009, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * * This file is part of Vancouver. 
* * Vancouver is free software: you can redistribute it and/or modify @@ -26,20 +28,18 @@ #include -extern nre::UserSm globalsm; - class Timeouts { enum { NO_TIMEOUT = ~0ULL }; public: - Timeouts(Motherboard &mb) - : _mb(mb), _sm(), _timeouts(), _timer("timer"), _last_to(NO_TIMEOUT) { + Timeouts(Motherboard &mb, cpu_t cpu) + : _mb(mb), _cpu(cpu), _sm(), _timeouts(), _timer("timer"), _last_to(NO_TIMEOUT) { nre::Reference gt = nre::GlobalThread::create( - timer_thread, nre::CPU::current().log_id(), "vmm-timeouts"); + timer_thread, _cpu, "vmm-timeouts"); gt->set_tls(nre::Thread::TLS_PARAM, this); - gt->start(); + gt->start(nre::Qpd(2, 10000)); } nre::TimerSession &session() { @@ -67,8 +67,9 @@ class Timeouts { void program(); Motherboard &_mb; + cpu_t _cpu; nre::UserSm _sm; - nre::TimeoutList<32, void> _timeouts; + nre::TimeoutList<64, void> _timeouts; nre::TimerSession _timer; timevalue_t _last_to; }; diff --git a/nre/src/VCPUBackend.cc b/nre/src/VCPUBackend.cc index e8ee126d..9b0d3e91 100644 --- a/nre/src/VCPUBackend.cc +++ b/nre/src/VCPUBackend.cc @@ -3,6 +3,8 @@ * Copyright (C) 2007-2009, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * * This file is part of Vancouver. * * Vancouver is free software: you can redistribute it and/or modify @@ -80,7 +82,6 @@ void VCPUBackend::handle_io(bool is_in, unsigned io_order, unsigned port) { io_order, port, &uf->eax, uf->mtd); skip_instruction(msg); { - ScopedLock guard(&globalsm); if(!vcpu->executor.send(msg, true)) Util::panic("nobody to execute %s at %x:%x\n", __func__, msg.cpu->cs.sel, msg.cpu->eip); } @@ -97,8 +98,6 @@ void VCPUBackend::handle_vcpu(capsel_t pid, bool skip, CpuMessage::Type type) { if(skip) skip_instruction(msg); - ScopedLock guard(&globalsm); - /** * Send the message to the VCpu. 
*/ diff --git a/nre/src/Vancouver.cc b/nre/src/Vancouver.cc index 574320b0..72e08add 100644 --- a/nre/src/Vancouver.cc +++ b/nre/src/Vancouver.cc @@ -3,6 +3,8 @@ * Copyright (C) 2007-2009, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * * This file is part of Vancouver. * * Vancouver is free software: you can redistribute it and/or modify @@ -36,6 +38,7 @@ using namespace nre; static bool initialized = false; static size_t ncpu = 1; +size_t last_cpunr = 0; static DataSpace *guest_mem = nullptr; static size_t guest_size = 0; static size_t console = 1; @@ -221,7 +224,8 @@ bool Vancouver::receive(MessageHostOp &msg) { break; case MessageHostOp::OP_VCPU_CREATE_BACKEND: { - cpu_t cpu = CPU::current().log_id(); + cpu_t cpu = (++last_cpunr + CPU::current().log_id())%CPU::count(); + Serial::get() << "Create VCPU pinned to CPU " << fmt(cpu, "%d") << "\n"; VCPUBackend *v = new VCPUBackend(&_mb, msg.vcpu, nre::Hip::get().has_svm(), cpu); msg.value = reinterpret_cast(v); msg.vcpu->executor.add(this, receive_static ); @@ -231,17 +235,17 @@ bool Vancouver::receive(MessageHostOp &msg) { case MessageHostOp::OP_VCPU_BLOCK: { VCPUBackend *v = reinterpret_cast(msg.value); - globalsm.up(); - v->sm().down(); - globalsm.down(); + bool block = !initialized; + if (block) globalsm.up(); + v->sm().zero(); + if (block) globalsm.down(); res = true; } break; case MessageHostOp::OP_VCPU_RELEASE: { VCPUBackend *v = reinterpret_cast(msg.value); - if(msg.len) - v->sm().up(); + v->sm().up(); v->vcpu().recall(); res = true; } @@ -281,10 +285,10 @@ bool Vancouver::receive(MessageTimer &msg) { COUNTER_INC("requestTO"); switch(msg.type) { case MessageTimer::TIMER_NEW: - msg.nr = _timeouts.alloc(); + msg.nr = _timeouts[CPU::current().log_id()]->alloc(); return true; case MessageTimer::TIMER_REQUEST_TIMEOUT: - _timeouts.request(msg.nr, msg.abstime); + _timeouts[CPU::current().log_id()]->request(msg.nr, 
msg.abstime); break; default: return false; @@ -294,7 +298,7 @@ bool Vancouver::receive(MessageTimer &msg) { bool Vancouver::receive(MessageTime &msg) { timevalue_t ts, wallclock; - _timeouts.time(ts, wallclock); + _timeouts[CPU::current().log_id()]->time(ts, wallclock); msg.timestamp = ts; msg.wallclocktime = wallclock; return true; @@ -304,6 +308,7 @@ bool Vancouver::receive(MessageLegacy &msg) { if(msg.type != MessageLegacy::RESET) return false; // TODO ?? + _iothread_obj->reset(); return true; } @@ -414,7 +419,6 @@ void Vancouver::network_thread(void*) { break; { - ScopedLock guard(&globalsm); MessageNetwork msg(packet, len, 0); vc->_mb.bus_network.send(msg); } @@ -451,7 +455,6 @@ void Vancouver::keyboard_thread(void*) { } } - ScopedLock guard(&globalsm); MessageInput msg(0x10000, pk.scancode | pk.flags); vc->_mb.bus_input.send(msg); } @@ -511,6 +514,11 @@ void Vancouver::create_vcpus() { } } +void Vancouver::iothread_worker(void *) { + Vancouver *vc = Thread::current()->get_tls(Thread::TLS_PARAM); + vc->iothread()->worker(); +} + int main(int argc, char **argv) { size_t fbsize = ExecEnv::PAGE_SIZE * nre::VGAStream::PAGES; for(int i = 1; i < argc; ++i) { diff --git a/nre/src/Vancouver.h b/nre/src/Vancouver.h index 8c72c61f..e3f26163 100644 --- a/nre/src/Vancouver.h +++ b/nre/src/Vancouver.h @@ -3,6 +3,8 @@ * Copyright (C) 2007-2009, Bernhard Kauer * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * * This file is part of Vancouver. 
* * Vancouver is free software: you can redistribute it and/or modify @@ -31,6 +33,7 @@ #include "StorageDevice.h" #include "VCPUBackend.h" #include "ConsoleBackend.h" +#include "IOThread.h" extern nre::UserSm globalsm; @@ -40,9 +43,22 @@ class Vancouver : public StaticReceiver { public: explicit Vancouver(const char **args, size_t count, size_t console, const nre::String &constitle, size_t fbsize) - : _clock(nre::Hip::get().freq_tsc * 1000), _mb(&_clock, nullptr), _timeouts(_mb), + : _clock(nre::Hip::get().freq_tsc * 1000), _mb(&_clock, nullptr), _iothread_obj(nullptr), _conssess("console", console, constitle), _console(this, fbsize), _netsess(), _vmmng(), _vcpus(), _stdevs() { + + _iothread_obj = new IOThread(&_mb); + + // IOThread + nre::Reference io = nre::GlobalThread::create( + iothread_worker, nre::CPU::current().log_id(), "vmm-io"); + io->set_tls(nre::Thread::TLS_PARAM, this); + io->start(nre::Qpd(2, 10000)); + + _timeouts = new Timeouts *[nre::CPU::count()]; + for (cpu_t i=0; i { nre::Reference network = nre::GlobalThread::create( network_thread, nre::CPU::current().log_id(), "vmm-network"); network->set_tls(nre::Thread::TLS_PARAM, this); - network->start(); + network->start(nre::Qpd(2, 10000)); } catch(const nre::Exception &e) { nre::Serial::get() << "Unable to connect to network: " << e.msg() << "\n"; @@ -81,8 +97,8 @@ class Vancouver : public StaticReceiver { nre::ConsoleSession &console() { return _conssess; } - Timeouts &timeouts() { - return _timeouts; + Timeouts *timeouts(cpu_t cpu) { + return _timeouts[cpu]; } uint64_t generate_mac() { static int macs = 0; @@ -90,6 +106,7 @@ class Vancouver : public StaticReceiver { return _vmmng->generate_mac().raw(); return BASE_MAC + macs++; } + IOThread *iothread() { return _iothread_obj; } void reset(); bool receive(CpuMessage &msg); @@ -106,13 +123,15 @@ class Vancouver : public StaticReceiver { private: static void network_thread(void*); static void keyboard_thread(void*); + static void 
iothread_worker(void*); static void vmmng_thread(void*); void create_devices(const char **args, size_t count); void create_vcpus(); Clock _clock; Motherboard _mb; - Timeouts _timeouts; + IOThread *_iothread_obj; + Timeouts **_timeouts; nre::ConsoleSession _conssess; ConsoleBackend _console; nre::NetworkSession *_netsess; diff --git a/test/Makefile b/test/Makefile new file mode 100644 index 00000000..cd7be9f9 --- /dev/null +++ b/test/Makefile @@ -0,0 +1,48 @@ +CC=g++ +CFLAGS=-g -O3 -std=gnu++11 -gdwarf-2 -ggdb3 +CFLAGS_NOOPT=-g -O0 -std=gnu++11 -gdwarf-2 -ggdb3 +INCLUDES=-I ../include/ -I ../unix/include/ +LIBS=-pthread + +all: pic lapic ioapic + +pic: pic.o logging.o params.o pic8259.o + $(CC) $(CFLAGS) $(INCLUDES) $(LIBS) -DPICTEST \ + main.cc pic.o \ + params.o logging.o pic8259.o -o pictest.bin + +pic.o: pic.cc pic.h + $(CC) $(CFLAGS) $(INCLUDES) $(LIBS) -DPICTEST \ + pic.cc -c +params.o: ../unix/params.cc + $(CC) $(CFLAGS) $(INCLUDES) $(LIBS) -DPICTEST \ + ../unix/params.cc -c +logging.o: logging.cc + $(CC) $(CFLAGS) $(INCLUDES) $(LIBS) -DPICTEST \ + logging.cc -c +pic8259.o: ../model/pic8259.cc + $(CC) $(CFLAGS) $(INCLUDES) $(LIBS) -DPICTEST \ + ../model/pic8259.cc -c + +runpic: pic + @./pictest.bin 2> log.txt + +ioapic: + $(CC) $(CFLAGS) $(INCLUDES) $(LIBS) -DIOAPICTEST \ + main.cc ioapic.cc \ + logging.cc ../unix/params.cc ../model/ioapic.cc -o ioapictest.bin + +runioapic: ioapic + ./ioapictest.bin 2> log.txt + +lapic: logging.o params.o lapic.cc lapic.h + $(CC) $(CFLAGS) $(INCLUDES) $(LIBS) -DLAPICTEST \ + main.cc lapic.cc \ + ../model/ioapic.cc ../model/lapic.cc ../model/vcpu.cc \ + params.o logging.o -o lapictest.bin + +runlapic: lapic + ./lapictest.bin 2> log.txt + +clean: + rm -f *.bin *.txt *.o diff --git a/test/ioapic.cc b/test/ioapic.cc new file mode 100644 index 00000000..9374fbe1 --- /dev/null +++ b/test/ioapic.cc @@ -0,0 +1,168 @@ +/** + * I/O APIC Unit Test + * + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. 
+ * + * This file is part of Seoul. + * + * Seoul is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Seoul is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details. + */ + +#include "ioapic.h" + +static LogBuffer logger; + +static Clock mb_clock(1000000); +static Motherboard mb(&mb_clock, NULL); + +void doIO(bool read, uintptr_t phys, unsigned *ptr) { + MessageMem msg(read, phys, ptr); + mb.bus_mem.send(msg); +} + +void readIO(uintptr_t phys, unsigned *ptr) { + doIO(true, phys, ptr); +} + +void writeIO(uintptr_t phys, unsigned *ptr) { + doIO(false, phys, ptr); +} + +// worker threads +pthread_t receiver, trigger1, trigger2; +unsigned irq_received_1=0, irq_received_2=0; +bool irq1free=true, irq2free=true; +unsigned irq_pending=0; + +// message handlers +static bool receive(Device *, MessageMem &msg) { + if (msg.phys == LAPIC_ADDR) { + // set bit for IRQ + logger.log(LOG_INTR, *msg.ptr); + __sync_fetch_and_or(&irq_pending, 1 << (*msg.ptr & 0xff)); + } +} +static bool receive(Device *, MessageIrqNotify &msg) { + logger.log(LOG_NOTIFY, msg.baseirq << 16 | msg.mask); + if (msg.baseirq == (IRQ1 & ~0x7) && msg.mask & (1 << (IRQ1 & 0x7))) { + // IRQ1 can be re-raised + __sync_bool_compare_and_swap(&irq1free, false, true); + } + if (msg.baseirq == (IRQ2 & ~0x7) && msg.mask & (1 << (IRQ2 & 0x7))) { + // IRQ2 can be re-raised + __sync_bool_compare_and_swap(&irq2free, false, true); + } +} + +static void * receiver_fn(void *) { + unsigned waitcount = 0; + while (true) { + if (!__sync_fetch_and_or(&irq_pending, 0)) { + if (waitcount++ > 1000000 || (irq_received_1 == IRQ_COUNT && irq_received_2 == IRQ_COUNT)) break; + continue; + } + waitcount = 0; + + unsigned vec; + if 
(irq_pending & (1 << IRQ2)) { + vec = IRQ2; + irq_received_2++; + } else if (irq_pending & (1 << IRQ1)) { + vec = IRQ1; + irq_received_1++; + } + + __sync_fetch_and_and(&irq_pending, ~(1 << vec)); + + // EOI + logger.log(LOG_EOI, vec); + writeIO(IOAPIC_ADDR | IOAPIC_EOI, &vec); + } +} + +static void * trigger_1_fn(void *) { + MessageIrqLines msg(MessageIrq::ASSERT_NOTIFY, IRQ1); + unsigned sent = 0; + while (sent++ < IRQ_COUNT) { + while (!__sync_bool_compare_and_swap(&irq1free, true, false)); + + logger.log(LOG_SEND, IRQ1); + mb.bus_irqlines.send(msg); + } + return nullptr; +} + +static void * trigger_2_fn(void *) { + MessageIrqLines msg(MessageIrq::ASSERT_NOTIFY, IRQ2); + unsigned sent = 0; + while (sent++ < IRQ_COUNT) { + while (!__sync_bool_compare_and_swap(&irq2free, true, false)); + + logger.log(LOG_SEND, IRQ2); + mb.bus_irqlines.send(msg); + } + return nullptr; +} + +int runIOAPicTest() { + // attach handlers + mb.bus_irqnotify.add(nullptr, receive); + mb.bus_mem.add(nullptr, receive); + + // create I/O APIC + mb.handle_arg("ioapic"); + + // init two IRQs + unsigned index1 = 0x10+IRQ1*2; + unsigned irq1 = 0x8000 | (IRQ1 & 0xff); + unsigned index2 = 0x10+IRQ2*2; + unsigned irq2 = 0x8000 | (IRQ2 & 0xff); + + writeIO(IOAPIC_ADDR | IOAPIC_IDX, &index1); + writeIO(IOAPIC_ADDR | IOAPIC_DATA, &irq1); + writeIO(IOAPIC_ADDR | IOAPIC_IDX, &index2); + writeIO(IOAPIC_ADDR | IOAPIC_DATA, &irq2); + + // create threads for triggering and receiving interrupts + cpu_set_t cpuset_receiver, cpuset_trigger1, cpuset_trigger2; + pthread_t self = pthread_self(); + CPU_ZERO(&cpuset_receiver); + CPU_ZERO(&cpuset_trigger1); + CPU_ZERO(&cpuset_trigger2); + CPU_SET(1, &cpuset_receiver); + CPU_SET(2, &cpuset_trigger1); + CPU_SET(3, &cpuset_trigger2); + + pthread_setaffinity_np(self, sizeof(cpu_set_t), &cpuset_trigger1); + + timevalue tsc_start = Cpu::rdtsc(); + + pthread_create(&receiver, NULL, receiver_fn, NULL); + pthread_setaffinity_np(receiver, sizeof(cpu_set_t), &cpuset_receiver); 
+ + pthread_create(&trigger1, NULL, trigger_1_fn, NULL); + pthread_setaffinity_np(trigger1, sizeof(cpu_set_t), &cpuset_trigger1); + + pthread_create(&trigger2, NULL, trigger_2_fn, NULL); + pthread_setaffinity_np(trigger2, sizeof(cpu_set_t), &cpuset_trigger2); + + pthread_join(receiver, nullptr); + pthread_join(trigger1, nullptr); + pthread_join(trigger2, nullptr); + + timevalue cycles = Cpu::rdtsc() - tsc_start; + + printf("Test completed. Received (%u, %u) interrupts (expected %u, %u).\nTest took %llu cycles.\n", + irq_received_1, irq_received_2, IRQ_COUNT, IRQ_COUNT, cycles); + + //logger.dump(); + + return 0; +} diff --git a/test/ioapic.h b/test/ioapic.h new file mode 100644 index 00000000..05dd6b79 --- /dev/null +++ b/test/ioapic.h @@ -0,0 +1,77 @@ +/** + * I/O APIC Test header file + * + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * + * This file is part of Seoul. + * + * Seoul is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Seoul is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include + +int runIOAPicTest(); + +enum { IRQ_COUNT = 10000000, IRQ1 = 20, IRQ2 = 21 }; + +enum LogItem { + LOG_SEND = 0x1, + LOG_INTR, + LOG_INTA_RX, + LOG_INTA_TX, + LOG_EOI, + LOG_NOTIFY, + LOG_DEASS +}; + +#define IOAPIC_ADDR 0xfec00000 +#define IOAPIC_IDX 0x00 +#define IOAPIC_DATA 0x10 +#define IOAPIC_EOI 0x40 +#define LAPIC_ADDR 0xfee00000 + +class LogBuffer { +private: + unsigned logbuffer[20*IRQ_COUNT]; + unsigned logindex=0; + +public: + + void log(LogItem type, unsigned value=0) { + unsigned logindex_tmp = __sync_fetch_and_add(&logindex, 1); + logbuffer[logindex_tmp] = (value << 16) | type; + } + + void dump() { + Logging::printf("\nLog output follows:\n---------------------------------------\n\n"); + for (unsigned i=0; i> 16); + } + Logging::printf("\n---------------------------------------\n\nPrinted %u events.\n", logindex); + } +}; diff --git a/test/lapic.cc b/test/lapic.cc new file mode 100644 index 00000000..0e5f7937 --- /dev/null +++ b/test/lapic.cc @@ -0,0 +1,238 @@ +/* + * LAPIC Unit Test + * + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * + * This file is part of Seoul. + * + * Seoul is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Seoul is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details. 
+ */ + +#include "lapic.h" + +static LogBuffer logger; + +static Clock mb_clock(1000000); +static Motherboard mb(&mb_clock, NULL); + +VCpu *vcpu; + +timevalue tsc_start; + +void doIO(bool read, uintptr_t phys, unsigned *ptr) { + MessageMem msg(read, phys, ptr); + vcpu->mem.send(msg); +} + +void readIO(uintptr_t phys, unsigned *ptr) { + doIO(true, phys, ptr); +} + +void writeIO(uintptr_t phys, unsigned *ptr) { + doIO(false, phys, ptr); +} + +// worker threads +pthread_t receiver, trigger, ipi; +bool irq_free = false; +bool ipi_free = false; +unsigned irq_sent_timer = 0, irq_sent_ipi = 0; +unsigned irq_received_timer = 0, irq_received_ipi = 0; +unsigned long intr = 0; +bool wakeup = false; + +static bool receive(Device *, CpuEvent &msg) { + if (msg.value == VCpu::EVENT_INTR) { + __sync_fetch_and_add(&intr, 4); + if (!(__sync_fetch_and_or(&intr, 1) & 0x1)) { + logger.log(LOG_INTR); + } + wakeup = true; + } else if (msg.value == VCpu::DEASS_INTR) { + logger.log(LOG_DEASS); + } +} + +static bool receive(Device *, MessageHostOp &msg) { + if (msg.type == MessageHostOp::OP_VCPU_CREATE_BACKEND) { + vcpu = msg.vcpu; + vcpu->bus_event.add(nullptr, receive); + msg.value = 0; + return true; + } else if (msg.type == MessageHostOp::OP_VCPU_RELEASE) { + // Release: CHECK_IRQ (unused) + } else { + Logging::printf("Hostop msg type %u\n", msg.type); + } +} + +static bool receive(Device *, MessageTimer &msg) { + if (msg.type == MessageTimer::TIMER_NEW) { + msg.nr = 0; + } else { + // Ready to fire new + logger.log(LOG_NOTIFY, TIMER_VEC); + __sync_bool_compare_and_swap(&irq_free, false, true); + } + return true; +} + +static void * receiver_fn(void *) { + // Run magic + unsigned long waitcount = 0; + unsigned long current; + + while (Cpu::rdtsc() < tsc_start+10000000); + + __sync_bool_compare_and_swap(&ipi_free, false, true); + + while (true) { + if (!__sync_fetch_and_and(&wakeup, 0)) { + if (irq_received_timer == IRQ_COUNT_TIMER && irq_received_ipi == IRQ_COUNT_IPI || waitcount++ > 
100000000) break; + asm volatile ("pause"); + continue; + } + waitcount = 0; + + // Double-check due to race + current = intr; + LapicEvent check(LapicEvent::CHECK_INTR); + check.value = 0; + vcpu->bus_lapic.send(check, true); + if (!check.value) { + logger.log(LOG_SKIP, check.value); + __sync_bool_compare_and_swap(&intr, current, (current + 4) & ~1ULL); + continue; + } + + // INTA + logger.log(LOG_INTA_TX); + LapicEvent msg(LapicEvent::INTA); + vcpu->bus_lapic.send(msg); + logger.log(LOG_INTA_RX, msg.value); + if (msg.value == TIMER_VEC) irq_received_timer++; + else if (msg.value == IPI_VEC) irq_received_ipi++; + else Logging::panic("Spurious IRQ! %x\n", msg.value); + + // EOI + unsigned val = 0x0; + logger.log(LOG_EOI, msg.value); + writeIO(LAPIC_BASE + 0xb0, &val); + + if (msg.value == TIMER_VEC) { + // Rearm + unsigned val = 1U; + writeIO(LAPIC_BASE + 0x380, &val); + } else if (msg.value == IPI_VEC) { + // Free IPI mutex + __sync_bool_compare_and_swap(&ipi_free, false, true); + } + } + Logging::printf("Receiver finished with %u,%u interrupts. 
(waitcount %lu)\n", irq_received_timer, irq_received_ipi, waitcount); + Logging::printf("They sent %u,%u interrupts.\n", irq_sent_timer, irq_sent_ipi); + return nullptr; +} + +static void * trigger_fn(void *) { + // Trigger timer interrupt at lapic + while (irq_sent_timer < IRQ_COUNT_TIMER) { + while (!__sync_bool_compare_and_swap(&irq_free, true, false)) asm volatile ("pause"); + + logger.log(LOG_SEND, TIMER_VEC); + + MessageTimeout msg(0, Cpu::rdtsc()); + assert(mb.bus_timeout.send(msg)); + + irq_sent_timer++; + } + Logging::printf("Timer thread exits.\n"); + return nullptr; +} + +static void * ipi_fn(void *) { + while (irq_sent_ipi < IRQ_COUNT_IPI) { + while (!__sync_bool_compare_and_swap(&ipi_free, true, false)) asm volatile ("pause"); + + logger.log(LOG_SEND, IPI_VEC); + + MessageApic msg(0x4000 | IPI_VEC, 0xff, 0); + assert(mb.bus_apic.send(msg)); + + irq_sent_ipi++; + } + Logging::printf("IPI thread exits.\n"); + return nullptr; +} + +int runLAPICTest() { + // attach handlers + mb.bus_hostop.add(nullptr, receive); + mb.bus_timer.add(nullptr, receive); + + // parse args + //mb.handle_arg("ioapic"); + mb.handle_arg("vcpu"); + mb.handle_arg("lapic"); + + // init LAPIC + //software enable, map spurious interrupt to dummy isr + unsigned val = 39 | 0x100; + writeIO(LAPIC_BASE + 0xf0, &val); + //map APIC timer to an interrupt, and by that enable it + val = TIMER_VEC; + writeIO(LAPIC_BASE + 0x320, &val); + //set up divide value to 16 + val = 0x03; + writeIO(LAPIC_BASE + 0x3e0, &val); + //reset APIC timer (set counter) + val = 1000U; + writeIO(LAPIC_BASE + 0x380, &val); + + tsc_start = Cpu::rdtsc(); + + cpu_set_t cpuset_receiver, cpuset_trigger, cpuset_ipi; + pthread_t self = pthread_self(); + CPU_ZERO(&cpuset_receiver); + CPU_ZERO(&cpuset_trigger); + CPU_ZERO(&cpuset_ipi); + CPU_SET(1, &cpuset_receiver); + CPU_SET(2, &cpuset_trigger); + CPU_SET(3, &cpuset_ipi); + + pthread_setaffinity_np(self, sizeof(cpu_set_t), &cpuset_trigger); + + pthread_create(&trigger, NULL, 
trigger_fn, NULL); + pthread_setaffinity_np(trigger, sizeof(cpu_set_t), &cpuset_trigger); + + pthread_create(&ipi, NULL, ipi_fn, NULL); + pthread_setaffinity_np(ipi, sizeof(cpu_set_t), &cpuset_ipi); + + pthread_create(&receiver, NULL, receiver_fn, NULL); + pthread_setaffinity_np(receiver, sizeof(cpu_set_t), &cpuset_receiver); + + //pthread_join(trigger, nullptr); + //pthread_join(ipi, nullptr); + pthread_join(receiver, nullptr); + + timevalue cycles = Cpu::rdtsc() - tsc_start; + + printf("Test completed. Received (%u, %u) interrupts (expected %u, %u).\nTest took %llu cycles.\n", + irq_received_timer, irq_received_ipi, IRQ_COUNT_TIMER, IRQ_COUNT_IPI, cycles); + + if (irq_received_timer != irq_sent_timer || irq_received_ipi != irq_sent_ipi || irq_received_timer != IRQ_COUNT_TIMER || irq_received_ipi != IRQ_COUNT_IPI) { + logger.dump(); + printf("Error. Log dumped, going to spin...\n"); + for (;;); + } + + //logger.dump(); + + return 0; +} diff --git a/test/lapic.h b/test/lapic.h new file mode 100644 index 00000000..a6d56914 --- /dev/null +++ b/test/lapic.h @@ -0,0 +1,85 @@ +/** + * LAPIC Test header file + * + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * + * This file is part of Seoul. + * + * Seoul is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Seoul is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details. 
+ */ + +#include +#include + +#include +#include +#include +#include + +int runLAPICTest(); + +#define LAPIC_BASE 0xfee00000 + +enum { IRQ_COUNT_TIMER = 500000, IRQ_COUNT_IPI = 500000, TIMER_VEC = 32, IPI_VEC = 0xa7 }; + +enum LogItem { + LOG_INIT = 0x0, + LOG_SEND, + LOG_INTR, + LOG_INTA_RX, + LOG_INTA_TX, + LOG_EOI, + LOG_NOTIFY, + LOG_DEASS, + LOG_IGNORE, + LOG_SKIP +}; + +class LogBuffer { +private: + unsigned long logbuffer[60*(IRQ_COUNT_TIMER+IRQ_COUNT_IPI)]; + unsigned logindex=0; + +public: + + void log(LogItem type, unsigned value=0) { + unsigned logindex_tmp = __sync_fetch_and_add(&logindex, 2); + if (logindex_tmp >= sizeof(logbuffer)-2) { return; } + logbuffer[logindex_tmp] = Cpu::rdtsc(); + logbuffer[logindex_tmp+1] = (pthread_self() << 32) | (value << 16) | type; + } + + void dump() { + Logging::printf("\nLog output follows:\n---------------------------------------\n\n"); + for (unsigned i=0; i>32), + event, + (logbuffer[i+1] >> 16)& 0xffff + ); + } + Logging::printf("\n---------------------------------------\n\nPrinted %u events.\n", logindex/2); + } +}; diff --git a/test/logging.cc b/test/logging.cc new file mode 100644 index 00000000..1e97b79f --- /dev/null +++ b/test/logging.cc @@ -0,0 +1,50 @@ +/** + * Logging stubs + * + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * + * This file is part of Seoul. + * + * Seoul is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Seoul is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details. + */ + +#include +#include + +#include +#include +#include +#include + +void Logging::panic(const char *format, ...) 
+{ + va_list ap; + va_start(ap, format); + + Logging::vprintf(format, ap); + Logging::printf("\n"); + + va_end(ap); + abort(); +} + +void Logging::printf(const char *format, ...) +{ + va_list ap; + va_start(ap, format); + Logging::vprintf(format, ap); + va_end(ap); +} + + +void Logging::vprintf(const char *format, va_list &ap) +{ + ::vfprintf(stderr, format, ap); +} diff --git a/test/main.cc b/test/main.cc new file mode 100644 index 00000000..20ba583f --- /dev/null +++ b/test/main.cc @@ -0,0 +1,59 @@ +/** + * Synthetic Testing Environment + * + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * + * This file is part of Seoul. + * + * Seoul is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Seoul is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details. + */ + +#include + +#ifdef PICTEST +#include "pic.h" +#endif + +#ifdef IOAPICTEST +#include "ioapic.h" +#endif + +#ifdef LAPICTEST +#include "lapic.h" +#endif + +#ifdef SATATEST +#include "sata.h" +#endif + +int main(int argc, char **argv) { + std::cout << "Hello, this is Seoulcheck." << std::endl; + +#ifdef PICTEST + std::cout << "Running PIC test." << std::endl; + runPicTest(); +#endif + +#ifdef IOAPICTEST + std::cout << "Running I/O APIC test." << std::endl; + runIOAPicTest(); +#endif + +#ifdef LAPICTEST + std::cout << "Running LAPIC test." << std::endl; + runLAPICTest(); +#endif + +#ifdef SATATEST + std::cout << "Running SATA test." 
<< std::endl; + runSATATest(); +#endif + +} diff --git a/test/pic.cc b/test/pic.cc new file mode 100644 index 00000000..953e8058 --- /dev/null +++ b/test/pic.cc @@ -0,0 +1,196 @@ +/** + * PIC Unit Test + * + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * + * This file is part of Seoul. + * + * Seoul is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Seoul is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details. + */ + +#include "pic.h" + +static LogBuffer logger; + +static Clock mb_clock(1000000); +static Motherboard mb(&mb_clock, NULL); + +timevalue tsc_start; + +unsigned char IRQS[2] = { 3, 12 }; + +void outb(unsigned short port, unsigned short value) { + MessageIOOut msg(MessageIOOut::TYPE_OUTB, port, value); + mb.bus_ioout.send(msg); +} +unsigned short inb(unsigned short port) { + MessageIOIn msg(MessageIOIn::TYPE_INB, port); + mb.bus_ioin.send(msg); + return msg.value; +} +unsigned char _get_irr() { + outb(0x20, 0x0a); + return inb(0x20); +} + +// worker threads +pthread_t receiver, trigger1, trigger2; +unsigned irq_received_1=0, irq_received_2=0; +bool irq_1_free=true, irq_2_free=true; +unsigned long intr = 0; + +// message handlers +static bool receive(Device *, MessageLegacy &msg) { + if (msg.type == MessageLegacy::INTR) { + __sync_fetch_and_add(&intr, 4); + if (!(__sync_fetch_and_or(&intr, 1) & 0x1)) { + logger.log(LOG_INTR); + } + } else if (msg.type == MessageLegacy::DEASS_INTR) { + logger.log(LOG_DEASS, msg.value); + } +} + +static bool receive(Device *, MessageIrqNotify &msg) { + logger.log(LOG_NOTIFY, msg.baseirq << 8 | msg.mask); + if (msg.baseirq == (IRQS[0] & 0x8) && msg.mask & (1 << (IRQS[0] & 0x7))) { + // First IRQ 
can be re-raised + __sync_bool_compare_and_swap(&irq_1_free, false, true); + } + else if (msg.baseirq == (IRQS[1] & 0x8) && msg.mask & (1 << (IRQS[1] & 0x7))) { + // Second IRQ can be re-raised + __sync_bool_compare_and_swap(&irq_2_free, false, true); + } + else Logging::panic("w00t %x:%x\n", msg.baseirq, msg.mask); +} + +static void * receiver_fn(void *) { + while (Cpu::rdtsc() < tsc_start+10000000); + logger.log(LOG_INIT); + sleep(1); + unsigned long waitcount = 0; + unsigned long current; + while (true) { + if (!(__sync_fetch_and_or(&intr, 1) & 0x1)) { + if (waitcount++ > 1000000000 || (irq_received_1 == IRQ_COUNT && irq_received_2 == IRQ_COUNT)) break; + continue; + } + + waitcount = 0; + + // Double-check due to race + current = intr; + MessageLegacy check(MessageLegacy::CHECK_INTR); + mb.bus_legacy.send(check); + if (!(check.value & 0xff00)) { + logger.log(LOG_SKIP, check.value); + __sync_bool_compare_and_swap(&intr, current, (current + 4) & ~1ULL); + continue; + } + + logger.log(LOG_INTA_TX, check.value); + MessageLegacy inta(MessageLegacy::INTA, 0); + waitcount = 0; + mb.bus_legacy.send(inta); + logger.log(LOG_INTA_RX, inta.value); + + if (inta.value == IRQS[0]) irq_received_1++; + if (inta.value == IRQS[1]) irq_received_2++; + + if (inta.value >= 8) outb(0xa0, 0x20); + outb(0x20, 0x20); + logger.log(LOG_EOI, (intr << 8) | inta.value); + } +} + + +template +static void * trigger_fn(void *) { + while (Cpu::rdtsc() < tsc_start+10000000); + logger.log(LOG_INIT); + MessageIrqLines msg(MessageIrq::ASSERT_NOTIFY, IRQS[IRQ-1]); + unsigned sent = 0, ignored = 0; + bool * waiter = (IRQ == 1) ? 
&irq_1_free : &irq_2_free; + while (sent < IRQ_COUNT) { + while (!__sync_bool_compare_and_swap(waiter, true, false)) asm volatile ("pause" : : : "memory"); + asm volatile ("":::"memory"); + + logger.log(LOG_SEND, IRQS[IRQ-1]); + if (mb.bus_irqlines.send(msg)) { + sent++; + ignored = 0; + } else { + assert(false); + logger.log(LOG_IGNORE, IRQS[IRQ-1]); + *waiter = true; + if (ignored++ >= 1000) return nullptr; + } + } + return nullptr; +} + +int runPicTest() { + // attach handlers + mb.bus_irqnotify.add(nullptr, receive); + mb.bus_legacy.add(nullptr, receive); + + tsc_start = Cpu::rdtsc(); + + // create PIC + mb.handle_arg("pic:0x20,,0x4d0"); + mb.handle_arg("pic:0xa0,2,0x4d1"); + + // init PICs (sequence according to http://wiki.osdev.org/PIC) + outb(0x20, 0x10+0x01); + outb(0xa0, 0x10+0x01); + outb(0x21, 0); + outb(0xa1, 8); + outb(0x21, 4); + outb(0xa1, 2); + + outb(0x21, 0x01); + outb(0xa1, 0x01); + + // create threads for triggering and receiving interrupts + cpu_set_t cpuset_receiver, cpuset_trigger1, cpuset_trigger2; + pthread_t self = pthread_self(); + CPU_ZERO(&cpuset_receiver); + CPU_ZERO(&cpuset_trigger1); + CPU_ZERO(&cpuset_trigger2); + CPU_SET(1, &cpuset_receiver); + CPU_SET(2, &cpuset_trigger1); + CPU_SET(3, &cpuset_trigger2); + + pthread_setaffinity_np(self, sizeof(cpu_set_t), &cpuset_trigger1); + + timevalue tsc_start = Cpu::rdtsc(); + + pthread_create(&trigger1, NULL, trigger_fn<1>, NULL); + pthread_setaffinity_np(trigger1, sizeof(cpu_set_t), &cpuset_trigger1); + + pthread_create(&trigger2, NULL, trigger_fn<2>, NULL); + pthread_setaffinity_np(trigger2, sizeof(cpu_set_t), &cpuset_trigger2); + + pthread_create(&receiver, NULL, receiver_fn, NULL); + pthread_setaffinity_np(receiver, sizeof(cpu_set_t), &cpuset_receiver); + + pthread_join(receiver, nullptr); + pthread_join(trigger1, nullptr); + pthread_join(trigger2, nullptr); + + timevalue cycles = Cpu::rdtsc() - tsc_start; + + printf("Test completed. 
Received (%u, %u) interrupts (expected %u, %u).\nTest took %llu cycles.\n", + irq_received_1, irq_received_2, IRQ_COUNT, IRQ_COUNT, cycles); + + //logger.dump(); + + return 0; +} diff --git a/test/pic.h b/test/pic.h new file mode 100644 index 00000000..f89c0821 --- /dev/null +++ b/test/pic.h @@ -0,0 +1,83 @@ +/** + * PIC Test header file + * + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * + * This file is part of Seoul. + * + * Seoul is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Seoul is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details. + */ + +#include +#include + +#include +#include +#include +#include + +int runPicTest(); + +enum { IRQ_COUNT = 500000 }; + +enum LogItem { + LOG_INIT = 0x0, + LOG_SEND, + LOG_INTR, + LOG_INTA_RX, + LOG_INTA_TX, + LOG_EOI, + LOG_NOTIFY, + LOG_DEASS, + LOG_IGNORE, + LOG_SKIP +}; + +class LogBuffer { +private: + unsigned long logbuffer[40*IRQ_COUNT]; + unsigned logindex=0; + +public: + + void log(LogItem type, unsigned value=0) { + unsigned logindex_tmp = __sync_fetch_and_add(&logindex, 2); + if (logindex_tmp >= sizeof(logbuffer)) return; + logbuffer[logindex_tmp] = Cpu::rdtsc(); + logbuffer[logindex_tmp+1] = (pthread_self() << 32) | (value << 16) | type; + } + + void dump() { + Logging::printf("\nLog output follows:\n---------------------------------------\n\n"); + for (unsigned i=0; i>32), + event, + (logbuffer[i+1] >> 16)& 0xffff + ); + } + Logging::printf("\n---------------------------------------\n\nPrinted %u events.\n", logindex/2); + } +}; diff --git a/unix/SConstruct b/unix/SConstruct index 3da784e8..5e64e833 100644 --- a/unix/SConstruct +++ b/unix/SConstruct @@ -123,9 +123,11 @@ sources 
= Glob('*.cc') + [ # Unix frontend '../model/pmtimer.cc', '../model/vcpu.cc', '../model/vbios.cc', + '../model/acpicontroller.cc', '../model/lapic.cc', '../model/msi.cc', '../host/hostkeyboard.cc', + '../host/migration.cc' ] # TODO not yet ported if target_arch == 'x86_32': diff --git a/unix/include/seoul/unix.h b/unix/include/seoul/unix.h index 29c57482..44c7d649 100644 --- a/unix/include/seoul/unix.h +++ b/unix/include/seoul/unix.h @@ -4,6 +4,8 @@ * Copyright (C) 2012, Julian Stecklina * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * * This file is part of Seoul. * * Seoul is free software: you can redistribute it and/or modify it @@ -24,4 +26,34 @@ // everything else. extern pthread_mutex_t irq_mtx; +static unsigned long long int rdtsc(void) +{ + unsigned long long tsc; + asm volatile ("rdtsc" : "=A" (tsc)); + return tsc; +} + +static unsigned get_tsc_frequency() +{ + struct timezone tz; + memset(&tz, 0, sizeof(tz)); + + struct timeval start, stop; + unsigned long cycles[2], ms, hz; + + cycles[0] = rdtsc(); + gettimeofday(&start, &tz); + + usleep(250000); + + cycles[1] = rdtsc(); + gettimeofday(&stop, &tz); + + ms = ((stop.tv_sec - start.tv_sec)*1000000) + (stop.tv_usec - start.tv_usec); + + hz = (cycles[1]-cycles[0]) / ms * 1000000; + + return hz; +} + // EOF diff --git a/unix/iothread.cc b/unix/iothread.cc new file mode 100644 index 00000000..1d1a1673 --- /dev/null +++ b/unix/iothread.cc @@ -0,0 +1,477 @@ +/** + * I/O Thread + * + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * + * This file is part of Seoul. + * + * Seoul is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Seoul is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details. + */ + +#include "iothread.h" + +void IOThread::init() { + for(VCpu *vcpu = _mb->last_vcpu; vcpu; vcpu = vcpu->get_last()) { + vcpu->mem.set_iothread_enqueue(this, enqueue_static, vcpu); + vcpu->executor.set_iothread_enqueue(this, enqueue_static, vcpu); + } +} + +sem_t *IOThread::get_notify_sem(pthread_t tid) { + for (auto it = _notify.begin(); it != _notify.end(); it++) { + if (it->tid == pthread_self()) return &it->sem; + } + Notify *new_notify = new Notify; + new_notify->tid = pthread_self(); + sem_init(&new_notify->sem, 0, 0); + _notify.push_back(*new_notify); + return &new_notify->sem; +} + +template +static void sync_msg(MessageIOThread &iomsg) { + // We have to keep the message when it is synchronous. The receiver will delete it. 
+ if (iomsg.sync == MessageIOThread::SYNC_SYNC) { + // Wake enqueuer + assert(iomsg.sem != nullptr); + sem_post(reinterpret_cast(iomsg.sem)); + } else { + delete (M*) iomsg.ptr; + } +} + +void IOThread::syncify_message(MessageIOThread &msg) { + if (msg.sync == MessageIOThread::SYNC_SYNC) { + msg.sem = this->get_notify_sem(pthread_self()); + assert(msg.sem != nullptr); + } +} + +template +void IOThread::sync_message(MessageIOThread &msg, MessageIOThread::Sync sync) { + if (sync == MessageIOThread::SYNC_SYNC) { + // Wait for signal from worker + sem_wait(reinterpret_cast(msg.sem)); + } +} + +bool IOThread::enq(MessageIOThread &msg) { + pthread_mutex_lock(&_lock); + _queue->push(msg); + sem_post(&_block); + pthread_mutex_unlock(&_lock); + return true; +} + +bool IOThread::enqueue(MessageDisk &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) { + if (pthread_self() == own_tid) return false; + // Disk is always sync because of error check + sync = MessageIOThread::SYNC_SYNC; + MessageIOThread enq(MessageIOThread::TYPE_DISK, mode, sync, value, &msg); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageDiskCommit &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) { + if (pthread_self() == own_tid) return false; + MessageDiskCommit *ptr = new MessageDiskCommit(msg.disknr, msg.usertag, msg.status); + MessageIOThread enq(MessageIOThread::TYPE_DISKCOMMIT, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageTime &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) { + if (pthread_self() == own_tid) return false; + // Time must be sync + sync = MessageIOThread::SYNC_SYNC; + MessageIOThread enq(MessageIOThread::TYPE_TIME, mode, sync, value, &msg); + syncify_message(enq); + this->enq(enq); + 
sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageTimer &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) { + if (pthread_self() == own_tid) return false; + MessageTimer *ptr; + if (msg.type == MessageTimer::TIMER_NEW) sync = MessageIOThread::SYNC_SYNC; + else Logging::panic("MessageTimer request nr %u\n", msg.nr); + if (sync == MessageIOThread::SYNC_ASYNC) { + ptr = new MessageTimer; + memcpy(ptr, &msg, sizeof(msg)); + } else { + ptr = &msg; + } + MessageIOThread enq(MessageIOThread::TYPE_TIMER, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageTimeout &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) { + if (pthread_self() == own_tid) return false; + MessageTimeout *ptr; + if (sync == MessageIOThread::SYNC_ASYNC) { + ptr = new MessageTimeout(msg.nr, msg.time); + } else { + ptr = &msg; + } + MessageIOThread enq(MessageIOThread::TYPE_TIMEOUT, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageIOOut &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) { + if (pthread_self() == own_tid) return false; + MessageIOOut *ptr; + if (sync == MessageIOThread::SYNC_ASYNC) { + ptr = new MessageIOOut(msg.type, msg.port, msg.value); + memcpy(ptr, &msg, sizeof(msg)); + } else { + ptr = &msg; + } + MessageIOThread enq(MessageIOThread::TYPE_IOOUT, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageIOIn &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) { + if (pthread_self() == own_tid) return false; + // I/O port reads are always sync + sync = MessageIOThread::SYNC_SYNC; + MessageIOThread enq(MessageIOThread::TYPE_IOIN, 
mode, sync, value, &msg); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageMem &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) { + if (pthread_self() == own_tid) return false; + // Mem reads are always sync + if (msg.read) sync = MessageIOThread::SYNC_SYNC; + MessageMem *ptr; + if (sync == MessageIOThread::SYNC_ASYNC) { + assert(!msg.read); + // We need to save the value pointed to by msg.ptr! + unsigned *val = new unsigned; + *val = *msg.ptr; + ptr = new MessageMem(msg.read, msg.phys, val); + } else { + ptr = &msg; + } + MessageIOThread enq(MessageIOThread::TYPE_MEM, mode, sync, value, ptr); + enq.vcpu = vcpu; + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(CpuMessage &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) { + if (pthread_self() == own_tid) return false; + if (msg.type != CpuMessage::TYPE_RDMSR && msg.type != CpuMessage::TYPE_WRMSR && msg.type != CpuMessage::TYPE_CHECK_IRQ) + return false; + + // These messages are always sync + sync = MessageIOThread::SYNC_SYNC; + CpuMessage *ptr; + ptr = &msg; + MessageIOThread enq(MessageIOThread::TYPE_CPU, mode, sync, value, ptr); + enq.vcpu = vcpu; + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageInput &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) { + if (pthread_self() == own_tid) return false; + MessageInput *ptr; + if (sync == MessageIOThread::SYNC_ASYNC) { + ptr = new MessageInput(msg.device, msg.data); + } else { + ptr = &msg; + } + MessageIOThread enq(MessageIOThread::TYPE_INPUT, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageIrqLines &msg, MessageIOThread::Mode mode, 
MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) { + if (pthread_self() == own_tid) return false; + MessageIrqLines *ptr; + if (sync == MessageIOThread::SYNC_ASYNC) { + ptr = new MessageIrqLines(msg.type, msg.line); + } else { + ptr = &msg; + } + MessageIOThread enq(MessageIOThread::TYPE_IRQLINES, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageIrqNotify &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) { + if (pthread_self() == own_tid) return false; + MessageIrqNotify *ptr; + if (sync == MessageIOThread::SYNC_ASYNC) { + ptr = new MessageIrqNotify(msg.baseirq, msg.mask); + } else { + ptr = &msg; + } + MessageIOThread enq(MessageIOThread::TYPE_IRQNOTIFY, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageIrq &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) { + if (pthread_self() == own_tid) return false; + MessageIrq *ptr; + if (sync == MessageIOThread::SYNC_ASYNC) { + ptr = new MessageIrq(msg.type, msg.line); + } else { + ptr = &msg; + } + MessageIOThread enq(MessageIOThread::TYPE_IRQ, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageLegacy &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) { + if (pthread_self() == own_tid) return false; + if (msg.type == MessageLegacy::INTA || msg.type == MessageLegacy::DEASS_INTR) sync = MessageIOThread::SYNC_SYNC; + MessageLegacy *ptr; + if (sync == MessageIOThread::SYNC_ASYNC) { + ptr = new MessageLegacy(msg.type, msg.value); + } else { + ptr = &msg; + } + MessageIOThread enq(MessageIOThread::TYPE_LEGACY, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} 
+ +bool IOThread::enqueue(MessageNetwork &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) { + if (pthread_self() == own_tid) return false; + if (msg.type == MessageNetwork::QUERY_MAC) sync = MessageIOThread::SYNC_SYNC; + MessageNetwork *ptr; + if (sync == MessageIOThread::SYNC_ASYNC) { + ptr = new MessageNetwork(msg.type, msg.client); + memcpy(ptr, &msg, sizeof(msg)); + } else { + ptr = &msg; + } + MessageIOThread enq(MessageIOThread::TYPE_NETWORK, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessagePciConfig &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) { + if (pthread_self() == own_tid) return false; + // Reads are sync + if (msg.type == MessagePciConfig::TYPE_READ) sync = MessageIOThread::SYNC_SYNC; + MessagePciConfig *ptr; + if (sync == MessageIOThread::SYNC_ASYNC) { + ptr = new MessagePciConfig(msg.bdf); + memcpy(ptr, &msg, sizeof(msg)); + } else { + ptr = &msg; + } + MessageIOThread enq(MessageIOThread::TYPE_PCICFG, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + +bool IOThread::enqueue(MessageHostOp &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) { + if (pthread_self() == own_tid || msg.type != MessageHostOp::OP_VCPU_RELEASE) return false; + MessageHostOp *ptr; + if (sync == MessageIOThread::SYNC_ASYNC) { + ptr = new MessageHostOp(msg.vcpu); + memcpy(ptr, &msg, sizeof(msg)); + } else { + ptr = &msg; + } + MessageIOThread enq(MessageIOThread::TYPE_HOSTOP, mode, sync, value, ptr); + syncify_message(enq); + this->enq(enq); + sync_message(enq, sync); + return true; +} + + +void IOThread::worker() { + own_tid = pthread_self(); + + while (1) { + sem_wait(&_block); + pthread_mutex_lock(&_lock); + + MessageIOThread msg = _queue->front(); + _queue->pop(); + + 
pthread_mutex_unlock(&_lock); + + // Send message on appropriate bus + switch (msg.type) { + case MessageIOThread::TYPE_DISK: + { + MessageDisk *msg2 = reinterpret_cast(msg.ptr); + _mb->bus_disk.send_direct(*msg2, msg.mode, msg.value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_DISKCOMMIT: + { + MessageDiskCommit *msg2 = reinterpret_cast(msg.ptr); + _mb->bus_diskcommit.send_direct(*msg2, msg.mode, msg.value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_TIME: + { + MessageTime *msg2 = reinterpret_cast(msg.ptr); + _mb->bus_time.send_direct(*msg2, msg.mode, msg.value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_TIMER: + { + MessageTimer *msg2 = reinterpret_cast(msg.ptr); + _mb->bus_timer.send_direct(*msg2, msg.mode, msg.value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_TIMEOUT: + { + MessageTimeout *msg2 = reinterpret_cast(msg.ptr); + _mb->bus_timeout.send_direct(*msg2, msg.mode, msg.value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_IOOUT: + { + MessageIOOut *msg2 = reinterpret_cast(msg.ptr); + _mb->bus_ioout.send_direct(*msg2, msg.mode, msg.value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_IOIN: + { + MessageIOIn *msg2 = reinterpret_cast(msg.ptr); + _mb->bus_ioin.send_direct(*msg2, msg.mode, msg.value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_MEM: + { + MessageMem *msg2 = reinterpret_cast(msg.ptr); + if (msg.vcpu) { + msg.vcpu->mem.send_direct(*msg2, msg.mode, msg.value); + } else + _mb->bus_mem.send_direct(*msg2, msg.mode, msg.value); + // Special case: delete saved value + if (msg.sync == MessageIOThread::SYNC_ASYNC) delete msg2->ptr; + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_CPU: + { + CpuMessage *msg2 = reinterpret_cast(msg.ptr); + if (msg.vcpu) + msg.vcpu->executor.send_direct(*msg2, msg.mode, msg.value); + else + Logging::panic("TYPE_CPU needs a vcpu pointer!\n"); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_INPUT: 
+ { + MessageInput *msg2 = reinterpret_cast(msg.ptr); + _mb->bus_input.send_direct(*msg2, msg.mode, msg.value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_IRQLINES: + { + MessageIrqLines *msg2 = reinterpret_cast(msg.ptr); + _mb->bus_irqlines.send_direct(*msg2, msg.mode, msg.value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_IRQNOTIFY: + { + MessageIrqNotify *msg2 = reinterpret_cast(msg.ptr); + _mb->bus_irqnotify.send_direct(*msg2, msg.mode, msg.value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_IRQ: + { + MessageIrq *msg2 = reinterpret_cast(msg.ptr); + _mb->bus_hostirq.send_direct(*msg2, msg.mode, msg.value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_LEGACY: + { + MessageLegacy *msg2 = reinterpret_cast(msg.ptr); + _mb->bus_legacy.send_direct(*msg2, msg.mode, msg.value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_NETWORK: + { + MessageNetwork *msg2 = reinterpret_cast(msg.ptr); + _mb->bus_network.send_direct(*msg2, msg.mode, msg.value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_PCICFG: + { + MessagePciConfig *msg2 = reinterpret_cast(msg.ptr); + _mb->bus_pcicfg.send_direct(*msg2, msg.mode, msg.value); + sync_msg(msg); + } + break; + case MessageIOThread::TYPE_HOSTOP: + { + MessageHostOp *msg2 = reinterpret_cast(msg.ptr); + _mb->bus_hostop.send_direct(*msg2, msg.mode, msg.value); + sync_msg(msg); + } + break; + + default: Logging::panic("Cannot handle type %x %x (size was %lx)!\n", msg.type, msg.mode, _queue->size()); + } + } +} diff --git a/unix/iothread.h b/unix/iothread.h new file mode 100644 index 00000000..9cf49837 --- /dev/null +++ b/unix/iothread.h @@ -0,0 +1,99 @@ +/** + * I/O Thread + * + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * + * This file is part of Seoul. 
+ * + * Seoul is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Seoul is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +class IOThread : public StaticReceiver { +private: + pthread_mutex_t _lock; + sem_t _block; + bool blocking; + std::queue *_queue; + Motherboard *_mb; + + struct Notify { + pthread_t tid; + sem_t sem; + }; + std::vector _notify; + + pthread_t own_tid; + +public: + bool enq(MessageIOThread &msg); + void syncify_message(MessageIOThread &msg); + template + void sync_message(MessageIOThread &msg, MessageIOThread::Sync sync); + + void init(); + + bool enqueue(MessageDisk &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageDiskCommit &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageTime &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageTimer &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageTimeout &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageIOOut &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageIOIn &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageMem &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool 
enqueue(CpuMessage &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageInput &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageIrqLines &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageIrqNotify &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageIrq &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageLegacy &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageNetwork &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessagePciConfig &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + bool enqueue(MessageHostOp &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu); + + void worker(); + sem_t *get_notify_sem(pthread_t tid); + + IOThread(Motherboard *mb) : blocking(false), _queue(nullptr), _mb(mb) { + _queue = new std::queue; + if (0 != pthread_mutex_init(&_lock, nullptr)) perror("Could not init mutex."); + if (0 != sem_init(&_block, 0, 0)) perror("Could not init sem."); + + mb->bus_disk.set_iothread_enqueue(this, enqueue_static); + mb->bus_diskcommit.set_iothread_enqueue(this, enqueue_static); + mb->bus_time.set_iothread_enqueue(this, enqueue_static); + mb->bus_timer.set_iothread_enqueue(this, enqueue_static); + mb->bus_timeout.set_iothread_enqueue(this, enqueue_static); + mb->bus_ioout.set_iothread_enqueue(this, enqueue_static); + mb->bus_ioin.set_iothread_enqueue(this, enqueue_static); + mb->bus_mem.set_iothread_enqueue(this, enqueue_static); + mb->bus_input.set_iothread_enqueue(this, enqueue_static); + mb->bus_irqlines.set_iothread_enqueue(this, 
enqueue_static); + mb->bus_irqnotify.set_iothread_enqueue(this, enqueue_static); + mb->bus_legacy.set_iothread_enqueue(this, enqueue_static); + mb->bus_network.set_iothread_enqueue(this, enqueue_static); + mb->bus_pcicfg.set_iothread_enqueue(this, enqueue_static); + mb->bus_hostop.set_iothread_enqueue(this, enqueue_static); + } +}; diff --git a/unix/main.cc b/unix/main.cc index c4cd9a3f..6322b44f 100644 --- a/unix/main.cc +++ b/unix/main.cc @@ -4,6 +4,9 @@ * Copyright (C) 2012, Julian Stecklina * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * Copyright (C) 2013 Markus Partheymueller, Intel Corporation. + * * This file is part of Seoul. * * Seoul is free software: you can redistribute it and/or modify it @@ -48,6 +51,13 @@ #include #include +#include + +#define USE_IOTHREAD + +#ifdef USE_IOTHREAD +#include "iothread.h" +#endif const char version_str[] = #include "version.inc" @@ -86,7 +96,10 @@ static const char *pc_ps2[] = { "rtl8029:,9,0x300", "ahci:0xe0800000,14", "pmtimer:0x8000", - // 1 vCPU + // 4 vCPUs + "vcpu", "halifax", "vbios", "lapic", + "vcpu", "halifax", "vbios", "lapic", + "vcpu", "halifax", "vbios", "lapic", "vcpu", "halifax", "vbios", "lapic", NULL, }; @@ -97,9 +110,11 @@ static TimeoutList<32, void> timeouts; static timevalue last_to = ~0ULL; static timer_t timer_id; - -static Clock mb_clock(1000000); // XXX Use correct frequency -static Motherboard mb(&mb_clock, NULL); +Motherboard *mb; +Clock *mb_clock; +#ifdef USE_IOTHREAD +IOThread *iothread_obj; +#endif // Multiboot module data @@ -166,6 +181,17 @@ static std::vector disks; // Used to serialize all operations (for now). 
pthread_mutex_t irq_mtx; +// Relevant to live migration + +Migration *_migrator; +Migration::RestoreModes _restore_mode = Migration::MODE_OFF; +unsigned _migration_ip; +unsigned _migration_port; + +// the memory remapping procedure should only +// remap memory in page size granularity, if set +bool _track_page_usage = false; + static void skip_instruction(CpuMessage &msg) { // advance EIP @@ -231,8 +257,23 @@ static void *vcpu_thread_fn(void *arg) while (true) { pthread_mutex_lock(&irq_mtx); + + if (_restore_mode == Migration::MODE_RECEIVE) + // This will block until everything is restored + _migrator->listen(_migration_port, &cpu_state); + else if (_restore_mode == Migration::MODE_SEND) + // This will block if the last memory resend round is reached + _migrator->save_guestregs(&cpu_state); + handle_vcpu(false, CpuMessage::TYPE_SINGLE_STEP, vcpu, &cpu_state); // Logging::printf("eip %x\n", cpu_state.eip); + + if (_restore_mode == Migration::MODE_RECEIVE) { + _restore_mode = Migration::MODE_OFF; + delete _migrator; + _migrator = NULL; + cpu_state.mtd = MTD_ALL; + } pthread_mutex_unlock(&irq_mtx); } @@ -247,6 +288,39 @@ struct Vcpu_info { static std::vector vcpu_info; +static void *migration_thread_fn(void *) +{ + _migrator = new Migration(mb); + _migrator->send(_migration_ip, _migration_port); + + delete _migrator; + _migrator = nullptr; + + return nullptr; +} + +static void start_migration_to(unsigned ip, unsigned port) +{ + _migration_ip = ip; + _migration_port = port; + _restore_mode = Migration::MODE_SEND; + + pthread_t migthread; + if (0 != pthread_create(&migthread, NULL, migration_thread_fn, NULL)) { + perror("pthread_create"); + return; + } + pthread_setname_np(migthread, "migration"); +} + +#ifdef USE_IOTHREAD +void * iothread_worker(void *) { + iothread_obj->worker(); + + return NULL; +} +#endif + static bool receive(Device *, MessageHostOp &msg) { bool res = true; @@ -316,6 +390,122 @@ static bool receive(Device *, MessageHostOp &msg) msg.mac = mac_prefix 
<< 16 | mac_host; break; } + case MessageHostOp::OP_NEXT_DIRTY_PAGE: { + /* + * What this does when it is properly implemented: + * - There is a variable "pageptr" which points + * to a page number. + * - The user emits this message host op when + * he wants a dirty page region + * - pageptr is moved incrementally until + * a dirty page region is found. + * This page region is then remapped RO + * and returned to the user as a CRD description + * - pageptr wraps around if it exceeds guest mem size. + */ +#if PORTED_TO_UNIX + const unsigned physpages = _physsize >> 12; + static unsigned long pageptr = 0; + + _track_page_usage = true; + + Crd reg = nova_lookup(Crd(pageptr, 0, DESC_MEM_ALL)); + // There will be several mappings, but we want to see the ones + // which are set to "writable by the guest" + + unsigned increment = 0; + do { + if (increment >= physpages) { + // That's it for now. Come back later. + msg.value = 0; + return true; + } + MessageMemRegion mmsg(pageptr); + if (!_mb->bus_memregion.send(mmsg, true)) { + // No one claims this region. Do not track. + pageptr = (pageptr + 1) % physpages; + ++increment; + continue; + } + if (!mmsg.actual_physmem) { + // This is no physmem. + pageptr += mmsg.count; + increment += mmsg.count; + if (pageptr > physpages) pageptr = 0; + continue; + } + reg = nova_lookup(Crd(pageptr, 0, DESC_MEM_ALL)); + if (!(reg.attr() & DESC_RIGHT_W)) { + // Not write-mapped, hence not dirty. + pageptr += 1 << reg.order(); + increment += 1 << reg.order(); + if (pageptr > physpages) pageptr = 0; + continue; + } + + break; + } while (1); + + // reg now describes a region which is guest-writable + // This means that the guest wrote to it before and it is now considered "dirty" + + // Tell the user "where" and "how many" + msg.phys = pageptr << 12; + msg.phys_len = reg.order(); + msg.value = reg.value(); + + // Make this page read-only for the guest, so it is considered "clean" now. 
+ nova_revoke(Crd((reg.base() + _physmem) >> 12, reg.order(), + DESC_RIGHT_W | DESC_TYPE_MEM), false); + pageptr += 1 << reg.order(); + if (pageptr >= physpages) pageptr = 0; + +#endif + return true; + } + break; + case MessageHostOp::OP_GET_CONFIG_STRING: { + char *cmdline = NULL; + +#if PORTED_TO_UNIX + // Retrieve the command line string length from sigma0 + MessageConsole cmsg(MessageConsole::TYPE_START, cmdline); + cmsg.read = true; + cmsg.mem = 0; + unsigned ret = Sigma0Base::console(cmsg); + if (ret) { + Logging::printf("Error retrieving the command line" + " string length from sigma0.\n"); + return false; + } + + // Retrieve the command line itself + cmdline = new char[cmsg.mem+1]; + cmsg.mem += 1; + cmsg.cmdline = cmdline; + ret = Sigma0Base::console(cmsg); + if (ret) { + Logging::printf("Error retrieving the command line string sigma0.\n"); + return false; + } +#endif + + msg.obj = cmdline; + } + break; + + case MessageHostOp::OP_MIGRATION_RETRIEVE_INIT: { + _migration_port = msg.value; + _restore_mode = Migration::MODE_RECEIVE; + _migrator = new Migration(mb); + } + break; + case MessageHostOp::OP_MIGRATION_START: { + start_migration_to(msg.value, 9000); + return true; + } + break; + default: Logging::panic("%s - unimplemented operation %#x\n", __PRETTY_FUNCTION__, msg.type); @@ -326,7 +516,7 @@ static bool receive(Device *, MessageHostOp &msg) static void timeout_trigger() { - timevalue now = mb.clock()->time(); + timevalue now = mb_clock->time(); // Force time reprogramming. Otherwise, we might not reprogram a // timer, if the timeout event reached us too early. 
@@ -337,7 +527,7 @@ static void timeout_trigger() while ((nr = timeouts.trigger(now))) { MessageTimeout msg(nr, timeouts.timeout()); timeouts.cancel(nr); - mb.bus_timeout.send(msg); + mb->bus_timeout.send(msg); } } @@ -346,7 +536,7 @@ static void timeout_request() { timevalue next_to = timeouts.timeout(); if (next_to != ~0ULL) { - unsigned long long delta = mb_clock.delta(next_to, 1000000000UL); + unsigned long long delta = mb_clock->delta(next_to, 1000000000UL); if (delta == 0) { // Timeout pending NOW. Skip programming a timeout. @@ -400,7 +590,7 @@ static bool receive(Device *, MessageTime &msg) { struct timeval tv; gettimeofday(&tv, NULL); - msg.timestamp = mb_clock.clock(MessageTime::FREQUENCY); + msg.timestamp = mb_clock->clock(MessageTime::FREQUENCY); assert(MessageTime::FREQUENCY == 1000000U); msg.wallclocktime = (uint64)tv.tv_sec * 1000000 + tv.tv_usec; @@ -430,7 +620,7 @@ static void *network_io_thread_fn(void *) MessageNetwork msg(network_pbuf, res, 0); pthread_mutex_lock(&irq_mtx); - mb.bus_network.send(msg); + mb->bus_network.send(msg); pthread_mutex_unlock(&irq_mtx); } @@ -511,7 +701,7 @@ static bool receive(Device *, MessageDisk &msg) } MessageDiskCommit cmsg(msg.disknr, msg.usertag, status); - mb.bus_diskcommit.send(cmsg); + mb->bus_diskcommit.send(cmsg); return true; } @@ -581,13 +771,27 @@ int main(int argc, char **argv) return EXIT_FAILURE; } + mb_clock = new Clock(get_tsc_frequency()); + mb = new Motherboard(mb_clock, NULL); + +#ifdef USE_IOTHREAD + iothread_obj = new IOThread(mb); + pthread_t iothread_worker_thread; + if (0 != pthread_create(&iothread_worker_thread, NULL, iothread_worker, NULL)) { + perror("create iothread_worker failed"); + return EXIT_FAILURE; + } + pthread_setname_np(iothread_worker_thread, "iothread_worker"); +#endif + + mb->bus_hostop .add(nullptr, receive); + mb->bus_timer .add(nullptr, receive); + mb->bus_time .add(nullptr, receive); - mb.bus_hostop .add(nullptr, receive); - mb.bus_timer .add(nullptr, receive); - 
mb.bus_time .add(nullptr, receive); + mb->bus_network.add(nullptr, receive); + mb->bus_disk .add(nullptr, receive); - mb.bus_network.add(nullptr, receive); - mb.bus_disk .add(nullptr, receive); + mb->bus_restore.add(&timeouts, TimeoutList<32, void>::receive_static); // Synchronization initialization if (0 != pthread_mutex_init(&irq_mtx, nullptr)) { @@ -598,14 +802,19 @@ int main(int argc, char **argv) // Create standard PC for (const char **dev = pc_ps2; *dev != NULL; dev++) { - mb.handle_arg(*dev); + mb->handle_arg(*dev); } Logging::printf("Devices and %zu virtual CPU%s started successfully.\n", vcpu_info.size(), vcpu_info.size() == 1 ? "" : "s"); +#ifdef USE_IOTHREAD + // Init I/O thread (vCPU local busses) + iothread_obj->init(); +#endif + // init VCPUs - for (VCpu *vcpu = mb.last_vcpu; vcpu; vcpu=vcpu->get_last()) { + for (VCpu *vcpu = mb->last_vcpu; vcpu; vcpu=vcpu->get_last()) { Logging::printf("Initializing virtual CPU %p.\n", vcpu); // init CPU strings @@ -627,7 +836,16 @@ int main(int argc, char **argv) Logging::printf("RESET device state\n"); MessageLegacy msg2(MessageLegacy::RESET, 0); - mb.bus_legacy.send_fifo(msg2); + mb->bus_legacy.send_fifo(msg2); + + if (_restore_mode != Migration::MODE_OFF) { + /* + * The following UNLOCK message helps the VCPU out of the lock + * it is blocked by and catches it into the recall handler. + */ + MessageLegacy msg3(MessageLegacy::UNLOCK, 0); + mb->bus_legacy.send_fifo(msg3); + } pthread_t iothread; if (tap_fd) { diff --git a/unix/ncurses.cc b/unix/ncurses.cc index 24e33105..9cbe37c9 100644 --- a/unix/ncurses.cc +++ b/unix/ncurses.cc @@ -4,6 +4,8 @@ * Copyright (C) 2013, Julian Stecklina * Economic rights: Technische Universitaet Dresden (Germany) * + * Copyright (C) 2013 Jacek Galowicz, Intel Corporation. + * * This file is part of Seoul. 
* * Seoul is free software: you can redistribute it and/or modify it @@ -147,6 +149,14 @@ class NcursesDisplay : public StaticReceiver { if (current_view < views.size() - 1) current_view ++; break; + + case KEY_BACKSPACE: { + /* Migration example start event. As soon as the user hits this event, + * the VM will be migrated to the hard coded destination host. */ + MessageHostOp msg(MessageHostOp::OP_MIGRATION_START, + /* destination ip, address: 192.168.0.1 */ 0xC0A80001ul); + mb.bus_hostop.send(msg); + } case ERR: default: break;