From 77431d3f15260edc004589ee82965697725ba798 Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Tue, 23 Jul 2013 11:21:52 +0200
Subject: [PATCH 01/35] VCPU: Added a RESUME event to get the CPU out of its
 lock.

This is needed for:

- luring the VCPU recall context into a blocking semaphore to pause its execution at the end of migration.
- unlocking the VCPU before boot to let it run into the recall context and restore it to the exact state it had on the source migration host.
---
 include/nul/message.h |  3 +++
 include/nul/vcpu.h    |  5 ++++-
 model/vcpu.cc         | 17 +++++++++++++++--
 3 files changed, 22 insertions(+), 3 deletions(-)
diff --git a/include/nul/message.h b/include/nul/message.h
index bf0a31c3..0565d222 100644
--- a/include/nul/message.h
+++ b/include/nul/message.h
@@ -6,6 +6,8 @@
  * Copyright (C) 2009, Bernhard Kauer <bk@vmmon.org>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -254,6 +256,7 @@ struct MessageLegacy
       INTR,
       DEASS_INTR,
       INTA,
+      UNLOCK,
     } type;
   unsigned value;
   MessageLegacy(Type _type, unsigned _value=0) : type(_type), value(_value) {}
diff --git a/include/nul/vcpu.h b/include/nul/vcpu.h
index 145a0184..3f27aa8c 100644
--- a/include/nul/vcpu.h
+++ b/include/nul/vcpu.h
@@ -4,6 +4,8 @@
  * Copyright (C) 2010, Bernhard Kauer <bk@vmmon.org>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -122,7 +124,8 @@ class VCpu
     EVENT_DEBUG  = 1 << 17,
     STATE_BLOCK  = 1 << 18,
     STATE_WAKEUP = 1 << 19,
-    EVENT_HOST   = 1 << 20
+    EVENT_HOST   = 1 << 20,
+    EVENT_RESUME = 1 << 21
   };
 
   unsigned long long inj_count;
diff --git a/model/vcpu.cc b/model/vcpu.cc
index c412c7ce..b6e131dd 100644
--- a/model/vcpu.cc
+++ b/model/vcpu.cc
@@ -4,6 +4,8 @@
  * Copyright (C) 2010, Bernhard Kauer <bk@vmmon.org>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -204,6 +206,12 @@ class VirtualCpu : public VCpu, public StaticReceiver<VirtualCpu>
     msg.mtr_out |= MTD_STATE | MTD_INJ;
 
     if (!old_event)  return;
+
+    if (old_event & EVENT_RESUME) {
+        Cpu::atomic_and<volatile unsigned>(&_event, ~(old_event & EVENT_RESUME));
+        cpu->actv_state = 0;
+    }
+
     if (old_event & (EVENT_DEBUG | EVENT_HOST)) {
       if (old_event & EVENT_DEBUG)
         dprintf("state %x event %8x eip %8x eax %x ebx %x edx %x esi %x\n", cpu->actv_state, old_event, cpu->eip, cpu->eax, cpu->ebx, cpu->edx, cpu->esi);
@@ -316,7 +324,7 @@ class VirtualCpu : public VCpu, public StaticReceiver<VirtualCpu>
     COUNTER_INC("EVENT");
 
     if (value & DEASS_INTR) Cpu::atomic_and<volatile unsigned>(&_event, ~EVENT_INTR);
-    if (!((~_event & value) & (EVENT_MASK | EVENT_DEBUG | EVENT_HOST))) return;
+    if (!((~_event & value) & (EVENT_MASK | EVENT_DEBUG | EVENT_HOST | EVENT_RESUME))) return;
 
     // INIT or AP RESET - go to the wait-for-sipi state
     if ((value & EVENT_MASK) == EVENT_INIT)
@@ -331,7 +339,7 @@ class VirtualCpu : public VCpu, public StaticReceiver<VirtualCpu>
        */
       if (Cpu::cmpxchg4b(&_sipi, 0, value)) return;
 
-    Cpu::atomic_or<volatile unsigned>(&_event, STATE_WAKEUP | (value & (EVENT_MASK | EVENT_DEBUG | EVENT_HOST)));
+    Cpu::atomic_or<volatile unsigned>(&_event, STATE_WAKEUP | (value & (EVENT_MASK | EVENT_DEBUG | EVENT_HOST | EVENT_RESUME)));
 
 
     MessageHostOp msg(MessageHostOp::OP_VCPU_RELEASE, _hostop_id, _event & STATE_BLOCK);
@@ -353,6 +361,11 @@ class VirtualCpu : public VCpu, public StaticReceiver<VirtualCpu>
       return true;
     }
 
+    if (msg.type == MessageLegacy::UNLOCK) {
+        got_event(EVENT_RESUME);
+        return true;
+    }
+
     // BSP receives only legacy signals if the LAPIC is disabled
     if (is_ap() || CPUID_EDX1 & (1 << 9)) return false;
 

From a768139139ac233a3d76c46329bd63be4bf2adb2 Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Tue, 23 Jul 2013 11:40:27 +0200
Subject: [PATCH 02/35] Page Tracking: Added a host op for this.

This is needed for live migration.

The general semantics are as follows:
- The caller will ask for some RW-mapped memory range
- This range will be found in the host op
- It will be remapped as read-only and reported back to the caller

This routine uses a pointer which is moved round-robin through the guest memory range.

WARNING: Assumes NOVA as underlying kernel. Was not ported to UNIX.
---
 include/nul/message.h |  1 +
 unix/main.cc          | 64 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+)

diff --git a/include/nul/message.h b/include/nul/message.h
index 0565d222..f9c1b287 100644
--- a/include/nul/message.h
+++ b/include/nul/message.h
@@ -452,6 +452,7 @@ struct MessageHostOp
       OP_VCPU_BLOCK,
       OP_VCPU_RELEASE,
       OP_WAIT_CHILD,
+      OP_NEXT_DIRTY_PAGE,
     } type;
   union {
     unsigned long value;
diff --git a/unix/main.cc b/unix/main.cc
index c4cd9a3f..388e132d 100644
--- a/unix/main.cc
+++ b/unix/main.cc
@@ -4,6 +4,8 @@
  * Copyright (C) 2012, Julian Stecklina <jsteckli@os.inf.tu-dresden.de>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
  * This file is part of Seoul.
  *
  * Seoul is free software: you can redistribute it and/or modify it
@@ -166,6 +168,12 @@ static std::vector<Disk> disks;
 // Used to serialize all operations (for now).
 pthread_mutex_t irq_mtx;
 
+// Relevant to live migration
+
+// the memory remapping procedure should only
+// remap memory in page size granularity, if set
+bool _track_page_usage = false;
+
 static void skip_instruction(CpuMessage &msg)
 {
   // advance EIP
@@ -316,6 +324,62 @@ static bool receive(Device *, MessageHostOp &msg)
       msg.mac = mac_prefix << 16 | mac_host;
       break;
     }
+    case MessageHostOp::OP_NEXT_DIRTY_PAGE: {
+        /*
+         * What this does when it is properly implemented:
+         * - There is a variable "pageptr" which points
+         *   to a page number.
+         * - The user emits this message host op when
+         *   he wants a dirty page region
+         * - pageptr is moved incrementally until
+         *   a dirty page region is found.
+         *   This page region is then remapped RO
+         *   and returned to the user as a CRD description
+         * - pageptr wraps around if it exceeds guest mem size.
+         */
+#if PORTED_TO_UNIX
+        static unsigned long pageptr = 0;
+        const unsigned physpages = _physsize >> 12;
+
+        // Setting this to true makes the map_memory_helper function
+        // remap with page size
+        _track_page_usage = true;
+
+        Crd reg = nova_lookup(Crd(pageptr, 0, DESC_MEM_ALL));
+        // There will be several mappings, but we want to see the ones
+        // which are set to "writable by the guest"
+
+        unsigned long oldptr = pageptr;
+        while (!(reg.attr() & DESC_RIGHT_W)) {
+            pageptr = (pageptr + 1) % physpages;
+            if (pageptr == oldptr) {
+                // Come back later, please.
+                msg.value = 0;
+                return true;
+        }
+
+        reg = nova_lookup(Crd(pageptr, 0, DESC_MEM_ALL));
+        }
+
+        // reg now describes a region which is guest-writable
+        // This means that the guest wrote to it before and it is considered "dirty"
+
+        // Tell the user "where" and "how many"
+        msg.phys    = pageptr << 12;
+        msg.phys_len = reg.order();
+
+        msg.value = reg.value();
+
+        // Make this page read-only for the guest, so it is considered "clean" now.
+        nova_revoke(Crd((reg.base() + _physmem) >> 12, reg.order(),
+        DESC_RIGHT_W | DESC_TYPE_MEM), false);
+        pageptr += 1 << reg.order();
+        if (pageptr >= physpages) pageptr = 0;
+#endif
+        return true;
+    }
+    break;
+
     default:
       Logging::panic("%s - unimplemented operation %#x\n",
                        __PRETTY_FUNCTION__, msg.type);

From c50d4bc5169c704c20cba266cf0ba74a80526bd9 Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Tue, 23 Jul 2013 13:45:08 +0200
Subject: [PATCH 03/35] VCPU TSC offset manipulation.

Added a CpuMessage to add arbitrary offsets to the VCPU's timestamp counter.
This is needed for live migration.
---
 include/nul/vcpu.h | 3 ++-
 model/vcpu.cc      | 6 ++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/nul/vcpu.h b/include/nul/vcpu.h
index 3f27aa8c..5591b285 100644
--- a/include/nul/vcpu.h
+++ b/include/nul/vcpu.h
@@ -38,7 +38,8 @@ struct CpuMessage {
     TYPE_WBINVD,
     TYPE_CHECK_IRQ,
     TYPE_CALC_IRQWINDOW,
-    TYPE_SINGLE_STEP
+    TYPE_SINGLE_STEP,
+    TYPE_ADD_TSC_OFF,
   } type;
   union {
     struct {
diff --git a/model/vcpu.cc b/model/vcpu.cc
index b6e131dd..ce76685f 100644
--- a/model/vcpu.cc
+++ b/model/vcpu.cc
@@ -397,6 +397,11 @@ class VirtualCpu : public VCpu, public StaticReceiver<VirtualCpu>
 
   bool receive(CpuMessage &msg) {
 
+    if (msg.type == CpuMessage::TYPE_ADD_TSC_OFF) {
+        _reset_tsc_off += msg.current_tsc_off;
+        return true;
+    }
+
     // TSC drift compensation.
     if (msg.type != CpuMessage::TYPE_CPUID_WRITE && msg.mtr_in & MTD_TSC && ~msg.mtr_out & MTD_TSC) {
       COUNTER_INC("tsc adoption");
@@ -460,6 +465,7 @@ class VirtualCpu : public VCpu, public StaticReceiver<VirtualCpu>
     case CpuMessage::TYPE_SINGLE_STEP:
     case CpuMessage::TYPE_WBINVD:
     case CpuMessage::TYPE_INVD:
+    case CpuMessage::TYPE_ADD_TSC_OFF:
     default:
       return false;
     }

From 163db90e3dd7802991be66192b7d3e269d07810c Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Tue, 23 Jul 2013 14:25:03 +0200
Subject: [PATCH 04/35] Added a restore bus to the mainboard.

Devices will be attached to this.
The migration code uses this to communicate with classes of devices.
Devices can write their state into restore messages and also read it back to restore.
---
 include/nul/message.h     | 48 +++++++++++++++++++++++++++++++++++++++
 include/nul/motherboard.h |  4 ++++
 2 files changed, 52 insertions(+)

diff --git a/include/nul/message.h b/include/nul/message.h
index f9c1b287..e01eb802 100644
--- a/include/nul/message.h
+++ b/include/nul/message.h
@@ -752,4 +752,52 @@ struct MessageNetwork
   MessageNetwork(unsigned type, unsigned client) : type(type), mac(0), client(client) { }
 };
 
+struct MessageRestore
+{
+    enum networkStrings {
+        MAGIC_STRING_DEVICE_DESC = 0x8D06F00D
+    };
+
+    enum restoreTypes {
+        RESTORE_RESTART = 0, // RESTART is sent over the restore bus for initialization
+        RESTORE_TIMEOUTLIST,
+        RESTORE_PIC,
+        RESTORE_LAPIC,
+        RESTORE_PIT,
+        RESTORE_VGA,
+        RESTORE_NIC,
+        RESTORE_ACPI,
+        RESTORE_VCPU,
+        RESTORE_LAST,
+        // This one is acutally a restore device type:
+        // vga.cc will react on this, printing messages on the guest screen.
+        VGA_DISPLAY_GUEST,
+        VGA_VIDEOMODE,
+        // This is for pass-through devices. They will un-/replug themselves
+        // out of/into the guest before/after live migration
+        PCI_PLUG,
+    };
+    unsigned long magic_string;
+    // Use these enums on devtype
+    unsigned devtype;
+    // The device will note down how many bytes of this structure it actually uses.
+    mword bytes;
+    // Two variables which every device type can use for identification
+    unsigned id1;
+    unsigned id2;
+    // write=true: Writing a device state onto disk. false: Reading back from disk
+    bool write;
+
+    // Space for saving the device state
+    char *space;
+
+    MessageRestore(unsigned _devtype, char *_space, bool _write) :
+        magic_string(MAGIC_STRING_DEVICE_DESC), devtype(_devtype),
+        bytes(0), id1(0), id2(0), write(_write), space(_space)
+    {}
+    bool magic_string_check() { return magic_string == MAGIC_STRING_DEVICE_DESC; }
+};
+
+
+
 /* EOF */
diff --git a/include/nul/motherboard.h b/include/nul/motherboard.h
index f4ce9b7b..a0084053 100644
--- a/include/nul/motherboard.h
+++ b/include/nul/motherboard.h
@@ -4,6 +4,8 @@
  * Copyright (C) 2007-2010, Bernhard Kauer <bk@vmmon.org>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -79,6 +81,8 @@ class Motherboard
   DBus<MessageTimer>        bus_timer;      ///< Request for timers
   DBus<MessageVesa>         bus_vesa;
 
+  DBus<MessageRestore>      bus_restore;
+
   VCpu *last_vcpu;
   Clock *clock() { return _clock; }
   Hip   *hip() { return _hip; }

From f6ea4b60538d2e40ee922c5a4b62920626294fb4 Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Wed, 24 Jul 2013 13:42:11 +0200
Subject: [PATCH 05/35] Added restore code to TimeoutList.

---
 include/nul/timer.h | 71 +++++++++++++++++++++++++++++++++++++++++++--
 unix/main.cc        |  2 ++
 2 files changed, 71 insertions(+), 2 deletions(-)

diff --git a/include/nul/timer.h b/include/nul/timer.h
index 0f4a821f..2bfd59c1 100644
--- a/include/nul/timer.h
+++ b/include/nul/timer.h
@@ -4,6 +4,8 @@
  * Copyright (C) 2007-2008, Bernhard Kauer <bk@vmmon.org>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -19,6 +21,10 @@
 #include "service/cpu.h"
 #include "service/math.h"
 
+#include <nul/message.h>
+#include <nul/bus.h>
+#include <nul/templates.h>
+
 
 typedef unsigned long long timevalue;
 
@@ -78,7 +84,7 @@ class Clock
  * Keeping track of the timeouts.
  */
 template <unsigned ENTRIES, typename DATA>
-class TimeoutList
+class TimeoutList : public StaticReceiver<TimeoutList<ENTRIES, DATA>>
 {
   class TimeoutEntry
   {
@@ -91,6 +97,8 @@ class TimeoutList
   };
 
   TimeoutEntry  _entries[ENTRIES];
+
+  bool _restore_processed;
 public:
   /**
    * Alloc a new timeout object.
@@ -187,5 +195,64 @@ class TimeoutList
     _entries[0]._timeout = ~0ULL;
   }
 
-  TimeoutList() { init(); }
+  TimeoutList() : _restore_processed(false) { init(); }
+
+#define REL_PTR(ptr, offset) ( \
+    reinterpret_cast<typeof(ptr)>( \
+        reinterpret_cast<mword>(ptr) - reinterpret_cast<mword>(offset)) \
+)
+#define ABS_PTR(ptr, offset) ( \
+    reinterpret_cast<typeof(ptr)>( \
+        reinterpret_cast<mword>(ptr) + reinterpret_cast<mword>(offset)) \
+)
+
+  bool receive(MessageRestore &msg)
+  {
+      const mword bytes = reinterpret_cast<mword>(&_restore_processed)
+          - reinterpret_cast<mword>(_entries);
+
+      if (msg.devtype == MessageRestore::RESTORE_RESTART) {
+          _restore_processed = false;
+          msg.bytes += bytes + sizeof(msg);
+          return false;
+      }
+
+      if (msg.devtype != MessageRestore::RESTORE_TIMEOUTLIST || _restore_processed) return false;
+
+      unsigned long long rdtsc = Cpu::rdtsc();
+
+      if (msg.write) {
+          msg.bytes = bytes;
+          memcpy(msg.space, reinterpret_cast<void*>(_entries), bytes);
+
+          // Do not mess around with timeout entries of the running guest,
+          // since we may want to let it continue after saving
+          TimeoutEntry *entries = reinterpret_cast<TimeoutEntry*>(msg.space);
+          for (unsigned i=0; i < ENTRIES; i++) {
+              entries[i]._prev = REL_PTR(entries[i]._prev, _entries);
+              entries[i]._next = REL_PTR(entries[i]._next, _entries);
+
+              if (i == 0) continue;
+
+              if (entries[i]._timeout <= rdtsc)
+                  entries[i]._timeout = 0;
+              else
+                  entries[i]._timeout -= rdtsc;
+          }
+      }
+      else {
+          memcpy(reinterpret_cast<void*>(_entries), msg.space, bytes);
+          for (unsigned i=0; i < ENTRIES; i++) {
+              _entries[i]._prev = ABS_PTR(_entries[i]._prev, _entries);
+              _entries[i]._next = ABS_PTR(_entries[i]._next, _entries);
+
+              if (i == 0) continue;
+              _entries[i]._timeout += rdtsc;
+          }
+      }
+
+      //Logging::printf("%s Timeoutlist\n", msg.write ? "Saved" : "Restored");
+      _restore_processed = true;
+      return true;
+  }
 };
diff --git a/unix/main.cc b/unix/main.cc
index 388e132d..90e05739 100644
--- a/unix/main.cc
+++ b/unix/main.cc
@@ -653,6 +653,8 @@ int main(int argc, char **argv)
   mb.bus_network.add(nullptr, receive);
   mb.bus_disk   .add(nullptr, receive);
 
+  mb.bus_restore.add(&timeouts, TimeoutList<32, void>::receive_static<MessageRestore>);
+
   // Synchronization initialization
   if (0 != pthread_mutex_init(&irq_mtx, nullptr)) {
     perror("pthread_mutex_init");

From 656c247bd73c4d9caaa6292962d21767bb1e7d1d Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Wed, 24 Jul 2013 13:43:51 +0200
Subject: [PATCH 06/35] Added restore code to LAPIC model.

---
 model/lapic.cc | 43 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 41 insertions(+), 2 deletions(-)

diff --git a/model/lapic.cc b/model/lapic.cc
index bd89ea30..c99b066a 100644
--- a/model/lapic.cc
+++ b/model/lapic.cc
@@ -4,6 +4,8 @@
  * Copyright (C) 2010, Bernhard Kauer <bk@vmmon.org>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -32,8 +34,11 @@
  */
 class Lapic : public DiscoveryHelper<Lapic>, public StaticReceiver<Lapic>
 {
+    int _regstart;
 #define VMM_REGBASE "../model/lapic.cc"
 #include "model/reg.h"
+    int _regend;
+
   enum {
     MAX_FREQ   = 200000000,
     LVT_MASK_BIT = 16,
@@ -64,6 +69,7 @@ class Lapic : public DiscoveryHelper<Lapic>, public StaticReceiver<Lapic>
   bool      _rirr[NUM_LVT];
   unsigned  _lowest_rr;
 
+  bool _restore_processed;
 
   bool sw_disabled() { return ~_SVR & 0x100; }
   bool hw_disabled() { return ~_msr & 0x800; }
@@ -738,8 +744,40 @@ class Lapic : public DiscoveryHelper<Lapic>, public StaticReceiver<Lapic>
     }
   }
 
+  bool receive(MessageRestore &msg)
+  {
+      const mword bytes = reinterpret_cast<mword>(&_restore_processed)
+          -reinterpret_cast<mword>(&_timer);
 
-  Lapic(Motherboard &mb, VCpu *vcpu, unsigned initial_apic_id, unsigned timer) : _mb(mb), _vcpu(vcpu), _initial_apic_id(initial_apic_id), _timer(timer)
+      const mword bytes2 = reinterpret_cast<mword>(&_regend) - reinterpret_cast<mword>(&_regstart);
+
+      if (msg.devtype == MessageRestore::RESTORE_RESTART) {
+          _restore_processed = false;
+          msg.bytes += bytes + bytes2 + sizeof(msg);
+          return false;
+      }
+
+      if (msg.devtype != MessageRestore::RESTORE_LAPIC || _restore_processed) return false;
+
+      if (msg.write) {
+          msg.bytes = bytes + bytes2;
+          memcpy(msg.space, reinterpret_cast<void*>(&_timer), bytes);
+          memcpy(msg.space + bytes, reinterpret_cast<void*>(&_regstart), bytes2);
+      }
+      else {
+          memcpy(reinterpret_cast<void*>(&_timer), msg.space, bytes);
+          memcpy(reinterpret_cast<void*>(&_regstart), msg.space + bytes, bytes2);
+      }
+
+      Logging::printf("%s LAPIC\n", msg.write?"Saved":"Restored");
+
+      _restore_processed = true;
+      return true;
+  }
+
+
+  Lapic(Motherboard &mb, VCpu *vcpu, unsigned initial_apic_id, unsigned timer)
+      : _mb(mb), _vcpu(vcpu), _initial_apic_id(initial_apic_id), _timer(timer), _restore_processed(false)
   {
     // find a FREQ that is not too high
     for (_timer_clock_shift=0; _timer_clock_shift < 32; _timer_clock_shift++)
@@ -762,11 +800,12 @@ class Lapic : public DiscoveryHelper<Lapic>, public StaticReceiver<Lapic>
     mb.bus_apic.add(this,     receive_static<MessageApic>);
     mb.bus_timeout.add(this,  receive_static<MessageTimeout>);
     mb.bus_discovery.add(this,discover);
+    mb.bus_restore.add(this, receive_static<MessageRestore>);
+
     vcpu->executor.add(this,  receive_static<CpuMessage>);
     vcpu->mem.add(this,       receive_static<MessageMem>);
     vcpu->memregion.add(this, receive_static<MessageMemRegion>);
     vcpu->bus_lapic.add(this, receive_static<LapicEvent>);
-
   }
 };
 

From a5a8b19175cf68da4c5a143c0eb59ed264906dcb Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Wed, 24 Jul 2013 13:46:35 +0200
Subject: [PATCH 07/35] Added restore code to PIC model.

---
 model/pic8259.cc | 41 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/model/pic8259.cc b/model/pic8259.cc
index 438bfd44..2b8adede 100644
--- a/model/pic8259.cc
+++ b/model/pic8259.cc
@@ -4,6 +4,8 @@
  * Copyright (C) 2007-2009, Bernhard Kauer <bk@vmmon.org>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -67,6 +69,8 @@ class PicDevice : public StaticReceiver<PicDevice>
   unsigned char  _elcr;
   unsigned char  _notify;
 
+  bool _restore_processed;
+
   // helper functions
   bool is_slave()                      { return (_icw[ICW4] & ICW4_BUF) ? (~_icw[ICW4] & ICW4_MS) : _virq; }
   void rotate_prios()                  { _prio_lowest = (_prio_lowest+1) & 7; }
@@ -351,11 +355,44 @@ class PicDevice : public StaticReceiver<PicDevice>
       return false;
     }
 
+  bool receive(MessageRestore &msg)
+  {
+        const mword bytes = reinterpret_cast<mword>(&_restore_processed)
+            -reinterpret_cast<mword>(&_base);
+
+        if (msg.devtype == MessageRestore::RESTORE_RESTART) {
+            _restore_processed = false;
+            msg.bytes += bytes + sizeof(msg);
+            return false;
+        }
+
+        if (msg.devtype != MessageRestore::RESTORE_PIC || _restore_processed) return false;
+
+        if (msg.write) {
+            msg.bytes = bytes;
+            msg.id1 = _base;
+            msg.id2 = _upstream_irq;
+            memcpy(msg.space, reinterpret_cast<void*>(&_base), bytes);
+
+        }
+        else {
+            if (msg.id1 != _base || msg.id2 != _upstream_irq) return false;
+
+            memcpy(reinterpret_cast<void*>(&_base), msg.space, bytes);
+        }
+
+        //Logging::printf("%s PIC (base %x, IRQ %x)\n", msg.write?"Saved":"Restored", msg.id1, msg.id2);
+        _restore_processed = true;
+        return true;
+  }
+
+
+
 
  PicDevice(DBus<MessageIrqLines> &bus_irq, DBus<MessagePic> &bus_pic, DBus<MessageLegacy> &bus_legacy, DBus<MessageIrqNotify> &bus_notify,
 	   unsigned short base, unsigned char irq, unsigned short elcr_base, unsigned char virq) :
    _bus_irq(bus_irq), _bus_pic(bus_pic), _bus_legacy(bus_legacy), _bus_notify(bus_notify),
-   _base(base), _upstream_irq(irq), _elcr_base(elcr_base), _virq(virq), _icw_mode(OCW1)
+   _base(base), _upstream_irq(irq), _elcr_base(elcr_base), _virq(virq), _icw_mode(OCW1), _restore_processed(false)
   {
     _icw[ICW1] = 0;
     reset_values();
@@ -384,8 +421,10 @@ PARAM_HANDLER(pic,
   mb.bus_ioout.   add(dev, PicDevice::receive_static<MessageIOOut>);
   mb.bus_irqlines.add(dev, PicDevice::receive_static<MessageIrqLines>);
   mb.bus_pic.     add(dev, PicDevice::receive_static<MessagePic>);
+  mb.bus_restore.add(dev, PicDevice::receive_static<MessageRestore>);
   if (!virq)
     mb.bus_legacy.add(dev, PicDevice::receive_static<MessageLegacy>);
   virq += 8;
+
 }
 

From 94df54b4cf0161d798f453124257958f9ce60a5c Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Wed, 24 Jul 2013 13:50:08 +0200
Subject: [PATCH 08/35] Added restore code to PIT model.

---
 model/pit8254.cc | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/model/pit8254.cc b/model/pit8254.cc
index 5853e77c..b67d11af 100644
--- a/model/pit8254.cc
+++ b/model/pit8254.cc
@@ -4,6 +4,8 @@
  * Copyright (C) 2007-2009, Bernhard Kauer <bk@vmmon.org>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -370,6 +372,8 @@ class PitDevice : public StaticReceiver<PitDevice>
   static const unsigned COUNTER = 3;
   PitCounter _c[COUNTER];
 
+  bool _restore_processed;
+
  public:
 
   bool  receive(MessagePit &msg)
@@ -421,9 +425,36 @@ class PitDevice : public StaticReceiver<PitDevice>
    return true;
  }
 
+ bool receive(MessageRestore &msg)
+ {
+     const mword bytes = reinterpret_cast<mword>(&_restore_processed)
+         -reinterpret_cast<mword>(&_base);
+
+     if (msg.devtype == MessageRestore::RESTORE_RESTART) {
+         _restore_processed = false;
+         msg.bytes += bytes + sizeof(msg);
+         return false;
+     }
+
+     if (msg.devtype != MessageRestore::RESTORE_PIT || _restore_processed) return false;
+
+     if (msg.write) {
+         msg.bytes = bytes;
+         memcpy(msg.space, reinterpret_cast<void*>(&_base), bytes);
+
+     }
+     else {
+         memcpy(reinterpret_cast<void*>(&_base), msg.space, bytes);
+     }
+
+     //Logging::printf("%s PIT\n", msg.write?"Saved":"Restored");
+     _restore_processed = true;
+     return true;
+ }
+
 
   PitDevice(Motherboard &mb, unsigned short base, unsigned irq, unsigned pit)
-    : _base(base), _addr(pit*COUNTER)
+    : _base(base), _addr(pit*COUNTER), _restore_processed(false)
   {
     for (unsigned i=0; i < COUNTER; i++)
       {
@@ -449,4 +480,5 @@ PARAM_HANDLER(pit,
   mb.bus_ioin.add(dev,  PitDevice::receive_static<MessageIOIn>);
   mb.bus_ioout.add(dev, PitDevice::receive_static<MessageIOOut>);
   mb.bus_pit.add(dev,   PitDevice::receive_static<MessagePit>);
+  mb.bus_restore.add(dev, PitDevice::receive_static<MessageRestore>);
 } 

From 391087dd8fb9b8ac6ace049ef90c53fbc4717112 Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Wed, 24 Jul 2013 14:19:15 +0200
Subject: [PATCH 09/35] Added restore code to VGA model.

---
 model/vga.cc | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 60 insertions(+), 1 deletion(-)

diff --git a/model/vga.cc b/model/vga.cc
index dadfe0ed..44c9d3fa 100644
--- a/model/vga.cc
+++ b/model/vga.cc
@@ -4,6 +4,8 @@
  * Copyright (C) 2007-2010, Bernhard Kauer <bk@vmmon.org>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -45,6 +47,9 @@ class Vga : public StaticReceiver<Vga>, public BiosCommon
   unsigned char  _crt_index;
   unsigned       _ebda_segment;
   unsigned       _vbe_mode;
+  mword          _last_videomode_request;
+
+  bool _restore_processed;
 
   void puts_guest(const char *msg) {
     unsigned pos = _regs.cursor_pos - TEXT_OFFSET;
@@ -174,6 +179,7 @@ class Vga : public StaticReceiver<Vga>, public BiosCommon
       case 0x4f02: // set vbemode
 	{
 	  ConsoleModeInfo info;
+      _last_videomode_request = cpu->ebx;
 	  unsigned index = get_vesa_mode(cpu->ebx & 0x0fff, &info);
 	  if (index != ~0u && info.attr & 1)
 	    {
@@ -349,6 +355,12 @@ class Vga : public StaticReceiver<Vga>, public BiosCommon
     return true;
   }
 
+  void set_videomode(mword videomode)
+  {
+      ConsoleModeInfo info;
+      _regs.mode = get_vesa_mode(videomode & 0x0fff, &info);
+  }
+
 public:
 
   bool  receive(MessageBios &msg)
@@ -522,9 +534,55 @@ class Vga : public StaticReceiver<Vga>, public BiosCommon
     return true;
   }
 
+  bool receive(MessageRestore &msg)
+  {
+      const mword bytes = reinterpret_cast<mword>(&_restore_processed)
+          -reinterpret_cast<mword>(&_view);
+
+      if (msg.devtype == MessageRestore::RESTORE_RESTART) {
+          _restore_processed = false;
+          msg.bytes += bytes + sizeof(msg);
+          return false;
+      }
+
+      if (msg.devtype == MessageRestore::VGA_DISPLAY_GUEST) {
+          if (msg.write) memset(_framebuffer_ptr, 0, _framebuffer_size);
+          puts_guest(msg.space);
+          return true;
+      }
+
+      if (msg.devtype == MessageRestore::VGA_VIDEOMODE) {
+          if (msg.write) {
+              set_videomode(msg.bytes);
+              MessageConsole cmsg(MessageConsole::TYPE_SWITCH_VIEW);
+              cmsg.view = _view;
+              _mb.bus_console.send(cmsg);
+          }
+          else
+              msg.bytes = _last_videomode_request;
+          return true;
+      }
+
+      if (msg.devtype != MessageRestore::RESTORE_VGA || _restore_processed) return false;
+
+      if (msg.write) {
+          msg.bytes = bytes;
+          memcpy(msg.space, reinterpret_cast<void*>(&_view), bytes);
+
+      }
+      else {
+          memcpy(reinterpret_cast<void*>(&_view), msg.space, bytes);
+          set_videomode(_last_videomode_request);
+      }
+
+      //Logging::printf("%s VGA\n", msg.write?"Saved":"Restored");
+      _restore_processed = true;
+      return true;
+  }
+
 
   Vga(Motherboard &mb, unsigned short iobase, char *framebuffer_ptr, uintptr_t framebuffer_phys, size_t framebuffer_size)
-    : BiosCommon(mb), _iobase(iobase), _framebuffer_ptr(framebuffer_ptr), _framebuffer_phys(framebuffer_phys), _framebuffer_size(framebuffer_size), _crt_index(0), _ebda_segment(), _vbe_mode()
+    : BiosCommon(mb), _iobase(iobase), _framebuffer_ptr(framebuffer_ptr), _framebuffer_phys(framebuffer_phys), _framebuffer_size(framebuffer_size), _crt_index(0), _ebda_segment(), _vbe_mode(), _last_videomode_request(), _restore_processed(false)
   {
     assert(!(framebuffer_phys & 0xfff));
     assert(!(framebuffer_size & 0xfff));
@@ -576,5 +634,6 @@ PARAM_HANDLER(vga,
   mb.bus_mem      .add(dev, Vga::receive_static<MessageMem>);
   mb.bus_memregion.add(dev, Vga::receive_static<MessageMemRegion>);
   mb.bus_discovery.add(dev, Vga::receive_static<MessageDiscovery>);
+  mb.bus_restore.add(dev, Vga::receive_static<MessageRestore>);
 }
 

From f22aab386d9e8c353a09dbb995a04f4815b694ee Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Tue, 30 Jul 2013 10:51:34 +0200
Subject: [PATCH 10/35] Added a host op stub for the application's
 configuration string retrieval.

The live migration module needs this to tell the target host what kind of VMM has to be started.
---
 include/nul/message.h |  1 +
 unix/main.cc          | 30 ++++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/include/nul/message.h b/include/nul/message.h
index e01eb802..7dd7a6d1 100644
--- a/include/nul/message.h
+++ b/include/nul/message.h
@@ -453,6 +453,7 @@ struct MessageHostOp
       OP_VCPU_RELEASE,
       OP_WAIT_CHILD,
       OP_NEXT_DIRTY_PAGE,
+      OP_GET_CONFIG_STRING,
     } type;
   union {
     unsigned long value;
diff --git a/unix/main.cc b/unix/main.cc
index 90e05739..0ca94fea 100644
--- a/unix/main.cc
+++ b/unix/main.cc
@@ -379,6 +379,36 @@ static bool receive(Device *, MessageHostOp &msg)
         return true;
     }
     break;
+    case MessageHostOp::OP_GET_CONFIG_STRING: {
+        char *cmdline = NULL;
+
+#if PORTED_TO_UNIX
+        // Retrieve the command line string length from sigma0
+        MessageConsole cmsg(MessageConsole::TYPE_START, cmdline);
+        cmsg.read = true;
+        cmsg.mem = 0;
+        unsigned ret = Sigma0Base::console(cmsg);
+        if (ret) {
+            Logging::printf("Error retrieving the command line"
+                    " string length from sigma0.\n");
+            return false;
+        }
+
+        // Retrieve the command line itself
+        cmdline = new char[cmsg.mem+1];
+        cmsg.mem += 1;
+        cmsg.cmdline = cmdline;
+        ret = Sigma0Base::console(cmsg);
+        if (ret) {
+            Logging::printf("Error retrieving the command line string sigma0.\n");
+            return false;
+        }
+#endif
+
+        msg.obj = cmdline;
+    }
+    break;
+
 
     default:
       Logging::panic("%s - unimplemented operation %#x\n",

From edfb110629231f07f1d8f0100f4f7f3194a1e00b Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Tue, 30 Jul 2013 11:05:55 +0200
Subject: [PATCH 11/35] Added class StopWatch to time.h.

---
 include/service/time.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/include/service/time.h b/include/service/time.h
index ab0239be..5ce605cf 100644
--- a/include/service/time.h
+++ b/include/service/time.h
@@ -4,6 +4,8 @@
  * Copyright (C) 2009, Bernhard Kauer <bk@vmmon.org>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -97,3 +99,29 @@ static inline void gmtime(timevalue seconds, struct tm_simple *tm)
   tm->mon  = m + 1;
   tm->mday = days + 1;
 }
+
+class StopWatch
+{
+private:
+    Clock *_clock;
+    unsigned _frequency;
+    timevalue _tic, _toc;
+
+public:
+    void start()      { _tic = _clock->clock(_frequency); }
+    timevalue stop()  { _toc = _clock->clock(_frequency); return delta(); }
+    timevalue delta() { return _toc - _tic; }
+
+    timevalue abs_start() { return _tic; }
+    timevalue abs_stop()  { return _toc; }
+
+    // Returns B/ms, which is actually kB/s (if using default frequency)
+    unsigned rate(mword bytes) {
+        if (delta()) return bytes / delta();
+        else return 0;
+    }
+
+    StopWatch(Clock *clock, unsigned frequency = 1000 /* ms */)
+        : _clock(clock), _frequency(frequency), _tic(0), _toc(0)
+    {}
+};

From 223ee651e4381d30bc041fd01fa40142a9d31c13 Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Wed, 31 Jul 2013 11:51:06 +0200
Subject: [PATCH 12/35] Added class IpHelper containing a skeleton of the
 socket abstraction which was in use in the Vancouver project on NUL to do
 host app networking.

---
 include/nul/iphelper.h | 146 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 146 insertions(+)
 create mode 100644 include/nul/iphelper.h

diff --git a/include/nul/iphelper.h b/include/nul/iphelper.h
new file mode 100644
index 00000000..417599ec
--- /dev/null
+++ b/include/nul/iphelper.h
@@ -0,0 +1,146 @@
+/*
+ * IpHelper class
+ *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
+ * This file is part of Seoul.
+ *
+ * Seoul is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Seoul is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details.
+ *
+ * This was previously used for network communication in the NUL userland
+ * when virtualizing with the NOVA microhypervisor.  Functionality was not
+ * ported and rather the interface is described here to ease porting to the
+ * UNIX socket interface.
+ */
+
+#ifndef __IPHELPER_H
+#define __IPHELPER_H
+
+#include <nul/timer.h>
+
+#define IP_AS_UL(a, b, c, d) ((((d) & 0xfful) << 24) | (((c) & 0xfful) << 16) | (((b) & 0xfful) << 8) | ((a) & 0xfful)) /* a.b.c.d packed as unsigned long, a in the low byte */
+
+class IpHelper;
+
+class TcpSocket
+{
+    friend IpHelper;
+
+    private:
+
+    bool              _outgoing;
+    unsigned short    _local_port;
+    unsigned short    _remote_port;
+
+    // Indicates if we are connected.
+    bool            _connected;
+    // A socket can still be "connected" although closed, if there is still data to be sent.
+    // After sending this data, the socket will finally be marked as "closed"
+    bool            _closed;
+
+    /* ... semaphores used to be initialized here */
+    /* ... buffers ... */
+
+    /* Only to be called by IpHelper. Every member gets a defined start value: unconnected and closed. */
+    TcpSocket(unsigned caps)
+        : _outgoing(false), _local_port(0), _remote_port(0), _connected(false), _closed(true)
+    { /* ... */ }
+
+    /* Forbidden and hence not implemented: */
+    TcpSocket(TcpSocket const&);
+    void operator=(TcpSocket const&);
+
+    public:
+    /*
+     * Methods for the end user!
+     */
+
+    bool block_until_connected() { return false; }
+
+    /* Close this socket. */
+    void close() {}
+
+    /* Blocking receive function. Difference to BSD sockets:
+     * Does _not_ return before it received the expected number of bytes. */
+    bool receive(void *data, unsigned bytes) { return false; }
+
+    /* Blocking send function. Difference to BSD sockets:
+     * Does _not_ return before the user ACKed all bytes. */
+    bool send(void *data, unsigned bytes) { return false; }
+
+    /* Nonblocking send function. Returns immediately.
+     * Call wait_complete after you pushed multiple send_nonblocking() calls. */
+    bool send_nonblocking(void *data, unsigned bytes) { return false; }
+
+    /* Wait until the receiver ACKed all packets sent from this socket. */
+    bool wait_complete() { return false; }
+};
+
+class IpHelper
+{
+    private:
+        /* ... */
+
+        unsigned long long _mac;
+
+        mword _ip;
+        mword _netmask;
+        mword _gateway;
+
+        TcpSocket *_sockets;
+
+        IpHelper() : _mac(0), _ip(0), _netmask(0), _gateway(0), _sockets(NULL)
+        {};
+
+
+        /* Forbidden, hence not implemented: */
+        IpHelper(IpHelper const&);
+        void operator=(IpHelper const&);
+
+    public:
+        /* This is a singleton */
+        static IpHelper & instance()
+        {
+            static IpHelper instance;
+            return instance;
+        }
+
+        /* === These methods are to be used from the network thread === */
+
+        /* Attach a KernelSemaphore to this and get notified on timeout events.
+         * You will better attach this to network events, too. */
+        unsigned timer_sm() { return 0; /* This used to return a network timer semaphor capability */ }
+
+        /* Call this after the semaphore let you through to reprogram for the next timeout */
+        void check_timeout() {}
+
+        /* Call this regularly to let sockets send */
+        void sockets_send() {}
+
+        /* Feed this method regularly with new incoming packets from the network. */
+        void do_tcpip(unsigned char* data, unsigned size) {}
+
+        /* === These methods are to be used by the actual end user === */
+
+        /* Call this once at the beginning to initialize everything. */
+        bool init(/* ... */) { return false;}
+
+        /* Block-wait until IpHelper gets an IP and return its value. */
+        mword get_ip() { return 0; }
+
+        /* Connect to port at given IP and return a working socket. */
+        TcpSocket * connect(unsigned addr, unsigned port) { return NULL; }
+
+        /* Make a socket listen on port and return a TcpSocket object when a connection
+         * was established */
+        TcpSocket * listen(unsigned port) { return NULL; }
+};
+
+#endif /* __IPHELPER_H */

From 8d21f2f574bf618f82d888493d0336edeb1ec71b Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Wed, 31 Jul 2013 11:52:12 +0200
Subject: [PATCH 13/35] Added the main live migration code.

This is only compiled into the project and not in use, yet.
The next commit will embed these mechanisms into main.cc
---
 host/migration.cc               | 719 ++++++++++++++++++++++++++++++++
 include/nul/message.h           |   1 +
 include/nul/migration.h         | 276 ++++++++++++
 include/nul/migration_structs.h | 115 +++++
 unix/SConstruct                 |   1 +
 5 files changed, 1112 insertions(+)
 create mode 100644 host/migration.cc
 create mode 100644 include/nul/migration.h
 create mode 100644 include/nul/migration_structs.h

diff --git a/host/migration.cc b/host/migration.cc
new file mode 100644
index 00000000..25c1c9e1
--- /dev/null
+++ b/host/migration.cc
@@ -0,0 +1,719 @@
+/**
+ * Base migration code
+ *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
+ * This file is part of Seoul.
+ *
+ * Seoul is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Seoul is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details.
+ */
+
+
+#include <stdio.h> // snprintf
+
+#include <nul/motherboard.h>
+#include <nul/vcpu.h>
+
+#include <nul/migration.h>
+#include <service/vprintf.h>
+#include <service/time.h>
+
+Migration::Migration(Motherboard *mb)
+: _mb(mb),
+    _vcpu_utcb(NULL),
+#if PORTED_TO_UNIX
+    _vcpu_blocked_sem(cap, true),
+    _vcpu_sem(cap+1, true),
+#endif
+    _vcpu_should_block(false),
+    _socket(NULL),
+    _sendmem(0), _sendmem_total(0)
+{
+    MessageHostOp msg(MessageHostOp::OP_GUEST_MEM, 0UL);
+    if (!_mb->bus_hostop.send(msg))
+        Logging::panic("%s failed to get physical memory\n",
+                __PRETTY_FUNCTION__);
+
+    _physmem_start = msg.ptr;
+    _physmem_size  = msg.len;
+
+    _dirtman = DirtManager(_physmem_size >> 12);
+
+    _vcpu_utcb = new CpuState;
+}
+
+Migration::~Migration()
+{
+}
+
+void Migration::save_guestregs(CpuState *utcb)
+{
+    /* After Migration::freeze_vcpus() was called, the VCPU will
+     * arrive in the recall handler and call this method here.
+     * Its register states are saved and then it hangs in
+     * our lock. Without a pending freeze request this is a no-op.
+     */
+    if (!_vcpu_should_block) return;
+    // Byte count of the register block spanning the fields mtd .. id (inclusive)
+    mword vcpu_bytes = reinterpret_cast<mword>(&utcb->id+1);
+    vcpu_bytes -= reinterpret_cast<mword>(&utcb->mtd);
+
+    memcpy(&_vcpu_utcb->mtd, &utcb->mtd, vcpu_bytes);
+
+#if PORTED_TO_UNIX
+    // Release the waiting migration thread
+    _vcpu_blocked_sem.up();
+    // Freeze VCPU
+    _vcpu_sem.downmulti();
+#endif
+}
+
+/* This is used to print messages onto the screen
+ * just after the VMM has started and waits for incoming
+ * guest state data.
+ */
+bool Migration::puts_guestscreen(const char *str, bool reset_screen)
+{
+    MessageRestore msg(MessageRestore::VGA_DISPLAY_GUEST,
+            const_cast<char*>(str), reset_screen);
+    return _mb->bus_restore.send(msg, true);
+}
+
+void Migration::print_welcomescreen()
+{
+    char welcome_msg[255];
+    mword ip = IpHelper::instance().get_ip();
+
+    snprintf(welcome_msg, sizeof(welcome_msg),
+            "   Waiting for guest to migrate. IP: %lu.%lu.%lu.%lu\n\n",
+            ip & 0xff, (ip >> 8) & 0xff, (ip >> 16) & 0xff, (ip >> 24) & 0xff);
+    puts_guestscreen(welcome_msg, true);
+}
+
+void Migration::freeze_vcpus()
+{
+    Logging::printf("Stopping vcpu.\n");
+
+    _vcpu_should_block = true;
+
+    CpuEvent smsg(VCpu::EVENT_RESUME);
+    for (VCpu *vcpu = _mb->last_vcpu; vcpu; vcpu=vcpu->get_last())
+        vcpu->bus_event.send(smsg);
+
+#if PORTED_TO_UNIX
+    _vcpu_blocked_sem.downmulti();
+#endif
+}
+
+void Migration::unfreeze_vcpus()
+{
+    _vcpu_should_block = false;
+#if PORTED_TO_UNIX
+    /* After releasing the VCPU it will continue
+     * through the rest of the recall handler.
+     */
+    _vcpu_sem.up();
+#endif
+}
+
+bool Migration::chksum_page(unsigned page_nr, mword &their_chksum, bool compare)
+{
+    mword my_chksum = 0;
+    assert(page_nr < (_physmem_size >> 12));
+    // Checksums one 4K guest page, then either compares with or fills in their_chksum
+    mword *ptr = reinterpret_cast<mword*>(_physmem_start + (page_nr << 12));
+    // NOTE(review): the weight is a host pointer value; sums presumably only match if both sides map guest memory alike - confirm
+    for (unsigned i=0; i < 4096 / sizeof(ptr[0]); ++i)
+        // checksum = sum over (address_i * value_i^2); the address must follow i
+        my_chksum += reinterpret_cast<mword>(ptr+i) * (ptr[i]) * (ptr[i]);
+
+    // Use case one: return true if given memory range is correct
+    if (compare) return my_chksum == their_chksum;
+
+    // Second use case: Provide a checksum for a given memory range
+    their_chksum = my_chksum;
+    return true;
+}
+
+bool Migration::checksums(bool retrieve)
+{
+    unsigned entries = _physmem_size >> 12;
+    bool success = true;
+
+    mword *chksum = new mword[entries];
+    if (!chksum) Logging::panic("Allocating checksum list error\n");
+
+    Logging::printf("Checksumming the area [%8lx - %8lx)\n",
+            reinterpret_cast<mword>(_physmem_start),
+            reinterpret_cast<mword>(_physmem_start + 4096 * entries));
+
+    if (retrieve) {
+        // Receiver. Check the existing checksum list against our memory
+        _socket->receive(chksum, entries * sizeof(chksum[0]));
+
+        unsigned err = 0;
+
+        for (unsigned i=0; i < entries; ++i) {
+            bool ret = chksum_page(i, chksum[i], true);
+            if (!ret) {
+                ++err;
+                Logging::printf("bad page received. page number: %8x\n", i);
+            }
+            success &= ret;
+        }
+
+        Logging::printf("Erroneous pages: %u\n", err);
+    }
+    else {
+        // Sender. Make a list of checksums and send it away.
+
+        for (unsigned i=0; i < entries; ++i)
+            chksum_page(i, chksum[i], false);
+
+        success &= _socket->send(chksum, entries * sizeof(chksum[0]));
+    }
+
+    delete [] chksum;
+
+    return success;
+}
+
+/***********************************************************************
+ * Guest receiving part
+ ***********************************************************************/
+
+bool Migration::receive_ping()
+{
+    mword ping_msg = 0;
+    // Receiver half of the handshake: expect the magic value 0xc0ffee
+    _socket->receive(&ping_msg, sizeof(ping_msg)); // result unchecked; ping_msg stays 0 if nothing was stored
+
+    if (ping_msg != 0xc0ffee) {
+        Logging::printf("Received bad ping message.\n");
+        return false;
+    }
+    // Answer with three times the received value, which is what send_ping() compares against
+    ping_msg *= 3;
+    _socket->send(&ping_msg, sizeof(ping_msg)); // result unchecked: true is returned even if the reply was not sent
+
+    return true;
+}
+
+void Migration::receive_header()
+{
+    MigrationHeader mig_header;
+
+    Logging::printf("Receiving guest information.\n");
+
+    _socket->receive(&mig_header, sizeof(mig_header));
+    if (!mig_header.magic_string_check())
+        Logging::panic("Magic string check failed: MigrationHeader\n");
+
+    MessageRestore vgamsg(MessageRestore::VGA_VIDEOMODE, NULL, true);
+    vgamsg.bytes = mig_header.videomode;
+    _mb->bus_restore.send(vgamsg, true);
+}
+
+void Migration::receive_memory()
+{
+    StopWatch watch(_mb->clock());
+    Logging::printf("Receiving guest memory.\n");
+
+    Prd current;
+    unsigned long bytes = 0;
+    // The stream consists of (Prd, page data) pairs and ends with an all-zero Prd
+    watch.start();
+    while (1) {
+        _socket->receive(&current, sizeof(current));
+        if (!current.value()) break; // Receiving an empty range descriptor means "EOF"
+        // The descriptor comes from the network: never write outside of guest memory
+        if (static_cast<unsigned long long>(current.base()) + current.size() > _physmem_size)
+            Logging::panic("%s: page range outside of guest memory\n", __PRETTY_FUNCTION__);
+        _socket->receive(current.base() + _physmem_start, current.size());
+        bytes += current.size();
+    }
+    watch.stop();
+
+    Logging::printf("Received %lu MB. RX Rate: %u KB/s\n",
+            bytes / 1024 / 1024, watch.rate(bytes));
+}
+
+/* Being equipped with a pointer to the stopped VCPU's
+ * register state structure, its registers will be overwritten
+ * and devices restored.
+ */
+bool Migration::receive_guestdevices(CpuState *vcpu_utcb)
+{
+    Logging::printf("Receiving UTCB.\n");
+
+    CpuState *buf = new CpuState;
+
+    mword utcb_end = reinterpret_cast<mword>(&buf->id+1);
+    mword utcb_start = reinterpret_cast<mword>(&buf->mtd);
+    mword utcb_bytes =  utcb_end - utcb_start;
+
+    _socket->receive(&buf->mtd, utcb_bytes);
+
+    memcpy(&vcpu_utcb->mtd, &buf->mtd, utcb_bytes);
+
+    delete buf;
+
+    Logging::printf("Receiving Devices.\n");
+
+    // This works quite similar to the device saving procedure
+    MessageRestore *rmsg = new MessageRestore(MessageRestore::RESTORE_RESTART,
+            NULL, false);
+    _mb->bus_restore.send_fifo(*rmsg);
+
+    // no while(someone_responds_true) approach here because we know
+    // what we want to restore and how many.
+    bool ret;
+    while (1) {
+        _socket->receive(rmsg, sizeof(*rmsg));
+        assert(rmsg->magic_string_check());
+
+        if (rmsg->devtype == 0xdead)
+            break;
+
+        char *device_buffer = new char[rmsg->bytes];
+        _socket->receive(device_buffer, rmsg->bytes);
+
+        rmsg->space = device_buffer;
+        rmsg->write = false;
+        ret = _mb->bus_restore.send(*rmsg, true);
+        if (!ret) Logging::printf("No device replied on restore message!"
+                " VMM-Configuration mismatch?\n");
+
+        delete [] device_buffer;
+    }
+
+    delete rmsg;
+
+    /* Fix TSC offset.
+     * The guest would freeze for some time or skip some timesteps otherwise.
+     */
+    unsigned long long sender_rdtsc;
+    _socket->receive(&sender_rdtsc, sizeof(sender_rdtsc));
+
+    CpuMessage rdtsc_msg(CpuMessage::TYPE_ADD_TSC_OFF, NULL, 0);
+    rdtsc_msg.current_tsc_off = sender_rdtsc - Cpu::rdtsc();
+
+    for (VCpu *vcpu = _mb->last_vcpu; vcpu; vcpu=vcpu->get_last())
+        vcpu->executor.send(rdtsc_msg);
+
+    return true;
+}
+
+bool Migration::listen(unsigned port, CpuState *vcpu_utcb)
+{
+    print_welcomescreen();
+
+    _socket = IpHelper::instance().listen(port);
+    if (_socket == NULL) Logging::panic("Got no TCP receiver.\n");
+
+    receive_ping();
+
+    receive_header();
+
+    receive_memory();
+
+    receive_guestdevices(vcpu_utcb);
+
+#if 0
+    // Checksumming really makes the migration gap larger
+    if (!checksums(true)) {
+        Logging::printf("Error while comparing checksums.\n");
+        return false;
+    }
+#endif
+
+    _socket->close();
+
+    Logging::printf("That's it. Waking up VCPUs.\n");
+    unfreeze_vcpus();
+
+    return true;
+}
+
+/***********************************************************************
+ * Guest sending part
+ ***********************************************************************/
+
+unsigned Migration::negotiate_port()
+{
+    MessageHostOp msg(MessageHostOp::OP_GET_CONFIG_STRING, 0ul);
+    if (!_mb->bus_hostop.send(msg))
+        return 0;
+    assert(msg.obj != NULL);
+    char *cmdline = reinterpret_cast<char*>(msg.obj);
+
+    /* Send the listener service our configuration string.
+     * It will try to start an identically configured VMM
+     * instance and then tell us on what port it is waiting
+     * for state input. Returns that port, or 0 on any failure.
+     */
+    MigrationInit mig_init(strlen(cmdline));
+    bool sent = _socket->send(&mig_init, sizeof(mig_init))
+        && _socket->send(cmdline, mig_init.cmdlen);
+    // The string is ours to free; send() is blocking, so release it on every path
+    delete [] cmdline;
+    if (!sent) return 0;
+
+    MigrationAnswer mig_ans;
+    _socket->receive(&mig_ans, sizeof(mig_ans));
+    if (!mig_ans.magic_string_check()) {
+        Logging::printf("Magic string check failed: MigrationAnswer\n");
+        return 0;
+    }
+
+    if (!mig_ans.success) {
+        Logging::printf("Configuration is not suitable for target machine.\n");
+        return 0;
+    }
+
+    return mig_ans.port;
+}
+
+bool Migration::send_header()
+{
+    /* Sending the listening VMM the video mode setting will allow it
+     * to switch the framebuffer to the right setting before migration.
+     * The screen would flicker and display ugly symbols if the
+     * framebuffer state is restored, but the host doesn't display it
+     * the right way, otherwise.
+     */
+    MessageRestore vgamsg(MessageRestore::VGA_VIDEOMODE, NULL, false);
+    _mb->bus_restore.send(vgamsg, true);
+
+    MigrationHeader mig_header(vgamsg.bytes);
+    return _socket->send(&mig_header, sizeof(mig_header));
+}
+
+timevalue Migration::send_ping()
+{
+    StopWatch ping_timer(_mb->clock());
+    // Sender half of the handshake: send 0xc0ffee, expect three times that value back
+    mword ping_msg = 0xc0ffee;
+    mword pong_msg = 0;
+    // Only the send/receive round trip itself is timed
+    ping_timer.start();
+    _socket->send(&ping_msg, sizeof(ping_msg));
+    _socket->receive(&pong_msg, sizeof(pong_msg));
+    ping_timer.stop();
+    // NOTE(review): 0 is the error value, yet delta() is presumably also 0 for a round trip shorter than one tick - confirm
+    if (pong_msg != 3 * ping_msg) {
+        Logging::printf("Error during latency check\n");
+        return 0;
+    }
+    // Round trip time in StopWatch ticks; the caller halves it to estimate the one-way latency
+    return ping_timer.delta();
+}
+
+#define NEXT_DIRTY_PAGE() \
+({ \
+        MessageHostOp msg(MessageHostOp::OP_NEXT_DIRTY_PAGE, 0ul); \
+        _mb->bus_hostop.send(msg); \
+        msg.value; \
+})
+
+unsigned Migration::enqueue_all_dirty_pages(longrange_data &async_data)
+{
+    Prd *crds = async_data.crds;
+    unsigned crds_sent=0;
+
+    Prd first_crd, last_crd;
+
+    /* This loop will cycle through the memory space
+     * until it ends up without any new dirty regions
+     * or it has done a full cycle.
+     */
+    while (1) {
+        Prd current(NEXT_DIRTY_PAGE());
+
+        if (!current.value() || // Nothing dirty
+            // Next round through the memspace
+            (first_crd.value() && current.base() == first_crd.base()) ||
+            (last_crd.value() && current.base() == last_crd.base()))
+            break;
+
+        /* These pages are just _marked_ dirty in another data structure,
+         * the dirt manager.
+         * This structure might be able to apply some smart optimizations
+         * in the future like e.g. "don't resend pages too often which are dirtied
+         * with high access-frequency to reduce traffic", etc.
+         */
+        _dirtman.mark_dirty(current);
+
+        if (!first_crd.value()) first_crd = current;
+        last_crd = current;
+    }
+
+    unsigned pages_enqueued = 0;
+    while (_dirtman.dirty_pages() > 0 && crds_sent < async_data.crd_count) {
+        Prd current = crds[crds_sent] = _dirtman.next_dirty();
+        if (!current.value())
+            // That's it for now.
+            break;
+
+        _dirtman.mark_clean(current);
+
+        if (!_socket->send_nonblocking(&crds[crds_sent], sizeof(*crds)) ||
+            !_socket->send_nonblocking(current.base() + _physmem_start,
+                current.size()))
+            return 0;
+
+        ++crds_sent;
+        pages_enqueued += 1 << current.order();
+    }
+
+    return pages_enqueued;
+}
+
+bool Migration::send_memory(longrange_data &async_data)
+{
+    StopWatch lap_time(_mb->clock());
+    StopWatch last_lap(_mb->clock());
+
+    unsigned transfer_rate;
+    unsigned dirtying_rate;
+
+    /* The underlying socket architecture works a little bit different than
+     * BSD sockets, where you stuff data to be sent into the send buffer
+     * until it replies with "buffer is full, wait a bit".
+     * These sockets here asynchronously manage lists of pointers to memory ranges
+     * and their size and will pick up this data when it is actually needed.
+     * And because of this we have to preserve all memory ranges to be sent
+     * until they are ACKed.
+     */
+
+    const unsigned page_limit = 1000;
+    unsigned pages_transferred;
+    unsigned round = 0;
+    async_data.crds = new Prd[page_limit];
+    async_data.crd_count = page_limit;
+
+    MessageRestore unplug_msg(MessageRestore::PCI_PLUG, NULL, false);
+    _mb->bus_restore.send(unplug_msg, false);
+
+    do {
+        last_lap = lap_time;
+        lap_time.start();
+
+        if (!(pages_transferred = enqueue_all_dirty_pages(async_data)) ||
+            !_socket->wait_complete())
+            return false;
+
+        lap_time.stop();
+
+        transfer_rate = lap_time.rate(pages_transferred << 12);
+        dirtying_rate = last_lap.rate(pages_transferred << 12);
+        Logging::printf("RND %u PAGE_CNT %5u TX %5u KB/s DRT %5u KB/s DELTA"
+                " %llu START %llu\n",
+                round, pages_transferred, transfer_rate, dirtying_rate,
+                lap_time.delta(), lap_time.abs_start());
+
+        assert(pages_transferred);
+
+        _sendmem_total += pages_transferred << 12;
+        if (_sendmem == 0) _sendmem = _sendmem_total;
+        ++round;
+    } while (transfer_rate >= dirtying_rate);
+
+    // The last transfer round with a frozen guest system will follow now
+    freeze_vcpus();
+
+    static Prd end_of_crds;
+    pages_transferred = enqueue_all_dirty_pages(async_data);
+    Logging::printf("pages_dirty: %x\n", pages_transferred);
+    if (!pages_transferred ||
+        !_socket->send_nonblocking(&end_of_crds, sizeof(end_of_crds)))
+        return false;
+
+    Logging::printf("Enqueued the last %u dirty pages\n", pages_transferred);
+    return true;
+}
+
+bool Migration::send_devices(longrange_data dat)
+{
+    // Send VCPU state
+#if PORTED_TO_UNIX
+    unsigned vcpu_bytes = reinterpret_cast<unsigned>(&_vcpu_utcb->id+1);
+    vcpu_bytes -= reinterpret_cast<unsigned>(&_vcpu_utcb->mtd);
+
+    if (!_socket->send(&_vcpu_utcb->mtd, vcpu_bytes))
+        return false;
+#endif
+
+    /* There are multiple RESTORE_xxx types of restore messages.
+     * For each kind of device there is one.
+     * So we throw messages of each type onto the bus.
+     */
+    MessageRestore restart_msg(MessageRestore::RESTORE_RESTART, NULL, true);
+    _mb->bus_restore.send_fifo(restart_msg);
+
+    mword restore_bytes = restart_msg.bytes;
+    mword restore_bytes_consumed = 0;
+    dat.restore_buf = new char[restore_bytes + sizeof(MessageRestore)];
+    // NOTE(review): dat is passed by value, so this pointer never reaches the caller's copy - confirm who frees the buffer
+    for (int i=MessageRestore::RESTORE_RESTART+1;
+            i < MessageRestore::RESTORE_LAST;
+            i++) {
+        /* A device will receive this message, write its state into it and
+         * return true. If it receives such a message again, it will return
+         * false. That's why we sent this RESTORE_RESTART message before.
+         * After the first time the bus returns false, we know that we saved
+         * all devices of this particular type.
+         */
+        while (1) {
+            char *msg_addr = dat.restore_buf + restore_bytes_consumed;
+            char *device_space = dat.restore_buf + restore_bytes_consumed
+                + sizeof(MessageRestore);
+            // The header is written in place before the bus is asked, hence the spare header-sized room in the buffer
+            MessageRestore *rmsg = reinterpret_cast<MessageRestore*>(msg_addr);
+            memset(rmsg, 0, sizeof(*rmsg));
+
+            rmsg->devtype = i;
+            rmsg->write = true;
+            rmsg->space = device_space;
+            rmsg->magic_string = MessageRestore::MAGIC_STRING_DEVICE_DESC;
+
+            if (!_mb->bus_restore.send(*rmsg, true)) break;
+
+            restore_bytes_consumed += sizeof(*rmsg) + rmsg->bytes;
+        }
+    }
+    assert(restore_bytes == restore_bytes_consumed);
+
+    if (!_socket->send_nonblocking(dat.restore_buf, restore_bytes) ||
+           // Send "end of devices"
+        !_socket->send_nonblocking(&dat.end_of_devices,
+            sizeof(dat.end_of_devices)) ||
+        !_socket->wait_complete()) {
+        Logging::printf("Error sending device states.\n");
+        return false;
+    }
+
+    // Restore current tsc offset at destination
+    dat.rdtsc  = Cpu::rdtsc();
+    /* Compensate network latency.
+     * This was tested with cloning a VM displaying animations
+     * which were bound to TSC values. After migration,
+     * they only ran in sync when the following line was applied.
+     */
+    dat.rdtsc += dat.latency * _mb->clock()->freq() / 1000;
+
+    if (!_socket->send(&dat.rdtsc, sizeof(dat.rdtsc))) {
+        Logging::printf("Error sending RDTSC\n");
+        return false;
+    }
+
+    return true;
+}
+
+bool Migration::send(unsigned long addr, unsigned long port)
+{
+    StopWatch migration_timer(_mb->clock());
+    StopWatch freeze_timer(_mb->clock());
+    longrange_data async_data;
+
+    Logging::printf("Trying to connect...\n");
+    _socket = IpHelper::instance().connect(addr, port);
+    if (_socket == NULL) {
+        Logging::printf("Quitting: Got no TCP connection.\n");
+        return false;
+    }
+
+    Logging::printf("Established connection.\n");
+
+    unsigned mig_port = negotiate_port();
+
+    _socket->close();
+
+    if (!mig_port) return false;
+
+    Logging::printf("Connecting to waiting target VM.\n");
+    _socket = IpHelper::instance().connect(addr, mig_port);
+    if (!_socket) {
+        Logging::printf("Error connecting to target VM.\n");
+        return false;
+    }
+    Logging::printf("OK, starting the actual migration.\n");
+
+    migration_timer.start();
+
+    async_data.latency = send_ping();
+    if (!async_data.latency) {
+        Logging::printf("Ping failed.\n");
+        return false;
+    }
+    // Latency = round trip time / 2
+    async_data.latency >>= 1;
+    Logging::printf("Connection has a latency of %lu ms * freq %llu kHz"
+            " = %llu ticks.\n",
+            async_data.latency, _mb->clock()->freq() / 1000,
+            async_data.latency * _mb->clock()->freq() / 1000);
+
+    if (!send_header()) {
+        Logging::printf("Sending header failed.\n");
+        return false;
+    }
+    if (!send_memory(async_data)) {
+        Logging::printf("Sending guest state failed.\n");
+        return false;
+    }
+    freeze_timer.start();
+
+    if (!send_devices(async_data)) {
+        Logging::printf("Sending guest devices failed.\n");
+        return false;
+    }
+
+#if 0
+    // Checksumming really makes the freeze gap larger
+    if (!checksums(false)) {
+        Logging::printf("Error while sending checksums.\n");
+        return false;
+    }
+#endif
+
+    // Uncomment this to "clone" the VM instead of migrating it away.
+    //unfreeze_vcpus();
+
+    freeze_timer.stop();
+
+    _socket->close();
+
+    migration_timer.stop();
+
+    Logging::printf("Done. VM was frozen for %llu ms.\n", freeze_timer.delta());
+    Logging::printf("This migration took %llu seconds.\n",
+            migration_timer.delta() / 1000);
+    Logging::printf("%3lu%% (%lu MB) of guest memory resent due to change.\n",
+            100u * (_sendmem_total - _sendmem) / _sendmem,
+            (_sendmem_total - _sendmem) / 1024u / 1024u);
+
+    _dirtman.print_stats();
+
+    delete [] async_data.crds;
+    delete [] async_data.restore_buf;
+#if PORTED_TO_UNIX
+    delete _vcpu_utcb;
+#endif
+
+    return true;
+}
+
+PARAM_HANDLER(retrieve_guest,
+	      "retrieve_guest:<port> - Start a VMM instance which waits for guest",
+          " state input over network listening on <port>")
+{
+    MessageHostOp msg(MessageHostOp::OP_MIGRATION_RETRIEVE_INIT, argv[0]);
+    mb.bus_hostop.send(msg);
+}
diff --git a/include/nul/message.h b/include/nul/message.h
index 7dd7a6d1..bcf27351 100644
--- a/include/nul/message.h
+++ b/include/nul/message.h
@@ -454,6 +454,7 @@ struct MessageHostOp
       OP_WAIT_CHILD,
       OP_NEXT_DIRTY_PAGE,
       OP_GET_CONFIG_STRING,
+      OP_MIGRATION_RETRIEVE_INIT,
     } type;
   union {
     unsigned long value;
diff --git a/include/nul/migration.h b/include/nul/migration.h
new file mode 100644
index 00000000..f64cdd51
--- /dev/null
+++ b/include/nul/migration.h
@@ -0,0 +1,276 @@
+/**
+ * Base migration code declarations
+ *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
+ * This file is part of Seoul.
+ *
+ * Seoul is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Seoul is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details.
+ */
+
+
+
+#include <nul/motherboard.h>
+#include <nul/iphelper.h>
+#include <nul/migration_structs.h>
+
+class Desc
+{
+    protected:
+          unsigned _value;
+            Desc(unsigned v) : _value(v) {}
+    public:
+              unsigned value() { return _value; }
+};
+
+/**
+ *  A page range descriptor;
+ *  Introduced, because NUL provided CRDs for this...
+ **/
+class Prd
+{
+    protected:
+        unsigned _value;
+
+    public:
+        unsigned order() { return ((_value >> 7) & 0x1f); }
+        unsigned size()  { return 1 << (order() + 12); }
+        unsigned base()  { return _value & ~0xfff; }
+        unsigned attr()  { return _value & 0x1f; }
+        unsigned cap()   { return _value >> 12; }
+        unsigned value() { return _value; }
+
+        explicit Prd(unsigned offset, unsigned order, unsigned attr) : _value((offset << 12) | (order << 7) | attr) { }
+        explicit Prd(unsigned v) : _value(v) {}
+        explicit Prd() : Prd(0) {}
+};
+
+/* The DirtManager is fed with CRDs of dirty page regions.
+ * There's an internal bitmap which can be used for future resend-optimizations
+ * as well as generating resend-statistics.
+ */
+class DirtManager
+{
+    private:
+        unsigned *_map;
+        unsigned  _pages;
+
+        unsigned char *_cnt;
+
+        unsigned _dirt_count;
+
+    public:
+        void mark_dirty(Prd dirty)
+        {
+            unsigned base  = dirty.base() >> 12;
+            unsigned pages = 1 << dirty.order();
+            for (unsigned i=base; i < base + pages; ++i) mark_dirty(i);
+        }
+
+        void mark_dirty(unsigned page)
+        {
+            if (!Cpu::get_bit(_map, page)) {
+                ++_dirt_count;
+                ++_cnt[page];
+            }
+            Cpu::set_bit(_map, page, true);
+        }
+
+        void mark_clean(Prd clean)
+        {
+            unsigned base  = clean.base() >> 12;
+            unsigned pages = 1 << clean.order();
+            for (unsigned i=base; i < base + pages; ++i) mark_clean(i);
+        }
+
+        void mark_clean(unsigned page)
+        {
+            if (Cpu::get_bit(_map, page)) --_dirt_count; // mirror mark_dirty(): only count real dirty->clean transitions, so the unsigned counter cannot underflow
+            Cpu::set_bit(_map, page, false);
+        }
+
+        unsigned dirty_pages() { return _dirt_count; }
+
+        Prd next_dirty() { // returns a Prd for the first run of dirty pages, or an empty Prd() if none is dirty
+            unsigned base, len = 0; // len must be defined even if _pages == 0 and the loop never runs
+
+            for (base = 0; base < _pages; ++base) {
+                len = 0;
+                while (base + len < _pages && Cpu::get_bit(_map, base + len)) ++len; // never scan past the tracked page range
+
+                if (len > 0) break;
+            }
+
+            if (len == 0) return Prd();
+
+            Prd ret(base, Cpu::bsr(len), 0); // order = Cpu::bsr(len); presumably floor(log2(len)), so the run may be reported in several pieces -- confirm
+            return ret;
+        }
+
+        static inline unsigned char fir_max(unsigned char *in, unsigned limit, unsigned pos, int size)
+        {
+            int beg = pos - size;
+            int end = pos + size;
+            beg = VMM_MAX(beg, static_cast<int>(0));
+            end = VMM_MIN(end, static_cast<int>(limit - 1));
+
+            int width = end - beg;
+            assert(width > 0);
+            assert(width < 2 * size + 1);
+
+            unsigned max = 0;
+            for (int i=beg; i <= end; ++i) max = VMM_MAX(max, in[i]);
+
+            return static_cast<unsigned char>(max);
+        }
+
+        void print_stats() // logs average and spread of the per-page remap counters in _cnt
+        {
+            const unsigned size = 20;
+            unsigned char bucket[size + 1] = {0}; // zeroed histogram; slot `size` collects pages clamped to `size` faults
+
+            unsigned sx = 0, sqx = 0;
+
+            unsigned char *smooth[3];
+
+            smooth[0] = new unsigned char[_pages];
+            smooth[1] = new unsigned char[_pages];
+            smooth[2] = new unsigned char[_pages];
+
+            for (unsigned i=0; i < _pages; ++i) {
+                unsigned faults = VMM_MIN(_cnt[i], size);
+                ++bucket[faults];
+
+                sx  += faults;
+                sqx += faults * faults;
+
+                for (unsigned j=0; j < 3; ++j)
+                    smooth[j][i] = fir_max(_cnt, _pages, i, j*50+1);
+            }
+
+            float avg = _pages ? static_cast<float>(sx) / _pages : 0; // float division; guard against an empty manager
+            float var = sqx - _pages * avg * avg; // NOTE(review): this is the sum of squared deviations (n * variance), not the variance itself
+
+            Logging::printf("# avg = %u, var = %u\n",
+                    static_cast<unsigned>(avg), static_cast<unsigned>(var));
+
+#if 0
+            /* This generates a really long list needed for plotting
+             * statistics
+             */
+            Logging::printf("# Remaps per page:\n");
+            for (unsigned i = 0; i < _pages; ++i)
+                Logging::printf("REMAP %#x %u %u %u %u\n",
+                        i, _cnt[i], smooth[0][i], smooth[1][i], smooth[2][i]);
+#endif
+
+            delete [] smooth[0];
+            delete [] smooth[1];
+            delete [] smooth[2];
+        }
+
+        DirtManager() : _map(NULL), _pages(0), _cnt(NULL), _dirt_count(0) {}
+        DirtManager(unsigned pages) : _map(NULL), _pages(pages), _cnt(NULL), _dirt_count(0)
+        {
+            _map = new unsigned[(pages + sizeof(*_map) -1) / sizeof(*_map)](); // trailing () zero-fills the bitmap so every page starts out clean
+            _cnt = new unsigned char[pages];
+            memset(_cnt, 0, pages * sizeof(*_cnt));
+        }
+        ~DirtManager()
+        {
+            if (_map) delete [] _map;
+            if (_cnt) delete [] _cnt;
+        }
+};
+
+class Migration : public StaticReceiver<Migration>
+{
+    Motherboard     *_mb;
+#if PORTED_TO_UNIX
+    Hip             *_hip;
+    CapAllocator    *_tls;
+#endif
+
+    char         *_physmem_start;
+    unsigned long _physmem_size;
+
+    CpuState         *_vcpu_utcb;
+#if PORTED_TO_UNIX
+    KernelSemaphore   _vcpu_blocked_sem;
+    KernelSemaphore   _vcpu_sem;
+#endif
+    bool              _vcpu_should_block;
+
+    TcpSocket       *_socket;
+
+    unsigned long   _sendmem;
+    unsigned long   _sendmem_total;
+
+    /* Because of asynchronous send operations, all
+     * data to be sent has to be preserved somewhere until
+     * it is ACKED. That's what this structure is for.
+     */
+    struct longrange_data {
+        unsigned          crd_count;
+        Prd              *crds;
+
+        timevalue         rdtsc;
+        char             *restore_buf;
+        MessageRestore    end_of_devices;
+
+        mword             latency;
+
+        longrange_data() :
+            crd_count(0), crds(NULL),
+            rdtsc(0), restore_buf(NULL), end_of_devices(0xdead, NULL, true),
+            latency(0) {}
+    };
+
+    DirtManager _dirtman;
+
+    void print_welcomescreen();
+    bool puts_guestscreen(const char *str, bool reset_screen);
+
+    void freeze_vcpus();
+    void unfreeze_vcpus();
+
+    unsigned negotiate_port();
+    bool send_header();
+    timevalue send_ping();
+    bool send_devices(longrange_data dat);
+    unsigned enqueue_all_dirty_pages(longrange_data &async_data);
+    bool send_memory(longrange_data &async_data);
+
+    void receive_header();
+    bool receive_ping();
+    void receive_memory();
+    bool receive_guestdevices(CpuState *vcpu_utcb);
+
+    bool chksum_page(unsigned page_nr, mword &their_chksum, bool compare);
+    bool checksums(bool retrieve);
+
+ public:
+    enum RestoreModes {
+        MODE_OFF = 0,
+        MODE_SEND,
+        MODE_RECEIVE
+    };
+
+    bool listen(unsigned port , CpuState *vcpu_utcb);
+    bool send(unsigned long addr, unsigned long port);
+
+    // To be called from do_recall
+    void save_guestregs(CpuState *utcb);
+
+	bool receive(MessageHostOp &msg);
+
+    Migration(Motherboard *mb);
+    ~Migration();
+};
diff --git a/include/nul/migration_structs.h b/include/nul/migration_structs.h
new file mode 100644
index 00000000..86893592
--- /dev/null
+++ b/include/nul/migration_structs.h
@@ -0,0 +1,115 @@
+/**
+ * Migration protocol structures
+ *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
+ * This file is part of Seoul.
+ *
+ * Seoul is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Seoul is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details.
+ */
+
+
+struct MigrationInit {
+#define MAGIC_STRING_MIGINIT 0xb00b00
+    mword cmdlen;
+    mword magic_string;
+
+    MigrationInit() : cmdlen(0), magic_string(MAGIC_STRING_MIGINIT) {}
+    MigrationInit(mword _cmdlen) : cmdlen(_cmdlen), magic_string(MAGIC_STRING_MIGINIT) {}
+    bool magic_string_check() { return magic_string == MAGIC_STRING_MIGINIT; }
+};
+
+struct MigrationAnswer {
+#define MAGIC_STRING_MIGANSWER 0xfeeb1ed0
+    mword success;
+    mword port;
+    mword magic_string;
+
+    MigrationAnswer() : success(0), port(0), magic_string(MAGIC_STRING_MIGANSWER) {}
+    MigrationAnswer(unsigned _port) : success(1), port(_port), magic_string(MAGIC_STRING_MIGANSWER) {}
+    bool magic_string_check() { return magic_string == MAGIC_STRING_MIGANSWER; }
+};
+
+/*
+ * This is an index structure telling us how many memory pages and device pages
+ * are saved to the hard disk, enabling us to calculate offsets later.
+ */
+struct RestoreIndex {
+    unsigned mem_pages;
+    unsigned dev_pages;
+    char space[0x1000 - 2*sizeof(unsigned)];
+};
+
+struct MigrationHeader { // header record carrying a magic marker, a version and the video mode
+#define MAGIC_STRING_HEADER 0xb0015366
+    mword magic_string;
+    mword version;
+    mword videomode;
+
+    MigrationHeader() : magic_string(MAGIC_STRING_HEADER), version(0), videomode(0) {} // no field is left indeterminate
+    MigrationHeader(mword _videomode)
+        : magic_string(MAGIC_STRING_HEADER), version(0), videomode(_videomode) {} // version starts at 0 instead of garbage
+    bool magic_string_check() { return magic_string == MAGIC_STRING_HEADER; }
+};
+
+struct AddressSpaceIndex {
+#define MAGIC_STRING_ADDR_SPACE 0xBADB0B
+    unsigned long magic_string;
+    unsigned long num_pages;
+
+    AddressSpaceIndex() {}
+    AddressSpaceIndex(unsigned long pages) : magic_string(MAGIC_STRING_ADDR_SPACE), num_pages(pages) {}
+    bool magic_string_check() { return magic_string == MAGIC_STRING_ADDR_SPACE; }
+};
+
+struct PageTransferIndex {
+#define MAGIC_STRING_PAGE_INDEX 0x51CD06
+    unsigned long magic_string;
+    unsigned long desc_num;
+    unsigned long total_bytes;
+
+    PageTransferIndex()
+        : magic_string(MAGIC_STRING_PAGE_INDEX) {}
+    PageTransferIndex(unsigned long descs, unsigned long bytes)
+        : magic_string(MAGIC_STRING_PAGE_INDEX), desc_num(descs), total_bytes(bytes) {}
+    bool magic_string_check() { return magic_string == MAGIC_STRING_PAGE_INDEX; }
+};
+
+static unsigned long checksum_pages(void *offset, unsigned long count) // sum of squared machine words over `count` 4 KiB pages at `offset`; 0 for a null offset
+{
+    if (offset == 0) return 0;
+    assert(! (reinterpret_cast<unsigned long>(offset) & 0xfff) ); // offset must be page aligned
+
+    unsigned long chksum = 0;
+    unsigned long *ptr = reinterpret_cast<unsigned long*>(offset);
+
+    for (unsigned long i=0; i < count * 0x1000 / sizeof(unsigned long); i++) // index is as wide as the bound, so it cannot wrap first
+        chksum += ptr[i] * ptr[i];
+
+    return chksum;
+}
+
+struct PageTransferDesc {
+#define MAGIC_STRING_PAGE_DESC 0xDEADC0DE
+    unsigned long magic_string;
+    unsigned long offset;
+    unsigned long count;
+    unsigned long checksum;
+
+    PageTransferDesc() {}
+    PageTransferDesc(unsigned long _offset, unsigned long _count)
+        : magic_string(MAGIC_STRING_PAGE_DESC), offset(_offset), count(_count),
+        checksum(checksum_pages(reinterpret_cast<void*>(_offset), _count)) { }
+    unsigned long recalculate_checksums()
+    { return (checksum = checksum_pages(reinterpret_cast<void*>(offset), count)); }
+    bool magic_string_check() { return magic_string == MAGIC_STRING_PAGE_DESC; }
+};
+
+#define MAGIC_STRING_PAGE_BORDER 0xC03DD00D
diff --git a/unix/SConstruct b/unix/SConstruct
index 3da784e8..78e24897 100644
--- a/unix/SConstruct
+++ b/unix/SConstruct
@@ -126,6 +126,7 @@ sources = Glob('*.cc') + [            # Unix frontend
       '../model/lapic.cc',
       '../model/msi.cc',
       '../host/hostkeyboard.cc',
+      '../host/migration.cc'
       ]
 # TODO not yet ported
 if target_arch == 'x86_32':

From b9a236d631ebd60af56ef70a67a96b32413d001c Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Wed, 31 Jul 2013 13:54:37 +0200
Subject: [PATCH 14/35] This commit implements most of the code needed to
 actually use the new live migration code.

---
 unix/main.cc | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/unix/main.cc b/unix/main.cc
index 0ca94fea..cffe5c84 100644
--- a/unix/main.cc
+++ b/unix/main.cc
@@ -50,6 +50,7 @@
 #include <vector>
 
 #include <seoul/unix.h>
+#include <nul/migration.h>
 
 const char version_str[] =
 #include "version.inc"
@@ -170,6 +171,11 @@ pthread_mutex_t irq_mtx;
 
 // Relevant to live migration
 
+Migration *_migrator;
+Migration::RestoreModes _restore_mode = Migration::MODE_OFF;
+unsigned _migration_ip;
+unsigned _migration_port;
+
 // the memory remapping procedure should only
 // remap memory in page size granularity, if set
 bool _track_page_usage = false;
@@ -239,8 +245,23 @@ static void *vcpu_thread_fn(void *arg)
 
   while (true) {
     pthread_mutex_lock(&irq_mtx);
+
+    if (_restore_mode == Migration::MODE_RECEIVE)
+        // This will block until everything is restored
+        _migrator->listen(_migration_port, &cpu_state);
+    else if (_restore_mode == Migration::MODE_SEND)
+        // This will block if the last memory resend round is reached
+        _migrator->save_guestregs(&cpu_state);
+
     handle_vcpu(false, CpuMessage::TYPE_SINGLE_STEP, vcpu, &cpu_state);
     // Logging::printf("eip %x\n", cpu_state.eip);
+
+    if (_restore_mode == Migration::MODE_RECEIVE) {
+        _restore_mode = Migration::MODE_OFF;
+        delete _migrator;
+        _migrator = NULL;
+        cpu_state.mtd = MTD_ALL;
+    }
     pthread_mutex_unlock(&irq_mtx);
   }
 
@@ -409,6 +430,12 @@ static bool receive(Device *, MessageHostOp &msg)
     }
     break;
 
+    case MessageHostOp::OP_MIGRATION_RETRIEVE_INIT: {
+        _migration_port = msg.value;
+        _restore_mode = Migration::MODE_RECEIVE;
+        _migrator = new Migration(&mb);
+    }
+    break;
 
     default:
       Logging::panic("%s - unimplemented operation %#x\n",
@@ -610,6 +637,32 @@ static bool receive(Device *, MessageDisk &msg)
   return true;
 }
 
+
+static void *migration_thread_fn(void *)
+{
+    _migrator = new Migration(&mb);
+    _migrator->send(_migration_ip, _migration_port);
+
+    delete _migrator;
+    _migrator = nullptr;
+
+    return nullptr;
+}
+
+static void start_migration_to(unsigned ip, unsigned port)
+{
+    _migration_ip = ip;
+    _migration_port = port;
+    _restore_mode = Migration::MODE_SEND;
+
+    pthread_t migthread;
+    if (0 != pthread_create(&migthread, NULL, migration_thread_fn, NULL)) {
+        perror("pthread_create");
+        return;
+    }
+    pthread_setname_np(migthread, "migration");
+}
+
 static void usage()
 {
   fprintf(stderr, "Usage: seoul [-m RAM] [-n tap-device] [kernel parameters] [module1 parameters] ...\n");
@@ -725,6 +778,15 @@ int main(int argc, char **argv)
   MessageLegacy msg2(MessageLegacy::RESET, 0);
   mb.bus_legacy.send_fifo(msg2);
 
+  if (_restore_mode != Migration::MODE_OFF) {
+      /*
+       * The following UNLOCK message helps the VCPU out of the lock
+       * it is blocked by and catches it into the recall handler.
+       */
+       MessageLegacy msg3(MessageLegacy::UNLOCK, 0);
+       mb.bus_legacy.send_fifo(msg3);
+  }
+
   pthread_t iothread;
   if (tap_fd) {
     Logging::printf("Starting background threads.\n");

From 67d90543f4c5be11d60e7e9a126302ff8d71d5a1 Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Fri, 18 Oct 2013 15:40:38 +0200
Subject: [PATCH 15/35] This commit introduces an example start event,
 triggered by a keyboard backspace key press.

Since this migrates to a hard-coded destination host, it could also be done more elegantly:
- By a VMCALL from the VM, carrying a magic number in the eax register and the destination host in the ebx register.
- By some VM manager application, triggering the migration event via some IPC event.
- By a fancy ncurses menu, prompting the user for the destination host IP.
---
 include/nul/message.h |  1 +
 unix/main.cc          | 56 +++++++++++++++++++++++--------------------
 unix/ncurses.cc       | 10 ++++++++
 3 files changed, 41 insertions(+), 26 deletions(-)

diff --git a/include/nul/message.h b/include/nul/message.h
index bcf27351..3e7660e9 100644
--- a/include/nul/message.h
+++ b/include/nul/message.h
@@ -455,6 +455,7 @@ struct MessageHostOp
       OP_NEXT_DIRTY_PAGE,
       OP_GET_CONFIG_STRING,
       OP_MIGRATION_RETRIEVE_INIT,
+      OP_MIGRATION_START,
     } type;
   union {
     unsigned long value;
diff --git a/unix/main.cc b/unix/main.cc
index cffe5c84..816d92be 100644
--- a/unix/main.cc
+++ b/unix/main.cc
@@ -276,6 +276,31 @@ struct  Vcpu_info {
 
 static std::vector<Vcpu_info> vcpu_info;
 
+static void *migration_thread_fn(void *)
+{
+    _migrator = new Migration(&mb);
+    _migrator->send(_migration_ip, _migration_port);
+
+    delete _migrator;
+    _migrator = nullptr;
+
+    return nullptr;
+}
+
+static void start_migration_to(unsigned ip, unsigned port)
+{
+    _migration_ip = ip;
+    _migration_port = port;
+    _restore_mode = Migration::MODE_SEND;
+
+    pthread_t migthread;
+    if (0 != pthread_create(&migthread, NULL, migration_thread_fn, NULL)) {
+        perror("pthread_create");
+        return;
+    }
+    pthread_setname_np(migthread, "migration");
+}
+
 static bool receive(Device *, MessageHostOp &msg)
 {
     bool res = true;
@@ -436,6 +461,11 @@ static bool receive(Device *, MessageHostOp &msg)
         _migrator = new Migration(&mb);
     }
     break;
+    case MessageHostOp::OP_MIGRATION_START: {
+        start_migration_to(msg.value, 9000);
+        return true;
+    }
+    break;
 
     default:
       Logging::panic("%s - unimplemented operation %#x\n",
@@ -637,32 +667,6 @@ static bool receive(Device *, MessageDisk &msg)
   return true;
 }
 
-
-static void *migration_thread_fn(void *)
-{
-    _migrator = new Migration(&mb);
-    _migrator->send(_migration_ip, _migration_port);
-
-    delete _migrator;
-    _migrator = nullptr;
-
-    return nullptr;
-}
-
-static void start_migration_to(unsigned ip, unsigned port)
-{
-    _migration_ip = ip;
-    _migration_port = port;
-    _restore_mode = Migration::MODE_SEND;
-
-    pthread_t migthread;
-    if (0 != pthread_create(&migthread, NULL, migration_thread_fn, NULL)) {
-        perror("pthread_create");
-        return;
-    }
-    pthread_setname_np(migthread, "migration");
-}
-
 static void usage()
 {
   fprintf(stderr, "Usage: seoul [-m RAM] [-n tap-device] [kernel parameters] [module1 parameters] ...\n");
diff --git a/unix/ncurses.cc b/unix/ncurses.cc
index 24e33105..9cbe37c9 100644
--- a/unix/ncurses.cc
+++ b/unix/ncurses.cc
@@ -4,6 +4,8 @@
  * Copyright (C) 2013, Julian Stecklina <jsteckli@os.inf.tu-dresden.de>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
  * This file is part of Seoul.
  *
  * Seoul is free software: you can redistribute it and/or modify it
@@ -147,6 +149,14 @@ class NcursesDisplay : public StaticReceiver<NcursesDisplay> {
           if (current_view < views.size() - 1)
             current_view ++;
         break;
+
+      case KEY_BACKSPACE: {
+        /* Migration example start event. As soon as the user hits this event,
+         * the VM will be migrated to the hard coded destination host. */
+        MessageHostOp msg(MessageHostOp::OP_MIGRATION_START,
+                /* destination ip, address: 192.168.0.1 */ 0xC0A80001ul);
+        mb.bus_hostop.send(msg);
+      }
       case ERR:
       default:
         break;

From 154a76d1f3688985291d89ef793167f47f50ad6a Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Mon, 21 Oct 2013 11:42:38 +0200
Subject: [PATCH 16/35] Add an ACPI controller model.

---
 executor/dsdt.asl       | 322 ++++++++++++++++++++++++++++++++++++++++
 executor/dsdt.h         | 179 ++++++++++++++++++++++
 executor/vbios_reset.cc |  38 +++++
 model/acpicontroller.cc | 322 ++++++++++++++++++++++++++++++++++++++++
 unix/SConstruct         |   1 +
 5 files changed, 862 insertions(+)
 create mode 100644 executor/dsdt.asl
 create mode 100644 executor/dsdt.h
 create mode 100644 model/acpicontroller.cc

diff --git a/executor/dsdt.asl b/executor/dsdt.asl
new file mode 100644
index 00000000..701c08ee
--- /dev/null
+++ b/executor/dsdt.asl
@@ -0,0 +1,322 @@
+// ASL Example
+DefinitionBlock (
+        "dsdt.aml", // Output Filename
+        "DSDT",     // Signature
+        0x00,       // DSDT Compliance Revision
+        "BAMM",     // OEMID
+        "JONGE",    // TABLE ID
+        0x1         // OEM Revision
+        )
+{
+    Scope(\_SB) {
+        Device(PCI0) {
+            // The following magic code stands for "PCI Host Bridge"
+            Name(_HID, EisaId("PNP0A03"))
+            Name(_ADR, 0)
+            Name(_UID, 0)
+
+            // Hot Plug Parameters. Optional.
+            // Linux will complain and use standard parameters,
+            // if not given.
+            Name(_HPP, Package(){
+                0x08,  // Cache line size in dwords
+                0x40,  // Latency timer in PCI clocks
+                0x01,  // Enable SERR line
+                0x00   // Enable PERR line
+            })
+
+            // PCI Routing Table
+            // When defining as much ACPI information as
+            // needed for hotplug, we also have to define
+            // stuff like the following.
+            // Otherwise, Linux would complain.
+            Name(_PRT, Package() {
+                Package() { 0x1ffff, 0, LNKA, 0 },
+                Package() { 0x1ffff, 1, LNKB, 0 },
+                Package() { 0x1ffff, 2, LNKC, 0 },
+                Package() { 0x1ffff, 3, LNKD, 0 },
+
+                Package() { 0x2ffff, 0, LNKA, 0 },
+                Package() { 0x2ffff, 1, LNKB, 0 },
+                Package() { 0x2ffff, 2, LNKC, 0 },
+                Package() { 0x2ffff, 3, LNKD, 0 },
+
+                Package() { 0x3ffff, 0, LNKA, 0 },
+                Package() { 0x3ffff, 1, LNKB, 0 },
+                Package() { 0x3ffff, 2, LNKC, 0 },
+                Package() { 0x3ffff, 3, LNKD, 0 },
+
+                Package() { 0x4ffff, 0, LNKA, 0 },
+                Package() { 0x4ffff, 1, LNKB, 0 },
+                Package() { 0x4ffff, 2, LNKC, 0 },
+                Package() { 0x4ffff, 3, LNKD, 0 },
+            })
+
+            // At boot, Linux will either scan the system for
+            // possible resources used by PCI cards or read
+            // ACPI tables to obtain this information.
+            // When providing as much ACPI data as needed
+            // for hotplugging, then this is not optional any longer.
+            // Linux would complain if all this was not provided here.
+            Name (_CRS, ResourceTemplate () {
+                // Bus enumeration from _MIN to _MAX
+                WordBusNumber (
+                    ResourceProducer,
+                    MinFixed,     // _MIF
+                    MaxFixed,     // _MAF
+                    ,
+                    0x00,         // _GRA
+                    0x00,         // _MIN
+                    0xFF,         // _MAX
+                    0x00,         // _TRA
+                    0x100)        // _LEN
+                // IO ports usable by PCI from _MIN to _MAX
+                WordIO (
+                    ResourceProducer,
+                    MinFixed,     // _MIF
+                    MaxFixed,     // _MAF
+                    PosDecode,
+                    EntireRange,
+                    0x0000,       // _GRA
+                    0x0000,       // _MIN
+                    0x7FFF,       // _MAX
+                    0x00,         // _TRA
+                    0x8000)       // _LEN
+                // System memory for mapping BAR areas from _MIN to _MAX
+                // BAR = Base Address Register, every PCI card will
+                // usually have 2 of those.
+                DWordMemory (
+                    ResourceProducer,
+                    PosDecode,
+                    MinFixed,     // _MIF
+                    MaxFixed,     // _MAF
+                    NonCacheable, // _MEM
+                    ReadWrite,    // _RW
+                    0x00000000,   // _GRA
+                    0xE0000000,   // _MIN
+                    0xE0FFFFFF,   // _MAX
+                    0x00,         // _TRA
+                    0x01000000)   // _LEN
+            })
+
+            // This introduces three named dword fields in IO space.
+            // The hotplug controller knows these IO ports.
+            // During hot plug/unplug, the guest and the host's hotplug
+            // controller will communicate over these.
+            OperationRegion(PCST, SystemIO, 0xae00, 12)
+            Field (PCST, DWordAcc, NoLock, WriteAsZeros)
+            {
+                PCIU, 32, // IO port 0xae00
+                PCID, 32, // IO port 0xae04
+                B0EJ, 32, // IO port 0xae08
+            }
+
+            // Status method. Statically returns "Everything is up and working"
+            // because the PCI root bus will always be there.
+            Method (_STA, 0) { Return (0xf) }
+        }
+
+        // All this interrupt routing information is necessary.
+        // This defines the interrupts A, B, C, D, considered legacy
+        // nowadays.
+        // Hotplugging etc. will work without this anyway if the PCI device uses
+        // MSI for interrupting, but the kernel would complain with
+        // ugly error messages.
+        // These device definitions are kept as minimal as possible.
+        Device(LNKA){
+                Name(_HID, EISAID("PNP0C0F")) // PCI interrupt link
+                Name(_UID, 1)
+                Method (_STA, 0, NotSerialized)
+                {
+                    Return (0x0B)
+                }
+                Method (_CRS, 0, NotSerialized)
+                {
+                    Name (BUFF, ResourceTemplate ()
+                    {
+                        IRQ (Level, ActiveLow, Shared) {5}
+                    })
+                    Return (BUFF)
+                }
+                Method (_PRS, 0, NotSerialized)
+			    {
+				    Name (BUFF, ResourceTemplate ()
+                    {
+					IRQ (Level, ActiveLow, Shared) {5,9,10}
+                    })
+                    Return (BUFF)
+                }
+                Method (_SRS, 1, NotSerialized) {}
+                Method (_DIS, 0, NotSerialized) {}
+        }
+        Device(LNKB){
+                Name(_HID, EISAID("PNP0C0F")) // PCI interrupt link
+                Name(_UID, 2)
+                Method (_STA, 0, NotSerialized)
+                {
+                    Return (0x0B)
+                }
+                Method (_CRS, 0, NotSerialized)
+                {
+                    Name (BUFF, ResourceTemplate ()
+                    {
+                        IRQ (Level, ActiveLow, Shared) {10}
+                    })
+                    Return (BUFF)
+                }
+                Method (_PRS, 0, NotSerialized)
+			    {
+				    Name (BUFF, ResourceTemplate ()
+                    {
+					IRQ (Level, ActiveLow, Shared) {5,9,10}
+                    })
+                    Return (BUFF)
+                }
+                Method (_SRS, 1, NotSerialized) {}
+                Method (_DIS, 0, NotSerialized) {}
+        }
+        Device(LNKC){
+                Name(_HID, EISAID("PNP0C0F")) // PCI interrupt link
+                Name(_UID, 3)
+                Method (_STA, 0, NotSerialized)
+                {
+                    Return (0x0B)
+                }
+                Method (_CRS, 0, NotSerialized)
+                {
+                    Name (BUFF, ResourceTemplate ()
+                    {
+                        IRQ (Level, ActiveLow, Shared) {9}
+                    })
+                    Return (BUFF)
+                }
+                Method (_PRS, 0, NotSerialized)
+			    {
+				    Name (BUFF, ResourceTemplate ()
+                    {
+					IRQ (Level, ActiveLow, Shared) {5,9,10}
+                    })
+                    Return (BUFF)
+                }
+                Method (_SRS, 1, NotSerialized) {}
+                Method (_DIS, 0, NotSerialized) {}
+        }
+        Device(LNKD){
+                Name(_HID, EISAID("PNP0C0F")) // PCI interrupt link
+                Name(_UID, 4)
+                Method (_STA, 0, NotSerialized)
+                {
+                    Return (0x0B)
+                }
+                Method (_CRS, 0, NotSerialized)
+                {
+                    Name (BUFF, ResourceTemplate ()
+                    {
+                        IRQ (Level, ActiveLow, Shared) {5}
+                    })
+                    Return (BUFF)
+                }
+                Method (_PRS, 0, NotSerialized)
+			    {
+				    Name (BUFF, ResourceTemplate ()
+                    {
+					IRQ (Level, ActiveLow, Shared) {5,9,10}
+                    })
+                    Return (BUFF)
+                }
+                Method (_SRS, 1, NotSerialized) {}
+                Method (_DIS, 0, NotSerialized) {}
+        }
+
+    }
+
+    Scope(\_SB.PCI0) {
+        // These are PCI slot definitions.
+        // They are necessary because every PCI card
+        // which shall be ejectable, needs an _EJ0 method.
+        Device (S01) {
+           Name (_ADR, 0x10000)
+           Name (_SUN, 0x01) // SUN: Slot User Number
+
+           // This method is called by the operating system
+           // after unloading the device driver etc.
+           // _EJ0 = eject callback
+           Method (_EJ0, 1) { PCEJ(0x01) }
+        }
+
+        Device (S02) {
+           Name (_ADR, 0x20000)
+           Name (_SUN, 0x02)
+           Method (_EJ0, 1) { PCEJ(0x02) }
+        }
+
+        Device (S03) {
+           Name (_ADR, 0x30000)
+           Name (_SUN, 0x03)
+           Method (_EJ0, 1) { PCEJ(0x03) }
+        }
+
+        Device (S04) {
+           Name (_ADR, 0x40000)
+           Name (_SUN, 0x04)
+           Method (_EJ0, 1) { PCEJ(0x04) }
+        }
+
+        // Called by some PCI card's _EJ0 method,
+        // This tells the hypervisor to turn off the
+        // PCI device by writing (1 << PCI_ID) to the
+        // IO port associated with the B0EJ symbol.
+        Method (PCEJ, 1, NotSerialized) {
+            Store(ShiftLeft(1, Arg0), B0EJ)
+            Return (0x0)
+        }
+
+        // PCNT = PCi NoTify
+        // PCNT(<device>, <1 = check for inserted device / 3 = eject requested>)
+        // The values 1 and 3 are defined in the ACPI spec
+        Method(PCNT, 2) {
+            If (LEqual(Arg0, 0x01)) { Notify(S01, Arg1) }
+            If (LEqual(Arg0, 0x02)) { Notify(S02, Arg1) }
+            If (LEqual(Arg0, 0x03)) { Notify(S03, Arg1) }
+            If (LEqual(Arg0, 0x04)) { Notify(S04, Arg1) }
+        }
+
+        /* PCI hotplug notify method */
+        Method(PCNF, 0) {
+            // Local0 = iterator
+            Store (Zero, Local0)
+
+            // These two fields contain bits mapped
+            // to PCI devices, like in the GPE bitmap.
+
+            // bit (1 << N) set here --> Device N was inserted
+            Store (PCIU, Local1)
+            // bit (1 << N) set here --> Device N has to be removed
+            Store (PCID, Local2)
+
+            While (LLess(Local0, 4)) {
+                Increment(Local0)
+                If (And(Local1, ShiftLeft(1, Local0))) {
+                    PCNT(Local0, 1) // 1 => DEVICE CHECK
+                }
+                If (And(Local2, ShiftLeft(1, Local0))) {
+                    PCNT(Local0, 3) // 3 => EJECT REQUEST
+                }
+            }
+            Return(One)
+        }
+    }
+
+    Scope (\_GPE)
+    {
+        Name(_HID, "ACPI0006")
+
+        // These methods are wired to the according bits in the GPE bitmap.
+        // The hypervisor will raise bits and then send an interrupt 9.
+        // The ACPI code in the guest kernel will then dispatch one of these methods.
+        Method(_E01) {
+            \_SB.PCI0.PCNF() // PCI hotplug event
+        }
+    }
+
+} // end of definition block
diff --git a/executor/dsdt.h b/executor/dsdt.h
new file mode 100644
index 00000000..f8ee3611
--- /dev/null
+++ b/executor/dsdt.h
@@ -0,0 +1,179 @@
+/*
+ * To generate this file, download the iASL compiler from
+ * https://acpica.org/downloads (or install the "iasl" package,
+ * if available for your distro) and then run:
+ * iasl -tc dsdt.asl && mv dsdt.hex dsdt.h
+ */
+
+/*
+ *
+ * Intel ACPI Component Architecture
+ * ASL Optimizing Compiler version 20130418-64 [May  8 2013]
+ * Copyright (c) 2000 - 2013 Intel Corporation
+ *
+ * Compilation of "dsdt.asl" - Thu Jun 20 15:28:32 2013
+ *
+ * C source code output
+ * AML code block contains 0x4E8 bytes
+ *
+ */
+unsigned char AmlCode[] =
+{
+    0x44,0x53,0x44,0x54,0xE8,0x04,0x00,0x00,  /* 00000000    "DSDT...." */
+    0x00,0x31,0x42,0x41,0x4D,0x4D,0x00,0x00,  /* 00000008    ".1BAMM.." */
+    0x4A,0x4F,0x4E,0x47,0x45,0x00,0x00,0x00,  /* 00000010    "JONGE..." */
+    0x01,0x00,0x00,0x00,0x49,0x4E,0x54,0x4C,  /* 00000018    "....INTL" */
+    0x18,0x04,0x13,0x20,0x10,0x40,0x33,0x5F,  /* 00000020    "... .@3_" */
+    0x53,0x42,0x5F,0x5B,0x82,0x4D,0x18,0x50,  /* 00000028    "SB_[.M.P" */
+    0x43,0x49,0x30,0x08,0x5F,0x48,0x49,0x44,  /* 00000030    "CI0._HID" */
+    0x0C,0x41,0xD0,0x0A,0x03,0x08,0x5F,0x41,  /* 00000038    ".A...._A" */
+    0x44,0x52,0x00,0x08,0x5F,0x55,0x49,0x44,  /* 00000040    "DR.._UID" */
+    0x00,0x08,0x5F,0x48,0x50,0x50,0x12,0x08,  /* 00000048    ".._HPP.." */
+    0x04,0x0A,0x08,0x0A,0x40,0x01,0x00,0x08,  /* 00000050    "....@..." */
+    0x5F,0x50,0x52,0x54,0x12,0x4B,0x0E,0x10,  /* 00000058    "_PRT.K.." */
+    0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x01,0x00,  /* 00000060    "........" */
+    0x00,0x4C,0x4E,0x4B,0x41,0x00,0x12,0x0D,  /* 00000068    ".LNKA..." */
+    0x04,0x0C,0xFF,0xFF,0x01,0x00,0x01,0x4C,  /* 00000070    ".......L" */
+    0x4E,0x4B,0x42,0x00,0x12,0x0E,0x04,0x0C,  /* 00000078    "NKB....." */
+    0xFF,0xFF,0x01,0x00,0x0A,0x02,0x4C,0x4E,  /* 00000080    "......LN" */
+    0x4B,0x43,0x00,0x12,0x0E,0x04,0x0C,0xFF,  /* 00000088    "KC......" */
+    0xFF,0x01,0x00,0x0A,0x03,0x4C,0x4E,0x4B,  /* 00000090    ".....LNK" */
+    0x44,0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,  /* 00000098    "D......." */
+    0x02,0x00,0x00,0x4C,0x4E,0x4B,0x41,0x00,  /* 000000A0    "...LNKA." */
+    0x12,0x0D,0x04,0x0C,0xFF,0xFF,0x02,0x00,  /* 000000A8    "........" */
+    0x01,0x4C,0x4E,0x4B,0x42,0x00,0x12,0x0E,  /* 000000B0    ".LNKB..." */
+    0x04,0x0C,0xFF,0xFF,0x02,0x00,0x0A,0x02,  /* 000000B8    "........" */
+    0x4C,0x4E,0x4B,0x43,0x00,0x12,0x0E,0x04,  /* 000000C0    "LNKC...." */
+    0x0C,0xFF,0xFF,0x02,0x00,0x0A,0x03,0x4C,  /* 000000C8    ".......L" */
+    0x4E,0x4B,0x44,0x00,0x12,0x0D,0x04,0x0C,  /* 000000D0    "NKD....." */
+    0xFF,0xFF,0x03,0x00,0x00,0x4C,0x4E,0x4B,  /* 000000D8    ".....LNK" */
+    0x41,0x00,0x12,0x0D,0x04,0x0C,0xFF,0xFF,  /* 000000E0    "A......." */
+    0x03,0x00,0x01,0x4C,0x4E,0x4B,0x42,0x00,  /* 000000E8    "...LNKB." */
+    0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x03,0x00,  /* 000000F0    "........" */
+    0x0A,0x02,0x4C,0x4E,0x4B,0x43,0x00,0x12,  /* 000000F8    "..LNKC.." */
+    0x0E,0x04,0x0C,0xFF,0xFF,0x03,0x00,0x0A,  /* 00000100    "........" */
+    0x03,0x4C,0x4E,0x4B,0x44,0x00,0x12,0x0D,  /* 00000108    ".LNKD..." */
+    0x04,0x0C,0xFF,0xFF,0x04,0x00,0x00,0x4C,  /* 00000110    ".......L" */
+    0x4E,0x4B,0x41,0x00,0x12,0x0D,0x04,0x0C,  /* 00000118    "NKA....." */
+    0xFF,0xFF,0x04,0x00,0x01,0x4C,0x4E,0x4B,  /* 00000120    ".....LNK" */
+    0x42,0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,  /* 00000128    "B......." */
+    0x04,0x00,0x0A,0x02,0x4C,0x4E,0x4B,0x43,  /* 00000130    "....LNKC" */
+    0x00,0x12,0x0E,0x04,0x0C,0xFF,0xFF,0x04,  /* 00000138    "........" */
+    0x00,0x0A,0x03,0x4C,0x4E,0x4B,0x44,0x00,  /* 00000140    "...LNKD." */
+    0x08,0x5F,0x43,0x52,0x53,0x11,0x3F,0x0A,  /* 00000148    "._CRS.?." */
+    0x3C,0x88,0x0D,0x00,0x02,0x0C,0x00,0x00,  /* 00000150    "<......." */
+    0x00,0x00,0x00,0xFF,0x00,0x00,0x00,0x00,  /* 00000158    "........" */
+    0x01,0x88,0x0D,0x00,0x01,0x0C,0x03,0x00,  /* 00000160    "........" */
+    0x00,0x00,0x00,0xFF,0x7F,0x00,0x00,0x00,  /* 00000168    "........" */
+    0x80,0x87,0x17,0x00,0x00,0x0C,0x01,0x00,  /* 00000170    "........" */
+    0x00,0x00,0x00,0x00,0x00,0x00,0xE0,0xFF,  /* 00000178    "........" */
+    0xFF,0xFF,0xE0,0x00,0x00,0x00,0x00,0x00,  /* 00000180    "........" */
+    0x00,0x00,0x01,0x79,0x00,0x5B,0x80,0x50,  /* 00000188    "...y.[.P" */
+    0x43,0x53,0x54,0x01,0x0B,0x00,0xAE,0x0A,  /* 00000190    "CST....." */
+    0x0C,0x5B,0x81,0x15,0x50,0x43,0x53,0x54,  /* 00000198    ".[..PCST" */
+    0x43,0x50,0x43,0x49,0x55,0x20,0x50,0x43,  /* 000001A0    "CPCIU PC" */
+    0x49,0x44,0x20,0x42,0x30,0x45,0x4A,0x20,  /* 000001A8    "ID B0EJ " */
+    0x14,0x09,0x5F,0x53,0x54,0x41,0x00,0xA4,  /* 000001B0    ".._STA.." */
+    0x0A,0x0F,0x5B,0x82,0x44,0x06,0x4C,0x4E,  /* 000001B8    "..[.D.LN" */
+    0x4B,0x41,0x08,0x5F,0x48,0x49,0x44,0x0C,  /* 000001C0    "KA._HID." */
+    0x41,0xD0,0x0C,0x0F,0x08,0x5F,0x55,0x49,  /* 000001C8    "A...._UI" */
+    0x44,0x01,0x14,0x09,0x5F,0x53,0x54,0x41,  /* 000001D0    "D..._STA" */
+    0x00,0xA4,0x0A,0x0B,0x14,0x1A,0x5F,0x43,  /* 000001D8    "......_C" */
+    0x52,0x53,0x00,0x08,0x42,0x55,0x46,0x46,  /* 000001E0    "RS..BUFF" */
+    0x11,0x09,0x0A,0x06,0x23,0x20,0x00,0x18,  /* 000001E8    "....# .." */
+    0x79,0x00,0xA4,0x42,0x55,0x46,0x46,0x14,  /* 000001F0    "y..BUFF." */
+    0x1A,0x5F,0x50,0x52,0x53,0x00,0x08,0x42,  /* 000001F8    "._PRS..B" */
+    0x55,0x46,0x46,0x11,0x09,0x0A,0x06,0x23,  /* 00000200    "UFF....#" */
+    0x20,0x06,0x18,0x79,0x00,0xA4,0x42,0x55,  /* 00000208    " ..y..BU" */
+    0x46,0x46,0x14,0x06,0x5F,0x53,0x52,0x53,  /* 00000210    "FF.._SRS" */
+    0x01,0x14,0x06,0x5F,0x44,0x49,0x53,0x00,  /* 00000218    "..._DIS." */
+    0x5B,0x82,0x45,0x06,0x4C,0x4E,0x4B,0x42,  /* 00000220    "[.E.LNKB" */
+    0x08,0x5F,0x48,0x49,0x44,0x0C,0x41,0xD0,  /* 00000228    "._HID.A." */
+    0x0C,0x0F,0x08,0x5F,0x55,0x49,0x44,0x0A,  /* 00000230    "..._UID." */
+    0x02,0x14,0x09,0x5F,0x53,0x54,0x41,0x00,  /* 00000238    "..._STA." */
+    0xA4,0x0A,0x0B,0x14,0x1A,0x5F,0x43,0x52,  /* 00000240    "....._CR" */
+    0x53,0x00,0x08,0x42,0x55,0x46,0x46,0x11,  /* 00000248    "S..BUFF." */
+    0x09,0x0A,0x06,0x23,0x00,0x04,0x18,0x79,  /* 00000250    "...#...y" */
+    0x00,0xA4,0x42,0x55,0x46,0x46,0x14,0x1A,  /* 00000258    "..BUFF.." */
+    0x5F,0x50,0x52,0x53,0x00,0x08,0x42,0x55,  /* 00000260    "_PRS..BU" */
+    0x46,0x46,0x11,0x09,0x0A,0x06,0x23,0x20,  /* 00000268    "FF....# " */
+    0x06,0x18,0x79,0x00,0xA4,0x42,0x55,0x46,  /* 00000270    "..y..BUF" */
+    0x46,0x14,0x06,0x5F,0x53,0x52,0x53,0x01,  /* 00000278    "F.._SRS." */
+    0x14,0x06,0x5F,0x44,0x49,0x53,0x00,0x5B,  /* 00000280    ".._DIS.[" */
+    0x82,0x45,0x06,0x4C,0x4E,0x4B,0x43,0x08,  /* 00000288    ".E.LNKC." */
+    0x5F,0x48,0x49,0x44,0x0C,0x41,0xD0,0x0C,  /* 00000290    "_HID.A.." */
+    0x0F,0x08,0x5F,0x55,0x49,0x44,0x0A,0x03,  /* 00000298    ".._UID.." */
+    0x14,0x09,0x5F,0x53,0x54,0x41,0x00,0xA4,  /* 000002A0    ".._STA.." */
+    0x0A,0x0B,0x14,0x1A,0x5F,0x43,0x52,0x53,  /* 000002A8    "...._CRS" */
+    0x00,0x08,0x42,0x55,0x46,0x46,0x11,0x09,  /* 000002B0    "..BUFF.." */
+    0x0A,0x06,0x23,0x00,0x02,0x18,0x79,0x00,  /* 000002B8    "..#...y." */
+    0xA4,0x42,0x55,0x46,0x46,0x14,0x1A,0x5F,  /* 000002C0    ".BUFF.._" */
+    0x50,0x52,0x53,0x00,0x08,0x42,0x55,0x46,  /* 000002C8    "PRS..BUF" */
+    0x46,0x11,0x09,0x0A,0x06,0x23,0x20,0x06,  /* 000002D0    "F....# ." */
+    0x18,0x79,0x00,0xA4,0x42,0x55,0x46,0x46,  /* 000002D8    ".y..BUFF" */
+    0x14,0x06,0x5F,0x53,0x52,0x53,0x01,0x14,  /* 000002E0    ".._SRS.." */
+    0x06,0x5F,0x44,0x49,0x53,0x00,0x5B,0x82,  /* 000002E8    "._DIS.[." */
+    0x45,0x06,0x4C,0x4E,0x4B,0x44,0x08,0x5F,  /* 000002F0    "E.LNKD._" */
+    0x48,0x49,0x44,0x0C,0x41,0xD0,0x0C,0x0F,  /* 000002F8    "HID.A..." */
+    0x08,0x5F,0x55,0x49,0x44,0x0A,0x04,0x14,  /* 00000300    "._UID..." */
+    0x09,0x5F,0x53,0x54,0x41,0x00,0xA4,0x0A,  /* 00000308    "._STA..." */
+    0x0B,0x14,0x1A,0x5F,0x43,0x52,0x53,0x00,  /* 00000310    "..._CRS." */
+    0x08,0x42,0x55,0x46,0x46,0x11,0x09,0x0A,  /* 00000318    ".BUFF..." */
+    0x06,0x23,0x20,0x00,0x18,0x79,0x00,0xA4,  /* 00000320    ".# ..y.." */
+    0x42,0x55,0x46,0x46,0x14,0x1A,0x5F,0x50,  /* 00000328    "BUFF.._P" */
+    0x52,0x53,0x00,0x08,0x42,0x55,0x46,0x46,  /* 00000330    "RS..BUFF" */
+    0x11,0x09,0x0A,0x06,0x23,0x20,0x06,0x18,  /* 00000338    "....# .." */
+    0x79,0x00,0xA4,0x42,0x55,0x46,0x46,0x14,  /* 00000340    "y..BUFF." */
+    0x06,0x5F,0x53,0x52,0x53,0x01,0x14,0x06,  /* 00000348    "._SRS..." */
+    0x5F,0x44,0x49,0x53,0x00,0x10,0x44,0x12,  /* 00000350    "_DIS..D." */
+    0x2E,0x5F,0x53,0x42,0x5F,0x50,0x43,0x49,  /* 00000358    "._SB_PCI" */
+    0x30,0x5B,0x82,0x21,0x53,0x30,0x31,0x5F,  /* 00000360    "0[.!S01_" */
+    0x08,0x5F,0x41,0x44,0x52,0x0C,0x00,0x00,  /* 00000368    "._ADR..." */
+    0x01,0x00,0x08,0x5F,0x53,0x55,0x4E,0x01,  /* 00000370    "..._SUN." */
+    0x14,0x0B,0x5F,0x45,0x4A,0x30,0x01,0x50,  /* 00000378    ".._EJ0.P" */
+    0x43,0x45,0x4A,0x01,0x5B,0x82,0x23,0x53,  /* 00000380    "CEJ.[.#S" */
+    0x30,0x32,0x5F,0x08,0x5F,0x41,0x44,0x52,  /* 00000388    "02_._ADR" */
+    0x0C,0x00,0x00,0x02,0x00,0x08,0x5F,0x53,  /* 00000390    "......_S" */
+    0x55,0x4E,0x0A,0x02,0x14,0x0C,0x5F,0x45,  /* 00000398    "UN...._E" */
+    0x4A,0x30,0x01,0x50,0x43,0x45,0x4A,0x0A,  /* 000003A0    "J0.PCEJ." */
+    0x02,0x5B,0x82,0x23,0x53,0x30,0x33,0x5F,  /* 000003A8    ".[.#S03_" */
+    0x08,0x5F,0x41,0x44,0x52,0x0C,0x00,0x00,  /* 000003B0    "._ADR..." */
+    0x03,0x00,0x08,0x5F,0x53,0x55,0x4E,0x0A,  /* 000003B8    "..._SUN." */
+    0x03,0x14,0x0C,0x5F,0x45,0x4A,0x30,0x01,  /* 000003C0    "..._EJ0." */
+    0x50,0x43,0x45,0x4A,0x0A,0x03,0x5B,0x82,  /* 000003C8    "PCEJ..[." */
+    0x23,0x53,0x30,0x34,0x5F,0x08,0x5F,0x41,  /* 000003D0    "#S04_._A" */
+    0x44,0x52,0x0C,0x00,0x00,0x04,0x00,0x08,  /* 000003D8    "DR......" */
+    0x5F,0x53,0x55,0x4E,0x0A,0x04,0x14,0x0C,  /* 000003E0    "_SUN...." */
+    0x5F,0x45,0x4A,0x30,0x01,0x50,0x43,0x45,  /* 000003E8    "_EJ0.PCE" */
+    0x4A,0x0A,0x04,0x14,0x11,0x50,0x43,0x45,  /* 000003F0    "J....PCE" */
+    0x4A,0x01,0x70,0x79,0x01,0x68,0x00,0x42,  /* 000003F8    "J.py.h.B" */
+    0x30,0x45,0x4A,0xA4,0x00,0x14,0x35,0x50,  /* 00000400    "0EJ...5P" */
+    0x43,0x4E,0x54,0x02,0xA0,0x0A,0x93,0x68,  /* 00000408    "CNT....h" */
+    0x01,0x86,0x53,0x30,0x31,0x5F,0x69,0xA0,  /* 00000410    "..S01_i." */
+    0x0B,0x93,0x68,0x0A,0x02,0x86,0x53,0x30,  /* 00000418    "..h...S0" */
+    0x32,0x5F,0x69,0xA0,0x0B,0x93,0x68,0x0A,  /* 00000420    "2_i...h." */
+    0x03,0x86,0x53,0x30,0x33,0x5F,0x69,0xA0,  /* 00000428    "..S03_i." */
+    0x0B,0x93,0x68,0x0A,0x04,0x86,0x53,0x30,  /* 00000430    "..h...S0" */
+    0x34,0x5F,0x69,0x14,0x3E,0x50,0x43,0x4E,  /* 00000438    "4_i.>PCN" */
+    0x46,0x00,0x70,0x00,0x60,0x70,0x50,0x43,  /* 00000440    "F.p.`pPC" */
+    0x49,0x55,0x61,0x70,0x50,0x43,0x49,0x44,  /* 00000448    "IUapPCID" */
+    0x62,0xA2,0x26,0x95,0x60,0x0A,0x04,0x75,  /* 00000450    "b.&.`..u" */
+    0x60,0xA0,0x0E,0x7B,0x61,0x79,0x01,0x60,  /* 00000458    "`..{ay.`" */
+    0x00,0x00,0x50,0x43,0x4E,0x54,0x60,0x01,  /* 00000460    "..PCNT`." */
+    0xA0,0x0F,0x7B,0x62,0x79,0x01,0x60,0x00,  /* 00000468    "..{by.`." */
+    0x00,0x50,0x43,0x4E,0x54,0x60,0x0A,0x03,  /* 00000470    ".PCNT`.." */
+    0xA4,0x01,0x10,0x4D,0x06,0x5F,0x47,0x50,  /* 00000478    "...M._GP" */
+    0x45,0x08,0x5F,0x48,0x49,0x44,0x0D,0x41,  /* 00000480    "E._HID.A" */
+    0x43,0x50,0x49,0x30,0x30,0x30,0x36,0x00,  /* 00000488    "CPI0006." */
+    0x14,0x15,0x5F,0x45,0x30,0x31,0x00,0x5C,  /* 00000490    ".._E01.\" */
+    0x2F,0x03,0x5F,0x53,0x42,0x5F,0x50,0x43,  /* 00000498    "/._SB_PC" */
+    0x49,0x30,0x50,0x43,0x4E,0x46,0x14,0x15,  /* 000004A0    "I0PCNF.." */
+    0x5F,0x45,0x30,0x32,0x00,0x5C,0x2F,0x03,  /* 000004A8    "_E02.\/." */
+    0x5F,0x53,0x42,0x5F,0x50,0x43,0x49,0x30,  /* 000004B0    "_SB_PCI0" */
+    0x50,0x43,0x4E,0x46,0x14,0x15,0x5F,0x45,  /* 000004B8    "PCNF.._E" */
+    0x30,0x33,0x00,0x5C,0x2F,0x03,0x5F,0x53,  /* 000004C0    "03.\/._S" */
+    0x42,0x5F,0x50,0x43,0x49,0x30,0x50,0x43,  /* 000004C8    "B_PCI0PC" */
+    0x4E,0x46,0x14,0x15,0x5F,0x45,0x30,0x34,  /* 000004D0    "NF.._E04" */
+    0x00,0x5C,0x2F,0x03,0x5F,0x53,0x42,0x5F,  /* 000004D8    ".\/._SB_" */
+    0x50,0x43,0x49,0x30,0x50,0x43,0x4E,0x46   /* 000004E0    "PCI0PCNF" */
+};
diff --git a/executor/vbios_reset.cc b/executor/vbios_reset.cc
index 0a875b10..e8e778d4 100644
--- a/executor/vbios_reset.cc
+++ b/executor/vbios_reset.cc
@@ -4,6 +4,8 @@
  * Copyright (C) 2009-2010, Bernhard Kauer <bk@vmmon.org>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -19,6 +21,11 @@
 #include "nul/motherboard.h"
 #include "executor/bios.h"
 
+/* This file contains the AML bytecode of the DSDT in form
+ * of a byte array, which is available under the symbol name
+ * "AmlCode" */
+#include "dsdt.h"
+
 bool use_x2apic_mode;
 PARAM_HANDLER(x2apic_mode,
 	      "x2apic_mode - enable x2apic mode in the LAPICs")
@@ -144,6 +151,15 @@ class VirtualBiosReset : public StaticReceiver<VirtualBiosReset>, public BiosCom
     // the ACPI IRQ is 9
     discovery_write_dw("FACP",  46,          9, 2);
 
+    /* Initialize DSDT table.
+     * Its content is defined as AML bytecode in dsdt.h */
+    discovery_write_st("DSDT", 0, "DSDT", 4);
+
+    /* Initialize FACS table.
+     * The table is left empty. Linux demands its existence
+     * before switching to ACPI mode. */
+    discovery_write_st("FACS", 0, "FACS", 4);
+
     // store what remains on memory in KB
     discovery_write_dw("bda", 0x13, _mem_size >> 10, 2);
     return jmp_int(msg, 0x19);
@@ -220,6 +236,28 @@ class VirtualBiosReset : public StaticReceiver<VirtualBiosReset>, public BiosCom
       discovery_write_dw(name, 15, 0, 1);
       fix_acpi_checksum(_resources + index, 20, 8);
     }
+    else if (!strcmp("DSDT", name)) {
+        unsigned table;
+        check1(false, !(table = alloc(sizeof(AmlCode), 0x10)),
+                "allocate ACPI table failed");
+        _resources[index] = Resource(name, table, sizeof(AmlCode), true);
+
+        // FADT contains a pointer to the DSDT
+        discovery_write_dw("FACP", 40, table, 4);
+
+        /* The DSDT is completely defined as AML bytecode in dsdt.h
+         * which was compiled from ASL by the Intel ASL compiler */
+        memcpy(_mem_ptr + table, AmlCode, sizeof(AmlCode));
+    }
+    else if (!strcmp("FACS", name)) {
+        unsigned table;
+        check1(false, !(table = alloc(36, 64)), "allocate ACPI table failed");
+        _resources[index] = Resource(name, table, 36, true);
+        init_acpi_table(name);
+
+        // FADT contains a pointer to the FACS
+        discovery_write_dw("FACP", 36, table, 4);
+    }
     else {
       // we create an ACPI table
       size_t table;
diff --git a/model/acpicontroller.cc b/model/acpicontroller.cc
new file mode 100644
index 00000000..49441d9e
--- /dev/null
+++ b/model/acpicontroller.cc
@@ -0,0 +1,322 @@
+/**
+ * ACPI controller model
+ *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
+ * This file is part of Seoul.
+ *
+ * Seoul is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Seoul is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details.
+ */
+
+
+#include <nul/timer.h>
+#include <service/time.h>
+
+#include "nul/motherboard.h"
+#include "executor/bios.h"
+
+#define CMD_ACPI_ENABLE 0xab
+#define CMD_ACPI_DISABLE 0xba
+
+#define PORT_SMI_CMD        0xaeae
+
+/* The PM1 event block is PM1_EVT_LEN bytes long and split into two halves:
+ * the status register comes first, the enable register follows in the
+ * second half (offset PM1_EVT_LEN / 2). See ACPI spec 4.7.3.1.
+ */
+#define PM1_EVT_LEN         4
+#define PORT_PM1A_EVENT_BLK     0xaea6
+#define PORT_PM1B_EVENT_BLK     0xaeaa
+#define PORT_PM1A_EVENT_STATUS  (PORT_PM1A_EVENT_BLK)
+#define PORT_PM1A_EVENT_ENABLE  (PORT_PM1A_EVENT_BLK + (PM1_EVT_LEN) / 2) // 0xaea6 + 4/2 = 0xaea8
+#define PORT_PM1B_EVENT_STATUS  (PORT_PM1B_EVENT_BLK)
+#define PORT_PM1B_EVENT_ENABLE  (PORT_PM1B_EVENT_BLK + (PM1_EVT_LEN) / 2) // 0xaeaa + 4/2 = 0xaeac
+
+#define PM1_CNT_LEN         2
+#define PORT_PM1A_CONTROL   0xaeb0
+#define PORT_PM1B_CONTROL   0xaeb2
+
+#define PORT_GPE0_STATUS    0xaeb4
+#define PORT_GPE1_STATUS    0xaeb5
+#define PORT_GPE0_ENABLE    (PORT_GPE0_STATUS + 2)
+#define PORT_GPE1_ENABLE    (PORT_GPE1_STATUS + 2)
+
+#define PORT_PCIU   0xae00
+#define PORT_PCID   0xae04
+#define PORT_B0EJ   0xae08
+
+
+class AcpiController : public StaticReceiver<AcpiController>, public BiosCommon
+{
+    private:
+        unsigned short _pm1a_status;
+        unsigned short _pm1a_enable;
+        unsigned short _pm1a_control;
+
+        unsigned short _pm1b_status;
+        unsigned short _pm1b_enable;
+        unsigned short _pm1b_control;
+
+        unsigned char _gpe0_sts;
+        unsigned char _gpe0_en;
+        unsigned char _gpe1_sts;
+        unsigned char _gpe1_en;
+
+        unsigned _b0ej; // write-only register
+        unsigned _pciu; // read-only, REFRESH register (card plugged in)
+        unsigned _pcid; // read-only, DETACH register (card to be unplugged)
+
+        bool _processed;
+
+        StopWatch _watch;
+
+    public:
+        /* Latch GPE <event_nr> (0..15) and raise the SCI if an enabled event is pending. */
+        void trigger_gpe(unsigned event_nr)
+        {
+            // Activate this event in the appropriate register (bits 0-7: GPE0, 8-15: GPE1)
+            _gpe0_sts |=  0x00ff & (1U << event_nr);
+            _gpe1_sts |= (0xff00 & (1U << event_nr)) >> 8;
+
+            // If no pending event is enabled by the guest, then just ignore it
+            if (!(_gpe0_sts & _gpe0_en) && !(_gpe1_sts & _gpe1_en))
+                return;
+
+            // Send the guest an SCI
+            MessageIrqLines msg(MessageIrq::ASSERT_IRQ, 9);
+            _mb.bus_irqlines.send(msg);
+        }
+
+        bool  receive(MessageDiscovery &msg) {
+            if (msg.type != MessageDiscovery::DISCOVERY) return false;
+
+            /* The following FADT entries will tell the guest kernel
+             * how to interact with the system when receiving
+             * System Control Interrupts (SCI).
+             * Only the GPE part is important for hot plugging, but
+             * all the PM-stuff is mandatory for event management
+             * to work.
+             */
+            discovery_write_dw("FACP", 56, PORT_PM1A_EVENT_BLK);
+            discovery_write_dw("FACP", 60, PORT_PM1B_EVENT_BLK);
+            discovery_write_dw("FACP", 64, PORT_PM1A_CONTROL);
+            discovery_write_dw("FACP", 68, PORT_PM1B_CONTROL);
+            discovery_write_dw("FACP", 88, PM1_EVT_LEN, 1);
+            discovery_write_dw("FACP", 89, PM1_CNT_LEN, 1);
+
+            discovery_write_dw("FACP", 80, PORT_GPE0_STATUS, 4); // GPE0_BLK
+            discovery_write_dw("FACP", 84, PORT_GPE1_STATUS, 4); // GPE1_BLK
+
+            discovery_write_dw("FACP", 92,  4, 1); // GPE0_BLK_LEN
+            discovery_write_dw("FACP", 93,  4, 1); // GPE1_BLK_LEN
+            discovery_write_dw("FACP", 94, 16, 1); // GPE1_BASE (offset)
+
+            /* This is used at boot once. Linux will write
+             * CMD_ACPI_ENABLE via system IO using port PORT_SMI_CMD
+             * to tell the mainboard it wants to use ACPI.
+             * If CMD_ACPI_ENABLE was defined as 0x00, the guest kernel
+             * would think that ACPI was always on. Therefore, this is
+             * optional and one could just erase the next three lines.
+             */
+            discovery_write_dw("FACP", 48, PORT_SMI_CMD);
+            discovery_write_dw("FACP", 52, CMD_ACPI_ENABLE, 1);
+            discovery_write_dw("FACP", 53, CMD_ACPI_DISABLE, 1);
+
+            return true;
+        }
+
+        /* Guest port reads: return the current register value; false if the port is not ours. */
+        bool  receive(MessageIOIn &msg) {
+            switch (msg.port) {
+                case PORT_PM1A_EVENT_STATUS:
+                    //Logging::printf("In on port pm1a EVENT STATUS: %x len %u\n", _pm1a_status, msg.type);
+                    msg.value = _pm1a_status;
+                    return true;
+                case PORT_PM1A_EVENT_ENABLE:
+                    //Logging::printf("In on port pm1a EVENT ENABLE: %x len %u\n", _pm1a_enable, msg.type);
+                    msg.value = _pm1a_enable;
+                    return true;
+                case PORT_PM1A_CONTROL:
+                    //Logging::printf("In on port pm1a CONTROL %x len %u\n", _pm1a_control, msg.type);
+                    msg.value = _pm1a_control;
+                    return true;
+
+                case PORT_PM1B_EVENT_STATUS:
+                    //Logging::printf("In on port pm1b EVENT STATUS: %x len %u\n", _pm1b_status, msg.type);
+                    msg.value = _pm1b_status;
+                    return true;
+                case PORT_PM1B_EVENT_ENABLE:
+                    //Logging::printf("In on port pm1b EVENT ENABLE: %x len %u\n", _pm1b_enable, msg.type);
+                    msg.value = _pm1b_enable;
+                    return true;
+                case PORT_PM1B_CONTROL:
+                    //Logging::printf("In on port pm1b CONTROL %x len %u\n", _pm1b_control, msg.type);
+                    msg.value = _pm1b_control;
+                    return true;
+
+                case PORT_GPE0_STATUS:
+                    //Logging::printf("In on port GPE0 STS: %x\n", _gpe0_sts);
+                    msg.value = _gpe0_sts;
+                    return true;
+                case PORT_GPE0_ENABLE:
+                    //Logging::printf("In on port GPE0 EN %x\n", _gpe0_en);
+                    msg.value = _gpe0_en;
+                    return true;
+                case PORT_GPE1_STATUS:
+                    //Logging::printf("In on port GPE1 STS: %x\n", _gpe1_sts);
+                    msg.value = _gpe1_sts;
+                    return true;
+                case PORT_GPE1_ENABLE:
+                    //Logging::printf("In on port GPE1 EN %x\n", _gpe1_en);
+                    msg.value = _gpe1_en;
+                    return true;
+
+                case PORT_PCIU:
+                    //Logging::printf("--- In on PCIU\n");
+                    msg.value = _pciu;
+                    return true;
+                case PORT_PCID:
+                    //Logging::printf("--- In on PCID\n");
+                    msg.value = _pcid;
+                    return true;
+                default:;
+            }
+            return false;
+        }
+
+        /* Guest port writes: update the ACPI registers, then re-evaluate the SCI line. */
+        bool  receive(MessageIOOut &msg) {
+            switch (msg.port) {
+                case PORT_SMI_CMD:
+                    /* During boot the guest kernel checks PORT_SMI_CMD
+                     * in the ACPI FADT table. If SCI_EN is not set,
+                     * the system is in legacy mode. Hence it sends the
+                     * CMD_ACPI_ENABLE cmd it got from the FADT again to
+                     * this port and then polls for SCI_EN until it is set.
+                     * ACPI is then officially active. */
+                    if (msg.value == CMD_ACPI_ENABLE) {
+                        Logging::printf("Enabling ACPI for guest.\n");
+                        _pm1a_control |= 1; // Setting SCI_EN bit
+                    }
+                    else if (msg.value == CMD_ACPI_DISABLE) {
+                        Logging::printf("Disabling ACPI for guest.\n");
+                        _pm1a_control &= ~1U;
+                    }
+                    break;
+
+                case PORT_PM1A_EVENT_STATUS:
+                    //Logging::printf("Out on port pm1a EVENT STATUS: %x len %u\n", msg.value, msg.type);
+                    break;
+                case PORT_PM1A_EVENT_ENABLE:
+                    //Logging::printf("Out on port pm1a EVENT ENABLE: %x len %u\n", msg.value, msg.type);
+                    _pm1a_enable = static_cast<unsigned short>(msg.value);
+                    break;
+                case PORT_PM1A_CONTROL:
+                    //Logging::printf("Out on port pm1a CONTROL %x len %u\n", msg.value, msg.type);
+                    break;
+
+                case PORT_PM1B_EVENT_STATUS:
+                    //Logging::printf("Out on port pm1b EVENT STATUS: %x len %u\n", msg.value, msg.type);
+                    break;
+                case PORT_PM1B_EVENT_ENABLE:
+                    //Logging::printf("Out on port pm1b EVENT ENABLE: %x len %u\n", msg.value, msg.type);
+                    _pm1b_enable = static_cast<unsigned short>(msg.value); // PM1b, not PM1a
+                    break;
+                case PORT_PM1B_CONTROL:
+                    //Logging::printf("Out on port pm1b CONTROL %x len %u\n", msg.value, msg.type);
+                    break;
+
+                case PORT_GPE0_STATUS:
+                    //Logging::printf("Out on port GPE0 STS: %x len %u\n", msg.value, msg.type);
+                    _gpe0_sts &= ~ static_cast<unsigned char>(msg.value); // write-1-to-clear
+                    break;
+                case PORT_GPE0_ENABLE:
+                    //Logging::printf("Out on port GPE0 EN %x len %u\n", msg.value, msg.type);
+                    _gpe0_en = static_cast<unsigned char>(msg.value);
+                    break;
+                case PORT_GPE1_STATUS:
+                    //Logging::printf("Out on port GPE1 STS: %x\n", msg.value);
+                    _gpe1_sts &= ~ static_cast<unsigned char>(msg.value); // write-1-to-clear
+                    break;
+                case PORT_GPE1_ENABLE:
+                    //Logging::printf("Out on port GPE1 EN %x\n", msg.value);
+                    _gpe1_en = static_cast<unsigned char>(msg.value);
+                    break;
+
+                case PORT_B0EJ:
+                    _watch.stop();
+                    Logging::printf("PCI hot-unplug confirmed by guest "
+                            "(Output on B0EJ: %x) after %llu ms\n",
+                            msg.value, _watch.delta());
+                    _pcid &= ~msg.value;
+                    //Logging::printf("PCIU: %x, PCID: %x\n", _pciu, _pcid);
+                    break;
+                default: return false; // not one of our ports: leave the SCI line alone
+            }
+
+            /* One of our registers was written. Deassert the SCI if no enabled
+             * event is pending any more; otherwise the line stays asserted. */
+            if (!(_pm1a_status & _pm1a_enable) &&
+                !(_pm1b_status & _pm1b_enable) &&
+                !(_gpe0_sts & _gpe0_en) &&
+                !(_gpe1_sts & _gpe1_en)) {
+                MessageIrqLines msg(MessageIrq::DEASSERT_IRQ, 9);
+                _mb.bus_irqlines.send(msg);
+            }
+
+            return true;
+        }
+
+        /* Save (msg.write) or restore the members from _pm1a_status up to, but excluding, _processed. */
+        bool receive(MessageRestore &msg)
+        {
+            const mword bytes = reinterpret_cast<mword>(&_processed)
+                -reinterpret_cast<mword>(&_pm1a_status);
+
+            if (msg.devtype == MessageRestore::RESTORE_RESTART) {
+                _processed = false; // allow the next RESTORE_ACPI message to be handled again
+                msg.bytes += bytes + sizeof(msg); // add this device's state size plus sizeof(msg)
+                return false;
+            }
+
+            if (msg.devtype != MessageRestore::RESTORE_ACPI || _processed) return false;
+
+            if (msg.write) {
+                msg.bytes = bytes;
+                memcpy(msg.space, reinterpret_cast<void*>(&_pm1a_status), bytes);
+            }
+            else {
+                memcpy(reinterpret_cast<void*>(&_pm1a_status), msg.space, bytes);
+            }
+
+            Logging::printf("%s ACPI controller\n", msg.write?"Saved":"Restored");
+            _processed = true; // ignore further RESTORE_ACPI messages until the next RESTORE_RESTART
+            return true;
+        }
+
+        AcpiController(Motherboard &mb)
+            : BiosCommon(mb),
+            _pm1a_status(0), _pm1a_enable(0), _pm1a_control(0),
+            _pm1b_status(0), _pm1b_enable(0), _pm1b_control(0),
+            _gpe0_sts(0), _gpe0_en(0), _gpe1_sts(0), _gpe1_en(0),
+            _b0ej(0), _pciu(0), _pcid(0),
+            _processed(false), _watch(mb.clock())
+        { }
+};
+
+PARAM_HANDLER(acpimodel,
+        "acpimodel - Capable of issuing ACPI events to the guest.")
+{
+    AcpiController * dev = new AcpiController(mb);
+    mb.bus_discovery .add(dev, AcpiController::receive_static<MessageDiscovery>);
+    mb.bus_ioin      .add(dev, AcpiController::receive_static<MessageIOIn>);
+    mb.bus_ioout     .add(dev, AcpiController::receive_static<MessageIOOut>);
+    mb.bus_restore   .add(dev, AcpiController::receive_static<MessageRestore>);
+}
diff --git a/unix/SConstruct b/unix/SConstruct
index 78e24897..5e64e833 100644
--- a/unix/SConstruct
+++ b/unix/SConstruct
@@ -123,6 +123,7 @@ sources = Glob('*.cc') + [            # Unix frontend
       '../model/pmtimer.cc',
       '../model/vcpu.cc',
       '../model/vbios.cc',
+      '../model/acpicontroller.cc',
       '../model/lapic.cc',
       '../model/msi.cc',
       '../host/hostkeyboard.cc',

From 57432ce72497092baad78a6c207b3067ea279e2b Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Mon, 21 Oct 2013 12:39:44 +0200
Subject: [PATCH 17/35] Added a new message type: MessageAcpiEvent.

ACPI events (both fixed and general-purpose events) can be raised with this.
---
 include/nul/message.h     | 17 +++++++++++++++++
 include/nul/motherboard.h |  1 +
 model/acpicontroller.cc   | 24 ++++++++++++++++++++++++
 3 files changed, 42 insertions(+)

diff --git a/include/nul/message.h b/include/nul/message.h
index 3e7660e9..ee20b3d5 100644
--- a/include/nul/message.h
+++ b/include/nul/message.h
@@ -567,6 +567,23 @@ struct MessageAcpi
   MessageAcpi(unsigned _parent_bdf, unsigned _bdf, unsigned char _pin): type(ACPI_GET_IRQ), parent_bdf(_parent_bdf), bdf(_bdf), pin(_pin), gsi(~0u) {}
 };
 
+/**
+ * Virtual ACPI: Fixed and General Purpose Events
+ * can be triggered with these messages
+ */
+struct MessageAcpiEvent
+{
+    enum EventType {
+        ACPI_EVENT_FIXED,
+        ACPI_EVENT_GP,
+        ACPI_EVENT_HOT_UNPLUG,
+        ACPI_EVENT_HOT_REPLUG,
+    } type;
+    unsigned num;
+
+    MessageAcpiEvent(EventType _type, unsigned _num)
+        : type(_type), num(_num) {};
+};
 
 /**
  * Resource discovery between device models is done by the virtual
diff --git a/include/nul/motherboard.h b/include/nul/motherboard.h
index a0084053..6fb2f149 100644
--- a/include/nul/motherboard.h
+++ b/include/nul/motherboard.h
@@ -50,6 +50,7 @@ class Motherboard
 
  public:
   DBus<MessageAcpi>         bus_acpi;
+  DBus<MessageAcpiEvent>    bus_acpi_event;
   DBus<MessageAhciSetDrive> bus_ahcicontroller;
   DBus<MessageApic>         bus_apic;
   DBus<MessageBios>         bus_bios;
diff --git a/model/acpicontroller.cc b/model/acpicontroller.cc
index 49441d9e..d293a112 100644
--- a/model/acpicontroller.cc
+++ b/model/acpicontroller.cc
@@ -94,6 +94,29 @@ class AcpiController : public StaticReceiver<AcpiController>, public BiosCommon
             _mb.bus_irqlines.send(msg);
         }
 
+        /* Raise an ACPI event: GP events directly, hot (un)plug via the slot bitmaps and GPE 1. */
+        bool  receive(MessageAcpiEvent &msg) {
+            switch (msg.type) {
+                case MessageAcpiEvent::ACPI_EVENT_GP:
+                    trigger_gpe(msg.num);
+                    break;
+                case MessageAcpiEvent::ACPI_EVENT_HOT_REPLUG:
+                    _pciu |= (1U << msg.num); // unsigned shift: msg.num == 31 must not hit the sign bit
+                    trigger_gpe(1);
+                    break;
+                case MessageAcpiEvent::ACPI_EVENT_HOT_UNPLUG:
+                    _watch.start(); // stopped again when the guest writes PORT_B0EJ
+                    _pcid |= (1U << msg.num); // unsigned shift, see above
+                    trigger_gpe(1);
+                    break;
+                case MessageAcpiEvent::ACPI_EVENT_FIXED:
+                default:
+                    return false;
+            }
+
+            return true;
+        }
+
         bool  receive(MessageDiscovery &msg) {
             if (msg.type != MessageDiscovery::DISCOVERY) return false;
 
@@ -318,5 +341,6 @@ PARAM_HANDLER(acpimodel,
     mb.bus_discovery .add(dev, AcpiController::receive_static<MessageDiscovery>);
     mb.bus_ioin      .add(dev, AcpiController::receive_static<MessageIOIn>);
     mb.bus_ioout     .add(dev, AcpiController::receive_static<MessageIOOut>);
+    mb.bus_acpi_event.add(dev, AcpiController::receive_static<MessageAcpiEvent>);
     mb.bus_restore   .add(dev, AcpiController::receive_static<MessageRestore>);
 }

From 8332c23023d5c177bc1f74bb70ad923d6cffdf39 Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Mon, 21 Oct 2013 13:04:56 +0200
Subject: [PATCH 18/35] Made PCI passthrough devices migratable by adding hot
 plug event handling code.

---
 host/migration.cc  |  3 +++
 model/pcidirect.cc | 16 ++++++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/host/migration.cc b/host/migration.cc
index 25c1c9e1..19a7bdc2 100644
--- a/host/migration.cc
+++ b/host/migration.cc
@@ -336,6 +336,9 @@ bool Migration::listen(unsigned port, CpuState *vcpu_utcb)
 
     _socket->close();
 
+    MessageRestore replug_msg(MessageRestore::PCI_PLUG, NULL, true);
+    _mb->bus_restore.send(replug_msg, false);
+
     Logging::printf("That's it. Waking up VCPUs.\n");
     unfreeze_vcpus();
 
diff --git a/model/pcidirect.cc b/model/pcidirect.cc
index e8daaeeb..7929860b 100644
--- a/model/pcidirect.cc
+++ b/model/pcidirect.cc
@@ -4,6 +4,8 @@
  * Copyright (C) 2007-2010, Bernhard Kauer <bk@vmmon.org>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -393,6 +395,19 @@ class DirectPciDevice : public StaticReceiver<DirectPciDevice>, public HostVfPci
     return true;
   }
 
+  bool receive(MessageRestore &msg) { // Translates PCI_PLUG restore messages into ACPI hot-plug events.
+    if (msg.devtype != MessageRestore::PCI_PLUG) return false;
+    // Bits 3..7 of the guest BDF are used as the slot number carried by the event.
+    unsigned slot = (_guestbdf >> 3) & 0x1f;
+    // msg.write selects the direction: true -> HOT_REPLUG, false -> HOT_UNPLUG.
+    MessageAcpiEvent amsg(msg.write ?
+            MessageAcpiEvent::ACPI_EVENT_HOT_REPLUG :
+            MessageAcpiEvent::ACPI_EVENT_HOT_UNPLUG,
+            slot);
+    // The result of the bus send is ignored; the request always counts as handled.
+    _mb.bus_acpi_event.send(amsg);
+    return true;
+  }
 
 
   DirectPciDevice(Motherboard &mb, unsigned hbdf, unsigned guestbdf, bool assign,
@@ -461,6 +476,7 @@ class DirectPciDevice : public StaticReceiver<DirectPciDevice>, public HostVfPci
     if (map_mode != MAP_MODE_DISABLED)
       mb.bus_memregion.add(this, DirectPciDevice::receive_static<MessageMemRegion>);
     mb.bus_hostirq.add(this,     DirectPciDevice::receive_static<MessageIrq>);
+    mb.bus_restore.add(this,     DirectPciDevice::receive_static<MessageRestore>);
     //mb.bus_irqnotify.add(this, DirectPciDevice::receive_static<MessageIrqNotify>);
   }
 };

From c8ecf0689b4dd56f59d1d48fde2a449a8691b10b Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Mon, 21 Oct 2013 13:26:59 +0200
Subject: [PATCH 19/35] Made the NIC model migratable.

On restore, the NIC model automatically announces its new position within the LAN by broadcasting gratuitous ARP requests.
---
 model/intel82576vf.cc | 96 ++++++++++++++++++++++++++++++++++++++++++-
 model/intel82576vf.h  | 36 ++++++++++++++++
 2 files changed, 131 insertions(+), 1 deletion(-)

diff --git a/model/intel82576vf.cc b/model/intel82576vf.cc
index abe9655e..612356ac 100644
--- a/model/intel82576vf.cc
+++ b/model/intel82576vf.cc
@@ -5,6 +5,8 @@
  * Copyright (C) 2010, Julian Stecklina <jsteckli@os.inf.tu-dresden.de>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -578,6 +580,40 @@ class Model82576vf : public StaticReceiver<Model82576vf>
     uint32 raw[3*4];
   } _msix;
 
+  unsigned _ip_address;
+  EthernetAddr _guest_uses_mac;
+  bool processed;
+
+  void update_ip(unsigned char *packet, unsigned packet_len)
+  {
+      // Records sender MAC and IP of ARP frames found in `packet`; other frames are ignored.
+      // Everything read below lies within the first 14 + 18 bytes, so shorter frames are skipped.
+      if (packet_len < 14 + 18) return;
+      unsigned short packet_type = * reinterpret_cast<unsigned short*>(packet + 12);
+      if (packet_type == 0x0608) {
+          unsigned char *mac = packet + 14 +  8; // Source MAC address
+          unsigned char *ip  = packet + 14 + 14; // Source IP address
+          EthernetAddr ethaddr(mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
+#if 0
+          Logging::printf("Sending packet type %x from MAC %08llx, IP %x\n",
+                  static_cast<unsigned>(packet_type),
+                  ethaddr.raw, *reinterpret_cast<unsigned*>(ip));
+#endif
+          _guest_uses_mac = ethaddr;
+          _ip_address = * reinterpret_cast<unsigned*>(ip);
+      }
+  }
+
+
+  void arp_gratuitous(const EthernetAddr &addr, const bool request)
+  {
+      // Sends one ARP frame from _guest_uses_mac to `addr` for _ip_address on the network bus.
+      const arp_packet arp(_guest_uses_mac, addr, _ip_address,
+              request ? 0x100 /* ARP_REQUEST */ : 0x200 /* ARP_REPLY */);
+      MessageNetwork m(reinterpret_cast<const unsigned char*>(&arp), sizeof(arp), 0);
+      _net.send(m);
+  }
+
   uint32 VTFRTIMER_compute()
   {
     // XXX
@@ -886,6 +922,62 @@ class Model82576vf : public StaticReceiver<Model82576vf>
     return false;
   }
 
+  bool receive(MessageRestore &msg)
+  { // Saves (msg.write) or restores the NIC model state through msg.space.
+      const mword bytes = reinterpret_cast<mword>(&processed)
+          -reinterpret_cast<mword>(&_mac);
+      // `bytes` is the size of the member range [_mac, processed), which is copied as one blob.
+      if (msg.devtype == MessageRestore::RESTORE_RESTART) {
+          processed = false;
+          msg.bytes += bytes + 2 * 0x1000 + sizeof(msg); // add this device's share to the running total
+          return false;
+      }
+      // Only NIC messages are handled, and only once until the next RESTORE_RESTART.
+      if (msg.devtype != MessageRestore::RESTORE_NIC || processed) return false;
+      // Blob layout: member range, then 0x1000 bytes from _local_rx_regs, then 0x1000 from _local_tx_regs.
+      if (msg.write) {
+          msg.bytes = bytes + 2 * 0x1000;
+          memcpy(msg.space, reinterpret_cast<void*>(&_mac), bytes);
+          memcpy(msg.space + bytes, reinterpret_cast<void*>(_local_rx_regs), 0x1000);
+          memcpy(msg.space + bytes + 0x1000, reinterpret_cast<void*>(_local_tx_regs), 0x1000);
+      }
+      else {
+          uint32 *local_rx_regs = _local_rx_regs;
+          uint32 *local_tx_regs = _local_tx_regs;
+          Clock  *clock = _clock;
+          // Keep this instance's pointers; presumably they lie inside the range overwritten next.
+          memcpy(reinterpret_cast<void*>(&_mac), msg.space, bytes);
+          // Put the local pointer values back after the bulk copy.
+          _local_rx_regs = local_rx_regs;
+          _local_tx_regs = local_tx_regs;
+          _clock = clock;
+          // Fill the local register pages from the saved copies.
+          memcpy(_local_rx_regs, msg.space + bytes, 0x1000);
+          memcpy(_local_tx_regs, msg.space + bytes + 0x1000, 0x1000);
+          // Re-point the queues at this instance and its register pages.
+          _rx_queues[0].parent = this;
+          _rx_queues[0].regs = local_rx_regs;
+          _rx_queues[1].parent = this;
+          _rx_queues[1].regs = local_rx_regs + 0x100/4;
+          _tx_queues[0].parent = this;
+          _tx_queues[0].regs = local_tx_regs;
+          _tx_queues[1].parent = this;
+          _tx_queues[1].regs = local_tx_regs + 0x100/4;
+          // If an IP address was recorded, announce it with three broadcast ARP requests.
+          if (_ip_address) {
+              Logging::printf("Trying to claim: MAC " MAC_FMT " IP %x\n",
+                      MAC_SPLIT((&_guest_uses_mac)), _ip_address);
+              for (int i=0; i < 3; ++i)
+                  arp_gratuitous(EthernetAddr(0xffffffffffffull), true);
+          }
+      }
+
+      Logging::printf("%s NIC\n", msg.write?"Saved":"Restored");
+      processed = true;
+      return true;
+  }
+
+
   Model82576vf(uint64 mac, DBus<MessageNetwork> &net,
 	       DBus<MessageMem> *bus_mem, DBus<MessageMemRegion> *bus_memregion,
 	       Clock *clock, DBus<MessageTimer> &timer,
@@ -895,7 +987,8 @@ class Model82576vf : public StaticReceiver<Model82576vf>
       _clock(clock), _timer(timer),
       _mem_mmio(mem_mmio), _mem_msix(mem_msix),
       _txpoll_us(txpoll_us), _map_rx(map_rx), _bdf(bdf),
-      _promisc_default(promisc_default)
+      _promisc_default(promisc_default), _ip_address(0), _guest_uses_mac(0),
+      processed(false)
   {
     Logging::printf("Attached 82576VF model at %08x+0x4000, %08x+0x1000\n",
 		    mem_mmio, mem_msix);
@@ -946,6 +1039,7 @@ PARAM_HANDLER(intel82576vf,
   mb.bus_network. add(dev, &Model82576vf::receive_static<MessageNetwork>);
   mb.bus_timeout. add(dev, &Model82576vf::receive_static<MessageTimeout>);
   mb.bus_legacy.  add(dev, &Model82576vf::receive_static<MessageLegacy>);
+  mb.bus_restore. add(dev, &Model82576vf::receive_static<MessageRestore>);
 }
 
 
diff --git a/model/intel82576vf.h b/model/intel82576vf.h
index b857d378..7da55770 100644
--- a/model/intel82576vf.h
+++ b/model/intel82576vf.h
@@ -5,6 +5,8 @@
  * Copyright (C) 2010, Julian Stecklina <jsteckli@os.inf.tu-dresden.de>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -44,4 +46,38 @@ class Mta {
   Mta() : _bits() { }
 };
 
+struct arp_packet {                   // Ethernet frame carrying one ARP message (packed)
+    unsigned char destination[6];     // Ethernet destination address
+    unsigned char source[6];          // Ethernet source address
+    unsigned short eth_type;          // always 0x608 (set by the constructor)
+    unsigned short hw_type;           // always 0x100
+    unsigned short protocol_type;     // always 0x8
+    unsigned char hwaddr_len;         // always 6
+    unsigned char protocoladdr_len;   // always 4
+    unsigned short operation;         // caller-supplied operation code
+    unsigned char sender_hwaddr[6];   // copy of the source address
+    unsigned sender_ip;
+    unsigned char target_hwaddr[6];   // zero-filled
+    unsigned target_ip;               // same value as sender_ip
+    // Builds a frame from `src` to `dst`; sender and target IP are both `ip_addr`.
+    arp_packet(EthernetAddr src, EthernetAddr dst, unsigned ip_addr,
+            unsigned short _operation)
+        :
+            eth_type(0x608), hw_type(0x100), protocol_type(0x8), hwaddr_len(6),
+            protocoladdr_len(4), operation(_operation),
+            sender_ip(ip_addr), target_ip(ip_addr)
+    {
+        memcpy(destination, dst.byte, 6);
+        memset(target_hwaddr, 0, 6);
+        memcpy(source, src.byte, 6);
+        memcpy(sender_hwaddr, src.byte, 6);
+    }
+    // True if the Ethernet source field equals `a`; built from exactly the six bytes of `source`.
+    bool source_is(const EthernetAddr &a) const
+    {
+        EthernetAddr my_addr(source[0], source[1], source[2], source[3], source[4], source[5]);
+        return my_addr == a;
+    }
+} __attribute__((packed));
+
 // EOF

From 5792f3db1641aac9153e1891b249b89dd79091b8 Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Mon, 28 Oct 2013 17:12:03 +0100
Subject: [PATCH 20/35] Move guest physical memory information retrieval into
 the listen/send methods.

This used to be done in the Migration class constructor, but that became too early once the VMM parameters were reordered for live migration retrieval.
---
 host/migration.cc       | 19 +++++++++++++------
 include/nul/migration.h |  1 +
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/host/migration.cc b/host/migration.cc
index 19a7bdc2..811544a0 100644
--- a/host/migration.cc
+++ b/host/migration.cc
@@ -35,6 +35,15 @@ Migration::Migration(Motherboard *mb)
     _vcpu_should_block(false),
     _socket(NULL),
     _sendmem(0), _sendmem_total(0)
+{
+       _vcpu_utcb = new CpuState;
+}
+
+Migration::~Migration()
+{
+}
+
+void Migration::init_memrange_info()
 {
     MessageHostOp msg(MessageHostOp::OP_GUEST_MEM, 0UL);
     if (!_mb->bus_hostop.send(msg))
@@ -45,12 +54,6 @@ Migration::Migration(Motherboard *mb)
     _physmem_size  = msg.len;
 
     _dirtman = DirtManager(_physmem_size >> 12);
-
-    _vcpu_utcb = new CpuState;
-}
-
-Migration::~Migration()
-{
 }
 
 void Migration::save_guestregs(CpuState *utcb)
@@ -313,6 +316,8 @@ bool Migration::receive_guestdevices(CpuState *vcpu_utcb)
 
 bool Migration::listen(unsigned port, CpuState *vcpu_utcb)
 {
+    init_memrange_info();
+
     print_welcomescreen();
 
     _socket = IpHelper::instance().listen(port);
@@ -626,6 +631,8 @@ bool Migration::send(unsigned long addr, unsigned long port)
     StopWatch freeze_timer(_mb->clock());
     longrange_data async_data;
 
+    init_memrange_info();
+
     Logging::printf("Trying to connect...\n");
     _socket = IpHelper::instance().connect(addr, port);
     if (_socket == NULL) {
diff --git a/include/nul/migration.h b/include/nul/migration.h
index f64cdd51..9b66e7e0 100644
--- a/include/nul/migration.h
+++ b/include/nul/migration.h
@@ -235,6 +235,7 @@ class Migration : public StaticReceiver<Migration>
 
     DirtManager _dirtman;
 
+    void init_memrange_info();
     void print_welcomescreen();
     bool puts_guestscreen(const char *str, bool reset_screen);
 

From 2d74868e313ae53d7829c74dc72c5c86929201af Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Tue, 29 Oct 2013 14:54:12 +0100
Subject: [PATCH 21/35] Added a new field "actual_physmem" to MessageMemRegion
 messages.

Users of the memory bus can now determine if they are working with actual guest-physmem.
---
 include/nul/message.h     | 3 ++-
 model/memorycontroller.cc | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/nul/message.h b/include/nul/message.h
index ee20b3d5..be27d20d 100644
--- a/include/nul/message.h
+++ b/include/nul/message.h
@@ -112,7 +112,8 @@ struct MessageMemRegion
   uintptr_t start_page;
   unsigned      count;
   char *        ptr;
-  MessageMemRegion(uintptr_t _page) : page(_page), count(0), ptr(0) {}
+  bool      actual_physmem;
+  MessageMemRegion(uintptr_t _page) : page(_page), count(0), ptr(0), actual_physmem(false) {}
 };
 
 
diff --git a/model/memorycontroller.cc b/model/memorycontroller.cc
index c61119eb..d331ef5a 100644
--- a/model/memorycontroller.cc
+++ b/model/memorycontroller.cc
@@ -45,6 +45,7 @@ class MemoryController : public StaticReceiver<MemoryController>
     msg.start_page = _start >> 12;
     msg.count = (_end - _start) >> 12;
     msg.ptr = _physmem + _start;
+    msg.actual_physmem = true;
     return true;
   }
 

From 2005dc2dd5de8fa3a5dfb5a5e8bf25717749025c Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Tue, 29 Oct 2013 16:19:32 +0100
Subject: [PATCH 22/35] Rewrote the MessageHostOp OP_NEXT_DIRTY_REGION

From now on, only actual guest-physmem will be tracked.
---
 unix/main.cc | 49 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 34 insertions(+), 15 deletions(-)

diff --git a/unix/main.cc b/unix/main.cc
index 816d92be..530403dd 100644
--- a/unix/main.cc
+++ b/unix/main.cc
@@ -384,43 +384,62 @@ static bool receive(Device *, MessageHostOp &msg)
          * - pageptr wraps around if it exceeds guest mem size.
          */
 #if PORTED_TO_UNIX
-        static unsigned long pageptr = 0;
         const unsigned physpages = _physsize >> 12;
+        static unsigned long pageptr = 0;
 
-        // Setting this to true makes the map_memory_helper function
-        // remap with page size
         _track_page_usage = true;
 
         Crd reg = nova_lookup(Crd(pageptr, 0, DESC_MEM_ALL));
         // There will be several mappings, but we want to see the ones
         // which are set to "writable by the guest"
 
-        unsigned long oldptr = pageptr;
-        while (!(reg.attr() & DESC_RIGHT_W)) {
-            pageptr = (pageptr + 1) % physpages;
-            if (pageptr == oldptr) {
-                // Come back later, please.
+        unsigned increment = 0;
+        do {
+            if (increment >= physpages) {
+                // That's it for now. Come back later.
                 msg.value = 0;
                 return true;
-        }
-
-        reg = nova_lookup(Crd(pageptr, 0, DESC_MEM_ALL));
-        }
+            }
+            MessageMemRegion mmsg(pageptr);
+            if (!_mb->bus_memregion.send(mmsg, true)) {
+                // No one claims this region. Do not track.
+                pageptr = (pageptr + 1) % physpages;
+                ++increment;
+                continue;
+            }
+            if (!mmsg.actual_physmem) {
+                // This is no physmem.
+                pageptr += mmsg.count;
+                increment += mmsg.count;
+                if (pageptr > physpages) pageptr = 0;
+                continue;
+            }
+            reg = nova_lookup(Crd(pageptr, 0, DESC_MEM_ALL));
+            if (!(reg.attr() & DESC_RIGHT_W)) {
+                // Not write-mapped, hence not dirty.
+                pageptr += 1 << reg.order();
+                increment += 1 << reg.order();
+                if (pageptr > physpages) pageptr = 0;
+                continue;
+            }
+
+            break;
+        } while (1);
 
         // reg now describes a region which is guest-writable
-        // This means that the guest wrote to it before and it is considered "dirty"
+        // This means that the guest wrote to it before and it is now considered "dirty"
 
         // Tell the user "where" and "how many"
         msg.phys    = pageptr << 12;
         msg.phys_len = reg.order();
-
         msg.value = reg.value();
 
         // Make this page read-only for the guest, so it is considered "clean" now.
         nova_revoke(Crd((reg.base() + _physmem) >> 12, reg.order(),
-        DESC_RIGHT_W | DESC_TYPE_MEM), false);
+                    DESC_RIGHT_W | DESC_TYPE_MEM), false);
         pageptr += 1 << reg.order();
         if (pageptr >= physpages) pageptr = 0;
+
 #endif
         return true;
     }

From bf5f6b3f6b67992e907d91541fa0c4f6425e7288 Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Tue, 29 Oct 2013 16:44:59 +0100
Subject: [PATCH 23/35] Rewrote the checksumming routine.

It will now only check memory ranges which are actual guest-physmem.
---
 host/migration.cc | 72 +++++++++++++++++++++++++++++++----------------
 1 file changed, 48 insertions(+), 24 deletions(-)

diff --git a/host/migration.cc b/host/migration.cc
index 811544a0..9ea1aab2 100644
--- a/host/migration.cc
+++ b/host/migration.cc
@@ -147,47 +147,71 @@ bool Migration::chksum_page(unsigned page_nr, mword &their_chksum, bool compare)
 
 bool Migration::checksums(bool retrieve)
 {
-    unsigned entries = _physmem_size >> 12;
+    mword pagenr = 0;             // page number of the current record; ~0ul terminates the stream
+    mword checksum;
+    mword magic = 0xfafab0b0;     // marker sent in front of every (pagenr, checksum) record
     bool success = true;
 
-    mword *chksum = new mword[entries];
-    if (!chksum) Logging::panic("Allocating checksum list error\n");
-
-    Logging::printf("Checksumming the area [%8lx - %8lx)\n",
-            reinterpret_cast<mword>(_physmem_start),
-            reinterpret_cast<mword>(_physmem_start + 4096 * entries));
-
     if (retrieve) {
         // Receiver. Check the existing checksum list against our memory
-        _socket->receive(chksum, entries * sizeof(chksum[0]));
+        mword rec_magic;
 
-        unsigned err = 0;
+        _socket->receive(&rec_magic, sizeof(rec_magic));
+        _socket->receive(&pagenr, sizeof(pagenr));
+        _socket->receive(&checksum, sizeof(checksum));
 
-        for (unsigned i=0; i < entries; ++i) {
-            bool ret = chksum_page(i, chksum[i], true);
-            if (!ret) {
-                ++err;
-                Logging::printf("bad page received. page number: %8x\n", i);
-            }
-            success &= ret;
-        }
+        while (pagenr != ~0ul) {
+            assert(magic == rec_magic);
+            MessageMemRegion mmsg(pagenr);
+            if (!_mb->bus_memregion.send(mmsg, true) || !mmsg.actual_physmem)
+                Logging::panic("Checksum received for page %lx, which is not guest physmem\n", pagenr);
 
-        Logging::printf("Erroneous pages: %u\n", err);
+            bool area_success = chksum_page(pagenr, checksum, true);
+            success &= area_success;
+            // Verified above: the very page number the sender put into this record.
+            Logging::printf("Checksum of area [%8lx - %8lx) - %s\n",
+                reinterpret_cast<mword>(mmsg.start_page),
+                reinterpret_cast<mword>(mmsg.start_page + mmsg.count),
+                area_success ? "OK" : "Error");
+            // Fetch the next record; the terminator carries pagenr == ~0ul.
+            _socket->receive(&rec_magic, sizeof(rec_magic));
+            _socket->receive(&pagenr, sizeof(pagenr));
+            _socket->receive(&checksum, sizeof(checksum));
+        }
     }
     else {
         // Sender. Make a list of checksums and send it away.
 
-        for (unsigned i=0; i < entries; ++i)
-            chksum_page(i, chksum[i], false);
+        while (pagenr < (_physmem_size >> 12)) {   // pagenr counts pages, _physmem_size bytes
+            MessageMemRegion mmsg(pagenr);
+            if (!_mb->bus_memregion.send(mmsg, true) || !mmsg.actual_physmem) {
+                // No one claims this region. do not check.
+                ++pagenr;
+                continue;
+            }
 
-        success &= _socket->send(chksum, entries * sizeof(chksum[0]));
-    }
+            Logging::printf("Checksumming the area [%8lx - %8lx)\n",
+                reinterpret_cast<mword>(mmsg.start_page),
+                reinterpret_cast<mword>(mmsg.start_page + mmsg.count));
 
-    delete [] chksum;
+            chksum_page(pagenr, checksum, false);
+            success &= _socket->send(&magic, sizeof(magic));
+            success &= _socket->send(&pagenr, sizeof(pagenr));
+            success &= _socket->send(&checksum, sizeof(checksum));
+            // Continue behind the region that was just reported.
+            pagenr += mmsg.count;
+        }
+        // Terminator record: magic, pagenr == ~0ul, and a dummy checksum word.
+        pagenr = ~0ul;
+        success &= _socket->send(&magic, sizeof(magic));
+        success &= _socket->send(&pagenr, sizeof(pagenr));
+        success &= _socket->send(&pagenr, sizeof(pagenr));
+    }
 
     return success;
 }
 
+
 /***********************************************************************
  * Guest receiving part
  ***********************************************************************/

From fcdea6fdade48f58b2c0301c31a72bd0d4092f38 Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Tue, 29 Oct 2013 16:47:30 +0100
Subject: [PATCH 24/35] Made checksumming optional with a preprocessor #define.

In general, the transfer has proven to be error-free.
However, checksumming is useful for finding out whether changes to the tracking mechanism etc. cause data corruption.
---
 host/migration.cc | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/host/migration.cc b/host/migration.cc
index 9ea1aab2..f9aa3b00 100644
--- a/host/migration.cc
+++ b/host/migration.cc
@@ -15,6 +15,13 @@
  * General Public License version 2 for more details.
  */
 
+/* Activate checksumming for debugging purposes
+ * of the received range after migrating.
+ * As this really makes the freeze gap larger,
+ * this should only be used for testing when
+ * the migration algorithm is changed. */
+//#define DO_CHECKSUMMING
+
 
 #include <stdio.h> // snprintf
 
@@ -355,7 +362,7 @@ bool Migration::listen(unsigned port, CpuState *vcpu_utcb)
 
     receive_guestdevices(vcpu_utcb);
 
-#if 0
+#ifdef DO_CHECKSUMMING
     // Checksumming really makes the migration gap larger
     if (!checksums(true)) {
         Logging::printf("Error while comparing checksums.\n");
@@ -709,7 +716,7 @@ bool Migration::send(unsigned long addr, unsigned long port)
         return false;
     }
 
-#if 0
+#ifdef DO_CHECKSUMMING
     // Checksumming really makes the freeze gap larger
     if (!checksums(false)) {
         Logging::printf("Error while sending checksums.\n");

From 81d7fd702bf83f1c66336278fed351978ce213f0 Mon Sep 17 00:00:00 2001
From: Jacek Galowicz <jacek.galowicz@intel.com>
Date: Wed, 30 Oct 2013 13:10:50 +0100
Subject: [PATCH 25/35] Fix live migration for larger and memory-aggressive VMs

The last resend round tended to be incomplete when slow Ethernet was combined with a large Writable Working Set.
---
 host/migration.cc       | 20 ++++++++++++--------
 include/nul/migration.h |  5 +++--
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/host/migration.cc b/host/migration.cc
index f9aa3b00..85fb2002 100644
--- a/host/migration.cc
+++ b/host/migration.cc
@@ -30,7 +30,6 @@
 
 #include <nul/migration.h>
 #include <service/vprintf.h>
-#include <service/time.h>
 
 Migration::Migration(Motherboard *mb)
 : _mb(mb),
@@ -41,7 +40,8 @@ Migration::Migration(Motherboard *mb)
 #endif
     _vcpu_should_block(false),
     _socket(NULL),
-    _sendmem(0), _sendmem_total(0)
+    _sendmem(0), _sendmem_total(0),
+    _freeze_timer(_mb->clock())
 {
        _vcpu_utcb = new CpuState;
 }
@@ -120,6 +120,8 @@ void Migration::freeze_vcpus()
 #if PORTED_TO_UNIX
     _vcpu_blocked_sem.downmulti();
 #endif
+
+    _freeze_timer.start();
 }
 
 void Migration::unfreeze_vcpus()
@@ -568,9 +570,13 @@ bool Migration::send_memory(longrange_data &async_data)
     // The last transfer round with a frozen guest system will follow now
     freeze_vcpus();
 
+    unsigned freeze_pages = 0;
+    while ((freeze_pages = enqueue_all_dirty_pages(async_data)) > 0) {
+        if (!_socket->wait_complete()) return false;
+        pages_transferred += freeze_pages;
+    }
+
     static Prd end_of_crds;
-    pages_transferred = enqueue_all_dirty_pages(async_data);
-    Logging::printf("pages_dirty: %x\n", pages_transferred);
     if (!pages_transferred ||
         !_socket->send_nonblocking(&end_of_crds, sizeof(end_of_crds)))
         return false;
@@ -659,7 +665,6 @@ bool Migration::send_devices(longrange_data dat)
 bool Migration::send(unsigned long addr, unsigned long port)
 {
     StopWatch migration_timer(_mb->clock());
-    StopWatch freeze_timer(_mb->clock());
     longrange_data async_data;
 
     init_memrange_info();
@@ -709,7 +714,6 @@ bool Migration::send(unsigned long addr, unsigned long port)
         Logging::printf("Sending guest state failed.\n");
         return false;
     }
-    freeze_timer.start();
 
     if (!send_devices(async_data)) {
         Logging::printf("Sending guest devices failed.\n");
@@ -727,13 +731,13 @@ bool Migration::send(unsigned long addr, unsigned long port)
     // Uncomment this to "clone" the VM instead of migrating it away.
     //unfreeze_vcpus();
 
-    freeze_timer.stop();
+    _freeze_timer.stop();
 
     _socket->close();
 
     migration_timer.stop();
 
-    Logging::printf("Done. VM was frozen for %llu ms.\n", freeze_timer.delta());
+    Logging::printf("Done. VM was frozen for %llu ms.\n", _freeze_timer.delta());
     Logging::printf("This migration took %llu seconds.\n",
             migration_timer.delta() / 1000);
     Logging::printf("%3lu%% (%lu MB) of guest memory resent due to change.\n",
diff --git a/include/nul/migration.h b/include/nul/migration.h
index 9b66e7e0..67ca0cae 100644
--- a/include/nul/migration.h
+++ b/include/nul/migration.h
@@ -15,11 +15,10 @@
  * General Public License version 2 for more details.
  */
 
-
-
 #include <nul/motherboard.h>
 #include <nul/iphelper.h>
 #include <nul/migration_structs.h>
+#include <service/time.h>
 
 class Desc
 {
@@ -213,6 +212,8 @@ class Migration : public StaticReceiver<Migration>
     unsigned long   _sendmem;
     unsigned long   _sendmem_total;
 
+    StopWatch       _freeze_timer;
+
     /* Because of asynchronous send operations, all
      * data to be send has to be preserved somewhere until
      * it is ACKED. That's what this structure is for.

From c7280f98af373f689ea2381a17db6d2e90a85438 Mon Sep 17 00:00:00 2001
From: Markus Partheymueller <markus.partheymueller@intel.com>
Date: Mon, 21 Oct 2013 15:20:05 +0200
Subject: [PATCH 26/35] Prepare DBus and messages for use with I/O thread.

This commit adds hooks to the DBus infrastructure to install a proxy between message senders and receivers. In addition to the ReceiveFunction, a similar EnqueueFunction is provided which gets called upon message sending. An I/O thread can then register enqueue callbacks for the respective message type and manage sending on the caller's behalf. If the callback returns false or is not present, everything works as before.

Another callback (named "claim") can be used to configure messages to bypass the I/O thread and hence being sent by the issuing thread directly.

All information needed to process the message identically as before is encoded into a new message type MessageIOThread. This information includes the send mode (FIFO, LIFO, early out, round-robin), if the message should be sent synchronously (i.e., the caller has to wait until the request is completed) and which vCPU was the parent of the bus, if applicable.
---
 include/nul/bus.h       | 139 ++++++++++++++++++++++++++++++++++++++--
 include/nul/message.h   |  41 ++++++++++++
 include/nul/templates.h |   6 ++
 3 files changed, 182 insertions(+), 4 deletions(-)

diff --git a/include/nul/bus.h b/include/nul/bus.h
index c59a5898..85c9eac1 100644
--- a/include/nul/bus.h
+++ b/include/nul/bus.h
@@ -4,6 +4,8 @@
  * Copyright (C) 2007-2009, Bernhard Kauer <bk@vmmon.org>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -17,6 +19,7 @@
  */
 #pragma once
 
+#include "message.h"
 #include "service/logging.h"
 #include "service/string.h"
 
@@ -41,17 +44,29 @@ template <class M>
 class DBus
 {
   typedef bool (*ReceiveFunction)(Device *, M&);
+  typedef bool (*EnqueueFunction)(Device *, M&, MessageIOThread::Mode, MessageIOThread::Sync, unsigned*, VCpu *vcpu);
   struct Entry
   {
     Device *_dev;
     ReceiveFunction _func;
   };
+  struct EnqEntry               // the enqueue hook installed by set_iothread_enqueue()
+  {
+    Device *_dev;               // first argument handed to _func
+    VCpu *_vcpu;                // last argument handed to _func; may be nullptr
+    EnqueueFunction _func;      // called by the send*() methods; returning true means it took the message
+  };
 
   unsigned long _debug_counter;
   unsigned _list_count;
   unsigned _list_size;
   struct Entry *_list;
 
+  unsigned _callback_count;
+  unsigned _callback_size;
+  struct Entry *_iothread_callback;
+  struct EnqEntry *_iothread_enqueue;
+
   /**
    * To avoid bugs we disallow the copy constuctor.
    */
@@ -65,6 +80,14 @@ class DBus
     _list = n;
     _list_size = new_size;
   };
+  void set_callback_size(unsigned new_size)
+  { // Reallocates the callback array with new_size entries, keeping the registered ones.
+    Entry *n = new Entry[new_size];
+    if (_iothread_callback) memcpy(n, _iothread_callback, _callback_count * sizeof(*n));  // nothing to copy on first use
+    delete [] _iothread_callback;  // no-op for a null pointer
+    _iothread_callback = n;
+    _callback_size = new_size;
+  };
 public:
 
   void add(Device *dev, ReceiveFunction func)
@@ -76,13 +99,99 @@ class DBus
     _list_count++;
   }
 
+  void add_iothread_callback(Device *dev, ReceiveFunction func)
+  { // Appends (dev, func) to the callback list that the send*() methods consult before enqueueing.
+    if (_callback_count >= _callback_size)  // array full: double it, or start with one slot
+      set_callback_size(_callback_size > 0 ? _callback_size * 2 : 1);
+    _iothread_callback[_callback_count]._dev    = dev;
+    _iothread_callback[_callback_count]._func = func;
+    _callback_count++;
+  }
+
+  void set_iothread_enqueue(Device *dev, EnqueueFunction func, VCpu *vcpu=nullptr)
+  { // Installs (or replaces) the single enqueue hook that the send*() methods offer messages to.
+    // The one slot is allocated on first use and overwritten by later calls.
+    if (_iothread_enqueue == nullptr)
+      _iothread_enqueue = new EnqEntry;
+
+    _iothread_enqueue->_dev = dev;
+    _iothread_enqueue->_vcpu = vcpu;
+    _iothread_enqueue->_func = func;
+  }
+
   /**
-   * Send message LIFO.
+   * Send message directly.
    */
-  bool  send(M &msg, bool earlyout = false)
+  bool  send_direct_fifo(M &msg)
+  { // Delivers msg to every receiver in list order (index 0 first), bypassing the iothread hooks.
+    _debug_counter++;
+    bool res = false;
+    for (unsigned i = 0; i < _list_count; i++)
+      res |= _list[i]._func(_list[i]._dev, msg);
+    return res;  // true if at least one receiver returned true
+  }
+  bool  send_direct_rr(M &msg, unsigned *value) {
+    // Round-robin: try receivers starting at index *value; on success store the successor index there.
+    for (unsigned i = 0; i < _list_count; i++) {
+      Entry &e = _list[(i + *value) % _list_count];  // handler and device taken from the same entry
+      if (e._func(e._dev, msg)) { *value = (i + *value + 1) % _list_count; return true; }
+    }
+    return false;
+  }
+  bool  send_direct(M &msg, MessageIOThread::Mode mode, unsigned *value=nullptr)
+  { // Delivers msg to the receivers in the given mode, bypassing the iothread hooks.
+    unsigned rr_start = 0;  // round-robin cursor used when the caller passes no `value`
+    if (mode == MessageIOThread::MODE_FIFO) return send_direct_fifo(msg);
+    if (mode == MessageIOThread::MODE_RR) return send_direct_rr(msg, value ? value : &rr_start);
+    _debug_counter++;
+    bool res = false;
+    bool earlyout = (mode == MessageIOThread::MODE_EARLYOUT);
+    for (unsigned i = _list_count; i-- && !(earlyout && res);)  // LIFO; early-out stops at the first true
+      res |= _list[i]._func(_list[i]._dev, msg);
+    return res;
+  }
+
+  /**
+   * Send message LIFO synchronously: callbacks first, then the enqueue hook (SYNC_SYNC), else direct delivery.
+   */
+  bool  send_sync(M &msg, bool earlyout = false)
   {
+    bool res = false;
+    if (_iothread_callback) {
+      for (unsigned i = _callback_count; i-- && !res;) {  // last registered first, stop at the first true
+        res |= _iothread_callback[i]._func(_iothread_callback[i]._dev, msg);
+      }
+    }
+    if (!res && _iothread_enqueue != nullptr) {
+      // No one wants the message directly, enqueue it.
+      if (_iothread_enqueue->_func(_iothread_enqueue->_dev, msg, earlyout ? MessageIOThread::MODE_EARLYOUT : MessageIOThread::MODE_NORMAL, MessageIOThread::SYNC_SYNC, nullptr, _iothread_enqueue->_vcpu))
+        return true;  // the enqueue hook accepted the message
+    }
     _debug_counter++;
+    res = false;  // callback results do not count towards the return value
+    for (unsigned i = _list_count; i-- && !(earlyout && res);)
+      res |= _list[i]._func(_list[i]._dev, msg);
+    return res;
+  }
+
+  /**
+   * Send message LIFO asynchronously.
+   */
+  bool  send(M &msg, bool earlyout = false)
+  {
     bool res = false;
+    if (_iothread_callback) {
+      for (unsigned i = _callback_count; i-- && !res;) {
+        res |= _iothread_callback[i]._func(_iothread_callback[i]._dev, msg);
+      }
+    }
+    if (!res && _iothread_enqueue != nullptr) {
+      // No one wants the message directly, enqueue it.
+      if (_iothread_enqueue->_func(_iothread_enqueue->_dev, msg, earlyout ? MessageIOThread::MODE_EARLYOUT : MessageIOThread::MODE_NORMAL, MessageIOThread::SYNC_ASYNC, nullptr, _iothread_enqueue->_vcpu))
+        return true;
+    }
+    _debug_counter++;
+    res = false;
     for (unsigned i = _list_count; i-- && !(earlyout && res);)
       res |= _list[i]._func(_list[i]._dev, msg);
     return res;
@@ -93,8 +202,19 @@ class DBus
    */
   bool  send_fifo(M &msg)
   {
-    _debug_counter++;
     bool res = false;
+    if (_iothread_callback) {
+      for (unsigned i = _callback_count; i-- && !res;) {
+        res |= _iothread_callback[i]._func(_iothread_callback[i]._dev, msg);
+      }
+    }
+    if (!res && _iothread_enqueue != nullptr) {
+      // No one wants the message directly, enqueue it.
+      if (_iothread_enqueue->_func(_iothread_enqueue->_dev, msg, MessageIOThread::MODE_FIFO, MessageIOThread::SYNC_ASYNC, nullptr, _iothread_enqueue->_vcpu))
+        return true;
+    }
+    _debug_counter++;
+    res = false;
     for (unsigned i = 0; i < _list_count; i++)
       res |= _list[i]._func(_list[i]._dev, msg);
     return 0;
@@ -107,6 +227,17 @@ class DBus
    */
   bool  send_rr(M &msg, unsigned &start)
   {
+    bool res = false;
+    if (_iothread_callback) {
+      for (unsigned i = _callback_count; i-- && !res;) {
+        res |= _iothread_callback[i]._func(_iothread_callback[i]._dev, msg);
+      }
+    }
+    if (!res && _iothread_enqueue != nullptr) {
+      // No one wants the message directly, enqueue it.
+      if (_iothread_enqueue->_func(_iothread_enqueue->_dev, msg, MessageIOThread::MODE_RR, MessageIOThread::SYNC_ASYNC, &start, _iothread_enqueue->_vcpu))
+        return true;
+    }
     _debug_counter++;
     for (unsigned i = 0; i < _list_count; i++)
       if (_list[i]._func(_list[(i + start) % _list_count]._dev, msg)) {
@@ -138,5 +269,5 @@ class DBus
   }
 
   /** Default constructor. */
-  DBus() : _debug_counter(0), _list_count(0), _list_size(0), _list(nullptr) {}
+  DBus() : _debug_counter(0), _list_count(0), _list_size(0), _list(nullptr), _callback_count(0), _callback_size(0), _iothread_callback(nullptr), _iothread_enqueue(nullptr) {}
 };
diff --git a/include/nul/message.h b/include/nul/message.h
index be27d20d..40dc9f3e 100644
--- a/include/nul/message.h
+++ b/include/nul/message.h
@@ -7,6 +7,7 @@
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
  * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
  *
  * This file is part of Vancouver.
  *
@@ -24,6 +25,46 @@
 
 #include <nul/types.h>
 #include <nul/compiler.h>
+class VCpu;
+struct MessageIOThread  // Envelope describing one bus message that is handed to an I/O thread.
+{
+  VCpu *vcpu;  // optional vCPU the message belongs to; both constructors leave it nullptr
+  enum Type {  // which kind of bus message 'ptr' refers to
+    TYPE_IOIN,
+    TYPE_IOOUT,
+    TYPE_MEM,
+    TYPE_INPUT,
+    TYPE_IRQ,
+    TYPE_IRQLINES,
+    TYPE_IRQNOTIFY,
+    TYPE_NETWORK,
+    TYPE_DISK,
+    TYPE_DISKCOMMIT,
+    TYPE_LEGACY,
+    TYPE_TIME,
+    TYPE_TIMER,
+    TYPE_TIMEOUT,
+    TYPE_PCICFG,
+    TYPE_HOSTOP,
+    TYPE_CPU,
+  } type;
+  enum Mode {  // dispatch order requested by the sender
+    MODE_NORMAL,    // passed by DBus::send()/send_sync() without earlyout
+    MODE_EARLYOUT,  // passed by DBus::send()/send_sync() with earlyout set
+    MODE_FIFO,      // passed by DBus::send_fifo()
+    MODE_RR         // passed by DBus::send_rr()
+  } mode;
+  enum Sync {
+    SYNC_SYNC,   // passed by DBus::send_sync()
+    SYNC_ASYNC   // passed by DBus::send(), send_fifo() and send_rr()
+  } sync;
+  unsigned *value;  // DBus::send_rr() passes the address of its 'start' index here; other senders pass nullptr
+  void *ptr;        // type-erased pointer to the wrapped message
+  void *sem;        // presumably a wake-up handle for synchronous sends (confirm against the I/O thread implementation); starts as nullptr
+
+  MessageIOThread(Type _type, Mode _mode, Sync _sync, void *_ptr) : vcpu(nullptr), type(_type), mode(_mode), sync(_sync), value(nullptr), ptr(_ptr), sem(nullptr) {}
+  MessageIOThread(Type _type, Mode _mode, Sync _sync, unsigned *_value, void *_ptr) : vcpu(nullptr), type(_type), mode(_mode), sync(_sync), value(_value), ptr(_ptr), sem(nullptr) {}
+};
 
 /****************************************************/
 /* IOIO messages                                    */
diff --git a/include/nul/templates.h b/include/nul/templates.h
index c883b747..2bea14fd 100644
--- a/include/nul/templates.h
+++ b/include/nul/templates.h
@@ -6,6 +6,8 @@
  *
  * This file is part of Vancouver.
  *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
  * Vancouver is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
@@ -26,6 +28,10 @@ class StaticReceiver : public Device
 public:
   template<class M>
   static bool receive_static(Device *o, M& msg) { return static_cast<Y*>(o)->receive(msg); }
+  template<class M>  // Static trampoline: casts the Device back to Y and forwards all arguments to Y::enqueue().
+  static bool enqueue_static(Device *o, M& msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu=nullptr) { return static_cast<Y*>(o)->enqueue(msg, mode, sync, value, vcpu); }
+  template<class M>  // Static trampoline: casts the Device back to Y and forwards the message to Y::claim().
+  static bool claim_static(Device *o, M& msg) { return static_cast<Y*>(o)->claim(msg); }
   StaticReceiver() : Device(__PRETTY_FUNCTION__) {};
 };
 

From 2fbc5a2fe9fbbdd73d15743fda3f420bce6f53f6 Mon Sep 17 00:00:00 2001
From: Markus Partheymueller <markus.partheymueller@intel.com>
Date: Mon, 4 Nov 2013 12:53:27 +0100
Subject: [PATCH 27/35] Use __typeof__ for c++0x (NRE).

---
 include/nul/timer.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/nul/timer.h b/include/nul/timer.h
index 2bfd59c1..530beb4f 100644
--- a/include/nul/timer.h
+++ b/include/nul/timer.h
@@ -5,6 +5,7 @@
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
  * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
  *
  * This file is part of Vancouver.
  *
@@ -198,11 +199,11 @@ class TimeoutList : public StaticReceiver<TimeoutList<ENTRIES, DATA>>
   TimeoutList() : _restore_processed(false) { init(); }
 
 #define REL_PTR(ptr, offset) ( \
-    reinterpret_cast<typeof(ptr)>( \
+    reinterpret_cast<__typeof__(ptr)>( \
         reinterpret_cast<mword>(ptr) - reinterpret_cast<mword>(offset)) \
 )
 #define ABS_PTR(ptr, offset) ( \
-    reinterpret_cast<typeof(ptr)>( \
+    reinterpret_cast<__typeof__(ptr)>( \
         reinterpret_cast<mword>(ptr) + reinterpret_cast<mword>(offset)) \
 )
 

From cf0e5ae3c29abb4b324e0fb4b8a38db88cc72466 Mon Sep 17 00:00:00 2001
From: Markus Partheymueller <markus.partheymueller@intel.com>
Date: Mon, 21 Oct 2013 17:19:16 +0200
Subject: [PATCH 28/35] I/O thread Reference implementation.

This commit adds a reference implementation for the unix frontend to show how an I/O thread could be implemented.

Note that the global lock is still in place to allow for easily disabling the I/O thread. Performance may suffer, but the unix frontend is proof-of-concept only, anyway. The I/O thread can be disabled by commenting out the #define USE_IOTHREAD line in unix/main.cc.

Access to guest RAM is bypassing the I/O thread because it is synchronized by the operating system.
---
 model/memorycontroller.cc |   7 +
 unix/include/seoul/unix.h |  32 +++
 unix/iothread.cc          | 477 ++++++++++++++++++++++++++++++++++++++
 unix/iothread.h           |  99 ++++++++
 unix/main.cc              |  81 +++++--
 5 files changed, 674 insertions(+), 22 deletions(-)
 create mode 100644 unix/iothread.cc
 create mode 100644 unix/iothread.h

diff --git a/model/memorycontroller.cc b/model/memorycontroller.cc
index d331ef5a..4bfedd6e 100644
--- a/model/memorycontroller.cc
+++ b/model/memorycontroller.cc
@@ -4,6 +4,8 @@
  * Copyright (C) 2009, Bernhard Kauer <bk@vmmon.org>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -29,6 +31,10 @@ class MemoryController : public StaticReceiver<MemoryController>
   /****************************************************/
   /* Physmem access                                   */
   /****************************************************/
+  bool  claim(MessageMem &msg)  // Claims exactly the addresses that receive() does not reject.
+  {
+    return !((msg.phys < _start) || (msg.phys >= (_end - 4)));
+  }
   bool  receive(MessageMem &msg)
   {
     if ((msg.phys < _start) || (msg.phys >= (_end - 4)))  return false;
@@ -69,5 +75,6 @@ PARAM_HANDLER(mem,
   MemoryController *dev = new MemoryController(msg.ptr, start, end);
   // physmem access
   mb.bus_mem.add(dev,       MemoryController::receive_static<MessageMem>);
+  mb.bus_mem.add_iothread_callback(dev,       MemoryController::claim_static<MessageMem>);
   mb.bus_memregion.add(dev, MemoryController::receive_static<MessageMemRegion>);
 }
diff --git a/unix/include/seoul/unix.h b/unix/include/seoul/unix.h
index 29c57482..44c7d649 100644
--- a/unix/include/seoul/unix.h
+++ b/unix/include/seoul/unix.h
@@ -4,6 +4,8 @@
  * Copyright (C) 2012, Julian Stecklina <jsteckli@os.inf.tu-dresden.de>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
  * This file is part of Seoul.
  *
  * Seoul is free software: you can redistribute it and/or modify it
@@ -24,4 +26,34 @@
 // everything else.
 extern pthread_mutex_t irq_mtx;
 
+static unsigned long long int rdtsc(void)  // Reads the CPU's 64-bit time-stamp counter.
+{
+  unsigned lo, hi;  // RDTSC delivers the counter split across EDX (high) and EAX (low)
+  asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));  // "=A" is only correct on 32-bit x86; on x86-64 it drops the high half
+  return (static_cast<unsigned long long>(hi) << 32) | lo;
+}
+
+static unsigned get_tsc_frequency()
+{
+  // Estimate the TSC rate in Hz by counting cycles across a 250 ms sleep.
+  // The result is narrowed to 'unsigned', so it wraps above ~4.29 GHz.
+  struct timeval start, stop;
+  unsigned long long cycles[2], us, hz;
+
+  cycles[0] = rdtsc();
+  gettimeofday(&start, nullptr);   // the timezone argument is obsolete
+
+  usleep(250000);
+
+  cycles[1] = rdtsc();
+  gettimeofday(&stop, nullptr);
+
+  // Elapsed wall-clock time in microseconds. It should never be 0 after the
+  // sleep, but guard the division anyway.
+  us = ((stop.tv_sec - start.tv_sec)*1000000LL) + (stop.tv_usec - start.tv_usec);
+  if (us == 0) us = 1;
+  hz = (cycles[1]-cycles[0]) * 1000000ULL / us;  // multiply first: dividing first rounded the result down to whole MHz
+  return static_cast<unsigned>(hz);
+}
+
 // EOF
diff --git a/unix/iothread.cc b/unix/iothread.cc
new file mode 100644
index 00000000..1d1a1673
--- /dev/null
+++ b/unix/iothread.cc
@@ -0,0 +1,477 @@
+/**
+ * I/O Thread
+ *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
+ * This file is part of Seoul.
+ *
+ * Seoul is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Seoul is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details.
+ */
+
+#include "iothread.h"
+
+void IOThread::init() {  // Hooks this I/O thread into the per-vCPU buses.
+  for (VCpu *cpu = _mb->last_vcpu; cpu != nullptr; cpu = cpu->get_last()) {  // follow the get_last() chain from the motherboard's last_vcpu
+    cpu->mem.set_iothread_enqueue(this, enqueue_static<MessageMem>, cpu);
+    cpu->executor.set_iothread_enqueue(this, enqueue_static<CpuMessage>, cpu);
+  }
+}
+
+sem_t *IOThread::get_notify_sem(pthread_t tid) {
+  // Returns the calling thread's wake-up semaphore, kept in thread-local
+  // storage and initialised on first use. The previous version copied an
+  // initialised sem_t into _notify, leaked the heap original and modified
+  // the vector without holding a lock. As before, 'tid' is not consulted.
+  static __thread sem_t sem;
+  static __thread bool  ready = false;
+  if (!ready) { if (0 != sem_init(&sem, 0, 0)) perror("Could not init sem."); ready = true; }
+  return &sem;
+}
+
+template <typename M>
+static void sync_msg(MessageIOThread &iomsg) {
+  // Called by the worker after dispatch. Asynchronous payloads are freed here.
+  if (iomsg.sync != MessageIOThread::SYNC_SYNC) {
+    delete static_cast<M*>(iomsg.ptr);
+  } else {
+    // Synchronous payloads are kept; only the waiting enqueuer is woken up.
+    assert(iomsg.sem != nullptr);
+    sem_post(static_cast<sem_t*>(iomsg.sem));
+  }
+}
+
+void IOThread::syncify_message(MessageIOThread &msg) {
+  // Only synchronous messages need a semaphore for the sender to block on.
+  if (msg.sync != MessageIOThread::SYNC_SYNC) return;
+  msg.sem = this->get_notify_sem(pthread_self());
+  assert(msg.sem != nullptr);
+}
+
+template <typename M>
+void IOThread::sync_message(MessageIOThread &msg, MessageIOThread::Sync sync) {
+  if (sync != MessageIOThread::SYNC_SYNC) return;  // asynchronous send: nothing to wait for
+  // Wait for the worker to post msg.sem. Retry on EINTR: returning early
+  // would let the sender reuse a message the worker may still be using.
+  while (sem_wait(static_cast<sem_t*>(msg.sem)) != 0 && errno == EINTR) { }
+}
+
+bool IOThread::enq(MessageIOThread &msg) {  // Appends one envelope to the work queue and signals the worker.
+  pthread_mutex_lock(&_lock);    // _lock is held around every queue access here and in worker()
+  _queue->push(msg);             // the queue stores its own copy of the envelope
+  sem_post(&_block);             // one post per queued envelope; worker() waits on _block
+  pthread_mutex_unlock(&_lock);
+  return true;                   // always reports success
+}
+
+bool IOThread::enqueue(MessageDisk &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) {
+  if (pthread_self() == own_tid) return false;  // on the worker thread: decline, so the bus dispatches directly
+  // Disk requests are always synchronous (error check), whatever 'sync' was passed in.
+  const MessageIOThread::Sync forced = MessageIOThread::SYNC_SYNC;
+  MessageIOThread envelope(MessageIOThread::TYPE_DISK, mode, forced, value, &msg);
+  syncify_message(envelope);
+  this->enq(envelope);
+  sync_message<MessageDisk>(envelope, forced);
+  return true;
+}
+
+bool IOThread::enqueue(MessageDiskCommit &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) {
+  if (pthread_self() == own_tid) return false;  // on the worker thread: decline, so the bus dispatches directly
+  MessageDiskCommit *ptr = (sync == MessageIOThread::SYNC_ASYNC) ? new MessageDiskCommit(msg.disknr, msg.usertag, msg.status) : &msg;  // heap copy only for async sends
+  MessageIOThread enq(MessageIOThread::TYPE_DISKCOMMIT, mode, sync, value, ptr);  // sync_msg() frees async payloads only, so a copy made for a sync send was leaked
+  syncify_message(enq);
+  this->enq(enq);
+  sync_message<MessageDiskCommit>(enq, sync);  // a sync sender blocks here, keeping 'msg' alive for the worker
+  return true;
+}
+
+bool IOThread::enqueue(MessageTime &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) {
+  if (pthread_self() == own_tid) return false;  // on the worker thread: decline, so the bus dispatches directly
+  // Time requests must be synchronous, whatever 'sync' was passed in.
+  const MessageIOThread::Sync forced = MessageIOThread::SYNC_SYNC;
+  MessageIOThread envelope(MessageIOThread::TYPE_TIME, mode, forced, value, &msg);
+  syncify_message(envelope);
+  this->enq(envelope);
+  sync_message<MessageTime>(envelope, forced);
+  return true;
+}
+
+bool IOThread::enqueue(MessageTimer &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) {
+  if (pthread_self() == own_tid) return false;  // on the worker thread: decline, so the bus dispatches directly
+  MessageTimer *ptr;
+  if (msg.type == MessageTimer::TIMER_NEW) sync = MessageIOThread::SYNC_SYNC;  // TIMER_NEW is always sent synchronously
+  else Logging::panic("MessageTimer request nr %u\n", msg.nr);  // every other timer request type is rejected here
+  if (sync == MessageIOThread::SYNC_ASYNC) {  // NOTE(review): only reachable if Logging::panic() returns - confirm
+    ptr = new MessageTimer;
+    memcpy(ptr, &msg, sizeof(msg));
+  } else {
+    ptr = &msg;  // the sender blocks in sync_message(), so its message is used in place
+  }
+  MessageIOThread enq(MessageIOThread::TYPE_TIMER, mode, sync, value, ptr);
+  syncify_message(enq);
+  this->enq(enq);
+  sync_message<MessageTimer>(enq, sync);
+  return true;
+}
+
+bool IOThread::enqueue(MessageTimeout &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) {
+  if (pthread_self() == own_tid) return false;  // on the worker thread: decline, so the bus dispatches directly
+
+  // Async: the worker gets a heap copy, which sync_msg() frees after dispatch.
+  // Sync: the sender blocks until the worker is done, so its message is used in place.
+  const bool detached = (sync == MessageIOThread::SYNC_ASYNC);
+  MessageTimeout *payload = detached ? new MessageTimeout(msg.nr, msg.time) : &msg;
+
+  MessageIOThread envelope(MessageIOThread::TYPE_TIMEOUT, mode, sync, value, payload);
+  syncify_message(envelope);
+  this->enq(envelope);
+  sync_message<MessageTimeout>(envelope, sync);
+  return true;
+}
+
+bool IOThread::enqueue(MessageIOOut &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) {
+  if (pthread_self() == own_tid) return false;  // on the worker thread: decline, so the bus dispatches directly
+  // Sync: the sender blocks until the worker is done, so its message is used in place.
+  // Async: the worker gets a heap copy, which sync_msg() frees after dispatch.
+  MessageIOOut *payload = &msg;
+  if (sync == MessageIOThread::SYNC_ASYNC) {
+    payload = new MessageIOOut(msg.type, msg.port, msg.value);
+    memcpy(payload, &msg, sizeof(msg));  // then overwritten with a bitwise copy of the whole message
+  }
+  MessageIOThread envelope(MessageIOThread::TYPE_IOOUT, mode, sync, value, payload);
+  syncify_message(envelope);
+  this->enq(envelope);
+  sync_message<MessageIOOut>(envelope, sync);
+  return true;
+}
+
+bool IOThread::enqueue(MessageIOIn &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) {
+  if (pthread_self() == own_tid) return false;  // on the worker thread: decline, so the bus dispatches directly
+  // I/O port reads are always synchronous, whatever 'sync' was passed in.
+  const MessageIOThread::Sync forced = MessageIOThread::SYNC_SYNC;
+  MessageIOThread envelope(MessageIOThread::TYPE_IOIN, mode, forced, value, &msg);
+  syncify_message(envelope);
+  this->enq(envelope);
+  sync_message<MessageIOIn>(envelope, forced);
+  return true;
+}
+
+bool IOThread::enqueue(MessageMem &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) {
+  if (pthread_self() == own_tid) return false;  // on the worker thread: decline, so the bus dispatches directly
+  // Memory reads are always synchronous.
+  if (msg.read) sync = MessageIOThread::SYNC_SYNC;
+
+  MessageMem *payload = &msg;  // sync: the blocked sender's message is used in place
+  if (sync == MessageIOThread::SYNC_ASYNC) {
+    assert(!msg.read);
+    // The word behind msg.ptr has to be saved as well, so it gets its own
+    // heap cell; worker() deletes that cell after dispatching the message.
+    unsigned *saved = new unsigned(*msg.ptr);
+    payload = new MessageMem(msg.read, msg.phys, saved);
+  }
+
+  MessageIOThread envelope(MessageIOThread::TYPE_MEM, mode, sync, value, payload);
+  envelope.vcpu = vcpu;  // worker() uses the vCPU's own mem bus when this is set, bus_mem otherwise
+  syncify_message(envelope);
+  this->enq(envelope);
+  sync_message<MessageMem>(envelope, sync);
+  return true;
+}
+
+bool IOThread::enqueue(CpuMessage &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) {
+  if (pthread_self() == own_tid) return false;  // on the worker thread: decline, so the bus dispatches directly
+  // Only RDMSR, WRMSR and CHECK_IRQ are queued; every other CPU message is
+  // declined so that the bus handles it in the calling thread.
+  const bool queued = (msg.type == CpuMessage::TYPE_RDMSR || msg.type == CpuMessage::TYPE_WRMSR || msg.type == CpuMessage::TYPE_CHECK_IRQ);
+  if (!queued) return false;
+
+  // These messages are always synchronous, so the sender's message is used in place.
+  const MessageIOThread::Sync forced = MessageIOThread::SYNC_SYNC;
+  MessageIOThread envelope(MessageIOThread::TYPE_CPU, mode, forced, value, &msg);
+  envelope.vcpu = vcpu;  // worker() panics on a TYPE_CPU envelope without a vCPU
+  syncify_message(envelope);
+  this->enq(envelope);
+  sync_message<CpuMessage>(envelope, forced);
+  return true;
+}
+
+bool IOThread::enqueue(MessageInput &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) {
+  if (pthread_self() == own_tid) return false;  // on the worker thread: decline, so the bus dispatches directly
+
+  // Async: the worker gets a heap copy, which sync_msg() frees after dispatch.
+  // Sync: the sender blocks until the worker is done, so its message is used in place.
+  const bool detached = (sync == MessageIOThread::SYNC_ASYNC);
+  MessageInput *payload = detached ? new MessageInput(msg.device, msg.data) : &msg;
+
+  MessageIOThread envelope(MessageIOThread::TYPE_INPUT, mode, sync, value, payload);
+  syncify_message(envelope);
+  this->enq(envelope);
+  sync_message<MessageInput>(envelope, sync);
+  return true;
+}
+
+bool IOThread::enqueue(MessageIrqLines &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) {
+  if (pthread_self() == own_tid) return false;  // on the worker thread: decline, so the bus dispatches directly
+
+  // Async: the worker gets a heap copy, which sync_msg() frees after dispatch.
+  // Sync: the sender blocks until the worker is done, so its message is used in place.
+  const bool detached = (sync == MessageIOThread::SYNC_ASYNC);
+  MessageIrqLines *payload = detached ? new MessageIrqLines(msg.type, msg.line) : &msg;
+
+  MessageIOThread envelope(MessageIOThread::TYPE_IRQLINES, mode, sync, value, payload);
+  syncify_message(envelope);
+  this->enq(envelope);
+  sync_message<MessageIrqLines>(envelope, sync);
+  return true;
+}
+
+bool IOThread::enqueue(MessageIrqNotify &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) {
+  if (pthread_self() == own_tid) return false;  // on the worker thread: decline, so the bus dispatches directly
+
+  // Async: the worker gets a heap copy, which sync_msg() frees after dispatch.
+  // Sync: the sender blocks until the worker is done, so its message is used in place.
+  const bool detached = (sync == MessageIOThread::SYNC_ASYNC);
+  MessageIrqNotify *payload = detached ? new MessageIrqNotify(msg.baseirq, msg.mask) : &msg;
+
+  MessageIOThread envelope(MessageIOThread::TYPE_IRQNOTIFY, mode, sync, value, payload);
+  syncify_message(envelope);
+  this->enq(envelope);
+  sync_message<MessageIrqNotify>(envelope, sync);
+  return true;
+}
+
+bool IOThread::enqueue(MessageIrq &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) {
+  if (pthread_self() == own_tid) return false;  // on the worker thread: decline, so the bus dispatches directly
+
+  // Async: the worker gets a heap copy, which sync_msg() frees after dispatch.
+  // Sync: the sender blocks until the worker is done, so its message is used in place.
+  const bool detached = (sync == MessageIOThread::SYNC_ASYNC);
+  MessageIrq *payload = detached ? new MessageIrq(msg.type, msg.line) : &msg;
+
+  MessageIOThread envelope(MessageIOThread::TYPE_IRQ, mode, sync, value, payload);
+  syncify_message(envelope);
+  this->enq(envelope);
+  sync_message<MessageIrq>(envelope, sync);
+  return true;
+}
+
+bool IOThread::enqueue(MessageLegacy &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) {
+  if (pthread_self() == own_tid) return false;  // on the worker thread: decline, so the bus dispatches directly
+  // INTA and DEASS_INTR are forced synchronous.
+  if (msg.type == MessageLegacy::INTA || msg.type == MessageLegacy::DEASS_INTR) sync = MessageIOThread::SYNC_SYNC;
+
+  // Async: the worker gets a heap copy, which sync_msg() frees after dispatch.
+  // Sync: the sender blocks until the worker is done, so its message is used in place.
+  const bool detached = (sync == MessageIOThread::SYNC_ASYNC);
+  MessageLegacy *payload = detached ? new MessageLegacy(msg.type, msg.value) : &msg;
+  MessageIOThread envelope(MessageIOThread::TYPE_LEGACY, mode, sync, value, payload);
+  syncify_message(envelope);
+  this->enq(envelope);
+  sync_message<MessageLegacy>(envelope, sync);
+  return true;
+}
+
+bool IOThread::enqueue(MessageNetwork &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) {
+  if (pthread_self() == own_tid) return false;  // on the worker thread: decline, so the bus dispatches directly
+  if (msg.type == MessageNetwork::QUERY_MAC) sync = MessageIOThread::SYNC_SYNC;  // MAC queries are forced synchronous
+  // Sync: the sender blocks until the worker is done, so its message is used in place.
+  // Async: the worker gets a heap copy, which sync_msg() frees after dispatch.
+  MessageNetwork *payload = &msg;
+  if (sync == MessageIOThread::SYNC_ASYNC) {
+    payload = new MessageNetwork(msg.type, msg.client);
+    memcpy(payload, &msg, sizeof(msg));  // then overwritten with a bitwise copy of the whole message
+  }
+  MessageIOThread envelope(MessageIOThread::TYPE_NETWORK, mode, sync, value, payload);
+  syncify_message(envelope);
+  this->enq(envelope);
+  sync_message<MessageNetwork>(envelope, sync);
+  return true;
+}
+
+bool IOThread::enqueue(MessagePciConfig &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) {
+  if (pthread_self() == own_tid) return false;  // on the worker thread: decline, so the bus dispatches directly
+  // Config-space reads are forced synchronous.
+  if (msg.type == MessagePciConfig::TYPE_READ) sync = MessageIOThread::SYNC_SYNC;
+  // Sync: the sender blocks until the worker is done, so its message is used in place.
+  // Async: the worker gets a heap copy, which sync_msg() frees after dispatch.
+  MessagePciConfig *payload = &msg;
+  if (sync == MessageIOThread::SYNC_ASYNC) {
+    payload = new MessagePciConfig(msg.bdf);
+    memcpy(payload, &msg, sizeof(msg));  // then overwritten with a bitwise copy of the whole message
+  }
+  MessageIOThread envelope(MessageIOThread::TYPE_PCICFG, mode, sync, value, payload);
+  syncify_message(envelope);
+  this->enq(envelope);
+  sync_message<MessagePciConfig>(envelope, sync);
+  return true;
+}
+
+bool IOThread::enqueue(MessageHostOp &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) {
+  if (pthread_self() == own_tid) return false;  // on the worker thread: decline, so the bus dispatches directly
+  if (msg.type != MessageHostOp::OP_VCPU_RELEASE) return false;  // only OP_VCPU_RELEASE is queued
+  // Sync uses the blocked sender's message in place; async gets a heap copy that sync_msg() frees.
+  MessageHostOp *payload = &msg;
+  if (sync == MessageIOThread::SYNC_ASYNC) {
+    payload = new MessageHostOp(msg.vcpu);
+    memcpy(payload, &msg, sizeof(msg));  // then overwritten with a bitwise copy of the whole message
+  }
+  MessageIOThread envelope(MessageIOThread::TYPE_HOSTOP, mode, sync, value, payload);
+  syncify_message(envelope);
+  this->enq(envelope);
+  sync_message<MessageHostOp>(envelope, sync);
+  return true;
+}
+
+
+void IOThread::worker() {  // Endless loop: take one envelope off the queue and send its message on the matching bus.
+  own_tid = pthread_self();  // enqueue() compares against this to recognise calls made from this thread
+
+  while (1) {
+    while (sem_wait(&_block) != 0 && errno == EINTR) { }  // retry when a signal interrupts the wait, else front() would hit an empty queue
+    pthread_mutex_lock(&_lock);
+
+    MessageIOThread msg = _queue->front();
+    _queue->pop();
+
+    pthread_mutex_unlock(&_lock);
+
+    // Send message on appropriate bus
+    switch (msg.type) {
+      case MessageIOThread::TYPE_DISK:
+        {
+          MessageDisk *msg2 = reinterpret_cast<MessageDisk*>(msg.ptr);
+          _mb->bus_disk.send_direct(*msg2, msg.mode, msg.value);
+          sync_msg<MessageDisk>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_DISKCOMMIT:
+        {
+          MessageDiskCommit *msg2 = reinterpret_cast<MessageDiskCommit*>(msg.ptr);
+          _mb->bus_diskcommit.send_direct(*msg2, msg.mode, msg.value);
+          sync_msg<MessageDiskCommit>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_TIME:
+        {
+          MessageTime *msg2 = reinterpret_cast<MessageTime*>(msg.ptr);
+          _mb->bus_time.send_direct(*msg2, msg.mode, msg.value);
+          sync_msg<MessageTime>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_TIMER:
+        {
+          MessageTimer *msg2 = reinterpret_cast<MessageTimer*>(msg.ptr);
+          _mb->bus_timer.send_direct(*msg2, msg.mode, msg.value);
+          sync_msg<MessageTimer>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_TIMEOUT:
+        {
+          MessageTimeout *msg2 = reinterpret_cast<MessageTimeout*>(msg.ptr);
+          _mb->bus_timeout.send_direct(*msg2, msg.mode, msg.value);
+          sync_msg<MessageTimeout>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_IOOUT:
+        {
+          MessageIOOut *msg2 = reinterpret_cast<MessageIOOut*>(msg.ptr);
+          _mb->bus_ioout.send_direct(*msg2, msg.mode, msg.value);
+          sync_msg<MessageIOOut>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_IOIN:
+        {
+          MessageIOIn *msg2 = reinterpret_cast<MessageIOIn*>(msg.ptr);
+          _mb->bus_ioin.send_direct(*msg2, msg.mode, msg.value);
+          sync_msg<MessageIOIn>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_MEM:
+        {
+          MessageMem *msg2 = reinterpret_cast<MessageMem*>(msg.ptr);
+          if (msg.vcpu) {
+            msg.vcpu->mem.send_direct(*msg2, msg.mode, msg.value);
+          } else
+            _mb->bus_mem.send_direct(*msg2, msg.mode, msg.value);
+          // Special case: delete saved value
+          if (msg.sync == MessageIOThread::SYNC_ASYNC) delete msg2->ptr;
+          sync_msg<MessageMem>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_CPU:
+        {
+          CpuMessage *msg2 = reinterpret_cast<CpuMessage*>(msg.ptr);
+          if (msg.vcpu)
+            msg.vcpu->executor.send_direct(*msg2, msg.mode, msg.value);
+          else
+            Logging::panic("TYPE_CPU needs a vcpu pointer!\n");
+          sync_msg<CpuMessage>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_INPUT:
+        {
+          MessageInput *msg2 = reinterpret_cast<MessageInput*>(msg.ptr);
+          _mb->bus_input.send_direct(*msg2, msg.mode, msg.value);
+          sync_msg<MessageInput>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_IRQLINES:
+        {
+          MessageIrqLines *msg2 = reinterpret_cast<MessageIrqLines*>(msg.ptr);
+          _mb->bus_irqlines.send_direct(*msg2, msg.mode, msg.value);
+          sync_msg<MessageIrqLines>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_IRQNOTIFY:
+        {
+          MessageIrqNotify *msg2 = reinterpret_cast<MessageIrqNotify*>(msg.ptr);
+          _mb->bus_irqnotify.send_direct(*msg2, msg.mode, msg.value);
+          sync_msg<MessageIrqNotify>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_IRQ:
+        {
+          MessageIrq *msg2 = reinterpret_cast<MessageIrq*>(msg.ptr);
+          _mb->bus_hostirq.send_direct(*msg2, msg.mode, msg.value);
+          sync_msg<MessageIrq>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_LEGACY:
+        {
+          MessageLegacy *msg2 = reinterpret_cast<MessageLegacy*>(msg.ptr);
+          _mb->bus_legacy.send_direct(*msg2, msg.mode, msg.value);
+          sync_msg<MessageLegacy>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_NETWORK:
+        {
+          MessageNetwork *msg2 = reinterpret_cast<MessageNetwork*>(msg.ptr);
+          _mb->bus_network.send_direct(*msg2, msg.mode, msg.value);
+          sync_msg<MessageNetwork>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_PCICFG:
+        {
+          MessagePciConfig *msg2 = reinterpret_cast<MessagePciConfig*>(msg.ptr);
+          _mb->bus_pcicfg.send_direct(*msg2, msg.mode, msg.value);
+          sync_msg<MessagePciConfig>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_HOSTOP:
+        {
+          MessageHostOp *msg2 = reinterpret_cast<MessageHostOp*>(msg.ptr);
+          _mb->bus_hostop.send_direct(*msg2, msg.mode, msg.value);
+          sync_msg<MessageHostOp>(msg);
+        }
+        break;
+
+      default: Logging::panic("Cannot handle type %x %x (size was %lx)!\n", msg.type, msg.mode, _queue->size());
+    }
+  }
+}
diff --git a/unix/iothread.h b/unix/iothread.h
new file mode 100644
index 00000000..9cf49837
--- /dev/null
+++ b/unix/iothread.h
@@ -0,0 +1,99 @@
+/**
+ * I/O Thread
+ *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
+ * This file is part of Seoul.
+ *
+ * Seoul is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Seoul is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details.
+ */
+
+#include <nul/message.h>
+#include <nul/motherboard.h>
+#include <nul/vcpu.h>
+#include <service/logging.h>
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <queue>
+#include <mutex>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <pthread.h>
+#include <semaphore.h>
+#include <errno.h>
+
+class IOThread : public StaticReceiver<IOThread> {  // Queues bus messages and replays them on a single worker thread.
+private:
+  pthread_mutex_t _lock;   // held around every access to _queue
+  sem_t _block;            // posted once per queued envelope; worker() waits on it
+  bool blocking;
+  std::queue<MessageIOThread> *_queue;
+  Motherboard *_mb;
+
+  struct Notify {
+    pthread_t tid;
+    sem_t sem;
+  };
+  std::vector<Notify> _notify;
+
+  pthread_t own_tid;       // id of the thread running worker(); value-initialised until worker() sets it
+
+public:
+  bool enq(MessageIOThread &msg);
+  void syncify_message(MessageIOThread &msg);
+  template <typename M>
+  void sync_message(MessageIOThread &msg, MessageIOThread::Sync sync);
+
+  void init();
+
+  bool enqueue(MessageDisk &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageDiskCommit &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageTime &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageTimer &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageTimeout &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageIOOut &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageIOIn &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageMem &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(CpuMessage &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageInput &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageIrqLines &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageIrqNotify &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageIrq &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageLegacy &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageNetwork &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessagePciConfig &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageHostOp &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+
+  void worker();
+  sem_t *get_notify_sem(pthread_t tid);
+
+  IOThread(Motherboard *mb) : blocking(false), _queue(nullptr), _mb(mb), own_tid() {  // own_tid(): enqueue() reads it before worker() may have run
+    _queue = new std::queue<MessageIOThread>;
+    if (0 != pthread_mutex_init(&_lock, nullptr)) perror("Could not init mutex.");
+    if (0 != sem_init(&_block, 0, 0)) perror("Could not init sem.");
+
+    mb->bus_disk.set_iothread_enqueue(this, enqueue_static<MessageDisk>);
+    mb->bus_diskcommit.set_iothread_enqueue(this, enqueue_static<MessageDiskCommit>);
+    mb->bus_time.set_iothread_enqueue(this, enqueue_static<MessageTime>);
+    mb->bus_timer.set_iothread_enqueue(this, enqueue_static<MessageTimer>);
+    mb->bus_timeout.set_iothread_enqueue(this, enqueue_static<MessageTimeout>);
+    mb->bus_ioout.set_iothread_enqueue(this, enqueue_static<MessageIOOut>);
+    mb->bus_ioin.set_iothread_enqueue(this, enqueue_static<MessageIOIn>);
+    mb->bus_mem.set_iothread_enqueue(this, enqueue_static<MessageMem>);
+    mb->bus_input.set_iothread_enqueue(this, enqueue_static<MessageInput>);
+    mb->bus_irqlines.set_iothread_enqueue(this, enqueue_static<MessageIrqLines>);
+    mb->bus_irqnotify.set_iothread_enqueue(this, enqueue_static<MessageIrqNotify>);
+    mb->bus_legacy.set_iothread_enqueue(this, enqueue_static<MessageLegacy>);
+    mb->bus_network.set_iothread_enqueue(this, enqueue_static<MessageNetwork>);
+    mb->bus_pcicfg.set_iothread_enqueue(this, enqueue_static<MessagePciConfig>);
+    mb->bus_hostop.set_iothread_enqueue(this, enqueue_static<MessageHostOp>);
+  }
+};
diff --git a/unix/main.cc b/unix/main.cc
index 530403dd..6322b44f 100644
--- a/unix/main.cc
+++ b/unix/main.cc
@@ -5,6 +5,7 @@
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
  * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
  *
  * This file is part of Seoul.
  *
@@ -52,6 +53,12 @@
 #include <seoul/unix.h>
 #include <nul/migration.h>
 
+#define USE_IOTHREAD
+
+#ifdef USE_IOTHREAD
+#include "iothread.h"
+#endif
+
 const char version_str[] =
 #include "version.inc"
   ;
@@ -89,7 +96,10 @@ static const char *pc_ps2[] = {
   "rtl8029:,9,0x300",
   "ahci:0xe0800000,14",
   "pmtimer:0x8000",
-  // 1 vCPU
+  // 4 vCPUs
+  "vcpu", "halifax", "vbios", "lapic",
+  "vcpu", "halifax", "vbios", "lapic",
+  "vcpu", "halifax", "vbios", "lapic",
   "vcpu", "halifax", "vbios", "lapic",
   NULL,
   };
@@ -100,9 +110,11 @@ static TimeoutList<32, void> timeouts;
 static timevalue             last_to = ~0ULL;
 static timer_t               timer_id;
 
-
-static Clock                 mb_clock(1000000);   // XXX Use correct frequency
-static Motherboard           mb(&mb_clock, NULL);
+Motherboard                 *mb;
+Clock                       *mb_clock;
+#ifdef USE_IOTHREAD
+IOThread                    *iothread_obj;
+#endif
 
 // Multiboot module data
 
@@ -278,7 +290,7 @@ static std::vector<Vcpu_info> vcpu_info;
 
 static void *migration_thread_fn(void *)
 {
-    _migrator = new Migration(&mb);
+    _migrator = new Migration(mb);
     _migrator->send(_migration_ip, _migration_port);
 
     delete _migrator;
@@ -301,6 +313,14 @@ static void start_migration_to(unsigned ip, unsigned port)
     pthread_setname_np(migthread, "migration");
 }
 
+#ifdef USE_IOTHREAD
+// pthread entry point: runs the worker loop of the global iothread_obj.
+void *iothread_worker(void * /* unused */) {
+  iothread_obj->worker();
+  return NULL;
+}
+#endif
+
 static bool receive(Device *, MessageHostOp &msg)
 {
     bool res = true;
@@ -477,7 +497,7 @@ static bool receive(Device *, MessageHostOp &msg)
     case MessageHostOp::OP_MIGRATION_RETRIEVE_INIT: {
         _migration_port = msg.value;
         _restore_mode = Migration::MODE_RECEIVE;
-        _migrator = new Migration(&mb);
+        _migrator = new Migration(mb);
     }
     break;
     case MessageHostOp::OP_MIGRATION_START: {
@@ -496,7 +516,7 @@ static bool receive(Device *, MessageHostOp &msg)
 
 static void timeout_trigger()
 {
-  timevalue now = mb.clock()->time();
+  timevalue now = mb_clock->time();
 
   // Force time reprogramming. Otherwise, we might not reprogram a
   // timer, if the timeout event reached us too early.
@@ -507,7 +527,7 @@ static void timeout_trigger()
   while ((nr = timeouts.trigger(now))) {
     MessageTimeout msg(nr, timeouts.timeout());
     timeouts.cancel(nr);
-    mb.bus_timeout.send(msg);
+    mb->bus_timeout.send(msg);
   }
 }
 
@@ -516,7 +536,7 @@ static void timeout_request()
 {
   timevalue next_to = timeouts.timeout();
   if (next_to != ~0ULL) {
-    unsigned long long delta = mb_clock.delta(next_to, 1000000000UL);
+    unsigned long long delta = mb_clock->delta(next_to, 1000000000UL);
 
     if (delta == 0) {
       // Timeout pending NOW. Skip programming a timeout.
@@ -570,7 +590,7 @@ static bool receive(Device *, MessageTime &msg)
 {
   struct timeval tv;
   gettimeofday(&tv, NULL);
-  msg.timestamp = mb_clock.clock(MessageTime::FREQUENCY);
+  msg.timestamp = mb_clock->clock(MessageTime::FREQUENCY);
 
   assert(MessageTime::FREQUENCY == 1000000U);
   msg.wallclocktime = (uint64)tv.tv_sec * 1000000 + tv.tv_usec;
@@ -600,7 +620,7 @@ static void *network_io_thread_fn(void *)
     MessageNetwork msg(network_pbuf, res, 0);
 
     pthread_mutex_lock(&irq_mtx);
-    mb.bus_network.send(msg);
+    mb->bus_network.send(msg);
     pthread_mutex_unlock(&irq_mtx);
   }
 
@@ -681,7 +701,7 @@ static bool receive(Device *, MessageDisk &msg)
   }
 
   MessageDiskCommit cmsg(msg.disknr, msg.usertag, status);
-  mb.bus_diskcommit.send(cmsg);
+  mb->bus_diskcommit.send(cmsg);
 
   return true;
 }
@@ -751,15 +771,27 @@ int main(int argc, char **argv)
     return EXIT_FAILURE;
   }
 
+  mb_clock = new Clock(get_tsc_frequency());
+  mb = new Motherboard(mb_clock, NULL);
 
-  mb.bus_hostop .add(nullptr, receive);
-  mb.bus_timer  .add(nullptr, receive);
-  mb.bus_time   .add(nullptr, receive);
+#ifdef USE_IOTHREAD
+  iothread_obj = new IOThread(mb);
+  pthread_t iothread_worker_thread;
+  if (0 != pthread_create(&iothread_worker_thread, NULL, iothread_worker, NULL)) {
+    perror("create iothread_worker failed");
+    return EXIT_FAILURE;
+  }
+  pthread_setname_np(iothread_worker_thread, "iothread_worker");
+#endif
+
+  mb->bus_hostop .add(nullptr, receive);
+  mb->bus_timer  .add(nullptr, receive);
+  mb->bus_time   .add(nullptr, receive);
 
-  mb.bus_network.add(nullptr, receive);
-  mb.bus_disk   .add(nullptr, receive);
+  mb->bus_network.add(nullptr, receive);
+  mb->bus_disk   .add(nullptr, receive);
 
-  mb.bus_restore.add(&timeouts, TimeoutList<32, void>::receive_static<MessageRestore>);
+  mb->bus_restore.add(&timeouts, TimeoutList<32, void>::receive_static<MessageRestore>);
 
   // Synchronization initialization
   if (0 != pthread_mutex_init(&irq_mtx, nullptr)) {
@@ -770,14 +802,19 @@ int main(int argc, char **argv)
 
   // Create standard PC
   for (const char **dev = pc_ps2; *dev != NULL; dev++) {
-    mb.handle_arg(*dev);
+    mb->handle_arg(*dev);
   }
 
   Logging::printf("Devices and %zu virtual CPU%s started successfully.\n",
                   vcpu_info.size(), vcpu_info.size() == 1 ? "" : "s");
 
+#ifdef USE_IOTHREAD
+  // Init I/O thread (vCPU local busses)
+  iothread_obj->init();
+#endif
+
   // init VCPUs
-  for (VCpu *vcpu = mb.last_vcpu; vcpu; vcpu=vcpu->get_last()) {
+  for (VCpu *vcpu = mb->last_vcpu; vcpu; vcpu=vcpu->get_last()) {
     Logging::printf("Initializing virtual CPU %p.\n", vcpu);
 
     // init CPU strings
@@ -799,7 +836,7 @@ int main(int argc, char **argv)
 
   Logging::printf("RESET device state\n");
   MessageLegacy msg2(MessageLegacy::RESET, 0);
-  mb.bus_legacy.send_fifo(msg2);
+  mb->bus_legacy.send_fifo(msg2);
 
   if (_restore_mode != Migration::MODE_OFF) {
       /*
@@ -807,7 +844,7 @@ int main(int argc, char **argv)
        * it is blocked by and catches it into the recall handler.
        */
        MessageLegacy msg3(MessageLegacy::UNLOCK, 0);
-       mb.bus_legacy.send_fifo(msg3);
+       mb->bus_legacy.send_fifo(msg3);
   }
 
   pthread_t iothread;

From 42e2fa94c738e267cc8500c6a3d1ce4f9c2c048b Mon Sep 17 00:00:00 2001
From: Markus Partheymueller <markus.partheymueller@intel.com>
Date: Tue, 13 Aug 2013 17:39:41 +0200
Subject: [PATCH 29/35] Assign vCPUs to different physical cores.

The vCPU threads are now pinned to consecutive physical cores, starting with the core after the one that Vancouver was started on. As a first simple solution, every physical core gets a dedicated timeout object (i.e., a timer session). Later on, this could be restricted to the cores the instance actually runs on.
---
 nre/src/Timeouts.h   | 11 +++++++----
 nre/src/Vancouver.cc | 12 ++++++++----
 nre/src/Vancouver.h  | 14 ++++++++++----
 3 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/nre/src/Timeouts.h b/nre/src/Timeouts.h
index 3cf59177..556a242f 100644
--- a/nre/src/Timeouts.h
+++ b/nre/src/Timeouts.h
@@ -3,6 +3,8 @@
  * Copyright (C) 2007-2009, Bernhard Kauer <bk@vmmon.org>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -34,10 +36,10 @@ class Timeouts {
     };
 
 public:
-    Timeouts(Motherboard &mb)
-        : _mb(mb), _sm(), _timeouts(), _timer("timer"), _last_to(NO_TIMEOUT) {
+    Timeouts(Motherboard &mb, cpu_t cpu)
+        : _mb(mb), _cpu(cpu), _sm(), _timeouts(), _timer("timer"), _last_to(NO_TIMEOUT) {
         nre::Reference<nre::GlobalThread> gt = nre::GlobalThread::create(
-            timer_thread, nre::CPU::current().log_id(), "vmm-timeouts");
+            timer_thread, _cpu, "vmm-timeouts");
         gt->set_tls<Timeouts*>(nre::Thread::TLS_PARAM, this);
         gt->start();
     }
@@ -67,8 +69,9 @@ class Timeouts {
     void program();
 
     Motherboard &_mb;
+    cpu_t _cpu;
     nre::UserSm _sm;
-    nre::TimeoutList<32, void> _timeouts;
+    nre::TimeoutList<64, void> _timeouts;
     nre::TimerSession _timer;
     timevalue_t _last_to;
 };
diff --git a/nre/src/Vancouver.cc b/nre/src/Vancouver.cc
index 574320b0..1d8a962c 100644
--- a/nre/src/Vancouver.cc
+++ b/nre/src/Vancouver.cc
@@ -3,6 +3,8 @@
  * Copyright (C) 2007-2009, Bernhard Kauer <bk@vmmon.org>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -36,6 +38,7 @@ using namespace nre;
 
 static bool initialized = false;
 static size_t ncpu = 1;
+size_t last_cpunr = 0;
 static DataSpace *guest_mem = nullptr;
 static size_t guest_size = 0;
 static size_t console = 1;
@@ -221,7 +224,8 @@ bool Vancouver::receive(MessageHostOp &msg) {
         break;
 
         case MessageHostOp::OP_VCPU_CREATE_BACKEND: {
-            cpu_t cpu = CPU::current().log_id();
+            cpu_t cpu = (++last_cpunr + CPU::current().log_id())%CPU::count();
+            Serial::get() << "Create VCPU pinned to CPU " << fmt(cpu, "%d") << "\n";
             VCPUBackend *v = new VCPUBackend(&_mb, msg.vcpu, nre::Hip::get().has_svm(), cpu);
             msg.value = reinterpret_cast<ulong>(v);
             msg.vcpu->executor.add(this, receive_static<CpuMessage> );
@@ -281,10 +285,10 @@ bool Vancouver::receive(MessageTimer &msg) {
     COUNTER_INC("requestTO");
     switch(msg.type) {
         case MessageTimer::TIMER_NEW:
-            msg.nr = _timeouts.alloc();
+            msg.nr = _timeouts[CPU::current().log_id()]->alloc();
             return true;
         case MessageTimer::TIMER_REQUEST_TIMEOUT:
-            _timeouts.request(msg.nr, msg.abstime);
+            _timeouts[CPU::current().log_id()]->request(msg.nr, msg.abstime);
             break;
         default:
             return false;
@@ -294,7 +298,7 @@ bool Vancouver::receive(MessageTimer &msg) {
 
 bool Vancouver::receive(MessageTime &msg) {
     timevalue_t ts, wallclock;
-    _timeouts.time(ts, wallclock);
+    _timeouts[CPU::current().log_id()]->time(ts, wallclock);
     msg.timestamp = ts;
     msg.wallclocktime = wallclock;
     return true;
diff --git a/nre/src/Vancouver.h b/nre/src/Vancouver.h
index 8c72c61f..7b8519c8 100644
--- a/nre/src/Vancouver.h
+++ b/nre/src/Vancouver.h
@@ -3,6 +3,8 @@
  * Copyright (C) 2007-2009, Bernhard Kauer <bk@vmmon.org>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -40,9 +42,13 @@ class Vancouver : public StaticReceiver<Vancouver> {
 public:
     explicit Vancouver(const char **args, size_t count, size_t console, const nre::String &constitle,
                        size_t fbsize)
-        : _clock(nre::Hip::get().freq_tsc * 1000), _mb(&_clock, nullptr), _timeouts(_mb),
+        : _clock(nre::Hip::get().freq_tsc * 1000), _mb(&_clock, nullptr),
           _conssess("console", console, constitle), _console(this, fbsize), _netsess(),
           _vmmng(), _vcpus(), _stdevs() {
+        _timeouts = new Timeouts *[nre::CPU::count()];
+        for (cpu_t i=0; i<nre::CPU::count(); i++)
+          _timeouts[i] = new Timeouts(_mb, i);
+
         // vmmanager is optional
         try {
             _vmmng = new nre::VMManagerSession("vmmanager");
@@ -81,8 +87,8 @@ class Vancouver : public StaticReceiver<Vancouver> {
     nre::ConsoleSession &console() {
         return _conssess;
     }
-    Timeouts &timeouts() {
-        return _timeouts;
+    Timeouts *timeouts(cpu_t cpu) {
+        return _timeouts[cpu];
     }
     uint64_t generate_mac() {
         static int macs = 0;
@@ -112,7 +118,7 @@ class Vancouver : public StaticReceiver<Vancouver> {
 
     Clock _clock;
     Motherboard _mb;
-    Timeouts _timeouts;
+    Timeouts **_timeouts;
     nre::ConsoleSession _conssess;
     ConsoleBackend _console;
     nre::NetworkSession *_netsess;

From 68184e255804a8853c1914d554c60d3558133ae3 Mon Sep 17 00:00:00 2001
From: Markus Partheymueller <markus.partheymueller@intel.com>
Date: Mon, 4 Nov 2013 14:08:02 +0100
Subject: [PATCH 30/35] Ported I/O thread implementation.

This commit ports the reference implementation of the I/O thread found in the unix frontend to NRE. It places the I/O thread worker on the CPU assigned to Vancouver, leaving the vCPUs on the following CPUs.
---
 nre/src/IOThread.cc  | 523 +++++++++++++++++++++++++++++++++++++++++++
 nre/src/IOThread.h   |  99 ++++++++
 nre/src/Vancouver.cc |   6 +
 nre/src/Vancouver.h  |  15 +-
 4 files changed, 642 insertions(+), 1 deletion(-)
 create mode 100644 nre/src/IOThread.cc
 create mode 100644 nre/src/IOThread.h

diff --git a/nre/src/IOThread.cc b/nre/src/IOThread.cc
new file mode 100644
index 00000000..232f4ce1
--- /dev/null
+++ b/nre/src/IOThread.cc
@@ -0,0 +1,523 @@
+/**
+ * I/O Thread
+ *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
+ * This file is part of Seoul.
+ *
+ * Seoul is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Seoul is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details.
+ */
+
+#include "IOThread.h"
+
+#define IOTHREAD_DEBUG
+
+#ifdef IOTHREAD_DEBUG
+static unsigned long msgcount[20] = {}; // per-type counters, incremented in enq() with msg->type as index
+static unsigned long maxqueue=0; // largest queue length seen by worker()
+static unsigned long sync=0, async=0; // enq() totals: SYNC_SYNC messages vs. all others
+#endif
+
+static bool iothread_init = false; // set by reset() once the per-vCPU busses have been hooked
+
+// Print the debug counters; the body is empty unless IOTHREAD_DEBUG is defined.
+void IOThread::stats() {
+#ifdef IOTHREAD_DEBUG
+  for (unsigned type_nr = 0; type_nr != 17; ++type_nr)
+    Logging::printf("Type %u: Count %lu\n", type_nr, msgcount[type_nr]);
+  Logging::printf("Max queue size: %lu\n", maxqueue);
+  Logging::printf(" Sync messages: %lu\n", sync);
+  Logging::printf("ASync messages: %lu\n", async);
+#endif
+}
+
+void IOThread::reset() {  // Print statistics; the first call also hooks every vCPU's mem and executor bus.
+  stats();
+  if (!iothread_init)
+    for (VCpu *cur = _mb->last_vcpu; cur; cur = cur->get_last()) {
+      cur->mem.set_iothread_enqueue(this, enqueue_static<MessageMem>, cur);
+      cur->executor.set_iothread_enqueue(this, enqueue_static<CpuMessage>, cur);
+    }
+  iothread_init = true;
+}
+
+nre::UserSm *IOThread::get_notify_sem(nre::Utcb *utcb) {  // Return the semaphore registered for utcb, creating and registering one on first use.
+  assert(utcb != 0);
+  nre::ScopedLock<nre::UserSm> guard(&_lock);  // _notify is used by every enqueuing thread; serialize lookup and append (callers must not already hold _lock)
+  for (auto it = _notify.begin(); it != _notify.end(); ++it)
+    if (it->utcb == utcb) return it->sem;
+  Notify *new_notify = new Notify;
+  new_notify->utcb = utcb;
+  new_notify->sem = new nre::UserSm(0);  // created with value 0; sync_message() does down(), sync_msg() does up()
+  _notify.append(new_notify);
+  return new_notify->sem;
+}
+
+// Finish a message the worker has processed: a synchronous message is kept and its
+// sender is woken through the semaphore; otherwise the payload is deleted here.
+template <typename M>
+static void sync_msg(MessageIOThreadEle *iomsg) {
+  if (iomsg->sync != MessageIOThread::SYNC_SYNC) {
+    delete (M*) iomsg->ptr;
+    return;
+  }
+  assert(iomsg->sem != nullptr);
+  reinterpret_cast<nre::UserSm*>(iomsg->sem)->up();
+}
+
+// Give a synchronous message the notification semaphore of the calling thread.
+void IOThread::syncify_message(MessageIOThreadEle *msg) {
+  if (msg->sync != MessageIOThread::SYNC_SYNC) return;
+  msg->sem = get_notify_sem(nre::Thread::current()->utcb());
+  assert(msg->sem != nullptr);
+}
+
+// Synchronous case only: wait on the message's semaphore, then free it. Checks the sync argument, not msg->sync, because worker() deletes asynchronous elements.
+template <typename M>
+void IOThread::sync_message(MessageIOThreadEle *msg, MessageIOThread::Sync sync) {
+  if (sync != MessageIOThread::SYNC_SYNC) return;
+  reinterpret_cast<nre::UserSm*>(msg->sem)->down();
+  delete msg;
+}
+
+// Append msg to the queue while holding _lock and signal _block; always returns true.
+bool IOThread::enq(MessageIOThreadEle *msg) {
+  nre::ScopedLock<nre::UserSm> guard(&_lock);
+#ifdef IOTHREAD_DEBUG
+  ++msgcount[msg->type];
+  unsigned long &total = (msg->sync == MessageIOThread::SYNC_SYNC) ? sync : async;
+  ++total;
+#endif
+  _queue.append(msg);
+  _block.up();
+  return true;
+}
+
+// Queue a disk request and wait for it; always synchronous because of the error check, so the sync argument is overridden.
+bool IOThread::enqueue(MessageDisk &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) {
+  if (own_utcb == nre::Thread::current()->utcb()) return false;
+  sync = MessageIOThread::SYNC_SYNC;
+  MessageIOThreadEle *ele = new MessageIOThreadEle(MessageIOThread::TYPE_DISK, mode, sync, value, &msg);
+  syncify_message(ele);
+  this->enq(ele);
+  sync_message<MessageDisk>(ele, sync);
+  return true;
+}
+
+bool IOThread::enqueue(MessageDiskCommit &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) {  // Queue a disk commit; returns false without queuing when called on the worker thread.
+  if (nre::Thread::current()->utcb() == own_utcb) return false;
+  MessageDiskCommit *ptr = (sync == MessageIOThread::SYNC_SYNC) ? &msg : new MessageDiskCommit(msg.disknr, msg.usertag, msg.status);  // copy only when sync_msg() will delete it; a copy made for a synchronous message was never freed
+  MessageIOThreadEle *enq = new MessageIOThreadEle(MessageIOThread::TYPE_DISKCOMMIT, mode, sync, value, ptr);
+  syncify_message(enq);
+  this->enq(enq);
+  sync_message<MessageDiskCommit>(enq, sync);
+  return true;
+}
+
+// Queue a time query and wait for it; time messages must be synchronous, so the sync argument is overridden.
+bool IOThread::enqueue(MessageTime &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) {
+  if (own_utcb == nre::Thread::current()->utcb()) return false;
+  sync = MessageIOThread::SYNC_SYNC;
+  MessageIOThreadEle *ele = new MessageIOThreadEle(MessageIOThread::TYPE_TIME, mode, sync, value, &msg);
+  syncify_message(ele);
+  this->enq(ele);
+  sync_message<MessageTime>(ele, sync);
+  return true;
+}
+
+// Queue a timer message for the worker; returns false without queuing when called on the worker thread.
+bool IOThread::enqueue(MessageTimer &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) {
+  if (own_utcb == nre::Thread::current()->utcb()) return false;
+  /*
+   * Timer slot requests (TIMER_NEW) are always synchronous.
+   * Timeout requests are the result of an earlier message and should never
+   * be enqueued, so any other type ends in a panic.
+   */
+  if (msg.type != MessageTimer::TIMER_NEW) Logging::panic("MessageTimer request nr %u\n", msg.nr);
+  else sync = MessageIOThread::SYNC_SYNC;
+  MessageTimer *payload = &msg;
+  if (sync == MessageIOThread::SYNC_ASYNC) {
+    payload = new MessageTimer;
+    memcpy(payload, &msg, sizeof(msg));
+  }
+  MessageIOThreadEle *ele = new MessageIOThreadEle(MessageIOThread::TYPE_TIMER, mode, sync, value, payload);
+  syncify_message(ele);
+  this->enq(ele);
+  sync_message<MessageTimer>(ele, sync);
+  return true;
+}
+
+// Queue a timeout message for the worker. Returns false without queuing when
+// called on the worker thread itself.
+bool IOThread::enqueue(MessageTimeout &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) {
+  if (own_utcb == nre::Thread::current()->utcb()) return false;
+  // An asynchronous sender does not wait, so the worker gets a heap copy that sync_msg() deletes.
+  MessageTimeout *payload = &msg;
+  if (sync == MessageIOThread::SYNC_ASYNC)
+    payload = new MessageTimeout(msg.nr, msg.time);
+  MessageIOThreadEle *ele = new MessageIOThreadEle(MessageIOThread::TYPE_TIMEOUT, mode, sync, value, payload);
+  syncify_message(ele);
+  this->enq(ele);
+  sync_message<MessageTimeout>(ele, sync);
+  return true;
+}
+
+// Queue an I/O port write for the worker. Returns false without queuing when
+// called on the worker thread itself.
+bool IOThread::enqueue(MessageIOOut &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) {
+  if (own_utcb == nre::Thread::current()->utcb()) return false;
+  MessageIOOut *payload = &msg;
+  if (sync == MessageIOThread::SYNC_ASYNC) {
+    payload = new MessageIOOut(msg.type, msg.port, msg.value);
+    memcpy(payload, &msg, sizeof(msg));
+  }
+  MessageIOThreadEle *ele = new MessageIOThreadEle(MessageIOThread::TYPE_IOOUT, mode, sync, value, payload);
+  syncify_message(ele);
+  this->enq(ele);
+  sync_message<MessageIOOut>(ele, sync);
+  return true;
+}
+
+// Queue an I/O port read and wait for it; port reads are always synchronous, so the sync argument is overridden.
+bool IOThread::enqueue(MessageIOIn &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) {
+  if (own_utcb == nre::Thread::current()->utcb()) return false;
+  sync = MessageIOThread::SYNC_SYNC;
+  MessageIOThreadEle *ele = new MessageIOThreadEle(MessageIOThread::TYPE_IOIN, mode, sync, value, &msg);
+  syncify_message(ele);
+  this->enq(ele);
+  sync_message<MessageIOIn>(ele, sync);
+  return true;
+}
+
+// Queue a memory access for the worker, tagged with the vCPU whose bus it came from
+// (may be null). Returns false without queuing when called on the worker thread.
+bool IOThread::enqueue(MessageMem &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) {
+  if (own_utcb == nre::Thread::current()->utcb()) return false;
+  // Mem reads are always sync
+  if (msg.read) sync = MessageIOThread::SYNC_SYNC;
+  MessageMem *payload = &msg;
+  if (sync == MessageIOThread::SYNC_ASYNC) {
+    assert(!msg.read);
+    // The value behind msg.ptr must be saved as well; worker() deletes
+    // this saved copy after dispatching the message.
+    unsigned *saved = new unsigned(*msg.ptr);
+    payload = new MessageMem(msg.read, msg.phys, saved);
+  }
+  MessageIOThreadEle *ele = new MessageIOThreadEle(MessageIOThread::TYPE_MEM, mode, sync, value, payload);
+  ele->vcpu = vcpu;
+  syncify_message(ele);
+  this->enq(ele);
+  sync_message<MessageMem>(ele, sync);
+  return true;
+}
+// Queue a CPU message for the worker and wait for it. Only RDMSR, WRMSR and CHECK_IRQ are accepted; other types return false.
+bool IOThread::enqueue(CpuMessage &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu) {
+  if (own_utcb == nre::Thread::current()->utcb()) return false;
+  const bool accepted = msg.type == CpuMessage::TYPE_RDMSR || msg.type == CpuMessage::TYPE_WRMSR || msg.type == CpuMessage::TYPE_CHECK_IRQ;
+  if (!accepted) return false;
+  // These messages are always sync
+  sync = MessageIOThread::SYNC_SYNC;
+  MessageIOThreadEle *ele = new MessageIOThreadEle(MessageIOThread::TYPE_CPU, mode, sync, value, &msg);
+  ele->vcpu = vcpu;
+  syncify_message(ele);
+  this->enq(ele);
+  sync_message<CpuMessage>(ele, sync);
+  return true;
+}
+
+// Queue an input message for the worker. Returns false without queuing when
+// called on the worker thread itself.
+bool IOThread::enqueue(MessageInput &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) {
+  if (own_utcb == nre::Thread::current()->utcb()) return false;
+  // An asynchronous sender does not wait, so the worker gets a heap copy that sync_msg() deletes.
+  MessageInput *payload = &msg;
+  if (sync == MessageIOThread::SYNC_ASYNC)
+    payload = new MessageInput(msg.device, msg.data);
+  MessageIOThreadEle *ele = new MessageIOThreadEle(MessageIOThread::TYPE_INPUT, mode, sync, value, payload);
+  syncify_message(ele);
+  this->enq(ele);
+  sync_message<MessageInput>(ele, sync);
+  return true;
+}
+
+// Queue an IRQ line message for the worker. Returns false without queuing when
+// called on the worker thread itself.
+bool IOThread::enqueue(MessageIrqLines &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) {
+  if (own_utcb == nre::Thread::current()->utcb()) return false;
+  // An asynchronous sender does not wait, so the worker gets a heap copy that sync_msg() deletes.
+  MessageIrqLines *payload = &msg;
+  if (sync == MessageIOThread::SYNC_ASYNC)
+    payload = new MessageIrqLines(msg.type, msg.line);
+  MessageIOThreadEle *ele = new MessageIOThreadEle(MessageIOThread::TYPE_IRQLINES, mode, sync, value, payload);
+  syncify_message(ele);
+  this->enq(ele);
+  sync_message<MessageIrqLines>(ele, sync);
+  return true;
+}
+
+// Queue an IRQ notify message for the worker. Returns false without queuing when
+// called on the worker thread itself.
+bool IOThread::enqueue(MessageIrqNotify &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) {
+  if (own_utcb == nre::Thread::current()->utcb()) return false;
+  // An asynchronous sender does not wait, so the worker gets a heap copy that sync_msg() deletes.
+  MessageIrqNotify *payload = &msg;
+  if (sync == MessageIOThread::SYNC_ASYNC)
+    payload = new MessageIrqNotify(msg.baseirq, msg.mask);
+  MessageIOThreadEle *ele = new MessageIOThreadEle(MessageIOThread::TYPE_IRQNOTIFY, mode, sync, value, payload);
+  syncify_message(ele);
+  this->enq(ele);
+  sync_message<MessageIrqNotify>(ele, sync);
+  return true;
+}
+
+// Queue a host IRQ message for the worker. Returns false without queuing when
+// called on the worker thread itself.
+bool IOThread::enqueue(MessageIrq &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) {
+  if (own_utcb == nre::Thread::current()->utcb()) return false;
+  // An asynchronous sender does not wait, so the worker gets a heap copy that sync_msg() deletes.
+  MessageIrq *payload = &msg;
+  if (sync == MessageIOThread::SYNC_ASYNC)
+    payload = new MessageIrq(msg.type, msg.line);
+  MessageIOThreadEle *ele = new MessageIOThreadEle(MessageIOThread::TYPE_IRQ, mode, sync, value, payload);
+  syncify_message(ele);
+  this->enq(ele);
+  sync_message<MessageIrq>(ele, sync);
+  return true;
+}
+
+// Queue a legacy message for the worker. Returns false without queuing when
+// called on the worker thread itself. INTA and DEASS_INTR are forced synchronous.
+bool IOThread::enqueue(MessageLegacy &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) {
+  if (own_utcb == nre::Thread::current()->utcb()) return false;
+  const bool force_sync = (msg.type == MessageLegacy::INTA || msg.type == MessageLegacy::DEASS_INTR);
+  if (force_sync) sync = MessageIOThread::SYNC_SYNC;
+  MessageLegacy *payload = &msg;
+  if (sync == MessageIOThread::SYNC_ASYNC)
+    payload = new MessageLegacy(msg.type, msg.value);
+  MessageIOThreadEle *ele = new MessageIOThreadEle(MessageIOThread::TYPE_LEGACY, mode, sync, value, payload);
+  syncify_message(ele);
+  this->enq(ele);
+  sync_message<MessageLegacy>(ele, sync);
+  return true;
+}
+
+// Queue a network message for the worker. Returns false without queuing when
+// called on the worker thread itself. QUERY_MAC is forced synchronous.
+bool IOThread::enqueue(MessageNetwork &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) {
+  if (own_utcb == nre::Thread::current()->utcb()) return false;
+  if (msg.type == MessageNetwork::QUERY_MAC) sync = MessageIOThread::SYNC_SYNC;
+  MessageNetwork *payload = &msg;
+  if (sync == MessageIOThread::SYNC_ASYNC) {
+    payload = new MessageNetwork(msg.type, msg.client);
+    memcpy(payload, &msg, sizeof(msg));
+  }
+  MessageIOThreadEle *ele = new MessageIOThreadEle(MessageIOThread::TYPE_NETWORK, mode, sync, value, payload);
+  syncify_message(ele);
+  this->enq(ele);
+  sync_message<MessageNetwork>(ele, sync);
+  return true;
+}
+
+// Queue a PCI config space access for the worker. Returns false without queuing
+// when called on the worker thread itself.
+bool IOThread::enqueue(MessagePciConfig &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) {
+  if (own_utcb == nre::Thread::current()->utcb()) return false;
+  // Reads are sync
+  if (msg.type == MessagePciConfig::TYPE_READ) sync = MessageIOThread::SYNC_SYNC;
+  MessagePciConfig *payload = &msg;
+  if (sync == MessageIOThread::SYNC_ASYNC) {
+    payload = new MessagePciConfig(msg.bdf);
+    memcpy(payload, &msg, sizeof(msg));
+  }
+  MessageIOThreadEle *ele = new MessageIOThreadEle(MessageIOThread::TYPE_PCICFG, mode, sync, value, payload);
+  syncify_message(ele);
+  this->enq(ele);
+  sync_message<MessagePciConfig>(ele, sync);
+  return true;
+}
+
+// Queue a host operation for the worker. Only OP_VCPU_RELEASE is accepted; other types, and calls made on the worker thread, return false.
+bool IOThread::enqueue(MessageHostOp &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu*) {
+  if (own_utcb == nre::Thread::current()->utcb()) return false;
+  if (msg.type != MessageHostOp::OP_VCPU_RELEASE) return false;
+  MessageHostOp *payload = &msg;
+  if (sync == MessageIOThread::SYNC_ASYNC) {
+    payload = new MessageHostOp(msg.vcpu);
+    memcpy(payload, &msg, sizeof(msg));
+  }
+  MessageIOThreadEle *ele = new MessageIOThreadEle(MessageIOThread::TYPE_HOSTOP, mode, sync, value, payload);
+  syncify_message(ele);
+  this->enq(ele);
+  sync_message<MessageHostOp>(ele, sync);
+  return true;
+}
+
+void IOThread::worker() { // Endless loop: take one queued message at a time and send it directly on the bus matching its type.
+  // Record our own UTCB so that enqueue() can detect messages sent by the worker itself; those must not be enqueued.
+  own_utcb = nre::Thread::current()->utcb();
+
+  MessageIOThreadEle *msg;
+  MessageIOThread::Sync sync;
+  MessageIOThread::Type type;
+  while (1) {
+    _block.down(); // enq() does one up() per appended message
+
+    {
+      nre::ScopedLock<nre::UserSm> lock(&_lock); // the queue is only touched while holding _lock
+      assert(_queue.length() > 0);
+#ifdef IOTHREAD_DEBUG
+      if (_queue.length() > maxqueue) maxqueue = _queue.length();
+#endif
+
+      auto it = _queue.begin();
+      msg = &*it; // first element of the list
+      _queue.remove(msg);
+      sync = msg->sync; // local copies: for a synchronous message the sender deletes msg (sync_message()) once sync_msg() has woken it
+      type = msg->type;
+    }
+
+    // Send message on appropriate bus
+    switch (msg->type) {
+      case MessageIOThread::TYPE_DISK:
+        {
+          MessageDisk *msg2 = reinterpret_cast<MessageDisk*>(msg->ptr);
+          _mb->bus_disk.send_direct(*msg2, msg->mode, msg->value);
+          sync_msg<MessageDisk>(msg); // wakes a synchronous sender, otherwise deletes the payload
+        }
+        break;
+      case MessageIOThread::TYPE_DISKCOMMIT:
+        {
+          MessageDiskCommit *msg2 = reinterpret_cast<MessageDiskCommit*>(msg->ptr);
+          _mb->bus_diskcommit.send_direct(*msg2, msg->mode, msg->value);
+          sync_msg<MessageDiskCommit>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_TIME:
+        {
+          MessageTime *msg2 = reinterpret_cast<MessageTime*>(msg->ptr);
+          _mb->bus_time.send_direct(*msg2, msg->mode, msg->value);
+          sync_msg<MessageTime>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_TIMER:
+        {
+          MessageTimer *msg2 = reinterpret_cast<MessageTimer*>(msg->ptr);
+          _mb->bus_timer.send_direct(*msg2, msg->mode, msg->value);
+          sync_msg<MessageTimer>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_TIMEOUT:
+        {
+          MessageTimeout *msg2 = reinterpret_cast<MessageTimeout*>(msg->ptr);
+          _mb->bus_timeout.send_direct(*msg2, msg->mode, msg->value);
+          sync_msg<MessageTimeout>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_IOOUT:
+        {
+          MessageIOOut *msg2 = reinterpret_cast<MessageIOOut*>(msg->ptr);
+          _mb->bus_ioout.send_direct(*msg2, msg->mode, msg->value);
+          sync_msg<MessageIOOut>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_IOIN:
+        {
+          MessageIOIn *msg2 = reinterpret_cast<MessageIOIn*>(msg->ptr);
+          _mb->bus_ioin.send_direct(*msg2, msg->mode, msg->value);
+          sync_msg<MessageIOIn>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_MEM:
+        {
+          MessageMem *msg2 = reinterpret_cast<MessageMem*>(msg->ptr);
+          if (msg->vcpu) // message came from a vCPU-local bus: send it on that vCPU's mem bus
+            msg->vcpu->mem.send_direct(*msg2, msg->mode, msg->value);
+          else
+            _mb->bus_mem.send_direct(*msg2, msg->mode, msg->value);
+          // Special case: delete saved value
+          if (msg->sync == MessageIOThread::SYNC_ASYNC) delete msg2->ptr; // the copy allocated by enqueue(MessageMem&) for asynchronous writes
+          sync_msg<MessageMem>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_CPU:
+        {
+          CpuMessage *msg2 = reinterpret_cast<CpuMessage*>(msg->ptr);
+          if (msg->vcpu)
+            msg->vcpu->executor.send_direct(*msg2, msg->mode, msg->value);
+          else
+            Logging::panic("TYPE_CPU needs a vcpu pointer!\n");
+          sync_msg<CpuMessage>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_INPUT:
+        {
+          MessageInput *msg2 = reinterpret_cast<MessageInput*>(msg->ptr);
+          _mb->bus_input.send_direct(*msg2, msg->mode, msg->value);
+          sync_msg<MessageInput>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_IRQLINES:
+        {
+          MessageIrqLines *msg2 = reinterpret_cast<MessageIrqLines*>(msg->ptr);
+          _mb->bus_irqlines.send_direct(*msg2, msg->mode, msg->value);
+          sync_msg<MessageIrqLines>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_IRQNOTIFY:
+        {
+          MessageIrqNotify *msg2 = reinterpret_cast<MessageIrqNotify*>(msg->ptr);
+          _mb->bus_irqnotify.send_direct(*msg2, msg->mode, msg->value);
+          sync_msg<MessageIrqNotify>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_IRQ:
+        {
+          MessageIrq *msg2 = reinterpret_cast<MessageIrq*>(msg->ptr);
+          _mb->bus_hostirq.send_direct(*msg2, msg->mode, msg->value); // TYPE_IRQ is delivered on bus_hostirq
+          sync_msg<MessageIrq>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_LEGACY:
+        {
+          MessageLegacy *msg2 = reinterpret_cast<MessageLegacy*>(msg->ptr);
+          _mb->bus_legacy.send_direct(*msg2, msg->mode, msg->value);
+          sync_msg<MessageLegacy>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_NETWORK:
+        {
+          MessageNetwork *msg2 = reinterpret_cast<MessageNetwork*>(msg->ptr);
+          _mb->bus_network.send_direct(*msg2, msg->mode, msg->value);
+          sync_msg<MessageNetwork>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_PCICFG:
+        {
+          MessagePciConfig *msg2 = reinterpret_cast<MessagePciConfig*>(msg->ptr);
+          _mb->bus_pcicfg.send_direct(*msg2, msg->mode, msg->value);
+          sync_msg<MessagePciConfig>(msg);
+        }
+        break;
+      case MessageIOThread::TYPE_HOSTOP:
+        {
+          MessageHostOp *msg2 = reinterpret_cast<MessageHostOp*>(msg->ptr);
+          _mb->bus_hostop.send_direct(*msg2, msg->mode, msg->value);
+          sync_msg<MessageHostOp>(msg);
+        }
+        break;
+
+      default:
+        Logging::panic("Cannot handle type %x %x (size is %lx)!\n", type, sync, _queue.length());
+    }
+    if (sync == MessageIOThread::SYNC_ASYNC) { // asynchronous elements are freed here; the local copy is tested because msg may already be gone in the synchronous case
+      delete msg;
+    }
+  }
+}
diff --git a/nre/src/IOThread.h b/nre/src/IOThread.h
new file mode 100644
index 00000000..2670e5d2
--- /dev/null
+++ b/nre/src/IOThread.h
@@ -0,0 +1,99 @@
+/**
+ * I/O Thread
+ *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
+ * This file is part of Seoul.
+ *
+ * Seoul is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Seoul is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details.
+ */
+
+#include <nul/message.h>
+#include <nul/motherboard.h>
+#include <nul/vcpu.h>
+#include <service/logging.h>
+
+#include <kobj/UserSm.h>
+#include <kobj/GlobalThread.h>
+#include <collection/SList.h>
+
+class MessageIOThreadEle : public MessageIOThread, public nre::SListItem {  // MessageIOThread that is also an nre::SListItem, so it can be linked into an nre::SList
+public:
+  MessageIOThreadEle(Type _type, Mode _mode, Sync _sync, void *_ptr) : MessageIOThread(_type, _mode, _sync, _ptr) {}  // forwards all arguments to the base constructor (no value pointer)
+  MessageIOThreadEle(Type _type, Mode _mode, Sync _sync, unsigned *_value, void *_ptr) : MessageIOThread(_type, _mode, _sync, _value, _ptr) {}  // same, additionally forwarding _value
+};
+
+class Notify : public nre::SListItem {  // list node pairing a UTCB pointer with a semaphore pointer
+public:
+  nre::Utcb *utcb;   // presumably the key used by IOThread::get_notify_sem(utcb) -- confirm in IOThread.cc
+  nre::UserSm *sem;  // semaphore stored alongside that UTCB
+};
+
+class IOThread : public StaticReceiver<IOThread>, public nre::SListItem {  // queue of MessageIOThreadEle items plus the worker() that drains it
+private:
+  nre::UserSm _lock;   // constructed with initial value 1
+  nre::UserSm _block;  // constructed with initial value 0
+  bool blocking;
+  nre::SList<MessageIOThreadEle> _queue;
+  Motherboard *_mb;    // motherboard whose buses are hooked up in the constructor
+
+  nre::SList<Notify> _notify;
+  nre::Utcb *own_utcb; // nullptr until assigned (presumably by the worker thread -- confirm in IOThread.cc)
+
+public:
+  bool enq(MessageIOThreadEle *msg);
+  void syncify_message(MessageIOThreadEle *msg);
+  template <typename M>
+  void sync_message(MessageIOThreadEle *msg, MessageIOThread::Sync sync);
+
+  void stats();
+
+  void reset();
+  // One enqueue overload per message type. NOTE(review): CpuMessage has an overload but no bus registration in the constructor.
+  bool enqueue(MessageDisk &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageDiskCommit &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageTime &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageTimer &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageTimeout &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageIOOut &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageIOIn &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageMem &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(CpuMessage &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageInput &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageIrqLines &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageIrqNotify &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageIrq &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageLegacy &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageNetwork &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessagePciConfig &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+  bool enqueue(MessageHostOp &msg, MessageIOThread::Mode mode, MessageIOThread::Sync sync, unsigned *value, VCpu *vcpu);
+
+  void worker();
+  nre::UserSm *get_notify_sem(nre::Utcb *utcb);
+  // Registers this object with enqueue_static<M> on sixteen motherboard buses; own_utcb starts as nullptr instead of an indeterminate value.
+  IOThread(Motherboard *mb) : _lock(1), _block(0), blocking(false), _queue(), _mb(mb), _notify(), own_utcb(nullptr) {
+    mb->bus_disk.set_iothread_enqueue(this, enqueue_static<MessageDisk>);
+    mb->bus_diskcommit.set_iothread_enqueue(this, enqueue_static<MessageDiskCommit>);
+    mb->bus_time.set_iothread_enqueue(this, enqueue_static<MessageTime>);
+    mb->bus_timer.set_iothread_enqueue(this, enqueue_static<MessageTimer>);
+    mb->bus_timeout.set_iothread_enqueue(this, enqueue_static<MessageTimeout>);
+    mb->bus_ioout.set_iothread_enqueue(this, enqueue_static<MessageIOOut>);
+    mb->bus_ioin.set_iothread_enqueue(this, enqueue_static<MessageIOIn>);
+    mb->bus_mem.set_iothread_enqueue(this, enqueue_static<MessageMem>);
+    mb->bus_input.set_iothread_enqueue(this, enqueue_static<MessageInput>);
+    mb->bus_irqlines.set_iothread_enqueue(this, enqueue_static<MessageIrqLines>);
+    mb->bus_irqnotify.set_iothread_enqueue(this, enqueue_static<MessageIrqNotify>);
+    mb->bus_hostirq.set_iothread_enqueue(this, enqueue_static<MessageIrq>);
+    mb->bus_legacy.set_iothread_enqueue(this, enqueue_static<MessageLegacy>);
+    mb->bus_network.set_iothread_enqueue(this, enqueue_static<MessageNetwork>);
+    mb->bus_pcicfg.set_iothread_enqueue(this, enqueue_static<MessagePciConfig>);
+    mb->bus_hostop.set_iothread_enqueue(this, enqueue_static<MessageHostOp>);
+  }
+};
diff --git a/nre/src/Vancouver.cc b/nre/src/Vancouver.cc
index 1d8a962c..f056ee39 100644
--- a/nre/src/Vancouver.cc
+++ b/nre/src/Vancouver.cc
@@ -308,6 +308,7 @@ bool Vancouver::receive(MessageLegacy &msg) {
     if(msg.type != MessageLegacy::RESET)
         return false;
     // TODO ??
+    _iothread_obj->reset();
     return true;
 }
 
@@ -515,6 +516,11 @@ void Vancouver::create_vcpus() {
     }
 }
 
+void Vancouver::iothread_worker(void *) {  // thread entry point: runs the IOThread worker of the Vancouver instance stored in this thread's TLS
+    Vancouver *vc = Thread::current()->get_tls<Vancouver*>(Thread::TLS_PARAM);  // the creator stores `this` in TLS_PARAM before starting the thread
+    vc->iothread()->worker();
+}
+
 int main(int argc, char **argv) {
     size_t fbsize = ExecEnv::PAGE_SIZE * nre::VGAStream::PAGES;
     for(int i = 1; i < argc; ++i) {
diff --git a/nre/src/Vancouver.h b/nre/src/Vancouver.h
index 7b8519c8..6e56c868 100644
--- a/nre/src/Vancouver.h
+++ b/nre/src/Vancouver.h
@@ -33,6 +33,7 @@
 #include "StorageDevice.h"
 #include "VCPUBackend.h"
 #include "ConsoleBackend.h"
+#include "IOThread.h"
 
 extern nre::UserSm globalsm;
 
@@ -42,9 +43,18 @@ class Vancouver : public StaticReceiver<Vancouver> {
 public:
     explicit Vancouver(const char **args, size_t count, size_t console, const nre::String &constitle,
                        size_t fbsize)
-        : _clock(nre::Hip::get().freq_tsc * 1000), _mb(&_clock, nullptr),
+        : _clock(nre::Hip::get().freq_tsc * 1000), _mb(&_clock, nullptr), _iothread_obj(nullptr),
           _conssess("console", console, constitle), _console(this, fbsize), _netsess(),
           _vmmng(), _vcpus(), _stdevs() {
+
+        _iothread_obj = new IOThread(&_mb);
+
+        // IOThread
+        nre::Reference<nre::GlobalThread> io = nre::GlobalThread::create(
+            iothread_worker, nre::CPU::current().log_id(), "vmm-io");
+        io->set_tls<Vancouver*>(nre::Thread::TLS_PARAM, this);
+        io->start();
+
         _timeouts = new Timeouts *[nre::CPU::count()];
         for (cpu_t i=0; i<nre::CPU::count(); i++)
           _timeouts[i] = new Timeouts(_mb, i);
@@ -96,6 +106,7 @@ class Vancouver : public StaticReceiver<Vancouver> {
             return _vmmng->generate_mac().raw();
         return BASE_MAC + macs++;
     }
+    IOThread *iothread() { return _iothread_obj; }
 
     void reset();
     bool receive(CpuMessage &msg);
@@ -112,12 +123,14 @@ class Vancouver : public StaticReceiver<Vancouver> {
 private:
     static void network_thread(void*);
     static void keyboard_thread(void*);
+    static void iothread_worker(void*);
     static void vmmng_thread(void*);
     void create_devices(const char **args, size_t count);
     void create_vcpus();
 
     Clock _clock;
     Motherboard _mb;
+    IOThread *_iothread_obj;
     Timeouts **_timeouts;
     nre::ConsoleSession _conssess;
     ConsoleBackend _console;

From ba61be07ed181d1196c5fd00cba70291cbe9fb3d Mon Sep 17 00:00:00 2001
From: Markus Partheymueller <markus.partheymueller@intel.com>
Date: Mon, 4 Nov 2013 14:25:51 +0100
Subject: [PATCH 31/35] Give I/O threads higher priority.

Sporadic event handlers should have higher priority. In the case of the I/O thread, this is important when it is colocated with a vCPU on the same CPU (which is not advised). For timers, this can help avoid timing issues when the VM busy-waits on timer events.
---
 nre/src/StorageDevice.h | 4 +++-
 nre/src/Timeouts.h      | 2 +-
 nre/src/Vancouver.h     | 4 ++--
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/nre/src/StorageDevice.h b/nre/src/StorageDevice.h
index fa248c8a..27a0a00a 100644
--- a/nre/src/StorageDevice.h
+++ b/nre/src/StorageDevice.h
@@ -2,6 +2,8 @@
  * Copyright (C) 2012, Nils Asmussen <nils@os.inf.tu-dresden.de>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
  * This file is part of NRE (NOVA runtime environment).
  *
  * NRE is free software: you can redistribute it and/or modify
@@ -34,7 +36,7 @@ class StorageDevice {
         nre::Reference<nre::GlobalThread> gt = nre::GlobalThread::create(
             thread, nre::CPU::current().log_id(), buffer);
         gt->set_tls<StorageDevice*>(nre::Thread::TLS_PARAM, this);
-        gt->start();
+        gt->start(nre::Qpd(2, 10000));
     }
 
     MessageDisk::Status get_params(DiskParameter &params) {
diff --git a/nre/src/Timeouts.h b/nre/src/Timeouts.h
index 556a242f..bdc0a74b 100644
--- a/nre/src/Timeouts.h
+++ b/nre/src/Timeouts.h
@@ -41,7 +41,7 @@ class Timeouts {
         nre::Reference<nre::GlobalThread> gt = nre::GlobalThread::create(
             timer_thread, _cpu, "vmm-timeouts");
         gt->set_tls<Timeouts*>(nre::Thread::TLS_PARAM, this);
-        gt->start();
+        gt->start(nre::Qpd(2, 10000));
     }
 
     nre::TimerSession &session() {
diff --git a/nre/src/Vancouver.h b/nre/src/Vancouver.h
index 6e56c868..e3f26163 100644
--- a/nre/src/Vancouver.h
+++ b/nre/src/Vancouver.h
@@ -53,7 +53,7 @@ class Vancouver : public StaticReceiver<Vancouver> {
         nre::Reference<nre::GlobalThread> io = nre::GlobalThread::create(
             iothread_worker, nre::CPU::current().log_id(), "vmm-io");
         io->set_tls<Vancouver*>(nre::Thread::TLS_PARAM, this);
-        io->start();
+        io->start(nre::Qpd(2, 10000));
 
         _timeouts = new Timeouts *[nre::CPU::count()];
         for (cpu_t i=0; i<nre::CPU::count(); i++)
@@ -79,7 +79,7 @@ class Vancouver : public StaticReceiver<Vancouver> {
             nre::Reference<nre::GlobalThread> network = nre::GlobalThread::create(
                 network_thread, nre::CPU::current().log_id(), "vmm-network");
             network->set_tls<Vancouver*>(nre::Thread::TLS_PARAM, this);
-            network->start();
+            network->start(nre::Qpd(2, 10000));
         }
         catch(const nre::Exception &e) {
             nre::Serial::get() << "Unable to connect to network: " << e.msg() << "\n";

From 2457cc07e798b437e519f29f2f1104587cb1c090 Mon Sep 17 00:00:00 2001
From: Markus Partheymueller <markus.partheymueller@intel.com>
Date: Mon, 4 Nov 2013 14:34:40 +0100
Subject: [PATCH 32/35] Remove global lock.

Because the synchronization is now provided by the I/O thread, it is safe to remove the global lock.
---
 nre/src/StorageDevice.h |  1 -
 nre/src/Timeouts.cc     |  3 ++-
 nre/src/Timeouts.h      |  2 --
 nre/src/VCPUBackend.cc  |  5 ++---
 nre/src/Vancouver.cc    | 12 +++++-------
 5 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/nre/src/StorageDevice.h b/nre/src/StorageDevice.h
index 27a0a00a..d25f500a 100644
--- a/nre/src/StorageDevice.h
+++ b/nre/src/StorageDevice.h
@@ -80,7 +80,6 @@ class StorageDevice {
             nre::Storage::Packet *pk = sd->_sess.consumer().get();
             // the status isn't used anyway
             {
-                nre::ScopedLock<nre::UserSm> guard(&globalsm);
                 MessageDiskCommit msg(sd->_no, pk->tag, MessageDisk::DISK_OK);
                 sd->_bus.send(msg);
             }
diff --git a/nre/src/Timeouts.cc b/nre/src/Timeouts.cc
index c6659a83..5d4bfaa6 100644
--- a/nre/src/Timeouts.cc
+++ b/nre/src/Timeouts.cc
@@ -3,6 +3,8 @@
  * Copyright (C) 2007-2009, Bernhard Kauer <bk@vmmon.org>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -35,7 +37,6 @@ void Timeouts::timer_thread(void*) {
 }
 
 void Timeouts::trigger() {
-    ScopedLock<UserSm> guard(&globalsm);
     // TODO it can't be correct to not grab _sm here, because we might access stuff from
     // different threads here. but if we grab it here, we deadlock ourself because the devices
     // on the bus might call e.g. alloc().
diff --git a/nre/src/Timeouts.h b/nre/src/Timeouts.h
index bdc0a74b..8890fcf1 100644
--- a/nre/src/Timeouts.h
+++ b/nre/src/Timeouts.h
@@ -28,8 +28,6 @@
 
 #include <nul/motherboard.h>
 
-extern nre::UserSm globalsm;
-
 class Timeouts {
     enum {
         NO_TIMEOUT  = ~0ULL
diff --git a/nre/src/VCPUBackend.cc b/nre/src/VCPUBackend.cc
index e8ee126d..9b0d3e91 100644
--- a/nre/src/VCPUBackend.cc
+++ b/nre/src/VCPUBackend.cc
@@ -3,6 +3,8 @@
  * Copyright (C) 2007-2009, Bernhard Kauer <bk@vmmon.org>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -80,7 +82,6 @@ void VCPUBackend::handle_io(bool is_in, unsigned io_order, unsigned port) {
                    io_order, port, &uf->eax, uf->mtd);
     skip_instruction(msg);
     {
-        ScopedLock<UserSm> guard(&globalsm);
         if(!vcpu->executor.send(msg, true))
             Util::panic("nobody to execute %s at %x:%x\n", __func__, msg.cpu->cs.sel, msg.cpu->eip);
     }
@@ -97,8 +98,6 @@ void VCPUBackend::handle_vcpu(capsel_t pid, bool skip, CpuMessage::Type type) {
     if(skip)
         skip_instruction(msg);
 
-    ScopedLock<UserSm> guard(&globalsm);
-
     /**
      * Send the message to the VCpu.
      */
diff --git a/nre/src/Vancouver.cc b/nre/src/Vancouver.cc
index f056ee39..72e08add 100644
--- a/nre/src/Vancouver.cc
+++ b/nre/src/Vancouver.cc
@@ -235,17 +235,17 @@ bool Vancouver::receive(MessageHostOp &msg) {
 
         case MessageHostOp::OP_VCPU_BLOCK: {
             VCPUBackend *v = reinterpret_cast<VCPUBackend*>(msg.value);
-            globalsm.up();
-            v->sm().down();
-            globalsm.down();
+            bool block = !initialized;
+            if (block) globalsm.up();
+            v->sm().zero();
+            if (block) globalsm.down();
             res = true;
         }
         break;
 
         case MessageHostOp::OP_VCPU_RELEASE: {
             VCPUBackend *v = reinterpret_cast<VCPUBackend*>(msg.value);
-            if(msg.len)
-                v->sm().up();
+            v->sm().up();
             v->vcpu().recall();
             res = true;
         }
@@ -419,7 +419,6 @@ void Vancouver::network_thread(void*) {
             break;
 
         {
-            ScopedLock<UserSm> guard(&globalsm);
             MessageNetwork msg(packet, len, 0);
             vc->_mb.bus_network.send(msg);
         }
@@ -456,7 +455,6 @@ void Vancouver::keyboard_thread(void*) {
             }
         }
 
-        ScopedLock<UserSm> guard(&globalsm);
         MessageInput msg(0x10000, pk.scancode | pk.flags);
         vc->_mb.bus_input.send(msg);
     }

From 46f09fb1c2a55b0bd1e7a4062e0694bd18d1b0f4 Mon Sep 17 00:00:00 2001
From: Markus Partheymueller <markus.partheymueller@intel.com>
Date: Tue, 12 Nov 2013 14:18:18 +0100
Subject: [PATCH 33/35] Added synthetic testing environment.

To help convert the vCPU and (LA)PIC subsystems to atomic operations or fine-grained locking instead of a global lock, this synthetic testing utility stresses the respective device models in isolation and allows targeted development cycles against them.
---
 test/Makefile   |  48 ++++++++++
 test/ioapic.cc  | 168 ++++++++++++++++++++++++++++++++++
 test/ioapic.h   |  77 ++++++++++++++++
 test/lapic.cc   | 238 ++++++++++++++++++++++++++++++++++++++++++++++++
 test/lapic.h    |  85 +++++++++++++++++
 test/logging.cc |  50 ++++++++++
 test/main.cc    |  59 ++++++++++++
 test/pic.cc     | 196 +++++++++++++++++++++++++++++++++++++++
 test/pic.h      |  83 +++++++++++++++++
 9 files changed, 1004 insertions(+)
 create mode 100644 test/Makefile
 create mode 100644 test/ioapic.cc
 create mode 100644 test/ioapic.h
 create mode 100644 test/lapic.cc
 create mode 100644 test/lapic.h
 create mode 100644 test/logging.cc
 create mode 100644 test/main.cc
 create mode 100644 test/pic.cc
 create mode 100644 test/pic.h

diff --git a/test/Makefile b/test/Makefile
new file mode 100644
index 00000000..cd7be9f9
--- /dev/null
+++ b/test/Makefile
@@ -0,0 +1,48 @@
+CC=g++
+CFLAGS=-g -O3 -std=gnu++11 -gdwarf-2 -ggdb3
+CFLAGS_NOOPT=-g -O0 -std=gnu++11 -gdwarf-2 -ggdb3
+INCLUDES=-I ../include/ -I ../unix/include/
+LIBS=-pthread
+.PHONY: all pic lapic ioapic runpic runioapic runlapic clean # none of these targets creates a file named after itself
+all: pic lapic ioapic
+# pic/lapic/ioapic link pictest.bin/lapictest.bin/ioapictest.bin; run<name> executes that binary with stderr redirected to log.txt
+pic: pic.o logging.o params.o pic8259.o
+	$(CC) $(CFLAGS) $(INCLUDES) $(LIBS) -DPICTEST \
+		main.cc pic.o \
+		params.o logging.o pic8259.o -o pictest.bin
+
+pic.o: pic.cc pic.h
+	$(CC) $(CFLAGS) $(INCLUDES) $(LIBS) -DPICTEST \
+		pic.cc -c
+params.o: ../unix/params.cc
+	$(CC) $(CFLAGS) $(INCLUDES) $(LIBS) -DPICTEST \
+		../unix/params.cc -c
+logging.o: logging.cc
+	$(CC) $(CFLAGS) $(INCLUDES) $(LIBS) -DPICTEST \
+		logging.cc -c
+pic8259.o: ../model/pic8259.cc
+	$(CC) $(CFLAGS) $(INCLUDES) $(LIBS) -DPICTEST \
+		../model/pic8259.cc -c
+
+runpic: pic
+	@./pictest.bin 2> log.txt
+
+ioapic:
+	$(CC) $(CFLAGS) $(INCLUDES) $(LIBS) -DIOAPICTEST \
+		main.cc ioapic.cc \
+		logging.cc ../unix/params.cc ../model/ioapic.cc -o ioapictest.bin
+
+runioapic: ioapic
+	./ioapictest.bin 2> log.txt
+
+lapic: logging.o params.o lapic.cc lapic.h
+	$(CC) $(CFLAGS) $(INCLUDES) $(LIBS) -DLAPICTEST \
+                main.cc lapic.cc \
+		../model/ioapic.cc ../model/lapic.cc ../model/vcpu.cc \
+                params.o logging.o -o lapictest.bin
+
+runlapic: lapic
+	./lapictest.bin 2> log.txt
+
+clean:
+	rm -f *.bin *.txt *.o
diff --git a/test/ioapic.cc b/test/ioapic.cc
new file mode 100644
index 00000000..9374fbe1
--- /dev/null
+++ b/test/ioapic.cc
@@ -0,0 +1,168 @@
+/**
+ * I/O APIC Unit Test
+ *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
+ * This file is part of Seoul.
+ *
+ * Seoul is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Seoul is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details.
+ */
+
+#include "ioapic.h"
+
+static LogBuffer logger;  // event trace written by the handlers and test threads in this file
+
+static Clock mb_clock(1000000);  // clock handed to the motherboard below
+static Motherboard mb(&mb_clock, NULL);  // motherboard whose buses connect the test handlers with the device model
+
+void doIO(bool read, uintptr_t phys, unsigned *ptr) {  // sends one MessageMem (read or write, per `read`) for `phys` over the motherboard memory bus
+  MessageMem msg(read, phys, ptr);
+  mb.bus_mem.send(msg);  // the bus result is not propagated to the caller
+}
+
+void readIO(uintptr_t phys, unsigned *ptr) {  // read access: doIO with read == true
+  doIO(true, phys, ptr);
+}
+
+void writeIO(uintptr_t phys, unsigned *ptr) {  // write access: doIO with read == false
+  doIO(false, phys, ptr);
+}
+
+// worker threads
+pthread_t receiver, trigger1, trigger2;
+unsigned irq_received_1=0, irq_received_2=0;  // per-IRQ counters, incremented only by receiver_fn
+bool irq1free=true, irq2free=true;  // gates: a trigger thread flips its flag true -> false before sending, the notify handler flips it back
+unsigned irq_pending=0;  // bit mask of pending vectors: set by the MessageMem handler, cleared by receiver_fn
+
+// message handlers
+static bool receive(Device *, MessageMem &msg) {
+  // Stands in for the LAPIC: an access to LAPIC_ADDR marks bit (*msg.ptr & 0xff) in irq_pending.
+  if (msg.phys != LAPIC_ADDR) return false;  // not ours: report "unhandled" instead of flowing off the end of a bool function
+  logger.log(LOG_INTR, *msg.ptr);
+  __sync_fetch_and_or(&irq_pending, 1 << (*msg.ptr & 0xff));  // set bit for IRQ
+  return true;
+}
+static bool receive(Device *, MessageIrqNotify &msg) {
+  // Notification handler: re-opens the gate of whichever test IRQ the message covers.
+  logger.log(LOG_NOTIFY, msg.baseirq << 16 | msg.mask);
+  if (msg.baseirq == (IRQ1 & ~0x7) && msg.mask & (1 << (IRQ1 & 0x7))) {
+    __sync_bool_compare_and_swap(&irq1free, false, true);  // IRQ1 can be re-raised
+  }
+  if (msg.baseirq == (IRQ2 & ~0x7) && msg.mask & (1 << (IRQ2 & 0x7))) {
+    __sync_bool_compare_and_swap(&irq2free, false, true);  // IRQ2 can be re-raised
+  }
+  return true;  // the original fell off the end of a bool function, which is undefined behaviour
+}
+
+static void * receiver_fn(void *) {  // consumer thread: polls irq_pending, counts one vector per round and writes the EOI for it
+  unsigned waitcount = 0;
+  while (true) {
+    if (!__sync_fetch_and_or(&irq_pending, 0)) {
+      if (waitcount++ > 1000000 || (irq_received_1 == IRQ_COUNT && irq_received_2 == IRQ_COUNT)) break;
+      continue;
+    }
+    waitcount = 0;
+    unsigned vec;
+    if (irq_pending & (1 << IRQ2)) {
+      vec = IRQ2;
+      irq_received_2++;
+    } else if (irq_pending & (1 << IRQ1)) {
+      vec = IRQ1;
+      irq_received_1++;
+    } else {  // only unexpected bits are pending: drop them rather than EOI an uninitialized vector
+      __sync_fetch_and_and(&irq_pending, (1u << IRQ1) | (1u << IRQ2));
+      continue;
+    }
+    __sync_fetch_and_and(&irq_pending, ~(1 << vec));
+    logger.log(LOG_EOI, vec);  // EOI
+    writeIO(IOAPIC_ADDR | IOAPIC_EOI, &vec);
+  }
+  return nullptr;  // pthread start routines must return a value; the original fell off the end
+}
+
+static void * trigger_1_fn(void *) {
+  // Producer thread for IRQ1: asserts the line IRQ_COUNT times, one assertion per claimed gate.
+  MessageIrqLines assert_irq(MessageIrq::ASSERT_NOTIFY, IRQ1);
+  for (unsigned round = 0; round < IRQ_COUNT; ++round) {
+    // spin until irq1free is true and atomically flip it to false
+    while (!__sync_bool_compare_and_swap(&irq1free, true, false)) {}
+    logger.log(LOG_SEND, IRQ1);
+    mb.bus_irqlines.send(assert_irq);
+  }
+  return nullptr;
+}
+
+static void * trigger_2_fn(void *) {
+  // Producer thread for IRQ2: asserts the line IRQ_COUNT times, one assertion per claimed gate.
+  MessageIrqLines assert_irq(MessageIrq::ASSERT_NOTIFY, IRQ2);
+  for (unsigned round = 0; round < IRQ_COUNT; ++round) {
+    // spin until irq2free is true and atomically flip it to false
+    while (!__sync_bool_compare_and_swap(&irq2free, true, false)) {}
+    logger.log(LOG_SEND, IRQ2);
+    mb.bus_irqlines.send(assert_irq);
+  }
+  return nullptr;
+}
+
+int runIOAPicTest() {  // sets up the I/O APIC model, runs one receiver and two trigger threads, prints the counters; always returns 0
+  // attach handlers
+  mb.bus_irqnotify.add(nullptr, receive);
+  mb.bus_mem.add(nullptr, receive);
+
+  // create I/O APIC
+  mb.handle_arg("ioapic");
+
+  // init two IRQs
+  unsigned index1 = 0x10+IRQ1*2;  // presumably the low dword of IRQ1's redirection entry -- confirm against model/ioapic.cc
+  unsigned irq1 = 0x8000 | (IRQ1 & 0xff);  // vector == IRQ number; NOTE(review): 0x8000 looks like the level-trigger bit -- confirm
+  unsigned index2 = 0x10+IRQ2*2;
+  unsigned irq2 = 0x8000 | (IRQ2 & 0xff);
+  // each entry is programmed by writing the index register, then the data register
+  writeIO(IOAPIC_ADDR | IOAPIC_IDX, &index1);
+  writeIO(IOAPIC_ADDR | IOAPIC_DATA, &irq1);
+  writeIO(IOAPIC_ADDR | IOAPIC_IDX, &index2);
+  writeIO(IOAPIC_ADDR | IOAPIC_DATA, &irq2);
+
+  // create threads for triggering and receiving interrupts
+  cpu_set_t cpuset_receiver, cpuset_trigger1, cpuset_trigger2;
+  pthread_t self = pthread_self();
+  CPU_ZERO(&cpuset_receiver);
+  CPU_ZERO(&cpuset_trigger1);
+  CPU_ZERO(&cpuset_trigger2);
+  CPU_SET(1, &cpuset_receiver);  // receiver -> CPU 1
+  CPU_SET(2, &cpuset_trigger1);  // trigger 1 (and the calling thread) -> CPU 2
+  CPU_SET(3, &cpuset_trigger2);  // trigger 2 -> CPU 3
+
+  pthread_setaffinity_np(self, sizeof(cpu_set_t), &cpuset_trigger1);  // return value is not checked
+
+  timevalue tsc_start = Cpu::rdtsc();
+  // NOTE(review): affinity is applied after pthread_create, so each thread may start on another CPU first
+  pthread_create(&receiver, NULL, receiver_fn, NULL);
+  pthread_setaffinity_np(receiver, sizeof(cpu_set_t), &cpuset_receiver);
+
+  pthread_create(&trigger1, NULL, trigger_1_fn, NULL);
+  pthread_setaffinity_np(trigger1, sizeof(cpu_set_t), &cpuset_trigger1);
+
+  pthread_create(&trigger2, NULL, trigger_2_fn, NULL);
+  pthread_setaffinity_np(trigger2, sizeof(cpu_set_t), &cpuset_trigger2);
+
+  pthread_join(receiver, nullptr);
+  pthread_join(trigger1, nullptr);
+  pthread_join(trigger2, nullptr);
+
+  timevalue cycles = Cpu::rdtsc() - tsc_start;  // elapsed TSC cycles, including thread creation
+
+  printf("Test completed. Received (%u, %u) interrupts (expected %u, %u).\nTest took %llu cycles.\n",
+         irq_received_1, irq_received_2, IRQ_COUNT, IRQ_COUNT, cycles);
+
+  //logger.dump();
+
+  return 0;
+}
diff --git a/test/ioapic.h b/test/ioapic.h
new file mode 100644
index 00000000..05dd6b79
--- /dev/null
+++ b/test/ioapic.h
@@ -0,0 +1,77 @@
+/**
+ * I/O APIC Test header file
+ *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
+ * This file is part of Seoul.
+ *
+ * Seoul is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Seoul is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details.
+ */
+
+#include <nul/motherboard.h>
+#include <nul/vcpu.h>
+
+#include <pthread.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <assert.h>
+#include <signal.h>
+
+int runIOAPicTest();
+
+enum { IRQ_COUNT = 10000000, IRQ1 = 20, IRQ2 = 21 };  // interrupts raised per trigger thread, and the two IRQ numbers under test
+
+enum LogItem {  // event tags; LogBuffer stores them in the low 16 bits of an entry
+  LOG_SEND = 0x1,  // dumped as "IRQ"
+  LOG_INTR,        // dumped as "INTR"
+  LOG_INTA_RX,     // dumped as "INTA RX"
+  LOG_INTA_TX,     // dumped as "INTA TX"
+  LOG_EOI,         // dumped as "EOI"
+  LOG_NOTIFY,      // dumped as "NOTIFY"
+  LOG_DEASS        // dumped as "DEASS"
+};
+
+#define IOAPIC_ADDR 0xfec00000
+#define IOAPIC_IDX 0x00
+#define IOAPIC_DATA 0x10
+#define IOAPIC_EOI  0x40
+#define LAPIC_ADDR 0xfee00000
+
+class LogBuffer {  // fixed-size event trace: log() appends entries, dump() prints them in order
+private:
+  unsigned logbuffer[20*IRQ_COUNT];
+  unsigned logindex=0;  // number of log() calls so far; may exceed the capacity once the buffer is full
+public:
+  void log(LogItem type, unsigned value=0) {  // stores (value << 16) | type; events that no longer fit are dropped
+    unsigned logindex_tmp = __sync_fetch_and_add(&logindex, 1);
+    if (logindex_tmp >= sizeof(logbuffer)/sizeof(logbuffer[0])) return;
+    logbuffer[logindex_tmp] = (value << 16) | type;
+  }
+  void dump() {  // prints one line per stored event (name and value), then the number of events printed
+    const unsigned capacity = sizeof(logbuffer)/sizeof(logbuffer[0]);
+    const unsigned count = logindex < capacity ? logindex : capacity;
+    Logging::printf("\nLog output follows:\n---------------------------------------\n\n");
+    for (unsigned i=0; i<count; i++) {
+      const char * event;
+      switch (logbuffer[i] & 0xffff) {
+        case LOG_SEND: event = "\tIRQ\t\t"; break;
+        case LOG_INTR: event = "\tINTR\t\t"; break;
+        case LOG_INTA_TX: event = "\tINTA TX\t\t"; break;
+        case LOG_INTA_RX: event = "\tINTA RX\t\t"; break;
+        case LOG_EOI: event = "\tEOI\t\t"; break;
+        case LOG_NOTIFY: event = "\tNOTIFY\t\t"; break;
+        case LOG_DEASS: event = "\tDEASS\t\t"; break;
+        default: event = "n/a"; break;
+      }
+      Logging::printf("%s %x\n", event, logbuffer[i] >> 16);
+    }
+    Logging::printf("\n---------------------------------------\n\nPrinted %u events.\n", count);
+  }
+};
diff --git a/test/lapic.cc b/test/lapic.cc
new file mode 100644
index 00000000..0e5f7937
--- /dev/null
+++ b/test/lapic.cc
@@ -0,0 +1,238 @@
+/*
+ * LAPIC Unit Test
+ *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
+ * This file is part of Seoul.
+ *
+ * Seoul is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Seoul is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details.
+ */
+
+#include "lapic.h"
+
+static LogBuffer logger;  // event trace written by the handlers and test threads in this file
+
+static Clock mb_clock(1000000);  // clock handed to the motherboard below
+static Motherboard mb(&mb_clock, NULL);  // motherboard whose buses connect the test handlers with the device models
+
+VCpu *vcpu;  // assigned by the MessageHostOp handler on OP_VCPU_CREATE_BACKEND
+
+timevalue tsc_start;  // TSC value taken in runLAPICTest right after LAPIC setup
+
+void doIO(bool read, uintptr_t phys, unsigned *ptr) {  // sends one MessageMem (read or write, per `read`) for `phys` over the vCPU's mem bus
+  MessageMem msg(read, phys, ptr);
+  vcpu->mem.send(msg);  // requires vcpu to be set; the bus result is not propagated
+}
+
+void readIO(uintptr_t phys, unsigned *ptr) {  // read access: doIO with read == true
+  doIO(true, phys, ptr);
+}
+
+void writeIO(uintptr_t phys, unsigned *ptr) {  // write access: doIO with read == false
+  doIO(false, phys, ptr);
+}
+
+// worker threads
+pthread_t receiver, trigger, ipi;
+bool irq_free = false;  // timer gate: set by the MessageTimer handler, claimed (true -> false) by trigger_fn
+bool ipi_free = false;  // IPI gate: set by receiver_fn, claimed (true -> false) by ipi_fn
+unsigned irq_sent_timer = 0, irq_sent_ipi = 0;  // incremented by trigger_fn / ipi_fn after each send
+unsigned irq_received_timer = 0, irq_received_ipi = 0;  // incremented by receiver_fn per acknowledged vector
+unsigned long intr = 0;  // bit 0 is set on EVENT_INTR; the value is also advanced by 4 per EVENT_INTR
+bool wakeup = false;  // set by the CpuEvent handler, consumed (reset to 0) by receiver_fn
+
+static bool receive(Device *, CpuEvent &msg) {
+  // vCPU event sink: records interrupt assertions in `intr`/`wakeup` and logs deassertions.
+  if (msg.value == VCpu::EVENT_INTR) {
+    __sync_fetch_and_add(&intr, 4);
+    if (!(__sync_fetch_and_or(&intr, 1) & 0x1)) logger.log(LOG_INTR);  // log only when bit 0 was not set before
+    wakeup = true;
+  } else if (msg.value == VCpu::DEASS_INTR) {
+    logger.log(LOG_DEASS);
+  }
+  return true;  // the original fell off the end of a bool function, which is undefined behaviour
+}
+
+static bool receive(Device *, MessageHostOp &msg) {  // host-op stub: wires up the vCPU backend, accepts RELEASE, logs and rejects everything else
+  if (msg.type == MessageHostOp::OP_VCPU_CREATE_BACKEND) {
+    vcpu = msg.vcpu;
+    vcpu->bus_event.add(nullptr, receive);
+    msg.value = 0;
+    return true;
+  } else if (msg.type == MessageHostOp::OP_VCPU_RELEASE) {
+    return true;  // Release: CHECK_IRQ (unused)
+  }
+  Logging::printf("Hostop msg type %u\n", msg.type);
+  return false;  // unhandled type; every path now returns a value (two paths used to fall off the end)
+}
+
+static bool receive(Device *, MessageTimer &msg) {  // host timer stub: hands out timer number 0, treats any other request as "ready to fire again"
+  if (msg.type == MessageTimer::TIMER_NEW) {
+    msg.nr = 0;
+    return true;
+  }
+  // Ready to fire new
+  logger.log(LOG_NOTIFY, TIMER_VEC);
+  __sync_bool_compare_and_swap(&irq_free, false, true);
+  return true;
+}
+
+static void * receiver_fn(void *) {  // consumer thread: waits for wakeups, acknowledges (INTA) and completes (EOI) each interrupt, then re-arms its source
+  // Run magic
+  unsigned long waitcount = 0;
+  unsigned long current;
+  // busy-wait until 10000000 TSC cycles after tsc_start have passed
+  while (Cpu::rdtsc() < tsc_start+10000000);
+  // open the IPI gate for the first time so ipi_fn can send
+  __sync_bool_compare_and_swap(&ipi_free, false, true);
+
+  while (true) {
+    if (!__sync_fetch_and_and(&wakeup, 0)) {  // atomically consume the wakeup flag
+      if (irq_received_timer == IRQ_COUNT_TIMER && irq_received_ipi == IRQ_COUNT_IPI || waitcount++ > 100000000) break;  // done, or gave up waiting
+      asm volatile ("pause");
+      continue;
+    }
+    waitcount = 0;
+
+    // Double-check due to race
+    current = intr;
+    LapicEvent check(LapicEvent::CHECK_INTR);
+    check.value = 0;
+    vcpu->bus_lapic.send(check, true);
+    if (!check.value) {  // nothing reported by CHECK_INTR
+      logger.log(LOG_SKIP, check.value);
+      __sync_bool_compare_and_swap(&intr, current, (current + 4) & ~1ULL);  // clears bit 0 only if intr is unchanged since the snapshot
+      continue;
+    }
+
+    // INTA
+    logger.log(LOG_INTA_TX);
+    LapicEvent msg(LapicEvent::INTA);
+    vcpu->bus_lapic.send(msg);
+    logger.log(LOG_INTA_RX, msg.value);
+    if (msg.value == TIMER_VEC) irq_received_timer++;
+    else if (msg.value == IPI_VEC) irq_received_ipi++;
+    else Logging::panic("Spurious IRQ! %x\n", msg.value);
+
+    // EOI
+    unsigned val = 0x0;
+    logger.log(LOG_EOI, msg.value);
+    writeIO(LAPIC_BASE + 0xb0, &val);
+
+    if (msg.value == TIMER_VEC) {
+      // Rearm
+      unsigned val = 1U;  // shadows the outer val
+      writeIO(LAPIC_BASE + 0x380, &val);
+    } else if (msg.value == IPI_VEC) {
+      // Free IPI mutex
+      __sync_bool_compare_and_swap(&ipi_free, false, true);
+    }
+  }
+  Logging::printf("Receiver finished with %u,%u interrupts. (waitcount %lu)\n", irq_received_timer, irq_received_ipi, waitcount);
+  Logging::printf("They sent %u,%u interrupts.\n", irq_sent_timer, irq_sent_ipi);
+  return nullptr;
+}
+
+static void * trigger_fn(void *) {
+  // Trigger timer interrupt at lapic
+  while (irq_sent_timer < IRQ_COUNT_TIMER) {
+    while (!__sync_bool_compare_and_swap(&irq_free, true, false)) asm volatile ("pause");
+    logger.log(LOG_SEND, TIMER_VEC);
+    MessageTimeout msg(0, Cpu::rdtsc());
+    // Send outside of assert(): with NDEBUG the whole assert expression, including the send, would be compiled out.
+    bool delivered = mb.bus_timeout.send(msg);
+    assert(delivered);
+    static_cast<void>(delivered);  // avoids an unused-variable warning when asserts are disabled
+    irq_sent_timer++;
+  }
+  Logging::printf("Timer thread exits.\n");
+  return nullptr;
+}
+
+static void * ipi_fn(void *) {  // producer thread: sends IRQ_COUNT_IPI MessageApic messages, one per claimed ipi_free gate
+  while (irq_sent_ipi < IRQ_COUNT_IPI) {
+    while (!__sync_bool_compare_and_swap(&ipi_free, true, false)) asm volatile ("pause");
+    logger.log(LOG_SEND, IPI_VEC);
+    MessageApic msg(0x4000 | IPI_VEC, 0xff, 0);
+    // Send outside of assert(): with NDEBUG the whole assert expression, including the send, would be compiled out.
+    bool delivered = mb.bus_apic.send(msg);
+    assert(delivered);
+    static_cast<void>(delivered);  // avoids an unused-variable warning when asserts are disabled
+    irq_sent_ipi++;
+  }
+  Logging::printf("IPI thread exits.\n");
+  return nullptr;
+}
+
+int runLAPICTest() {  // sets up vCPU + LAPIC models, runs timer/IPI/receiver threads, prints the counters; spins forever on a mismatch, else returns 0
+  // attach handlers
+  mb.bus_hostop.add(nullptr, receive);
+  mb.bus_timer.add(nullptr, receive);
+
+  // parse args
+  //mb.handle_arg("ioapic");
+  mb.handle_arg("vcpu");
+  mb.handle_arg("lapic");
+
+  // init LAPIC
+  //software enable, map spurious interrupt to dummy isr
+  unsigned val = 39 | 0x100;
+  writeIO(LAPIC_BASE + 0xf0, &val);
+  //map APIC timer to an interrupt, and by that enable it
+  val = TIMER_VEC;
+  writeIO(LAPIC_BASE + 0x320, &val);
+  //set up divide value to 16
+  val = 0x03;
+  writeIO(LAPIC_BASE + 0x3e0, &val);
+  //reset APIC timer (set counter)
+  val = 1000U;
+  writeIO(LAPIC_BASE + 0x380, &val);
+
+  tsc_start = Cpu::rdtsc();  // receiver_fn delays its start relative to this value
+
+  cpu_set_t cpuset_receiver, cpuset_trigger, cpuset_ipi;
+  pthread_t self = pthread_self();
+  CPU_ZERO(&cpuset_receiver);
+  CPU_ZERO(&cpuset_trigger);
+  CPU_ZERO(&cpuset_ipi);
+  CPU_SET(1, &cpuset_receiver);  // receiver -> CPU 1
+  CPU_SET(2, &cpuset_trigger);   // timer trigger (and the calling thread) -> CPU 2
+  CPU_SET(3, &cpuset_ipi);       // IPI sender -> CPU 3
+
+  pthread_setaffinity_np(self, sizeof(cpu_set_t), &cpuset_trigger);  // return value is not checked
+  // NOTE(review): affinity is applied after pthread_create, so each thread may start on another CPU first
+  pthread_create(&trigger, NULL, trigger_fn, NULL);
+  pthread_setaffinity_np(trigger, sizeof(cpu_set_t), &cpuset_trigger);
+
+  pthread_create(&ipi, NULL, ipi_fn, NULL);
+  pthread_setaffinity_np(ipi, sizeof(cpu_set_t), &cpuset_ipi);
+
+  pthread_create(&receiver, NULL, receiver_fn, NULL);
+  pthread_setaffinity_np(receiver, sizeof(cpu_set_t), &cpuset_receiver);
+  // only the receiver is joined; the two sender threads may still be spinning on their gates at this point
+  //pthread_join(trigger, nullptr);
+  //pthread_join(ipi, nullptr);
+  pthread_join(receiver, nullptr);
+
+  timevalue cycles = Cpu::rdtsc() - tsc_start;
+
+  printf("Test completed. Received (%u, %u) interrupts (expected %u, %u).\nTest took %llu cycles.\n",
+         irq_received_timer, irq_received_ipi, IRQ_COUNT_TIMER, IRQ_COUNT_IPI, cycles);
+  // failure: any received/sent/expected counter mismatch
+  if (irq_received_timer != irq_sent_timer || irq_received_ipi != irq_sent_ipi || irq_received_timer != IRQ_COUNT_TIMER || irq_received_ipi != IRQ_COUNT_IPI) {
+    logger.dump();
+    printf("Error. Log dumped, going to spin...\n");
+    for (;;);  // deliberately never returns on failure
+  }
+
+  //logger.dump();
+
+  return 0;
+}
diff --git a/test/lapic.h b/test/lapic.h
new file mode 100644
index 00000000..a6d56914
--- /dev/null
+++ b/test/lapic.h
@@ -0,0 +1,85 @@
+/**
+ * LAPIC Test header file
+ *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
+ * This file is part of Seoul.
+ *
+ * Seoul is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Seoul is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details.
+ */
+
+#include <nul/motherboard.h>
+#include <nul/vcpu.h>
+
+#include <pthread.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <assert.h>
+
+int runLAPICTest();
+
+#define LAPIC_BASE 0xfee00000
+
+enum { IRQ_COUNT_TIMER = 500000, IRQ_COUNT_IPI = 500000, TIMER_VEC = 32, IPI_VEC = 0xa7 };
+
+enum LogItem {  // event tag; LogBuffer::log() stores it in the low 16 bits of a record's second word
+  LOG_INIT = 0x0,  // printed by dump() as "INIT"
+  LOG_SEND,        // printed by dump() as "IRQ"
+  LOG_INTR,        // printed by dump() as "INTR"
+  LOG_INTA_RX,     // printed by dump() as "INTA RX"
+  LOG_INTA_TX,     // printed by dump() as "INTA TX"
+  LOG_EOI,         // printed by dump() as "EOI"
+  LOG_NOTIFY,      // printed by dump() as "NOTIFY"
+  LOG_DEASS,       // printed by dump() as "DEASS"
+  LOG_IGNORE,      // printed by dump() as "IGNORE"
+  LOG_SKIP         // printed by dump() as "SKIP"
+};
+
+/** Fixed-size event log for the test threads. A record is two words: TSC, then tid<<32 | value<<16 | type. */
+class LogBuffer {
+private:
+  unsigned long logbuffer[60*(IRQ_COUNT_TIMER+IRQ_COUNT_IPI)];
+  unsigned logindex=0;  // next free slot; keeps growing even after the buffer is full
+
+public:
+  /** Append one record. The slot pair is reserved atomically; the record is dropped when the buffer is full. */
+  void log(LogItem type, unsigned value=0) {
+    const unsigned capacity = sizeof(logbuffer) / sizeof(logbuffer[0]);  // elements, not bytes
+    unsigned logindex_tmp = __sync_fetch_and_add(&logindex, 2);
+    if (logindex_tmp + 2 > capacity) { return; }  // was checked against a byte count, allowing out-of-bounds writes
+    logbuffer[logindex_tmp] = Cpu::rdtsc();
+    logbuffer[logindex_tmp+1] = (pthread_self() << 32) | (value << 16) | type;
+  }
+
+  /** Print all stored records through Logging::printf. Not synchronized against concurrent log() calls. */
+  void dump() {
+    const unsigned capacity = sizeof(logbuffer) / sizeof(logbuffer[0]);
+    const unsigned end = ((logindex < capacity) ? logindex : capacity) & ~1u;  // logindex may have run past the array
+    Logging::printf("\nLog output follows:\n---------------------------------------\n\n");
+    for (unsigned i=0; i<end; i+=2) {
+      const char * event;
+      switch (logbuffer[i+1] & 0xffff) {
+        case LOG_INIT: event = "\tINIT\t\t"; break;
+        case LOG_SEND: event = "\tIRQ\t\t"; break;
+        case LOG_INTR: event = "\tINTR\t\t"; break;
+        case LOG_INTA_TX: event = "\tINTA TX\t\t"; break;
+        case LOG_INTA_RX: event = "\tINTA RX\t\t"; break;
+        case LOG_EOI: event = "\tEOI\t\t"; break;
+        case LOG_NOTIFY: event = "\tNOTIFY\t\t"; break;
+        case LOG_DEASS: event = "\tDEASS\t\t"; break;
+        case LOG_IGNORE: event = "\tIGNORE\t\t"; break;
+        case LOG_SKIP: event = "\tSKIP\t\t"; break;
+        default: event = "n/a"; break;
+      }
+      Logging::printf("%lx\tT%lx\t%s %lx\n", logbuffer[i], (logbuffer[i+1] >>32), event, (logbuffer[i+1] >> 16)& 0xffff);
+    }
+    Logging::printf("\n---------------------------------------\n\nPrinted %u events.\n", end/2);
+  }
+};
diff --git a/test/logging.cc b/test/logging.cc
new file mode 100644
index 00000000..1e97b79f
--- /dev/null
+++ b/test/logging.cc
@@ -0,0 +1,50 @@
+/**
+ * Logging stubs
+ *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
+ * This file is part of Seoul.
+ *
+ * Seoul is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Seoul is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details.
+ */
+
+#include <service/logging.h>
+#include <nul/motherboard.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <unistd.h>
+
+void Logging::panic(const char *format, ...)
+{
+  // Emit the formatted message plus a trailing newline, then terminate via abort().
+  va_list args;
+  va_start(args, format);
+  Logging::vprintf(format, args);
+  va_end(args);
+
+  Logging::printf("\n");
+  abort();
+}
+
+void Logging::printf(const char *format, ...)
+{
+  va_list args;  // variadic arguments are handed on to Logging::vprintf
+  va_start(args, format);
+  Logging::vprintf(format, args);
+  va_end(args);
+}
+
+
+void Logging::vprintf(const char *format, va_list &ap)
+{
+    ::vfprintf(stderr, format, ap);  // the test environment sends all log output to stderr
+}
diff --git a/test/main.cc b/test/main.cc
new file mode 100644
index 00000000..20ba583f
--- /dev/null
+++ b/test/main.cc
@@ -0,0 +1,59 @@
+/**
+ * Synthetic Testing Environment
+ *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
+ * This file is part of Seoul.
+ *
+ * Seoul is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Seoul is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details.
+ */
+
+#include <iostream>
+
+#ifdef PICTEST
+#include "pic.h"
+#endif
+
+#ifdef IOAPICTEST
+#include "ioapic.h"
+#endif
+
+#ifdef LAPICTEST
+#include "lapic.h"
+#endif
+
+#ifdef SATATEST
+#include "sata.h"
+#endif
+
+int main(int argc, char **argv) {
+  // Greeting first; each test below is compiled in only when its macro is defined at build time.
+  std::cout << "Hello, this is Seoulcheck." << std::endl;
+#if defined(PICTEST)
+  std::cout << "Running PIC test." << std::endl;
+  runPicTest();
+#endif
+
+#if defined(IOAPICTEST)
+  std::cout << "Running I/O APIC test." << std::endl;
+  runIOAPicTest();
+#endif
+
+#if defined(LAPICTEST)
+  std::cout << "Running LAPIC test." << std::endl;
+  runLAPICTest();
+#endif
+
+#if defined(SATATEST)
+  std::cout << "Running SATA test." << std::endl;
+  runSATATest();
+#endif
+  return 0;
+}
diff --git a/test/pic.cc b/test/pic.cc
new file mode 100644
index 00000000..953e8058
--- /dev/null
+++ b/test/pic.cc
@@ -0,0 +1,196 @@
+/**
+ * PIC Unit Test
+ *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
+ * This file is part of Seoul.
+ *
+ * Seoul is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Seoul is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details.
+ */
+
+#include "pic.h"
+
+static LogBuffer logger;  // event log written by the bus handlers and the worker threads of this test
+
+static Clock mb_clock(1000000);  // clock handed to the Motherboard below; 1000000 is presumably its frequency — TODO confirm
+static Motherboard mb(&mb_clock, NULL);  // virtual motherboard whose busses the test drives
+
+timevalue tsc_start;  // TSC reference: worker threads spin until the TSC passes tsc_start+10000000
+
+unsigned char IRQS[2] = { 3, 12 };  // IRQ lines under test: trigger_fn<1> raises IRQS[0], trigger_fn<2> raises IRQS[1]
+
+void outb(unsigned short port, unsigned short value) {  // sends one TYPE_OUTB message on the motherboard's I/O-out bus
+  MessageIOOut request(MessageIOOut::TYPE_OUTB, port, value);
+  mb.bus_ioout.send(request);
+}
+unsigned short inb(unsigned short port) {  // sends one TYPE_INB message and returns its value field after the send
+  MessageIOIn request(MessageIOIn::TYPE_INB, port);
+  mb.bus_ioin.send(request);
+  return request.value;
+}
+unsigned char _get_irr() {  // writes 0x0a to port 0x20, then returns the byte read back from that port
+  outb(0x20, 0x0a);  // presumably the 8259 "read IRR" command — confirm against the PIC datasheet
+  return static_cast<unsigned char>(inb(0x20));
+}
+
+// worker threads and the state they share
+pthread_t receiver, trigger1, trigger2;
+unsigned irq_received_1=0, irq_received_2=0;  // incremented by receiver_fn when the acknowledged vector equals IRQS[0] / IRQS[1]
+bool irq_1_free=true, irq_2_free=true;  // claimed (true -> false) by trigger_fn, set back to true by the MessageIrqNotify handler
+unsigned long intr = 0;  // bit 0 is the pending-INTR hint; every INTR message additionally adds 4
+
+// message handlers
+static bool receive(Device *, MessageLegacy &msg) {  // legacy-bus handler; returns true only for INTR / DEASS_INTR
+  if (msg.type == MessageLegacy::INTR) {
+    __sync_fetch_and_add(&intr, 4);  // every INTR message adds 4 to the hint word
+    if (!(__sync_fetch_and_or(&intr, 1) & 0x1)) logger.log(LOG_INTR);  // set bit 0; log only if it was clear before
+    return true;
+  }
+  if (msg.type != MessageLegacy::DEASS_INTR) return false;  // other message types are not handled here
+  logger.log(LOG_DEASS, msg.value);
+  return true;  // the original fell off the end of a non-void function (undefined behaviour)
+}
+
+// IRQ-notify handler: marks the notified test line as free again; panics on any other line.
+static bool receive(Device *, MessageIrqNotify &msg) {
+  logger.log(LOG_NOTIFY, msg.baseirq << 8 | msg.mask);
+  if (msg.baseirq == (IRQS[0] & 0x8) && msg.mask & (1 << (IRQS[0] & 0x7))) {
+    __sync_bool_compare_and_swap(&irq_1_free, false, true);  // First IRQ can be re-raised
+  } else if (msg.baseirq == (IRQS[1] & 0x8) && msg.mask & (1 << (IRQS[1] & 0x7))) {
+    __sync_bool_compare_and_swap(&irq_2_free, false, true);  // Second IRQ can be re-raised
+  } else {
+    Logging::panic("w00t %x:%x\n", msg.baseirq, msg.mask);  // notification matches neither test line
+  }
+  return true;  // the original fell off the end of a non-void function (undefined behaviour)
+}
+
+// Receiver thread: polls the INTR hint, acknowledges pending interrupts via INTA, counts them and writes EOIs.
+static void * receiver_fn(void *) {
+  while (Cpu::rdtsc() < tsc_start+10000000);  // same TSC start barrier as the trigger threads
+  logger.log(LOG_INIT);
+  sleep(1);
+  unsigned long waitcount = 0;
+  while (true) {
+    if (!(__sync_fetch_and_or(&intr, 1) & 0x1)) {
+      if (waitcount++ > 1000000000 || (irq_received_1 == IRQ_COUNT && irq_received_2 == IRQ_COUNT)) break;
+      continue;
+    }
+
+    waitcount = 0;
+
+    // Double-check due to race
+    unsigned long current = intr;
+    MessageLegacy check(MessageLegacy::CHECK_INTR);
+    mb.bus_legacy.send(check);
+    if (!(check.value & 0xff00)) {
+      logger.log(LOG_SKIP, check.value);
+      __sync_bool_compare_and_swap(&intr, current, (current + 4) & ~1ULL);  // clear bit 0 only if intr is unchanged
+      continue;
+    }
+
+    logger.log(LOG_INTA_TX, check.value);
+    MessageLegacy inta(MessageLegacy::INTA, 0);
+    mb.bus_legacy.send(inta);
+    logger.log(LOG_INTA_RX, inta.value);
+
+    if (inta.value == IRQS[0]) irq_received_1++;
+    if (inta.value == IRQS[1]) irq_received_2++;
+
+    if (inta.value >= 8) outb(0xa0, 0x20);  // vectors >= 8: also write 0x20 to port 0xa0
+    outb(0x20, 0x20);
+    logger.log(LOG_EOI, (intr << 8) | inta.value);
+  }
+  return nullptr;  // the original fell off the end of a void*-returning thread function (undefined behaviour)
+}
+
+
+template <unsigned char IRQ>  // IRQ is 1-based: selects IRQS[IRQ-1] and the matching irq_N_free flag
+static void * trigger_fn(void *) {
+  while (Cpu::rdtsc() < tsc_start+10000000);  // busy-wait until the TSC passes tsc_start+10000000
+  logger.log(LOG_INIT);
+  MessageIrqLines msg(MessageIrq::ASSERT_NOTIFY, IRQS[IRQ-1]);
+  unsigned sent = 0, ignored = 0;
+  bool * waiter = (IRQ == 1) ? &irq_1_free : &irq_2_free;  // flag set back to true by the MessageIrqNotify handler
+  while (sent < IRQ_COUNT) {
+    while (!__sync_bool_compare_and_swap(waiter, true, false)) asm volatile ("pause" : : : "memory");  // spin until the flag is true, then claim it
+    asm volatile ("":::"memory");
+
+    logger.log(LOG_SEND, IRQS[IRQ-1]);
+    if (mb.bus_irqlines.send(msg)) {
+      sent++;
+      ignored = 0;
+    } else {
+      assert(false);  // with assertions enabled a rejected send aborts; the lines below only run under NDEBUG
+      logger.log(LOG_IGNORE, IRQS[IRQ-1]);
+      *waiter = true;  // hand the flag back so the next iteration can retry
+      if (ignored++ >= 1000) return nullptr;  // give up on the 1001st consecutive rejected send
+    }
+  }
+  return nullptr;
+}
+
+int runPicTest() {  // sets up two PIC models, runs two trigger threads and one receiver thread; always returns 0
+  // attach handlers
+  mb.bus_irqnotify.add(nullptr, receive);
+  mb.bus_legacy.add(nullptr, receive);
+
+  tsc_start = Cpu::rdtsc();  // provisional value; re-armed below right before the threads are created
+
+  // create PIC
+  mb.handle_arg("pic:0x20,,0x4d0");
+  mb.handle_arg("pic:0xa0,2,0x4d1");
+
+  // init PICs (sequence according to http://wiki.osdev.org/PIC)
+  outb(0x20, 0x10+0x01);
+  outb(0xa0, 0x10+0x01);
+  outb(0x21, 0);
+  outb(0xa1, 8);
+  outb(0x21, 4);
+  outb(0xa1, 2);
+
+  outb(0x21, 0x01);
+  outb(0xa1, 0x01);
+
+  // create threads for triggering and receiving interrupts
+  cpu_set_t cpuset_receiver, cpuset_trigger1, cpuset_trigger2;
+  pthread_t self = pthread_self();
+  CPU_ZERO(&cpuset_receiver);
+  CPU_ZERO(&cpuset_trigger1);
+  CPU_ZERO(&cpuset_trigger2);
+  CPU_SET(1, &cpuset_receiver);
+  CPU_SET(2, &cpuset_trigger1);
+  CPU_SET(3, &cpuset_trigger2);
+
+  pthread_setaffinity_np(self, sizeof(cpu_set_t), &cpuset_trigger1);
+
+  // Re-arm the global start barrier (the original declared a shadowing local here, leaving the barrier stale).
+  tsc_start = Cpu::rdtsc();
+
+  pthread_create(&trigger1, NULL, trigger_fn<1>, NULL);
+  pthread_setaffinity_np(trigger1, sizeof(cpu_set_t), &cpuset_trigger1);
+
+  pthread_create(&trigger2, NULL, trigger_fn<2>, NULL);
+  pthread_setaffinity_np(trigger2, sizeof(cpu_set_t), &cpuset_trigger2);
+
+  pthread_create(&receiver, NULL, receiver_fn, NULL);
+  pthread_setaffinity_np(receiver, sizeof(cpu_set_t), &cpuset_receiver);
+
+  pthread_join(receiver, nullptr);
+  pthread_join(trigger1, nullptr);
+  pthread_join(trigger2, nullptr);
+
+  timevalue cycles = Cpu::rdtsc() - tsc_start;  // measured from the re-armed barrier, as before
+
+  printf("Test completed. Received (%u, %u) interrupts (expected %u, %u).\nTest took %llu cycles.\n",
+         irq_received_1, irq_received_2, IRQ_COUNT, IRQ_COUNT, cycles);
+
+  //logger.dump();
+  return 0;
+}
diff --git a/test/pic.h b/test/pic.h
new file mode 100644
index 00000000..f89c0821
--- /dev/null
+++ b/test/pic.h
@@ -0,0 +1,83 @@
+/**
+ * PIC Test header file
+ *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
+ * This file is part of Seoul.
+ *
+ * Seoul is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Seoul is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details.
+ */
+
+#include <nul/motherboard.h>
+#include <nul/vcpu.h>
+
+#include <pthread.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <assert.h>
+
+int runPicTest();
+
+enum { IRQ_COUNT = 500000 };
+
+enum LogItem {  // event tag; LogBuffer::log() stores it in the low 16 bits of a record's second word
+  LOG_INIT = 0x0,  // printed by dump() as "INIT"
+  LOG_SEND,        // printed by dump() as "IRQ"
+  LOG_INTR,        // printed by dump() as "INTR"
+  LOG_INTA_RX,     // printed by dump() as "INTA RX"
+  LOG_INTA_TX,     // printed by dump() as "INTA TX"
+  LOG_EOI,         // printed by dump() as "EOI"
+  LOG_NOTIFY,      // printed by dump() as "NOTIFY"
+  LOG_DEASS,       // printed by dump() as "DEASS"
+  LOG_IGNORE,      // printed by dump() as "IGNORE"
+  LOG_SKIP         // printed by dump() as "SKIP"
+};
+
+/** Fixed-size event log for the test threads. A record is two words: TSC, then tid<<32 | value<<16 | type. */
+class LogBuffer {
+private:
+  unsigned long logbuffer[40*IRQ_COUNT];
+  unsigned logindex=0;  // next free slot; keeps growing even after the buffer is full
+
+public:
+  /** Append one record. The slot pair is reserved atomically; the record is dropped when the buffer is full. */
+  void log(LogItem type, unsigned value=0) {
+    const unsigned capacity = sizeof(logbuffer) / sizeof(logbuffer[0]);  // elements, not bytes
+    unsigned logindex_tmp = __sync_fetch_and_add(&logindex, 2);
+    if (logindex_tmp + 2 > capacity) return;  // was checked against a byte count, allowing out-of-bounds writes
+    logbuffer[logindex_tmp] = Cpu::rdtsc();
+    logbuffer[logindex_tmp+1] = (pthread_self() << 32) | (value << 16) | type;
+  }
+
+  /** Print all stored records through Logging::printf. Not synchronized against concurrent log() calls. */
+  void dump() {
+    const unsigned capacity = sizeof(logbuffer) / sizeof(logbuffer[0]);
+    const unsigned end = ((logindex < capacity) ? logindex : capacity) & ~1u;  // logindex may have run past the array
+    Logging::printf("\nLog output follows:\n---------------------------------------\n\n");
+    for (unsigned i=0; i<end; i+=2) {
+      const char * event;
+      switch (logbuffer[i+1] & 0xffff) {
+        case LOG_INIT: event = "\tINIT\t\t"; break;
+        case LOG_SEND: event = "\tIRQ\t\t"; break;
+        case LOG_INTR: event = "\tINTR\t\t"; break;
+        case LOG_INTA_TX: event = "\tINTA TX\t\t"; break;
+        case LOG_INTA_RX: event = "\tINTA RX\t\t"; break;
+        case LOG_EOI: event = "\tEOI\t\t"; break;
+        case LOG_NOTIFY: event = "\tNOTIFY\t\t"; break;
+        case LOG_DEASS: event = "\tDEASS\t\t"; break;
+        case LOG_IGNORE: event = "\tIGNORE\t\t"; break;
+        case LOG_SKIP: event = "\tSKIP\t\t"; break;
+        default: event = "n/a"; break;
+      }
+      Logging::printf("%lx\tT%lx\t%s %lx\n", logbuffer[i], (logbuffer[i+1] >>32), event, (logbuffer[i+1] >> 16)& 0xffff);
+    }
+    Logging::printf("\n---------------------------------------\n\nPrinted %u events.\n", end/2);
+  }
+};

From fc714d4b9c2ef32fb4fa81c4de0e62f45b848ca6 Mon Sep 17 00:00:00 2001
From: Markus Partheymueller <markus.partheymueller@intel.com>
Date: Mon, 4 Nov 2013 14:58:42 +0100
Subject: [PATCH 34/35] Use atomic ops to protect vCPU and (LA)PIC.

Races in the emulation paths of the vCPU and the interrupt controller logic can cause problems when no external synchronization mechanism is applied. Using atomic instructions and relocation of certain code sections, it will now be possible to concurrently access vCPU, Lapic and PIC without the need for a lock around them.
---
 include/nul/message.h |  1 +
 include/nul/vcpu.h    |  4 +++-
 model/lapic.cc        | 13 +++++++------
 model/pic8259.cc      | 38 +++++++++++++++++++++++++-------------
 model/vcpu.cc         | 38 +++++++++++++++++++++++++++++++++-----
 5 files changed, 69 insertions(+), 25 deletions(-)

diff --git a/include/nul/message.h b/include/nul/message.h
index 40dc9f3e..43c28588 100644
--- a/include/nul/message.h
+++ b/include/nul/message.h
@@ -299,6 +299,7 @@ struct MessageLegacy
       DEASS_INTR,
       INTA,
       UNLOCK,
+      CHECK_INTR,
     } type;
   unsigned value;
   MessageLegacy(Type _type, unsigned _value=0) : type(_type), value(_value) {}
diff --git a/include/nul/vcpu.h b/include/nul/vcpu.h
index 5591b285..4b7fc057 100644
--- a/include/nul/vcpu.h
+++ b/include/nul/vcpu.h
@@ -5,6 +5,7 @@
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
  * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
  *
  * This file is part of Vancouver.
  *
@@ -87,7 +88,8 @@ struct LapicEvent {
   enum Type{
     INTA,
     RESET,
-    INIT
+    INIT,
+    CHECK_INTR
   } type;
   unsigned value;
   LapicEvent(Type _type) : type(_type) { if (type == INTA) value = ~0u; }
diff --git a/model/lapic.cc b/model/lapic.cc
index c99b066a..01367da3 100644
--- a/model/lapic.cc
+++ b/model/lapic.cc
@@ -5,6 +5,7 @@
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
  * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
  *
  * This file is part of Vancouver.
  *
@@ -164,9 +165,6 @@ class Lapic : public DiscoveryHelper<Lapic>, public StaticReceiver<Lapic>
     timevalue delta = (now - _timer_start) >> _timer_dcr_shift;
     if (delta < _ICT)  return _ICT - delta;
 
-    // we need to trigger the timer LVT
-    trigger_lvt(_TIMER_offset - LVT_BASE);
-
     // one shot?
     if (~_TIMER & (1 << 17))  {
       _timer_start = 0;
@@ -525,8 +523,6 @@ class Lapic : public DiscoveryHelper<Lapic>, public StaticReceiver<Lapic>
   {
     if (((_msr & 0xc00) != 0x800) || !in_range(msg.phys, _msr & ~0xfffull, 0x1000)) return false;
     if ((msg.phys & 0xf) || (msg.phys & 0xfff) >= 0x400) return false;
-
-
     if (msg.read)
       register_read((msg.phys >> 4) & 0x3f, *msg.ptr);
     else
@@ -557,7 +553,7 @@ class Lapic : public DiscoveryHelper<Lapic>, public StaticReceiver<Lapic>
 
     // no need to call update timer here, as the CPU needs to do an
     // EOI first
-    get_ccr(_mb.clock()->time());
+    trigger_lvt(_TIMER_offset - LVT_BASE);
     return true;
   }
 
@@ -610,6 +606,11 @@ class Lapic : public DiscoveryHelper<Lapic>, public StaticReceiver<Lapic>
 	msg.value = _SVR & 0xff;
       update_irqs();
     }
+    else if (msg.type == LapicEvent::CHECK_INTR) {
+      unsigned irrv = prioritize_irq();
+      msg.value = (irrv > 0);
+      return true;
+    }
     else if (msg.type == LapicEvent::RESET)
       reset();
     else if (msg.type == LapicEvent::INIT)
diff --git a/model/pic8259.cc b/model/pic8259.cc
index 2b8adede..4e88920f 100644
--- a/model/pic8259.cc
+++ b/model/pic8259.cc
@@ -5,6 +5,7 @@
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
  * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
  *
  * This file is part of Vancouver.
  *
@@ -74,7 +75,16 @@ class PicDevice : public StaticReceiver<PicDevice>
   // helper functions
   bool is_slave()                      { return (_icw[ICW4] & ICW4_BUF) ? (~_icw[ICW4] & ICW4_MS) : _virq; }
   void rotate_prios()                  { _prio_lowest = (_prio_lowest+1) & 7; }
-  void specific_eoi(unsigned char irq) { _isr &= ~irq; propagate_irq(false); }
+  void specific_eoi(unsigned char irq) {
+    // We do the notify here to avoid races
+    unsigned char notify = __sync_fetch_and_and(&_notify, ~irq);
+    if (notify & irq) {
+      MessageIrqNotify msg(_virq, irq);
+      _bus_notify.send(msg);
+    }
+    _isr &= ~irq;
+    propagate_irq(false);
+  }
   void non_specific_eoi()
   {
     for (unsigned i=0; i<8; i++)
@@ -112,14 +122,6 @@ class PicDevice : public StaticReceiver<PicDevice>
    */
   bool prioritize_irq(unsigned char &irq_index, bool int_ack)
   {
-    unsigned char tonotify = ~_irr & _notify;
-    if (tonotify)
-      {
-	Cpu::atomic_and<unsigned char>(&_notify, ~tonotify);
-	MessageIrqNotify msg(_virq, tonotify);
-	_bus_notify.send(msg);
-      }
-
     unsigned char state = _irr & ~_imr;
     for (unsigned i=0; i<8; i++)
       {
@@ -135,6 +137,7 @@ class PicDevice : public StaticReceiver<PicDevice>
 		_isr |= irq;
 		if (~_elcr & irq)
 		  Cpu::atomic_and<unsigned char>(&_irr, ~irq);
+
 		if (_icw[ICW4] & ICW4_AEOI)
 		  {
 		    non_specific_eoi();
@@ -186,7 +189,7 @@ class PicDevice : public StaticReceiver<PicDevice>
 	}
     }
     else {
-      Logging::printf("PicDevice::%s() spurious IRQ? for irr %x isr %x imr %x %x\n", __func__, _irr, _isr, res, _imr);
+      Logging::printf("PicDevice::%s() spurious IRQ? for irr %x isr %x imr %x %x\n", __func__, _irr, _isr, _imr, res);
       res = 7;
     }
     res += _icw[ICW2];
@@ -200,6 +203,13 @@ class PicDevice : public StaticReceiver<PicDevice>
    */
   bool  receive(MessageLegacy &msg)
   {
+    if (msg.type == MessageLegacy::CHECK_INTR) {
+      if (_virq) return false;
+      unsigned char vec;
+      bool ret = prioritize_irq(vec, false);
+      msg.value = (ret) ? (0xff << 8) | vec : vec;
+      return true;
+    }
     if (msg.type != MessageLegacy::INTA) return false;
     unsigned char vec;
     get_irqvector(vec);
@@ -332,8 +342,6 @@ class PicDevice : public StaticReceiver<PicDevice>
       if (in_range(msg.line, _virq, 8))
 	{
 	  unsigned char irq = 1 << (msg.line - _virq);
-	  if (msg.type == MessageIrq::ASSERT_NOTIFY)
-	      Cpu::atomic_or(&_notify, irq);
 
 	  if (msg.type == MessageIrq::DEASSERT_IRQ)
 	    {
@@ -347,7 +355,11 @@ class PicDevice : public StaticReceiver<PicDevice>
 	    {
 	      if (msg.line == 0) COUNTER_INC("pirq0"); else COUNTER_INC("pirqN");
 
-	      Cpu::atomic_or(&_irr, irq);
+              if (msg.type == MessageIrq::ASSERT_NOTIFY)
+                Cpu::atomic_or<unsigned char>(&_notify, irq);
+
+              Cpu::atomic_or<unsigned char>(&_irr, irq);
+
 	      propagate_irq(false);
 	    }
 	  return true;
diff --git a/model/vcpu.cc b/model/vcpu.cc
index ce76685f..fc46a018 100644
--- a/model/vcpu.cc
+++ b/model/vcpu.cc
@@ -5,6 +5,7 @@
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
  * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
  *
  * This file is part of Vancouver.
  *
@@ -34,6 +35,7 @@ class VirtualCpu : public VCpu, public StaticReceiver<VirtualCpu>
 
   volatile unsigned _event;
   volatile unsigned _sipi;
+  unsigned long _intr_hint;
 
   unsigned char debugioin[8192];
   unsigned char debugioout[8192];
@@ -275,16 +277,32 @@ class VirtualCpu : public VCpu, public StaticReceiver<VirtualCpu>
     // if we can not inject interrupts or if we are in shutdown state return
     if (cpu->intr_state & 0x3 || ~cpu->efl & 0x200 || cpu->actv_state == 2) return;
 
+    unsigned long intr = _intr_hint;
+
     LapicEvent msg2(LapicEvent::INTA);
     if (old_event & EVENT_EXTINT) {
       // EXTINT IRQ via MSI or IPI: INTA directly from the PIC
       Cpu::atomic_and<volatile unsigned>(&_event, ~VCpu::EVENT_EXTINT);
-      receive(msg2);
+      LapicEvent check(LapicEvent::CHECK_INTR);
+      check.value = 0;
+      if (receive(check) && check.value)
+        receive(msg2);
+      else {
+        return;
+      }
     }
-    else if (old_event & EVENT_INTR) {
+    else if (intr & 1) {
       // interrupt from the APIC or directly via INTR line - INTA via LAPIC
       // do not clear EVENT_INTR here, as the PIC or the LAPIC will do this for us
-      bus_lapic.send(msg2, true);
+      LapicEvent check(LapicEvent::CHECK_INTR);
+      check.value = 0;
+      if (bus_lapic.send(check, true) && check.value) {
+        bus_lapic.send(msg2, true);
+      } else {
+        Cpu::cmpxchg8b(&_intr_hint, intr, (intr + 4) & ~1ULL);
+        Cpu::atomic_and<volatile unsigned>(&_event, ~EVENT_INTR);
+        return;
+      }
     } else return;
 
     cpu->inj_info = msg2.value | 0x80000000;
@@ -323,8 +341,12 @@ class VirtualCpu : public VCpu, public StaticReceiver<VirtualCpu>
   void got_event(unsigned value) {
     COUNTER_INC("EVENT");
 
-    if (value & DEASS_INTR) Cpu::atomic_and<volatile unsigned>(&_event, ~EVENT_INTR);
-    if (!((~_event & value) & (EVENT_MASK | EVENT_DEBUG | EVENT_HOST | EVENT_RESUME))) return;
+    Cpu::atomic_xadd<unsigned long, unsigned>(&_intr_hint, 4);
+    if (value & EVENT_INTR) Cpu::atomic_or<unsigned long>(&_intr_hint, 1);
+
+    /* Avoid delayed DEASS messages. The event loop clears INTR itself.
+    if (value & DEASS_INTR) Cpu::atomic_and<volatile unsigned>(&_event, ~EVENT_INTR);*/
+    if (!((~(_event & ~EVENT_INTR) & value) & (EVENT_MASK | EVENT_DEBUG | EVENT_HOST))) return;
 
     // INIT or AP RESET - go to the wait-for-sipi state
     if ((value & EVENT_MASK) == EVENT_INIT)
@@ -392,6 +414,12 @@ class VirtualCpu : public VCpu, public StaticReceiver<VirtualCpu>
 	msg.value = msg2.value;
       return true;
     }
+    if (msg.type == LapicEvent::CHECK_INTR) {
+      MessageLegacy check(MessageLegacy::CHECK_INTR);
+      _mb.bus_legacy.send(check);
+      msg.value = (check.value & 0xff00);
+      return true;
+    }
     return false;
   }
 

From 8bd3ad5bc40df4d53c1c27bbff66d09ef1102b2a Mon Sep 17 00:00:00 2001
From: Markus Partheymueller <markus.partheymueller@intel.com>
Date: Mon, 4 Nov 2013 15:14:01 +0100
Subject: [PATCH 35/35] Bypass I/O thread in several devices.

The following devices are now configured to bypass an I/O thread:
* vCPU memory and CpuMessage: Safe as of previous commit.
* PM Timer: No need for synchronization.
* VGA Framebuffer memory: No need for synchronization.
* PCI Pass-through memory and IRQ: No need for synchronization, IRQ already safe.
---
 model/pcidirect.cc  | 15 +++++++++++++--
 model/pmtimer.cc    |  6 ++++++
 model/vcpu.cc       |  4 ++++
 model/vga.cc        |  7 ++++++-
 nre/src/IOThread.cc |  1 -
 nre/src/IOThread.h  |  1 -
 6 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/model/pcidirect.cc b/model/pcidirect.cc
index 7929860b..a6ad5ce6 100644
--- a/model/pcidirect.cc
+++ b/model/pcidirect.cc
@@ -5,6 +5,7 @@
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
  * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
  *
  * This file is part of Vancouver.
  *
@@ -268,7 +269,12 @@ class DirectPciDevice : public StaticReceiver<DirectPciDevice>, public HostVfPci
     return true;
   }
 
+  bool claim(MessageIrq &msg) {
+    for (unsigned i = 0; i < _irq_count; i++)
+      if (_host_irqs[i] == msg.line) return true;
 
+    return false;
+  }
   bool receive(MessageIrq &msg)
   {
     for (unsigned i = 0; i < _irq_count; i++)
@@ -312,8 +318,11 @@ class DirectPciDevice : public StaticReceiver<DirectPciDevice>, public HostVfPci
     return _mb.bus_hostop.send(msg2);
   }
 
-
-
+  bool claim(MessageMem &msg)
+  {
+    unsigned *ptr;
+    return match_bars(msg.phys, 4, ptr);
+  }
   bool receive(MessageMem &msg)
   {
     unsigned *ptr;
@@ -472,10 +481,12 @@ class DirectPciDevice : public StaticReceiver<DirectPciDevice>, public HostVfPci
     mb.bus_ioin.add(this,   DirectPciDevice::receive_static<MessageIOIn>);
     mb.bus_ioout.add(this,  DirectPciDevice::receive_static<MessageIOOut>);
     mb.bus_mem.add(this,    DirectPciDevice::receive_static<MessageMem>);
+    mb.bus_mem.add_iothread_callback(this, DirectPciDevice::claim_static<MessageMem>);
     mb.bus_legacy.add(this, DirectPciDevice::receive_static<MessageLegacy>);
     if (map_mode != MAP_MODE_DISABLED)
       mb.bus_memregion.add(this, DirectPciDevice::receive_static<MessageMemRegion>);
     mb.bus_hostirq.add(this,     DirectPciDevice::receive_static<MessageIrq>);
+    mb.bus_hostirq.add_iothread_callback(this, DirectPciDevice::claim_static<MessageIrq>);
     mb.bus_restore.add(this,     DirectPciDevice::receive_static<MessageRestore>);
     //mb.bus_irqnotify.add(this, DirectPciDevice::receive_static<MessageIrqNotify>);
   }
diff --git a/model/pmtimer.cc b/model/pmtimer.cc
index 2e38ea08..2c69a474 100644
--- a/model/pmtimer.cc
+++ b/model/pmtimer.cc
@@ -4,6 +4,8 @@
  * Copyright (C) 2010, Bernhard Kauer <bk@vmmon.org>
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
+ *
  * This file is part of Vancouver.
  *
  * Vancouver is free software: you can redistribute it and/or modify
@@ -34,6 +36,9 @@ class PmTimer : public DiscoveryHelper<PmTimer>, public StaticReceiver<PmTimer>
   unsigned _iobase;
   enum { FREQ = 3579545 };
 public:
+  bool  claim(MessageIOIn &msg) {
+    return (msg.port == _iobase && msg.type == MessageIOIn::TYPE_INL);
+  }
   bool  receive(MessageIOIn &msg) {
 
     if (msg.port != _iobase || msg.type != MessageIOIn::TYPE_INL)  return false;
@@ -55,6 +60,7 @@ class PmTimer : public DiscoveryHelper<PmTimer>, public StaticReceiver<PmTimer>
   PmTimer(Motherboard &mb, unsigned iobase) : _mb(mb), _iobase(iobase) {
 
     _mb.bus_ioin.add(this,      receive_static<MessageIOIn>);
+    _mb.bus_ioin.add_iothread_callback(this, claim_static<MessageIOIn>);
     _mb.bus_discovery.add(this, discover);
   }
 };
diff --git a/model/vcpu.cc b/model/vcpu.cc
index fc46a018..c3a086aa 100644
--- a/model/vcpu.cc
+++ b/model/vcpu.cc
@@ -372,6 +372,7 @@ class VirtualCpu : public VCpu, public StaticReceiver<VirtualCpu>
   /**
    * Forward MEM requests to the motherboard.
    */
+  bool claim(MessageMem &msg) { /* The entire vCPU subsystem should be bypassing */ return true; }
   bool receive(MessageMem &msg) { return _mb.bus_mem.send(msg, true); }
   bool receive(MessageMemRegion &msg) { return _mb.bus_memregion.send(msg, true); }
 
@@ -423,6 +424,7 @@ class VirtualCpu : public VCpu, public StaticReceiver<VirtualCpu>
     return false;
   }
 
+  bool claim(CpuMessage &msg) { /* Entire vCPU subsystem should be bypassing */ return true; }
   bool receive(CpuMessage &msg) {
 
     if (msg.type == CpuMessage::TYPE_ADD_TSC_OFF) {
@@ -516,8 +518,10 @@ class VirtualCpu : public VCpu, public StaticReceiver<VirtualCpu>
 
     // add to the busses
     executor. add(this, VirtualCpu::receive_static<CpuMessage>);
+    executor.add_iothread_callback(this, VirtualCpu::claim_static<CpuMessage>);
     bus_event.add(this, VirtualCpu::receive_static<CpuEvent>);
     mem.      add(this, VirtualCpu::receive_static<MessageMem>);
+    mem.add_iothread_callback(this, VirtualCpu::claim_static<MessageMem>);
     memregion.add(this, VirtualCpu::receive_static<MessageMemRegion>);
     mb.bus_legacy.add(this, VirtualCpu::receive_static<MessageLegacy>);
     bus_lapic.add(this, VirtualCpu::receive_static<LapicEvent>);
diff --git a/model/vga.cc b/model/vga.cc
index 44c9d3fa..a1a18a46 100644
--- a/model/vga.cc
+++ b/model/vga.cc
@@ -5,6 +5,7 @@
  * Economic rights: Technische Universitaet Dresden (Germany)
  *
  * Copyright (C) 2013 Jacek Galowicz, Intel Corporation.
+ * Copyright (C) 2013 Markus Partheymueller, Intel Corporation.
  *
  * This file is part of Vancouver.
  *
@@ -479,7 +480,10 @@ class Vga : public StaticReceiver<Vga>, public BiosCommon
     return res;
   }
 
-
+  bool  claim(MessageMem &msg)
+  {
+    return ((in_range(msg.phys, _framebuffer_phys, _framebuffer_size)) || (in_range(msg.phys, LOW_BASE, LOW_SIZE)));
+  }
   bool  receive(MessageMem &msg)
   {
     unsigned *ptr;
@@ -632,6 +636,7 @@ PARAM_HANDLER(vga,
   mb.bus_ioout    .add(dev, Vga::receive_static<MessageIOOut>);
   mb.bus_bios     .add(dev, Vga::receive_static<MessageBios>);
   mb.bus_mem      .add(dev, Vga::receive_static<MessageMem>);
+  mb.bus_mem.add_iothread_callback(dev, Vga::claim_static<MessageMem>);
   mb.bus_memregion.add(dev, Vga::receive_static<MessageMemRegion>);
   mb.bus_discovery.add(dev, Vga::receive_static<MessageDiscovery>);
   mb.bus_restore.add(dev, Vga::receive_static<MessageRestore>);
diff --git a/nre/src/IOThread.cc b/nre/src/IOThread.cc
index 232f4ce1..4d7147c1 100644
--- a/nre/src/IOThread.cc
+++ b/nre/src/IOThread.cc
@@ -139,7 +139,6 @@ bool IOThread::enqueue(MessageTimer &msg, MessageIOThread::Mode mode, MessageIOT
    * Because they are a result of an earlier message, timeout requests should never be enqueued.
    */
   if (msg.type == MessageTimer::TIMER_NEW) sync = MessageIOThread::SYNC_SYNC;
-  else Logging::panic("MessageTimer request nr %u\n", msg.nr);
   MessageTimer *ptr;
   if (sync == MessageIOThread::SYNC_ASYNC) {
     ptr = new MessageTimer;
diff --git a/nre/src/IOThread.h b/nre/src/IOThread.h
index 2670e5d2..3b8a61b2 100644
--- a/nre/src/IOThread.h
+++ b/nre/src/IOThread.h
@@ -94,6 +94,5 @@ class IOThread : public StaticReceiver<IOThread>, public nre::SListItem {
     mb->bus_legacy.set_iothread_enqueue(this, enqueue_static<MessageLegacy>);
     mb->bus_network.set_iothread_enqueue(this, enqueue_static<MessageNetwork>);
     mb->bus_pcicfg.set_iothread_enqueue(this, enqueue_static<MessagePciConfig>);
-    mb->bus_hostop.set_iothread_enqueue(this, enqueue_static<MessageHostOp>);
   }
 };