From 284dacc120476b3ff7ea8cfe72efee438222a5c6 Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Thu, 1 Mar 2018 01:07:22 +0100 Subject: [PATCH 01/11] Detect and force-unlock stale locks. This change fixes the issue of orphaned lock files, which Terraform leaves behind in some scenarios. Such lock files had to be removed manually, either by means of a force-unlock or by deleting the file in GCS. The "updated" timestamp of the lock file is used as an indicator of staleness. This timestamp is updated once per minute as long as the lock is held. A lock file is considered stale and is force-unlocked if its timestamp hasn't been updated for several minutes. --- backend/remote-state/gcs/backend_test.go | 65 +++++++++++++++++++ backend/remote-state/gcs/client.go | 82 +++++++++++++++++++++--- 2 files changed, 138 insertions(+), 9 deletions(-) diff --git a/backend/remote-state/gcs/backend_test.go b/backend/remote-state/gcs/backend_test.go index dd44d919a2b0..b48fe8fd9380 100644 --- a/backend/remote-state/gcs/backend_test.go +++ b/backend/remote-state/gcs/backend_test.go @@ -10,6 +10,7 @@ import ( "cloud.google.com/go/storage" "github.com/hashicorp/terraform/backend" + "github.com/hashicorp/terraform/state" "github.com/hashicorp/terraform/state/remote" ) @@ -139,6 +140,7 @@ func TestBackend(t *testing.T) { backend.TestBackendStates(t, be0) backend.TestBackendStateLocks(t, be0, be1) backend.TestBackendStateForceUnlock(t, be0, be1) + testStaleLocks(t, be0, be1) } func TestBackendWithPrefix(t *testing.T) { @@ -169,6 +171,69 @@ func TestBackendWithEncryption(t *testing.T) { backend.TestBackendStateLocks(t, be0, be1) } +func testStaleLocks(t *testing.T, b1, b2 backend.Backend) { + t.Helper() + + // Get the default state for each + b1StateMgr, err := b1.State(backend.DefaultStateName) + if err != nil { + t.Fatalf("error: %s", err) + } + if err := b1StateMgr.RefreshState(); err != nil { + t.Fatalf("bad: %s", err) + } + + b2StateMgr, err := b2.State(backend.DefaultStateName) + if err != nil { + t.Fatalf("error: %s", err) + } + if err := b2StateMgr.RefreshState(); err != nil { + t.Fatalf("bad: %s", err) + } + + // Reassign so its obvious whats happening + lockerA := b1StateMgr.(state.Locker) + lockerB := b2StateMgr.(state.Locker) + + infoA := state.NewLockInfo() + infoA.Operation = "test" + infoA.Who = "clientA" + + infoB := state.NewLockInfo() + infoB.Operation = "test" + infoB.Who = "clientB" + + // Reduce tick interval for faster tests + tickInterval = 5 * time.Second + + lockIDA, err := lockerA.Lock(infoA) + if err != nil { + t.Fatal("unable to get initial lock:", err) + } + + // stop updating the "updated" timestamp. Eventually, the lock will become stale. + lockerA.(*remote.State).Client.(*remoteClient).ticker.Stop() + + // lock is still held by A after 10 seconds. + time.Sleep(10 * time.Second) + _, err = lockerB.Lock(infoB) + if err == nil { + lockerA.Unlock(lockIDA) + t.Fatal("client B obtained lock while held by client A") + } + + // wait a bit longer and the lock will become stale. + time.Sleep(20 * time.Second) + lockIDB, err := lockerB.Lock(infoB) + if err != nil { + t.Fatal("client B failed to obtain lock that was previously held by client A but that went stale") + } + + if err := lockerB.Unlock(lockIDB); err != nil { + t.Fatal("error unlocking client B", err) + } +} + // setupBackend returns a new GCS backend. 
func setupBackend(t *testing.T, bucket, prefix, key string) backend.Backend { t.Helper() diff --git a/backend/remote-state/gcs/client.go b/backend/remote-state/gcs/client.go index 930802b01724..8e4b5768e8b9 100644 --- a/backend/remote-state/gcs/client.go +++ b/backend/remote-state/gcs/client.go @@ -4,7 +4,9 @@ import ( "encoding/json" "fmt" "io/ioutil" + "log" "strconv" + "time" "cloud.google.com/go/storage" multierror "github.com/hashicorp/go-multierror" @@ -23,8 +25,12 @@ type remoteClient struct { stateFilePath string lockFilePath string encryptionKey []byte + ticker *time.Ticker } +var tickInterval = 1 * time.Minute +var minMissesUntilStale = 4 + func (c *remoteClient) Get() (payload *remote.Payload, err error) { stateFileReader, err := c.stateFile().NewReader(c.storageContext) if err != nil { @@ -77,6 +83,58 @@ func (c *remoteClient) Delete() error { return nil } +// createLockFile writes to a lock file, ensuring file creation. Returns the +// generation number. +func (c *remoteClient) createLockFile(lockFile *storage.ObjectHandle, infoJson []byte) (int64, error) { + w := lockFile.If(storage.Conditions{DoesNotExist: true}).NewWriter(c.storageContext) + if _, err := w.Write(infoJson); err != nil { + return 0, err + } + if err := w.Close(); err != nil { + return 0, err + } + + return w.Attrs().Generation, nil +} + +// unlockIfStale force-unlocks the lock file if it is stale. Returns true if a +// stale lock was removed (and therefore a retry makes sense), otherwise false. +func (c *remoteClient) unlockIfStale(lockFile *storage.ObjectHandle) bool { + if attrs, err := lockFile.Attrs(c.storageContext); err == nil { + age := time.Now().Sub(attrs.Updated) + if age > time.Duration(int64(minMissesUntilStale)*tickInterval.Nanoseconds()) { + if err := c.Unlock(strconv.FormatInt(attrs.Generation, 10)); err != nil { + log.Printf("[WARN] failed to release stale lock: %s", err) + } else { + return true + } + } + } + + return false +} + +// updateLockFile periodically updates the "updated" timestamp of the lock +// file until the lock is released in Unlock(). +func (c *remoteClient) updateLockFile(lockFile *storage.ObjectHandle, generation int64) { + for _ = range c.ticker.C { + if attrs, err := lockFile.Attrs(c.storageContext); err == nil { + if attrs.Generation != generation { + // This is no longer the lock file that we started with. Stop updating. + c.ticker.Stop() + return + } + + // Update the "updated" timestamp by removing non-existent metadata. + uattrs := storage.ObjectAttrsToUpdate{Metadata: make(map[string]string)} + uattrs.Metadata["x-goog-meta-terraform-state-heartbeat"] = "" + if _, err := lockFile.Update(c.storageContext, uattrs); err != nil { + log.Printf("[WARN] failed to update lock: %s", err) + } + } + } +} + // Lock writes to a lock file, ensuring file creation. Returns the generation // number, which must be passed to Unlock(). 
func (c *remoteClient) Lock(info *state.LockInfo) (string, error) { @@ -90,19 +148,21 @@ func (c *remoteClient) Lock(info *state.LockInfo) (string, error) { } lockFile := c.lockFile() - w := lockFile.If(storage.Conditions{DoesNotExist: true}).NewWriter(c.storageContext) - err = func() error { - if _, err := w.Write(infoJson); err != nil { - return err - } - return w.Close() - }() + generation, err := c.createLockFile(lockFile, infoJson) if err != nil { - return "", c.lockError(fmt.Errorf("writing %q failed: %v", c.lockFileURL(), err)) + if c.unlockIfStale(lockFile) { + generation, err = c.createLockFile(lockFile, infoJson) + } + } + if err != nil { + return "", c.lockError(fmt.Errorf("failed to create lock file %q: %v", c.lockFileURL(), err)) } - info.ID = strconv.FormatInt(w.Attrs().Generation, 10) + info.ID = strconv.FormatInt(generation, 10) + + c.ticker = time.NewTicker(tickInterval) + go c.updateLockFile(lockFile, generation) return info.ID, nil } @@ -113,6 +173,10 @@ func (c *remoteClient) Unlock(id string) error { return err } + if c.ticker != nil { + c.ticker.Stop() + } + if err := c.lockFile().If(storage.Conditions{GenerationMatch: gen}).Delete(c.storageContext); err != nil { return c.lockError(err) } From c4ec04b7182141d0534c5876e881914c0c3f5496 Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Wed, 20 Jun 2018 17:26:09 +0200 Subject: [PATCH 02/11] Addressed review comments. In particular: - added a mutex to remoteClient to prevent concurrent modifications - refactored the background heartbeating - added/improved log messages --- backend/remote-state/gcs/backend_test.go | 16 ++-- backend/remote-state/gcs/client.go | 95 ++++++++++++++++++------ 2 files changed, 82 insertions(+), 29 deletions(-) diff --git a/backend/remote-state/gcs/backend_test.go b/backend/remote-state/gcs/backend_test.go index b48fe8fd9380..2e445cc8df5d 100644 --- a/backend/remote-state/gcs/backend_test.go +++ b/backend/remote-state/gcs/backend_test.go @@ -156,7 +156,9 @@ func TestBackendWithPrefix(t *testing.T) { backend.TestBackendStates(t, be0) backend.TestBackendStateLocks(t, be0, be1) + testStaleLocks(t, be0, be1) } + func TestBackendWithEncryption(t *testing.T) { t.Parallel() @@ -169,6 +171,7 @@ func TestBackendWithEncryption(t *testing.T) { backend.TestBackendStates(t, be0) backend.TestBackendStateLocks(t, be0, be1) + testStaleLocks(t, be0, be1) } func testStaleLocks(t *testing.T, b1, b2 backend.Backend) { @@ -203,18 +206,19 @@ func testStaleLocks(t *testing.T, b1, b2 backend.Backend) { infoB.Operation = "test" infoB.Who = "clientB" - // Reduce tick interval for faster tests - tickInterval = 5 * time.Second + // For faster tests, reduce the duration until the lock is considered stale. + heartbeatInterval = 5 * time.Second + minHeartbeatAgeUntilStale = 20 * time.Second lockIDA, err := lockerA.Lock(infoA) if err != nil { t.Fatal("unable to get initial lock:", err) } - // stop updating the "updated" timestamp. Eventually, the lock will become stale. - lockerA.(*remote.State).Client.(*remoteClient).ticker.Stop() + // Stop heartbeating on the lock file. It will be considered stale after minHeartbeatAgeUntilStale. + lockerA.(*remote.State).Client.(*remoteClient).stopHeartbeatCh <- true - // lock is still held by A after 10 seconds. + // Lock is still held by A after 10 seconds. 
time.Sleep(10 * time.Second) _, err = lockerB.Lock(infoB) if err == nil { @@ -222,7 +226,7 @@ func testStaleLocks(t *testing.T, b1, b2 backend.Backend) { t.Fatal("client B obtained lock while held by client A") } - // wait a bit longer and the lock will become stale. + // Wait a bit longer, and the lock will become stale. time.Sleep(20 * time.Second) lockIDB, err := lockerB.Lock(infoB) if err != nil { diff --git a/backend/remote-state/gcs/client.go b/backend/remote-state/gcs/client.go index 8e4b5768e8b9..6e3bcd442aae 100644 --- a/backend/remote-state/gcs/client.go +++ b/backend/remote-state/gcs/client.go @@ -6,6 +6,7 @@ import ( "io/ioutil" "log" "strconv" + "sync" "time" "cloud.google.com/go/storage" @@ -19,17 +20,31 @@ import ( // blobs representing state. // Implements "state/remote".ClientLocker type remoteClient struct { + mutex sync.Mutex + storageContext context.Context storageClient *storage.Client bucketName string stateFilePath string lockFilePath string encryptionKey []byte - ticker *time.Ticker + + // The initial generation number of the lock file created by this + // remoteClient. + generation *int64 + + // Channel used for signalling the lock-heartbeating goroutine to stop. + stopHeartbeatCh chan bool } -var tickInterval = 1 * time.Minute -var minMissesUntilStale = 4 +var ( + // Time between consecutive heartbeats on the lock file. + heartbeatInterval = 1 * time.Minute + + // The mininum duration that must have passed since the youngest + // recorded heartbeat before the lock file is considered stale/orphaned. + minHeartbeatAgeUntilStale = 15 * time.Minute +) func (c *remoteClient) Get() (payload *remote.Payload, err error) { stateFileReader, err := c.stateFile().NewReader(c.storageContext) @@ -102,9 +117,10 @@ func (c *remoteClient) createLockFile(lockFile *storage.ObjectHandle, infoJson [ func (c *remoteClient) unlockIfStale(lockFile *storage.ObjectHandle) bool { if attrs, err := lockFile.Attrs(c.storageContext); err == nil { age := time.Now().Sub(attrs.Updated) - if age > time.Duration(int64(minMissesUntilStale)*tickInterval.Nanoseconds()) { + if age > minHeartbeatAgeUntilStale { + log.Printf("[WARN] Existing lock file %s is considered stale, last heartbeat was %s ago", c.lockFileURL(), age) if err := c.Unlock(strconv.FormatInt(attrs.Generation, 10)); err != nil { - log.Printf("[WARN] failed to release stale lock: %s", err) + log.Printf("[WARN] Failed to release stale lock: %s", err) } else { return true } @@ -114,22 +130,46 @@ func (c *remoteClient) unlockIfStale(lockFile *storage.ObjectHandle) bool { return false } -// updateLockFile periodically updates the "updated" timestamp of the lock +// heartbeatLockFile periodically updates the "updated" timestamp of the lock // file until the lock is released in Unlock(). -func (c *remoteClient) updateLockFile(lockFile *storage.ObjectHandle, generation int64) { - for _ = range c.ticker.C { - if attrs, err := lockFile.Attrs(c.storageContext); err == nil { - if attrs.Generation != generation { - // This is no longer the lock file that we started with. Stop updating. - c.ticker.Stop() - return - } +func (c *remoteClient) heartbeatLockFile() { + log.Printf("[TRACE] Starting heartbeat on lock file %s, interval is %s", c.lockFileURL(), heartbeatInterval) + + ticker := time.NewTicker(heartbeatInterval) + defer ticker.Stop() - // Update the "updated" timestamp by removing non-existent metadata. 
- uattrs := storage.ObjectAttrsToUpdate{Metadata: make(map[string]string)} - uattrs.Metadata["x-goog-meta-terraform-state-heartbeat"] = "" - if _, err := lockFile.Update(c.storageContext, uattrs); err != nil { - log.Printf("[WARN] failed to update lock: %s", err) + defer func() { + c.mutex.Lock() + c.stopHeartbeatCh = nil + c.mutex.Unlock() + }() + + for { + select { + case <-c.stopHeartbeatCh: + log.Printf("[TRACE] Stopping heartbeat on lock file %s", c.lockFileURL()) + return + case t := <-ticker.C: + log.Printf("[TRACE] Performing heartbeat on lock file %s, tick %q", c.lockFileURL(), t) + if attrs, err := c.lockFile().Attrs(c.storageContext); err != nil { + log.Printf("[WARN] Failed to retrieve attributes of lock file %s: %s", c.lockFileURL(), err) + } else { + c.mutex.Lock() + generation := *c.generation + c.mutex.Unlock() + + if attrs.Generation != generation { + // This is no longer the lock file that we started with. Stop heartbeating on it. + log.Printf("[WARN] Stopping heartbeat on lock file %s as it changed underneath.", c.lockFileURL()) + return + } + + // Update the "updated" timestamp by removing non-existent metadata. + uattrs := storage.ObjectAttrsToUpdate{Metadata: make(map[string]string)} + uattrs.Metadata["x-goog-meta-terraform-state-heartbeat"] = "" + if _, err := c.lockFile().Update(c.storageContext, uattrs); err != nil { + log.Printf("[WARN] Failed to perform heartbeat on lock file %s: %s", c.lockFileURL(), err) + } } } } @@ -161,8 +201,13 @@ func (c *remoteClient) Lock(info *state.LockInfo) (string, error) { info.ID = strconv.FormatInt(generation, 10) - c.ticker = time.NewTicker(tickInterval) - go c.updateLockFile(lockFile, generation) + c.mutex.Lock() + defer c.mutex.Unlock() + + c.generation = &generation + c.stopHeartbeatCh = make(chan bool) + + go c.heartbeatLockFile() return info.ID, nil } @@ -173,8 +218,12 @@ func (c *remoteClient) Unlock(id string) error { return err } - if c.ticker != nil { - c.ticker.Stop() + c.mutex.Lock() + defer c.mutex.Unlock() + + if c.stopHeartbeatCh != nil { + log.Printf("[TRACE] Stopping heartbeat on lock file %s before removing the lock", c.lockFileURL()) + c.stopHeartbeatCh <- true } if err := c.lockFile().If(storage.Conditions{GenerationMatch: gen}).Delete(c.storageContext); err != nil { From e9df8f32a47000efb1652ff672bfc61c07737546 Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Wed, 20 Jun 2018 18:21:47 +0200 Subject: [PATCH 03/11] Reduce diff by undoing a few lines of refactorings. --- backend/remote-state/gcs/client.go | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/backend/remote-state/gcs/client.go b/backend/remote-state/gcs/client.go index 6e3bcd442aae..828712c35afc 100644 --- a/backend/remote-state/gcs/client.go +++ b/backend/remote-state/gcs/client.go @@ -102,11 +102,15 @@ func (c *remoteClient) Delete() error { // generation number. 
func (c *remoteClient) createLockFile(lockFile *storage.ObjectHandle, infoJson []byte) (int64, error) { w := lockFile.If(storage.Conditions{DoesNotExist: true}).NewWriter(c.storageContext) - if _, err := w.Write(infoJson); err != nil { - return 0, err - } - if err := w.Close(); err != nil { - return 0, err + err := func() error { + if _, err := w.Write(infoJson); err != nil { + return err + } + return w.Close() + }() + + if err != nil { + return 0, c.lockError(fmt.Errorf("writing %q failed: %v", c.lockFileURL(), err)) } return w.Attrs().Generation, nil From a9ae77244cc9a457111274d67c084e9d8afdc6de Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Thu, 21 Jun 2018 02:15:56 +0200 Subject: [PATCH 04/11] Facilitate a safe migration path. Do not force-unlock locks created by clients that don't perform heartbeating on the lock file. --- backend/remote-state/gcs/client.go | 37 ++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/backend/remote-state/gcs/client.go b/backend/remote-state/gcs/client.go index 828712c35afc..8f2ffcc3525e 100644 --- a/backend/remote-state/gcs/client.go +++ b/backend/remote-state/gcs/client.go @@ -37,6 +37,14 @@ type remoteClient struct { stopHeartbeatCh chan bool } +// Name of the metadata header on the lock file which indicates that the lock +// file has been created by a client which is supposed to periodically perform +// heartbeats on it. This header facilitates a safe migration from previous +// Terraform versions that do not yet perform any heartbeats on the lock file. +// A lock file will only be considered stale and force-unlocked if it's age +// exceeds minHeartbeatAgeUntilStale AND this metadata header is present. +const metadataHeaderHeartbeatEnabled = "x-google-lock-file-uses-heartbeating" + var ( // Time between consecutive heartbeats on the lock file. heartbeatInterval = 1 * time.Minute @@ -113,13 +121,42 @@ func (c *remoteClient) createLockFile(lockFile *storage.ObjectHandle, infoJson [ return 0, c.lockError(fmt.Errorf("writing %q failed: %v", c.lockFileURL(), err)) } + // Add metadata signalling to other clients that heartbeats will be + // performed on this lock file. + uattrs := storage.ObjectAttrsToUpdate{Metadata: make(map[string]string)} + uattrs.Metadata[metadataHeaderHeartbeatEnabled] = "true" + if _, err := lockFile.Update(c.storageContext, uattrs); err != nil { + return 0, c.lockError(err) + } + return w.Attrs().Generation, nil } +func isHeartbeatEnabled(attrs *storage.ObjectAttrs) bool { + if attrs.Metadata != nil { + if val, ok := attrs.Metadata[metadataHeaderHeartbeatEnabled]; ok { + if val == "true" { + return true + } + } + } + + return false +} + // unlockIfStale force-unlocks the lock file if it is stale. Returns true if a // stale lock was removed (and therefore a retry makes sense), otherwise false. func (c *remoteClient) unlockIfStale(lockFile *storage.ObjectHandle) bool { if attrs, err := lockFile.Attrs(c.storageContext); err == nil { + if !isHeartbeatEnabled(attrs) { + // Metadata header metadataHeaderHeartbeatEnabled is + // not present, thus this lock file is owned by an + // older client that does not perform heartbeats on the + // lock file. Therefore, we cannot be sure whether the + // lock file might be stale. Better don't force-unlock! 
+ log.Printf("[TRACE] Found existing lock file %s from an older client that does not perform heartbeats", c.lockFileURL()) + return false + } age := time.Now().Sub(attrs.Updated) if age > minHeartbeatAgeUntilStale { log.Printf("[WARN] Existing lock file %s is considered stale, last heartbeat was %s ago", c.lockFileURL(), age) From 9f6dcf300c28eda3c5262856f5be1e73c5c8b590 Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Thu, 21 Jun 2018 12:03:25 +0200 Subject: [PATCH 05/11] Fix typo. --- backend/remote-state/gcs/client.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/remote-state/gcs/client.go b/backend/remote-state/gcs/client.go index 8f2ffcc3525e..c9aec599d519 100644 --- a/backend/remote-state/gcs/client.go +++ b/backend/remote-state/gcs/client.go @@ -41,7 +41,7 @@ type remoteClient struct { // file has been created by a client which is supposed to periodically perform // heartbeats on it. This header facilitates a safe migration from previous // Terraform versions that do not yet perform any heartbeats on the lock file. -// A lock file will only be considered stale and force-unlocked if it's age +// A lock file will only be considered stale and force-unlocked if its age // exceeds minHeartbeatAgeUntilStale AND this metadata header is present. const metadataHeaderHeartbeatEnabled = "x-google-lock-file-uses-heartbeating" From 6e6040ad4dd982c65f426371ca53561372e843d0 Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Thu, 21 Jun 2018 23:42:57 +0200 Subject: [PATCH 06/11] Use the well-known x-goog-meta prefix for metadata headers. --- backend/remote-state/gcs/client.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/remote-state/gcs/client.go b/backend/remote-state/gcs/client.go index c9aec599d519..0c96e95f6129 100644 --- a/backend/remote-state/gcs/client.go +++ b/backend/remote-state/gcs/client.go @@ -43,7 +43,7 @@ type remoteClient struct { // Terraform versions that do not yet perform any heartbeats on the lock file. // A lock file will only be considered stale and force-unlocked if its age // exceeds minHeartbeatAgeUntilStale AND this metadata header is present. -const metadataHeaderHeartbeatEnabled = "x-google-lock-file-uses-heartbeating" +const metadataHeaderHeartbeatEnabled = "x-goog-meta-heartbeating" var ( // Time between consecutive heartbeats on the lock file. From 111fe6d4c1da43e14e38ff54a6298a7939e3b006 Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Thu, 21 Jun 2018 23:49:32 +0200 Subject: [PATCH 07/11] No need to pass around the lock file handle as a parameter. --- backend/remote-state/gcs/client.go | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/backend/remote-state/gcs/client.go b/backend/remote-state/gcs/client.go index 0c96e95f6129..db6d5b74d497 100644 --- a/backend/remote-state/gcs/client.go +++ b/backend/remote-state/gcs/client.go @@ -108,8 +108,8 @@ func (c *remoteClient) Delete() error { // createLockFile writes to a lock file, ensuring file creation. Returns the // generation number. 
-func (c *remoteClient) createLockFile(lockFile *storage.ObjectHandle, infoJson []byte) (int64, error) { - w := lockFile.If(storage.Conditions{DoesNotExist: true}).NewWriter(c.storageContext) +func (c *remoteClient) createLockFile(infoJson []byte) (int64, error) { + w := c.lockFile().If(storage.Conditions{DoesNotExist: true}).NewWriter(c.storageContext) err := func() error { if _, err := w.Write(infoJson); err != nil { return err @@ -125,7 +125,7 @@ func (c *remoteClient) createLockFile(lockFile *storage.ObjectHandle, infoJson [ // performed on this lock file. uattrs := storage.ObjectAttrsToUpdate{Metadata: make(map[string]string)} uattrs.Metadata[metadataHeaderHeartbeatEnabled] = "true" - if _, err := lockFile.Update(c.storageContext, uattrs); err != nil { + if _, err := c.lockFile().Update(c.storageContext, uattrs); err != nil { return 0, c.lockError(err) } @@ -146,8 +146,8 @@ func isHeartbeatEnabled(attrs *storage.ObjectAttrs) bool { // unlockIfStale force-unlocks the lock file if it is stale. Returns true if a // stale lock was removed (and therefore a retry makes sense), otherwise false. -func (c *remoteClient) unlockIfStale(lockFile *storage.ObjectHandle) bool { - if attrs, err := lockFile.Attrs(c.storageContext); err == nil { +func (c *remoteClient) unlockIfStale() bool { + if attrs, err := c.lockFile().Attrs(c.storageContext); err == nil { if !isHeartbeatEnabled(attrs) { // Metadata header metadataHeaderHeartbeatEnabled is // not present, thus this lock file is owned by an @@ -228,12 +228,10 @@ func (c *remoteClient) Lock(info *state.LockInfo) (string, error) { return "", err } - lockFile := c.lockFile() - - generation, err := c.createLockFile(lockFile, infoJson) + generation, err := c.createLockFile(infoJson) if err != nil { - if c.unlockIfStale(lockFile) { - generation, err = c.createLockFile(lockFile, infoJson) + if c.unlockIfStale() { + generation, err = c.createLockFile(infoJson) } } if err != nil { From e536d078cf48c7e23713b833b3327957e34060f6 Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Fri, 22 Jun 2018 15:31:07 +0200 Subject: [PATCH 08/11] Use the OAuth scope that is required for storage.objects.update. --- backend/remote-state/gcs/backend.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/remote-state/gcs/backend.go b/backend/remote-state/gcs/backend.go index 26b430b41c7b..9be36be3e9e0 100644 --- a/backend/remote-state/gcs/backend.go +++ b/backend/remote-state/gcs/backend.go @@ -147,13 +147,13 @@ func (b *Backend) configure(ctx context.Context) error { conf := jwt.Config{ Email: account.ClientEmail, PrivateKey: []byte(account.PrivateKey), - Scopes: []string{storage.ScopeReadWrite}, + Scopes: []string{storage.ScopeFullControl}, TokenURL: "https://accounts.google.com/o/oauth2/token", } opts = append(opts, option.WithHTTPClient(conf.Client(ctx))) } else { - opts = append(opts, option.WithScopes(storage.ScopeReadWrite)) + opts = append(opts, option.WithScopes(storage.ScopeFullControl)) } opts = append(opts, option.WithUserAgent(httpclient.UserAgentString())) From eccf30c09530c7221512d723de9de6ac19988523 Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Tue, 11 Sep 2018 23:34:44 +0200 Subject: [PATCH 09/11] Set the metadata header when creating the lock file, not afterwards in a separate call. 
--- backend/remote-state/gcs/client.go | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/backend/remote-state/gcs/client.go b/backend/remote-state/gcs/client.go index db6d5b74d497..5381ffe1afb4 100644 --- a/backend/remote-state/gcs/client.go +++ b/backend/remote-state/gcs/client.go @@ -111,6 +111,10 @@ func (c *remoteClient) Delete() error { func (c *remoteClient) createLockFile(infoJson []byte) (int64, error) { w := c.lockFile().If(storage.Conditions{DoesNotExist: true}).NewWriter(c.storageContext) err := func() error { + // Add metadata signalling to other clients that heartbeats will be + // performed on this lock file. + w.ObjectAttrs.Metadata = map[string]string{metadataHeaderHeartbeatEnabled: "true"} + if _, err := w.Write(infoJson); err != nil { return err } @@ -121,14 +125,6 @@ func (c *remoteClient) createLockFile(infoJson []byte) (int64, error) { return 0, c.lockError(fmt.Errorf("writing %q failed: %v", c.lockFileURL(), err)) } - // Add metadata signalling to other clients that heartbeats will be - // performed on this lock file. - uattrs := storage.ObjectAttrsToUpdate{Metadata: make(map[string]string)} - uattrs.Metadata[metadataHeaderHeartbeatEnabled] = "true" - if _, err := c.lockFile().Update(c.storageContext, uattrs); err != nil { - return 0, c.lockError(err) - } - return w.Attrs().Generation, nil } From 6c8b42a812ca79298e0c982295ffc3c81f3ab4ef Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Mon, 5 Nov 2018 12:33:15 +0100 Subject: [PATCH 10/11] Get backend tests building again after the state manager refactoring --- backend/remote-state/gcs/backend_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/remote-state/gcs/backend_test.go b/backend/remote-state/gcs/backend_test.go index 2e445cc8df5d..f54aa1353b2f 100644 --- a/backend/remote-state/gcs/backend_test.go +++ b/backend/remote-state/gcs/backend_test.go @@ -178,7 +178,7 @@ func testStaleLocks(t *testing.T, b1, b2 backend.Backend) { t.Helper() // Get the default state for each - b1StateMgr, err := b1.State(backend.DefaultStateName) + b1StateMgr, err := b1.StateMgr(backend.DefaultStateName) if err != nil { t.Fatalf("error: %s", err) } @@ -186,7 +186,7 @@ func testStaleLocks(t *testing.T, b1, b2 backend.Backend) { t.Fatalf("bad: %s", err) } - b2StateMgr, err := b2.State(backend.DefaultStateName) + b2StateMgr, err := b2.StateMgr(backend.DefaultStateName) if err != nil { t.Fatalf("error: %s", err) } From ab8d83be5750bcf0001be8808826b457e34ea1d7 Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Mon, 5 Nov 2018 16:34:52 +0100 Subject: [PATCH 11/11] Introduce configuration knobs for determining the staleness of a lock file. --- backend/remote-state/gcs/backend.go | 32 +++++++++++++++++++++ backend/remote-state/gcs/backend_state.go | 14 +++++---- backend/remote-state/gcs/backend_test.go | 6 ++-- backend/remote-state/gcs/client.go | 35 +++++++++-------------- website/docs/backends/types/gcs.html.md | 2 ++ 5 files changed, 59 insertions(+), 30 deletions(-) diff --git a/backend/remote-state/gcs/backend.go b/backend/remote-state/gcs/backend.go index 9be36be3e9e0..c3b25cc23da8 100644 --- a/backend/remote-state/gcs/backend.go +++ b/backend/remote-state/gcs/backend.go @@ -8,6 +8,7 @@ import ( "fmt" "os" "strings" + "time" "cloud.google.com/go/storage" "github.com/hashicorp/terraform/backend" @@ -35,6 +36,13 @@ type Backend struct { projectID string region string + + // Time between consecutive heartbeats on the lock file. 
+ lockHeartbeatInterval time.Duration + + // The mininum duration that must have passed since the youngest + // recorded heartbeat before the lock file is considered stale/orphaned. + lockStaleAfter time.Duration } func New() backend.Backend { @@ -88,6 +96,20 @@ func New() backend.Backend { Description: "Region / location in which to create the bucket", Default: "", }, + + "lock_heartbeat_interval": { + Type: schema.TypeString, + Optional: true, + Description: "Time between consecutive heartbeats on the lock file as a duration string (cf. https://golang.org/pkg/time/#ParseDuration).", + Default: "1m", + }, + + "lock_stale_after": { + Type: schema.TypeString, + Optional: true, + Description: "Mininum duration (cf. https://golang.org/pkg/time/#ParseDuration) that must have passed since the youngest recorded heartbeat before the lock file is considered stale/orphaned.", + Default: "15m", + }, }, } @@ -188,6 +210,16 @@ func (b *Backend) configure(ctx context.Context) error { b.encryptionKey = k } + b.lockHeartbeatInterval, err = time.ParseDuration(data.Get("lock_heartbeat_interval").(string)) + if err != nil { + return fmt.Errorf("Error parsing lock_heartbeat_interval: %s", err) + } + + b.lockStaleAfter, err = time.ParseDuration(data.Get("lock_stale_after").(string)) + if err != nil { + return fmt.Errorf("Error parsing lock_stale_after: %s", err) + } + return nil } diff --git a/backend/remote-state/gcs/backend_state.go b/backend/remote-state/gcs/backend_state.go index 835ad96a7629..71c60ef73b46 100644 --- a/backend/remote-state/gcs/backend_state.go +++ b/backend/remote-state/gcs/backend_state.go @@ -75,12 +75,14 @@ func (b *Backend) client(name string) (*remoteClient, error) { } return &remoteClient{ - storageContext: b.storageContext, - storageClient: b.storageClient, - bucketName: b.bucketName, - stateFilePath: b.stateFile(name), - lockFilePath: b.lockFile(name), - encryptionKey: b.encryptionKey, + storageContext: b.storageContext, + storageClient: b.storageClient, + bucketName: b.bucketName, + stateFilePath: b.stateFile(name), + lockFilePath: b.lockFile(name), + encryptionKey: b.encryptionKey, + lockHeartbeatInterval: b.lockHeartbeatInterval, + lockStaleAfter: b.lockStaleAfter, }, nil } diff --git a/backend/remote-state/gcs/backend_test.go b/backend/remote-state/gcs/backend_test.go index f54aa1353b2f..d68d533554da 100644 --- a/backend/remote-state/gcs/backend_test.go +++ b/backend/remote-state/gcs/backend_test.go @@ -207,15 +207,15 @@ func testStaleLocks(t *testing.T, b1, b2 backend.Backend) { infoB.Who = "clientB" // For faster tests, reduce the duration until the lock is considered stale. - heartbeatInterval = 5 * time.Second - minHeartbeatAgeUntilStale = 20 * time.Second + lockerB.(*remote.State).Client.(*remoteClient).lockHeartbeatInterval = 5 * time.Second + lockerB.(*remote.State).Client.(*remoteClient).lockStaleAfter = 20 * time.Second lockIDA, err := lockerA.Lock(infoA) if err != nil { t.Fatal("unable to get initial lock:", err) } - // Stop heartbeating on the lock file. It will be considered stale after minHeartbeatAgeUntilStale. + // Stop heartbeating on the lock file. It will be considered stale after lockStaleAfter. lockerA.(*remote.State).Client.(*remoteClient).stopHeartbeatCh <- true // Lock is still held by A after 10 seconds. 
diff --git a/backend/remote-state/gcs/client.go b/backend/remote-state/gcs/client.go index 5381ffe1afb4..52d446f9012b 100644 --- a/backend/remote-state/gcs/client.go +++ b/backend/remote-state/gcs/client.go @@ -10,7 +10,7 @@ import ( "time" "cloud.google.com/go/storage" - multierror "github.com/hashicorp/go-multierror" + "github.com/hashicorp/go-multierror" "github.com/hashicorp/terraform/state" "github.com/hashicorp/terraform/state/remote" "golang.org/x/net/context" @@ -22,12 +22,14 @@ import ( type remoteClient struct { mutex sync.Mutex - storageContext context.Context - storageClient *storage.Client - bucketName string - stateFilePath string - lockFilePath string - encryptionKey []byte + storageContext context.Context + storageClient *storage.Client + bucketName string + stateFilePath string + lockFilePath string + encryptionKey []byte + lockHeartbeatInterval time.Duration + lockStaleAfter time.Duration // The initial generation number of the lock file created by this // remoteClient. @@ -42,18 +44,9 @@ type remoteClient struct { // heartbeats on it. This header facilitates a safe migration from previous // Terraform versions that do not yet perform any heartbeats on the lock file. // A lock file will only be considered stale and force-unlocked if its age -// exceeds minHeartbeatAgeUntilStale AND this metadata header is present. +// exceeds lockStaleAfter AND this metadata header is present. const metadataHeaderHeartbeatEnabled = "x-goog-meta-heartbeating" -var ( - // Time between consecutive heartbeats on the lock file. - heartbeatInterval = 1 * time.Minute - - // The mininum duration that must have passed since the youngest - // recorded heartbeat before the lock file is considered stale/orphaned. - minHeartbeatAgeUntilStale = 15 * time.Minute -) - func (c *remoteClient) Get() (payload *remote.Payload, err error) { stateFileReader, err := c.stateFile().NewReader(c.storageContext) if err != nil { @@ -153,8 +146,8 @@ func (c *remoteClient) unlockIfStale() bool { log.Printf("[TRACE] Found existing lock file %s from an older client that does not perform heartbeats", c.lockFileURL()) return false } - age := time.Now().Sub(attrs.Updated) - if age > minHeartbeatAgeUntilStale { + age := time.Since(attrs.Updated) + if age > c.lockStaleAfter { log.Printf("[WARN] Existing lock file %s is considered stale, last heartbeat was %s ago", c.lockFileURL(), age) if err := c.Unlock(strconv.FormatInt(attrs.Generation, 10)); err != nil { log.Printf("[WARN] Failed to release stale lock: %s", err) @@ -170,9 +163,9 @@ func (c *remoteClient) unlockIfStale() bool { // heartbeatLockFile periodically updates the "updated" timestamp of the lock // file until the lock is released in Unlock(). func (c *remoteClient) heartbeatLockFile() { - log.Printf("[TRACE] Starting heartbeat on lock file %s, interval is %s", c.lockFileURL(), heartbeatInterval) + log.Printf("[TRACE] Starting heartbeat on lock file %s, interval is %s", c.lockFileURL(), c.lockHeartbeatInterval) - ticker := time.NewTicker(heartbeatInterval) + ticker := time.NewTicker(c.lockHeartbeatInterval) defer ticker.Stop() defer func() { diff --git a/website/docs/backends/types/gcs.html.md b/website/docs/backends/types/gcs.html.md index 3349c304eefb..563ed7d40208 100644 --- a/website/docs/backends/types/gcs.html.md +++ b/website/docs/backends/types/gcs.html.md @@ -60,3 +60,5 @@ The following configuration options are supported: * `region` / `GOOGLE_REGION` - (Optional) The region in which a new bucket is created. 
For more information, see [Bucket Locations](https://cloud.google.com/storage/docs/bucket-locations).
 * `encryption_key` / `GOOGLE_ENCRYPTION_KEY` - (Optional) A 32 byte base64 encoded 'customer supplied encryption key' used to encrypt all state. For more information see [Customer Supplied Encryption Keys](https://cloud.google.com/storage/docs/encryption#customer-supplied).
+ * `lock_heartbeat_interval` - (Optional) The time between consecutive heartbeats on the lock file as a [duration string](https://golang.org/pkg/time/#ParseDuration). Defaults to "1m".
+ * `lock_stale_after` - (Optional) The minimum duration (as a [duration string](https://golang.org/pkg/time/#ParseDuration)) that must have passed since the youngest recorded heartbeat before the lock file is considered stale/orphaned. Defaults to "15m".
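
For illustration, a minimal backend configuration using the two new settings introduced by this patch series might look like the sketch below. The bucket name, prefix, and the specific durations are placeholders chosen for the example, not values taken from the patches; both settings are optional and default to "1m" and "15m" respectively.

```hcl
terraform {
  backend "gcs" {
    bucket = "my-terraform-state"   # hypothetical bucket name
    prefix = "prod"

    # Heartbeat the lock file more often than the 1m default ...
    lock_heartbeat_interval = "30s"

    # ... and force-unlock a heartbeat-enabled lock file once no heartbeat
    # has been recorded for 10 minutes (the default is 15m).
    lock_stale_after = "10m"
  }
}
```

Lock files created by older Terraform versions carry no heartbeat metadata and are never force-unlocked automatically, regardless of `lock_stale_after`.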