Fixes #868, panic on upgrades.
This was a problem with 0.7.x upgrades to 0.8 where there was a raft snapshot. The recovery method assumed certain structures would be there and they weren't.
pauldix committed Sep 3, 2014
1 parent fd0457c commit d9a1486
Showing 3 changed files with 226 additions and 7 deletions.
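
The failure mode is easy to reproduce in isolation. Below is a minimal sketch with hypothetical stand-in types (not the actual InfluxDB structures): a snapshot written by 0.7.x carries no metastore section, so the decoded field stays nil, and an unguarded call into it panics during recovery.

package main

import "fmt"

// Hypothetical stand-ins for the real snapshot structures.
type MetaStoreSnapshot struct {
	Databases map[string]int
}

type SaveData struct {
	// A 0.7.x snapshot predates this field, so after decoding it stays nil.
	MetaStore *MetaStoreSnapshot
}

type MetaStore struct {
	databases map[string]int
}

// UpdateFromSnapshot dereferences its argument, so passing nil panics.
func (m *MetaStore) UpdateFromSnapshot(s *MetaStoreSnapshot) {
	m.databases = s.Databases // nil pointer dereference when s == nil
}

func main() {
	data := &SaveData{} // what decoding a 0.7.x snapshot effectively yields
	store := &MetaStore{}

	// The unguarded call from before the fix would panic here:
	// store.UpdateFromSnapshot(data.MetaStore)

	// The guarded call, as in the fix below:
	if data.MetaStore != nil {
		store.UpdateFromSnapshot(data.MetaStore)
	} else {
		fmt.Println("0.7.x snapshot: no metastore section, skipping")
	}
}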
10 changes: 9 additions & 1 deletion cluster/cluster_configuration.go
@@ -689,7 +689,11 @@ func (self *ClusterConfiguration) Recovery(b []byte) error {
 	self.clusterAdmins = data.Admins
 	self.dbUsers = data.DbUsers
 	self.servers = data.Servers
-	self.MetaStore.UpdateFromSnapshot(data.MetaStore)
+
+	// if recovering from a snapshot from version 0.7.x the metastore will be nil. Fix #868 https://github.com/influxdb/influxdb/issues/868
+	if data.MetaStore != nil {
+		self.MetaStore.UpdateFromSnapshot(data.MetaStore)
+	}
 
 	for _, server := range self.servers {
 		log.Info("Checking whether %s is the local server %s", server.RaftName, self.LocalRaftName)
@@ -724,6 +728,10 @@ func (self *ClusterConfiguration) Recovery(b []byte) error {
 	}
 	// map the shards to their spaces
 	self.databaseShardSpaces = data.DatabaseShardSpaces
+	// we need this if recovering from a snapshot from 0.7.x
+	if data.DatabaseShardSpaces == nil {
+		self.databaseShardSpaces = make(map[string][]*ShardSpace)
+	}
 	for _, spaces := range self.databaseShardSpaces {
 		for _, space := range spaces {
 			spaceShards := make([]*ShardData, 0)
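
A note on the second guard: in Go, reading from or ranging over a nil map is a safe no-op, but writing to one panics. Assuming later recovery steps insert into databaseShardSpaces (the hunk shown only ranges over it), a 0.7.x snapshot that decodes the field as nil has to be replaced with an empty map. A minimal illustration:

package main

import "fmt"

func main() {
	var spaces map[string][]string // nil, as decoded from a 0.7.x snapshot

	// Reads and iteration on a nil map are safe no-ops:
	fmt.Println(len(spaces), spaces["db1"]) // prints: 0 []
	for range spaces {
	}

	// A write, however, panics:
	// spaces["db1"] = nil // panic: assignment to entry in nil map

	// The fix mirrors the commit: allocate an empty map first.
	if spaces == nil {
		spaces = make(map[string][]string)
	}
	spaces["db1"] = append(spaces["db1"], "space0")
	fmt.Println(spaces) // map[db1:[space0]]
}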
36 changes: 30 additions & 6 deletions integration/migration_test.go
@@ -26,13 +26,13 @@ are up on S3 so that we can run it later. Just trust that I've run it (this is P
 
 // var _ = Suite(&MigrationTestSuite{})
 
-// func (self *MigrationTestSuite) SetUpSuite(c *C) {
-// 	self.server = NewServer("integration/migration_test.toml", c)
+// func (self *MigrationTestSuite) setup(file string, c *C) {
+// 	self.server = NewServer(fmt.Sprintf("integration/%s", file), c)
 // }
 
-// func (self *MigrationTestSuite) TearDownSuite(c *C) {
+// func (self *MigrationTestSuite) teardown(dir string, c *C) {
 // 	self.server.Stop()
-// 	dataDir := "migration_data/data"
+// 	dataDir := fmt.Sprintf("./%s/data", dir)
 // 	shardDir := filepath.Join(dataDir, migration.OLD_SHARD_DIR)
 // 	infos, err := ioutil.ReadDir(shardDir)
 // 	if err != nil {
@@ -41,13 +41,21 @@ are up on S3 so that we can run it later. Just trust that I've run it (this is P
 // 	}
 // 	for _, info := range infos {
 // 		if info.IsDir() {
-// 			os.Remove(filepath.Join(shardDir, info.Name(), migration.MIGRATED_MARKER))
+// 			err := os.Remove(filepath.Join(shardDir, info.Name(), migration.MIGRATED_MARKER))
+// 			if err != nil {
+// 				fmt.Printf("Error Clearing Migration: %v\n", err)
+// 			}
 // 		}
 // 	}
-// 	os.RemoveAll(filepath.Join(dataDir, datastore.SHARD_DATABASE_DIR))
+// 	err = os.RemoveAll(filepath.Join(dataDir, datastore.SHARD_DATABASE_DIR))
+// 	if err != nil {
+// 		fmt.Printf("Error Clearing Migration: %v\n", err)
+// 	}
 // }
 
 // func (self *MigrationTestSuite) TestMigrationOfPreviousDb(c *C) {
+// 	self.setup("migration_test.toml", c)
+// 	defer self.teardown("migration_data", c)
 // 	_, err := http.Post("http://localhost:8086/cluster/migrate_data?u=root&p=root", "application/json", nil)
 // 	c.Assert(err, IsNil)
 // 	// make sure that it won't kick it off a second time while it's already running
@@ -84,3 +92,19 @@ are up on S3 so that we can run it later. Just trust that I've run it (this is P
 // 	c.Assert(err, IsNil)
 // 	time.Sleep(time.Second * 5)
 // }
+
+// func (self *MigrationTestSuite) TestDoesntPanicOnPreviousSnapshot(c *C) {
+// 	self.setup("migration_test2.toml", c)
+// 	defer self.teardown("migration_data2", c)
+// 	_, err := http.Post("http://localhost:8086/cluster/migrate_data?u=root&p=root", "application/json", nil)
+// 	c.Assert(err, IsNil)
+
+// 	time.Sleep(time.Second * 5)
+
+// 	client := self.server.GetClient("test", c)
+// 	s, err := client.Query("select count(value) from foo")
+// 	c.Assert(err, IsNil)
+// 	c.Assert(s, HasLen, 1)
+// 	c.Assert(s[0].Points, HasLen, 1)
+// 	c.Assert(s[0].Points[0][1].(float64), Equals, float64(1))
+// }
187 changes: 187 additions & 0 deletions integration/migration_test2.toml
@@ -0,0 +1,187 @@
# Welcome to the InfluxDB configuration file.

# If hostname (on the OS) doesn't return a name that can be resolved by the other
# systems in the cluster, you'll have to set the hostname to an IP or something
# that can be resolved here.
# hostname = ""

bind-address = "0.0.0.0"

# Once every 24 hours InfluxDB will report anonymous data to m.influxdb.com
# The data includes raft name (random 8 bytes), os, arch and version
# We don't track IP addresses of reporting servers. This is only used
# to track the number of running instances and their versions, which
# is very helpful for us.
# Change this option to true to disable reporting.
reporting-disabled = true

[logging]
# logging level can be one of "debug", "info", "warn" or "error"
level = "info"
file = "stdout" # stdout to log to standard out

# Configure the admin server
[admin]
port = 8083 # binding is disabled if the port isn't set
assets = "./admin"

# Configure the http api
[api]
port = 8086 # binding is disabled if the port isn't set
# ssl-port = 8084 # SSL support is enabled if you set a port and cert
# ssl-cert = /path/to/cert.pem

# connections will time out after this amount of time. This ensures that clients that misbehave
# and keep alive connections they don't use won't end up connecting a million times.
# However, if a request takes longer than this to complete, it could be a problem.
read-timeout = "5s"

[input_plugins]

# Configure the graphite api
[input_plugins.graphite]
enabled = false
# port = 2003
# database = "" # store graphite data in this database
# udp_enabled = true # enable udp interface on the same port as the tcp interface

# Configure the udp api
[input_plugins.udp]
enabled = false
# port = 4444
# database = ""

# Configure multiple UDP APIs, each writing to a separate db. Just
# repeat the following section to enable multiple UDP APIs on
# different ports.
[[input_plugins.udp_servers]] # array of tables
enabled = false
# port = 5551
# database = "db1"

# Raft configuration
[raft]
# The raft port should be open between all servers in a cluster.
# However, this port shouldn't be accessible from the internet.

port = 8090

# Where the raft logs are stored. The user running InfluxDB will need read/write access.
dir = "integration/migration_data2/raft"

# election-timeout = "1s"

[storage]

dir = "integration/migration_data2/data"
# How many requests to potentially buffer in memory. If the buffer gets filled then writes
# will still be logged and once the local storage has caught up (or compacted) the writes
# will be replayed from the WAL
write-buffer-size = 10000

# the engine to use for new shards, old shards will continue to use the same engine
default-engine = "leveldb"

# The default setting on this is 0, which means unlimited. Set this to something if you want to
# limit the max number of open files. max-open-files is per shard, so max-open-shards * max-open-files is the overall cap.
max-open-shards = 0

# The default setting is 100. This option tells how many points will be fetched from LevelDB before
# they get flushed into the backend.
point-batch-size = 100

# The number of points to batch in memory before writing them to leveldb. Lowering this number will
# reduce the memory usage, but will result in slower writes.
write-batch-size = 5000000

# How often the server will check for expired shards that should be cleared.
retention-sweep-period = "10m"

[storage.engines.leveldb]

# Maximum mmap open files; this will affect the virtual memory used by
# the process
max-open-files = 1000

# LRU cache size; LRU is used by leveldb to store contents of the
# uncompressed sstables. You can use `m` or `g` prefix for megabytes
# and gigabytes, respectively.
lru-cache-size = "200m"

[storage.engines.rocksdb]

# Maximum mmap open files; this will affect the virtual memory used by
# the process
max-open-files = 1000

# LRU cache size; LRU is used by rocksdb to store contents of the
# uncompressed sstables. You can use `m` or `g` prefix for megabytes
# and gigabytes, respectively.
lru-cache-size = "200m"

[storage.engines.hyperleveldb]

# Maximum mmap open files; this will affect the virtual memory used by
# the process
max-open-files = 1000

# LRU cache size; LRU is used by hyperleveldb to store contents of the
# uncompressed sstables. You can use `m` or `g` prefix for megabytes
# and gigabytes, respectively.
lru-cache-size = "200m"

[storage.engines.lmdb]

map-size = "100g"

[cluster]
# A comma-separated list of servers to seed
# this server. This is only relevant when the
# server is joining a new cluster. Otherwise
# the server will use the list of known servers
# prior to shutting down. Any server can be pointed to
# as a seed. It will find the Raft leader automatically.

# Here's an example. Note that the port on the host is the same as the raft port.
# seed-servers = ["hosta:8090","hostb:8090"]

# Replication happens over a TCP connection with a Protobuf protocol.
# This port should be reachable between all servers in a cluster.
# However, this port shouldn't be accessible from the internet.

protobuf_port = 8099
protobuf_timeout = "2s" # the write timeout on the protobuf conn; any duration parseable by time.ParseDuration
protobuf_heartbeat = "200ms" # the heartbeat interval between the servers; must be parseable by time.ParseDuration
protobuf_min_backoff = "1s" # the minimum backoff after a failed heartbeat attempt
protobuf_max_backoff = "10s" # the maximum backoff after a failed heartbeat attempt

# How many write requests to potentially buffer in memory per server. If the buffer gets filled then writes
# will still be logged and once the server has caught up (or come back online) the writes
# will be replayed from the WAL
write-buffer-size = 1000

# the maximum number of responses to buffer from remote nodes. If the
# expected number of responses exceeds this number then querying will
# happen sequentially and the buffer size will be limited to this
# number
max-response-buffer-size = 100

# When queries get distributed out to shards, they go in parallel. This means that results can get buffered
# in memory since results will come in any order, but have to be processed in the correct time order.
# Setting this higher will give better performance, but you'll need more memory. Setting this to 1 will ensure
# that you don't need to buffer in memory, but you won't get the best performance.
concurrent-shard-query-limit = 10

[wal]

dir = "integration/migration_data2/wal"
flush-after = 1000 # the number of writes after which wal will be flushed, 0 for flushing on every write
bookmark-after = 1000 # the number of writes after which a bookmark will be created

# the number of writes after which an index entry is created pointing
# to the offset of the first request; defaults to 1k
index-after = 1000

# the number of requests per log file; once this many have been
# written, a new log file will be created
requests-per-logfile = 10000
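
Several duration-valued settings in this file (read-timeout, protobuf_timeout, protobuf_heartbeat, the backoff pair, retention-sweep-period) accept anything time.ParseDuration understands. A standalone sketch, not part of the InfluxDB codebase, for sanity-checking such values before editing the config:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Duration strings taken from the config above.
	values := []string{"5s", "2s", "200ms", "1s", "10s", "10m"}
	for _, v := range values {
		d, err := time.ParseDuration(v)
		if err != nil {
			fmt.Printf("%q: invalid duration: %v\n", v, err)
			continue
		}
		fmt.Printf("%q parses to %v\n", v, d)
	}
}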
