diff --git a/cmd/zrepl/zrepl.c b/cmd/zrepl/zrepl.c index f0c9fa77139d..fe4a3e3d2088 100644 --- a/cmd/zrepl/zrepl.c +++ b/cmd/zrepl/zrepl.c @@ -25,462 +25,14 @@ extern unsigned long zfs_arc_max; extern unsigned long zfs_arc_min; extern int zfs_autoimport_disable; -static void uzfs_zvol_io_ack_sender(void *arg); +#if DEBUG +inject_error_t inject_error; +#endif kthread_t *conn_accpt_thread; kthread_t *uzfs_timer_thread; kthread_t *mgmt_conn_thread; -/* - * (Re)Initializes zv's state variables. - * This fn need to be called to use zv across network disconnections. - * Lock protection and life of zv need to be managed by caller - */ -static void -reinitialize_zv_state(zvol_state_t *zv) -{ - if (zv == NULL) - return; - zv->zv_metavolblocksize = 0; - - uzfs_zvol_set_status(zv, ZVOL_STATUS_DEGRADED); - bzero(&zv->rebuild_info, sizeof (zvol_rebuild_info_t)); - - uzfs_zvol_set_rebuild_status(zv, ZVOL_REBUILDING_INIT); -} - -/* - * Process open request on data connection, the first message. - * - * Return status meaning: - * != 0: OPEN failed, stop reading data from connection. - * == 0 && zinfopp == NULL: OPEN failed, recoverable error - * == 0 && zinfopp != NULL: OPEN succeeded, proceed with other commands - */ -static int -open_zvol(int fd, zvol_info_t **zinfopp) -{ - int rc; - zvol_io_hdr_t hdr; - zvol_op_open_data_t open_data; - zvol_info_t *zinfo = NULL; - zvol_state_t *zv = NULL; - kthread_t *thrd_info; - thread_args_t *thrd_arg; - int rele_dataset_on_error = 0; - - /* - * If we don't know the version yet, be more careful when - * reading header - */ - if (uzfs_zvol_read_header(fd, &hdr) != 0) { - LOG_ERR("error reading open header"); - return (-1); - } - if (hdr.opcode != ZVOL_OPCODE_OPEN) { - LOG_ERR("zvol must be opened first"); - return (-1); - } - if (hdr.len != sizeof (open_data)) { - LOG_ERR("Invalid payload length for open"); - return (-1); - } - rc = uzfs_zvol_socket_read(fd, (char *)&open_data, sizeof (open_data)); - if (rc != 0) { - LOG_ERR("Payload read failed"); - return (-1); - } - - open_data.volname[MAX_NAME_LEN - 1] = '\0'; - zinfo = uzfs_zinfo_lookup(open_data.volname); - if (zinfo == NULL) { - LOG_ERR("zvol %s not found", open_data.volname); - hdr.status = ZVOL_OP_STATUS_FAILED; - goto open_reply; - } - if (zinfo->state != ZVOL_INFO_STATE_ONLINE) { - LOG_ERR("zvol %s is not online", open_data.volname); - hdr.status = ZVOL_OP_STATUS_FAILED; - goto open_reply; - } - zv = zinfo->zv; - - if (zv->zv_metavolblocksize != 0) { - LOG_ERR("there might be already a data connection for %s", - open_data.volname); - hdr.status = ZVOL_OP_STATUS_FAILED; - goto open_reply; - } - - ASSERT3P(zv, !=, NULL); - ASSERT3P(zv->zv_status, ==, ZVOL_STATUS_DEGRADED); - ASSERT3P(zv->rebuild_info.zv_rebuild_status, ==, ZVOL_REBUILDING_INIT); - - // validate block size (only one bit is set in the number) - if (open_data.tgt_block_size == 0 || - (open_data.tgt_block_size & (open_data.tgt_block_size - 1)) != 0) { - LOG_ERR("Invalid block size"); - hdr.status = ZVOL_OP_STATUS_FAILED; - goto open_reply; - } - - (void) pthread_mutex_lock(&zinfo->zinfo_mutex); - /* - * Hold objset if this is the first query for the zvol. This can happen - * in case that the target creates data connection directly without - * getting the endpoint through mgmt connection first. - */ - rele_dataset_on_error = 0; - if (zv->zv_objset == NULL) { - if (uzfs_hold_dataset(zv) != 0) { - (void) pthread_mutex_unlock(&zinfo->zinfo_mutex); - LOG_ERR("Failed to hold zvol during open"); - hdr.status = ZVOL_OP_STATUS_FAILED; - goto open_reply; - } - rele_dataset_on_error = 1; - } - if (uzfs_update_metadata_granularity(zv, - open_data.tgt_block_size) != 0) { - (void) pthread_mutex_unlock(&zinfo->zinfo_mutex); - if (rele_dataset_on_error == 1) - uzfs_rele_dataset(zv); - LOG_ERR("Failed to set granularity of metadata"); - hdr.status = ZVOL_OP_STATUS_FAILED; - goto open_reply; - } - /* - * TODO: Once we support multiple concurrent data connections for a - * single zvol, we should probably check that the timeout is the same - * for all data connections. - */ - uzfs_update_ionum_interval(zinfo, open_data.timeout); - zinfo->timeout = open_data.timeout; - *zinfopp = zinfo; - - if (!zinfo->is_io_ack_sender_created) { - zinfo->conn_closed = B_FALSE; - zinfo->is_io_ack_sender_created = B_TRUE; - (void) pthread_mutex_unlock(&zinfo->zinfo_mutex); - thrd_arg = kmem_alloc(sizeof (thread_args_t), KM_SLEEP); - thrd_arg->fd = fd; - thrd_arg->zinfo = zinfo; - uzfs_zinfo_take_refcnt(zinfo); - thrd_info = zk_thread_create(NULL, 0, - (thread_func_t)uzfs_zvol_io_ack_sender, - (void *)thrd_arg, 0, NULL, TS_RUN, 0, - PTHREAD_CREATE_DETACHED); - VERIFY3P(thrd_info, !=, NULL); - } else { - (void) pthread_mutex_unlock(&zinfo->zinfo_mutex); - } - - hdr.status = ZVOL_OP_STATUS_OK; - -open_reply: - hdr.len = 0; - rc = uzfs_zvol_socket_write(fd, (char *)&hdr, sizeof (hdr)); - if (rc == -1) - LOG_ERR("Failed to send reply for open request"); - if (hdr.status != ZVOL_OP_STATUS_OK) { - ASSERT3P(*zinfopp, ==, NULL); - reinitialize_zv_state(zv); - if (zinfo != NULL) - uzfs_zinfo_drop_refcnt(zinfo); - return (-1); - } - return (rc); -} - -/* - * IO-Receiver would be per ZVOL, it would be - * responsible for receiving IOs on given socket. - */ -static void -uzfs_zvol_io_receiver(void *arg) -{ - int rc; - int fd = (uintptr_t)arg; - zvol_info_t *zinfo = NULL; - zvol_io_cmd_t *zio_cmd; - zvol_io_hdr_t hdr; - - prctl(PR_SET_NAME, "io_receiver", 0, 0, 0); - - /* First command should be OPEN */ - while (zinfo == NULL) { - if (open_zvol(fd, &zinfo) != 0) { - if ((zinfo != NULL) && - (zinfo->is_io_ack_sender_created)) - goto exit; - shutdown(fd, SHUT_RDWR); - goto thread_exit; - } - } - - LOG_INFO("Data connection associated with zvol %s fd: %d", - zinfo->name, fd); - - while ((rc = uzfs_zvol_socket_read(fd, (char *)&hdr, sizeof (hdr))) == - 0) { - if ((zinfo->state == ZVOL_INFO_STATE_OFFLINE)) - break; - - if (hdr.opcode != ZVOL_OPCODE_WRITE && - hdr.opcode != ZVOL_OPCODE_READ && - hdr.opcode != ZVOL_OPCODE_SYNC) { - LOG_ERR("Unexpected opcode %d", hdr.opcode); - break; - } - - if (((hdr.opcode == ZVOL_OPCODE_WRITE) || - (hdr.opcode == ZVOL_OPCODE_READ)) && !hdr.len) { - LOG_ERR("Zero Payload size for opcode %d", hdr.opcode); - break; - } else if ((hdr.opcode == ZVOL_OPCODE_SYNC) && hdr.len > 0) { - LOG_ERR("Unexpected payload for opcode %d", hdr.opcode); - break; - } - - zio_cmd = zio_cmd_alloc(&hdr, fd); - /* Read payload for commands which have it */ - if (hdr.opcode == ZVOL_OPCODE_WRITE) { - rc = uzfs_zvol_socket_read(fd, zio_cmd->buf, hdr.len); - if (rc != 0) { - zio_cmd_free(&zio_cmd); - break; - } - } - - if (zinfo->state == ZVOL_INFO_STATE_OFFLINE) { - zio_cmd_free(&zio_cmd); - break; - } - /* Take refcount for uzfs_zvol_worker to work on it */ - uzfs_zinfo_take_refcnt(zinfo); - zio_cmd->zv = zinfo; - taskq_dispatch(zinfo->uzfs_zvol_taskq, uzfs_zvol_worker, - zio_cmd, TQ_SLEEP); - } -exit: - (void) pthread_mutex_lock(&zinfo->zinfo_mutex); - zinfo->conn_closed = B_TRUE; - /* - * Send signal to ack sender so that it can free - * zio_cmd, close fd and exit. - */ - if (zinfo->io_ack_waiting) { - rc = pthread_cond_signal(&zinfo->io_ack_cond); - } - /* - * wait for ack thread to exit to avoid races with new - * connections for the same zinfo - */ - while (zinfo->conn_closed && zinfo->is_io_ack_sender_created) { - (void) pthread_mutex_unlock(&zinfo->zinfo_mutex); - usleep(1000); - (void) pthread_mutex_lock(&zinfo->zinfo_mutex); - } - (void) pthread_mutex_unlock(&zinfo->zinfo_mutex); - - zinfo->io_ack_waiting = 0; - - reinitialize_zv_state(zinfo->zv); - uzfs_zinfo_drop_refcnt(zinfo); -thread_exit: - close(fd); - LOG_INFO("Data connection closed on fd: %d", fd); - zk_thread_exit(); -} - -/* - * This func takes care of sending potentially multiple read blocks each - * prefixed by metainfo. - */ -static int -uzfs_send_reads(int fd, zvol_io_cmd_t *zio_cmd) -{ - zvol_io_hdr_t *hdr = &zio_cmd->hdr; - struct zvol_io_rw_hdr read_hdr; - metadata_desc_t *md; - size_t rel_offset = 0; - int rc = 0; - - /* special case for missing metadata */ - if (zio_cmd->metadata_desc == NULL) { - read_hdr.io_num = 0; - /* - * read_hdr.len should be adjusted back - * to actual read request size now - */ - read_hdr.len = hdr->len - - sizeof (struct zvol_io_rw_hdr); - rc = uzfs_zvol_socket_write(fd, (char *)&read_hdr, - sizeof (read_hdr)); - if (rc != 0) - return (rc); - /* Data that need to be sent is equal to read_hdr.len */ - rc = uzfs_zvol_socket_write(fd, zio_cmd->buf, read_hdr.len); - return (rc); - } - - /* - * TODO: Optimize performance by combining multiple writes to a single - * system call either by copying all data to larger buffer or using - * vector write. - */ - for (md = zio_cmd->metadata_desc; md != NULL; md = md->next) { - read_hdr.io_num = md->metadata.io_num; - read_hdr.len = md->len; - rc = uzfs_zvol_socket_write(fd, (char *)&read_hdr, - sizeof (read_hdr)); - if (rc != 0) - goto end; - - rc = uzfs_zvol_socket_write(fd, - (char *)zio_cmd->buf + rel_offset, md->len); - if (rc != 0) - goto end; - rel_offset += md->len; - } - -end: - FREE_METADATA_LIST(zio_cmd->metadata_desc); - zio_cmd->metadata_desc = NULL; - - return (rc); -} - -/* - * One thread per LUN/vol. This thread works - * on queue and it sends ack back to client on - * a given fd. - * There are two types of clients - one is iscsi target, and, - * other is a replica which undergoes rebuild. - * Need to exit from thread when there are network errors - * on fd related to iscsi target. - */ -static void -uzfs_zvol_io_ack_sender(void *arg) -{ - int fd; - int md_len; - zvol_info_t *zinfo; - thread_args_t *thrd_arg; - zvol_io_cmd_t *zio_cmd = NULL; - - thrd_arg = (thread_args_t *)arg; - fd = thrd_arg->fd; - zinfo = thrd_arg->zinfo; - kmem_free(arg, sizeof (thread_args_t)); - - prctl(PR_SET_NAME, "ack_sender", 0, 0, 0); - - while (1) { - int rc = 0; - (void) pthread_mutex_lock(&zinfo->zinfo_mutex); - zinfo->zio_cmd_in_ack = NULL; - while (1) { - if ((zinfo->state == ZVOL_INFO_STATE_OFFLINE) || - (zinfo->conn_closed == B_TRUE)) { - (void) pthread_mutex_unlock( - &zinfo->zinfo_mutex); - goto exit; - } - if (STAILQ_EMPTY(&zinfo->complete_queue)) { - zinfo->io_ack_waiting = 1; - pthread_cond_wait(&zinfo->io_ack_cond, - &zinfo->zinfo_mutex); - zinfo->io_ack_waiting = 0; - } - else - break; - } - - zio_cmd = STAILQ_FIRST(&zinfo->complete_queue); - STAILQ_REMOVE_HEAD(&zinfo->complete_queue, cmd_link); - zinfo->zio_cmd_in_ack = zio_cmd; - (void) pthread_mutex_unlock(&zinfo->zinfo_mutex); - - LOG_DEBUG("ACK for op: %d, seq-id: %ld", - zio_cmd->hdr.opcode, zio_cmd->hdr.io_seq); - - /* account for space taken by metadata headers */ - if (zio_cmd->hdr.status == ZVOL_OP_STATUS_OK && - zio_cmd->hdr.opcode == ZVOL_OPCODE_READ) { - md_len = 0; - for (metadata_desc_t *md = zio_cmd->metadata_desc; - md != NULL; - md = md->next) { - md_len++; - } - /* we need at least one header even if no metadata */ - if (md_len == 0) - md_len++; - zio_cmd->hdr.len += (md_len * - sizeof (struct zvol_io_rw_hdr)); - } - - rc = uzfs_zvol_socket_write(zio_cmd->conn, - (char *)&zio_cmd->hdr, sizeof (zio_cmd->hdr)); - if (rc == -1) { - LOG_ERRNO("socket write err"); - zinfo->zio_cmd_in_ack = NULL; - /* - * exit due to network errors on fd related - * to iscsi target - */ - if (zio_cmd->conn == fd) { - zio_cmd_free(&zio_cmd); - goto exit; - } - zio_cmd_free(&zio_cmd); - continue; - } - - if (zio_cmd->hdr.opcode == ZVOL_OPCODE_READ) { - if (zio_cmd->hdr.status == ZVOL_OP_STATUS_OK) { - /* Send data read from disk */ - rc = uzfs_send_reads(zio_cmd->conn, zio_cmd); - if (rc == -1) { - zinfo->zio_cmd_in_ack = NULL; - LOG_ERRNO("socket write err"); - if (zio_cmd->conn == fd) { - zio_cmd_free(&zio_cmd); - goto exit; - } - } - } - atomic_inc_64(&zinfo->read_req_ack_cnt); - } else { - if (zio_cmd->hdr.opcode == ZVOL_OPCODE_WRITE) - atomic_inc_64(&zinfo->write_req_ack_cnt); - else if (zio_cmd->hdr.opcode == ZVOL_OPCODE_SYNC) - atomic_inc_64(&zinfo->sync_req_ack_cnt); - } - zinfo->zio_cmd_in_ack = NULL; - zio_cmd_free(&zio_cmd); - } -exit: - zinfo->zio_cmd_in_ack = NULL; - shutdown(fd, SHUT_RDWR); - LOG_INFO("Data connection for zvol %s closed on fd: %d", - zinfo->name, fd); - - (void) pthread_mutex_lock(&zinfo->zinfo_mutex); - while (!STAILQ_EMPTY(&zinfo->complete_queue)) { - zio_cmd = STAILQ_FIRST(&zinfo->complete_queue); - STAILQ_REMOVE_HEAD(&zinfo->complete_queue, cmd_link); - zio_cmd_free(&zio_cmd); - } - zinfo->conn_closed = B_FALSE; - zinfo->is_io_ack_sender_created = B_FALSE; - (void) pthread_mutex_unlock(&zinfo->zinfo_mutex); - uzfs_zinfo_drop_refcnt(zinfo); - - zk_thread_exit(); -} - static void zrepl_svc_run(void) { @@ -572,14 +124,18 @@ main(int argc, char **argv) } } - if (getenv("CONFIG_LOAD_DISABLE") != NULL) { + if (getenv("CONFIG_LOAD_ENABLE") != NULL) { LOG_INFO("disabled auto import (reading of zpool.cache)"); - zfs_autoimport_disable = 1; + zfs_autoimport_disable = 0; } else { LOG_INFO("auto importing pools by reading zpool.cache files"); - zfs_autoimport_disable = 0; + zfs_autoimport_disable = 1; } + SLIST_INIT(&uzfs_mgmt_conns); + mutex_init(&conn_list_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&async_tasks_mtx, NULL, MUTEX_DEFAULT, NULL); + zinfo_create_hook = &zinfo_create_cb; zinfo_destroy_hook = &zinfo_destroy_cb; diff --git a/include/data_conn.h b/include/data_conn.h index df9ad16b6fb8..b473c83f215e 100644 --- a/include/data_conn.h +++ b/include/data_conn.h @@ -51,6 +51,8 @@ thread_func_t rebuild_scanner; extern void (*io_receiver)(void *arg); extern void (*rebuild_scanner)(void *arg); +extern void uzfs_zvol_io_receiver(void *); + extern uint16_t io_server_port; extern uint16_t rebuild_io_server_port; extern uint64_t zvol_rebuild_step_size; @@ -70,6 +72,8 @@ void uzfs_zvol_rebuild_scanner(void *arg); void uzfs_update_ionum_interval(zvol_info_t *zinfo, uint32_t timeout); void uzfs_zvol_timer_thread(void); +void signal_fds_related_to_zinfo(zvol_info_t *zinfo); + #ifdef __cplusplus } #endif diff --git a/include/zrepl_mgmt.h b/include/zrepl_mgmt.h index 2b91da04049a..4e136e4b4283 100644 --- a/include/zrepl_mgmt.h +++ b/include/zrepl_mgmt.h @@ -68,6 +68,18 @@ extern kmutex_t zvol_list_mutex; extern struct zvol_list zvol_list; struct zvol_io_cmd_s; +#if DEBUG +typedef struct inject_delay_s { + int helping_replica_rebuild_step; +} inject_delay_t; + +typedef struct inject_error_s { + inject_delay_t delay; +} inject_error_t; + +extern inject_error_t inject_error; +#endif + typedef enum zvol_info_state_e { ZVOL_INFO_STATE_ONLINE, ZVOL_INFO_STATE_OFFLINE, @@ -102,6 +114,9 @@ typedef struct zvol_info_s { /* All cmds after execution will go here for ack */ STAILQ_HEAD(, zvol_io_cmd_s) complete_queue; + /* fds related to this zinfo on which threads are waiting */ + STAILQ_HEAD(, zinfo_fd_s) fd_list; + uint8_t io_ack_waiting; /* Will be used to singal ack-sender to exit */ @@ -109,7 +124,10 @@ typedef struct zvol_info_s { /* Pointer to mgmt connection for this zinfo */ void *mgmt_conn; - /* Perfromance counter */ + /* ongoing command that is being worked on to ack to its sender */ + void *zio_cmd_in_ack; + + /* Performance counter */ /* Debug counters */ uint64_t read_req_received_cnt; @@ -118,9 +136,6 @@ typedef struct zvol_info_s { uint64_t read_req_ack_cnt; uint64_t write_req_ack_cnt; uint64_t sync_req_ack_cnt; - - /* ongoing command that is being worked on to ack to its sender */ - void *zio_cmd_in_ack; } zvol_info_t; typedef struct thread_args_s { @@ -132,6 +147,11 @@ typedef struct thread_args_s { extern void (*zinfo_create_hook)(zvol_info_t *, nvlist_t *); extern void (*zinfo_destroy_hook)(zvol_info_t *); +typedef struct zinfo_fd_s { + STAILQ_ENTRY(zinfo_fd_s) fd_link; + int fd; +} zinfo_fd_t; + typedef struct zvol_io_cmd_s { STAILQ_ENTRY(zvol_io_cmd_s) cmd_link; zvol_io_hdr_t hdr; @@ -159,6 +179,7 @@ extern int set_socket_keepalive(int sfd); extern int create_and_bind(const char *port, int bind_needed, boolean_t nonblocking); int uzfs_zvol_name_compare(zvol_info_t *zv, const char *name); +void shutdown_fds_related_to_zinfo(zvol_info_t *zinfo); /* * API to drop refcnt on zinfo. If refcnt diff --git a/lib/fio/replica.c b/lib/fio/replica.c index 4db2815b88e7..37cf4ce2e2e3 100644 --- a/lib/fio/replica.c +++ b/lib/fio/replica.c @@ -477,7 +477,7 @@ static int fio_repl_open_file(struct thread_data *td, struct fio_file *f) if (get_data_endpoint(td, f->file_name, &port, host) != 0) return (1); } - +again: f->fd = socket(AF_INET, SOCK_STREAM, 0); if (f->fd < 0) { td_verror(td, errno, "socket"); @@ -498,10 +498,12 @@ static int fio_repl_open_file(struct thread_data *td, struct fio_file *f) if (set_window_size(td, f->fd)) { close(f->fd); + f->fd = -1; return (1); } if (set_mss(td, f->fd)) { close(f->fd); + f->fd = -1; return (1); } @@ -512,6 +514,7 @@ static int fio_repl_open_file(struct thread_data *td, struct fio_file *f) if (inet_pton(AF_INET, host, &addr.sin_addr) <= 0) { td_verror(td, errno, "inet_pton"); close(f->fd); + f->fd = -1; return (1); } log_info("repl: opening zvol %s on data connection\n", @@ -519,13 +522,15 @@ static int fio_repl_open_file(struct thread_data *td, struct fio_file *f) if (connect(f->fd, (struct sockaddr *)&addr, sizeof (addr)) < 0) { td_verror(td, errno, "connect"); close(f->fd); + f->fd = -1; return (1); } // send volume name we want to open to replica if (open_zvol(td, f->fd, f->file_name) != 0) { close(f->fd); - return (1); + sleep(2); + goto again; } return (0); diff --git a/lib/libzpool/uzfs_io.c b/lib/libzpool/uzfs_io.c index d120708f5382..28fcd92ba359 100644 --- a/lib/libzpool/uzfs_io.c +++ b/lib/libzpool/uzfs_io.c @@ -397,7 +397,7 @@ void uzfs_zvol_set_rebuild_status(zvol_state_t *zv, zvol_rebuild_status_t status) { LOG_INFO("zvol %s rebuild status change: %s -> %s", zv->zv_name, - rebuild_status_to_str(zv->zv_status), + rebuild_status_to_str(zv->rebuild_info.zv_rebuild_status), rebuild_status_to_str(status)); zv->rebuild_info.zv_rebuild_status = status; } diff --git a/lib/libzpool/zrepl_mgmt.c b/lib/libzpool/zrepl_mgmt.c index e0377235f934..686038be3bb6 100644 --- a/lib/libzpool/zrepl_mgmt.c +++ b/lib/libzpool/zrepl_mgmt.c @@ -181,9 +181,31 @@ uzfs_insert_zinfo_list(zvol_info_t *zinfo) (void) mutex_exit(&zvol_list_mutex); } +void +shutdown_fds_related_to_zinfo(zvol_info_t *zinfo) +{ + zinfo_fd_t *zinfo_fd = NULL; + + (void) pthread_mutex_lock(&zinfo->zinfo_mutex); + while (1) { + STAILQ_FOREACH(zinfo_fd, &zinfo->fd_list, fd_link) { + LOG_INFO("shutting down %d on %s", zinfo_fd->fd, + zinfo->name); + shutdown(zinfo_fd->fd, SHUT_RDWR); + } + (void) pthread_mutex_unlock(&zinfo->zinfo_mutex); + sleep(1); + (void) pthread_mutex_lock(&zinfo->zinfo_mutex); + if (STAILQ_EMPTY(&zinfo->fd_list)) + break; + } + (void) pthread_mutex_unlock(&zinfo->zinfo_mutex); +} + static void uzfs_mark_offline_and_free_zinfo(zvol_info_t *zinfo) { + shutdown_fds_related_to_zinfo(zinfo); (void) pthread_mutex_lock(&zinfo->zinfo_mutex); zinfo->state = ZVOL_INFO_STATE_OFFLINE; /* Send signal to ack_sender thread about offline */ @@ -338,6 +360,7 @@ uzfs_zinfo_init(void *zv, const char *ds_name, nvlist_t *create_props) TASKQ_PREPOPULATE | TASKQ_DYNAMIC); STAILQ_INIT(&zinfo->complete_queue); + STAILQ_INIT(&zinfo->fd_list); uzfs_zinfo_init_mutex(zinfo); strlcpy(zinfo->name, ds_name, MAXNAMELEN); diff --git a/lib/libzrepl/data_conn.c b/lib/libzrepl/data_conn.c index b8d5f68a4027..11a60907c576 100644 --- a/lib/libzrepl/data_conn.c +++ b/lib/libzrepl/data_conn.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "mgmt_conn.h" #include "data_conn.h" @@ -341,6 +342,53 @@ uzfs_zvol_worker(void *arg) uzfs_zinfo_drop_refcnt(zinfo); } +static void +uzfs_zvol_append_to_fd_list(zvol_info_t *zinfo, int fd) +{ + zinfo_fd_t *new_zinfo_fd = kmem_alloc(sizeof (zinfo_fd_t), KM_SLEEP); + new_zinfo_fd->fd = fd; + + (void) pthread_mutex_lock(&zinfo->zinfo_mutex); +#ifdef DEBUG + zinfo_fd_t *zinfo_fd = NULL; + STAILQ_FOREACH(zinfo_fd, &zinfo->fd_list, fd_link) { + if (zinfo_fd->fd == fd) { + ASSERT(1 == 0); + } + } +#endif + STAILQ_INSERT_TAIL(&zinfo->fd_list, new_zinfo_fd, fd_link); + LOG_DEBUG("Appending fd %d for zvol %s", fd, zinfo->name); + (void) pthread_mutex_unlock(&zinfo->zinfo_mutex); +} + +static void +uzfs_zvol_remove_from_fd_list(zvol_info_t *zinfo, int fd) +{ + zinfo_fd_t *zinfo_fd = NULL; + + (void) pthread_mutex_lock(&zinfo->zinfo_mutex); +#ifdef DEBUG + int count = 0; + STAILQ_FOREACH(zinfo_fd, &zinfo->fd_list, fd_link) { + if (zinfo_fd->fd == fd) + count++; + } + ASSERT(count == 1); +#endif + zinfo_fd = STAILQ_FIRST(&zinfo->fd_list); + while (zinfo_fd != NULL) { + if (zinfo_fd->fd == fd) { + STAILQ_REMOVE(&zinfo->fd_list, zinfo_fd, + zinfo_fd_s, fd_link); + kmem_free(zinfo_fd, sizeof (zinfo_fd_t)); + break; + } + zinfo_fd = STAILQ_NEXT(zinfo_fd, fd_link); + } + (void) pthread_mutex_unlock(&zinfo->zinfo_mutex); +} + void uzfs_zvol_rebuild_dw_replica(void *arg) { @@ -360,6 +408,8 @@ uzfs_zvol_rebuild_dw_replica(void *arg) sfd = rebuild_args->fd; zinfo = rebuild_args->zinfo; + uzfs_zvol_append_to_fd_list(zinfo, sfd); + if ((rc = setsockopt(sfd, SOL_SOCKET, SO_LINGER, &lo, sizeof (lo))) != 0) { LOG_ERRNO("setsockopt failed"); @@ -496,6 +546,8 @@ uzfs_zvol_rebuild_dw_replica(void *arg) } exit: + uzfs_zvol_remove_from_fd_list(zinfo, sfd); + mutex_enter(&zinfo->zv->rebuild_mtx); if (rc != 0) { uzfs_zvol_set_rebuild_status(zinfo->zv, @@ -854,7 +906,9 @@ uzfs_zvol_rebuild_scanner_callback(off_t offset, size_t len, hdr.len = len; hdr.flags = ZVOL_OP_FLAG_REBUILD; hdr.status = ZVOL_OP_STATUS_OK; - if (zinfo->state == ZVOL_INFO_STATE_OFFLINE) + + if ((zinfo->state == ZVOL_INFO_STATE_OFFLINE) || + (zinfo->is_io_ack_sender_created == B_FALSE)) return (-1); LOG_DEBUG("IO number for rebuild %ld", metadata->io_num); @@ -883,13 +937,13 @@ uzfs_zvol_rebuild_scanner(void *arg) zvol_info_t *zinfo = NULL; zvol_io_hdr_t hdr; int rc = 0; - zvol_rebuild_t warg; + zvol_rebuild_t warg; char *name; blk_metadata_t metadata; uint64_t rebuild_req_offset; uint64_t rebuild_req_len; zvol_io_cmd_t *zio_cmd; - struct linger lo = { 1, 0 }; + struct linger lo = { 1, 0 }; if ((rc = setsockopt(fd, SOL_SOCKET, SO_LINGER, &lo, sizeof (lo))) != 0) { @@ -897,16 +951,23 @@ uzfs_zvol_rebuild_scanner(void *arg) goto exit; } read_socket: + if ((zinfo != NULL) && + ((zinfo->state == ZVOL_INFO_STATE_OFFLINE) || + (zinfo->is_io_ack_sender_created == B_FALSE))) + goto exit; + rc = uzfs_zvol_read_header(fd, &hdr); - if ((rc != 0) || ((zinfo != NULL) && - (zinfo->state == ZVOL_INFO_STATE_OFFLINE))) + if ((rc != 0) || + ((zinfo != NULL) && + ((zinfo->state == ZVOL_INFO_STATE_OFFLINE) || + (zinfo->is_io_ack_sender_created == B_FALSE)))) goto exit; LOG_DEBUG("op_code=%d io_seq=%ld", hdr.opcode, hdr.io_seq); /* Handshake yet to happen */ if ((hdr.opcode != ZVOL_OPCODE_HANDSHAKE) && (zinfo == NULL)) { - LOG_DEBUG("Wrong opcode:%d, expecting handshake\n", hdr.opcode); + LOG_DEBUG("Wrong opcode:%d, expecting handshake", hdr.opcode); rc = -1; goto exit; } @@ -939,6 +1000,9 @@ uzfs_zvol_rebuild_scanner(void *arg) } LOG_INFO("Rebuild scanner started on zvol %s", name); + + uzfs_zvol_append_to_fd_list(zinfo, fd); + kmem_free(name, hdr.len); warg.zinfo = zinfo; warg.fd = fd; @@ -954,7 +1018,11 @@ uzfs_zvol_rebuild_scanner(void *arg) "Rebuild Req offset: %ld, Rebuild Req length: %ld", metadata.io_num, rebuild_req_offset, rebuild_req_len); - +#if DEBUG + if (inject_error.delay.helping_replica_rebuild_step + == 1) + sleep(5); +#endif rc = uzfs_get_io_diff(zinfo->zv, &metadata, uzfs_zvol_rebuild_scanner_callback, rebuild_req_offset, rebuild_req_len, &warg); @@ -989,6 +1057,8 @@ uzfs_zvol_rebuild_scanner(void *arg) if (zinfo != NULL) { LOG_INFO("Closing rebuild connection for zvol %s", zinfo->name); remove_pending_cmds_to_ack(fd, zinfo); + uzfs_zvol_remove_from_fd_list(zinfo, fd); + uzfs_zinfo_drop_refcnt(zinfo); } else { LOG_INFO("Closing rebuild connection"); @@ -998,3 +1068,467 @@ uzfs_zvol_rebuild_scanner(void *arg) close(fd); zk_thread_exit(); } + +/* + * (Re)Initializes zv's state variables. + * This fn need to be called to use zv across network disconnections. + * Lock protection and life of zv need to be managed by caller + */ +static void +reinitialize_zv_state(zvol_state_t *zv) +{ + if (zv == NULL) + return; + zv->zv_metavolblocksize = 0; + + uzfs_zvol_set_status(zv, ZVOL_STATUS_DEGRADED); + uzfs_zvol_set_rebuild_status(zv, ZVOL_REBUILDING_INIT); +} + +/* + * This func takes care of sending potentially multiple read blocks each + * prefixed by metainfo. + */ +static int +uzfs_send_reads(int fd, zvol_io_cmd_t *zio_cmd) +{ + zvol_io_hdr_t *hdr = &zio_cmd->hdr; + struct zvol_io_rw_hdr read_hdr; + metadata_desc_t *md; + size_t rel_offset = 0; + int rc = 0; + + /* special case for missing metadata */ + if (zio_cmd->metadata_desc == NULL) { + read_hdr.io_num = 0; + /* + * read_hdr.len should be adjusted back + * to actual read request size now + */ + read_hdr.len = hdr->len - + sizeof (struct zvol_io_rw_hdr); + rc = uzfs_zvol_socket_write(fd, (char *)&read_hdr, + sizeof (read_hdr)); + if (rc != 0) + return (rc); + /* Data that need to be sent is equal to read_hdr.len */ + rc = uzfs_zvol_socket_write(fd, zio_cmd->buf, read_hdr.len); + return (rc); + } + + /* + * TODO: Optimize performance by combining multiple writes to a single + * system call either by copying all data to larger buffer or using + * vector write. + */ + for (md = zio_cmd->metadata_desc; md != NULL; md = md->next) { + read_hdr.io_num = md->metadata.io_num; + read_hdr.len = md->len; + rc = uzfs_zvol_socket_write(fd, (char *)&read_hdr, + sizeof (read_hdr)); + if (rc != 0) + goto end; + + rc = uzfs_zvol_socket_write(fd, + (char *)zio_cmd->buf + rel_offset, md->len); + if (rc != 0) + goto end; + rel_offset += md->len; + } + +end: + FREE_METADATA_LIST(zio_cmd->metadata_desc); + zio_cmd->metadata_desc = NULL; + + return (rc); +} + +/* + * One thread per LUN/vol. This thread works + * on queue and it sends ack back to client on + * a given fd. + * There are two types of clients - one is iscsi target, and, + * other is a replica which undergoes rebuild. + * Need to exit from thread when there are network errors + * on fd related to iscsi target. + */ +static void +uzfs_zvol_io_ack_sender(void *arg) +{ + int fd; + int md_len; + zvol_info_t *zinfo; + thread_args_t *thrd_arg; + zvol_io_cmd_t *zio_cmd = NULL; + + thrd_arg = (thread_args_t *)arg; + fd = thrd_arg->fd; + zinfo = thrd_arg->zinfo; + kmem_free(arg, sizeof (thread_args_t)); + + prctl(PR_SET_NAME, "ack_sender", 0, 0, 0); + + LOG_INFO("Started ack sender for zvol %s fd: %d", zinfo->name, fd); + + while (1) { + int rc = 0; + (void) pthread_mutex_lock(&zinfo->zinfo_mutex); + zinfo->zio_cmd_in_ack = NULL; + while (1) { + if ((zinfo->state == ZVOL_INFO_STATE_OFFLINE) || + (zinfo->conn_closed == B_TRUE)) { + (void) pthread_mutex_unlock( + &zinfo->zinfo_mutex); + goto exit; + } + if (STAILQ_EMPTY(&zinfo->complete_queue)) { + zinfo->io_ack_waiting = 1; + pthread_cond_wait(&zinfo->io_ack_cond, + &zinfo->zinfo_mutex); + zinfo->io_ack_waiting = 0; + } + else + break; + } + + zio_cmd = STAILQ_FIRST(&zinfo->complete_queue); + STAILQ_REMOVE_HEAD(&zinfo->complete_queue, cmd_link); + zinfo->zio_cmd_in_ack = zio_cmd; + (void) pthread_mutex_unlock(&zinfo->zinfo_mutex); + + LOG_DEBUG("ACK for op: %d, seq-id: %ld", + zio_cmd->hdr.opcode, zio_cmd->hdr.io_seq); + + /* account for space taken by metadata headers */ + if (zio_cmd->hdr.status == ZVOL_OP_STATUS_OK && + zio_cmd->hdr.opcode == ZVOL_OPCODE_READ) { + md_len = 0; + for (metadata_desc_t *md = zio_cmd->metadata_desc; + md != NULL; + md = md->next) { + md_len++; + } + /* we need at least one header even if no metadata */ + if (md_len == 0) + md_len++; + zio_cmd->hdr.len += (md_len * + sizeof (struct zvol_io_rw_hdr)); + } + + rc = uzfs_zvol_socket_write(zio_cmd->conn, + (char *)&zio_cmd->hdr, sizeof (zio_cmd->hdr)); + if (rc == -1) { + LOG_ERRNO("socket write err"); + zinfo->zio_cmd_in_ack = NULL; + /* + * exit due to network errors on fd related + * to iscsi target + */ + if (zio_cmd->conn == fd) { + zio_cmd_free(&zio_cmd); + goto exit; + } + zio_cmd_free(&zio_cmd); + continue; + } + + if (zio_cmd->hdr.opcode == ZVOL_OPCODE_READ) { + if (zio_cmd->hdr.status == ZVOL_OP_STATUS_OK) { + /* Send data read from disk */ + rc = uzfs_send_reads(zio_cmd->conn, zio_cmd); + if (rc == -1) { + zinfo->zio_cmd_in_ack = NULL; + LOG_ERRNO("socket write err"); + if (zio_cmd->conn == fd) { + zio_cmd_free(&zio_cmd); + goto exit; + } + } + } + atomic_inc_64(&zinfo->read_req_ack_cnt); + } else { + if (zio_cmd->hdr.opcode == ZVOL_OPCODE_WRITE) + atomic_inc_64(&zinfo->write_req_ack_cnt); + else if (zio_cmd->hdr.opcode == ZVOL_OPCODE_SYNC) + atomic_inc_64(&zinfo->sync_req_ack_cnt); + } + zinfo->zio_cmd_in_ack = NULL; + zio_cmd_free(&zio_cmd); + } +exit: + zinfo->zio_cmd_in_ack = NULL; + shutdown(fd, SHUT_RDWR); + LOG_INFO("Data connection for zvol %s closed on fd: %d", + zinfo->name, fd); + + remove_pending_cmds_to_ack(fd, zinfo); + + (void) pthread_mutex_lock(&zinfo->zinfo_mutex); + zinfo->conn_closed = B_FALSE; + zinfo->is_io_ack_sender_created = B_FALSE; + (void) pthread_mutex_unlock(&zinfo->zinfo_mutex); + + uzfs_zinfo_drop_refcnt(zinfo); + + zk_thread_exit(); +} +/* + * Process open request on data connection, the first message. + * + * Return status meaning: + * != 0: OPEN failed, stop reading data from connection. + * == 0 && zinfopp == NULL: OPEN failed, recoverable error + * == 0 && zinfopp != NULL: OPEN succeeded, proceed with other commands + */ +static int +open_zvol(int fd, zvol_info_t **zinfopp) +{ + int rc; + zvol_io_hdr_t hdr; + zvol_op_open_data_t open_data; + zvol_info_t *zinfo = NULL; + zvol_state_t *zv = NULL; + kthread_t *thrd_info; + thread_args_t *thrd_arg; + int rele_dataset_on_error = 0; + + /* + * If we don't know the version yet, be more careful when + * reading header + */ + if (uzfs_zvol_read_header(fd, &hdr) != 0) { + LOG_ERR("error reading open header"); + return (-1); + } + if (hdr.opcode != ZVOL_OPCODE_OPEN) { + LOG_ERR("zvol must be opened first"); + return (-1); + } + if (hdr.len != sizeof (open_data)) { + LOG_ERR("Invalid payload length for open"); + return (-1); + } + rc = uzfs_zvol_socket_read(fd, (char *)&open_data, sizeof (open_data)); + if (rc != 0) { + LOG_ERR("Payload read failed"); + return (-1); + } + + open_data.volname[MAX_NAME_LEN - 1] = '\0'; + zinfo = uzfs_zinfo_lookup(open_data.volname); + if (zinfo == NULL) { + LOG_ERR("zvol %s not found", open_data.volname); + hdr.status = ZVOL_OP_STATUS_FAILED; + goto open_reply; + } + if (zinfo->state != ZVOL_INFO_STATE_ONLINE) { + LOG_ERR("zvol %s is not online", open_data.volname); + hdr.status = ZVOL_OP_STATUS_FAILED; + goto open_reply; + } + if (zinfo->is_io_ack_sender_created != B_FALSE) { + LOG_ERR("zvol %s ack sender already present", + open_data.volname); + hdr.status = ZVOL_OP_STATUS_FAILED; + goto open_reply; + } + + zv = zinfo->zv; + ASSERT3P(zv, !=, NULL); + + if (zv->zv_metavolblocksize != 0) { + LOG_ERR("there might be already a data connection for %s", + open_data.volname); + hdr.status = ZVOL_OP_STATUS_FAILED; + goto open_reply; + } + + ASSERT3P(zv->zv_status, ==, ZVOL_STATUS_DEGRADED); + ASSERT3P(zv->rebuild_info.zv_rebuild_status, ==, ZVOL_REBUILDING_INIT); + + if ((zv->zv_status != ZVOL_STATUS_DEGRADED) || + ((zv->rebuild_info.zv_rebuild_status != ZVOL_REBUILDING_INIT) && + (zv->rebuild_info.zv_rebuild_status != ZVOL_REBUILDING_FAILED))) { + LOG_ERR("as status for %s is %d or rebuild status is %d", + open_data.volname, zv->zv_status, + zv->rebuild_info.zv_rebuild_status); + hdr.status = ZVOL_OP_STATUS_FAILED; + goto open_reply; + } + // validate block size (only one bit is set in the number) + if (open_data.tgt_block_size == 0 || + (open_data.tgt_block_size & (open_data.tgt_block_size - 1)) != 0) { + LOG_ERR("Invalid block size"); + hdr.status = ZVOL_OP_STATUS_FAILED; + goto open_reply; + } + + (void) pthread_mutex_lock(&zinfo->zinfo_mutex); + /* + * Hold objset if this is the first query for the zvol. This can happen + * in case that the target creates data connection directly without + * getting the endpoint through mgmt connection first. + */ + rele_dataset_on_error = 0; + if (zv->zv_objset == NULL) { + if (uzfs_hold_dataset(zv) != 0) { + (void) pthread_mutex_unlock(&zinfo->zinfo_mutex); + LOG_ERR("Failed to hold zvol during open"); + hdr.status = ZVOL_OP_STATUS_FAILED; + goto open_reply; + } + rele_dataset_on_error = 1; + } + if (uzfs_update_metadata_granularity(zv, + open_data.tgt_block_size) != 0) { + (void) pthread_mutex_unlock(&zinfo->zinfo_mutex); + if (rele_dataset_on_error == 1) + uzfs_rele_dataset(zv); + LOG_ERR("Failed to set granularity of metadata"); + hdr.status = ZVOL_OP_STATUS_FAILED; + goto open_reply; + } + /* + * TODO: Once we support multiple concurrent data connections for a + * single zvol, we should probably check that the timeout is the same + * for all data connections. + */ + uzfs_update_ionum_interval(zinfo, open_data.timeout); + zinfo->timeout = open_data.timeout; + *zinfopp = zinfo; + + zinfo->conn_closed = B_FALSE; + zinfo->is_io_ack_sender_created = B_TRUE; + (void) pthread_mutex_unlock(&zinfo->zinfo_mutex); + thrd_arg = kmem_alloc(sizeof (thread_args_t), KM_SLEEP); + thrd_arg->fd = fd; + thrd_arg->zinfo = zinfo; + uzfs_zinfo_take_refcnt(zinfo); + thrd_info = zk_thread_create(NULL, 0, + (thread_func_t)uzfs_zvol_io_ack_sender, (void *)thrd_arg, 0, NULL, + TS_RUN, 0, PTHREAD_CREATE_DETACHED); + VERIFY3P(thrd_info, !=, NULL); + + hdr.status = ZVOL_OP_STATUS_OK; + +open_reply: + hdr.len = 0; + rc = uzfs_zvol_socket_write(fd, (char *)&hdr, sizeof (hdr)); + + /* + * Reinitializing zv states during this error is taken care + * in open_zvol caller + */ + if (rc == -1) + LOG_ERR("Failed to send reply for open request"); + if (hdr.status != ZVOL_OP_STATUS_OK) { + ASSERT3P(*zinfopp, ==, NULL); + if (zinfo != NULL) + uzfs_zinfo_drop_refcnt(zinfo); + return (-1); + } + return (rc); +} + +/* + * IO-Receiver would be per ZVOL, it would be + * responsible for receiving IOs on given socket. + */ +void +uzfs_zvol_io_receiver(void *arg) +{ + int rc; + int fd = (uintptr_t)arg; + zvol_info_t *zinfo = NULL; + zvol_io_cmd_t *zio_cmd; + zvol_io_hdr_t hdr; + + prctl(PR_SET_NAME, "io_receiver", 0, 0, 0); + + /* First command should be OPEN */ + while (zinfo == NULL) { + if (open_zvol(fd, &zinfo) != 0) { + if ((zinfo != NULL) && + (zinfo->is_io_ack_sender_created)) + goto exit; + shutdown(fd, SHUT_RDWR); + goto thread_exit; + } + } + + LOG_INFO("Data connection associated with zvol %s fd: %d", + zinfo->name, fd); + + while ((rc = uzfs_zvol_socket_read(fd, (char *)&hdr, sizeof (hdr))) == + 0) { + if ((zinfo->state == ZVOL_INFO_STATE_OFFLINE)) + break; + + if (hdr.opcode != ZVOL_OPCODE_WRITE && + hdr.opcode != ZVOL_OPCODE_READ && + hdr.opcode != ZVOL_OPCODE_SYNC) { + LOG_ERR("Unexpected opcode %d", hdr.opcode); + break; + } + + if (((hdr.opcode == ZVOL_OPCODE_WRITE) || + (hdr.opcode == ZVOL_OPCODE_READ)) && !hdr.len) { + LOG_ERR("Zero Payload size for opcode %d", hdr.opcode); + break; + } else if ((hdr.opcode == ZVOL_OPCODE_SYNC) && hdr.len > 0) { + LOG_ERR("Unexpected payload for opcode %d", hdr.opcode); + break; + } + + zio_cmd = zio_cmd_alloc(&hdr, fd); + /* Read payload for commands which have it */ + if (hdr.opcode == ZVOL_OPCODE_WRITE) { + rc = uzfs_zvol_socket_read(fd, zio_cmd->buf, hdr.len); + if (rc != 0) { + zio_cmd_free(&zio_cmd); + break; + } + } + + if (zinfo->state == ZVOL_INFO_STATE_OFFLINE) { + zio_cmd_free(&zio_cmd); + break; + } + /* Take refcount for uzfs_zvol_worker to work on it */ + uzfs_zinfo_take_refcnt(zinfo); + zio_cmd->zv = zinfo; + taskq_dispatch(zinfo->uzfs_zvol_taskq, uzfs_zvol_worker, + zio_cmd, TQ_SLEEP); + } +exit: + (void) pthread_mutex_lock(&zinfo->zinfo_mutex); + zinfo->conn_closed = B_TRUE; + /* + * Send signal to ack sender so that it can free + * zio_cmd, close fd and exit. + */ + if (zinfo->io_ack_waiting) { + rc = pthread_cond_signal(&zinfo->io_ack_cond); + } + /* + * wait for ack thread to exit to avoid races with new + * connections for the same zinfo + */ + while (zinfo->conn_closed && zinfo->is_io_ack_sender_created) { + (void) pthread_mutex_unlock(&zinfo->zinfo_mutex); + usleep(1000); + (void) pthread_mutex_lock(&zinfo->zinfo_mutex); + } + (void) pthread_mutex_unlock(&zinfo->zinfo_mutex); + + shutdown_fds_related_to_zinfo(zinfo); + + zinfo->io_ack_waiting = 0; + + reinitialize_zv_state(zinfo->zv); + uzfs_zinfo_drop_refcnt(zinfo); +thread_exit: + close(fd); + LOG_INFO("Data connection closed on fd: %d", fd); + zk_thread_exit(); +} diff --git a/lib/libzrepl/mgmt_conn.c b/lib/libzrepl/mgmt_conn.c index d2128d752fdc..8110ea9c3565 100644 --- a/lib/libzrepl/mgmt_conn.c +++ b/lib/libzrepl/mgmt_conn.c @@ -1138,10 +1138,6 @@ uzfs_zvol_mgmt_thread(void *arg) async_task_t *async_task; struct timespec diff_time, now, last_time; - SLIST_INIT(&uzfs_mgmt_conns); - mutex_init(&conn_list_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&async_tasks_mtx, NULL, MUTEX_DEFAULT, NULL); - mgmt_eventfd = eventfd(0, EFD_NONBLOCK); if (mgmt_eventfd < 0) { perror("eventfd"); diff --git a/tests/cbtest/gtest/Makefile.am b/tests/cbtest/gtest/Makefile.am index 798c9bfcc60b..8e20f15e49fa 100644 --- a/tests/cbtest/gtest/Makefile.am +++ b/tests/cbtest/gtest/Makefile.am @@ -1,12 +1,12 @@ include $(top_srcdir)/config/Rules.am -AM_CPPFLAGS += $(DEBUG_STACKFLAGS) +AM_CPPFLAGS += $(DEBUG_CFLAGS) $(DEBUG_STACKFLAGS) DEFAULT_INCLUDES += \ -I$(top_srcdir)/include \ -I$(top_srcdir)/lib/libspl/include -sbin_PROGRAMS = test_uzfsserver test_uzfs test_zfs test_zrepl_prot +sbin_PROGRAMS = test_uzfs test_uzfsserver test_zfs test_zrepl_prot test_uzfsserver_SOURCES = test_uzfsserver.cc test_uzfs_SOURCES = test_uzfs.cc gtest_utils.cc @@ -31,6 +31,7 @@ test_uzfsserver_LDADD = \ test_uzfsserver_CXXFLAGS = -std=c++11 test_uzfsserver_LDFLAGS = -pthread -lgtest -lgtest_main +test_uzfs_CXXFLAGS = -std=c++11 test_uzfs_LDFLAGS = -pthread -lgtest -lgtest_main test_zrepl_prot_LDFLAGS = -pthread -lgtest -lgtest_main test_zfs_LDFLAGS = -pthread -lgtest -lgtest_main diff --git a/tests/cbtest/gtest/test_uzfs.cc b/tests/cbtest/gtest/test_uzfs.cc index 3c4e2c00b03e..f90195b8ec2e 100644 --- a/tests/cbtest/gtest/test_uzfs.cc +++ b/tests/cbtest/gtest/test_uzfs.cc @@ -52,12 +52,18 @@ zvol_state_t *zv2; zvol_info_t *zinfo; zvol_info_t *zinfo2; int rebuild_test_case = 0; +int data_conn_fd = -1; extern void (*zinfo_create_hook)(zvol_info_t *, nvlist_t *); extern void (*zinfo_destroy_hook)(zvol_info_t *); int receiver_created = 0; extern uint64_t zvol_rebuild_step_size; +void (*dw_replica_fn)(void *); +#if DEBUG +inject_error_t inject_error; +#endif + void make_vdev(const char *path) { @@ -194,11 +200,21 @@ uzfs_mock_rebuild_scanner(void *arg) rc = uzfs_zvol_socket_write(fd, (char *)&hdr, sizeof(hdr)); EXPECT_NE(rc, -1); - if (rebuild_test_case == 6) - goto exit; + if (rebuild_test_case == 6) { + close(data_conn_fd); + sleep(5); + } /* Read REBUILD_STEP */ rc = uzfs_zvol_socket_read(fd, (char *)&hdr, sizeof (hdr)); + if (rebuild_test_case == 6) { + if (rc != -1) + rc = uzfs_zvol_socket_read(fd, (char *)&hdr, sizeof (hdr)); + EXPECT_EQ(rc, -1); + sleep(3); + goto exit; + } + EXPECT_NE(rc, -1); EXPECT_EQ(hdr.opcode, ZVOL_OPCODE_REBUILD_STEP); EXPECT_EQ(hdr.status, ZVOL_OP_STATUS_OK); @@ -252,6 +268,11 @@ TEST(uZFS, Setup) { GtestUtils::strlcpy(pool_ds2, "pool1/vol3", MAXNAMELEN); signal(SIGPIPE, SIG_IGN); + mutex_init(&conn_list_mtx, NULL, MUTEX_DEFAULT, NULL); + SLIST_INIT(&uzfs_mgmt_conns); + mutex_init(&async_tasks_mtx, NULL, MUTEX_DEFAULT, NULL); + mgmt_eventfd = -1; + uzfs_init(); init_zrepl(); setup_unit_test(path); @@ -262,11 +283,6 @@ TEST(uZFS, Setup) { uzfs_hold_dataset(zv); uzfs_update_metadata_granularity(zv, 512); - mutex_init(&conn_list_mtx, NULL, MUTEX_DEFAULT, NULL); - SLIST_INIT(&uzfs_mgmt_conns); - mutex_init(&async_tasks_mtx, NULL, MUTEX_DEFAULT, NULL); - mgmt_eventfd = -1; - zinfo_create_hook = &zinfo_create_cb; zinfo_destroy_hook = &zinfo_destroy_cb; @@ -888,18 +904,22 @@ uzfs_mock_zvol_rebuild_dw_replica(void *arg) if (rebuild_test_case == 6) { hdr.offset = -1; rc = uzfs_zvol_socket_write(sfd, (char *)&hdr, sizeof (hdr)); - rc = -1; - goto exit; - } else if (rebuild_test_case == 7) { + if (rc != 0) { + goto exit; + } + } else if ((rebuild_test_case == 7) || (rebuild_test_case == 8) || (rebuild_test_case == 9)) { /* * Set offline state on vol3 */ - zinfo2->state = ZVOL_INFO_STATE_OFFLINE; +#if DEBUG + if ((rebuild_test_case == 7) || (rebuild_test_case == 8)) + inject_error.delay.helping_replica_rebuild_step = 1; +#endif rc = uzfs_zvol_socket_write(sfd, (char *)&hdr, sizeof (hdr)); if (rc != 0) { goto exit; } - } else if (rebuild_test_case == 8) { + } else if (rebuild_test_case == 10) { hdr.opcode = ZVOL_OPCODE_REBUILD_COMPLETE; rc = uzfs_zvol_socket_write(sfd, (char *)&hdr, sizeof (hdr)); if (rc != 0) { @@ -916,7 +936,30 @@ uzfs_mock_zvol_rebuild_dw_replica(void *arg) while (1) { + if ((rebuild_test_case == 7) || (rebuild_test_case == 8) || (rebuild_test_case == 9)) + { + sleep(1); + if (rebuild_test_case == 7) + zinfo2->state = ZVOL_INFO_STATE_OFFLINE; + else if (rebuild_test_case == 8) + zinfo2->is_io_ack_sender_created = B_FALSE; + else { + close(data_conn_fd); + sleep(5); + } +#if DEBUG + inject_error.delay.helping_replica_rebuild_step = 0; +#endif + } + rc = uzfs_zvol_socket_read(sfd, (char *)&hdr, sizeof (hdr)); + if (rebuild_test_case == 9) { + if (rc != -1) + rc = uzfs_zvol_socket_read(sfd, (char *)&hdr, sizeof (hdr)); + EXPECT_EQ(rc, -1); + sleep(3); + goto exit; + } if (rc != 0) { LOG_ERR("Socket read failed"); goto exit; @@ -1001,8 +1044,7 @@ uzfs_mock_zvol_rebuild_dw_replica(void *arg) } void execute_rebuild_test_case(const char *s, int test_case, - zvol_rebuild_status_t status, boolean_t is_test_positive, - boolean_t is_rebuild_scanner) + zvol_rebuild_status_t status, zvol_rebuild_status_t verify_status) { kthread_t *thrd; rebuild_thread_arg_t *rebuild_args; @@ -1015,16 +1057,10 @@ void execute_rebuild_test_case(const char *s, int test_case, uzfs_zinfo_take_refcnt(zinfo); uzfs_zvol_set_rebuild_status(zinfo->zv, status); - if (!is_rebuild_scanner) - thrd = zk_thread_create(NULL, 0, uzfs_zvol_rebuild_dw_replica, - rebuild_args, 0, NULL, TS_RUN, 0, 0); - else - thrd = zk_thread_create(NULL, 0, uzfs_mock_zvol_rebuild_dw_replica, - rebuild_args, 0, NULL, TS_RUN, 0, 0); + thrd = zk_thread_create(NULL, 0, dw_replica_fn, + rebuild_args, 0, NULL, TS_RUN, 0, 0); zk_thread_join(thrd->t_tid); - EXPECT_EQ(2, zinfo->refcnt); - /* wait for rebuild thread to exit */ while (1) { if (rebuild_test_case != 0) @@ -1033,44 +1069,119 @@ void execute_rebuild_test_case(const char *s, int test_case, break; } - if (!is_test_positive) - EXPECT_EQ(ZVOL_REBUILDING_FAILED, uzfs_zvol_get_rebuild_status(zinfo->zv)); - else - EXPECT_EQ(ZVOL_REBUILDING_DONE, uzfs_zvol_get_rebuild_status(zinfo->zv)); + EXPECT_EQ(2, zinfo->refcnt); + + EXPECT_EQ(verify_status, uzfs_zvol_get_rebuild_status(zinfo->zv)); } -TEST(uZFS, TestRebuild) { - uzfs_mgmt_conn_t *conn; - mgmt_ack_t *mack; - char ip[MAX_IP_LEN]; - kthread_t *thrd; +TEST(uZFS, TestRebuildAbrupt) { rebuild_scanner = &uzfs_mock_rebuild_scanner; - rebuild_thread_arg_t *rebuild_args; + dw_replica_fn = &uzfs_zvol_rebuild_dw_replica; zvol_rebuild_step_size = (1024ULL * 1024ULL * 1024ULL) / 2 + 1000; /* thread that helps rebuilding exits abruptly just after connects */ - execute_rebuild_test_case("rebuild abrupt", 1, ZVOL_REBUILDING_IN_PROGRESS, B_FALSE, B_FALSE); + execute_rebuild_test_case("rebuild abrupt", 1, ZVOL_REBUILDING_IN_PROGRESS, ZVOL_REBUILDING_FAILED); +} +TEST(uZFS, TestRebuildGrace) { /* thread that helps rebuilding exits gracefully just after connects */ - execute_rebuild_test_case("rebuild grace", 2, ZVOL_REBUILDING_IN_PROGRESS, B_FALSE, B_FALSE); + execute_rebuild_test_case("rebuild grace", 2, ZVOL_REBUILDING_IN_PROGRESS, ZVOL_REBUILDING_FAILED); +} +TEST(uZFS, TestRebuildErrorState) { /* rebuild state is ERRORED on dw replica */ - execute_rebuild_test_case("rebuild error state", 2, ZVOL_REBUILDING_ERRORED, B_FALSE, B_FALSE); + execute_rebuild_test_case("rebuild error state", 2, ZVOL_REBUILDING_ERRORED, ZVOL_REBUILDING_FAILED); +} +TEST(uZFS, TestRebuildExitAfterStep) { /* thread helping rebuild will exit after reading REBUILD_STEP */ - execute_rebuild_test_case("rebuild exit after step", 3, ZVOL_REBUILDING_IN_PROGRESS, B_FALSE, B_FALSE); + execute_rebuild_test_case("rebuild exit after step", 3, ZVOL_REBUILDING_IN_PROGRESS, ZVOL_REBUILDING_FAILED); +} +TEST(uZFS, TestRebuildExitAfterInvalidWrite) { /* thread helping rebuild will exit after writng invalid write IO */ - execute_rebuild_test_case("rebuild exit after invalid write", 4, ZVOL_REBUILDING_IN_PROGRESS, B_FALSE, B_FALSE); + execute_rebuild_test_case("rebuild exit after invalid write", 4, ZVOL_REBUILDING_IN_PROGRESS, ZVOL_REBUILDING_FAILED); +} +TEST(uZFS, TestRebuildExitAfterValidWrite) { /* thread helping rebuild will exit after writng valid write IO */ - execute_rebuild_test_case("rebuild exit after valid write", 5, ZVOL_REBUILDING_IN_PROGRESS, B_FALSE, B_FALSE); + execute_rebuild_test_case("rebuild exit after valid write", 5, ZVOL_REBUILDING_IN_PROGRESS, ZVOL_REBUILDING_FAILED); +} + +/* + * THIS IS COPIED FROM test_zrepl_prot.cc + */ +/* + * This fn does data conn for a host:ip and volume, and fills data fd + * + * NOTE: Return value must be void otherwise we could not use asserts + * (pecularity of gtest framework). + */ +static void do_data_connection(int &data_fd, std::string host, uint16_t port, + std::string zvol_name, int bs=512, int timeout=120, + int res=ZVOL_OP_STATUS_OK) { + struct sockaddr_in addr; + zvol_io_hdr_t hdr_in, hdr_out = {0}; + zvol_op_open_data_t open_data; + int rc; + char val; + int fd; + + memset(&addr, 0, sizeof (addr)); + addr.sin_family = AF_INET; + addr.sin_port = htons(port); + rc = inet_pton(AF_INET, host.c_str(), &addr.sin_addr); + ASSERT_TRUE(rc > 0); +retry: + fd = socket(AF_INET, SOCK_STREAM, 0); + rc = connect(fd, (struct sockaddr *)&addr, sizeof (addr)); + if (rc != 0) { + perror("connect"); + ASSERT_EQ(errno, 0); + } + hdr_out.version = REPLICA_VERSION; + hdr_out.opcode = ZVOL_OPCODE_OPEN; + hdr_out.status = ZVOL_OP_STATUS_OK; + hdr_out.len = sizeof (open_data); + + rc = write(fd, &hdr_out, sizeof (hdr_out)); + ASSERT_EQ(rc, sizeof (hdr_out)); + + open_data.tgt_block_size = bs; + open_data.timeout = timeout; + GtestUtils::strlcpy(open_data.volname, zvol_name.c_str(), + sizeof (open_data.volname)); + rc = write(fd, &open_data, hdr_out.len); + + rc = read(fd, &hdr_in, sizeof (hdr_in)); + ASSERT_EQ(rc, sizeof (hdr_in)); + ASSERT_EQ(hdr_in.version, REPLICA_VERSION); + ASSERT_EQ(hdr_in.opcode, ZVOL_OPCODE_OPEN); + ASSERT_EQ(hdr_in.len, 0); + if (hdr_in.status != res) { + sleep(2); + shutdown(fd, SHUT_WR); + rc = read(fd, &val, sizeof (val)); + close(fd); + goto retry; + } + data_fd = fd; +} - /* thread helping rebuild will exit after writng valid write IO and REBUILD_STEP_DONE */ - execute_rebuild_test_case("rebuild exit after valid write and rebuild_step", 6, ZVOL_REBUILDING_IN_PROGRESS, B_FALSE, B_FALSE); +TEST(uZFS, TestRebuildCompleteWithDataConn) { + io_receiver = &uzfs_zvol_io_receiver; + uzfs_update_metadata_granularity(zv, 0); + uzfs_zvol_set_rebuild_status(zv, ZVOL_REBUILDING_INIT); + do_data_connection(data_conn_fd, "127.0.0.1", 3232, "vol1"); /* thread helping rebuild will exit after writing valid write IO and REBUILD_STEP_DONE, and reads REBUILD_STEP, writes REBUILD_STEP_DONE */ - execute_rebuild_test_case("complete rebuild", 7, ZVOL_REBUILDING_IN_PROGRESS, B_TRUE, B_FALSE); + execute_rebuild_test_case("complete rebuild with data conn", 6, ZVOL_REBUILDING_IN_PROGRESS, ZVOL_REBUILDING_INIT); +} + +TEST(uZFS, TestRebuildComplete) { + uzfs_update_metadata_granularity(zv, 512); + /* thread helping rebuild will exit after writing valid write IO and REBUILD_STEP_DONE, and reads REBUILD_STEP, writes REBUILD_STEP_DONE */ + execute_rebuild_test_case("complete rebuild", 7, ZVOL_REBUILDING_IN_PROGRESS, ZVOL_REBUILDING_DONE); EXPECT_EQ(ZVOL_STATUS_HEALTHY, uzfs_zvol_get_status(zinfo->zv)); memset(&zinfo->zv->rebuild_info, 0, sizeof (zvol_rebuild_info_t)); @@ -1078,85 +1189,82 @@ TEST(uZFS, TestRebuild) { TEST(RebuildScanner, AbruptClose) { rebuild_scanner = &uzfs_zvol_rebuild_scanner; + dw_replica_fn = &uzfs_mock_zvol_rebuild_dw_replica; zvol_rebuild_step_size = (1024ULL * 1024ULL * 100); + zinfo2->state = ZVOL_INFO_STATE_ONLINE; /* Rebuild thread exits abruptly just after connect */ execute_rebuild_test_case("Rebuild abrupt", 1, - ZVOL_REBUILDING_IN_PROGRESS, B_FALSE, B_TRUE); - EXPECT_EQ(ZVOL_REBUILDING_FAILED, uzfs_zvol_get_rebuild_status(zinfo->zv)); + ZVOL_REBUILDING_IN_PROGRESS, ZVOL_REBUILDING_FAILED); } TEST(RebuildScanner, WrongOpcode) { - rebuild_scanner = &uzfs_zvol_rebuild_scanner; - zvol_rebuild_step_size = (1024ULL * 1024ULL * 100); - /* Rebuild thread sending wrong opcode after connectg */ execute_rebuild_test_case("Wrong opcode", 2, - ZVOL_REBUILDING_IN_PROGRESS, B_FALSE, B_TRUE); - EXPECT_EQ(ZVOL_REBUILDING_FAILED, uzfs_zvol_get_rebuild_status(zinfo->zv)); + ZVOL_REBUILDING_IN_PROGRESS, ZVOL_REBUILDING_FAILED); } TEST(RebuildScanner, ErrorOut) { - rebuild_scanner = &uzfs_zvol_rebuild_scanner; - zvol_rebuild_step_size = (1024ULL * 1024ULL * 100); - /* Rebuild thread exits after handshake */ execute_rebuild_test_case("Rebuild error out", 3, - ZVOL_REBUILDING_ERRORED, B_FALSE, B_TRUE); - EXPECT_EQ(ZVOL_REBUILDING_FAILED, uzfs_zvol_get_rebuild_status(zinfo->zv)); + ZVOL_REBUILDING_ERRORED, ZVOL_REBUILDING_FAILED); } TEST(RebuildScanner, WrongVolname) { - rebuild_scanner = &uzfs_zvol_rebuild_scanner; - zvol_rebuild_step_size = (1024ULL * 1024ULL * 100); - /* Rebuild thread sending wrong vol name */ execute_rebuild_test_case("Wrong vol name", 4, - ZVOL_REBUILDING_IN_PROGRESS, B_FALSE, B_TRUE); - - EXPECT_EQ(ZVOL_REBUILDING_FAILED, uzfs_zvol_get_rebuild_status(zinfo->zv)); + ZVOL_REBUILDING_IN_PROGRESS, ZVOL_REBUILDING_FAILED); } TEST(RebuildScanner, HandshakeAgaian) { - rebuild_scanner = &uzfs_zvol_rebuild_scanner; - zvol_rebuild_step_size = (1024ULL * 1024ULL * 100); - /* Rebuild thread sending handshake again on same volume */ execute_rebuild_test_case("Send handshake again", 5, - ZVOL_REBUILDING_IN_PROGRESS, B_FALSE, B_TRUE); - EXPECT_EQ(ZVOL_REBUILDING_FAILED, uzfs_zvol_get_rebuild_status(zinfo->zv)); + ZVOL_REBUILDING_IN_PROGRESS, ZVOL_REBUILDING_FAILED); } TEST(RebuildScanner, VolumeTooLargeToHandle) { - rebuild_scanner = &uzfs_zvol_rebuild_scanner; - zvol_rebuild_step_size = (1024ULL * 1024ULL * 1024ULL * 100); - /* Rebuild thread sending handshake again on same volume */ execute_rebuild_test_case("Volume offset and len too large", 6, - ZVOL_REBUILDING_IN_PROGRESS, B_FALSE, B_TRUE); - EXPECT_EQ(ZVOL_REBUILDING_FAILED, uzfs_zvol_get_rebuild_status(zinfo->zv)); + ZVOL_REBUILDING_IN_PROGRESS, ZVOL_REBUILDING_FAILED); } TEST(RebuildScanner, VolumeOffline) { - rebuild_scanner = &uzfs_zvol_rebuild_scanner; zvol_rebuild_step_size = (1024ULL * 1024ULL * 1); - /* Rebuild thread sending handshake again on same volume */ + /* Set offline state on vol3 */ + zinfo2->state = ZVOL_INFO_STATE_ONLINE; execute_rebuild_test_case("Volume offline", 7, - ZVOL_REBUILDING_IN_PROGRESS, B_FALSE, B_TRUE); - EXPECT_EQ(ZVOL_REBUILDING_FAILED, uzfs_zvol_get_rebuild_status(zinfo->zv)); + ZVOL_REBUILDING_IN_PROGRESS, ZVOL_REBUILDING_FAILED); + zinfo2->state = ZVOL_INFO_STATE_ONLINE; } -TEST(RebuildScanner, RebuildSuccess) { - rebuild_scanner = &uzfs_zvol_rebuild_scanner; +TEST(RebuildScanner, AckSenderCreatedFalse) { + /* Set io_ack_sender_created as B_FALSE */ + zinfo2->is_io_ack_sender_created = B_TRUE; + execute_rebuild_test_case("Ack Sender Created False", 8, + ZVOL_REBUILDING_IN_PROGRESS, ZVOL_REBUILDING_FAILED); + zinfo2->is_io_ack_sender_created = B_FALSE; +} + +TEST(RebuildScanner, ShutdownRebuildFd) { + /* Set io_ack_sender_created as B_FALSE */ + uzfs_update_metadata_granularity(zv2, 0); + uzfs_zvol_set_rebuild_status(zv2, ZVOL_REBUILDING_INIT); + do_data_connection(data_conn_fd, "127.0.0.1", 3232, "vol3"); + execute_rebuild_test_case("Shutdown Rebuild FD", 9, + ZVOL_REBUILDING_IN_PROGRESS, ZVOL_REBUILDING_FAILED); +} +TEST(RebuildScanner, RebuildSuccess) { + uzfs_update_metadata_granularity(zv2, 0); + uzfs_zvol_set_rebuild_status(zv2, ZVOL_REBUILDING_INIT); + do_data_connection(data_conn_fd, "127.0.0.1", 3232, "vol3"); zvol_rebuild_step_size = (1024ULL * 1024ULL * 100); /* Rebuild thread sendinc complete opcode */ - execute_rebuild_test_case("complete rebuild", 8, - ZVOL_REBUILDING_IN_PROGRESS, B_TRUE, B_TRUE); + execute_rebuild_test_case("complete rebuild", 10, + ZVOL_REBUILDING_IN_PROGRESS, ZVOL_REBUILDING_DONE); EXPECT_EQ(ZVOL_STATUS_HEALTHY, uzfs_zvol_get_status(zinfo->zv)); - memset(&zinfo->zv->rebuild_info, 0, sizeof (zvol_rebuild_info_t)); } diff --git a/tests/cbtest/gtest/test_uzfsserver.cc b/tests/cbtest/gtest/test_uzfsserver.cc index b559eccdca95..dfb2cf4b03ca 100644 --- a/tests/cbtest/gtest/test_uzfsserver.cc +++ b/tests/cbtest/gtest/test_uzfsserver.cc @@ -29,6 +29,11 @@ /* Avoid including conflicting C++ declarations for LE-BE conversions */ #define _SYS_BYTEORDER_H #include +#include + +#if DEBUG +inject_error_t inject_error; +#endif TEST(uZFSServer, Setup) { kernel_init(FREAD);