#0 std::sync::mpmc::list::Slot<timely_communication::message::Message<(usize, usize, alloc::vec::Vec<((timely::progress::Location, usize), i64), alloc::alloc::Global>)>>::wait_write<timely_communication::message::Message<(usize, usize, alloc::vec::Vec<((timely::progress::Location, usize), i64), alloc::alloc::Global>)>> (self=<optimized out>) at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/sync/mpmc/list.rs:48
#1 std::sync::mpmc::list::Channel<timely_communication::message::Message<(usize, usize, alloc::vec::Vec<((timely::progress::Location, usize), i64), alloc::alloc::Global>)>>::discard_all_messages<timely_communication::message::Message<(usize, usize, alloc::vec::Vec<((timely::progress::Location, usize), i64), alloc::alloc::Global>)>> (self=0x7ffe5d371300) at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/sync/mpmc/list.rs:560
#2 std::sync::mpmc::list::Channel<timely_communication::message::Message<(usize, usize, alloc::vec::Vec<((timely::progress::Location, usize), i64), alloc::alloc::Global>)>>::disconnect_receivers<timely_communication::message::Message<(usize, usize, alloc::vec::Vec<((timely::progress::Location, usize), i64), alloc::alloc::Global>)>> (self=0x7ffe5d371300) at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/sync/mpmc/list.rs:523
#3 0x000055555557a6bf in std::sync::mpmc::{impl#15}::drop::{closure#1}<timely_communication::message::Message<(usize, usize, alloc::vec::Vec<((timely::progress::Location, usize), i64), alloc::alloc::Global>)>> (c=0x7ffe5d371300)
at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/sync/mpmc/mod.rs:407
#4 std::sync::mpmc::counter::Receiver<std::sync::mpmc::list::Channel<timely_communication::message::Message<(usize, usize, alloc::vec::Vec<((timely::progress::Location, usize), i64), alloc::alloc::Global>)>>>::release<std::sync::mpmc::list::Channel<timely_communication::message::Message<(usize, usize, alloc::vec::Vec<((timely::progress::Location, usize), i64), alloc::alloc::Global>)>>, std::sync::mpmc::{impl#15}::drop::{closure_env#1}<timely_communication::message::Message<(usize, usize, alloc::vec::Vec<((timely::progress::Location, usize), i64), alloc::alloc::Global>)>>> (self=0x7fffe80cd858, disconnect=...)
at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/sync/mpmc/counter.rs:116
#5 0x00005555555a4ad6 in core::ptr::drop_in_place<alloc::boxed::Box<dyn timely_communication::Pull<timely_communication::message::Message<(usize, usize, alloc::vec::Vec<((timely::progress::Location, usize), i64), alloc::alloc::Global>)>>, alloc::alloc::Global>> () at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/core/src/ptr/mod.rs:490
#6 core::ptr::drop_in_place<timely::progress::broadcast::Progcaster<usize>> () at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/core/src/ptr/mod.rs:490
#7 0x00005555555a57f1 in core::ptr::drop_in_place<timely::progress::subgraph::Subgraph<(), usize>> () at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/core/src/ptr/mod.rs:490
#8 0x00005555555c24ed in core::ptr::drop_in_place<alloc::boxed::Box<dyn timely::scheduling::Schedule, alloc::alloc::Global>> () at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/core/src/ptr/mod.rs:490
#9 core::ptr::drop_in_place<core::option::Option<alloc::boxed::Box<dyn timely::scheduling::Schedule, alloc::alloc::Global>>> () at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/core/src/ptr/mod.rs:490
#10 timely::worker::{impl#8}::drop (self=0x7ffff7b67530) at timely/src/worker.rs:775
#11 0x00005555555a3e3d in core::ptr::drop_in_place<timely::worker::Wrapper> () at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/core/src/ptr/mod.rs:490
#12 0x00005555555a75a0 in timely::worker::Worker<timely_communication::allocator::generic::Generic>::drop_dataflow<timely_communication::allocator::generic::Generic> (self=<optimized out>, dataflow_identifier=<optimized out>)
at timely/src/worker.rs:685
#13 0x000055555557dffb in channel_segfault::main::{closure#0} (worker=0x7ffff7b67840) at timely/examples/channel_segfault.rs:25
#14 timely::execute::execute::{closure#1}<(), channel_segfault::main::{closure_env#0}> (allocator=...) at timely/src/execute.rs:287
#15 timely_communication::initialize::initialize_from::{closure#0}<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute::{closure_env#1}<(), channel_segfault::main::{closure_env#0}>> ()
at communication/src/initialize.rs:316
#16 std::sys_common::backtrace::__rust_begin_short_backtrace<timely_communication::initialize::initialize_from::{closure_env#0}<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute::{closure_env#1}<(), channel_segfault::main::{closure_env#0}>>, ()> (f=...) at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/sys_common/backtrace.rs:121
#17 0x000055555557eaa7 in std::thread::{impl#0}::spawn_unchecked_::{closure#1}::{closure#0}<timely_communication::initialize::initialize_from::{closure_env#0}<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute::{closure_env#1}<(), channel_segfault::main::{closure_env#0}>>, ()> () at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/thread/mod.rs:558
#18 core::panic::unwind_safe::{impl#23}::call_once<(), std::thread::{impl#0}::spawn_unchecked_::{closure#1}::{closure_env#0}<timely_communication::initialize::initialize_from::{closure_env#0}<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute::{closure_env#1}<(), channel_segfault::main::{closure_env#0}>>, ()>> (self=<error reading variable: Cannot access memory at address 0x0>, _args=<optimized out>)
at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/core/src/panic/unwind_safe.rs:271
#19 std::panicking::try::do_call<core::panic::unwind_safe::AssertUnwindSafe<std::thread::{impl#0}::spawn_unchecked_::{closure#1}::{closure_env#0}<timely_communication::initialize::initialize_from::{closure_env#0}<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute::{closure_env#1}<(), channel_segfault::main::{closure_env#0}>>, ()>>, ()> (data=<optimized out>)
at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/panicking.rs:483
#20 std::panicking::try<(), core::panic::unwind_safe::AssertUnwindSafe<std::thread::{impl#0}::spawn_unchecked_::{closure#1}::{closure_env#0}<timely_communication::initialize::initialize_from::{closure_env#0}<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute::{closure_env#1}<(), channel_segfault::main::{closure_env#0}>>, ()>>> (f=<error reading variable: Cannot access memory at address 0x0>)
at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/panicking.rs:447
#21 std::panic::catch_unwind<core::panic::unwind_safe::AssertUnwindSafe<std::thread::{impl#0}::spawn_unchecked_::{closure#1}::{closure_env#0}<timely_communication::initialize::initialize_from::{closure_env#0}<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute::{closure_env#1}<(), channel_segfault::main::{closure_env#0}>>, ()>>, ()> (f=<error reading variable: Cannot access memory at address 0x0>)
at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/panic.rs:140
#22 std::thread::{impl#0}::spawn_unchecked_::{closure#1}<timely_communication::initialize::initialize_from::{closure_env#0}<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute::{closure_env#1}<(), channel_segfault::main::{closure_env#0}>>, ()> () at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/thread/mod.rs:557
#23 core::ops::function::FnOnce::call_once<std::thread::{impl#0}::spawn_unchecked_::{closure_env#1}<timely_communication::initialize::initialize_from::{closure_env#0}<timely_communication::allocator::generic::GenericBuilder, (), timely::execute::execute::{closure_env#1}<(), channel_segfault::main::{closure_env#0}>>, ()>, ()> () at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/core/src/ops/function.rs:250
#24 0x0000555555613be3 in alloc::boxed::{impl#45}::call_once<(), dyn core::ops::function::FnOnce<(), Output=()>, alloc::alloc::Global> () at library/alloc/src/boxed.rs:1988
#25 alloc::boxed::{impl#45}::call_once<(), alloc::boxed::Box<dyn core::ops::function::FnOnce<(), Output=()>, alloc::alloc::Global>, alloc::alloc::Global> () at library/alloc/src/boxed.rs:1988
#26 std::sys::unix::thread::{impl#2}::new::thread_start () at library/std/src/sys/unix/thread.rs:108
#27 0x00007ffff7e01b43 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#28 0x00007ffff7e93a00 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81
The unbounded channel implementation from `std::sync::mpsc` can segfault due to what looks like a race condition when dropping channel senders and receivers.

You can find a repro at https://github.com/teskje/timely-dataflow/tree/channel-segfault. Unfortunately it is not minimal at all, since it depends on the timely-dataflow crate, which is the actual channel user. I hope it is still self-contained enough that people familiar with the channel implementation can run it and use it as a starting point for debugging.
Run the `channel_segfault` example like this:

On some machines* it will segfault after some time, e.g.:
The segfault happens inside `std::sync::mpmc::list`, as visible in the gdb backtrace below.

The same segfault also happens with `crossbeam_channel`. You can reproduce this by reverting the second-to-last commit on the repro branch. I understand that `std::sync::mpsc` is a port of `crossbeam_channel`, so that's not surprising.

Meta
`rustc --version --verbose`:

I can reproduce the segfault on nightly as well.
*The segfault is not reproducible on all machines! We were able to reproduce it on:
It doesn't seem to reproduce on:
Also note that the repro uses more and more memory over time. That is because the different worker threads create and drop dataflows at different speeds, and timely-dataflow keeps per-dataflow communication state (including channels) around until the last worker has dropped the dataflow. It is possible to avoid that by adding a `thread::sleep` that lets late workers catch up, but in my experience that also stops the segfault from happening.

Backtrace