From b30c223927a46008dc9c441ac0a6ac90d25041c1 Mon Sep 17 00:00:00 2001 From: Ryan Lopopolo Date: Tue, 9 Jul 2019 22:40:15 -0700 Subject: [PATCH 1/6] Add local script to format text without waiting for the linter to run --- package.json | 1 + scripts/lint.sh | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/package.json b/package.json index 41ecf31f..6c070450 100644 --- a/package.json +++ b/package.json @@ -81,6 +81,7 @@ }, "scripts": { "eslint-check": "eslint --print-config . | eslint-config-prettier-check", + "format-text": "prettier --write --prose-wrap always './*.{css,html,js,json,md}' '{!(target),!(node_modules)}**/*.{css,html,js,json,md}'", "lint": "./scripts/lint.sh", "loc": "loc --exclude vendor --exclude ffi\\.rs --exclude mruby-bin/ruby/fixtures --exclude spec-runner/spec" } diff --git a/scripts/lint.sh b/scripts/lint.sh index 6112c475..24d3bbc3 100755 --- a/scripts/lint.sh +++ b/scripts/lint.sh @@ -68,6 +68,4 @@ find . -type f \ # Text sources (e.g. HTML, Markdown) ## Format with prettier -yarn run prettier --write --prose-wrap always \ - './*.{css,html,js,json,md}' \ - '{!(target),!(node_modules)}**/*.{css,html,js,json,md}' +yarn run format-text From b27c937e5459352d8b627c249baea1f4cebc4f41 Mon Sep 17 00:00:00 2001 From: Ryan Lopopolo Date: Tue, 9 Jul 2019 22:40:31 -0700 Subject: [PATCH 2/6] WIP design docs for artichoke VM --- design/README.md | 18 +++ design/ast.md | 3 + design/memory-management.md | 234 ++++++++++++++++++++++++++++ design/parser.md | 3 + design/string.md | 3 + design/threading-and-concurrency.md | 3 + design/value.md | 3 + 7 files changed, 267 insertions(+) create mode 100644 design/README.md create mode 100644 design/ast.md create mode 100644 design/memory-management.md create mode 100644 design/parser.md create mode 100644 design/string.md create mode 100644 design/threading-and-concurrency.md create mode 100644 design/value.md diff --git a/design/README.md b/design/README.md new file mode 100644 index 
00000000..fa214943 --- /dev/null +++ b/design/README.md @@ -0,0 +1,18 @@ +# Artichoke Ruby VM Design + +Artichoke aims to be a source-compatible, [ruby/spec](/spec-runner/spec/ruby) +compliant implementation of Ruby 2.6.3 written in safe Rust (excluding crate +dependencies). + +These documents discuss the design of the Artichoke VM. + +## Design Document Index + +- [Value Representation](value.md) +- [String and Encoding](string.md) +- [Memory Management](memory-management.md): the heap and using Rust memory + management to get a reference counting GC for free. +- [Threading and Concurrency](threading-and-concurrency.md): True concurrency + with no GIL. +- [Parser](parser.md) +- [AST](ast.md) diff --git a/design/ast.md b/design/ast.md new file mode 100644 index 00000000..20b256bc --- /dev/null +++ b/design/ast.md @@ -0,0 +1,3 @@ +# Artichoke Ruby AST + +WIP diff --git a/design/memory-management.md b/design/memory-management.md new file mode 100644 index 00000000..a18c87e9 --- /dev/null +++ b/design/memory-management.md @@ -0,0 +1,234 @@ +# Artichoke Ruby Memory Management + +Artichoke has no garbage collector and relies on +[Rust's built-in memory management](https://pcwalton.github.io/2013/03/18/an-overview-of-memory-management-in-rust.html) +to reclaim memory when Ruby [`Value`](value.md)s are no longer reachable from +the VM. + +This document refers to data structures with backticks if it is refering to a +specific implementation, for example, [`Value`](value.md). If the data structure +is not formatted as code, the document is referring to the general concept, for +example, HashMap does not refer to +[`HashMap`](https://doc.rust-lang.org/std/collections/struct.HashMap.html), but +rather the concept of a hash table. + +## `BasicObject#object_id` + +`BasicObject` is the root of the class hierarchy in Ruby. All +[`Value`](value.md)s inherit from `BasicObject`. Every `BasicObject` must have a +unique `object_id`, which is a `u64`. 
There are some wrinkles to this, but for +now we can assume that every `Value` that the VM allocates will have a unique +`object_id`. + +In the VM, `object_id` is represented by the following struct: + +```rust +#[derive(Clone, Debug)] +pub struct ObjectId { + // reference to the VM + interp: Artichoke, + // opaque and immutable identifier + id: u64, +} + +impl Hash for ObjectId { + fn hash(&self, state: &mut H) { + self.id.hash(state); + } +} + +impl PartialEq for ObjectId { + fn eq(&self, other: &Self) -> bool { + self.id == other.id + } +} + +impl Eq for ObjectId {} +``` + +`ObjectId` is useful as a pointer. By having a reference to an `ObjectId`, +components of the VM can retrieve `Value`s from the heap. + +Mediating access to the underlying `Value`s via the `ObjectId` allows us to +centrally implement guards around mutability. For example, `Value`s can be +marked immutable with +[`Object#freeze`](https://ruby-doc.org/core-2.6.3/Object.html#method-i-freeze). +`ObjectId` implements +[`Deref`](https://doc.rust-lang.org/std/ops/trait.Deref.html) and +[`DerefMut`](https://doc.rust-lang.org/std/ops/trait.DerefMut.html) that resolve +a `Value` on the heap via its `ObjectId` and enforces mutability guarantees of +the VM. + +## The Heap + +The heap is a HashMap from `ObjectId` to a sharable `Value` representation. The +_shareable `Value` representation_ is a wrapper around `Value` that enables it +to have shared ownership. The specifics of the wrapper depend on VM context (for +example, values are wrapped differently if they are shared by multiple threads), +but conceptually the wrapper behaves like an `Rc>`. The wrapper +can have multiple owners, supports weak references, and allows the `Value` to be +mutated. + +The heap stores weak references to `Value`. When a `Value` takes an owned +reference to another, it resolves the value wrapper via the heap and upgrades +the weak reference into a strong reference. 
+ +Eventually, a `Value` may become unreachable, the strong count on the `Rc` that +wraps it will drop to zero, the memory will be reclaimed, and the weak reference +becomes invalid. To optimize access times for the heap and prevent the heap from +growing unbounded, a background thread will periodically compact the heap by +removing `ObjectId`s that point to invalid weak references. + +## Shared References and Reference Counting + +A `Value` can be referenced by many other `Value`s. For example, in the below +program, the String `'artichoke'` is reachable from six locations. + +```ruby +x = 'artichoke' +# name binding +y = x +# collection item +ary = [x, x] +# captured variable +f = proc { x } +# self-referential structure +x.instance_variable_set :@a, x +``` + +Because instance variables are publically settable, every `Value` can hold a +reference other `Value`s, including cyclical ones. + +Ignoring cycles, when a `Value` takes a reference to another value, we can call +[`Rc::clone`](https://doc.rust-lang.org/std/rc/struct.Rc.html#impl-Clone). This +takes a strong reference to a `Value` and increases the ref count on the smart +pointer. When the `Value` is deallocated, Rust will drop the references on the +smart pointers it ownes. + +For example, an Array is backed by a `Vec>>` and the symbol +table of instance variables on an object is a +`HashMap>>`. + +Things are trickier if we need to handle cycles. Consider the following code: + +```ruby +class Container + attr_accessor :inner + + def initialize(inner) + @inner = inner + end +end + +def make_cycle + a = Container.new(nil) # ObjectId(100) + b = Container.new(a) # ObjectId(200) + c = Container.new(b) # ObjectId(300) + d = Container.new(c) # ObjectId(400) + a.inner = d + a +end + +ring = make_cycle +``` + +Here's what happens from the perspective of the VM: + +1. The `a` binding holds a strong reference to `ObjectId(100)` +2. `ObjectId(200)` holds a strong reference to `ObjectId(100)` +3. 
The `b` binding holds a strong reference to `ObjectId(200)` +4. `ObjectId(300)` holds a strong reference to `ObjectId(200)` +5. The `c` binding holds a strong reference to `ObjectId(300)` +6. `ObjectId(400)` holds a strong reference to `ObjectId(300)` +7. The `d` binding holds a strong reference to `ObjectId(400)` + +At this point the strong counts look like this: + +| `ObjectId` | Strong Count | +| ---------- | ------------ | +| 100 | 2 | +| 200 | 2 | +| 300 | 2 | +| 400 | 1 | + +Assigning `ObjectId(400)` to the `@inner` instance variable of `ObjectId(100)` +makes these four `Value`s form a cycle. + +### Detecting Cycles + +Each `Value` can answer the question: Can I reach an `ObjectId`? + +```rust +impl ObjectId { + pub fn can_reach_object(&self, other: Self, &mut checked: HashSet) -> HashSet { + unimplemented!(); + } +} +``` + +`Value` asks this question of all its strong references when attempting to take +a strong reference to another `Value`. If the returned `HashSet` is empty, the +`Value` takes a strong reference. If the returned `HashSet` is non-empty, these +`ObjectId`s are added to a VM-tracked `Cycle`. The cycle group holds a weak +reference to the shared `Value` wrapper and and rather than hold a +`Rc>`, the `Value`s in the cycle hold a reference to the cycle +group which can resolve an `ObjectId` into a strong value wrapper reference +temporarily. + +```rust +pub enum ValueReference { + Strong(Rc>), + CycleWeak(Rc>>), +} + +pub struct Cycle { + value: Weak>, + group: HashSet, +} +``` + +Back to our example: when `ObjectId(400)` is assigned to `@inner` on +`ObjectId(100)`, the VM detects a cycle because `ObjectId(100)` is reachable by +the chain of `ObjectId(400) -> ObjectId(300) -> ObjectId(200) -> ObjectId(100)`. 
+The `ObjectId`s are reachable in these ways: + +| `ObjectId` | Binding | +| ---------- | -------------------- | +| 100 | `a`, `ObjectId(200)` | +| 200 | `b`, `ObjectId(300)` | +| 300 | `c`, `ObjectId(400)` | +| 400 | `d`, `ObjectId(100)` | + +Once we return from the function, the variable bindings get dropped: + +| `ObjectId` | Binding | +| ---------- | ----------------------- | +| 100 | `ring`, `ObjectId(200)` | +| 200 | `ObjectId(300)` | +| 300 | `ObjectId(400)` | +| 400 | `ObjectId(100)` | + +But even if `ring` is dropped or reassigned, memory will not be reclaimed. + +### Escape Analysis + +All `ObjectId`s in the cycle will hold `CycleWeak` references. This is safe +because the Weak references are only invalid if the cycle is unreachable by any +other `Value`s in the VM. + +If the `ObjectId` owning the reference is not in the cycle, it will hold a +`Strong` reference. The cycle is unreachable unless it is referenced by an +`ObjectId` outside of the cycle. + +If the reference is bound to a name (whether a local in a function, class +context, module context, proc, or top self, captured variable in a proc, or a +constant binding), the name will hold a `Strong` reference. + +If the reference is captured by a proc, the proc will hold a `CycleStrong` +reference. + +### Breaking Cycles + +If the VM changes the value of a binding that points to a `CycleWeak`, the cycle +is broken. The VM will replace the reachable `CycleWeak`s with strong +references. 
diff --git a/design/parser.md b/design/parser.md new file mode 100644 index 00000000..7f837e40 --- /dev/null +++ b/design/parser.md @@ -0,0 +1,3 @@ +# Artichoke Ruby Parser + +WIP diff --git a/design/string.md b/design/string.md new file mode 100644 index 00000000..a36a61e8 --- /dev/null +++ b/design/string.md @@ -0,0 +1,3 @@ +# Artichoke Ruby String and Encoding + +WIP diff --git a/design/threading-and-concurrency.md b/design/threading-and-concurrency.md new file mode 100644 index 00000000..5ec89c92 --- /dev/null +++ b/design/threading-and-concurrency.md @@ -0,0 +1,3 @@ +# Artichoke Ruby Threading and Concurrency + +WIP diff --git a/design/value.md b/design/value.md new file mode 100644 index 00000000..da2176b2 --- /dev/null +++ b/design/value.md @@ -0,0 +1,3 @@ +# Artichoke Ruby Value Representation + +WIP From 14be270b4f55e6903e3f076a78def5b7de2270d2 Mon Sep 17 00:00:00 2001 From: Ryan Lopopolo Date: Tue, 9 Jul 2019 22:53:09 -0700 Subject: [PATCH 3/6] Fix some typos --- design/memory-management.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/design/memory-management.md b/design/memory-management.md index a18c87e9..b6080686 100644 --- a/design/memory-management.md +++ b/design/memory-management.md @@ -103,7 +103,7 @@ Ignoring cycles, when a `Value` takes a reference to another value, we can call [`Rc::clone`](https://doc.rust-lang.org/std/rc/struct.Rc.html#impl-Clone). This takes a strong reference to a `Value` and increases the ref count on the smart pointer. When the `Value` is deallocated, Rust will drop the references on the -smart pointers it ownes. +smart pointers it owns. For example, an Array is backed by a `Vec>>` and the symbol table of instance variables on an object is a @@ -178,7 +178,7 @@ temporarily. 
```rust pub enum ValueReference { Strong(Rc>), - CycleWeak(Rc>>), + CycleWeak(Rc>), } pub struct Cycle { @@ -224,8 +224,7 @@ If the reference is bound to a name (whether a local in a function, class context, module context, proc, or top self, captured variable in a proc, or a constant binding), the name will hold a `Strong` reference. -If the reference is captured by a proc, the proc will hold a `CycleStrong` -reference. +If the reference is captured by a proc, the proc will hold a `Strong` reference. ### Breaking Cycles From 3f67bd44e9757377553916675697fc9dfedadb77 Mon Sep 17 00:00:00 2001 From: Ryan Lopopolo Date: Thu, 11 Jul 2019 01:50:58 -0700 Subject: [PATCH 4/6] Run specs from ruby/spec root --- spec-runner/src/mspec.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/spec-runner/src/mspec.rs b/spec-runner/src/mspec.rs index cb643269..6d5723eb 100644 --- a/spec-runner/src/mspec.rs +++ b/spec-runner/src/mspec.rs @@ -47,6 +47,7 @@ impl Runner { pub fn run(self) -> Result { init(&self.interp).unwrap(); self.interp.def_rb_source_file("/src/spec_helper.rb", "")?; + self.interp.def_rb_source_file("spec_helper.rb", "")?; self.interp .def_rb_source_file("/src/test/spec_runner", include_str!("spec_runner.rb"))?; if let Err(err) = self.interp.eval("require '/src/test/spec_runner'") { From 4428618eac98e77d3a8345f947baae9c2cc7f256 Mon Sep 17 00:00:00 2001 From: Ryan Lopopolo Date: Thu, 11 Jul 2019 03:49:44 -0700 Subject: [PATCH 5/6] Use CactusRef in VM design --- design/memory-management.md | 116 ++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 66 deletions(-) diff --git a/design/memory-management.md b/design/memory-management.md index b6080686..777f3794 100644 --- a/design/memory-management.md +++ b/design/memory-management.md @@ -2,8 +2,9 @@ Artichoke has no garbage collector and relies on [Rust's built-in memory management](https://pcwalton.github.io/2013/03/18/an-overview-of-memory-management-in-rust.html) -to reclaim memory when Ruby 
[`Value`](value.md)s are no longer reachable from -the VM. +and a [cycle-aware reference-counted smart pointer](/cactusref) of its own +invention to reclaim memory when Ruby [`Value`](value.md)s are no longer +reachable from the VM. This document refers to data structures with backticks if it is refering to a specific implementation, for example, [`Value`](value.md). If the data structure @@ -28,7 +29,13 @@ pub struct ObjectId { // reference to the VM interp: Artichoke, // opaque and immutable identifier - id: u64, + id: usize, +} + +impl ObjectId { + pub fn id(&self) -> usize { + self.id + } } impl Hash for ObjectId { @@ -97,19 +104,26 @@ x.instance_variable_set :@a, x ``` Because instance variables are publically settable, every `Value` can hold a -reference other `Value`s, including cyclical ones. +reference to other `Value`s, including cyclical ones. This means we cannot ignore +cycles. + +`CactusRef` is a smart pointer that behaves similarly to `Rc`, with the addition +that `CactusRef` can detect cycles and deallocate `Value`s if they form an +orphaned cycle. -Ignoring cycles, when a `Value` takes a reference to another value, we can call -[`Rc::clone`](https://doc.rust-lang.org/std/rc/struct.Rc.html#impl-Clone). This -takes a strong reference to a `Value` and increases the ref count on the smart -pointer. When the `Value` is deallocated, Rust will drop the references on the -smart pointers it owns. +When a special `Value` takes ownership of another, the `ObjectId` of the other +value resolves a strong `CactusRef` via the heap and stores it in its internal +data structures (e.g. an instance variable table or a `Vec` backing an Array). -For example, an Array is backed by a `Vec>>` and the symbol -table of instance variables on an object is a -`HashMap>>`. +For example, an Array is backed by a `Vec>>` and the +symbol table of instance variables on an object is a +`HashMap>>`. -Things are trickier if we need to handle cycles.
Consider the following code: +When a `CactusRef` is dropped, the reference count of the `Value` decreases and +`CactusRef` does a reachability check using breadth-first search and the +`Reachable` implementation on `Value`. + +Consider the following code: ```ruby class Container @@ -134,13 +148,13 @@ ring = make_cycle Here's what happens from the perspective of the VM: -1. The `a` binding holds a strong reference to `ObjectId(100)` -2. `ObjectId(200)` holds a strong reference to `ObjectId(100)` -3. The `b` binding holds a strong reference to `ObjectId(200)` -4. `ObjectId(300)` holds a strong reference to `ObjectId(200)` -5. The `c` binding holds a strong reference to `ObjectId(300)` -6. `ObjectId(400)` holds a strong reference to `ObjectId(300)` -7. The `d` binding holds a strong reference to `ObjectId(400)` +1. The `a` binding holds a strong reference to the Value `ObjectId(100)` +2. `ObjectId(200)` holds a strong reference to the Value `ObjectId(100)` +3. The `b` binding holds a strong reference to the Value `ObjectId(200)` +4. `ObjectId(300)` holds a strong reference to the Value `ObjectId(200)` +5. The `c` binding holds a strong reference to the Value `ObjectId(300)` +6. `ObjectId(400)` holds a strong reference to the Value `ObjectId(300)` +7. The `d` binding holds a strong reference to the Value `ObjectId(400)` At this point the strong counts look like this: @@ -159,38 +173,29 @@ makes these four `Value`s form a cycle. Each `Value` can answer the question: Can I reach an `ObjectId`? 
```rust -impl ObjectId { - pub fn can_reach_object(&self, other: Self, &mut checked: HashSet) -> HashSet { +unsafe impl Reachable for Value { + pub fn object_id(&self) -> usize { + self.object_id.id() + } + + pub fn can_reach(&self, object_id: usize) -> bool { + for value in self.instance_variables { + if value.object_id.id() == object_id { + return true; + } + } + // and for other data structures like Class, Array, Hash unimplemented!(); } } ``` -`Value` asks this question of all its strong references when attempting to take -a strong reference to another `Value`. If the returned `HashSet` is empty, the -`Value` takes a strong reference. If the returned `HashSet` is non-empty, these -`ObjectId`s are added to a VM-tracked `Cycle`. The cycle group holds a weak -reference to the shared `Value` wrapper and and rather than hold a -`Rc>`, the `Value`s in the cycle hold a reference to the cycle -group which can resolve an `ObjectId` into a strong value wrapper reference -temporarily. - -```rust -pub enum ValueReference { - Strong(Rc>), - CycleWeak(Rc>), -} - -pub struct Cycle { - value: Weak>, - group: HashSet, -} -``` +`Value` does not need to do a full graph traversal because `CactusRef` does it. Back to our example: when `ObjectId(400)` is assigned to `@inner` on `ObjectId(100)`, the VM detects a cycle because `ObjectId(100)` is reachable by the chain of `ObjectId(400) -> ObjectId(300) -> ObjectId(200) -> ObjectId(100)`. -The `ObjectId`s are reachable in these ways: +The Values associated with the `ObjectId`s are reachable in these ways: | `ObjectId` | Binding | | ---------- | -------------------- | @@ -208,26 +213,5 @@ Once we return from the function, the variable bindings get dropped: | 300 | `ObjectId(400)` | | 400 | `ObjectId(100)` | -But even if `ring` is dropped or reassigned, memory will not be reclaimed. - -### Escape Analysis - -All `ObjectId`s in the cycle will hold `CycleWeak` references.
This is safe -because the Weak references are only invalid if the cycle is unreachable by any -other `Value`s in the VM. - -If the `ObjectId` owning the reference is not in the cycle, it will hold a -`Strong` reference. The cycle is unreachable unless it is referenced by an -`ObjectId` outside of the cycle. - -If the reference is bound to a name (whether a local in a function, class -context, module context, proc, or top self, captured variable in a proc, or a -constant binding), the name will hold a `Strong` reference. - -If the reference is captured by a proc, the proc will hold a `Strong` reference. - -### Breaking Cycles - -If the VM changes the value of a binding that points to a `CycleWeak`, the cycle -is broken. The VM will replace the reachable `CycleWeak`s with strong -references. +When `ring` is dropped or reassigned, `CactusRef` detects an orphaned cycle and +will deallocate all of the `Value`s. From f246d5560b2355ac778d552620582e6cc9f6959c Mon Sep 17 00:00:00 2001 From: Ryan Lopopolo Date: Thu, 11 Jul 2019 08:53:10 -0700 Subject: [PATCH 6/6] ObjectIds are only PartialEq if they are from the same arena --- design/memory-management.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/design/memory-management.md b/design/memory-management.md index 777f3794..4e176745 100644 --- a/design/memory-management.md +++ b/design/memory-management.md @@ -46,7 +46,7 @@ impl Hash for ObjectId { impl PartialEq for ObjectId { fn eq(&self, other: &Self) -> bool { - self.id == other.id + self.interp == other.interp && self.id == other.id } }