Compare commits

..

12 Commits

Author SHA1 Message Date
_ 32ddedc066 📝 doc: document attempted and planned optimizations 2023-10-03 16:17:34 -05:00
_ 43a3294c57 🚧 wip: decode Instructions lazily
I don't like it. I need to step back and be more methodical about optimizing.
2023-10-03 15:59:06 -05:00
_ 47d5fe3df1 test: check how big Instruction is 2023-10-03 12:29:47 -05:00
_ 3698feeb90 🐛 bug: fix tests 2023-10-03 11:58:29 -05:00
_ 55ea0c233e not really faster. Maybe more clear. 2023-10-02 20:28:36 -05:00
_ 7878efc235 🚧 wip: Vec seems to be faster than BTreeMap here, slightly 2023-10-02 20:13:43 -05:00
_ 4e23f51634 🚧 this seemed to improve perf a little but idk why 2023-10-02 19:27:30 -05:00
_ b35dc346e8 📈 performance: caching the current block helps, barely 2023-10-02 19:14:54 -05:00
_ 52df317326 measuring 2023-10-02 18:54:09 -05:00
_ 11fd5b6cbc 🚧 wip: always keep at least one stack frame handy
This removes a bunch of unwraps but doesn't make it any faster
2023-10-02 18:20:03 -05:00
_ 66fe54adef remove unused things 2023-10-02 17:55:17 -05:00
_ e11026a553 📈 performance: down to 800 ms / 3.3x by changing PC handling 2023-10-02 17:39:33 -05:00
6 changed files with 594 additions and 364 deletions

View File

@ -8,6 +8,10 @@ authors = ["ReactorScram"]
[dependencies]
lunar_wave_vm = { path = "../lunar_wave_vm" }
[profile.release]
codegen-units = 1
lto = "fat"
[target.x86_64-unknown-linux-gnu]
linker = "/usr/bin/clang"
# Recommended for flamegraph

View File

@ -1,4 +1,7 @@
use std::io::Read;
use std::{
io::Read,
rc::Rc,
};
use crate::{
instruction::Instruction as Inst,
@ -101,6 +104,68 @@ fn i_sc (buf: [u8; 4]) -> Option <i8> {
i8::try_from (i32::try_from (c).ok ()? - 127).ok ()
}
pub trait DecodeInstruction {
fn opcode (self) -> u8;
fn a (self) -> u8;
fn ax (self) -> u32;
fn b (self) -> u8;
fn bx (self) -> u32;
fn c (self) -> u8;
fn k (self) -> bool;
fn sb (self) -> i8;
fn sbx (self) -> i32;
fn sc (self) -> i8;
fn sj (self) -> i32;
}
impl DecodeInstruction for u32 {
#[inline(always)]
fn opcode (self) -> u8 {
((self >> 0) & 0x7f) as u8
}
fn a (self) -> u8 {
((self >> 7) & 0xff) as u8
}
fn ax (self) -> u32 {
self >> 7
}
fn b (self) -> u8 {
((self >> 16) & 0xff) as u8
}
fn bx (self) -> u32 {
(self >> 15) as u32
}
fn c (self) -> u8 {
(self >> 24) as u8
}
fn k (self) -> bool {
((self >> 15) & 0x1) == 1
}
fn sb (self) -> i8 {
((((self >> 16) & 0xff) as i16) - 127) as i8
}
fn sbx (self) -> i32 {
(self >> 15) as i32 - 65535
}
fn sc (self) -> i8 {
(((self >> 24) as i16) - 127) as i8
}
fn sj (self) -> i32 {
((self >> 7) as i32) - 0xffffff
}
}
pub fn parse_inst (buf: [u8; 4]) -> Option <Inst>
{
let opcode = buf [0] & 0x7f;
@ -148,9 +213,9 @@ pub fn parse_inst (buf: [u8; 4]) -> Option <Inst>
0x33 => Inst::Not (a, b),
0x34 => Inst::Len (a, b),
0x35 => Inst::Concat (a, b),
0x38 => Inst::Jmp (s_j),
0x3c => Inst::EqK (a, b, k),
0x3d => Inst::EqI (a, i_sb (buf)?, k),
0x38 => Inst::Jmp (s_j),
0x42 => Inst::Test (a, k),
0x44 => Inst::Call (a, b, c),
0x45 => Inst::TailCall (a, b, c, k),
@ -243,7 +308,7 @@ fn parse_i64 <R: Read> (rdr: &mut R) -> Option <i64> {
// code, but I don't like recursion in general, and I don't know
// why PUC wrote it that way.
pub fn parse_block <R: Read> (rdr: &mut R, si: &mut Interner, blocks: &mut Vec <Block>)
pub fn parse_block <R: Read> (rdr: &mut R, si: &mut Interner, blocks: &mut Vec <Rc <Block>>)
-> Option <()>
{
// Ignore things I haven't implemented yet
@ -261,9 +326,11 @@ pub fn parse_block <R: Read> (rdr: &mut R, si: &mut Interner, blocks: &mut Vec <
for _ in 0..inst_count {
let mut buf = [0u8; 4];
rdr.read_exact (&mut buf).ok ().unwrap ();
instructions.push (parse_inst (buf).expect (&format! ("{buf:?}")));
instructions.push (u32::from_le_bytes (buf));
}
let instructions = Rc::from (instructions);
let constant_count = parse_int (rdr).unwrap ();
let mut constants = Vec::with_capacity (constant_count as usize);
@ -306,7 +373,7 @@ pub fn parse_block <R: Read> (rdr: &mut R, si: &mut Interner, blocks: &mut Vec <
constants,
instructions,
upvalues,
});
}.into ());
// Recursion

File diff suppressed because it is too large

View File

@ -1,8 +1,14 @@
use std::hash::Hash;
use std::{
hash::Hash,
rc::Rc,
};
use crate::{
instruction::Instruction as Inst,
loader,
loader::{
self,
DecodeInstruction,
},
state::{
Block,
Chunk,
@ -39,8 +45,7 @@ fn run_bytecode (vm: &mut State, args: &[&str], bc: &[u8]) -> Vec <Value> {
/// Takes arguments and Lua source code,
/// invokes `luac` to compile it to bytecode,
/// runs it,
/// and returns the output
/// runs it, and returns the output
fn run_source (vm: &mut State, args: &[&str], s: &str) -> Vec <Value> {
let bc = loader::compile_bytecode (s.as_bytes ().to_vec ()).unwrap ();
@ -64,7 +69,7 @@ fn bools () {
*/
let mut si = Interner::default ();
/*
let chunk = Chunk {
blocks: vec! [
Block {
@ -97,7 +102,7 @@ fn bools () {
si.to_value ("print"),
],
upvalues: vec! [],
},
}.into (),
Block {
instructions: vec! [
Inst::Test (0, false),
@ -111,7 +116,7 @@ fn bools () {
],
constants: vec! [],
upvalues: vec! [],
},
}.into (),
],
};
@ -126,6 +131,7 @@ fn bools () {
let actual = run_chunk (&mut vm, &arg, chunk.clone ());
assert_eq! (actual, expected);
}
*/
}
#[test]
@ -152,7 +158,7 @@ fn floats () {
*/
let mut si = Interner::default ();
/*
let block = Block {
instructions: vec! [
Inst::VarArgPrep (0),
@ -173,7 +179,7 @@ fn floats () {
upvalues: vec! [],
};
let chunk = Chunk {
blocks: vec! [block],
blocks: vec! [block.into ()],
};
let mut vm = crate::State::new_with_args (Chunk::default (), si, vec! [].into_iter());
@ -187,6 +193,7 @@ fn floats () {
assert_eq! (actual, expected);
}
*/
}
#[test]
@ -195,10 +202,49 @@ fn fma () {
let mut si = Interner::default ();
let bytecode = &crate::loader::compile_bytecode (source.to_vec ()).unwrap ();
let chunk = crate::loader::parse_chunk (bytecode, &mut si).unwrap ();
assert_eq! (chunk.blocks.len (), 5);
assert_eq! (chunk.blocks.len (), 5);
assert_eq! (chunk.blocks [3].upvalues.len (), 2);
let i = chunk.blocks [1].instructions [0];
assert_eq! (i.opcode (), 0x22);
assert_eq! (i.a (), 2);
assert_eq! (i.b (), 0);
assert_eq! (i.c (), 1);
let i = chunk.blocks [1].instructions [1];
assert_eq! (i.opcode (), 0x2e);
assert_eq! (i.a (), 0);
assert_eq! (i.b (), 1);
assert_eq! (i.c (), 6);
let i = chunk.blocks [2].instructions [0];
assert_eq! (i.opcode (), 0x24);
assert_eq! (i.a (), 2);
assert_eq! (i.b (), 0);
assert_eq! (i.c (), 1);
let i = chunk.blocks [2].instructions [1];
assert_eq! (i.opcode (), 0x2e);
assert_eq! (i.a (), 0);
assert_eq! (i.b (), 1);
assert_eq! (i.c (), 8);
let i = chunk.blocks [3].instructions [2];
assert_eq! (i.opcode (), 0x00);
assert_eq! (i.a (), 5);
assert_eq! (i.b (), 0);
let i = chunk.blocks [3].instructions [4];
assert_eq! (i.opcode (), 0x44);
assert_eq! (i.a (), 4);
assert_eq! (i.b (), 3);
assert_eq! (i.c (), 2);
let i = chunk.blocks [4].instructions [1];
assert_eq! (i.opcode (), 0x01);
assert_eq! (i.a (), 1);
assert_eq! (i.sbx (), 10);
let mut vm = crate::State::new_with_args (chunk, si, vec! ["_exe_name".to_string ()].into_iter ());
let actual = vm.execute ().unwrap ();
@ -319,7 +365,15 @@ fn is_93 () {
let bc = loader::compile_bytecode (src.to_vec ()).unwrap ();
let chunk = loader::parse_chunk (&bc, &mut si).unwrap ();
assert_eq! (chunk.blocks [0].instructions [3], Inst::EqK (0, 1, false));
let i = chunk.blocks [0].instructions [3];
assert_eq! (i.opcode (), 0x3c);
assert_eq! (i.a (), 0);
assert_eq! (i.b (), 1);
assert_eq! (i.k (), false);
let i = chunk.blocks [0].instructions [4];
assert_eq! (i.opcode (), 0x38);
assert_eq! (i.sj (), 6);
let mut vm = crate::State::new_with_args (Chunk::default (), si, vec! [].into_iter());
@ -404,8 +458,6 @@ fn tables_2 () {
#[test]
fn tailcall () {
use crate::instruction::Instruction;
let mut si = Interner::default ();
let src = br#"
@ -415,7 +467,8 @@ fn tailcall () {
let bc = loader::compile_bytecode (src.to_vec ()).unwrap ();
let chunk = loader::parse_chunk (&bc, &mut si).unwrap ();
assert_eq! (chunk.blocks [0].instructions [3], Instruction::TailCall (0, 2, 1, false));
// assert_eq! (chunk.blocks [0].instructions [3].opcode (), Instruction::TailCall (0, 2, 1, false));
assert_eq! (chunk.blocks [0].instructions [3].opcode (), 0x45);
let mut vm = crate::State::new_with_args (Chunk::default (), si, vec! [].into_iter());
@ -426,7 +479,7 @@ fn tailcall () {
}
#[test]
fn value_size () {
fn rust_stuff () {
// Per https://www.lua.org/doc/jucs05.pdf,
// "The Implementation of Lua 5.0",
//
@ -480,4 +533,9 @@ fn value_size () {
let expected = 8;
assert! (sz == expected, "{sz} != {expected}");
}
assert_eq! (size_of::<crate::instruction::Instruction> (), 8);
let x = vec! [100, 101, 102, 103];
let x: Rc <[u32]> = Rc::from (x);
}

View File

@ -247,6 +247,7 @@ impl Value {
pub struct Table {
array: Vec <Value>,
hash: HashMap <Value, Value>,
strings: Vec <(InternedString, Value)>,
map: BTreeMap <InternedString, Value>,
}
@ -262,7 +263,7 @@ impl Table {
fn get_inner (&self, key: &Value) -> &Value {
match key {
Value::Nil => &NIL,
Value::String (x) => self.map.get (x).unwrap_or (&NIL),
Value::String (x) => self.get_str (*x),
Value::Integer (x) => self.array.get (usize::try_from (*x).unwrap ()).unwrap_or (&NIL),
x => self.hash.get (x).unwrap_or (&NIL),
}
@ -277,10 +278,7 @@ impl Table {
}
pub fn get_str (&self, key: InternedString) -> &Value {
match self.map.get (&key) {
None => &NIL,
Some (x) => x,
}
self.strings.iter ().find (|(hay, _)| hay == &key).map (|(_, v)| v).unwrap_or (&NIL)
}
/// Insert value at arbitrary key
@ -293,9 +291,7 @@ impl Table {
match a.into () {
Value::Integer (x) => self.insert_int (x, b),
Value::Nil => (),
Value::String (x) => {
self.map.insert (x, b.into ());
},
Value::String (x) => self.insert_str (x, b.into ()),
x => {
self.hash.insert (x, b.into ());
},
@ -312,7 +308,11 @@ impl Table {
}
pub fn insert_str (&mut self, key: InternedString, v: Value) {
self.map.insert (key, v);
match self.strings.iter_mut ().find (|(hay, _)| hay == &key).map (|(_, v)| v)
{
None => self.strings.push ((key, v)),
Some (x) => *x = v,
}
}
pub fn length (&self) -> i64 {

notes.md (new file, +67 lines)
View File

@ -0,0 +1,67 @@
# Optimizations
Making notes on optimizations I've made and plan to make, so I can remember which ones paid off.
## String interning
Worked well. PUC Lua does this. I think it's faster not because it avoids
hashing or comparing strings, but because it avoids the pointer deref.
I still ended up hashing ints after this change.
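To illustrate the idea, here is a standalone sketch with made-up names (not the VM's actual `Interner` API): each distinct string is stored once and the rest of the code passes around a small integer id, so equality and table keys compare a `u32` instead of dereferencing and comparing byte strings.

```rust
use std::collections::HashMap;

/// Hypothetical interner sketch: each distinct string gets a stable u32 id.
#[derive(Default)]
struct Interner {
    ids: HashMap<String, u32>,
    strings: Vec<String>,
}

impl Interner {
    fn intern(&mut self, s: &str) -> u32 {
        if let Some(&id) = self.ids.get(s) {
            return id;
        }
        let id = self.strings.len() as u32;
        self.strings.push(s.to_string());
        self.ids.insert(s.to_string(), id);
        id
    }

    fn resolve(&self, id: u32) -> &str {
        &self.strings[id as usize]
    }
}

fn main() {
    let mut si = Interner::default();
    let a = si.intern("print");
    let b = si.intern("print");
    // Equality on interned strings is an integer compare, no pointer chasing.
    assert_eq!(a, b);
    assert_eq!(si.resolve(a), "print");
}
```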
## Linear search
The n_body benchmark uses tables with about 7 slots in its hot loop.
The hashing overhead of HashMap for i64 seems pretty bad for this.
BTreeMap was faster, but not fast enough.
I switched to just an unsorted Vec and linear search, and it's the
fastest by a small margin.
I don't think PUC Lua does this, but PUC Lua might have a faster, less
secure hash algorithm than Rust's default.
Flamegraph reveals we still spend a lot of time in linear searching tables.
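A toy version of the same idea, independent of the VM's `Table` type (these names are made up for the sketch): for a handful of entries, a linear scan over an unsorted `Vec` skips hashing entirely and stays cache-friendly.

```rust
/// Toy "unsorted Vec + linear search" map, mirroring get_str / insert_str above.
struct LinearMap<K: PartialEq, V> {
    entries: Vec<(K, V)>,
}

impl<K: PartialEq, V> LinearMap<K, V> {
    fn new() -> Self {
        Self { entries: Vec::new() }
    }

    fn get(&self, key: &K) -> Option<&V> {
        self.entries.iter().find(|(k, _)| k == key).map(|(_, v)| v)
    }

    fn insert(&mut self, key: K, value: V) {
        match self.entries.iter_mut().find(|(k, _)| *k == key) {
            Some((_, v)) => *v = value,
            None => self.entries.push((key, value)),
        }
    }
}

fn main() {
    let mut m = LinearMap::new();
    m.insert("x", 1);
    m.insert("x", 2); // overwrite an existing key, like insert_str above
    assert_eq!(m.get(&"x"), Some(&2));
    assert_eq!(m.get(&"y"), None);
}
```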
## Lazy instruction decoding
I think this actually slowed it down. PUC Lua keeps instructions in their
encoded u32 form and decodes them lazily inside the interpreter's main loop.
I did this mostly to match PUC Lua, although I didn't think it would work. My enum for decoded instructions is only 64 bits, and I didn't think the extra bit fiddling was cheap enough.
Maybe if I tweaked it, it would pay off. It just really doesn't look like it should work.
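For reference, this is the field layout the `DecodeInstruction` trait in the diff above pulls apart. A standalone sketch of the same decoding (Lua 5.4 layout: 7-bit opcode, 8-bit A, k bit, 8-bit B, 8-bit C), with an encoded test value rather than real bytecode:

```rust
// Lazy decoding sketch: keep the encoded u32 and extract fields only when
// the dispatch loop needs them (opcode = bits 0..7, A = 7..15, k = bit 15,
// B = 16..24, C = 24..32).
fn opcode(i: u32) -> u8 { (i & 0x7f) as u8 }
fn a(i: u32) -> u8 { ((i >> 7) & 0xff) as u8 }
fn k(i: u32) -> bool { ((i >> 15) & 1) == 1 }
fn b(i: u32) -> u8 { ((i >> 16) & 0xff) as u8 }
fn c(i: u32) -> u8 { (i >> 24) as u8 }

fn main() {
    // Build a fake instruction and check the fields round-trip.
    let inst: u32 = 0x45       // opcode
        | (4u32 << 7)          // A
        | (1u32 << 15)         // k
        | (3u32 << 16)         // B
        | (2u32 << 24);        // C
    assert_eq!(opcode(inst), 0x45);
    assert_eq!(a(inst), 4);
    assert!(k(inst));
    assert_eq!(b(inst), 3);
    assert_eq!(c(inst), 2);
}
```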
## Caching the current block
I think this one paid off. The idea was to avoid some `chunk.blocks [i]` derefs and bound checks in the inner loop.
I used an `Rc` to make it work. PUC Lua probably just keeps a raw pointer to the block.
## Caching the current instruction list
I think this one paid off more. Instead of caching the current block, I just cached its instructions: the inner loop doesn't touch constants or upvalues much, but every step needs the instruction list.
Using `Rc <[u32]>` was fun, too. I never stored a slice directly in a smart pointer before.
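A rough sketch of the pattern from these last two sections, with illustrative types rather than the VM's real ones: clone the `Rc` of the current block's instruction list once when entering a frame, then index a local slice in the hot loop instead of going through `chunk.blocks[i].instructions[pc]` every step.

```rust
use std::rc::Rc;

struct Block {
    instructions: Rc<[u32]>,
}

struct Chunk {
    blocks: Vec<Rc<Block>>,
}

fn run(chunk: &Chunk) -> u32 {
    let mut sum = 0u32;
    // Cache the current block's instruction list; the Rc clone is cheap.
    let instructions: Rc<[u32]> = Rc::clone(&chunk.blocks[0].instructions);
    let mut pc = 0usize;
    while pc < instructions.len() {
        // Real dispatch would decode instructions[pc] here.
        sum = sum.wrapping_add(instructions[pc]);
        pc += 1;
    }
    sum
}

fn main() {
    let block = Block {
        instructions: Rc::from(vec![100u32, 101, 102, 103]),
    };
    let chunk = Chunk { blocks: vec![Rc::new(block)] };
    assert_eq!(run(&chunk), 406);
}
```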
## Fat LTO and codegen-units = 1
Did absolutely nothing. I couldn't outsmart LLVM.
## Remove RefCell
(upcoming)
I think the `borrow` and `borrow_mut` calls slow down OP_GETFIELD and OP_SETFIELD. I can remove them if I store all the tables in State directly, replacing `Rc <RefCell <Table>>` with my own ref counting. This might
remove a layer of indirection, too.
It's a big change, but I'd need _something_ like this for adding a GC anyway, and sometimes big changes have paid off.
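A hypothetical sketch of what that could look like; none of these names exist in the VM yet, and the real `Table` and `State` are more involved. The point is that table access becomes a plain index instead of `Rc` plus `RefCell::borrow ()`.

```rust
// Hypothetical: State owns every table in one Vec, values hold a handle
// (index), and the refcount is plain bookkeeping instead of Rc + RefCell.
#[derive(Default)]
struct Table {
    array: Vec<i64>,
}

struct TableSlot {
    refcount: u32,
    table: Table,
}

#[derive(Clone, Copy)]
struct TableHandle(usize);

#[derive(Default)]
struct State {
    tables: Vec<TableSlot>,
}

impl State {
    fn new_table(&mut self) -> TableHandle {
        self.tables.push(TableSlot { refcount: 1, table: Table::default() });
        TableHandle(self.tables.len() - 1)
    }

    // OP_SETFIELD-ish path: no borrow_mut(), just an index.
    fn set(&mut self, h: TableHandle, v: i64) {
        self.tables[h.0].table.array.push(v);
    }

    // OP_GETFIELD-ish path: no borrow(), just an index.
    fn get(&self, h: TableHandle, i: usize) -> Option<i64> {
        self.tables[h.0].table.array.get(i).copied()
    }

    // Dropping a reference is explicit bookkeeping instead of Rc's Drop.
    fn drop_ref(&mut self, h: TableHandle) {
        self.tables[h.0].refcount -= 1;
        // A real version would free or recycle the slot when this hits zero.
    }
}

fn main() {
    let mut state = State::default();
    let h = state.new_table();
    state.set(h, 93);
    assert_eq!(state.get(h, 0), Some(93));
    state.drop_ref(h);
}
```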
## Iterating over instruction list
(upcoming)
I noticed PUC Lua doesn't store a program counter; it stores a `u32 *`, a pointer to the next instruction itself. This might save, like, one single cycle. I can't believe it does anything, but it could, because it skips the "look at the instruction list, multiply the index by 4, add it to the base pointer" step.
Maybe the real win is that it saves a little cache space by forgetting the base pointer?
Storing an iterator sounds like a big fight with the borrow checker. I might want to prototype it outside the interpreter first. But if it works, it might compile down to what PUC Lua does in C. Plus a bounds check.
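Here's roughly what that prototype could look like outside the interpreter, with made-up names. It only shows the index-vs-iterator shape of the loop, not the harder part of storing the iterator in a call frame.

```rust
// Drive the loop with a slice iterator instead of an index. The iterator
// holds a pointer to the next instruction directly, which is roughly what
// PUC Lua's stored u32* does, minus Rust's bounds/None check.
fn sum_indexed(instructions: &[u32]) -> u32 {
    let mut sum = 0u32;
    let mut pc = 0usize;
    while pc < instructions.len() {
        // index * 4 + base pointer on every step
        sum = sum.wrapping_add(instructions[pc]);
        pc += 1;
    }
    sum
}

fn sum_iterated(instructions: &[u32]) -> u32 {
    let mut sum = 0u32;
    let mut iter = instructions.iter();
    while let Some(&inst) = iter.next() {
        sum = sum.wrapping_add(inst);
    }
    sum
}

fn main() {
    let code = [100u32, 101, 102, 103];
    assert_eq!(sum_indexed(&code), sum_iterated(&code));
}
```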