Compare commits
No commits in common. "c452116c0a9790b732b1bed1912cf8a28beea7ee" and "6db6dc372579d06cff39d8c2a5df824eb37838fd" have entirely different histories.
c452116c0a
...
6db6dc3725
|
@ -120,6 +120,7 @@ pub trait DecodeInstruction {
|
|||
}
|
||||
|
||||
impl DecodeInstruction for u32 {
|
||||
#[inline(always)]
|
||||
fn opcode (self) -> u8 {
|
||||
((self >> 0) & 0x7f) as u8
|
||||
}
|
||||
|
|
|
@ -551,13 +551,12 @@ impl State {
|
|||
{
|
||||
use crate::loader::DecodeInstruction;
|
||||
|
||||
let i = self.fetch ();
|
||||
|
||||
let make_step_error = |msg| {
|
||||
panic! ("unimplemented {msg}")
|
||||
};
|
||||
|
||||
let i = self.fetch ();
|
||||
self.incr_pc ();
|
||||
|
||||
match i.opcode () {
|
||||
0x22 => {
|
||||
if self.op_add (i.a (), i.b (), i.c ()) {
|
||||
|
@ -1048,6 +1047,8 @@ impl State {
|
|||
x => unimplemented! ("{x}"),
|
||||
}
|
||||
|
||||
self.incr_pc ();
|
||||
|
||||
Ok (None)
|
||||
}
|
||||
|
||||
|
|
|
@ -2,17 +2,14 @@ use std::collections::BTreeMap;
|
|||
|
||||
use crate::value::Value;
|
||||
|
||||
// I think Lua doesn't really support tables / arrays bigger than
|
||||
// 4 billion, so no need for LunarWave to support them, either.
|
||||
|
||||
#[derive (Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
|
||||
pub struct InternedString (u32);
|
||||
pub struct InternedString (i64);
|
||||
|
||||
#[derive (Debug, Default)]
|
||||
pub struct Interner {
|
||||
table_fwd: BTreeMap <String, u32>,
|
||||
table_rev: BTreeMap <u32, String>,
|
||||
counter: u32,
|
||||
table_fwd: BTreeMap <String, i64>,
|
||||
table_rev: BTreeMap <i64, String>,
|
||||
counter: i64,
|
||||
}
|
||||
|
||||
impl Interner {
|
||||
|
@ -25,7 +22,7 @@ impl Interner {
|
|||
Some (x) => InternedString (*x),
|
||||
None => {
|
||||
self.counter += 1;
|
||||
if self.counter == u32::MAX {
|
||||
if self.counter == i64::MAX {
|
||||
panic! ("Out of IDs");
|
||||
}
|
||||
self.table_fwd.insert (s.to_string (), self.counter);
|
||||
|
|
|
@ -509,22 +509,6 @@ fn rust_stuff () {
|
|||
|
||||
let sz = size_of::<crate::value::Value> ();
|
||||
let expected = 16;
|
||||
assert! (sz == expected, "{sz} != {expected}");
|
||||
}
|
||||
|
||||
{
|
||||
// Make sure LWVM's Values are 16 bytes or smaller.
|
||||
// Because types are usually aligned to their size, f64s
|
||||
// are supposed to be aligned to 8 bytes. So even an `Option <f64>`
|
||||
// uses 8 bytes to say "Some" or "None".
|
||||
// I could _maybe_ fudge this somehow but it's fine to start with.
|
||||
|
||||
let sz = size_of::<(crate::string_interner::InternedString, crate::value::Value)> ();
|
||||
let expected = 16;
|
||||
assert! (sz <= expected, "{sz} > {expected}");
|
||||
|
||||
let sz = size_of::<(crate::value::Value, u32)> ();
|
||||
let expected = 16;
|
||||
assert! (sz <= expected, "{sz} > {expected}");
|
||||
}
|
||||
|
||||
|
|
|
@ -22,7 +22,6 @@ pub struct BogusClosure {
|
|||
}
|
||||
|
||||
#[derive (Clone, PartialEq)]
|
||||
#[repr (u8)]
|
||||
pub enum Value {
|
||||
Nil,
|
||||
Boolean (bool),
|
||||
|
|
|
@ -18,7 +18,7 @@ fn embedding () {
|
|||
*l.reg_mut (0) = Value::from (a + b + 1993);
|
||||
1
|
||||
}
|
||||
/*
|
||||
|
||||
let mut si = lwvm::Interner::default ();
|
||||
|
||||
let bc = lwvm::compile_bytecode (src.to_vec ()).unwrap ();
|
||||
|
@ -40,5 +40,4 @@ fn embedding () {
|
|||
let output = vm.execute ().unwrap ();
|
||||
|
||||
assert_eq! (output, vec! [Value::from (2019)]);
|
||||
*/
|
||||
}
|
||||
|
|
22
notes.md
22
notes.md
|
@ -64,16 +64,6 @@ Result: Regressed from 3200 to 3600. Not sure why.
|
|||
|
||||
Plan: OP_GETFIELD hits the constants a lot. I thought caching them, instead of constantly dereferencing the chunk and block, might help.
|
||||
|
||||
## Splitting up the opcode match
|
||||
|
||||
Result: No change, 3200 to 3200. Maybe Rust was already optimizing this into an optimal jump table?
|
||||
|
||||
Plan: Maybe if the hot inner opcodes (OP_GETFIELD, OP_MUL, and OP_SETFIELD) get their own match statement at the top of the function, the step function can exit sooner.
|
||||
|
||||
## Cache constants
|
||||
|
||||
Result: Regressed again from 2900 to 3700. It's not meant to be.
|
||||
|
||||
## Iterating over instruction list
|
||||
|
||||
(upcoming)
|
||||
|
@ -83,15 +73,3 @@ I noticed PUC Lua doesn't store a program counter, it stores a `u32 *`, a pointe
|
|||
Maybe the real saving is that it saves a little bit of cache space by forgetting the base pointer?
|
||||
|
||||
Storing an iterator sounds like a big fight with the borrow checker. I might want to prototype it outside the interpreter first. But if it works, it might compile down to what PUC Lua does in C. Plus a bounds check.
|
||||
|
||||
## Threaded interpretation of basic blocks
|
||||
|
||||
(upcoming)
|
||||
|
||||
Plan: I don't want to do this, because it's against the spirit of matching what PUC Lua does. And I would need to set up a microbenchmark to prove that it would have any chance of paying off. And it's a little bit over-fitting to the n_body benchmark, whose inner loop is heavy on number crunching.
|
||||
|
||||
But it's inspired by QEMU's TCG, so I think it could work.
|
||||
|
||||
There are 2 places in the n_body inner loop where we have 5 math instructions (3 muls and 2 adds) that compute the squared length of a vector. If, when the block is first loaded, we could detect this as a "block of only non-branching math instructions", we could replace that block with specialized instructions for a non-Lua-compatible interpreter. When we hit that block, we make sure all 3 input registers are floats, and then we execute these alternate instructions using special float-only registers and possibly a threaded interpreter mode. When those 5 instructions are done, we either copy the float registers out to the Lua value registers, or the last couple of instructions write out Lua values instead of floats. This might reduce the stepping overhead, let us use a simpler decode step (since there would be no instructions for tables or call/return), and we'd be using 8-byte floats and skipping over a couple of type checks inside that block.
|
||||
|
||||
The cost: it adds a ton of complexity, it's a new way to fail, and if the biggest block I can find is 5 ops, it may not pay off.
|
||||
|
|
Loading…
Reference in New Issue