-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathyosefk.cpp
218 lines (176 loc) · 9.15 KB
/
yosefk.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
/*
A coroutine is thus two contexts: the callee's (producer's), and the caller's (consumer's).
The tricky and non-portable part is initializing these contexts (jmp_buf's). The cleaner and portable part is jumping back and forth between them, so let's see that first:
*/
#include "yosefk.h"
#include "stdio.h"
enum { DIRECTINVOKE=0, WORKING=1, DONE=2 };
/* setjmp - If the return is from a direct invocation, setjmp returns 0.
* If the return is from a call to longjmp, setjmp returns a nonzero value.
*/
void yield( coroutine* c ) {
if (!setjmp( c->callee_context )) {
longjmp( c->caller_context, WORKING );
}
}
int resume( coroutine* c ) {
if (DONE == c->state) return DONE;
int ret = setjmp( c->caller_context );
c->state = (ret == WORKING);
if (!ret /* == DIRECTINVOKE*/) {
longjmp( c->callee_context, 1 );
}
else {
return c->state;
}
}
/*
So yield and resume are each just a pair of setjmp/longjmp calls. The trick with setjmp is, it returns 0
right after you call it - having saved the context (machine registers, basically) to your jmp_buf.
You can later return to this context (restore register values) with longjmp.
Where exactly does longjmp bring you? To the point in code where setjmp has just returned. How do you
know that setjmp has just returned because longjmp brought you there - as opposed to returning directly
after you called setjmp? Because setjmp returns the non-zero value that was passed to longjmp, rather
than 0 as it does when returning directly. So if(!setjmp) asks, "did I just call you - or was it a long
time ago and we're back from longjmp?"
As you can see, there's almost no difference between yield and resume - both say, "save our current
context and then go to the other side's context". The only thing setting resume apart is, it reports
whether we're DONE or WORKING.
But, it looks like we're always WORKING, doesn't it? ret is what's passed to longjmp and yield always
passes WORKING. Which brings us to start - the one passing DONE, because, well, the code of start also
contains the part where we're finished.
*/
typedef struct {
coroutine* c;
func f;
void* arg;
void* old_sp;
void* old_fp;
} start_params;
#ifdef _WIN64
// kevin:
// see https://github.com/halayli/lthread/blob/master/src/lthread.c
// for _switch function, might be a better alternative than set_sp set_fp
#include <intrin.h>
#include <windows.h>
extern "C" inline __int64 getRSP();
extern "C" inline __int64 getRBP();
extern "C" inline void setRSP(__int64 v);
extern "C" inline void setRBP(__int64 v);
extern "C" inline void setRSPRBP(__int64 v1, __int64 v2);
#define get_sp( var ) var = (void*)getRSP()
#define get_fp( var ) var = (start_params*)getRBP()
#define set_sp( var ) setRSP((__int64)(var))
#define set_fp( var ) setRBP((__int64)(var))
#define set_spfp( var1, var2 ) setRSPRBP((__int64)(var1), (__int64)(var2))
#elif _WIN32
#define get_sp(var) { __int32 t1918; __asm { \
mov t1918, esp \
}; var = (void*)t1918; }
#define get_fp(var) { __int32 t1918; __asm { \
mov t1918, ebp \
}; var = (start_params*)t1918; }
#define set_sp(var) { __int32 t1918 = (__int32)(var); __asm { \
mov esp, t1918 \
}; }
#define set_fp(var) { __int32 t1918 = (__int32)(var); __asm { \
mov ebp, t1918 \
}; }
#define set_spfp(p1, p2) set_sp( p1 ); set_fp( p2 )
#else
#define get_sp(p) \
__asm__ volatile("movq %%rsp, %0" : "=r"(p))
#define get_fp(p) \
__asm__ volatile("movq %%rbp, %0" : "=r"(p))
#define set_sp(p) \
__asm__ volatile("movq %0, %%rsp" : : "r"(p))
#define set_fp(p) \
__asm__ volatile("movq %0, %%rbp" : : "r"(p))
#define set_spfp(p1, p2) set_sp( p1 ); set_fp( p2 )
#endif
enum { FRAME_SZ=5 }; //fairly arbitrary
void start(coroutine* c, func f, void* arg, void* sp)
{
c->state = WORKING;
start_params* p = ((start_params*)sp)-1;
//save params before stack switching
p->c = c;
p->f = f;
p->arg = arg;
/*
CONTEXT context;
memset( &context, 0, sizeof( CONTEXT ) );
EXCEPTION_RECORD econtext;
memset( &econtext, 0, sizeof( EXCEPTION_RECORD ) );
RtlCaptureContext( &context );
p->old_sp = (void*)context.Rsp;
p->old_fp = (void*)context.Rbp;
*/
get_sp(p->old_sp);
get_fp(p->old_fp);
//char** retaddr = (char**)_AddressOfReturnAddress();
//*retaddr = (char*)(p - FRAME_SZ);
/*RtlCaptureContext( &context );
context.Rsp = DWORD64(p - 5);
context.Rbp = (DWORD64)p;
econtext.ExceptionAddress = (void*)context.Rsp;
RtlRestoreContext( &context, &econtext );
*/
//set_spfp( p - FRAME_SZ, p );
set_sp(p - FRAME_SZ);
set_fp(p); //effectively clobbers p
//(and all other locals)
/*RtlCaptureContext( &context );
p = (start_params*)context.Rbp;
*/
get_fp(p); //…so we read p back from $fp
//and now we read our params from p
if(!setjmp(p->c->callee_context)) {
//set_spfp( p->old_sp, p->old_fp );
/*RtlCaptureContext( &context );
context.Rsp = (DWORD64)p->old_sp;
context.Rbp = (DWORD64)p->old_fp;
econtext.ExceptionAddress = (void*)context.Rsp;
RtlRestoreContext( &context, &econtext );
*/
set_sp(p->old_sp);
set_fp(p->old_fp);
return;
}
(*p->f)(p->arg);
longjmp(p->c->caller_context, DONE);
}
/*
This is longer, uglier, non-portable and perhaps not strictly correct (please tell me if you spot bugs - I have close to zero experience with x86-64, it's just what I test on at home.)
So why so ugly? Because we need a separate stack for the coroutine (the producer), and C doesn't have a way to give it a stack. And without a separate stack, our setjmp will save the registers alright - and then we go back to the consumer, and write happily to places on the stack.
By the time we longjmp back to the producer, all the local variables it kept on the stack (as opposed to in registers) may be long overwritten. That's why out of the box, setjmp/longjmp work fine as a try-catch/throw - but not as resume/yield.
If, however, we:
Allocate space for a separate stack
Make the stack pointer point into that stack before setjmp
Restore the stack pointer after setjmp
Return to the caller (consumer)
…then we'll have created a jmp_buf with a stack pointer pointing to our separate space - jmp_buf remembers the stack pointer among the other registers. And then we can jump as we please back and forth between the producer and the consumer - they no longer overwrite each other's on-stack variables.
The actual stack switching is the ugly part of start. Once we accomplish that, and the producer calls resume for the first time, bringing us past the if(!setjmp), all that is left is to call the entry point function and report that we're DONE:
(*p->f)(p->arg);
longjmp(p->c->caller_context, DONE);
The longjmp brings us back to a context saved by some call to resume, and then resume sees the DONE returned by setjmp and returns false to the consumer.
And now to the stack switching. Basically, all we need is some inline assembly to change $sp - the stack pointer register, something most machines have. There are two complications though:
Apart from $sp, there's also the frame pointer (at least in debug builds) - a register pointing into the stack and used to access local variables. So we have to switch that, too.
Our parameters, such as f and arg, may themselves be on the stack. So how do we access them after switching the stack?
Actually I think that the clean way around these is to write start in pure assembly - where you know where everything is and how to access it and no compiler puts it in different places under different build settings. But I don't like assembly, so instead I did this:
Use space right below the sp address (the stack grows downwards) to store our parameters (in a start_params structure). The beginning of this structure is kept in p. Together with the parameters, we save the stack pointer and frame pointer values:
start_params* p = ((start_params*)sp)-1;
p->c = c;
p->f = f;
p->arg = arg;
get_sp(p->old_sp);
get_fp(p->old_fp);
Set the frame pointer to p, and the stack pointer to p - FRAME_SZ (I picked a random constant for FRAME_SZ that I thought was large enough for whatever the compiler may want to keep on the stack - in bytes, I'm reserving 5*sizeof(start_params)). The idea is, the frame pointer is the highest address of our stack frame, and the stack pointer is the lowest address.
set_sp(p - FRAME_SZ);
set_fp(p);
Read the value of p back from the frame pointer register, because once we've changed the value of that register, we no longer trust any local variable value (because the compiler may have put it on the stack so it will try to access it through the frame pointer - which is no longer the same frame pointer that it has just used to store the variable, so it will get garbage.)
get_fp(p);
Be careful to access everything through p from on - and restore $sp and $fp before returning to the caller.
That's all; the get/set macros use the GNU inline assembly syntax to get/set the $rsp and $rbp registers of x86-64. The code worked with a bunch of compiler flags for me - but do tell if you see bugs in there.
So that's how coroutines work with setjmp/longjmp and a bit of inline assembly - hopefully, porting to another machine takes little more than rewriting the 4 asm macros.
*/