Index: version.lisp-expr
===================================================================
RCS file: /cvsroot/sbcl/sbcl/version.lisp-expr,v
retrieving revision 1.3253
diff -u -r1.3253 version.lisp-expr
--- version.lisp-expr	21 Mar 2007 08:26:19 -0000	1.3253
+++ version.lisp-expr	24 Mar 2007 22:27:35 -0000
@@ -17,4 +17,4 @@
 ;;; checkins which aren't released. (And occasionally for internal
 ;;; versions, especially for internal versions off the main CVS
 ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".)
-"1.0.3.47"
+"1.0.3.47-win32-threading-1"
Index: src/assembly/x86/arith.lisp
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/assembly/x86/arith.lisp,v
retrieving revision 1.10
diff -u -r1.10 arith.lisp
--- src/assembly/x86/arith.lisp	4 Mar 2006 19:58:28 -0000	1.10
+++ src/assembly/x86/arith.lisp	24 Mar 2007 22:27:35 -0000
@@ -28,7 +28,7 @@
                  (:res res (descriptor-reg any-reg) edx-offset)
 
                  (:temp eax unsigned-reg eax-offset)
-                 (:temp ebx unsigned-reg ebx-offset)
+                 (:temp ebx unsigned-reg #!-x86-reserve-ebx ebx-offset #!+x86-reserve-ebx esi-offset)
                  (:temp ecx unsigned-reg ecx-offset))
 
                 (declare (ignorable ebx))
Index: src/assembly/x86/assem-rtns.lisp
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/assembly/x86/assem-rtns.lisp,v
retrieving revision 1.15
diff -u -r1.15 assem-rtns.lisp
--- src/assembly/x86/assem-rtns.lisp	13 Jan 2007 21:05:34 -0000	1.15
+++ src/assembly/x86/assem-rtns.lisp	24 Mar 2007 22:27:35 -0000
@@ -22,6 +22,7 @@
     (return-multiple (:return-style :none))
     (;; These four are really arguments.
      (:temp eax unsigned-reg eax-offset)
+     #!-x86-reserve-ebx
      (:temp ebx unsigned-reg ebx-offset)
      (:temp ecx unsigned-reg ecx-offset)
      (:temp esi unsigned-reg esi-offset)
@@ -36,11 +37,13 @@
   (inst jmp :e one-value)
   (inst cmp ecx (fixnumize 2))
   (inst jmp :e two-values)
-  (inst cmp ecx (fixnumize 3))
-  (inst jmp :e three-values)
+  #!-x86-two-arg-passing-regs
+  (progn
+    (inst cmp ecx (fixnumize 3))
+    (inst jmp :e three-values))
 
   ;; Save the count, because the loop is going to destroy it.
-  (inst mov edx ecx)
+  (inst push ecx)
 
   ;; Blit the values down the stack. Note: there might be overlap, so
   ;; we have to be careful not to clobber values before we've read
@@ -51,7 +54,7 @@
   (inst shr ecx 2)                      ; fixnum to raw word count
   (inst std)                            ; count down
   (inst sub esi 4)                      ; ?
-  (inst lea edi (make-ea :dword :base ebx :disp (- n-word-bytes)))
+  (inst lea edi (make-ea :dword :base #!-x86-reserve-ebx ebx #!+x86-reserve-ebx edx :disp (- n-word-bytes)))
   (inst rep)
   (inst movs :dword)
 
@@ -59,15 +62,25 @@
   #!+sunos (inst cld)
 
   ;; Restore the count.
-  (inst mov ecx edx)
+  (inst pop ecx)
+
+  #!+x86-reserve-ebx
+  (inst mov esi edx)
 
   ;; Set the stack top to the last result.
   (inst lea esp-tn (make-ea :dword :base edi :disp n-word-bytes))
 
   ;; Load the register args.
-  (loadw edx ebx -1)
-  (loadw edi ebx -2)
-  (loadw esi ebx -3)
+  #!-x86-reserve-ebx
+  (progn
+    (loadw edx ebx -1)
+    (loadw edi ebx -2)
+    #!-x86-two-arg-passing-regs
+    (loadw esi ebx -3))
+  #!+x86-reserve-ebx
+  (progn
+    (loadw edx esi -1)
+    (loadw edi esi -2))
 
   ;; And back we go.
   (inst stc)
@@ -75,35 +88,52 @@
 
   ;; Handle the register arg cases.
   ZERO-VALUES
-  (move esp-tn ebx)
+  (move esp-tn #!-x86-reserve-ebx ebx #!+x86-reserve-ebx edx)
   (inst mov edx nil-value)
   (inst mov edi edx)
+  #!-x86-two-arg-passing-regs
   (inst mov esi edx)
+  #!+x86-reserve-ebx
+  (inst mov esi esp-tn)
   (inst stc)
   (inst jmp eax)
 
   ONE-VALUE ; Note: we can get this, because the return-multiple vop
             ; doesn't check for this case when size > speed.
-  (loadw edx esi -1)
-  (inst mov esp-tn ebx)
+  #!-x86-reserve-ebx
+  (progn
+    (loadw edx esi -1)
+    (inst mov esp-tn ebx))
+  #!+x86-reserve-ebx
+  (progn
+    (loadw esi esi -1)
+    (inst mov esp-tn edx)
+    (inst mov edx esi))
   (inst clc)
   (inst jmp eax)
 
   TWO-VALUES
+  #!+x86-reserve-ebx
+  (inst push edx)
   (loadw edx esi -1)
   (loadw edi esi -2)
+  #!-x86-two-arg-passing-regs
   (inst mov esi nil-value)
-  (inst lea esp-tn (make-ea :dword :base ebx :disp (* -2 n-word-bytes)))
+  #!+x86-reserve-ebx
+  (inst pop esi)
+  (inst lea esp-tn (make-ea :dword :base #!-x86-reserve-ebx ebx #!+x86-reserve-ebx esi :disp (* -2 n-word-bytes)))
   (inst stc)
   (inst jmp eax)
 
   THREE-VALUES
-  (loadw edx esi -1)
-  (loadw edi esi -2)
-  (loadw esi esi -3)
-  (inst lea esp-tn (make-ea :dword :base ebx :disp (* -3 n-word-bytes)))
-  (inst stc)
-  (inst jmp eax))
+  #!-x86-two-arg-passing-regs
+  (progn
+    (loadw edx esi -1)
+    (loadw edi esi -2)
+    (loadw esi esi -3)
+    (inst lea esp-tn (make-ea :dword :base ebx :disp (* -3 n-word-bytes)))
+    (inst stc)
+    (inst jmp eax)))
 
 ;;;; TAIL-CALL-VARIABLE
 
@@ -123,6 +153,7 @@
      (:return-style :none))
 
     ((:temp eax unsigned-reg eax-offset)
+     #!-x86-reserve-ebx
      (:temp ebx unsigned-reg ebx-offset)
      (:temp ecx unsigned-reg ecx-offset)
      (:temp edx unsigned-reg edx-offset)
@@ -134,13 +165,16 @@
   (inst sub ecx esp-tn)
 
   ;; Check for all the args fitting the registers.
-  (inst cmp ecx (fixnumize 3))
+  (inst cmp ecx (fixnumize register-arg-count))
   (inst jmp :le REGISTER-ARGS)
 
   ;; Save the OLD-FP and RETURN-PC because the blit it going to trash
   ;; those stack locations. Save the ECX, because the loop is going
   ;; to trash it.
+  #!+x86-reserve-ebx
+  (pushw ebp-tn -2)
   (pushw ebp-tn -1)
+  #!-x86-reserve-ebx
   (loadw ebx ebp-tn -2)
   (inst push ecx)
 
@@ -164,15 +198,23 @@
   (inst pop ecx)
   (popw ebp-tn -1)                      ; overwrites a0
 
+  ;; And move the return-pc to safety.
+  #!+x86-reserve-ebx
+  (inst pop esi)
+
   ;; Blow off the stack above the arguments.
   (inst lea esp-tn (make-ea :dword :base edi :disp n-word-bytes))
 
   ;; remaining register args
   (loadw edi ebp-tn -2)
+  #!-x86-two-arg-passing-regs
   (loadw esi ebp-tn -3)
 
   ;; Push the (saved) return-pc so it looks like we just called.
+  #!-x86-reserve-ebx
   (inst push ebx)
+  #!+x86-reserve-ebx
+  (inst push esi)
 
   ;; And jump into the function.
     (inst jmp
@@ -184,6 +226,7 @@
   REGISTER-ARGS
   (loadw edx esi -1)
   (loadw edi esi -2)
+  #!-x86-two-arg-passing-regs
   (loadw esi esi -3)
 
   ;; Clear most of the stack.
@@ -201,7 +244,7 @@
 (define-assembly-routine (throw
                           (:return-style :none))
                          ((:arg target (descriptor-reg any-reg) edx-offset)
-                          (:arg start any-reg ebx-offset)
+                          (:arg start any-reg #!-x86-reserve-ebx ebx-offset #!+x86-reserve-ebx esi-offset)
                           (:arg count any-reg ecx-offset)
                           (:temp catch any-reg eax-offset))
 
@@ -234,9 +277,9 @@
                           (:translate %continue-unwind)
                           (:policy :fast-safe))
                          ((:arg block (any-reg descriptor-reg) eax-offset)
-                          (:arg start (any-reg descriptor-reg) ebx-offset)
+                          (:arg start (any-reg descriptor-reg) #!-x86-reserve-ebx ebx-offset #!+x86-reserve-ebx esi-offset)
                           (:arg count (any-reg descriptor-reg) ecx-offset)
-                          (:temp uwp unsigned-reg esi-offset))
+                          (:temp uwp unsigned-reg #!-x86-reserve-ebx esi-offset #!+x86-reserve-ebx edi-offset))
   (declare (ignore start count))
 
   (let ((error (generate-error-code nil invalid-unwind-error)))
@@ -281,7 +324,7 @@
                           (:return-style :none)
                           (:policy :fast-safe))
                          ((:arg block (any-reg descriptor-reg) eax-offset)
-                          (:arg start (any-reg descriptor-reg) ebx-offset)
+                          (:arg start (any-reg descriptor-reg) #!-x86-reserve-ebx ebx-offset #!+x86-reserve-ebx esi-offset)
                           (:arg count (any-reg descriptor-reg) ecx-offset))
   (declare (ignore start count))
 
@@ -405,6 +448,13 @@
   ;; Establish our stack frame.
   (inst mov ebp-tn esp-tn)
 
+  #!+x86-ebx-threads
+  (progn
+    ;; Restore our TLS block pointer.
+    (inst push (make-ea :dword :disp (make-fixup "specials" :foreign)))
+    (inst call (make-fixup "TlsGetValue@4" :foreign))
+    (inst mov ebx-tn eax-tn))
+
   ;; This section based on VOP CALL-OUT.
   ;; Restore the NPX for lisp; ensure no regs are empty
   (dotimes (i 8)
@@ -417,8 +467,8 @@
                                        n-word-bytes))))
 
   ;; Update *CURRENT-UNWIND-PROTECT-BLOCK*.
-  (loadw ebx-tn block unwind-block-current-uwp-slot)
-  (store-tl-symbol-value ebx-tn *current-unwind-protect-block* ecx-tn)
+  (loadw edx-tn block unwind-block-current-uwp-slot)
+  (store-tl-symbol-value edx-tn *current-unwind-protect-block* ecx-tn)
 
   ;; Uwp-entry expects some things in known locations so that they can
   ;; be saved on the stack: the block in edx-tn, start in ebx-tn, and
@@ -426,9 +476,9 @@
   ;; do need to have access to our own stack frame, so we hijack the
   ;; known locations to cover our own state.
 
-  (inst xor ebx-tn ebx-tn)
+  (inst xor edx-tn edx-tn)
   (inst xor ecx-tn ecx-tn)
-  (inst mov ebx-tn ebp-tn)
+  (inst mov #!-x86-reserve-ebx ebx-tn #!+x86-reserve-ebx esi-tn ebp-tn)
   (loadw ebp-tn block unwind-block-current-cont-slot)
   (inst jmp (make-ea :byte :base block
                      :disp (* unwind-block-entry-pc-slot n-word-bytes))))
@@ -439,7 +489,7 @@
                           (:translate %continue-unwind)
                           (:policy :fast-safe))
                          ((:arg block (any-reg descriptor-reg) eax-offset)
-                          (:arg start (any-reg descriptor-reg) ebx-offset)
+                          (:arg start (any-reg descriptor-reg) #!-x86-reserve-ebx ebx-offset #!+x86-reserve-ebx esi-offset)
                           (:arg count (any-reg descriptor-reg) ecx-offset))
   (declare (ignore block count))
   ;; The args here are mostly ignored because we're using the
Index: src/assembly/x86/support.lisp
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/assembly/x86/support.lisp,v
retrieving revision 1.7
diff -u -r1.7 support.lisp
--- src/assembly/x86/support.lisp	7 Mar 2006 18:47:38 -0000	1.7
+++ src/assembly/x86/support.lisp	24 Mar 2007 22:27:35 -0000
@@ -36,11 +36,11 @@
         (note-this-location ,vop :single-value-return)
         (cond
           ((member :cmov *backend-subfeatures*)
-           (inst cmov :c esp-tn ebx-tn))
+           (inst cmov :c esp-tn #!-x86-reserve-ebx ebx-tn #!+x86-reserve-ebx esi-tn))
           (t
            (let ((single-value (gen-label)))
              (inst jmp :nc single-value)
-             (move esp-tn ebx-tn)
+             (move esp-tn #!-x86-reserve-ebx ebx-tn #!+x86-reserve-ebx esi-tn)
              (emit-label single-value)))))
       '((:save-p :compute-only))))))
 
Index: src/code/target-thread.lisp
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/code/target-thread.lisp,v
retrieving revision 1.63
diff -u -r1.63 target-thread.lisp
--- src/code/target-thread.lisp	18 Mar 2007 19:30:25 -0000	1.63
+++ src/code/target-thread.lisp	24 Mar 2007 22:27:35 -0000
@@ -663,6 +663,7 @@
                              ;; now that most things have a chance to
                              ;; work properly without messing up other
                              ;; threads, it's time to enable signals
+                             #!-win32
                              (sb!unix::reset-signal-mask)
                              (setf (thread-result thread)
                                    (cons t
Index: src/compiler/generic/genesis.lisp
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/compiler/generic/genesis.lisp,v
retrieving revision 1.128
diff -u -r1.128 genesis.lisp
--- src/compiler/generic/genesis.lisp	15 Jan 2007 22:15:49 -0000	1.128
+++ src/compiler/generic/genesis.lisp	24 Mar 2007 22:27:36 -0000
@@ -1583,15 +1583,13 @@
                   (setf (gethash name *cold-foreign-symbol-table*) value))))))
   (values))     ;; PROGN
 
+(defvar *missing-foreign-symbols* nil)
+
 (defun cold-foreign-symbol-address (name)
   (or (find-foreign-symbol-in-table name *cold-foreign-symbol-table*)
-      *foreign-symbol-placeholder-value*
       (progn
-        (format *error-output* "~&The foreign symbol table is:~%")
-        (maphash (lambda (k v)
-                   (format *error-output* "~&~S = #X~8X~%" k v))
-                 *cold-foreign-symbol-table*)
-        (error "The foreign symbol ~S is undefined." name))))
+        (pushnew name *missing-foreign-symbols* :test #'equal)
+        *foreign-symbol-placeholder-value*)))
 
 (defvar *cold-assembler-routines*)
 
@@ -3150,7 +3148,7 @@
     (do-all-symbols (sym)
       (remprop sym 'cold-intern-info))
 
-    (let* ((*foreign-symbol-placeholder-value* (if core-file-name nil 0))
+    (let* ((*foreign-symbol-placeholder-value* 0)
            (*load-time-value-counter* 0)
            (*cold-fdefn-objects* (make-hash-table :test 'equal))
            (*cold-symbols* (make-hash-table :test 'equal))
@@ -3238,6 +3236,16 @@
         (write-line (namestring file-name))
         (cold-load file-name))
 
+      ;; If we're missing some foreign symbols, now's the time to complain.
+      (when (and core-file-name *missing-foreign-symbols*)
+        (format *error-output* "~&The foreign symbol table is:~%")
+        (maphash (lambda (k v)
+                   (format *error-output* "~&~S = #X~8X~%" k v))
+                 *cold-foreign-symbol-table*)
+        (apply #'error
+               "The foreign symbol~#[~; ~S is~;s ~S and ~S are~:;s~@{~#[~; and~] ~S~^,~} are~] undefined."
+               *missing-foreign-symbols*))
+
       ;; Tidy up loose ends left by cold loading. ("Postpare from cold load?")
       (resolve-assembler-fixups)
       #!+(or x86 x86-64) (output-load-time-code-fixups)
Index: src/compiler/x86/c-call.lisp
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/compiler/x86/c-call.lisp,v
retrieving revision 1.32
diff -u -r1.32 c-call.lisp
--- src/compiler/x86/c-call.lisp	20 Mar 2006 02:49:16 -0000	1.32
+++ src/compiler/x86/c-call.lisp	24 Mar 2007 22:27:36 -0000
@@ -305,8 +305,12 @@
                                 (static-symbol-offset '*alien-stack*)
                                 (ash symbol-tls-index-slot word-shift)
                                 (- other-pointer-lowtag))))
+        #!-x86-ebx-threads
         (inst fs-segment-prefix)
-        (inst sub (make-ea :dword :base temp) delta)))
+        #!-x86-ebx-threads
+        (inst sub (make-ea :dword :base temp) delta)
+        #!+x86-ebx-threads
+        (inst sub (make-ea :dword :base ebx-tn :index temp) delta)))
     (load-tl-symbol-value result *alien-stack*))
   #!-sb-thread
   (:generator 0
@@ -334,8 +338,12 @@
                                 (static-symbol-offset '*alien-stack*)
                                 (ash symbol-tls-index-slot word-shift)
                                 (- other-pointer-lowtag))))
+        #!-x86-ebx-threads
         (inst fs-segment-prefix)
-        (inst add (make-ea :dword :base temp) delta))))
+        #!-x86-ebx-threads
+        (inst add (make-ea :dword :base temp) delta)
+        #!+x86-ebx-threads
+        (inst add (make-ea :dword :base ebx-tn :index temp) delta))))
   #!-sb-thread
   (:generator 0
     (unless (zerop amount)
Index: src/compiler/x86/call.lisp
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/compiler/x86/call.lisp,v
retrieving revision 1.35
diff -u -r1.35 call.lisp
--- src/compiler/x86/call.lisp	5 Dec 2006 20:10:27 -0000	1.35
+++ src/compiler/x86/call.lisp	24 Mar 2007 22:27:37 -0000
@@ -197,180 +197,204 @@
 (defun default-unknown-values (vop values nvals)
   (declare (type (or tn-ref null) values)
            (type unsigned-byte nvals))
-  (cond
-   ((<= nvals 1)
-    (note-this-location vop :single-value-return)
-    (let ((single-value (gen-label)))
-      (cond
-       ((member :cmov *backend-subfeatures*)
-        (inst cmov :c esp-tn ebx-tn))
-       (t
-        (inst jmp :nc single-value)
-        (inst mov esp-tn ebx-tn)
-        (emit-label single-value)))))
-   ((<= nvals register-arg-count)
-    (let ((regs-defaulted (gen-label)))
-      (note-this-location vop :unknown-return)
-      (inst jmp :c regs-defaulted)
-      ;; Default the unsuppled registers.
-      (let* ((2nd-tn-ref (tn-ref-across values))
-             (2nd-tn (tn-ref-tn 2nd-tn-ref)))
-        (inst mov 2nd-tn nil-value)
-        (when (> nvals 2)
-          (loop
-            for tn-ref = (tn-ref-across 2nd-tn-ref)
-            then (tn-ref-across tn-ref)
-            for count from 2 below register-arg-count
-            do (inst mov (tn-ref-tn tn-ref) 2nd-tn))))
-      (inst mov ebx-tn esp-tn)
-      (emit-label regs-defaulted)
-      (inst mov esp-tn ebx-tn)))
-   ((<= nvals 7)
-    ;; The number of bytes depends on the relative jump instructions.
-    ;; Best case is 31+(n-3)*14, worst case is 35+(n-3)*18. For
-    ;; NVALS=6 that is 73/89 bytes, and for NVALS=7 that is 87/107
-    ;; bytes which is likely better than using the blt below.
-    (let ((regs-defaulted (gen-label))
-          (defaulting-done (gen-label))
-          (default-stack-slots (gen-label)))
-      (note-this-location vop :unknown-return)
-      ;; Branch off to the MV case.
-      (inst jmp :c regs-defaulted)
-      ;; Do the single value case.
-      ;; Default the register args
-      (inst mov eax-tn nil-value)
-      (do ((i 1 (1+ i))
-           (val (tn-ref-across values) (tn-ref-across val)))
-          ((= i (min nvals register-arg-count)))
-        (inst mov (tn-ref-tn val) eax-tn))
-
-      ;; Fake other registers so it looks like we returned with all the
-      ;; registers filled in.
-      (move ebx-tn esp-tn)
-      (inst push edx-tn)
-      (inst jmp default-stack-slots)
-
-      (emit-label regs-defaulted)
-
-      (inst mov eax-tn nil-value)
-      (storew edx-tn ebx-tn -1)
-      (collect ((defaults))
-        (do ((i register-arg-count (1+ i))
-             (val (do ((i 0 (1+ i))
-                       (val values (tn-ref-across val)))
-                      ((= i register-arg-count) val))
-                  (tn-ref-across val)))
-            ((null val))
-          (let ((default-lab (gen-label))
-                (tn (tn-ref-tn val)))
-            (defaults (cons default-lab tn))
-
-            (inst cmp ecx-tn (fixnumize i))
-            (inst jmp :be default-lab)
-            (loadw edx-tn ebx-tn (- (1+ i)))
-            (inst mov tn edx-tn)))
-
-        (emit-label defaulting-done)
-        (loadw edx-tn ebx-tn -1)
-        (move esp-tn ebx-tn)
-
-        (let ((defaults (defaults)))
-          (when defaults
-            (assemble (*elsewhere*)
-              (trace-table-entry trace-table-fun-prologue)
-              (emit-label default-stack-slots)
-              (dolist (default defaults)
-                (emit-label (car default))
-                (inst mov (cdr default) eax-tn))
-              (inst jmp defaulting-done)
-              (trace-table-entry trace-table-normal)))))))
-   (t
-    ;; 91 bytes for this branch.
-    (let ((regs-defaulted (gen-label))
-          (restore-edi (gen-label))
-          (no-stack-args (gen-label))
-          (default-stack-vals (gen-label))
-          (count-okay (gen-label)))
-      (note-this-location vop :unknown-return)
-      ;; Branch off to the MV case.
-      (inst jmp :c regs-defaulted)
-
-      ;; Default the register args, and set up the stack as if we
-      ;; entered the MV return point.
-      (inst mov ebx-tn esp-tn)
-      (inst push edx-tn)
-      (inst mov edi-tn nil-value)
-      (inst push edi-tn)
-      (inst mov esi-tn edi-tn)
-      ;; Compute a pointer to where to put the [defaulted] stack values.
-      (emit-label no-stack-args)
-      (inst lea edi-tn
-            (make-ea :dword :base ebp-tn
-                     :disp (* (- (1+ register-arg-count)) n-word-bytes)))
-      ;; Load EAX with NIL so we can quickly store it, and set up
-      ;; stuff for the loop.
-      (inst mov eax-tn nil-value)
-      (inst std)
-      (inst mov ecx-tn (- nvals register-arg-count))
-      ;; solaris requires DF being zero.
-      #!+sunos (inst cld)
-      ;; Jump into the default loop.
-      (inst jmp default-stack-vals)
-
-      ;; The regs are defaulted. We need to copy any stack arguments,
-      ;; and then default the remaining stack arguments.
-      (emit-label regs-defaulted)
-      ;; Save EDI.
-      (storew edi-tn ebx-tn (- (1+ 1)))
-      ;; Compute the number of stack arguments, and if it's zero or
-      ;; less, don't copy any stack arguments.
-      (inst sub ecx-tn (fixnumize register-arg-count))
-      (inst jmp :le no-stack-args)
-
-      ;; Throw away any unwanted args.
-      (inst cmp ecx-tn (fixnumize (- nvals register-arg-count)))
-      (inst jmp :be count-okay)
-      (inst mov ecx-tn (fixnumize (- nvals register-arg-count)))
-      (emit-label count-okay)
-      ;; Save the number of stack values.
-      (inst mov eax-tn ecx-tn)
-      ;; Compute a pointer to where the stack args go.
-      (inst lea edi-tn
-            (make-ea :dword :base ebp-tn
-                     :disp (* (- (1+ register-arg-count)) n-word-bytes)))
-      ;; Save ESI, and compute a pointer to where the args come from.
-      (storew esi-tn ebx-tn (- (1+ 2)))
-      (inst lea esi-tn
-            (make-ea :dword :base ebx-tn
-                     :disp (* (- (1+ register-arg-count)) n-word-bytes)))
-      ;; Do the copy.
-      (inst shr ecx-tn word-shift)              ; make word count
-      (inst std)
-      (inst rep)
-      (inst movs :dword)
-      ;; solaris requires DF being zero.
-      #!+sunos (inst cld)
-      ;; Restore ESI.
-      (loadw esi-tn ebx-tn (- (1+ 2)))
-      ;; Now we have to default the remaining args. Find out how many.
-      (inst sub eax-tn (fixnumize (- nvals register-arg-count)))
-      (inst neg eax-tn)
-      ;; If none, then just blow out of here.
-      (inst jmp :le restore-edi)
-      (inst mov ecx-tn eax-tn)
-      (inst shr ecx-tn word-shift)      ; word count
-      ;; Load EAX with NIL for fast storing.
-      (inst mov eax-tn nil-value)
-      ;; Do the store.
-      (emit-label default-stack-vals)
-      (inst rep)
-      (inst stos eax-tn)
-      ;; solaris requires DF being zero.
-      #!+sunos (inst cld)
-      ;; Restore EDI, and reset the stack.
-      (emit-label restore-edi)
-      (loadw edi-tn ebx-tn (- (1+ 1)))
-      (inst mov esp-tn ebx-tn))))
+  (let ((values-start #!-x86-reserve-ebx ebx-tn #!+x86-reserve-ebx esi-tn))
+    (cond
+      ((<= nvals 1)
+       (note-this-location vop :single-value-return)
+       (let ((single-value (gen-label)))
+         (cond
+           ((member :cmov *backend-subfeatures*)
+            (inst cmov :c esp-tn values-start))
+           (t
+            (inst jmp :nc single-value)
+            (inst mov esp-tn values-start)
+            (emit-label single-value)))))
+      ((<= nvals register-arg-count)
+       (let ((regs-defaulted (gen-label)))
+         (note-this-location vop :unknown-return)
+         (inst jmp :c regs-defaulted)
+         ;; Default the unsuppled registers.
+         (let* ((2nd-tn-ref (tn-ref-across values))
+                (2nd-tn (tn-ref-tn 2nd-tn-ref)))
+           (inst mov 2nd-tn nil-value)
+           (when (> nvals 2)
+             (loop
+              for tn-ref = (tn-ref-across 2nd-tn-ref)
+              then (tn-ref-across tn-ref)
+              for count from 2 below register-arg-count
+              do (inst mov (tn-ref-tn tn-ref) 2nd-tn))))
+         (inst mov values-start esp-tn)
+         (emit-label regs-defaulted)
+         (inst mov esp-tn values-start)))
+      ((<= nvals 7)
+       ;; The number of bytes depends on the relative jump instructions.
+       ;; Best case is 31+(n-3)*14, worst case is 35+(n-3)*18. For
+       ;; NVALS=6 that is 73/89 bytes, and for NVALS=7 that is 87/107
+       ;; bytes which is likely better than using the blt below.
+       (let ((regs-defaulted (gen-label))
+             (defaulting-done (gen-label))
+             (default-stack-slots (gen-label)))
+         (note-this-location vop :unknown-return)
+         ;; Branch off to the MV case.
+         (inst jmp :c regs-defaulted)
+         ;; Do the single value case.
+         ;; Default the register args
+         (inst mov eax-tn nil-value)
+         (do ((i 1 (1+ i))
+              (val (tn-ref-across values) (tn-ref-across val)))
+             ((= i (min nvals register-arg-count)))
+           (inst mov (tn-ref-tn val) eax-tn))
+
+         ;; Fake other registers so it looks like we returned with all the
+         ;; registers filled in.
+         (move values-start esp-tn)
+         (inst push edx-tn)
+         (inst jmp default-stack-slots)
+
+         (emit-label regs-defaulted)
+
+         (inst mov eax-tn nil-value)
+         (storew edx-tn values-start -1)
+         (collect ((defaults))
+                  (do ((i register-arg-count (1+ i))
+                       (val (do ((i 0 (1+ i))
+                                 (val values (tn-ref-across val)))
+                                ((= i register-arg-count) val))
+                            (tn-ref-across val)))
+                      ((null val))
+                    (let ((default-lab (gen-label))
+                          (tn (tn-ref-tn val)))
+                      (defaults (cons default-lab tn))
+
+                      (inst cmp ecx-tn (fixnumize i))
+                      (inst jmp :be default-lab)
+                      (loadw edx-tn values-start (- (1+ i)))
+                      (inst mov tn edx-tn)))
+
+                  (emit-label defaulting-done)
+                  (loadw edx-tn values-start -1)
+                  (move esp-tn values-start)
+
+                  (let ((defaults (defaults)))
+                    (when defaults
+                      (assemble (*elsewhere*)
+                                (trace-table-entry trace-table-fun-prologue)
+                                (emit-label default-stack-slots)
+                                (dolist (default defaults)
+                                  (emit-label (car default))
+                                  (inst mov (cdr default) eax-tn))
+                                (inst jmp defaulting-done)
+                                (trace-table-entry trace-table-normal)))))))
+      (t
+       ;; 91 bytes for this branch.
+       (assemble ()
+         (note-this-location vop :unknown-return)
+
+         ;; All code paths here use string instructions, and need to
+         ;; move down.
+         (inst std)
+
+         ;; Branch off to the MV case.
+         (inst jmp :c regs-defaulted)
+
+         ;; Default the register args, and set up the stack as if we
+         ;; entered the MV return point.
+         (inst mov values-start esp-tn)
+         (inst push edx-tn)
+         #!+x86-reserve-ebx
+         (inst push nil-value)
+         #!-x86-reserve-ebx
+         (progn
+           (inst mov edi-tn nil-value)
+           (inst push edi-tn))
+         #!-x86-two-arg-passing-regs
+         (inst mov esi-tn edi-tn)
+
+         ;; Compute a pointer to where to put the [defaulted] stack values.
+         NO-STACK-ARGS
+         (inst lea edi-tn
+               (make-ea :dword :base ebp-tn
+                        :disp (* (- (1+ register-arg-count)) n-word-bytes)))
+
+         ;; Set up loop count.
+         (inst mov ecx-tn (- nvals register-arg-count))
+
+         ;; Jump into the default loop.
+         (inst jmp default-stack-vals)
+
+         ;; The regs are defaulted. We need to copy any stack arguments,
+         ;; and then default the remaining stack arguments.
+         REGS-DEFAULTED
+
+         ;; Save EDI.
+         (storew edi-tn values-start (- (1+ 1)))
+
+         ;; Compute the number of stack arguments, and if it's zero or
+         ;; less, don't copy any stack arguments.
+         (inst sub ecx-tn (fixnumize register-arg-count))
+         (inst jmp :le no-stack-args)
+
+         ;; Throw away any unwanted args.
+         (inst cmp ecx-tn (fixnumize (- nvals register-arg-count)))
+         (inst jmp :be count-okay)
+         (inst mov ecx-tn (fixnumize (- nvals register-arg-count)))
+         COUNT-OKAY
+
+         ;; Save the number of stack values.
+         (inst mov eax-tn ecx-tn)
+
+         ;; Compute a pointer to where the stack args go.
+         (inst lea edi-tn
+               (make-ea :dword :base ebp-tn
+                        :disp (* (- (1+ register-arg-count)) n-word-bytes)))
+
+         ;; Save ESI
+         #!-x86-two-arg-passing-regs
+         (storew esi-tn ebx-tn (- (1+ 2)))
+         #!+x86-reserve-ebx
+         (inst push values-start)
+
+         ;; compute a pointer to where the args come from.
+         (inst lea esi-tn
+               (make-ea :dword :base values-start
+                        :disp (* (- (1+ register-arg-count)) n-word-bytes)))
+
+         ;; Do the copy.
+         (inst shr ecx-tn word-shift)   ; make word count
+         (inst rep)
+         (inst movs :dword)
+
+         ;; Restore ESI.
+         #!-x86-two-arg-passing-regs
+         (loadw esi-tn ebx-tn (- (1+ 2)))
+         #!+x86-reserve-ebx
+         (inst pop values-start)
+
+         ;; Now we have to default the remaining args. Find out how many.
+         (inst sub eax-tn (fixnumize (- nvals register-arg-count)))
+         (inst neg eax-tn)
+
+         ;; If none, then just blow out of here.
+         (inst jmp :le restore-edi)
+         (inst mov ecx-tn eax-tn)
+         (inst shr ecx-tn word-shift)   ; word count
+
+         DEFAULT-STACK-VALS
+
+         ;; Load EAX with NIL for fast storing.
+         (inst mov eax-tn nil-value)
+
+         ;; Do the store.
+         (inst rep)
+         (inst stos eax-tn)
+
+         ;; Restore EDI, and reset the stack.
+         RESTORE-EDI
+         (loadw edi-tn values-start (- (1+ 1)))
+         (inst mov esp-tn values-start)
+
+         ;; solaris requires DF being zero.
+         #!+sunos (inst cld)))))
   (values))
 
 ;;;; unknown values receiving
@@ -423,7 +447,7 @@
 ;;; VOP that can be inherited by unknown values receivers. The main thing this
 ;;; handles is allocation of the result temporaries.
 (define-vop (unknown-values-receiver)
-  (:temporary (:sc descriptor-reg :offset ebx-offset
+  (:temporary (:sc descriptor-reg :offset #!-x86-reserve-ebx ebx-offset #!+x86-reserve-ebx esi-offset
                    :from :eval :to (:result 0))
               values-start)
   (:temporary (:sc any-reg :offset ecx-offset
@@ -1049,7 +1073,7 @@
 
   ;; In the case of other than one value, we need these registers to
   ;; tell the caller where they are and how many there are.
-  (:temporary (:sc unsigned-reg :offset ebx-offset) ebx)
+  (:temporary (:sc unsigned-reg :offset #!-x86-reserve-ebx ebx-offset #!+x86-reserve-ebx esi-offset) argptr)
   (:temporary (:sc unsigned-reg :offset ecx-offset) ecx)
 
   ;; We need to stretch the lifetime of return-pc past the argument
@@ -1059,13 +1083,14 @@
                    :from :eval) a0)
   (:temporary (:sc unsigned-reg :offset (second *register-arg-offsets*)
                    :from :eval) a1)
+  #!-x86-two-arg-passing-regs
   (:temporary (:sc unsigned-reg :offset (third *register-arg-offsets*)
                    :from :eval) a2)
 
   (:generator 6
     (trace-table-entry trace-table-fun-epilogue)
     ;; Establish the values pointer and values count.
-    (move ebx ebp-tn)
+    (move argptr ebp-tn)
     (if (zerop nvals)
         (inst xor ecx ecx) ; smaller
       (inst mov ecx (fixnumize nvals)))
@@ -1073,11 +1098,12 @@
     (move ebp-tn old-fp)
     ;; Clear as much of the stack as possible, but not past the return
     ;; address.
-    (inst lea esp-tn (make-ea :dword :base ebx
+    (inst lea esp-tn (make-ea :dword :base argptr
                               :disp (- (* (max nvals 2) n-word-bytes))))
     ;; Pre-default any argument register that need it.
     (when (< nvals register-arg-count)
-      (let* ((arg-tns (nthcdr nvals (list a0 a1 a2)))
+      (let* ((arg-tns (nthcdr nvals (list a0 a1
+                                          #!-x86-two-arg-passing-regs a2)))
              (first (first arg-tns)))
         (inst mov first nil-value)
         (dolist (tn (cdr arg-tns))
@@ -1086,7 +1112,7 @@
     (inst stc)
     ;; And away we go. Except that return-pc is still on the
     ;; stack and we've changed the stack pointer. So we have to
-    ;; tell it to index off of EBX instead of EBP.
+    ;; tell it to index off of EBX (or ESI) instead of EBP.
     (cond ((zerop nvals)
            ;; Return popping the return address and the OCFP.
            (inst ret n-word-bytes))
@@ -1095,7 +1121,7 @@
            ;; happen, or is a single value return handled elsewhere?
            (inst ret))
           (t
-           (inst jmp (make-ea :dword :base ebx
+           (inst jmp (make-ea :dword :base argptr
                               :disp (- (* (1+ (tn-offset return-pc))
                                           n-word-bytes))))))
 
@@ -1109,7 +1135,7 @@
 ;;;
 ;;; The assembly routine takes the following args:
 ;;;  EAX -- the return-pc to finally jump to.
-;;;  EBX -- pointer to where to put the values.
+;;;  EBX -- pointer to where to put the values. (EDX when #!+x86-reserve-ebx)
 ;;;  ECX -- number of values to find there.
 ;;;  ESI -- pointer to where to find the values.
 (define-vop (return-multiple)
@@ -1121,6 +1147,7 @@
   (:temporary (:sc unsigned-reg :offset eax-offset :from (:argument 1)) eax)
   (:temporary (:sc unsigned-reg :offset esi-offset :from (:argument 2)) esi)
   (:temporary (:sc unsigned-reg :offset ecx-offset :from (:argument 3)) ecx)
+  #!-x86-reserve-ebx
   (:temporary (:sc unsigned-reg :offset ebx-offset :from (:eval 0)) ebx)
   (:temporary (:sc descriptor-reg :offset (first *register-arg-offsets*)
                    :from (:eval 0)) a0)
@@ -1153,7 +1180,10 @@
         (emit-label not-single)))
     (move esi vals)
     (move ecx nvals)
+    #!-x86-reserve-ebx
     (move ebx ebp-tn)
+    #!+x86-reserve-ebx
+    (move a0 ebp-tn)
     (move ebp-tn old-fp)
     (inst jmp (make-fixup 'return-multiple :assembly-routine))
     (trace-table-entry trace-table-normal)))
@@ -1209,18 +1239,17 @@
 
     ;; Allocate the space on the stack.
     ;; stack = ebp - (max 3 frame-size) - (nargs - fixed)
-    (inst lea ebx-tn
+    (inst lea esp-tn
           (make-ea :dword :base ebp-tn
                    :disp (- (fixnumize fixed)
                             (* n-word-bytes
                                (max 3 (sb-allocated-size 'stack))))))
-    (inst sub ebx-tn ecx-tn)  ; Got the new stack in ebx
-    (inst mov esp-tn ebx-tn)
+    (inst sub esp-tn ecx-tn)
 
     ;; Now: nargs>=1 && nargs>fixed
 
     ;; Save the original count of args.
-    (inst mov ebx-tn ecx-tn)
+    (inst push ecx-tn)
 
     (cond ((< fixed register-arg-count)
            ;; We must stop when we run out of stack args, not when we
@@ -1242,7 +1271,7 @@
 
     ;; Initialize src to be end of args.
     (inst mov esi-tn ebp-tn)
-    (inst sub esi-tn ebx-tn)
+    (inst sub esi-tn (make-ea :dword :base esp-tn :disp 12))
 
     ;; We need to copy from downwards up to avoid overwriting some of
     ;; the yet uncopied args. So we need to use EBX as the copy index
@@ -1254,7 +1283,7 @@
     COPY-LOOP
     (inst mov edi-tn (make-ea :dword :base esi-tn :index ebx-tn))
     ;; The :DISP is to account for the registers saved on the stack
-    (inst mov (make-ea :dword :base esp-tn :disp (* 3 n-word-bytes)
+    (inst mov (make-ea :dword :base esp-tn :disp (* 4 n-word-bytes)
                        :index ebx-tn)
           edi-tn)
     (inst add ebx-tn n-word-bytes)
@@ -1269,7 +1298,7 @@
     DO-REGS
 
     ;; Restore ECX
-    (inst mov ecx-tn ebx-tn)
+    (inst pop ecx-tn)
 
     ;; Here: nargs>=1 && nargs>fixed
     (when (< fixed register-arg-count)
@@ -1477,8 +1506,10 @@
   ;; register on -SB-THREAD.
   #!+sb-thread
   (progn
+    #!-x86-ebx-threads
     (inst fs-segment-prefix)
     (inst cmp (make-ea :dword
+                       #!+x86-ebx-threads :base #!+x86-ebx-threads ebx-tn
                        :disp (* thread-stepping-slot n-word-bytes))
           nil-value))
   #!-sb-thread
Index: src/compiler/x86/cell.lisp
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/compiler/x86/cell.lisp,v
retrieving revision 1.29
diff -u -r1.29 cell.lisp
--- src/compiler/x86/cell.lisp	19 Oct 2006 02:06:12 -0000	1.29
+++ src/compiler/x86/cell.lisp	24 Mar 2007 22:27:37 -0000
@@ -59,7 +59,7 @@
 
 
 ;;; The compiler likes to be able to directly SET symbols.
-#!+sb-thread
+#!+(and sb-thread (not x86-ebx-threads))
 (define-vop (set)
   (:args (symbol :scs (descriptor-reg))
          (value :scs (descriptor-reg any-reg)))
@@ -81,6 +81,24 @@
       (storew value symbol symbol-value-slot other-pointer-lowtag)
       (emit-label done))))
 
+#!+x86-ebx-threads
+(define-vop (set)
+  (:args (symbol :scs (descriptor-reg))
+         (value :scs (descriptor-reg any-reg)))
+  (:temporary (:sc descriptor-reg) tls)
+  (:generator 4
+    (loadw tls symbol symbol-tls-index-slot other-pointer-lowtag)
+    (inst or tls tls)
+    (inst jmp :z global-val)
+    (inst cmp (make-ea :dword :base ebx-tn :index tls)
+          no-tls-value-marker-widetag)
+    (inst jmp :z global-val)
+    (inst mov (make-ea :dword :base ebx-tn :index tls) value)
+    (inst jmp done)
+    GLOBAL-VAL
+    (storew value symbol symbol-value-slot other-pointer-lowtag)
+    DONE))
+
 ;; unithreaded it's a lot simpler ...
 #!-sb-thread
 (define-vop (set cell-set)
@@ -101,8 +119,10 @@
            (err-lab (generate-error-code vop unbound-symbol-error object))
            (ret-lab (gen-label)))
       (loadw value object symbol-tls-index-slot other-pointer-lowtag)
+      #!-x86-ebx-threads
       (inst fs-segment-prefix)
-      (inst mov value (make-ea :dword :base value))
+      (inst mov value (make-ea :dword :base #!+x86-ebx-threads ebx-tn
+                               #!+x86-ebx-threads :index value))
       (inst cmp value no-tls-value-marker-widetag)
       (inst jmp :ne check-unbound-label)
       (loadw value object symbol-value-slot other-pointer-lowtag)
@@ -123,8 +143,10 @@
   (:generator 8
     (let ((ret-lab (gen-label)))
       (loadw value object symbol-tls-index-slot other-pointer-lowtag)
+      #!-x86-ebx-threads
       (inst fs-segment-prefix)
-      (inst mov value (make-ea :dword :base value))
+      (inst mov value (make-ea :dword :base #!+x86-ebx-threads ebx-tn
+                               #!+x86-ebx-threads :index value))
       (inst cmp value no-tls-value-marker-widetag)
       (inst jmp :ne ret-lab)
       (loadw value object symbol-value-slot other-pointer-lowtag)
@@ -180,8 +202,10 @@
   (:generator 9
     (let ((check-unbound-label (gen-label)))
       (loadw value object symbol-tls-index-slot other-pointer-lowtag)
+      #!-x86-ebx-threads
       (inst fs-segment-prefix)
-      (inst mov value (make-ea :dword :base value))
+      (inst mov value (make-ea :dword :base #!+x86-ebx-threads ebx-tn
+                               #!+x86-ebx-threads :index value))
       (inst cmp value no-tls-value-marker-widetag)
       (inst jmp :ne check-unbound-label)
       (loadw value object symbol-value-slot other-pointer-lowtag)
@@ -274,7 +298,8 @@
 
 #!+sb-thread
 (define-vop (bind)
-  (:args (val :scs (any-reg descriptor-reg))
+  (:args (val :scs #!-x86-reserve-ebx (any-reg descriptor-reg)
+              #!+x86-reserve-ebx (control-stack))
          (symbol :scs (descriptor-reg)))
   (:temporary (:sc descriptor-reg :offset eax-offset) eax)
   (:temporary (:sc unsigned-reg) tls-index temp bsp)
@@ -311,12 +336,23 @@
        (store-symbol-value 0 *tls-index-lock*))
 
       (emit-label tls-index-valid)
-      (inst fs-segment-prefix)
-      (inst mov temp (make-ea :dword :base tls-index))
-      (storew temp bsp (- binding-value-slot binding-size))
-      (storew symbol bsp (- binding-symbol-slot binding-size))
-      (inst fs-segment-prefix)
-      (inst mov (make-ea :dword :base tls-index) val))))
+      (let ((tls-ea (make-ea :dword :base #!+x86-ebx-threads ebx-tn
+                             #!+x86-ebx-threads :index tls-index)))
+        #!-x86-ebx-threads
+        (inst fs-segment-prefix)
+        (inst mov temp tls-ea)
+        (storew temp bsp (- binding-value-slot binding-size))
+        (storew symbol bsp (- binding-symbol-slot binding-size))
+        (if (sc-is val control-stack)
+            (progn
+              (inst push val)
+              #!-x86-ebx-threads
+              (inst fs-segment-prefix)
+              (inst pop tls-ea))
+            (progn
+              #!-x86-ebx-threads
+              (inst fs-segment-prefix)
+              (inst mov tls-ea val)))))))
 
 #!-sb-thread
 (define-vop (bind)
@@ -343,8 +379,10 @@
     (loadw value bsp (- binding-value-slot binding-size))
 
     (loadw tls-index symbol symbol-tls-index-slot other-pointer-lowtag)
+    #!-x86-ebx-threads
     (inst fs-segment-prefix)
-    (inst mov (make-ea :dword :base tls-index) value)
+    (inst mov (make-ea :dword :base #!+x86-ebx-threads ebx-tn
+                       #!+x86-ebx-threads :index tls-index) value)
 
     (storew 0 bsp (- binding-symbol-slot binding-size))
     (storew 0 bsp (- binding-value-slot binding-size))
@@ -382,8 +420,12 @@
 
     #!+sb-thread (loadw
                   tls-index symbol symbol-tls-index-slot other-pointer-lowtag)
-    #!+sb-thread (inst fs-segment-prefix)
-    #!+sb-thread (inst mov (make-ea :dword :base tls-index) value)
+    #!+(and sb-thread (not x86-ebx-threads))
+    (progn
+      (inst fs-segment-prefix)
+      (inst mov (make-ea :dword :base tls-index) value))
+    #!+x86-ebx-threads
+    (inst mov (make-ea :dword :base ebx-tn :index tls-index) value)
     (storew 0 bsp (- binding-symbol-slot binding-size))
 
     SKIP
Index: src/compiler/x86/macros.lisp
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/compiler/x86/macros.lisp,v
retrieving revision 1.41
diff -u -r1.41 macros.lisp
--- src/compiler/x86/macros.lisp	2 Mar 2007 04:36:03 -0000	1.41
+++ src/compiler/x86/macros.lisp	24 Mar 2007 22:27:37 -0000
@@ -93,21 +93,31 @@
            (ash symbol-tls-index-slot word-shift)
            (- other-pointer-lowtag))))
 
-#!+sb-thread
+#!+(and sb-thread (not x86-ebx-threads))
 (defmacro load-tl-symbol-value (reg symbol)
   `(progn
     (inst mov ,reg (make-ea-for-symbol-tls-index ,symbol))
     (inst fs-segment-prefix)
     (inst mov ,reg (make-ea :dword :base ,reg))))
+#!+x86-ebx-threads
+(defmacro load-tl-symbol-value (reg symbol)
+  `(progn
+    (inst mov ,reg (make-ea-for-symbol-tls-index ,symbol))
+    (inst mov ,reg (make-ea :dword :base ebx-tn :index ,reg))))
 #!-sb-thread
 (defmacro load-tl-symbol-value (reg symbol) `(load-symbol-value ,reg ,symbol))
 
-#!+sb-thread
+#!+(and sb-thread (not x86-ebx-threads))
 (defmacro store-tl-symbol-value (reg symbol temp)
   `(progn
     (inst mov ,temp (make-ea-for-symbol-tls-index ,symbol))
     (inst fs-segment-prefix)
     (inst mov (make-ea :dword :base ,temp) ,reg)))
+#!+x86-ebx-threads
+(defmacro store-tl-symbol-value (reg symbol temp)
+  `(progn
+    (inst mov ,temp (make-ea-for-symbol-tls-index ,symbol))
+    (inst mov (make-ea :dword :base ebx-tn :index ,temp) ,reg)))
 #!-sb-thread
 (defmacro store-tl-symbol-value (reg symbol temp)
   (declare (ignore temp))
@@ -116,19 +126,23 @@
 (defmacro load-binding-stack-pointer (reg)
   #!+sb-thread
   `(progn
-     (inst fs-segment-prefix)
-     (inst mov ,reg (make-ea :dword
-                             :disp (* 4 thread-binding-stack-pointer-slot))))
+    #!-x86-ebx-threads
+    (inst fs-segment-prefix)
+    (inst mov ,reg (make-ea :dword
+                    #!+x86-ebx-threads ,@'(:base ebx-tn)
+                    :disp (* 4 thread-binding-stack-pointer-slot))))
   #!-sb-thread
   `(load-symbol-value ,reg *binding-stack-pointer*))
 
 (defmacro store-binding-stack-pointer (reg)
   #!+sb-thread
   `(progn
-     (inst fs-segment-prefix)
-     (inst mov (make-ea :dword
-                        :disp (* 4 thread-binding-stack-pointer-slot))
-           ,reg))
+    #!-x86-ebx-threads
+    (inst fs-segment-prefix)
+    (inst mov (make-ea :dword
+               #!+x86-ebx-threads ,@'(:base ebx-tn)
+               :disp (* 4 thread-binding-stack-pointer-slot))
+     ,reg))
   #!-sb-thread
   `(store-symbol-value ,reg *binding-stack-pointer*))
 
@@ -201,18 +215,20 @@
         (free-pointer
          (make-ea :dword :disp
                   #!+sb-thread (* n-word-bytes thread-alloc-region-slot)
+                  #!+x86-ebx-threads :base #!+x86-ebx-threads ebx-tn
                   #!-sb-thread (make-fixup "boxed_region" :foreign)
                   :scale 1)) ; thread->alloc_region.free_pointer
         (end-addr
          (make-ea :dword :disp
                   #!+sb-thread (* n-word-bytes (1+ thread-alloc-region-slot))
+                  #!+x86-ebx-threads :base #!+x86-ebx-threads ebx-tn
                   #!-sb-thread (make-fixup "boxed_region" :foreign 4)
                   :scale 1)))   ; thread->alloc_region.end_addr
     (unless (and (tn-p size) (location= alloc-tn size))
       (inst mov alloc-tn size))
-    #!+sb-thread (inst fs-segment-prefix)
+    #!+(and sb-thread (not x86-ebx-threads)) (inst fs-segment-prefix)
     (inst add alloc-tn free-pointer)
-    #!+sb-thread (inst fs-segment-prefix)
+    #!+(and sb-thread (not x86-ebx-threads)) (inst fs-segment-prefix)
     (inst cmp alloc-tn end-addr)
     (inst jmp :be ok)
     (let ((dst (ecase (tn-offset alloc-tn)
@@ -228,15 +244,15 @@
     ;; Swap ALLOC-TN and FREE-POINTER
     (cond ((and (tn-p size) (location= alloc-tn size))
            ;; XCHG is extremely slow, use the xor swap trick
-           #!+sb-thread (inst fs-segment-prefix)
+           #!+(and sb-thread (not x86-ebx-threads)) (inst fs-segment-prefix)
            (inst xor alloc-tn free-pointer)
-           #!+sb-thread (inst fs-segment-prefix)
+           #!+(and sb-thread (not x86-ebx-threads)) (inst fs-segment-prefix)
            (inst xor free-pointer alloc-tn)
-           #!+sb-thread (inst fs-segment-prefix)
+           #!+(and sb-thread (not x86-ebx-threads)) (inst fs-segment-prefix)
            (inst xor alloc-tn free-pointer))
           (t
            ;; It's easier if SIZE is still available.
-           #!+sb-thread (inst fs-segment-prefix)
+           #!+(and sb-thread (not x86-ebx-threads)) (inst fs-segment-prefix)
            (inst mov free-pointer alloc-tn)
            (inst sub alloc-tn size)))
     (emit-label done))
@@ -349,14 +365,16 @@
 #!+sb-thread
 (defmacro pseudo-atomic (&rest forms)
   (with-unique-names (label)
-    `(let ((,label (gen-label)))
+    `(let ((,label (gen-label))
+           (tls-ea (make-ea :byte :disp (* 4 thread-pseudo-atomic-bits-slot)
+                            #!+x86-ebx-threads ,@'(:base ebx-tn))))
+       #!-x86-ebx-threads
        (inst fs-segment-prefix)
-       (inst or (make-ea :byte :disp (* 4 thread-pseudo-atomic-bits-slot))
-            (fixnumize 1))
+       (inst or tls-ea (fixnumize 1))
        ,@forms
+       #!-x86-ebx-threads
        (inst fs-segment-prefix)
-       (inst xor (make-ea :byte :disp (* 4 thread-pseudo-atomic-bits-slot))
-             (fixnumize 1))
+       (inst xor tls-ea (fixnumize 1))
        (inst jmp :z ,label)
        ;; if PAI was set, interrupts were disabled at the same
        ;; time using the process signal mask.
Index: src/compiler/x86/nlx.lisp
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/compiler/x86/nlx.lisp,v
retrieving revision 1.20
diff -u -r1.20 nlx.lisp
--- src/compiler/x86/nlx.lisp	13 Jan 2007 21:05:34 -0000	1.20
+++ src/compiler/x86/nlx.lisp	24 Mar 2007 22:27:37 -0000
@@ -19,7 +19,7 @@
 
 ;;; Make a TN for the argument count passing location for a non-local entry.
 (!def-vm-support-routine make-nlx-entry-arg-start-location ()
-  (make-wired-tn *fixnum-primitive-type* any-reg-sc-number ebx-offset))
+  (make-wired-tn *fixnum-primitive-type* any-reg-sc-number #!-x86-reserve-ebx ebx-offset #!+x86-reserve-ebx esi-offset))
 
 (defun catch-block-ea (tn)
   (aver (sc-is tn catch-block))
@@ -204,13 +204,14 @@
 
 (define-vop (nlx-entry-multiple)
   (:args (top)
-         (source)
+         #!-x86-reserve-ebx (source)
+         #!+x86-reserve-ebx (source :target esi)
          (count :target ecx))
   ;; Again, no SC restrictions for the args, 'cause the loading would
   ;; happen before the entry label.
   (:info label)
   (:temporary (:sc unsigned-reg :offset ecx-offset :from (:argument 2)) ecx)
-  (:temporary (:sc unsigned-reg :offset esi-offset) esi)
+  (:temporary (:sc unsigned-reg :offset esi-offset #!+x86-reserve-ebx :from #!+x86-reserve-ebx (:argument 1)) esi)
   (:temporary (:sc unsigned-reg :offset edi-offset) edi)
   (:results (result :scs (any-reg) :from (:argument 0))
             (num :scs (any-reg control-stack)))
Index: src/compiler/x86/static-fn.lisp
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/compiler/x86/static-fn.lisp,v
retrieving revision 1.10
diff -u -r1.10 static-fn.lisp
--- src/compiler/x86/static-fn.lisp	14 Jul 2005 19:13:48 -0000	1.10
+++ src/compiler/x86/static-fn.lisp	24 Mar 2007 22:27:37 -0000
@@ -17,6 +17,7 @@
   (:variant-vars function)
   (:vop-var vop)
   (:node-var node)
+  #!-x86-reserve-ebx
   (:temporary (:sc unsigned-reg :offset ebx-offset
                    :from (:eval 0) :to (:eval 2)) ebx)
   (:temporary (:sc unsigned-reg :offset ecx-offset
@@ -78,7 +79,8 @@
          ;; If speed not more important than size, duplicate the
          ;; effect of the ENTER with discrete instructions. Takes
          ;; 2+1+3+2=8 bytes as opposed to 4+3=7 bytes.
-         (cond ((policy node (>= speed space))
+         (cond #!-x86-reserve-ebx
+               ((policy node (>= speed space))
                 (inst mov ebx esp-tn)
                 ;; Save the old-fp
                 (inst push ebp-tn)
@@ -86,6 +88,17 @@
                 ;; above, two more needed.
                 (inst sub esp-tn (fixnumize 2))
                 (inst mov ebp-tn ebx))
+               #!+x86-reserve-ebx
+               ((policy node (>= speed space))
+                ;; 1+1+1+4=7 bytes. Not sure the policy makes sense anymore.
+                ;; Save the old-fp
+                (inst push ebp-tn)
+                ;; Ensure that at least three slots are available; one
+                ;; above, two more needed.
+                (inst push ebp-tn)
+                (inst push ebp-tn)
+                ;; And compute our new fp.
+                (inst lea ebp-tn (make-ea :dword :base esp-tn :disp 12)))
                (t
                 (inst enter (fixnumize 2))
                 ;; The enter instruction pushes EBP and then copies
@@ -139,6 +152,8 @@
   (frob 0 1)
   (frob 1 1)
   (frob 2 1)
+  ;; We don't actually -use- this next static fun template.
+  #!-x86-two-arg-passing-regs
   (frob 3 1))
 
 (defmacro define-static-fun (name args &key (results '(x)) translate
Index: src/compiler/x86/system.lisp
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/compiler/x86/system.lisp,v
retrieving revision 1.20
diff -u -r1.20 system.lisp
--- src/compiler/x86/system.lisp	6 Oct 2006 10:54:16 -0000	1.20
+++ src/compiler/x86/system.lisp	24 Mar 2007 22:27:37 -0000
@@ -268,9 +268,13 @@
   (:args (n :scs (unsigned-reg) :target sap))
   (:arg-types unsigned-num)
   (:policy :fast-safe)
+  #!-x86-ebx-threads
   (:generator 2
     (inst fs-segment-prefix)
-    (inst mov sap (make-ea :dword :disp 0 :index n :scale 4))))
+    (inst mov sap (make-ea :dword :disp 0 :index n :scale 4)))
+  #!+x86-ebx-threads
+  (:generator 2
+    (inst mov sap (make-ea :dword :base ebx-tn :disp 0 :index n :scale 4))))
 
 (define-vop (halt)
   (:generator 1
Index: src/compiler/x86/vm.lisp
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/compiler/x86/vm.lisp,v
retrieving revision 1.25
diff -u -r1.25 vm.lisp
--- src/compiler/x86/vm.lisp	13 Jan 2007 21:05:34 -0000	1.25
+++ src/compiler/x86/vm.lisp	24 Mar 2007 22:27:37 -0000
@@ -56,7 +56,7 @@
   (defreg dh 5 :byte)
   (defreg bl 6 :byte)
   (defreg bh 7 :byte)
-  (defregset *byte-regs* al ah cl ch dl dh bl bh)
+  (defregset *byte-regs* al ah cl ch dl dh #!-x86-reserve-ebx bl #!-x86-reserve-ebx bh)
 
   ;; word registers
   (defreg ax 0 :word)
@@ -67,7 +67,7 @@
   (defreg bp 10 :word)
   (defreg si 12 :word)
   (defreg di 14 :word)
-  (defregset *word-regs* ax cx dx bx si di)
+  (defregset *word-regs* ax cx dx #!-x86-reserve-ebx bx si di)
 
   ;; double word registers
   (defreg eax 0 :dword)
@@ -78,7 +78,7 @@
   (defreg ebp 10 :dword)
   (defreg esi 12 :dword)
   (defreg edi 14 :dword)
-  (defregset *dword-regs* eax ecx edx ebx esi edi)
+  (defregset *dword-regs* eax ecx edx #!-x86-reserve-ebx ebx esi edi)
 
   ;; floating point registers
   (defreg fr0 0 :float)
@@ -94,11 +94,13 @@
   ;; registers used to pass arguments
   ;;
   ;; the number of arguments/return values passed in registers
-  (def!constant  register-arg-count 3)
+  (def!constant  register-arg-count
+      #!-x86-two-arg-passing-regs 3
+      #!+x86-two-arg-passing-regs 2)
   ;; names and offsets for registers used to pass arguments
   (eval-when (:compile-toplevel :load-toplevel :execute)
-    (defparameter *register-arg-names* '(edx edi esi)))
-  (defregset    *register-arg-offsets* edx edi esi))
+    (defparameter *register-arg-names* '(edx edi #!-x86-two-arg-passing-regs esi)))
+  (defregset    *register-arg-offsets* edx edi #!-x86-two-arg-passing-regs esi))
 
 ;;;; SB definitions
 
Index: src/runtime/gencgc.c
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/runtime/gencgc.c,v
retrieving revision 1.111
diff -u -r1.111 gencgc.c
--- src/runtime/gencgc.c	22 Nov 2006 11:37:21 -0000	1.111
+++ src/runtime/gencgc.c	24 Mar 2007 22:27:38 -0000
@@ -281,7 +281,11 @@
  * page_table[] that other threads may want to see */
 
 #ifdef LISP_FEATURE_SB_THREAD
+#ifndef LISP_FEATURE_WIN32
 static pthread_mutex_t free_pages_lock = PTHREAD_MUTEX_INITIALIZER;
+#else
+HANDLE free_pages_lock = INVALID_HANDLE_VALUE;
+#endif
 #endif
 
 
@@ -4405,6 +4409,11 @@
 {
     page_index_t i;
 
+#if defined(LISP_FEATURE_WIN32) && defined(LISP_FEATURE_SB_THREAD)
+    /* Create the GC mutex. */
+    free_pages_lock = CreateMutex(NULL, FALSE, NULL);
+#endif
+
     /* Compute the number of pages needed for the dynamic space.
      * Dynamic space size should be aligned on page size. */
     page_table_pages = dynamic_space_size/PAGE_BYTES;
Index: src/runtime/globals.c
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/runtime/globals.c,v
retrieving revision 1.19
diff -u -r1.19 globals.c
--- src/runtime/globals.c	21 Feb 2006 22:59:33 -0000	1.19
+++ src/runtime/globals.c	24 Mar 2007 22:27:38 -0000
@@ -51,7 +51,11 @@
 lispobj *current_dynamic_space;
 
 #if defined(LISP_FEATURE_SB_THREAD)
+#if !defined(LISP_FEATURE_WIN32)
 pthread_key_t specials=0;
+#else
+unsigned long specials = 0;
+#endif
 #endif
 
 void globals_init(void)
@@ -68,6 +72,10 @@
     /* Set foreign function call active. */
     foreign_function_call_active = 1;
 #if defined(LISP_FEATURE_SB_THREAD)
+#if !defined(LISP_FEATURE_WIN32)
     pthread_key_create(&specials,0);
+#else
+    specials = TlsAlloc();
+#endif
 #endif
 }
Index: src/runtime/globals.h
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/runtime/globals.h,v
retrieving revision 1.28
diff -u -r1.28 globals.h
--- src/runtime/globals.h	2 Nov 2006 17:18:37 -0000	1.28
+++ src/runtime/globals.h	24 Mar 2007 22:27:38 -0000
@@ -32,7 +32,11 @@
 extern char **ENVIRON;
 
 #if defined(LISP_FEATURE_SB_THREAD)
+#if !defined(LISP_FEATURE_WIN32)
 extern pthread_key_t specials;
+#else
+extern unsigned long specials;
+#endif
 #endif
 
 extern lispobj *current_control_stack_pointer;
Index: src/runtime/interrupt.c
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/runtime/interrupt.c,v
retrieving revision 1.121
diff -u -r1.121 interrupt.c
--- src/runtime/interrupt.c	3 Mar 2007 00:42:02 -0000	1.121
+++ src/runtime/interrupt.c	24 Mar 2007 22:27:38 -0000
@@ -530,7 +530,7 @@
         info_sap = alloc_sap(info);
         /* Leave deferrable signals blocked, the handler itself will
          * allow signals again when it sees fit. */
-#ifdef LISP_FEATURE_SB_THREAD
+#if defined(LISP_FEATURE_SB_THREAD) && !defined(LISP_FEATURE_WIN32)
         {
             sigset_t unblock;
             sigemptyset(&unblock);
@@ -745,7 +745,9 @@
     os_context_t *context = arch_os_get_context(&void_context);
 
     struct thread *thread=arch_os_get_current_thread();
+#ifndef LISP_FEATURE_WIN32
     sigset_t ss;
+#endif
 
     if ((arch_pseudo_atomic_atomic(context) ||
          SymbolValue(GC_INHIBIT,thread) != NIL)) {
@@ -758,8 +760,10 @@
         /* need the context stored so it can have registers scavenged */
         fake_foreign_function_call(context);
 
+#ifndef LISP_FEATURE_WIN32
         sigfillset(&ss); /* Block everything. */
         thread_sigmask(SIG_BLOCK,&ss,0);
+#endif
 
         if(thread->state!=STATE_RUNNING) {
             lose("sig_stop_for_gc_handler: wrong thread state: %ld\n",
@@ -768,6 +772,7 @@
         thread->state=STATE_SUSPENDED;
         FSHOW_SIGNAL((stderr,"thread=%lu suspended\n",thread->os_thread));
 
+#ifndef LISP_FEATURE_WIN32
 #if defined(SIG_RESUME_FROM_GC)
         sigemptyset(&ss); sigaddset(&ss,SIG_RESUME_FROM_GC);
 #else
@@ -785,6 +790,9 @@
 #else
         while (sigwaitinfo(&ss,0) != SIG_STOP_FOR_GC);
 #endif
+#else
+        SuspendThread(GetCurrentThread());
+#endif
 
         FSHOW_SIGNAL((stderr,"thread=%lu resumed\n",thread->os_thread));
         if(thread->state!=STATE_RUNNING) {
@@ -1008,7 +1016,7 @@
 #endif
 }
 
-#ifdef LISP_FEATURE_SB_THREAD
+#if defined(LISP_FEATURE_SB_THREAD) && !defined(LISP_FEATURE_WIN32)
 
 /* FIXME: this function can go away when all lisp handlers are invoked
  * via arrange_return_to_lisp_function. */
Index: src/runtime/runtime.h
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/runtime/runtime.h,v
retrieving revision 1.25
diff -u -r1.25 runtime.h
--- src/runtime/runtime.h	6 Mar 2006 10:03:00 -0000	1.25
+++ src/runtime/runtime.h	24 Mar 2007 22:27:38 -0000
@@ -62,8 +62,14 @@
 #include <sys/types.h>
 
 #if defined(LISP_FEATURE_SB_THREAD)
+#  if !defined(LISP_FEATURE_WIN32)
 #include <pthread.h>
 typedef pthread_t os_thread_t;
+#  else
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+typedef HANDLE os_thread_t;
+#  endif
 #else
 typedef pid_t os_thread_t;
 #endif
Index: src/runtime/thread.c
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/runtime/thread.c,v
retrieving revision 1.73
diff -u -r1.73 thread.c
--- src/runtime/thread.c	2 Feb 2007 19:26:23 -0000	1.73
+++ src/runtime/thread.c	24 Mar 2007 22:27:39 -0000
@@ -87,7 +87,11 @@
 extern struct interrupt_data * global_interrupt_data;
 
 #ifdef LISP_FEATURE_SB_THREAD
+#ifndef LISP_FEATURE_WIN32
 pthread_mutex_t all_threads_lock = PTHREAD_MUTEX_INITIALIZER;
+#else
+HANDLE all_threads_lock = INVALID_HANDLE_VALUE;
+#endif
 #ifdef LOCK_CREATE_THREAD
 static pthread_mutex_t create_thread_lock = PTHREAD_MUTEX_INITIALIZER;
 #endif
@@ -130,12 +134,24 @@
     th->no_tls_value_marker = NO_TLS_VALUE_MARKER_WIDETAG;
     if(arch_os_thread_init(th)==0) return 1;
     link_thread(th);
-    th->os_thread=thread_self();
 #ifndef LISP_FEATURE_WIN32
+    th->os_thread=thread_self();
     protect_control_stack_guard_page(1);
+#else
+    /* Can't use GetCurrentThread(), as that's not a real handle, have
+     * to do this instead. */
+    if (!DuplicateHandle(GetCurrentProcess(),
+                         GetCurrentThread(),
+                         GetCurrentProcess(),
+                         (LPHANDLE)&th->os_thread,
+                         0,
+                         FALSE,
+                         DUPLICATE_SAME_ACCESS)) {
+        lose("Failed to duplicate thread handle: %ld", GetLastError());
+    }
 #endif
 
-#if defined(LISP_FEATURE_X86) || defined(LISP_FEATURE_X86_64)
+#if (defined(LISP_FEATURE_X86) || defined(LISP_FEATURE_X86_64)) && !defined(LISP_FEATURE_WIN32)
     return call_into_lisp_first_time(function,args,0);
 #else
     return funcall0(function);
@@ -154,7 +170,7 @@
 queue_freeable_thread_stack(struct thread *thread_to_be_cleaned_up)
 {
      if (thread_to_be_cleaned_up) {
-        pthread_mutex_lock(&freeable_stack_lock);
+        thread_mutex_lock(&freeable_stack_lock);
         if (freeable_stack_queue) {
             struct freeable_stack *new_freeable_stack = 0, *next;
             next = freeable_stack_queue;
@@ -180,7 +196,7 @@
             freeable_stack_queue = new_freeable_stack;
             freeable_stack_count++;
         }
-        pthread_mutex_unlock(&freeable_stack_lock);
+        thread_mutex_unlock(&freeable_stack_lock);
     }
 }
 
@@ -190,7 +206,7 @@
 free_freeable_stacks() {
     if (freeable_stack_queue && (freeable_stack_count > FREEABLE_STACK_QUEUE_SIZE)) {
         struct freeable_stack* old;
-        pthread_mutex_lock(&freeable_stack_lock);
+        thread_mutex_lock(&freeable_stack_lock);
         old = freeable_stack_queue;
         freeable_stack_queue = old->next;
         freeable_stack_count--;
@@ -198,7 +214,7 @@
         FSHOW((stderr, "freeing thread %x stack\n", old->os_thread));
         os_invalidate(old->stack, THREAD_STRUCT_SIZE);
         os_invalidate((os_vm_address_t)old, sizeof(struct freeable_stack));
-        pthread_mutex_unlock(&freeable_stack_lock);
+        thread_mutex_unlock(&freeable_stack_lock);
     }
 }
 
@@ -261,7 +277,12 @@
         /* Under NPTL pthread_join really waits until the thread
          * exists and the stack can be safely freed. This is sadly not
          * mandated by the pthread spec. */
+#ifndef LISP_FEATURE_WIN32
         gc_assert(pthread_join(new_freeable_stack->os_thread, NULL) == 0);
+#else
+        gc_assert(WaitForSingleObject(new_freeable_stack->os_thread, INFINITE)
+                  == WAIT_OBJECT_0);
+#endif
         os_invalidate(new_freeable_stack->stack, THREAD_STRUCT_SIZE);
         os_invalidate((os_vm_address_t) new_freeable_stack,
                       sizeof(struct freeable_stack));
@@ -288,32 +309,38 @@
         lose("arch_os_thread_init failed\n");
     }
 
+#ifndef LISP_FEATURE_WIN32
+    /* Done during create_os_thread() on Win32. */
     th->os_thread=thread_self();
+    /* Stack guard page defaults to being protected on Win32 */
     protect_control_stack_guard_page(1);
+#endif
     /* Since GC can only know about this thread from the all_threads
      * list and we're just adding this thread to it there is no danger
      * of deadlocking even with SIG_STOP_FOR_GC blocked (which it is
      * not). */
-    lock_ret = pthread_mutex_lock(&all_threads_lock);
+    lock_ret = thread_mutex_lock(&all_threads_lock);
     gc_assert(lock_ret == 0);
     link_thread(th);
-    lock_ret = pthread_mutex_unlock(&all_threads_lock);
+    lock_ret = thread_mutex_unlock(&all_threads_lock);
     gc_assert(lock_ret == 0);
 
     result = funcall0(function);
 
+    /* FIXME: The rest of this function needs re-thinking for Win32 */
+
     /* Block GC */
     block_blockable_signals();
     th->state=STATE_DEAD;
 
     /* SIG_STOP_FOR_GC is blocked and GC might be waiting for this
      * thread, but since we are already dead it won't wait long. */
-    lock_ret = pthread_mutex_lock(&all_threads_lock);
+    lock_ret = thread_mutex_lock(&all_threads_lock);
     gc_assert(lock_ret == 0);
 
     gc_alloc_update_page_tables(0, &th->alloc_region);
     unlink_thread(th);
-    pthread_mutex_unlock(&all_threads_lock);
+    thread_mutex_unlock(&all_threads_lock);
     gc_assert(lock_ret == 0);
 
     if(th->tls_cookie>=0) arch_os_thread_cleanup(th);
@@ -487,7 +514,12 @@
 #endif
 
 void create_initial_thread(lispobj initial_function) {
-    struct thread *th=create_thread_struct(initial_function);
+    struct thread *th;
+
+#if defined(LISP_FEATURE_WIN32) && defined(LISP_FEATURE_SB_THREAD)
+    all_threads_lock = CreateMutex(NULL, FALSE, NULL);
+#endif
+    th = create_thread_struct(initial_function);
     if(th) {
 #ifdef LISP_FEATURE_MACH_EXCEPTION_HANDLER
         kern_return_t ret;
@@ -500,27 +532,36 @@
 
 #ifdef LISP_FEATURE_SB_THREAD
 
-#ifndef __USE_XOPEN2K
+#if !defined(__USE_XOPEN2K) && !defined(LISP_FEATURE_WIN32)
 extern int pthread_attr_setstack (pthread_attr_t *__attr, void *__stackaddr,
                                   size_t __stacksize);
 #endif
 
+#ifndef LISP_FEATURE_WIN32
 boolean create_os_thread(struct thread *th,os_thread_t *kid_tid)
+#else
+boolean create_os_thread(struct thread *th,long *kid_tid)
+#endif
 {
     /* The new thread inherits the restrictive signal mask set here,
      * and enables signals again when it is set up properly. */
+#ifndef LISP_FEATURE_WIN32
     pthread_attr_t attr;
     sigset_t newset,oldset;
-    boolean r=1;
     int retcode, initcode, sizecode, addrcode;
+#else
+    HANDLE thread_handle;
+#endif
+    boolean r=1;
 
     FSHOW_SIGNAL((stderr,"/create_os_thread: creating new thread\n"));
 
 #ifdef LOCK_CREATE_THREAD
-    retcode = pthread_mutex_lock(&create_thread_lock);
+    retcode = thread_mutex_lock(&create_thread_lock);
     gc_assert(retcode == 0);
     FSHOW_SIGNAL((stderr,"/create_os_thread: got lock\n"));
 #endif
+#ifndef LISP_FEATURE_WIN32
     sigemptyset(&newset);
     /* Blocking deferrable signals is enough, no need to block
      * SIG_STOP_FOR_GC because the child process is not linked onto
@@ -550,19 +591,42 @@
         }
         r=0;
     }
+#else
+    /* FIXME: Do we need to close this handle? Is a tid really a
+     * thread id or should it be a handle? */
+    *kid_tid = 1;
+    thread_handle =
+        CreateThread(NULL, 0,
+                     (LPTHREAD_START_ROUTINE)new_thread_trampoline,
+                     th, CREATE_SUSPENDED, NULL);
+    if (!thread_handle) {
+        *kid_tid = 0;
+        fprintf(stderr, "CreateThread() lossage: 0x%08lx.\n", GetLastError());
+        r = 0;
+    }
+    th->os_thread = thread_handle;
+    ResumeThread(thread_handle);
+#endif
 
 #ifdef QUEUE_FREEABLE_THREAD_STACKS
     free_freeable_stacks();
 #endif
+#ifndef LISP_FEATURE_WIN32
     thread_sigmask(SIG_SETMASK,&oldset,0);
+#endif
 #ifdef LOCK_CREATE_THREAD
-    retcode = pthread_mutex_unlock(&create_thread_lock);
+    retcode = thread_mutex_unlock(&create_thread_lock);
     gc_assert(retcode == 0);
     FSHOW_SIGNAL((stderr,"/create_os_thread: released lock\n"));
 #endif
     return r;
 }
 
+#ifdef LISP_FEATURE_WIN32
+/* The return value from create_thread is tested for zero and
+ * discarded.  Don't do that with handles. */
+#  define os_thread_t long
+#endif
 os_thread_t create_thread(lispobj initial_function) {
     struct thread *th;
     os_thread_t kid_tid;
@@ -581,7 +645,11 @@
         return 0;
     }
 }
+#ifdef LISP_FEATURE_WIN32
+#  undef os_thread_t
+#endif
 
+#ifndef LISP_FEATURE_WIN32
 /* Send the signo to os_thread, retry if the rt signal queue is
  * full. */
 int
@@ -603,9 +671,11 @@
     }
     return r;
 }
+#endif
 
 int signal_interrupt_thread(os_thread_t os_thread)
 {
+#ifndef LISP_FEATURE_WIN32
     int status = kill_thread_safely(os_thread, SIG_INTERRUPT_THREAD);
     if (status == 0) {
         return 0;
@@ -615,6 +685,7 @@
         lose("cannot send SIG_INTERRUPT_THREAD to thread=%lu: %d, %s\n",
              os_thread, status, strerror(status));
     }
+#endif
 }
 
 /* stopping the world is a two-stage process.  From this thread we signal
@@ -635,7 +706,7 @@
      * on FreeBSD. */
     FSHOW_SIGNAL((stderr,"/gc_stop_the_world:waiting on create_thread_lock, thread=%lu\n",
                   th->os_thread));
-    lock_ret = pthread_mutex_lock(&create_thread_lock);
+    lock_ret = thread_mutex_lock(&create_thread_lock);
     gc_assert(lock_ret == 0);
     FSHOW_SIGNAL((stderr,"/gc_stop_the_world:got create_thread_lock, thread=%lu\n",
                   th->os_thread));
@@ -643,11 +714,12 @@
     FSHOW_SIGNAL((stderr,"/gc_stop_the_world:waiting on lock, thread=%lu\n",
                   th->os_thread));
     /* keep threads from starting while the world is stopped. */
-    lock_ret = pthread_mutex_lock(&all_threads_lock);      \
+    lock_ret = thread_mutex_lock(&all_threads_lock);      \
     gc_assert(lock_ret == 0);
 
     FSHOW_SIGNAL((stderr,"/gc_stop_the_world:got lock, thread=%lu\n",
                   th->os_thread));
+#ifndef LISP_FEATURE_WIN32
     /* stop all other threads by sending them SIG_STOP_FOR_GC */
     for(p=all_threads; p; p=p->next) {
         gc_assert(p->os_thread != 0);
@@ -676,6 +748,7 @@
         }
     }
     FSHOW_SIGNAL((stderr,"/gc_stop_the_world:end\n"));
+#endif
 }
 
 void gc_start_the_world()
@@ -698,6 +771,13 @@
                           p->os_thread));
             p->state=STATE_RUNNING;
 
+#ifdef LISP_FEATURE_WIN32
+            if (ResumeThread(p->os_thread) == -1) {
+                status = GetLastError();
+                lose("cannot resume thread=%lu: %ld\n",
+                     p->os_thread,status);
+            }
+#else
 #if defined(SIG_RESUME_FROM_GC)
             status=kill_thread_safely(p->os_thread,SIG_RESUME_FROM_GC);
 #else
@@ -707,6 +787,7 @@
                 lose("cannot resume thread=%lu: %d, %s\n",
                      p->os_thread,status,strerror(status));
             }
+#endif
         }
     }
     /* If we waited here until all threads leave STATE_SUSPENDED, then
@@ -714,10 +795,10 @@
      * performance implications, but does away with the 'rt signal
      * queue full' problem. */
 
-    lock_ret = pthread_mutex_unlock(&all_threads_lock);
+    lock_ret = thread_mutex_unlock(&all_threads_lock);
     gc_assert(lock_ret == 0);
 #ifdef LOCK_CREATE_THREAD
-    lock_ret = pthread_mutex_unlock(&create_thread_lock);
+    lock_ret = thread_mutex_unlock(&create_thread_lock);
     gc_assert(lock_ret == 0);
 #endif
 
Index: src/runtime/thread.h
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/runtime/thread.h,v
retrieving revision 1.28
diff -u -r1.28 thread.h
--- src/runtime/thread.h	26 Dec 2006 23:10:24 -0000	1.28
+++ src/runtime/thread.h	24 Mar 2007 22:27:39 -0000
@@ -111,7 +111,7 @@
 
 static inline struct thread *arch_os_get_current_thread() {
 #if defined(LISP_FEATURE_SB_THREAD)
-#if defined(LISP_FEATURE_X86)
+#if defined(LISP_FEATURE_X86) && !defined(LISP_FEATURE_X86_EBX_THREADS)
     register struct thread *me=0;
     if(all_threads) {
 #if defined(LISP_FEATURE_DARWIN) && defined(LISP_FEATURE_RESTORE_FS_SEGMENT_REGISTER_FROM_TLS)
@@ -136,6 +136,8 @@
                  : "i" (offsetof (struct thread,this)));
     }
     return me;
+#elif defined(LISP_FEATURE_WIN32)
+    return TlsGetValue(specials);
 #else
     return pthread_getspecific(specials);
 #endif /* x86 */
@@ -150,12 +152,20 @@
 #endif
 
 #if defined(LISP_FEATURE_SB_THREAD)
+#if !defined(LISP_FEATURE_WIN32)
 #define thread_self pthread_self
 #define thread_kill pthread_kill
 #define thread_sigmask pthread_sigmask
 #define thread_mutex_lock(l) pthread_mutex_lock(l)
 #define thread_mutex_unlock(l) pthread_mutex_unlock(l)
 #else
+#define thread_self GetCurrentThreadId
+#define thread_kill pthread_kill /* want to see uses */
+#define thread_sigmask pthread_sigmask /* what mask? */
+#define thread_mutex_lock(l) WaitForSingleObject(*l, INFINITE)
+#define thread_mutex_unlock(l) (!ReleaseMutex(*l))
+#endif
+#else
 #define thread_self getpid
 #define thread_kill kill
 #define thread_sigmask sigprocmask
Index: src/runtime/win32-os.c
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/runtime/win32-os.c,v
retrieving revision 1.29
diff -u -r1.29 win32-os.c
--- src/runtime/win32-os.c	13 Jan 2007 21:05:34 -0000	1.29
+++ src/runtime/win32-os.c	24 Mar 2007 22:27:39 -0000
@@ -602,4 +602,20 @@
     return copied_string(path);
 }
 
+/* This is thread system hacking. */
+
+int futex_wait(int *lock_word, int oldval)
+{
+    /* Returns 0 if woken via futex_wait or EWOULDBLOCK if futex value not
+     * oldval. Retries if interrupted (does Win32 even do this?).
+     */
+    return 0;
+}
+
+int futex_wake(int *lock_word, int n)
+{
+    /* "Returns the number of processes woken up. */
+    return 0;
+}
+
 /* EOF */
Index: src/runtime/win32-os.h
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/runtime/win32-os.h,v
retrieving revision 1.4
diff -u -r1.4 win32-os.h
--- src/runtime/win32-os.h	13 Jan 2007 21:05:34 -0000	1.4
+++ src/runtime/win32-os.h	24 Mar 2007 22:27:39 -0000
@@ -35,10 +35,17 @@
 
 #define SIG_MEMORY_FAULT SIGSEGV
 
+#if 0
 #define SIG_INTERRUPT_THREAD (SIGRTMIN)
 #define SIG_STOP_FOR_GC (SIGRTMIN+1)
 #define SIG_DEQUEUE (SIGRTMIN+2)
 #define SIG_THREAD_EXIT (SIGRTMIN+3)
+#else
+#define SIG_INTERRUPT_THREAD 0
+#define SIG_STOP_FOR_GC 1
+#define SIG_DEQUEUE 2
+#define SIG_THREAD_EXIT 3
+#endif
 
 struct lisp_exception_frame {
     struct lisp_exception_frame *next_frame;
Index: src/runtime/x86-assem.S
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/runtime/x86-assem.S,v
retrieving revision 1.39
diff -u -r1.39 x86-assem.S
--- src/runtime/x86-assem.S	21 Jan 2007 03:48:23 -0000	1.39
+++ src/runtime/x86-assem.S	24 Mar 2007 22:27:39 -0000
@@ -195,6 +195,9 @@
 #ifndef LISP_FEATURE_WIN32
 	movl    %esp,ALIEN_STACK + SYMBOL_VALUE_OFFSET
 	movl    GNAME(all_threads),%eax
+#ifdef	LISP_FEATURE_X86_EBX_THREADS
+	pushl	%eax
+#endif
 	movl    THREAD_CONTROL_STACK_START_OFFSET(%eax) ,%esp
 	/* don't think too hard about what happens if we get interrupted
 	* here */
@@ -216,6 +219,16 @@
 GNAME(call_into_lisp):
 	pushl	%ebp		# Save old frame pointer.
 	movl	%esp,%ebp	# Establish new frame.
+#ifdef LISP_FEATURE_X86_EBX_THREADS
+	pushl	GNAME(specials)
+#ifndef LISP_FEATURE_WIN32
+	call	GNAME(pthread_getspecific)
+	popl	%ecx		# Garbage	
+#else
+	call	_TlsGetValue@4
+#endif
+	pushl	%eax
+#endif
 Lstack:
 /* Save the NPX state */
 	fwait			# Catch any pending NPX exceptions.
@@ -274,11 +287,15 @@
 Ldone:	
 	/* Registers eax, ecx, edx, edi, and esi are now live. */
 
+#ifdef LISP_FEATURE_X86_EBX_THREADS
+	/* This needs doing before the SEH frame on Win32. */
+	mov	-4(%ebp),%ebx	# Load thread context pointer.
+#endif
+
 #ifdef LISP_FEATURE_WIN32
 	/* Establish an SEH frame. */
 #ifdef LISP_FEATURE_SB_THREAD
-	/* FIXME: need to save BSP here. */
-#error "need to save BSP here, but don't know how yet."
+	pushl	THREAD_BINDING_STACK_POINTER_OFFSET(%ebx)
 #else
 	pushl	BINDING_STACK_POINTER + SYMBOL_VALUE_OFFSET
 #endif
@@ -287,18 +304,38 @@
 	movl	%esp, %fs:0
 #endif
 
+#ifdef LISP_FEATURE_X86_EBX_THREADS
+	/* Alloc new frame. */
+	push	%ebp		# fp in save location S0
+	sub	$8,%esp		# Ensure 3 slots are allocated, one above.
+	lea	12(%esp),%ebp	# Switch to new frame.
+#else
 	/* Alloc new frame. */
 	mov	%esp,%ebx	# The current sp marks start of new frame.
 	push	%ebp		# fp in save location S0
 	sub	$8,%esp		# Ensure 3 slots are allocated, one above.
 	mov	%ebx,%ebp	# Switch to new frame.
+#endif
+
+#ifdef LISP_FEATURE_X86_TWO_ARG_PASSING_REGS
+	mov	%esi,-12(%ebp)	# Write third arg back down.
+#endif
+/*
+ * #ifdef LISP_FEATURE_X86_EBX_THREADS
+ * 	mov	%fs:THREAD_THIS_OFFSET,%ebx
+ * #endif
+ */
 
 	call	*CLOSURE_FUN_OFFSET(%eax)
 	
 	/* If the function returned multiple values, it will return to
 	   this point.  Lose them */
 	jnc	LsingleValue
+#ifdef LISP_FEATURE_X86_RESERVE_EBX
+	mov	%esi, %esp
+#else
 	mov	%ebx, %esp
+#endif
 LsingleValue:
 	/* A singled value function returns here */
 
@@ -319,6 +356,10 @@
 /* Restore the NPX state. */
 	frstor  (%esp)
 	addl	$108, %esp
+
+#ifdef LISP_FEATURE_X86_EBX_THREADS
+	popl	%ebp		# Lose the thread pointer
+#endif
 	
 	popl	%ebp		# c-sp
 	movl	%edx,%eax	# c-val
@@ -404,7 +445,11 @@
 	/* Single value return: The eventual return will now use the
 	   multiple values return convention but with a return values
 	   count of one. */
+#ifdef LISP_FEATURE_X86_RESERVE_EBX
+	movl	%esp,%esi	# Setup esi - the ofp.
+#else
 	movl	%esp,%ebx	# Setup ebx - the ofp.
+#endif
 	subl	$4,%esp		# Allocate one stack slot for the return value
 	movl	$4,%ecx		# Setup ecx for one return value.
 	movl	$(NIL),%edi	# default second value
@@ -745,8 +790,12 @@
    But where necessary must save eax, ecx, edx. */
 
 #ifdef LISP_FEATURE_SB_THREAD
+#ifndef LISP_FEATURE_X86_EBX_THREADS
 #define START_REGION %fs:THREAD_ALLOC_REGION_OFFSET
 #else
+#define START_REGION THREAD_ALLOC_REGION_OFFSET(%ebx)
+#endif
+#else
 #define START_REGION GNAME(boxed_region)
 #endif
 		
Index: src/runtime/x86-linux-os.c
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/runtime/x86-linux-os.c,v
retrieving revision 1.29
diff -u -r1.29 x86-linux-os.c
--- src/runtime/x86-linux-os.c	15 Oct 2005 08:40:36 -0000	1.29
+++ src/runtime/x86-linux-os.c	24 Mar 2007 22:27:39 -0000
@@ -76,6 +76,7 @@
 int arch_os_thread_init(struct thread *thread) {
     stack_t sigstack;
 #ifdef LISP_FEATURE_SB_THREAD
+#ifndef LISP_FEATURE_X86_EBX_THREADS
     struct user_desc ldt_entry = {
         1, 0, 0, /* index, address, length filled in later */
         1, MODIFY_LDT_CONTENTS_DATA, 0, 0, 0, 1
@@ -107,6 +108,7 @@
     pthread_mutex_unlock(&modify_ldt_lock);
 
     if(n<0) return 0;
+#endif
     pthread_setspecific(specials,thread);
 #endif
 #ifdef LISP_FEATURE_C_STACK_IS_CONTROL_STACK
@@ -133,6 +135,7 @@
  */
 
 int arch_os_thread_cleanup(struct thread *thread) {
+#ifndef LISP_FEATURE_X86_EBX_THREADS
     struct user_desc ldt_entry = {
         0, 0, 0,
         0, MODIFY_LDT_CONTENTS_DATA, 0, 0, 0, 0
@@ -144,6 +147,7 @@
     result = modify_ldt(1, &ldt_entry, sizeof (ldt_entry));
     thread_mutex_unlock(&modify_ldt_lock);
     return result;
+#endif
 }
 
 
Index: src/runtime/x86-win32-os.c
===================================================================
RCS file: /cvsroot/sbcl/sbcl/src/runtime/x86-win32-os.c,v
retrieving revision 1.5
diff -u -r1.5 x86-win32-os.c
--- src/runtime/x86-win32-os.c	1 Dec 2006 20:47:05 -0000	1.5
+++ src/runtime/x86-win32-os.c	24 Mar 2007 22:27:39 -0000
@@ -86,40 +86,7 @@
     }
 
 #ifdef LISP_FEATURE_SB_THREAD
-    /* this must be called from a function that has an exclusive lock
-     * on all_threads
-     */
-    struct user_desc ldt_entry = {
-        1, 0, 0, /* index, address, length filled in later */
-        1, MODIFY_LDT_CONTENTS_DATA, 0, 0, 0, 1
-    };
-    int n;
-    get_spinlock(&modify_ldt_lock,thread);
-    n=modify_ldt(0,local_ldt_copy,sizeof local_ldt_copy);
-    /* get next free ldt entry */
-
-    if(n) {
-        u32 *p;
-        for(n=0,p=local_ldt_copy;*p;p+=LDT_ENTRY_SIZE/sizeof(u32))
-            n++;
-    }
-    ldt_entry.entry_number=n;
-    ldt_entry.base_addr=(unsigned long) thread;
-    ldt_entry.limit=dynamic_values_bytes;
-    ldt_entry.limit_in_pages=0;
-    if (modify_ldt (1, &ldt_entry, sizeof (ldt_entry)) != 0) {
-        modify_ldt_lock=0;
-        /* modify_ldt call failed: something magical is not happening */
-        return -1;
-    }
-    __asm__ __volatile__ ("movw %w0, %%fs" : : "q"
-                          ((n << 3) /* selector number */
-                           + (1 << 2) /* TI set = LDT */
-                           + 3)); /* privilege level */
-    thread->tls_cookie=n;
-    modify_ldt_lock=0;
-
-    if(n<0) return 0;
+    TlsSetValue(specials, thread);
 #endif
 
     return 1;
