--- invert_paul.c	2015-05-05 09:43:42.623559595 +0200
+++ invert_mine.c	2015-05-07 18:25:22.217387220 +0200
@@ -39,38 +39,38 @@
     }
   else if (n == 2)
     {
-      mp_limb_t tp[4], up[2], sp[2], cy;
+      mp_limb_t tp[3], cy;
 
-      tp[0] = ZERO;
       invert_limb (xp[1], ap[1]);
-      tp[3] = mpn_mul_1 (tp + 1, ap, 2, xp[1]);
-      cy = mpn_add_n (tp + 2, tp + 2, ap, 2);
+      /* cy, here, plays the role of tp[3] */
+      cy = mpn_mul_1 (tp + 1, ap, 2, xp[1]);
+      add_ssaaaa (cy, tp[2], cy, tp[2], ap[1], ap[0]);
+      /* We want cy == GMP_NUMB_MASK, i.e. cy + 1 == 0 */
+      ++ cy;
+      ASSERT (cy <= ONE + ONE);
       while (cy) /* Xh is too large */
 	{
-	  xp[1] --;
-	  cy -= mpn_sub (tp + 1, tp + 1, 3, ap, 2);
-	}
+          -- xp[1];
+          cy -= mpn_sub_n (tp + 1, tp + 1, ap, 2);
+	};
       /* tp[3] should be 111...111 */
 
-      mpn_com_n (sp, tp + 1, 2);
-      cy = mpn_add_1 (sp, sp, 2, ONE);
-      /* cy should be 0 */
-
-      up[1] = mpn_mul_1 (up, sp + 1, 1, xp[1]);
-      cy = mpn_add_1 (up + 1, up + 1, 1, sp[1]);
-      /* cy should be 0 */
-      xp[0] = up[1];
+      cy = - tp[2] - (tp[1] != ZERO);
+
+      {
+        mp_limb_t dummy; /* We could use tp[0] for dummy */
+        umul_ppmm (xp[0], dummy, cy, xp[1]);
+      }
+      xp[0] += cy;
+      ASSERT (xp[0] >= cy); /* No carry */
 
       /* update tp */
-      cy = mpn_addmul_1 (tp, ap, 2, xp[0]);
-      cy = mpn_add_1 (tp + 2, tp + 2, 2, cy);
-      do
-	{
-	  cy = mpn_add (tp, tp, 4, ap, 2);
-	  if (cy == ZERO)
-	    mpn_add_1 (xp, xp, 2, ONE);
-	}
-      while (cy == ZERO);
+      tp[0] = ZERO;
+      /* cy, here, plays the role of tp[2] */
+      cy = tp[2] + mpn_addmul_1 (tp, ap, 2, xp[0]);
+      ASSERT (tp[2] <= cy); /* No carry */
+      while ((cy += mpn_add_n (tp, tp, ap, 2)) & GMP_NUMB_MASK )
+        MPN_INCR_U (xp, 2, ONE);
 
       /* now A*X < B^4 <= A*(X+1) */
     }
@@ -87,8 +87,8 @@
       mpn_invert2 (xp + l, ap + l, h);
 
       TMP_MARK;
-      tp = TMP_ALLOC_LIMBS (n + h);
-      up = TMP_ALLOC_LIMBS (2 * h);
+      tp = TMP_ALLOC_LIMBS (n + 3 * h); /* tp itself uses the first n + h limbs */
+      up = tp + n + h; /* up uses the remaining 2 * h limbs */
       mpn_mul (tp, ap, n, xp + l, h);
       cy = mpn_add_n (tp + h, tp + h, ap, n);
       while (cy)
@@ -104,7 +104,8 @@
       mpn_add_nc (xp, up + 2*h - l, tp + h, l, cy);
       if (up[2*h-l-1] + 3 <= CNST_LIMB(2)) /* X might be off by 1 */
         {
-          mp_ptr vp = TMP_ALLOC_LIMBS (n + n);
+          mp_ptr vp = tp;
+          assert (n + n <= n + 3 * h);
           mpn_mul_n (vp, ap, xp, n);
           cy = mpn_add_n (vp + n, vp + n, ap, n);
           assert (cy == 0); /* A*X should be less than B^(2n) */