623 files changed, 88937 insertions, 283 deletions
diff --git a/CHANGES b/CHANGES
new file mode 100644
index 0000000..ffe9ff2
--- /dev/null
+++ b/CHANGES
@@ -0,0 +1,519 @@
+0.47 - Thurs Dec 8 2005
+
+- SECURITY: fix for buffer allocation error in server code, could potentially
+  allow authenticated users to gain elevated privileges. All multi-user systems
+  running the server should upgrade (or apply the patch available on the
+  Dropbear webpage).
+
+- Fix channel handling code so that redirecting to /dev/null doesn't use
+  100% CPU.
+
+- Turn on zlib compression for dbclient.
+
+- Set "low delay" TOS bit, can significantly improve interactivity
+  over some links.
+
+- Added client keyboard-interactive mode support, allows operation with
+  newer OpenSSH servers in default config.
+
+- Log when pubkey auth fails because of bad ~/.ssh/authorized_keys permissions
+
+- Improve logging of assertions
+
+- Added aes-256 cipher and sha1-96 hmac.
+
+- Fix twofish so that it actually works.
+
+- Improve PAM prompt comparison.
+
+- Added -g (dbclient) and -a (dropbear server) options to allow
+  connections to listening forwarded ports from remote machines.
+
+- Various other minor fixes
+
+- Compile fixes for glibc 2.1 (ss_family vs __ss_family) and NetBSD
+  (netinet/in_systm.h needs to be included).
+
+0.46 - Sat July 9 2005
+
+- Fix long-standing bug which caused connections to be closed if an ssh-agent
+  socket was no longer available
+
+- Print a warning if we seem to be blocking on /dev/random 
+  (suggested by Paul Fox)
+
+- Fixed a memory leak in DSS code (thanks to Boris Berezovsky for the patch)
+
+- dbclient -L no longer segfaults, allocate correct buffer size (thanks
+  to David Cook for reporting it, and Christopher Faylor for independently
+  sending in a patch)
+
+- Added RSA blinding to signing code (suggested by Dan Kaminsky)
+
+- Rearranged bignum reading/random generation code
+
+- Reset the non-blocking status on stderr and stdout as well as stdin,
+  fixes a problem where the shell running dbclient will exit (thanks to 
+  Brent Roman for reporting it)
+
+- Fix so that all file descriptors are closed so the child shell doesn't
+  inherit descriptors (thanks to Linden May for the patch)
+
+- Change signkey.c to avoid gcc 4 generating incorrect code
+
+- After both sides of a file descriptor have been shutdown(), close()
+  it to avoid leaking descriptors (thanks to Ari Hyttinen for a patch)
+
+- Update to LibTomCrypt 1.05 and LibTomMath 0.35
+
+0.45 - Mon March 7 2005
+
+- Makefile no longer appends 'static' to statically linked binaries
+
+- Add optional SSH_ASKPASS support to the client
+
+- Respect HOST_LOOKUP option
+
+- Fix accidentally removed "return;" statement which was removed in 0.44
+  (causing clients which sent an empty terminal-modes string to fail to
+  connect - including pssh, ssh.com, danger hiptop). (patches
+  independently from Paul Fox, David Horwitt and Sven-Ola Tuecke)
+
+- Read "y/n" response for fingerprints from /dev/tty directly so that dbclient
+  will work with scp.
+
+0.44 - Mon Jan 3 2005
+
+- SECURITY: Fix for PAM auth so that usernames are logged and conversation
+  function responses are allocated correctly - all 0.44test4 users with PAM
+  compiled in (not default) are advised to upgrade.
+
+- Fix calls to getnameinfo() for compatibility with Solaris
+
+- Pristine compilation works (run 'configure' from a fresh dir and make it
+  there)
+
+- Fixes for compiling with most options disabled.
+
+- Upgraded to LibTomCrypt 0.99 and LibTomMath 0.32
+
+- Make sure that zeroing out of values in LTM and LTC won't get optimised away
+
+- Removed unused functions from loginrec.c
+
+- /dev/random is now the default entropy source rather than /dev/urandom
+
+- Logging of IPs in auth success/failure messages for improved greppability
+
+- Fix dbclient so that "scp -i keyfile" works. (It can handle "-ikeyfile
+  properly)
+
+- Avoid a race in server shell-handling code which prevents the exit-code
+  from being returned to the client in some circumstances.
+
+- Makefile modified so that install target works correctly (doesn't try
+  to install "all" binary) - patch from Juergen Daubert
+
+- Various minor fixes and compile warnings.
+
+0.44test4 - Tue Sept 14 2004 21:15:54 +0800
+
+- Fix inetd mode so it actually loads the hostkeys (oops)
+
+- Changed DROPBEAR_DEFPORT properly everywhere
+
+- Fix a small memory leak in the auth code
+
+- WCOREDUMP is only used on systems which support it (ie not cygwin or AIX)
+
+- Check (and fail for) cases when we can't negotiate algorithms with the
+  remote side successfully (rather than bombing out ungracefully)
+
+- Handle authorized_keys files without a terminating newline
+
+- Fiddle the channel receive window size for possibly better performance
+
+- Added in the PAM authentication code (finally! thanks to Martin Carlsson)
+
+0.44test3 - Fri Aug 27 22:20:54 +0800
+
+- Fixed a bunch of warnings.
+
+- scp works correctly when passed a username (fix for the dbclient program
+  itself as well, "-lmatt" works as well as "-l matt").
+
+- Remove unrequired debian files
+
+- Exit with the remote process's return code for dbclient
+
+- Display stderr messages from the server in the client
+
+- Add circular buffering to the channel code. This should dramatically reduce
+  the amount of backtraffic sent in response to traffic incoming to the
+  Dropbear end - improves high-latency performance (ie dialup).
+
+- Various other related channel-handling fixups.
+
+- Allow leading lines in the banner when connecting to servers
+
+- Fixed printing out errors onto the network socket with stderr (for inetd
+  mode when using xinetd)
+
+- Remove obselete documentation
+
+- Fix a null-pointer exception when trying to free non-existant listeners
+  at cleanup.
+
+- DEBUG_TRACE now only works if you add "-v" to the program commandline
+
+- Don't leave stdin non-blocking on exit - this caused the parent shell
+  of dbclient to close when dbclient exited, for some shells in BusyBox
+
+- Server connections no longer timeout after 5 minutes
+
+- Fixed stupid DSS hostkey typo (server couldn't load host keys)
+
+0.44test2 - Tues Aug 17 2004 17:43:54 +0800
+
+- Fix up dropbearmulti targets in the Makefile - symlinks are now created
+
+- Compile fake-rfc2553 even with dropbearconvert/dropbearkey - this 
+  allows them to work on platforms without a native getaddrinfo()
+
+- Create ~/.ssh/known_hosts properly if it doesn't exist
+
+- Fix basename() function prototype
+
+- Backport some local changes (more #ifdefs for termcodes.c, a fix for missing
+  defines on AIX).
+
+- Let dbclient be run as "ssh"
+
+- Initialise mp_ints by default
+
+0.44test1 - Sun Aug 16 2005 17:43:54 +0800
+
+- TESTING RELEASE - this is the first public release of the client codebase,
+  so there are sure to be bugs to be found. In addition, if you're just using
+  the server portion, the final binary size probably will increase - I'll
+  be trying to get it back down in future releases.
+
+- Dropbear client added - lots of changes to the server code as well to 
+  generalise things
+
+- IPv6 support added for client, server, and forwarding
+
+- New makefile with more generic support for multiple-program binaries
+
+0.43 - Fri Jul 16 2004 17:44:54 +0800
+
+- SECURITY: Don't try to free() uninitialised variables in DSS verification
+  code. Thanks to Arne Bernin for pointing out this bug. This is possibly
+  exploitable, all users with DSS and pubkey-auth compiled in are advised to
+  upgrade.
+
+- Clean up agent forwarding socket files correctly, patch from Gerrit Pape.
+
+- Don't go into an infinite loop when portforwarding to servers which don't
+  send any initial data/banner. Patch from Nikola Vladov
+
+- Fix for network vs. host byte order in logging remote TCP ports, also
+  from Gerrit Pape.
+
+- Initialise many pointers to NULL, for general safety. Also checked cleanup
+  code for mp_ints (related to security issues above).
+
+0.42 - Wed Jun 16 2004 12:44:54 +0800
+
+- Updated to Gerrit Pape's official Debian subdirectory
+
+- Fixed bad check when opening /dev/urandom - thanks to Danny Sung.
+
+- Added -i inetd mode flag, and associated options in options.h . Dropbear
+  can be compiled with either normal mode, inetd, or both modes. Thanks
+  to Gerrit Pape for basic patch and motivation.
+
+- Use <dirent.h> rather than <sys/dir.h> for POSIX compliance. Thanks to Bill
+  Sommerfield.
+
+- Fixed a TCP forwarding (client-local, -L style) bug which caused the whole
+  session to close if the TCP connection failed. Thanks to Andrew Braund for
+  reporting it and helping track it down.
+
+- Re-enable sigpipe for child processes. Thanks to Gerrit Pape for some
+  suggestions, and BSD manpages for a clearer explanation of the behaviour.
+
+- Added manpages, thanks to Gerrit Pape.
+
+- Changed license text for LibTomCrypt and LibTomMath.
+
+- Added strip-static target
+
+- Fixed a bug in agent-forwarding cleanup handler - would segfault
+  (dereferencing a null pointer) if agent forwarding had failed.
+
+- Fix behaviour of authorized_keys parsing, so larger (>1024 bit) DSA keys will
+  work. Thanks to Dr. Markus Waldeck for the report. 
+
+- Fixed local port forwarding code so that the "-j" option will make forwarding
+  attempts fail more gracefully.
+
+- Allow repeated requests in a single session if previous ones fail - this fixes  PuTTY and some other SCP clients, which try SFTP, then fall-back to SCP if it
+  isn't available. Thanks to Stirling Westrup for the report.
+
+- Updated to LibTomCrypt 0.96 and LibTomMath 0.30. The AES code now uses
+  smaller non-precomputed tables if DROPBEAR_SMALL_CODE is defined in
+  options.h, leading to a significant reduction in the binary size.
+
+0.41 - Mon Jan 19 2004 22:40:19 +0800
+
+- Fix in configure so that cross-compiling works, thanks to numerous people for
+  reporting and testing
+
+- Terminal mode parsing now handles empty terminal mode strings (sent by
+  Windows ssh.com clients), thanks to Ricardo Derbes for the report
+
+- Handling is improved for users with no shell specified in /etc/passwd,
+  thanks again to Ricardo Derbes
+
+- Fix for compiling with --disable-syslog, thanks to gordonfh
+
+- Various minor fixes allow scp to work with irix, thanks to Paul Marinceu for
+  fixing it up
+
+- Use <stropts.h> not <sys/stropts.h>, since the former seems more common
+
+0.40 - Tue Jan 13 2004 21:05:19 +0800
+
+- Remote TCP forwarding (-R) style implemented
+
+- Local and remote TCP forwarding can each be disabled at runtime (-k and -j
+  switches)
+
+- Fix for problems detecting openpty() with uClibc - many thanks to various
+  people for reporting and testing fixes, including (in random order) Cristian
+  Ionescu-Idbohrn, James Ewing, Steve Dover, Thomas Lundquist and Frederic
+  Lavernhe
+
+- Improved portability for IRIX, thanks to Paul Marinceu
+
+- AIX and HPUX portability fixes, thanks to Darren Tucker for patches
+
+- prngd should now work correctly, thanks to Darren Tucker for the patch
+
+- scp compilation on systems without strlcpy() is fixed, thanks to Peter
+  Jannesen and David Muse for reporting it (independently and simultaneously :)
+
+- Merged in new LibTomCrypt 0.92 and LibTomMath 0.28
+
+0.39 - Tue Dec 16 2003 15:19:19 +0800
+
+- Better checking of key lengths and parameters for DSS and RSA auth
+
+- Print fingerprint of keys used for pubkey auth
+
+- More consistent logging of usernames and IPs
+
+- Added option to disable password auth (or just for root) at runtime
+
+- Avoid including bignum functions which don't give much speed benefit but
+  take up binary size
+
+- Added a stripped down version of OpenSSH's scp binary
+
+- Added additional supporting functions for Irix, thanks to Paul Marinceu
+
+- Don't check for unused libraries in configure script
+
+- Removed trailing comma in algorithm lists (thanks to Mihnea Stoenescu)
+
+- Fixed up channel close handling, always send close packet in response
+  (also thanks to Mihnea Stoenescu)
+
+- Various makefile improvements for cross-compiling, thanks to Friedrich
+  Lobenstock and Mihnea Stoenescu
+
+- Use daemon() function if available (or our own copy) rather than separate
+  code (thanks to Fr�d�ric Lavernhe for the report and debugging, and Bernard
+  Blackham for his suggestion on what to look at)
+
+- Fixed up support for first_kex_packet_follows, required to talk to ssh.com
+  clients. Thanks to Marian Stagarescu for the bug report.
+
+- Avoid using MAXPATHLEN, pointer from Ian Morris
+
+- Improved input sanity checking
+
+0.38 - Sat Oct 11 2003 16:28:13 +0800
+
+- Default hostkey path changed to /etc/dropbear/dropbear_{rsa,dss}_host_key
+  rather than /etc/dropbear_{rsa,dss}_host_key
+
+- Added SMALL and MULTI text files which have info on compiling for multiple
+  binaries or small binaries
+
+- Allow for commandline definition of some options.h settings
+  (without warnings)
+
+- Be more careful handling EINTR
+
+- More fixes for channel closing
+
+- Added multi-binary support
+
+- Improved logging of IPs, now get logged in all cases
+
+- Don't chew cpu when waiting for version identification string, also
+  make sure that we kick off people if they don't auth within 5 minutes.
+
+- Various small fixes, warnings etc
+
+- Display MOTD if requested - suggested by
+  Trent Lloyd <lathiat at sixlabs.org> and
+  Zach White <zwhite at darkstar.frop.org>
+
+- sftp support works (relies on OpenSSH sftp binary or similar)
+
+- Added --disable-shadow option (requested by the floppyfw guys)
+
+0.37 - Wed Sept 24 2003 19:42:12 +0800
+
+- Various portability fixes, fixes for Solaris 9, Tru64 5.1, Mac OS X 10.2,
+  AIX, BSDs
+
+- Updated LibTomMath to 0.27 and LibTomCrypt to 0.90
+
+- Renamed util.{c,h} to dbutil.{c,h} to avoid conflicts with system util.h
+
+- Added some small changes so it'll work with AIX (plus Linux Affinity).
+  Thanks to Shig for them.
+
+- Improved the closing messages, so a clean exit is "Exited normally"
+
+- Added some more robust integer/size checking in buffer.c as a backstop for
+  integer overflows
+
+- X11 forwarding fixed for OSX, path for xauth changed to /usr/X11R6/bin/xauth
+
+- Channel code handles closing more nicely, doesn't sit waiting for an extra
+  keystroke on BSD/OSX platforms, and data is flushed fully before closing
+  child processes (thanks to 
+  Cristian Ionescu-Idbohrn <cristian.ionescu-idbohrn at axis.com> for
+  pointing that out).
+
+- Changed "DISABLE_TCPFWD" to "ENABLE_TCPFWD" (and for x11/auth) so
+  "disable DISABLE_TCPWD" isn't so confusing.
+
+- Fix authorized_keys handling (don't crash on too-long keys, and
+  use fgetc not getc to avoid strange macro-related issues), thanks to
+  Cristian Ionescu-Idbohrn <cristian.ionescu-idbohrn at axis.com> 
+  and Steve Rodgers <hwstar at cox.net> for reporting and testing.
+
+- Fixes to the README with regard to uClibc systems, thanks to 
+  Cristian Ionescu-Idbohrn <cristian.ionescu-idbohrn at axis.com>,
+  as well as general improvements to documentation (split README/INSTALL)
+
+- Fixed up some compilation problems with dropbearconvert/dropbearkey if
+  DSS or RSA were disabled, reported by Patrik Karlsson <patrik at cqure.net>
+
+- Fix double-free bug for hostkeys, reported by
+  Vincent Sanders <vince at kyllikki.org>
+
+- Fix up missing \ns from dropbearconvert help message,
+  thanks to Mordy Ovits <movits at bloomberg.com> for the patch
+
+0.36 - Tue August 19 2003 12:16:23 +0800
+
+- Fix uninitialised temporary variable in DSS signing code
+  (thanks to Matthew Franz <mdfranz at io.com> for reporting, and the authors
+  of Valgrind for making it easy to track down)
+- Fix remote version-string parsing error
+  (thanks to Bernard Blackham <bernard at blackham.com.au> for noticing)
+- Improved host-algorithm-matching algorithm in algo.c
+- Decreased MAX_STRING_LEN to a more realistic value
+- Fix incorrect version (0.34) in this CHANGES file for the previous release.
+
+0.35 - Sun August 17 2003 05:37:47 +0800
+
+- Fix for remotely exploitable format string buffer overflow.
+  (thanks to Joel Eriksson <je at bitnux.com>)
+
+0.34 - Fri August 15 2003 15:10:00 +0800
+
+- Made syslog optional, both at compile time and as a compile option
+  (suggested by Laurent Bercot <ska at skarnet.org>)
+- Fixup for bad base64 parsing in authorized_keys
+  (noticed by Davyd Madeley <davyd at zdlcomputing.com>)
+- Added initial tcp forwarding code, only -L (local) at this stage
+- Improved "make install" with DESTDIR and changing ownership seperately,
+  don't check for setpgrp on Linux for crosscompiling.
+  (from Erik Andersen <andersen at codepoet.org>)
+- More commenting, fix minor compile warnings, make return values more
+  consistent etc
+- Various signedness fixes
+- Can listen on multiple ports
+- added option to disable openpty with configure script,
+  (from K.-P. Kirchd�rfer <kapeka at epost.de>)
+- Various cleanups to bignum code
+  (thanks to Tom St Denis <tomstdenis at iahu.ca>)
+- Fix compile error when disabling RSA
+  (from Marc Kleine-Budde <kleine-budde at gmx.de>)
+- Other cleanups, splitting large functions for packet and kex handling etc
+
+0.33 - Sun June 22 2003 22:24:12 +0800
+
+- Fixed some invalid assertions in the channel code, fixing the server dying
+  when forwarding X11 connections.
+- Add dropbearconvert to convert to/from OpenSSH host keys and Dropbear keys
+- RSA keys now keep p and q parameters for compatibility -- old Dropbear keys
+  still work, but can't be converted to OpenSSH etc.
+- Debian packaging directory added, thanks to 
+  Grahame (grahame at angrygoats.net)
+- 'install' target added to the makefile
+- general tidying, improve consistency of functions etc
+- If RSA or DSS hostkeys don't exist, that algorithm won't be used.
+- Improved RSA and DSS key generation, more efficient and fixed some minor bugs
+  (thanks to Tom St Denis for the advice)
+- Merged new versions of LibTomCrypt (0.86) and LibTomMath (0.21)
+
+0.32 - Sat May 24 2003 12:44:11 +0800
+
+- Don't compile unused code from libtomcrypt (test vectors etc)
+- Updated to libtommath 0.17 and libtomcrypt 0.83. New libtommath results
+  in smaller binary size, due to not linking unrequired code
+- X11 forwarding added
+- Agent forwarding added (for OpenSSH.com ssh client/agent)
+- Fix incorrect buffer freeing when banners are used
+- Hostname resolution works
+- Various minor bugfixes/code size improvements etc
+
+0.31 - Fri May 9 2003 17:57:16 +0800
+
+- Improved syslog messages - IP logging etc
+- Strip control characters from log messages (specified username currently)
+- Login recording (utmp/wtmp) support, so last/w/who work - taken from OpenSSH
+- Shell is started as a proper login shell, so /etc/profile etc is sourced
+- Ptys work on Solaris (2.8 x86 tested) now
+- Fixed bug in specifying the rsa hostkey
+- Fixed bug in compression code, could trigger if compression resulted in
+  larger output than input (uncommon but possible).
+
+0.30 - Thu Apr 17 2003 18:46:15 +0800
+
+- SECURITY: buffer.c had bad checking for buffer increment length - fixed
+- channel code now closes properly on EOF - scp processes don't hang around
+- syslog support added - improved auth/login/failure messages
+- general code tidying, made return codes more consistent
+- Makefile fixed for dependencies and makes libtomcrypt as well
+- Implemented sending SSH_MSG_UNIMPLEMENTED :)
+
+0.29 - Wed Apr 9 2003
+
+- Fixed a stupid bug in 0.28 release, 'newstr = strdup(oldstr)',
+  not 'newstr=oldstr'
+
+0.28 - Sun Apr 6 2003
+
+- Initial public release
+
+Development was started in October 2002
diff --git a/INSTALL b/INSTALL
new file mode 100644
index 0000000..1bf444e
--- /dev/null
+++ b/INSTALL
@@ -0,0 +1,79 @@
+Basic Dropbear build instructions:
+
+- Edit options.h to set which features you want.
+- Edit debug.h if you want any debug options (not usually required).
+
+(If using a non-tarball copy, "autoconf; autoheader")
+
+./configure      (optionally with --disable-zlib or --disable-syslog,
+                  or --help for other options)
+
+Now compile:
+
+make PROGRAMS="dropbear dbclient dropbearkey dropbearconvert scp"
+
+And install (/usr/local/bin is usual default):
+
+make PROGRAMS="dropbear dbclient dropbearkey dropbearconvert scp" install
+
+(you can leave items out of the PROGRAMS list to avoid compiling them. If you
+recompile after changing the PROGRAMS list, you *MUST* "make clean" before
+recompiling - bad things will happen otherwise)
+
+See MULTI for instructions on making all-in-one binaries.
+
+If you want to compile statically, add "STATIC=1" to the make command-line.
+
+Binaries can be strippd with "make strip"
+
+============================================================================
+
+If you're compiling for a 386-class CPU, you will probably need to add
+CFLAGS=-DLTC_NO_BSWAP so that libtomcrypt doesn't use 486+ instructions.
+
+============================================================================
+
+Compiling with uClibc:
+
+Firstly, make sure you have at least uclibc 0.9.17, as getusershell() in prior
+versions is broken. Also note that you may get strange issues if your uClibc
+headers don't match the library you are running with, ie the headers might
+say that shadow password support exists, but the libraries don't have it.
+
+Compiling for uClibc should be the same as normal, just set CC to the magic
+uClibc toolchain compiler (ie export CC=i386-uclibc-gcc or whatever).
+You can use "make STATIC=1" to make statically linked binaries, and it is
+advisable to strip the binaries too. If you're looking to make a small binary,
+you should remove unneeded ciphers and MD5, by editing options.h
+
+It is possible to compile zlib in, by copying zlib.h and zconf.h into a
+subdirectory (ie zlibincludes), and 
+
+export CFLAGS="-Izlibincludes -I../zlibincludes"
+export LDFLAGS=/usr/lib/libz.a
+
+before ./configure and make.
+
+If you disable zlib, you must explicitly disable compression for the client -
+OpenSSH is possibly buggy in this regard, it seems you need to disable it
+globally in ~/.ssh/config, not just in the host entry in that file.
+
+You may want to manually disable lastlog recording when using uClibc, configure
+with --disable-lastlog.
+
+One common problem is pty allocation. There are a number of types of pty
+allocation which can be used -- if they work properly, the end result is the
+same for each type. Running configure should detect the best type to use
+automatically, however for some systems, this may be incorrect. Some
+things to note:
+
+    If your system expects /dev/pts to be mounted (this is a uClibc option),
+	make sure that it is.
+
+	Make sure that your libc headers match the library version you are using.
+
+	If openpty() is being used (HAVE_OPENPTY defined in config.h) and it fails,
+	you can try compiling with --disable-openpty. You will probably then need
+	to create all the /dev/pty?? and /dev/tty?? devices, which can be
+	problematic for devfs. In general, openpty() is the best way to allocate
+	PTYs, so it's best to try and get it working.
diff --git a/LICENSE b/LICENSE
index 5d678c5..e0a11ac 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,5 +1,89 @@
-LibTomCrypt is public domain.  As should all quality software be.
+Dropbear contains a number of components from different sources, hence there
+are a few licenses and authors involved. All licenses are fairly 
+non-restrictive.
 
-Tom St Denis
 
+The majority of code is written by Matt Johnston, under the license below.
 
+Portions of the client-mode work are (c) 2004 Mihnea Stoenescu, under the
+same license:
+
+Copyright (c) 2002-2004 Matt Johnston
+Portions copyright (c) 2004 Mihnea Stoenescu
+All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+=====
+
+LibTomCrypt and LibTomMath are written by Tom St Denis, and are Public Domain.
+
+=====
+
+sshpty.c is taken from OpenSSH 3.5p1, 
+  Copyright (c) 1995 Tatu Ylonen <ylo@cs.hut.fi>, Espoo, Finland
+                     All rights reserved
+ "As far as I am concerned, the code I have written for this software
+  can be used freely for any purpose.  Any derived versions of this
+  software must be clearly marked as such, and if the derived work is
+  incompatible with the protocol description in the RFC file, it must be
+  called by a name other than "ssh" or "Secure Shell". "
+
+=====
+
+loginrec.c
+loginrec.h
+atomicio.h
+atomicio.c
+and strlcat() (included in util.c) are from OpenSSH 3.6.1p2, and are licensed
+under the 2 point BSD license.
+
+loginrec is written primarily by Andre Lucas, atomicio.c by Theo de Raadt.
+
+strlcat() is (c) Todd C. Miller
+
+=====
+
+Import code in keyimport.c is modified from PuTTY's import.c, licensed as
+follows:
+
+PuTTY is copyright 1997-2003 Simon Tatham.
+
+Portions copyright Robert de Bath, Joris van Rantwijk, Delian
+Delchev, Andreas Schultz, Jeroen Massar, Wez Furlong, Nicolas Barry,
+Justin Bradford, and CORE SDI S.A.
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation files
+(the "Software"), to deal in the Software without restriction,
+including without limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of the Software,
+and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE
+FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/MULTI b/MULTI
new file mode 100644
index 0000000..a50e30e
--- /dev/null
+++ b/MULTI
@@ -0,0 +1,26 @@
+Multi-binary compilation
+========================
+
+To compile for systems without much space (floppy distributions etc), you
+can create a single binary. This will save disk space by avoiding repeated
+code between the various parts.
+If you are familiar with "busybox", it's the same principle.
+
+To compile the multi-binary, first "make clean" (if you've compiled
+previously), then
+
+make PROGRAMS="programs you want here" MULTI=1
+
+To use the binary, symlink it from the desired executable:
+
+ln -s dropbearmulti dropbear
+ln -s dropbearmulti dbclient
+etc
+
+then execute as normal:
+
+./dropbear <options here>
+
+"make install" doesn't currently work for multi-binary configuration, though
+in most situations where it is being used, the target and build systems will
+differ.
diff --git a/Makefile.in b/Makefile.in
index e1bee57..fc17c1f 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -1,282 +1,209 @@
-# MAKEFILE for linux GCC
+# This Makefile is for Dropbear SSH Server and Client
+# @configure_input@
+
+# invocation:
+# make PROGRAMS="dropbear dbclient scp" MULTI=1 STATIC=1 SCPPROGRESS=1
 #
-# Tom St Denis
-# Modified by Clay Culver
+# to make a multiple-program statically linked binary "staticdropbearmulti".
+# This example will include dropbear, scp, dropbearkey, dropbearconvert, and
+# dbclient functionality, and includes the progress-bar functionality in scp.
+# Hopefully that seems intuitive.
+
+ifndef PROGRAMS
+	PROGRAMS=dropbear dbclient dropbearkey dropbearconvert
+endif
+
+LTC=libtomcrypt/libtomcrypt.a
+LTM=libtommath/libtommath.a
+
+COMMONOBJS=dbutil.o buffer.o \
+		dss.o bignum.o \
+		signkey.o rsa.o random.o \
+		queue.o \
+		atomicio.o compat.o  fake-rfc2553.o
+
+SVROBJS=svr-kex.o svr-algo.o svr-auth.o sshpty.o \
+		svr-authpasswd.o svr-authpubkey.o svr-session.o svr-service.o \
+		svr-chansession.o svr-runopts.o svr-agentfwd.o svr-main.o svr-x11fwd.o\
+		svr-tcpfwd.o svr-authpam.o
+
+CLIOBJS=cli-algo.o cli-main.o cli-auth.o cli-authpasswd.o cli-kex.o \
+		cli-session.o cli-service.o cli-runopts.o cli-chansession.o \
+		cli-authpubkey.o cli-tcpfwd.o cli-channel.o cli-authinteract.o
 
-# The version
-VERSION=1.05
+CLISVROBJS=common-session.o packet.o common-algo.o common-kex.o \
+			common-channel.o common-chansession.o termcodes.o loginrec.o \
+			tcp-accept.o listener.o process-packet.o \
+			common-runopts.o circbuffer.o
+
+KEYOBJS=dropbearkey.o gendss.o genrsa.o
+
+CONVERTOBJS=dropbearconvert.o keyimport.o
+
+SCPOBJS=scp.o progressmeter.o atomicio.o scpmisc.o
+
+HEADERS=options.h dbutil.h session.h packet.h algo.h ssh.h buffer.h kex.h \
+		dss.h bignum.h signkey.h rsa.h random.h service.h auth.h \
+		debug.h channel.h chansession.h config.h queue.h sshpty.h \
+		termcodes.h gendss.h genrsa.h runopts.h includes.h \
+		loginrec.h atomicio.h x11fwd.h agentfwd.h tcpfwd.h compat.h \
+		listener.h fake-rfc2553.h
+
+dropbearobjs=$(COMMONOBJS) $(CLISVROBJS) $(SVROBJS) 
+dbclientobjs=$(COMMONOBJS) $(CLISVROBJS) $(CLIOBJS)
+dropbearkeyobjs=$(COMMONOBJS) $(KEYOBJS)
+dropbearconvertobjs=$(COMMONOBJS) $(CONVERTOBJS)
+scpobjs=$(SCPOBJS)
 
 VPATH=@srcdir@
 srcdir=@srcdir@
 
-# Compiler and Linker Names
-#CC=gcc
-#LD=ld
-
-# Archiver [makes .a files]
-#AR=ar
-#ARFLAGS=r
-
-# Compilation flags. Note the += does not write over the user's CFLAGS!
-# The rest of the flags come from the parent Dropbear makefile
-CFLAGS += -c -I$(srcdir)/src/headers/ -I$(srcdir)/../
-
-# additional warnings (newer GCC 3.4 and higher)
-#CFLAGS += -Wsystem-headers -Wdeclaration-after-statement -Wbad-function-cast -Wcast-align -Wstrict-prototypes -Wmissing-prototypes \
-#		  -Wmissing-declarations -Wpointer-arith 
-
-# optimize for SPEED
-#CFLAGS += -O3 -funroll-loops
-
-# add -fomit-frame-pointer.  hinders debugging!
-#CFLAGS += -fomit-frame-pointer
-
-# optimize for SIZE
-#CFLAGS += -Os -DLTC_SMALL_CODE
-
-# older GCCs can't handle the "rotate with immediate" ROLc/RORc/etc macros
-# define this to help
-#CFLAGS += -DLTC_NO_ROLC
-
-# compile for DEBUGING (required for ccmalloc checking!!!)
-#CFLAGS += -g3 -DLTC_NO_ASM
-
-#Output filenames for various targets.
-LIBNAME=libtomcrypt.a
-LIBTEST=testprof/libtomcrypt_prof.a
-HASH=hashsum
-CRYPT=encrypt
-SMALL=small
-PROF=x86_prof
-TV=tv_gen
-MULTI=multi
-TIMING=timing
-TEST=test
-
-#LIBPATH-The directory for libtomcrypt to be installed to.
-#INCPATH-The directory to install the header files for libtomcrypt.
-#DATAPATH-The directory to install the pdf docs.
-DESTDIR=
-LIBPATH=/usr/lib
-INCPATH=/usr/include
-DATAPATH=/usr/share/doc/libtomcrypt/pdf
-
-#Who do we install as?
-ifdef INSTALL_USER
-USER=$(INSTALL_USER)
-else
-USER=root
+prefix=@prefix@
+exec_prefix=${prefix}
+bindir=${exec_prefix}/bin
+sbindir=${exec_prefix}/sbin
+
+CC=@CC@
+LD=@LD@
+AR=@AR@
+RANLIB=@RANLIB@
+STRIP=@STRIP@
+INSTALL=@INSTALL@
+CFLAGS=-I. -I$(srcdir)/libtomcrypt/src/headers/ @CFLAGS@
+LIBS=$(LTC) $(LTM) @LIBS@
+LDFLAGS=@LDFLAGS@
+
+EXEEXT=@EXEEXT@
+
+# whether we're building client, server, or both for the common objects.
+# evilness so we detect 'dropbear' by itself as a word
+space:= $(empty) $(empty)
+ifneq (,$(strip $(foreach prog, $(PROGRAMS), $(findstring ZdropbearZ, Z$(prog)Z))))
+	CFLAGS+= -DDROPBEAR_SERVER
 endif
+ifneq (,$(strip $(foreach prog, $(PROGRAMS), $(findstring ZdbclientZ, Z$(prog)Z))))
+	CFLAGS+= -DDROPBEAR_CLIENT
+endif
+
+
+# these are exported so that libtomcrypt's makefile will use them
+export CC
+export CFLAGS
+export RANLIB AR STRIP
 
-ifdef INSTALL_GROUP
-GROUP=$(INSTALL_GROUP)
+ifeq ($(STATIC), 1)
+	LDFLAGS+=-static
+endif
+
+ifeq ($(MULTI), 1)
+	TARGETS=dropbearmulti
 else
-GROUP=wheel
+	TARGETS=$(PROGRAMS)
+endif
+
+# for the scp progress meter. The -D doesn't affect anything else.
+ifeq ($(SCPPROGRESS), 1)
+	CFLAGS+=-DPROGRESS_METER
 endif
 
-#List of objects to compile.
-
-#Leave MPI built-in or force developer to link against libtommath?
-#MPIOBJECT=src/misc/mpi/mpi.o
-#Dropbear uses libtommath
-MPIOBJECT=
-
-OBJECTS=src/ciphers/aes/aes_enc.o $(MPIOBJECT) src/ciphers/aes/aes.o src/ciphers/anubis.o \
-src/ciphers/blowfish.o src/ciphers/cast5.o src/ciphers/des.o src/ciphers/khazad.o src/ciphers/noekeon.o \
-src/ciphers/rc2.o src/ciphers/rc5.o src/ciphers/rc6.o src/ciphers/safer/safer.o \
-src/ciphers/safer/safer_tab.o src/ciphers/safer/saferp.o src/ciphers/skipjack.o \
-src/ciphers/twofish/twofish.o src/ciphers/xtea.o src/encauth/ccm/ccm_memory.o \
-src/encauth/ccm/ccm_test.o src/encauth/eax/eax_addheader.o src/encauth/eax/eax_decrypt.o \
-src/encauth/eax/eax_decrypt_verify_memory.o src/encauth/eax/eax_done.o src/encauth/eax/eax_encrypt.o \
-src/encauth/eax/eax_encrypt_authenticate_memory.o src/encauth/eax/eax_init.o \
-src/encauth/eax/eax_test.o src/encauth/gcm/gcm_add_aad.o src/encauth/gcm/gcm_add_iv.o \
-src/encauth/gcm/gcm_done.o src/encauth/gcm/gcm_gf_mult.o src/encauth/gcm/gcm_init.o \
-src/encauth/gcm/gcm_memory.o src/encauth/gcm/gcm_process.o src/encauth/gcm/gcm_reset.o \
-src/encauth/gcm/gcm_test.o src/encauth/ocb/ocb_decrypt.o src/encauth/ocb/ocb_decrypt_verify_memory.o \
-src/encauth/ocb/ocb_done_decrypt.o src/encauth/ocb/ocb_done_encrypt.o src/encauth/ocb/ocb_encrypt.o \
-src/encauth/ocb/ocb_encrypt_authenticate_memory.o src/encauth/ocb/ocb_init.o src/encauth/ocb/ocb_ntz.o \
-src/encauth/ocb/ocb_shift_xor.o src/encauth/ocb/ocb_test.o src/encauth/ocb/s_ocb_done.o \
-src/hashes/chc/chc.o src/hashes/helper/hash_file.o src/hashes/helper/hash_filehandle.o \
-src/hashes/helper/hash_memory.o src/hashes/helper/hash_memory_multi.o src/hashes/md2.o src/hashes/md4.o \
-src/hashes/md5.o src/hashes/rmd128.o src/hashes/rmd160.o src/hashes/sha1.o src/hashes/sha2/sha256.o \
-src/hashes/sha2/sha512.o src/hashes/tiger.o src/hashes/whirl/whirl.o src/mac/hmac/hmac_done.o \
-src/mac/hmac/hmac_file.o src/mac/hmac/hmac_init.o src/mac/hmac/hmac_memory.o \
-src/mac/hmac/hmac_memory_multi.o src/mac/hmac/hmac_process.o src/mac/hmac/hmac_test.o \
-src/mac/omac/omac_done.o src/mac/omac/omac_file.o src/mac/omac/omac_init.o src/mac/omac/omac_memory.o \
-src/mac/omac/omac_memory_multi.o src/mac/omac/omac_process.o src/mac/omac/omac_test.o \
-src/mac/pelican/pelican.o src/mac/pelican/pelican_memory.o src/mac/pelican/pelican_test.o \
-src/mac/pmac/pmac_done.o src/mac/pmac/pmac_file.o src/mac/pmac/pmac_init.o src/mac/pmac/pmac_memory.o \
-src/mac/pmac/pmac_memory_multi.o src/mac/pmac/pmac_ntz.o src/mac/pmac/pmac_process.o \
-src/mac/pmac/pmac_shift_xor.o src/mac/pmac/pmac_test.o src/misc/base64/base64_decode.o \
-src/misc/base64/base64_encode.o src/misc/burn_stack.o src/misc/crypt/crypt.o \
-src/misc/crypt/crypt_argchk.o src/misc/crypt/crypt_cipher_descriptor.o \
-src/misc/crypt/crypt_cipher_is_valid.o src/misc/crypt/crypt_find_cipher.o \
-src/misc/crypt/crypt_find_cipher_any.o src/misc/crypt/crypt_find_cipher_id.o \
-src/misc/crypt/crypt_find_hash.o src/misc/crypt/crypt_find_hash_any.o \
-src/misc/crypt/crypt_find_hash_id.o src/misc/crypt/crypt_find_prng.o \
-src/misc/crypt/crypt_hash_descriptor.o src/misc/crypt/crypt_hash_is_valid.o \
-src/misc/crypt/crypt_prng_descriptor.o src/misc/crypt/crypt_prng_is_valid.o \
-src/misc/crypt/crypt_register_cipher.o src/misc/crypt/crypt_register_hash.o \
-src/misc/crypt/crypt_register_prng.o src/misc/crypt/crypt_unregister_cipher.o \
-src/misc/crypt/crypt_unregister_hash.o src/misc/crypt/crypt_unregister_prng.o \
-src/misc/error_to_string.o src/misc/mpi/is_prime.o src/misc/mpi/mpi_to_ltc_error.o \
-src/misc/mpi/rand_prime.o src/misc/pkcs5/pkcs_5_1.o src/misc/pkcs5/pkcs_5_2.o src/misc/zeromem.o \
-src/modes/cbc/cbc_decrypt.o src/modes/cbc/cbc_done.o src/modes/cbc/cbc_encrypt.o \
-src/modes/cbc/cbc_getiv.o src/modes/cbc/cbc_setiv.o src/modes/cbc/cbc_start.o \
-src/modes/cfb/cfb_decrypt.o src/modes/cfb/cfb_done.o src/modes/cfb/cfb_encrypt.o \
-src/modes/cfb/cfb_getiv.o src/modes/cfb/cfb_setiv.o src/modes/cfb/cfb_start.o \
-src/modes/ctr/ctr_decrypt.o src/modes/ctr/ctr_done.o src/modes/ctr/ctr_encrypt.o \
-src/modes/ctr/ctr_getiv.o src/modes/ctr/ctr_setiv.o src/modes/ctr/ctr_start.o \
-src/modes/ecb/ecb_decrypt.o src/modes/ecb/ecb_done.o src/modes/ecb/ecb_encrypt.o \
-src/modes/ecb/ecb_start.o src/modes/ofb/ofb_decrypt.o src/modes/ofb/ofb_done.o \
-src/modes/ofb/ofb_encrypt.o src/modes/ofb/ofb_getiv.o src/modes/ofb/ofb_setiv.o \
-src/modes/ofb/ofb_start.o 
-
-HEADERS=src/headers/tommath_superclass.h src/headers/tomcrypt_cfg.h src/headers/tomcrypt_mac.h \
-src/headers/tomcrypt_macros.h src/headers/tomcrypt_custom.h src/headers/tomcrypt_argchk.h \
-src/headers/tomcrypt_cipher.h src/headers/tomcrypt_pk.h src/headers/tommath_class.h \
-src/headers/ltc_tommath.h src/headers/tomcrypt_hash.h src/headers/tomcrypt_misc.h \
-src/headers/tomcrypt.h src/headers/tomcrypt_pkcs.h src/headers/tomcrypt_prng.h testprof/tomcrypt_test.h
-
-TESTOBJECTS=demos/test.o
-HASHOBJECTS=demos/hashsum.o
-CRYPTOBJECTS=demos/encrypt.o
-SMALLOBJECTS=demos/small.o
-TVS=demos/tv_gen.o
-MULTIS=demos/multi.o
-TIMINGS=demos/timing.o
-TESTS=demos/test.o
-
-#Files left over from making the crypt.pdf.
-LEFTOVERS=*.dvi *.log *.aux *.toc *.idx *.ilg *.ind *.out
-
-#Compressed filenames
-COMPRESSED=crypt-$(VERSION).tar.bz2 crypt-$(VERSION).zip
-
-#The default rule for make builds the libtomcrypt library.
-default:library
-
-#ciphers come in two flavours... enc+dec and enc 
-src/ciphers/aes/aes_enc.o: src/ciphers/aes/aes.c src/ciphers/aes/aes_tab.c
-	$(CC) $(CFLAGS) -DENCRYPT_ONLY -c src/ciphers/aes/aes.c -o src/ciphers/aes/aes_enc.o
-
-#These are the rules to make certain object files.
-src/ciphers/aes/aes.o: src/ciphers/aes/aes.c src/ciphers/aes/aes_tab.c
-src/ciphers/twofish/twofish.o: src/ciphers/twofish/twofish.c src/ciphers/twofish/twofish_tab.c
-src/hashes/whirl/whirl.o: src/hashes/whirl/whirl.c src/hashes/whirl/whirltab.c
-src/pk/ecc/ecc.o: src/pk/ecc/ecc.c src/pk/ecc/ecc_sys.c
-src/pk/dh/dh.o: src/pk/dh/dh.c src/pk/dh/dh_sys.c
-src/hashes/sha2/sha512.o: src/hashes/sha2/sha512.c src/hashes/sha2/sha384.c
-src/hashes/sha2/sha256.o: src/hashes/sha2/sha256.c src/hashes/sha2/sha224.c
-
-#This rule makes the libtomcrypt library.
-library: $(LIBNAME)
-
-$(LIBTEST): 
-	cd testprof ; CFLAGS="$(CFLAGS)" make 
-
-$(LIBNAME): $(OBJECTS)
-	$(AR) $(ARFLAGS) $@ $(OBJECTS) 
-	$(RANLIB) $(LIBNAME)
-
-#This rule makes the hash program included with libtomcrypt
-hashsum: library $(HASHOBJECTS)
-	$(CC) $(HASHOBJECTS) $(LIBNAME) -o $(HASH) $(WARN)
-
-#makes the crypt program
-crypt: library $(CRYPTOBJECTS)
-	$(CC) $(CRYPTOBJECTS) $(LIBNAME) -o $(CRYPT) $(WARN)
-
-#makes the small program
-small: library $(SMALLOBJECTS)
-	$(CC) $(SMALLOBJECTS) $(LIBNAME) -o $(SMALL) $(WARN)
-	
-tv_gen: library $(TVS)
-	$(CC) $(TVS) $(LIBNAME) -o $(TV)
-
-multi: library $(MULTIS)
-	$(CC) $(MULTIS) $(LIBNAME) -o $(MULTI)
-
-timing: library $(LIBTEST) $(TIMINGS)
-	$(CC) $(TIMINGS) $(LIBTEST) $(LIBNAME) $(EXTRALIBS) -o $(TIMING)
-
-test: library $(LIBTEST) $(TESTS)
-	$(CC) $(TESTS) $(LIBTEST) $(LIBNAME) -o $(TEST)
-
-
-#This rule installs the library and the header files. This must be run
-#as root in order to have a high enough permission to write to the correct
-#directories and to set the owner and group to root.
-install: library docs
-	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(LIBPATH)
-	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
-	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(DATAPATH)
-	install -g $(GROUP) -o $(USER) $(LIBNAME) $(DESTDIR)$(LIBPATH)
-	install -g $(GROUP) -o $(USER) $(HEADERS) $(DESTDIR)$(INCPATH)
-	install -g $(GROUP) -o $(USER) doc/crypt.pdf $(DESTDIR)$(DATAPATH)
-
-install_test: $(LIBTEST)
-	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(LIBPATH)
-	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
-	install -g $(GROUP) -o $(USER) $(LIBTEST) $(DESTDIR)$(LIBPATH)
-
-profile:
-	CFLAGS="$(CFLAGS) -fprofile-generate" make timing EXTRALIBS=-lgcov
-	./timing
-	rm -f timing `find . -type f | grep [.][ao] | xargs`
-	CFLAGS="$(CFLAGS) -fprofile-use" make timing EXTRALIBS=-lgcov
-
-
-#This rule cleans the source tree of all compiled code, not including the pdf
-#documentation.
-clean:
-	-rm -f $(OBJECTS)
-	-rm -f libtomcrypt.a
-
-#build the doxy files (requires Doxygen, tetex and patience)
-doxy:
-	doxygen
-	cd doc/doxygen/latex ; make ; mv -f refman.pdf ../../.
-	echo The huge doxygen PDF should be available as doc/refman.pdf
-	
-#This builds the crypt.pdf file. Note that the rm -f *.pdf has been removed
-#from the clean command! This is because most people would like to keep the
-#nice pre-compiled crypt.pdf that comes with libtomcrypt! We only need to
-#delete it if we are rebuilding it.
-docs: crypt.tex
-	rm -f doc/crypt.pdf $(LEFTOVERS)
-	echo "hello" > crypt.ind
-	latex crypt > /dev/null
-	latex crypt > /dev/null
-	makeindex crypt.idx > /dev/null
-	latex crypt > /dev/null
-	dvipdf crypt
-	mv -ivf crypt.pdf doc/crypt.pdf
-	rm -f $(LEFTOVERS)
-
-docdvi: crypt.tex
-	echo hello > crypt.ind
-	latex crypt > /dev/null
-	latex crypt > /dev/null
-	makeindex crypt.idx
-	latex crypt > /dev/null
-
-#zipup the project (take that!)
-no_oops: clean
-	cd .. ; cvs commit 
-	echo Scanning for scratch/dirty files
-	find . -type f | grep -v CVS | xargs -n 1 bash mess.sh
-
-zipup: no_oops docs
-	cd .. ; rm -rf crypt* libtomcrypt-$(VERSION) ; mkdir libtomcrypt-$(VERSION) ; \
-	cp -R ./libtomcrypt/* ./libtomcrypt-$(VERSION)/ ; \
-	cd libtomcrypt-$(VERSION) ; rm -rf `find . -type d | grep CVS | xargs` ; cd .. ; \
-	tar -cjvf crypt-$(VERSION).tar.bz2 libtomcrypt-$(VERSION) ; \
-	zip -9r crypt-$(VERSION).zip libtomcrypt-$(VERSION) ; \
-	gpg -b -a crypt-$(VERSION).tar.bz2 ; gpg -b -a crypt-$(VERSION).zip ; \
-	mv -fv crypt* ~ ; rm -rf libtomcrypt-$(VERSION)
-
-
-# $Source: /cvs/libtom/libtomcrypt/makefile,v $ 
-# $Revision: 1.70 $ 
-# $Date: 2005/06/19 18:03:24 $ 
+#%: $(HEADERS)
+#%: $(HEADERS) Makefile
+# TODO
+
+all: $(TARGETS)
+
+strip: $(TARGETS)
+	$(STRIP) $(addsuffix $(EXEEXT), $(TARGETS))
+
+install: $(addprefix inst_, $(TARGETS))
+
+installdropbearmulti: insdbmulti $(addprefix insmulti, $(PROGRAMS)) 
+
+insdbmulti: dropbearmulti
+	$(INSTALL) -d -m 755 $(DESTDIR)$(bindir)
+	$(INSTALL) -m 755 dropbearmulti$(EXEEXT) $(DESTDIR)$(bindir)
+	-chown root $(DESTDIR)$(bindir)/dropbearmulti$(EXEEXT)
+	-chgrp 0 $(DESTDIR)$(bindir)/dropbearmulti$(EXEEXT)
+
+insmultidropbear: dropbearmulti
+	-rm -f $(DESTDIR)$(sbindir)/dropbear$(EXEEXT)
+	-ln -s $(DESTDIR)$(bindir)/dropbearmulti$(EXEEXT) $(DESTDIR)$(sbindir)/dropbear$(EXEEXT) 
+
+insmulti%: dropbearmulti
+	-rm -f $(DESTDIR)$(bindir)/$*$(EXEEXT) 
+	-ln -s $(DESTDIR)$(bindir)/dropbearmulti$(EXEEXT) $(DESTDIR)$(bindir)/$*$(EXEEXT) 
+
+# dropbear should go in sbin, so it needs a seperate rule
+inst_dropbear: dropbear
+	$(INSTALL) -d -m 755 $(DESTDIR)$(sbindir)
+	$(INSTALL) -m 755 dropbear$(EXEEXT) $(DESTDIR)$(sbindir)
+	-chown root $(DESTDIR)$(sbindir)/dropbear$(EXEEXT)
+	-chgrp 0 $(DESTDIR)$(sbindir)/dropbear$(EXEEXT)
+
+inst_%: $*
+	$(INSTALL) -d -m 755 $(DESTDIR)$(bindir)
+	$(INSTALL) -m 755 $*$(EXEEXT) $(DESTDIR)$(bindir)
+	-chown root $(DESTDIR)$(bindir)/$*$(EXEEXT)
+	-chgrp 0 $(DESTDIR)$(bindir)/$*$(EXEEXT)
+
+
+# for some reason the rule further down doesn't like $($@objs) as a prereq.
+dropbear: $(dropbearobjs)
+dbclient: $(dbclientobjs)
+dropbearkey: $(dropbearkeyobjs)
+dropbearconvert: $(dropbearconvertobjs)
+
+dropbear dbclient dropbearkey dropbearconvert: $(HEADERS)  $(LTC) $(LTM) \
+													Makefile
+	$(LD) $(LDFLAGS) -o $@$(EXEEXT) $($@objs) $(LIBS)
+
+# scp doesn't use the libs so is special.
+scp: $(SCPOBJS)  $(HEADERS) Makefile
+	$(LD) $(LDFLAGS) -o $@$(EXEEXT) $(SCPOBJS)
+
+
+# multi-binary compilation.
+MULTIOBJS=
+ifeq ($(MULTI),1)
+	MULTIOBJS=dbmulti.o $(sort $(foreach prog, $(PROGRAMS), $($(prog)objs)))
+	CFLAGS+=$(addprefix -DDBMULTI_, $(PROGRAMS)) -DDROPBEAR_MULTI
+endif
+
+dropbearmulti: multilink 
+
+multibinary: $(HEADERS) $(MULTIOBJS) $(LTC) $(LTM) Makefile
+	$(LD) $(LDFLAGS) -o dropbearmulti$(EXEEXT) $(MULTIOBJS) $(LIBS)
+
+multilink: multibinary $(addprefix link, $(PROGRAMS))
+
+link%:
+	-rm -f $*$(EXEEXT)
+	-ln -s dropbearmulti$(EXEEXT) $*$(EXEEXT)
+
+$(LTC): options.h
+	cd libtomcrypt && $(MAKE) clean && $(MAKE)
+
+$(LTM): options.h
+	cd libtommath && $(MAKE)
+
+ltc-clean:
+	cd libtomcrypt && $(MAKE) clean
+
+ltm-clean:
+	cd libtommath && $(MAKE) clean
+
+sizes: dropbear
+	objdump -t dropbear|grep ".text"|cut -d "." -f 2|sort -rn
+
+clean: ltc-clean ltm-clean thisclean
+
+thisclean:
+	-rm -f dropbear dbclient dropbearkey dropbearconvert scp scp-progress \
+			dropbearmulti *.o *.da *.bb *.bbg *.prof 
+
+distclean: clean tidy
+	-rm -f config.h
+	-rm -f Makefile
+
+tidy:
+	-rm -f *~ *.gcov */*~
diff --git a/README b/README
new file mode 100644
index 0000000..43dd1f2
--- /dev/null
+++ b/README
@@ -0,0 +1,74 @@
+This is Dropbear, a smallish SSH 2 server and client.
+
+INSTALL has compilation instructions.
+
+MULTI has instructions on making a multi-purpose binary (ie a single binary
+which performs multiple tasks, to save disk space)
+
+SMALL has some tips on creating small binaries.
+
+See TODO for a few of the things I know need looking at, and please contact
+me if you have any questions/bugs found/features/ideas/comments etc :)
+
+Matt Johnston
+matt@ucc.asn.au
+
+
+In the absence of detailed documentation, some notes follow:
+============================================================================
+
+Server public key auth:
+
+You can use ~/.ssh/authorized_keys in the same way as with OpenSSH, just put
+the key entries in that file. They should be of the form:
+
+ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAIEAwVa6M6cGVmUcLl2cFzkxEoJd06Ub4bVDsYrWvXhvUV+ZAM9uGuewZBDoAqNKJxoIn0Hyd0Nk/yU99UVv6NWV/5YSHtnf35LKds56j7cuzoQpFIdjNwdxAN0PCET/MG8qyskG/2IE2DPNIaJ3Wy+Ws4IZEgdJgPlTYUBWWtCWOGc= someone@hostname
+
+You must make sure that ~/.ssh, and the key file, are only writable by the
+user.
+
+NOTE: Dropbear ignores authorized_keys options such as those described in the
+OpenSSH sshd manpage, and will not allow a login for these keys. 
+
+============================================================================
+
+Client public key auth:
+
+Dropbear can do public key auth as a client, but you will have to convert
+OpenSSH style keys to Dropbear format, or use dropbearkey to create them.
+
+If you have an OpenSSH-style private key ~/.ssh/id_rsa, you need to do:
+
+dropbearconvert openssh dropbear ~/.ssh/id_rsa  ~/.ssh/id_rsa.db
+dbclient -i ~/.ssh/id_rsa.db <hostname>
+
+Currently encrypted keys aren't supported, neither is agent forwarding. At some
+stage both hopefully will be.
+
+============================================================================
+
+If you want to get the public-key portion of a Dropbear private key, look at
+dropbearkey's '-y' option.
+
+============================================================================
+
+To run the server, you need to generate server keys, this is one-off:
+./dropbearkey -t rsa -f dropbear_rsa_host_key
+./dropbearkey -t dss -f dropbear_dss_host_key
+
+or alternatively convert OpenSSH keys to Dropbear:
+./dropbearconvert openssh dropbear /etc/ssh/ssh_host_dsa_key dropbear_dss_host_key
+
+============================================================================
+
+If the server is run as non-root, you most likely won't be able to allocate a
+pty, and you cannot login as any user other than that running the daemon
+(obviously). Shadow passwords will also be unusable as non-root.
+
+============================================================================
+
+The Dropbear distribution includes a standalone version of OpenSSH's scp
+program. You can compile it with "make scp", you may want to change the path
+of the ssh binary, specified by _PATH_SSH_PROGRAM in options.h . By default
+the progress meter isn't compiled in to save space, you can enable it by 
+adding 'SCPPROGRESS=1' to the make commandline.
diff --git a/SMALL b/SMALL
new file mode 100644
index 0000000..babd671
--- /dev/null
+++ b/SMALL
@@ -0,0 +1,53 @@
+Tips for a small system:
+
+If you only want server functionality (for example), compile with
+	make PROGRAMS=dropbear
+rather than just
+	make dropbear
+so that client functionality in shared portions of Dropbear won't be included.
+The same applies if you are compiling just a client.
+
+---
+
+The following are set in options.h:
+
+	- You can safely disable blowfish and twofish ciphers, and MD5 hmac, without
+	  affecting interoperability
+
+	- If you're compiling statically, you can turn off host lookups
+
+	- You can disable either password or public-key authentication, though note
+	  that the IETF draft states that pubkey authentication is required.
+
+	- Similarly with DSS and RSA, you can disable one of these if you know that
+	  all clients will be able to support a particular one. The IETF draft
+	  states that DSS is required, however you may prefer to use RSA. 
+	  DON'T disable either of these on systems where you aren't 100% sure about
+	  who will be connecting and what clients they will be using.
+
+	- Disabling the MOTD code and SFTP-SERVER may save a small amount of codesize
+
+	- You can disable x11, tcp and agent forwarding as desired. None of these are
+	  essential, although agent-forwarding is often useful even on firewall boxes.
+
+---
+
+If you are compiling statically, you may want to disable zlib, as it will use
+a few tens of kB of binary-size (./configure --disable-zlib).
+
+You can create a combined binary, see the file MULTI, which will put all
+the functions into one binary, avoiding repeated code.
+
+If you're compiling with gcc, you might want to look at gcc's options for
+stripping unused code. The relevant vars to set before configure are:
+
+LDFLAGS=-Wl,--gc-sections
+CFLAGS="-ffunction-sections -fdata-sections"
+
+You can also experiment with optimisation flags such as -Os, note that in some
+cases these flags actually seem to increase size, so experiment before
+deciding.
+
+Of course using small C libraries such as uClibc and dietlibc can also help.
+
+If you have any queries, mail me and I'll see if I can help.
diff --git a/TODO b/TODO
index f4f0665..9807f59 100644
--- a/TODO
+++ b/TODO
@@ -1,10 +1,30 @@
-For 1.06
-
-1. export ECC functions globally [e.g. mulmod and the sets]
-   - goal is tv_gen module and test vectors
-2. ASN.1 SET and T61String
-3. phase out DH code [RSA/ECC/DSA is enough]
-4. Some ASN.1 demo programs [for now read the source code!]
-5. Start working towards making the bignum code plugable
-6. Look into other ECC point muls and consider a "precomp" interface 
-7. Add OID for ciphers and PRNGs to their descriptors
+Current:
+
+Things which might need doing:
+
+- default private dbclient keys
+
+- Make options.h generated from configure perhaps?
+
+- Improved queueing of unauthed connections
+
+- handle /etc/environment in AIX
+
+- check that there aren't timing issues with valid/invalid user authentication
+  feedback.
+
+- Binding to different interfaces
+
+- check PRNG
+- CTR mode
+- SSH_MSG_IGNORE sending to improve CBC security
+- DH Group Exchange possibly, or just add group14 (whatever it's called today)
+
+- fix scp.c for IRIX
+
+- Be able to use OpenSSH keys for the client? or at least have some form of 
+  encrypted keys.
+
+- Client agent forwarding
+
+- Handle restrictions in ~/.ssh/authorized_keys ?
diff --git a/agentfwd.h b/agentfwd.h
new file mode 100644
index 0000000..a0f675d
--- /dev/null
+++ b/agentfwd.h
@@ -0,0 +1,43 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+#ifndef _AGENTFWD_H_
+#define _AGENTFWD_H_
+#ifndef DISABLE_AGENTFWD
+
+#include "includes.h"
+#include "chansession.h"
+#include "channel.h"
+
+int agentreq(struct ChanSess * chansess);
+void agentsetauth(struct ChanSess *chansess);
+void agentcleanup(struct ChanSess * chansess);
+void agentset(struct ChanSess *chansess);
+
+#ifdef __hpux
+#define seteuid(a)       setresuid(-1, (a), -1)
+#define setegid(a)       setresgid(-1, (a), -1)
+#endif
+
+#endif /* DROPBEAR_AGENTFWD */
+#endif /* _AGENTFWD_H_ */
diff --git a/algo.h b/algo.h
new file mode 100644
index 0000000..5ed01cc
--- /dev/null
+++ b/algo.h
@@ -0,0 +1,74 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _ALGO_H_
+
+#define _ALGO_H_
+
+#include "includes.h"
+#include "buffer.h"
+
+struct Algo_Type {
+
+	unsigned char *name; /* identifying name */
+	char val; /* a value for this cipher, or -1 for invalid */
+	void *data; /* algorithm specific data */
+	unsigned usable : 1; /* whether we can use this algorithm */
+
+};
+
+typedef struct Algo_Type algo_type;
+
+/* lists mapping ssh types of algorithms to internal values */
+extern algo_type sshkex[];
+extern algo_type sshhostkey[];
+extern algo_type sshciphers[];
+extern algo_type sshhashes[];
+extern algo_type sshcompress[];
+
+extern const struct dropbear_cipher dropbear_nocipher;
+extern const struct dropbear_hash dropbear_nohash;
+
+struct dropbear_cipher {
+	const struct ltc_cipher_descriptor *cipherdesc;
+	unsigned long keysize;
+	unsigned char blocksize;
+};
+
+struct dropbear_hash {
+	const struct ltc_hash_descriptor *hashdesc;
+	unsigned long keysize;
+	unsigned char hashsize;
+};
+
+void crypto_init();
+int have_algo(char* algo, size_t algolen, algo_type algos[]);
+void buf_put_algolist(buffer * buf, algo_type localalgos[]);
+
+algo_type * svr_buf_match_algo(buffer* buf, algo_type localalgos[],
+		int *goodguess);
+algo_type * cli_buf_match_algo(buffer* buf, algo_type localalgos[],
+		int *goodguess);
+
+#endif /* _ALGO_H_ */
diff --git a/atomicio.c b/atomicio.c
new file mode 100644
index 0000000..1915a7b
--- /dev/null
+++ b/atomicio.c
@@ -0,0 +1,63 @@
+/*
+ * Copied from OpenSSH 3.6.1p2.
+ * 
+ * Copyright (c) 1995,1999 Theo de Raadt.  All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* RCSID("OpenBSD: atomicio.c,v 1.10 2001/05/08 22:48:07 markus Exp "); */
+
+#include "atomicio.h"
+
+/*
+ * ensure all of data on socket comes through. f==read || f==write
+ */
+ssize_t
+atomicio(f, fd, _s, n)
+	ssize_t (*f) ();
+	int fd;
+	void *_s;
+	size_t n;
+{
+	char *s = _s;
+	ssize_t res;
+	size_t pos = 0;
+
+	while (n > pos) {
+		res = (f) (fd, s + pos, n - pos);
+		switch (res) {
+		case -1:
+#ifdef EWOULDBLOCK
+			if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)
+#else
+			if (errno == EINTR || errno == EAGAIN)
+#endif
+				continue;
+		case 0:
+			return (res);
+		default:
+			pos += res;
+		}
+	}
+	return (pos);
+}
diff --git a/atomicio.h b/atomicio.h
new file mode 100644
index 0000000..6c1f3ac
--- /dev/null
+++ b/atomicio.h
@@ -0,0 +1,36 @@
+
+/*
+ * Copied from OpenSSH 3.6.1p2, required for loginrec.c
+ *
+ * $OpenBSD: atomicio.h,v 1.4 2001/06/26 06:32:46 itojun Exp $
+ *
+ * Copyright (c) 1995,1999 Theo de Raadt.  All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "includes.h"
+
+/*
+ * Ensure all of data on socket comes through. f==read || f==write
+ */
+ssize_t	atomicio(ssize_t (*)(), int, void *, size_t);
diff --git a/auth.h b/auth.h
new file mode 100644
index 0000000..c407ad5
--- /dev/null
+++ b/auth.h
@@ -0,0 +1,111 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _AUTH_H_
+#define _AUTH_H_
+
+#include "includes.h"
+
+void svr_authinitialise();
+void cli_authinitialise();
+
+/* Server functions */
+void recv_msg_userauth_request();
+void send_msg_userauth_failure(int partial, int incrfail);
+void send_msg_userauth_success();
+void svr_auth_password();
+void svr_auth_pubkey();
+void svr_auth_pam();
+
+/* Client functions */
+void recv_msg_userauth_failure();
+void recv_msg_userauth_success();
+void recv_msg_userauth_specific_60();
+void recv_msg_userauth_pk_ok();
+void recv_msg_userauth_info_request();
+void cli_get_user();
+void cli_auth_getmethods();
+void cli_auth_try();
+void recv_msg_userauth_banner();
+void cli_pubkeyfail();
+void cli_auth_password();
+int cli_auth_pubkey();
+void cli_auth_interactive();
+char* getpass_or_cancel();
+
+
+#define MAX_USERNAME_LEN 25 /* arbitrary for the moment */
+
+#define AUTH_TYPE_NONE      1
+#define AUTH_TYPE_PUBKEY    1 << 1
+#define AUTH_TYPE_PASSWORD  1 << 2
+#define AUTH_TYPE_INTERACT  1 << 3
+
+#define AUTH_METHOD_NONE "none"
+#define AUTH_METHOD_NONE_LEN 4
+#define AUTH_METHOD_PUBKEY "publickey"
+#define AUTH_METHOD_PUBKEY_LEN 9
+#define AUTH_METHOD_PASSWORD "password"
+#define AUTH_METHOD_PASSWORD_LEN 8
+#define AUTH_METHOD_INTERACT "keyboard-interactive"
+#define AUTH_METHOD_INTERACT_LEN 20
+
+
+
+/* This structure is shared between server and client - it contains
+ * relatively little extraneous bits when used for the client rather than the
+ * server */
+struct AuthState {
+
+	char *username; /* This is the username the client presents to check. It
+					   is updated each run through, used for auth checking */
+	unsigned char authtypes; /* Flags indicating which auth types are still 
+								valid */
+	unsigned int failcount; /* Number of (failed) authentication attempts.*/
+	unsigned authdone : 1; /* 0 if we haven't authed, 1 if we have. Applies for
+							  client and server (though has differing [obvious]
+							  meanings). */
+	unsigned perm_warn : 1; /* Server only, set if bad permissions on 
+							   ~/.ssh/authorized_keys have already been
+							   logged. */
+
+	/* These are only used for the server */
+	char *printableuser; /* stripped of control chars, used for logs etc */
+	struct passwd * pw;
+
+};
+
+struct SignKeyList;
+/* A singly linked list of signing keys */
+struct SignKeyList {
+
+	sign_key *key;
+	int type; /* The type of key */
+	struct SignKeyList *next;
+	/* filename? or the buffer? for encrypted keys, so we can later get
+	 * the private key portion */
+
+};
+
+#endif /* _AUTH_H_ */
diff --git a/bignum.c b/bignum.c
new file mode 100644
index 0000000..60b5220
--- /dev/null
+++ b/bignum.c
@@ -0,0 +1,75 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+/* Contains helper functions for mp_int handling */
+
+#include "includes.h"
+#include "dbutil.h"
+
+/* wrapper for mp_init, failing fatally on errors (memory allocation) */
+void m_mp_init(mp_int *mp) {
+
+	if (mp_init(mp) != MP_OKAY) {
+		dropbear_exit("mem alloc error");
+	}
+}
+
+/* simplified duplication of bn_mp_multi's mp_init_multi, but die fatally
+ * on error */
+void m_mp_init_multi(mp_int *mp, ...) 
+{
+    mp_int* cur_arg = mp;
+    va_list args;
+
+    va_start(args, mp);        /* init args to next argument from caller */
+    while (cur_arg != NULL) {
+        if (mp_init(cur_arg) != MP_OKAY) {
+			dropbear_exit("mem alloc error");
+        }
+        cur_arg = va_arg(args, mp_int*);
+    }
+    va_end(args);
+}
+
+void bytes_to_mp(mp_int *mp, const unsigned char* bytes, unsigned int len) {
+
+	if (mp_read_unsigned_bin(mp, (unsigned char*)bytes, len) != MP_OKAY) {
+		dropbear_exit("mem alloc error");
+	}
+}
+
+/* hash the ssh representation of the mp_int mp */
+void sha1_process_mp(hash_state *hs, mp_int *mp) {
+
+	int i;
+	buffer * buf;
+
+	buf = buf_new(512 + 20); /* max buffer is a 4096 bit key, 
+								plus header + some leeway*/
+	buf_putmpint(buf, mp);
+	i = buf->pos;
+	buf_setpos(buf, 0);
+	sha1_process(hs, buf_getptr(buf, i), i);
+	buf_free(buf);
+}
diff --git a/bignum.h b/bignum.h
new file mode 100644
index 0000000..042f811
--- /dev/null
+++ b/bignum.h
@@ -0,0 +1,35 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _BIGNUM_H_
+#define _BIGNUM_H_
+
+#include "includes.h"
+
+void m_mp_init(mp_int *mp);
+void m_mp_init_multi(mp_int *mp, ...);
+void bytes_to_mp(mp_int *mp, const unsigned char* bytes, unsigned int len);
+void sha1_process_mp(hash_state *hs, mp_int *mp);
+
+#endif /* _BIGNUM_H_ */
diff --git a/buffer.c b/buffer.c
new file mode 100644
index 0000000..579fa6f
--- /dev/null
+++ b/buffer.c
@@ -0,0 +1,338 @@
+/*
+ * Dropbear SSH
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+/* Buffer handling routines, designed to avoid overflows/using invalid data */
+
+#include "includes.h"
+#include "dbutil.h"
+#include "buffer.h"
+
+/* Prevent integer overflows when incrementing buffer position/length.
+ * Calling functions should check arguments first, but this provides a
+ * backstop */
+#define BUF_MAX_INCR 1000000000
+#define BUF_MAX_SIZE 1000000000
+
+/* avoid excessively large numbers, > ~8192 bits */
+#define BUF_MAX_MPINT (8240 / 8)
+
+/* Create (malloc) a new buffer of size */
+buffer* buf_new(unsigned int size) {
+
+	buffer* buf;
+	
+	if (size > BUF_MAX_SIZE) {
+		dropbear_exit("buf->size too big");
+	}
+
+	buf = (buffer*)m_malloc(sizeof(buffer));
+
+	if (size > 0) {
+		buf->data = (unsigned char*)m_malloc(size);
+	} else {
+		buf->data = NULL;
+	}
+
+	buf->size = size;
+	buf->pos = 0;
+	buf->len = 0;
+
+	return buf;
+
+}
+
+/* free the buffer's data and the buffer itself */
+void buf_free(buffer* buf) {
+
+	m_free(buf->data)
+	m_free(buf);
+}
+
+/* overwrite the contents of the buffer to clear it */
+void buf_burn(buffer* buf) {
+	
+	m_burn(buf->data, buf->size);
+
+}
+
+/* resize a buffer, pos and len will be repositioned if required when
+ * downsizing */
+void buf_resize(buffer *buf, unsigned int newsize) {
+
+	if (newsize > BUF_MAX_SIZE) {
+		dropbear_exit("buf->size too big");
+	}
+
+	buf->data = m_realloc(buf->data, newsize);
+	buf->size = newsize;
+	buf->len = MIN(newsize, buf->len);
+	buf->pos = MIN(newsize, buf->pos);
+
+}
+
+/* Create a copy of buf, allocating required memory etc. */
+/* The new buffer is sized the same as the length of the source buffer. */
+buffer* buf_newcopy(buffer* buf) {
+	
+	buffer* ret;
+
+	ret = buf_new(buf->len);
+	ret->len = buf->len;
+	memcpy(ret->data, buf->data, buf->len);
+	return ret;
+}
+
+/* Set the length of the buffer */
+void buf_setlen(buffer* buf, unsigned int len) {
+	if (len > buf->size) {
+		dropbear_exit("bad buf_setlen");
+	}
+	buf->len = len;
+}
+
+/* Increment the length of the buffer */
+void buf_incrlen(buffer* buf, unsigned int incr) {
+	if (incr > BUF_MAX_INCR || buf->len + incr > buf->size) {
+		dropbear_exit("bad buf_incrlen");
+	}
+	buf->len += incr;
+}
+/* Set the position of the buffer */
+void buf_setpos(buffer* buf, unsigned int pos) {
+
+	if (pos > buf->len) {
+		dropbear_exit("bad buf_setpos");
+	}
+	buf->pos = pos;
+}
+
+/* increment the postion by incr, increasing the buffer length if required */
+void buf_incrwritepos(buffer* buf, unsigned int incr) {
+	if (incr > BUF_MAX_INCR || buf->pos + incr > buf->size) {
+		dropbear_exit("bad buf_incrwritepos");
+	}
+	buf->pos += incr;
+	if (buf->pos > buf->len) {
+		buf->len = buf->pos;
+	}
+}
+
+/* increment the position by incr, negative values are allowed, to
+ * decrement the pos*/
+void buf_incrpos(buffer* buf,  int incr) {
+	if (incr > BUF_MAX_INCR ||
+			(unsigned int)((int)buf->pos + incr) > buf->len 
+			|| ((int)buf->pos + incr) < 0) {
+		dropbear_exit("bad buf_incrpos");
+	}
+	buf->pos += incr;
+}
+
+/* Get a byte from the buffer and increment the pos */
+unsigned char buf_getbyte(buffer* buf) {
+
+	/* This check is really just ==, but the >= allows us to check for the
+	 * bad case of pos > len, which should _never_ happen. */
+	if (buf->pos >= buf->len) {
+		dropbear_exit("bad buf_getbyte");
+	}
+	return buf->data[buf->pos++];
+}
+
+/* Get a bool from the buffer and increment the pos */
+unsigned char buf_getbool(buffer* buf) {
+
+	unsigned char b;
+	b = buf_getbyte(buf);
+	if (b != 0)
+		b = 1;
+	return b;
+}
+
+/* put a byte, incrementing the length if required */
+void buf_putbyte(buffer* buf, unsigned char val) {
+
+	if (buf->pos >= buf->len) {
+		buf_incrlen(buf, 1);
+	}
+	buf->data[buf->pos] = val;
+	buf->pos++;
+}
+
+/* returns an in-place pointer to the buffer, checking that
+ * the next len bytes from that position can be used */
+unsigned char* buf_getptr(buffer* buf, unsigned int len) {
+
+	if (buf->pos + len > buf->len) {
+		dropbear_exit("bad buf_getptr");
+	}
+	return &buf->data[buf->pos];
+}
+
+/* like buf_getptr, but checks against total size, not used length.
+ * This allows writing past the used length, but not past the size */
+unsigned char* buf_getwriteptr(buffer* buf, unsigned int len) {
+
+	if (buf->pos + len > buf->size) {
+		dropbear_exit("bad buf_getwriteptr");
+	}
+	return &buf->data[buf->pos];
+}
+
+/* Return a null-terminated string, it is malloced, so must be free()ed
+ * Note that the string isn't checked for null bytes, hence the retlen
+ * may be longer than what is returned by strlen */
+unsigned char* buf_getstring(buffer* buf, unsigned int *retlen) {
+
+	unsigned int len;
+	unsigned char* ret;
+	len = buf_getint(buf);
+	if (len > MAX_STRING_LEN) {
+		dropbear_exit("string too long");
+	}
+
+	if (retlen != NULL) {
+		*retlen = len;
+	}
+	ret = m_malloc(len+1);
+	memcpy(ret, buf_getptr(buf, len), len);
+	buf_incrpos(buf, len);
+	ret[len] = '\0';
+
+	return ret;
+}
+
+/* Just increment the buffer position the same as if we'd used buf_getstring,
+ * but don't bother copying/malloc()ing for it */
+void buf_eatstring(buffer *buf) {
+
+	buf_incrpos( buf, buf_getint(buf) );
+}
+
+/* Get an uint32 from the buffer and increment the pos */
+unsigned int buf_getint(buffer* buf) {
+	unsigned int ret;
+
+	LOAD32H(ret, buf_getptr(buf, 4));
+	buf_incrpos(buf, 4);
+	return ret;
+}
+
+/* put a 32bit uint into the buffer, incr bufferlen & pos if required */
+void buf_putint(buffer* buf, int unsigned val) {
+
+	STORE32H(val, buf_getwriteptr(buf, 4));
+	buf_incrwritepos(buf, 4);
+
+}
+
+/* put a SSH style string into the buffer, increasing buffer len if required */
+void buf_putstring(buffer* buf, const unsigned char* str, unsigned int len) {
+	
+	buf_putint(buf, len);
+	buf_putbytes(buf, str, len);
+
+}
+
+/* put the set of len bytes into the buffer, incrementing the pos, increasing
+ * len if required */
+void buf_putbytes(buffer *buf, const unsigned char *bytes, unsigned int len) {
+	memcpy(buf_getwriteptr(buf, len), bytes, len);
+	buf_incrwritepos(buf, len);
+}
+	
+
+/* for our purposes we only need positive (or 0) numbers, so will
+ * fail if we get negative numbers */
+void buf_putmpint(buffer* buf, mp_int * mp) {
+
+	unsigned int len, pad = 0;
+	TRACE(("enter buf_putmpint"))
+
+	dropbear_assert(mp != NULL);
+
+	if (SIGN(mp) == MP_NEG) {
+		dropbear_exit("negative bignum");
+	}
+
+	/* zero check */
+	if (USED(mp) == 1 && DIGIT(mp, 0) == 0) {
+		len = 0;
+	} else {
+		/* SSH spec requires padding for mpints with the MSB set, this code
+		 * implements it */
+		len = mp_count_bits(mp);
+		/* if the top bit of MSB is set, we need to pad */
+		pad = (len%8 == 0) ? 1 : 0;
+		len = len / 8 + 1; /* don't worry about rounding, we need it for
+							  padding anyway when len%8 == 0 */
+
+	}
+
+	/* store the length */
+	buf_putint(buf, len);
+	
+	/* store the actual value */
+	if (len > 0) {
+		if (pad) {
+			buf_putbyte(buf, 0x00);
+		}
+		if (mp_to_unsigned_bin(mp, buf_getwriteptr(buf, len-pad)) != MP_OKAY) {
+			dropbear_exit("mpint error");
+		}
+		buf_incrwritepos(buf, len-pad);
+	}
+
+	TRACE(("leave buf_putmpint"))
+}
+
+/* Retrieve an mp_int from the buffer.
+ * Will fail for -ve since they shouldn't be required here.
+ * Returns DROPBEAR_SUCCESS or DROPBEAR_FAILURE */
+int buf_getmpint(buffer* buf, mp_int* mp) {
+
+	unsigned int len;
+	len = buf_getint(buf);
+	
+	if (len == 0) {
+		mp_zero(mp);
+		return DROPBEAR_SUCCESS;
+	}
+
+	if (len > BUF_MAX_MPINT) {
+		return DROPBEAR_FAILURE;
+	}
+
+	/* check for negative */
+	if (*buf_getptr(buf, 1) & (1 << (CHAR_BIT-1))) {
+		return DROPBEAR_FAILURE;
+	}
+
+	if (mp_read_unsigned_bin(mp, buf_getptr(buf, len), len) != MP_OKAY) {
+		return DROPBEAR_FAILURE;
+	}
+
+	buf_incrpos(buf, len);
+	return DROPBEAR_SUCCESS;
+}
diff --git a/buffer.h b/buffer.h
new file mode 100644
index 0000000..f9aa6fa
--- /dev/null
+++ b/buffer.h
@@ -0,0 +1,66 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _BUFFER_H_
+
+#define _BUFFER_H_
+
+#include "includes.h"
+
+struct buf {
+
+	unsigned char * data;
+	unsigned int len; /* the used size */
+	unsigned int pos;
+	unsigned int size; /* the memory size */
+
+};
+
+typedef struct buf buffer;
+
+buffer * buf_new(unsigned int size);
+void buf_resize(buffer *buf, unsigned int newsize);
+void buf_free(buffer* buf);
+void buf_burn(buffer* buf);
+buffer* buf_newcopy(buffer* buf);
+void buf_setlen(buffer* buf, unsigned int len);
+void buf_incrlen(buffer* buf, unsigned int incr);
+void buf_setpos(buffer* buf, unsigned int pos);
+void buf_incrpos(buffer* buf, int incr); /* -ve is ok, to go backwards */
+void buf_incrwritepos(buffer* buf, unsigned int incr);
+unsigned char buf_getbyte(buffer* buf);
+unsigned char buf_getbool(buffer* buf);
+void buf_putbyte(buffer* buf, unsigned char val);
+unsigned char* buf_getptr(buffer* buf, unsigned int len);
+unsigned char* buf_getwriteptr(buffer* buf, unsigned int len);
+unsigned char* buf_getstring(buffer* buf, unsigned int *retlen);
+void buf_eatstring(buffer *buf);
+void buf_putint(buffer* buf, unsigned int val);
+void buf_putstring(buffer* buf, const unsigned char* str, unsigned int len);
+void buf_putbytes(buffer *buf, const unsigned char *bytes, unsigned int len);
+void buf_putmpint(buffer* buf, mp_int * mp);
+int buf_getmpint(buffer* buf, mp_int* mp);
+unsigned int buf_getint(buffer* buf);
+
+#endif /* _BUFFER_H_ */
diff --git a/channel.h b/channel.h
new file mode 100644
index 0000000..7030a0f
--- /dev/null
+++ b/channel.h
@@ -0,0 +1,135 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _CHANNEL_H_
+#define _CHANNEL_H_
+
+#include "includes.h"
+#include "buffer.h"
+#include "circbuffer.h"
+
+/* channel->type values */
+#define CHANNEL_ID_NONE 0
+#define CHANNEL_ID_SESSION 1
+#define CHANNEL_ID_X11 2
+#define CHANNEL_ID_AGENT 3
+#define CHANNEL_ID_TCPDIRECT 4
+#define CHANNEL_ID_TCPFORWARDED 5
+
+#define SSH_OPEN_ADMINISTRATIVELY_PROHIBITED    1
+#define SSH_OPEN_CONNECT_FAILED                 2
+#define SSH_OPEN_UNKNOWN_CHANNEL_TYPE           3
+#define SSH_OPEN_RESOURCE_SHORTAGE              4
+
+/* Not a real type */
+#define SSH_OPEN_IN_PROGRESS					99
+
+#define MAX_CHANNELS 100 /* simple mem restriction, includes each tcp/x11
+							connection, so can't be _too_ small */
+
+#define CHAN_EXTEND_SIZE 3 /* how many extra slots to add when we need more */
+
+#define RECV_MAXWINDOW 8000 /* tweak */
+#define RECV_WINDOWEXTEND 1000 /* We send a "window extend" every
+								RECV_WINDOWEXTEND bytes */
+#define RECV_MAXPACKET RECV_MAXWINDOW /* tweak */
+
+struct ChanType;
+
+struct Channel {
+
+	unsigned int index; /* the local channel index */
+	unsigned int remotechan;
+	unsigned int recvwindow, transwindow;
+	unsigned int recvdonelen;
+	unsigned int recvmaxpacket, transmaxpacket;
+	void* typedata; /* a pointer to type specific data */
+	int writefd; /* read from wire, written to insecure side */
+	int readfd; /* read from insecure size, written to wire */
+	int errfd; /* used like writefd or readfd, depending if it's client or server.
+				  Doesn't exactly belong here, but is cleaner here */
+	circbuffer *writebuf; /* data from the wire, for local consumption */
+	circbuffer *extrabuf; /* extended-data for the program - used like writebuf
+					     but for stderr */
+
+	int sentclosed, recvclosed;
+
+	/* this is set when we receive/send a channel eof packet */
+	int recveof, senteof;
+
+	int initconn; /* used for TCP forwarding, whether the channel has been
+					 fully initialised */
+
+	int await_open; /* flag indicating whether we've sent an open request
+					   for this channel (and are awaiting a confirmation
+					   or failure). */
+
+	const struct ChanType* type;
+
+};
+
+struct ChanType {
+
+	int sepfds; /* Whether this channel has seperate pipes for in/out or not */
+	char *name;
+	int (*inithandler)(struct Channel*);
+	int (*checkclose)(struct Channel*);
+	void (*reqhandler)(struct Channel*);
+	void (*closehandler)(struct Channel*);
+
+};
+
+void chaninitialise(const struct ChanType *chantypes[]);
+void chancleanup();
+void setchannelfds(fd_set *readfd, fd_set *writefd);
+void channelio(fd_set *readfd, fd_set *writefd);
+struct Channel* getchannel();
+struct Channel* newchannel(unsigned int remotechan, 
+		const struct ChanType *type, 
+		unsigned int transwindow, unsigned int transmaxpacket);
+
+void recv_msg_channel_open();
+void recv_msg_channel_request();
+void send_msg_channel_failure(struct Channel *channel);
+void send_msg_channel_success(struct Channel *channel);
+void recv_msg_channel_data();
+void recv_msg_channel_extended_data();
+void recv_msg_channel_window_adjust();
+void recv_msg_channel_close();
+void recv_msg_channel_eof();
+
+void common_recv_msg_channel_data(struct Channel *channel, int fd, 
+		circbuffer * buf);
+
+#ifdef DROPBEAR_CLIENT
+extern const struct ChanType clichansess;
+#endif
+
+#if defined(USING_LISTENERS) || defined(DROPBEAR_CLIENT)
+int send_msg_channel_open_init(int fd, const struct ChanType *type);
+void recv_msg_channel_open_confirmation();
+void recv_msg_channel_open_failure();
+#endif
+
+#endif /* _CHANNEL_H_ */
diff --git a/chansession.h b/chansession.h
new file mode 100644
index 0000000..213c285
--- /dev/null
+++ b/chansession.h
@@ -0,0 +1,92 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _CHANSESSION_H_
+#define _CHANSESSION_H_
+
+#include "loginrec.h"
+#include "channel.h"
+#include "listener.h"
+
+struct exitinfo {
+
+	int exitpid; /* -1 if not exited */
+	int exitstatus;
+	int exitsignal;
+	int exitcore;
+};
+
+struct ChanSess {
+
+	unsigned char * cmd; /* command to exec */
+	pid_t pid; /* child process pid */
+
+	/* pty details */
+	int master; /* the master terminal fd*/
+	int slave;
+	unsigned char * tty;
+	unsigned char * term;
+
+	/* exit details */
+	struct exitinfo exit;
+	
+#ifndef DISABLE_X11FWD
+	struct Listener * x11listener;
+	int x11port;
+	char * x11authprot;
+	char * x11authcookie;
+	unsigned int x11screennum;
+	unsigned char x11singleconn;
+#endif
+
+#ifndef DISABLE_AGENTFWD
+	struct Listener * agentlistener;
+	char * agentfile;
+	char * agentdir;
+#endif
+};
+
+struct ChildPid {
+	pid_t pid;
+	struct ChanSess * chansess;
+};
+
+
+void addnewvar(const char* param, const char* var);
+
+void cli_send_chansess_request();
+void cli_tty_cleanup();
+void cli_chansess_winchange();
+
+void svr_chansessinitialise();
+extern const struct ChanType svrchansess;
+
+struct SigMap {
+	int signal;
+	char* name;
+};
+
+extern const struct SigMap signames[];
+
+#endif /* _CHANSESSION_H_ */
diff --git a/circbuffer.c b/circbuffer.c
new file mode 100644
index 0000000..e70087a
--- /dev/null
+++ b/circbuffer.c
@@ -0,0 +1,138 @@
+/*
+ * Dropbear SSH
+ * 
+ * Copyright (c) 2002-2004 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "dbutil.h"
+#include "circbuffer.h"
+
+#define MAX_CBUF_SIZE 100000000
+
+circbuffer * cbuf_new(unsigned int size) {
+
+	circbuffer *cbuf = NULL;
+
+	if (size > MAX_CBUF_SIZE) {
+		dropbear_exit("bad cbuf size");
+	}
+
+	cbuf = (circbuffer*)m_malloc(sizeof(circbuffer));
+	cbuf->data = (unsigned char*)m_malloc(size);
+	cbuf->used = 0;
+	cbuf->readpos = 0;
+	cbuf->writepos = 0;
+	cbuf->size = size;
+
+	return cbuf;
+}
+
+void cbuf_free(circbuffer * cbuf) {
+
+	m_free(cbuf->data);
+	m_free(cbuf);
+}
+
+unsigned int cbuf_getused(circbuffer * cbuf) {
+
+	return cbuf->used;
+
+}
+
+unsigned int cbuf_getavail(circbuffer * cbuf) {
+
+	return cbuf->size - cbuf->used;
+
+}
+
+unsigned int cbuf_readlen(circbuffer *cbuf) {
+
+	dropbear_assert(((2*cbuf->size)+cbuf->writepos-cbuf->readpos)%cbuf->size == cbuf->used%cbuf->size);
+	dropbear_assert(((2*cbuf->size)+cbuf->readpos-cbuf->writepos)%cbuf->size == (cbuf->size-cbuf->used)%cbuf->size);
+
+	if (cbuf->used == 0) {
+		TRACE(("cbuf_readlen: unused buffer"))
+		return 0;
+	}
+
+	if (cbuf->readpos < cbuf->writepos) {
+		return cbuf->writepos - cbuf->readpos;
+	}
+
+	return cbuf->size - cbuf->readpos;
+}
+
+unsigned int cbuf_writelen(circbuffer *cbuf) {
+
+	dropbear_assert(cbuf->used <= cbuf->size);
+	dropbear_assert(((2*cbuf->size)+cbuf->writepos-cbuf->readpos)%cbuf->size == cbuf->used%cbuf->size);
+	dropbear_assert(((2*cbuf->size)+cbuf->readpos-cbuf->writepos)%cbuf->size == (cbuf->size-cbuf->used)%cbuf->size);
+
+	if (cbuf->used == cbuf->size) {
+		TRACE(("cbuf_writelen: full buffer"))
+		return 0; /* full */
+	}
+	
+	if (cbuf->writepos < cbuf->readpos) {
+		return cbuf->readpos - cbuf->writepos;
+	}
+
+	return cbuf->size - cbuf->writepos;
+}
+
+unsigned char* cbuf_readptr(circbuffer *cbuf, unsigned int len) {
+	if (len > cbuf_readlen(cbuf)) {
+		dropbear_exit("bad cbuf read");
+	}
+
+	return &cbuf->data[cbuf->readpos];
+}
+
+unsigned char* cbuf_writeptr(circbuffer *cbuf, unsigned int len) {
+
+	if (len > cbuf_writelen(cbuf)) {
+		dropbear_exit("bad cbuf write");
+	}
+
+	return &cbuf->data[cbuf->writepos];
+}
+
+void cbuf_incrwrite(circbuffer *cbuf, unsigned int len) {
+	if (len > cbuf_writelen(cbuf)) {
+		dropbear_exit("bad cbuf write");
+	}
+
+	cbuf->used += len;
+	dropbear_assert(cbuf->used <= cbuf->size);
+	cbuf->writepos = (cbuf->writepos + len) % cbuf->size;
+}
+
+
+void cbuf_incrread(circbuffer *cbuf, unsigned int len) {
+	if (len > cbuf_readlen(cbuf)) {
+		dropbear_exit("bad cbuf read");
+	}
+
+	dropbear_assert(cbuf->used >= len);
+	cbuf->used -= len;
+	cbuf->readpos = (cbuf->readpos + len) % cbuf->size;
+}
diff --git a/circbuffer.h b/circbuffer.h
new file mode 100644
index 0000000..21c5134
--- /dev/null
+++ b/circbuffer.h
@@ -0,0 +1,50 @@
+/*
+ * Dropbear SSH
+ * 
+ * Copyright (c) 2002-2004 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _CIRCBUFFER_H_
+#define _CIRCBUFFER_H_
+struct circbuf {
+
+	unsigned int size;
+	unsigned int readpos;
+	unsigned int writepos;
+	unsigned int used;
+	unsigned char* data;
+};
+
+typedef struct circbuf circbuffer;
+
+circbuffer * cbuf_new(unsigned int size);
+void cbuf_free(circbuffer * cbuf);
+
+unsigned int cbuf_getused(circbuffer * cbuf); /* how much data stored */
+unsigned int cbuf_getavail(circbuffer * cbuf); /* how much we can write */
+unsigned int cbuf_readlen(circbuffer *cbuf); /* max linear read len */
+unsigned int cbuf_writelen(circbuffer *cbuf); /* max linear write len */
+
+unsigned char* cbuf_readptr(circbuffer *cbuf, unsigned int len);
+unsigned char* cbuf_writeptr(circbuffer *cbuf, unsigned int len);
+void cbuf_incrwrite(circbuffer *cbuf, unsigned int len);
+void cbuf_incrread(circbuffer *cbuf, unsigned int len);
+#endif
diff --git a/cli-algo.c b/cli-algo.c
new file mode 100644
index 0000000..ec3a1ff
--- /dev/null
+++ b/cli-algo.c
@@ -0,0 +1,99 @@
+/*
+ * Dropbear - a SSH2 server
+ * SSH client implementation
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * Copyright (c) 2004 by Mihnea Stoenescu
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "algo.h"
+#include "dbutil.h"
+
+
+/*
+ * The chosen [encryption | MAC | compression] algorithm to each 
+ * direction MUST be the first algorithm  on the client's list
+ * that is also on the server's list.
+ */
+algo_type * cli_buf_match_algo(buffer* buf, algo_type localalgos[],
+		int *goodguess) {
+
+	unsigned char * algolist = NULL;
+	unsigned char * remotealgos[MAX_PROPOSED_ALGO];
+	unsigned int len;
+	unsigned int count, i, j;
+	algo_type * ret = NULL;
+
+	*goodguess = 0;
+
+	/* get the comma-separated list from the buffer ie "algo1,algo2,algo3" */
+	algolist = buf_getstring(buf, &len);
+	TRACE(("cli_buf_match_algo: %s", algolist))
+	if (len > MAX_PROPOSED_ALGO*(MAX_NAME_LEN+1)) {
+		goto out; /* just a sanity check, no other use */
+	}
+
+	/* remotealgos will contain a list of the strings parsed out */
+	/* We will have at least one string (even if it's just "") */
+	remotealgos[0] = algolist;
+	count = 1;
+	/* Iterate through, replacing ','s with NULs, to split it into
+	 * words. */
+	for (i = 0; i < len; i++) {
+		if (algolist[i] == '\0') {
+			/* someone is trying something strange */
+			goto out;
+		}
+		if (algolist[i] == ',') {
+			algolist[i] = '\0';
+			remotealgos[count] = &algolist[i+1];
+			count++;
+		}
+		if (count == MAX_PROPOSED_ALGO) {
+			break;
+		}
+	}
+
+	/* iterate and find the first match */
+
+	for (j = 0; localalgos[j].name != NULL; j++) {
+		if (localalgos[j].usable) {
+		len = strlen(localalgos[j].name);
+			for (i = 0; i < count; i++) {
+				if (len == strlen(remotealgos[i]) 
+						&& strncmp(localalgos[j].name, 
+							remotealgos[i], len) == 0) {
+					if (i == 0 && j == 0) {
+						/* was a good guess */
+						*goodguess = 1;
+					}
+					ret = &localalgos[j];
+					goto out;
+				}
+			}
+		}
+	}
+
+out:
+	m_free(algolist);
+	return ret;
+}
+
diff --git a/cli-auth.c b/cli-auth.c
new file mode 100644
index 0000000..d08de9a
--- /dev/null
+++ b/cli-auth.c
@@ -0,0 +1,295 @@
+/*
+ * Dropbear SSH
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * Copyright (c) 2004 by Mihnea Stoenescu
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "session.h"
+#include "auth.h"
+#include "dbutil.h"
+#include "buffer.h"
+#include "ssh.h"
+#include "packet.h"
+#include "runopts.h"
+
+void cli_authinitialise() {
+
+	memset(&ses.authstate, 0, sizeof(ses.authstate));
+}
+
+
+/* Send a "none" auth request to get available methods */
+void cli_auth_getmethods() {
+
+	TRACE(("enter cli_auth_getmethods"))
+
+	CHECKCLEARTOWRITE();
+
+	buf_putbyte(ses.writepayload, SSH_MSG_USERAUTH_REQUEST);
+	buf_putstring(ses.writepayload, cli_opts.username, 
+			strlen(cli_opts.username));
+	buf_putstring(ses.writepayload, SSH_SERVICE_CONNECTION, 
+			SSH_SERVICE_CONNECTION_LEN);
+	buf_putstring(ses.writepayload, "none", 4); /* 'none' method */
+
+	encrypt_packet();
+	TRACE(("leave cli_auth_getmethods"))
+
+}
+
+void recv_msg_userauth_banner() {
+
+	unsigned char* banner = NULL;
+	unsigned int bannerlen;
+	unsigned int i, linecount;
+
+	TRACE(("enter recv_msg_userauth_banner"))
+	if (ses.authstate.authdone) {
+		TRACE(("leave recv_msg_userauth_banner: banner after auth done"))
+		return;
+	}
+
+	banner = buf_getstring(ses.payload, &bannerlen);
+	buf_eatstring(ses.payload); /* The language string */
+
+	if (bannerlen > MAX_BANNER_SIZE) {
+		TRACE(("recv_msg_userauth_banner: bannerlen too long: %d", bannerlen))
+		goto out;
+	}
+
+	cleantext(banner);
+
+	/* Limit to 25 lines */
+	linecount = 1;
+	for (i = 0; i < bannerlen; i++) {
+		if (banner[i] == '\n') {
+			if (linecount >= MAX_BANNER_LINES) {
+				banner[i] = '\0';
+				break;
+			}
+			linecount++;
+		}
+	}
+
+	printf("%s\n", banner);
+
+out:
+	m_free(banner);
+	TRACE(("leave recv_msg_userauth_banner"))
+}
+
+/* This handles the message-specific types which
+ * all have a value of 60. These are
+ * SSH_MSG_USERAUTH_PASSWD_CHANGEREQ,
+ * SSH_MSG_USERAUTH_PK_OK, &
+ * SSH_MSG_USERAUTH_INFO_REQUEST. */
+void recv_msg_userauth_specific_60() {
+
+#ifdef ENABLE_CLI_PUBKEY_AUTH
+	if (cli_ses.lastauthtype == AUTH_TYPE_PUBKEY) {
+		recv_msg_userauth_pk_ok();
+		return;
+	}
+#endif
+
+#ifdef ENABLE_CLI_INTERACT_AUTH
+	if (cli_ses.lastauthtype == AUTH_TYPE_INTERACT) {
+		recv_msg_userauth_info_request();
+		return;
+	}
+#endif
+
+#ifdef ENABLE_CLI_PASSWORD_AUTH
+	if (cli_ses.lastauthtype == AUTH_TYPE_PASSWORD) {
+		/* Eventually there could be proper password-changing
+		 * support. However currently few servers seem to
+		 * implement it, and password auth is last-resort
+		 * regardless - keyboard-interactive is more likely
+		 * to be used anyway. */
+		dropbear_close("Your password has expired.");
+	}
+#endif
+
+	dropbear_exit("Unexpected userauth packet");
+}
+
+void recv_msg_userauth_failure() {
+
+	unsigned char * methods = NULL;
+	unsigned char * tok = NULL;
+	unsigned int methlen = 0;
+	unsigned int partial = 0;
+	unsigned int i = 0;
+
+	TRACE(("<- MSG_USERAUTH_FAILURE"))
+	TRACE(("enter recv_msg_userauth_failure"))
+
+	if (cli_ses.state != USERAUTH_REQ_SENT) {
+		/* Perhaps we should be more fatal? */
+		dropbear_exit("Unexpected userauth failure");
+	}
+
+#ifdef ENABLE_CLI_PUBKEY_AUTH
+	/* If it was a pubkey auth request, we should cross that key 
+	 * off the list. */
+	if (cli_ses.lastauthtype == AUTH_TYPE_PUBKEY) {
+		cli_pubkeyfail();
+	}
+#endif
+
+#ifdef ENABLE_CLI_INTERACT_AUTH
+	/* If we get a failure message for keyboard interactive without
+	 * receiving any request info packet, then we don't bother trying
+	 * keyboard interactive again */
+	if (cli_ses.lastauthtype == AUTH_TYPE_INTERACT
+			&& !cli_ses.interact_request_received) {
+		TRACE(("setting auth_interact_failed = 1"))
+		cli_ses.auth_interact_failed = 1;
+	}
+#endif
+
+	cli_ses.lastauthtype = AUTH_TYPE_NONE;
+
+	methods = buf_getstring(ses.payload, &methlen);
+
+	partial = buf_getbool(ses.payload);
+
+	if (partial) {
+		dropbear_log(LOG_INFO, "Authentication partially succeeded, more attempts required");
+	} else {
+		ses.authstate.failcount++;
+	}
+
+	TRACE(("Methods (len %d): '%s'", methlen, methods))
+
+	ses.authstate.authdone=0;
+	ses.authstate.authtypes=0;
+
+	/* Split with nulls rather than commas */
+	for (i = 0; i < methlen; i++) {
+		if (methods[i] == ',') {
+			methods[i] = '\0';
+		}
+	}
+
+	tok = methods; /* tok stores the next method we'll compare */
+	for (i = 0; i <= methlen; i++) {
+		if (methods[i] == '\0') {
+			TRACE(("auth method '%s'", tok))
+#ifdef ENABLE_CLI_PUBKEY_AUTH
+			if (strncmp(AUTH_METHOD_PUBKEY, tok,
+				AUTH_METHOD_PUBKEY_LEN) == 0) {
+				ses.authstate.authtypes |= AUTH_TYPE_PUBKEY;
+			}
+#endif
+#ifdef ENABLE_CLI_INTERACT_AUTH
+			if (strncmp(AUTH_METHOD_INTERACT, tok,
+				AUTH_METHOD_INTERACT_LEN) == 0) {
+				ses.authstate.authtypes |= AUTH_TYPE_INTERACT;
+			}
+#endif
+#ifdef ENABLE_CLI_PASSWORD_AUTH
+			if (strncmp(AUTH_METHOD_PASSWORD, tok,
+				AUTH_METHOD_PASSWORD_LEN) == 0) {
+				ses.authstate.authtypes |= AUTH_TYPE_PASSWORD;
+			}
+#endif
+			tok = &methods[i+1]; /* Must make sure we don't use it after the
+									last loop, since it'll point to something
+									undefined */
+		}
+	}
+
+	m_free(methods);
+
+	cli_ses.state = USERAUTH_FAIL_RCVD;
+		
+	TRACE(("leave recv_msg_userauth_failure"))
+}
+
+void recv_msg_userauth_success() {
+	TRACE(("received msg_userauth_success"))
+	ses.authstate.authdone = 1;
+	cli_ses.state = USERAUTH_SUCCESS_RCVD;
+	cli_ses.lastauthtype = AUTH_TYPE_NONE;
+}
+
+void cli_auth_try() {
+
+	TRACE(("enter cli_auth_try"))
+	int finished = 0;
+
+	CHECKCLEARTOWRITE();
+	
+	/* Order to try is pubkey, interactive, password.
+	 * As soon as "finished" is set for one, we don't do any more. */
+#ifdef ENABLE_CLI_PUBKEY_AUTH
+	if (ses.authstate.authtypes & AUTH_TYPE_PUBKEY) {
+		finished = cli_auth_pubkey();
+		cli_ses.lastauthtype = AUTH_TYPE_PUBKEY;
+	}
+#endif
+
+#ifdef ENABLE_CLI_INTERACT_AUTH
+	if (!finished && ses.authstate.authtypes & AUTH_TYPE_INTERACT) {
+		if (cli_ses.auth_interact_failed) {
+			finished = 0;
+		} else {
+			cli_auth_interactive();
+			cli_ses.lastauthtype = AUTH_TYPE_INTERACT;
+			finished = 1;
+		}
+	}
+#endif
+
+#ifdef ENABLE_CLI_PASSWORD_AUTH
+	if (!finished && ses.authstate.authtypes & AUTH_TYPE_PASSWORD) {
+		cli_auth_password();
+		finished = 1;
+		cli_ses.lastauthtype = AUTH_TYPE_PASSWORD;
+	}
+#endif
+
+	TRACE(("cli_auth_try lastauthtype %d", cli_ses.lastauthtype))
+
+	if (!finished) {
+		dropbear_exit("No auth methods could be used.");
+	}
+
+	TRACE(("leave cli_auth_try"))
+}
+
+/* A helper for getpass() that exits if the user cancels. The returned
+ * password is statically allocated by getpass() */
+char* getpass_or_cancel()
+{
+	char* password = NULL;
+
+	password = getpass("Password: ");
+
+	/* 0x03 is a ctrl-c character in the buffer. */
+	if (password == NULL || strchr(password, '\3') != NULL) {
+		dropbear_close("Interrupted.");
+	}
+	return password;
+}
diff --git a/cli-authinteract.c b/cli-authinteract.c
new file mode 100644
index 0000000..5a169cb
--- /dev/null
+++ b/cli-authinteract.c
@@ -0,0 +1,168 @@
+/*
+ * Dropbear SSH
+ * 
+ * Copyright (c) 2005 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "buffer.h"
+#include "dbutil.h"
+#include "session.h"
+#include "ssh.h"
+#include "runopts.h"
+
+#ifdef ENABLE_CLI_INTERACT_AUTH
+
+static unsigned char* get_response(unsigned char* prompt)
+{
+	FILE* tty = NULL;
+	unsigned char* response = NULL;
+	/* not a password, but a reasonable limit */
+	char buf[DROPBEAR_MAX_CLI_PASS];
+	char* ret = NULL;
+
+	fprintf(stderr, "%s", prompt);
+
+	tty = fopen(_PATH_TTY, "r");
+	if (tty) {
+		ret = fgets(buf, sizeof(buf), tty);
+		fclose(tty);
+	} else {
+		ret = fgets(buf, sizeof(buf), stdin);
+	}
+
+	if (ret == NULL) {
+		response = (unsigned char*)m_strdup("");
+	} else {
+		unsigned int buflen = strlen(buf);
+		/* fgets includes newlines */
+		if (buflen > 0 && buf[buflen-1] == '\n')
+			buf[buflen-1] = '\0';
+		response = (unsigned char*)m_strdup(buf);
+	}
+
+	m_burn(buf, sizeof(buf));
+
+	return response;
+}
+
+void recv_msg_userauth_info_request() {
+
+	unsigned char *name = NULL;
+	unsigned char *instruction = NULL;
+	unsigned int num_prompts = 0;
+	unsigned int i;
+
+	unsigned char *prompt = NULL;
+	unsigned int echo = 0;
+	unsigned char *response = NULL;
+
+	TRACE(("enter recv_msg_recv_userauth_info_request"))
+
+	cli_ses.interact_request_received = 1;
+
+	name = buf_getstring(ses.payload, NULL);
+	instruction = buf_getstring(ses.payload, NULL);
+
+	/* language tag */
+	buf_eatstring(ses.payload);
+
+	num_prompts = buf_getint(ses.payload);
+	
+	if (num_prompts >= DROPBEAR_MAX_CLI_INTERACT_PROMPTS) {
+		dropbear_exit("Too many prompts received for keyboard-interactive");
+	}
+
+	/* we'll build the response as we go */
+	CHECKCLEARTOWRITE();
+	buf_putbyte(ses.writepayload, SSH_MSG_USERAUTH_INFO_RESPONSE);
+	buf_putint(ses.writepayload, num_prompts);
+
+	if (strlen(name) > 0) {
+		cleantext(name);
+		fprintf(stderr, "%s", name);
+		m_free(name);
+	}
+	if (strlen(instruction) > 0) {
+		cleantext(instruction);
+		fprintf(stderr, "%s", instruction);
+		m_free(instruction);
+	}
+
+	for (i = 0; i < num_prompts; i++) {
+		unsigned int response_len = 0;
+		prompt = buf_getstring(ses.payload, NULL);
+		cleantext(prompt);
+
+		echo = buf_getbool(ses.payload);
+
+		if (!echo) {
+			unsigned char* p = getpass_or_cancel(prompt);
+			response = m_strdup(p);
+			m_burn(p, strlen(p));
+		} else {
+			response = get_response(prompt);
+		}
+
+		response_len = strlen(response);
+		buf_putstring(ses.writepayload, response, response_len);
+		m_burn(response, response_len);
+		m_free(response);
+	}
+
+	encrypt_packet();
+
+
+	TRACE(("leave recv_msg_recv_userauth_info_request"))
+}
+
+void cli_auth_interactive() {
+
+	TRACE(("enter cli_auth_interactive"))
+	CHECKCLEARTOWRITE();
+
+	buf_putbyte(ses.writepayload, SSH_MSG_USERAUTH_REQUEST);
+
+	/* username */
+	buf_putstring(ses.writepayload, cli_opts.username,
+			strlen(cli_opts.username));
+
+	/* service name */
+	buf_putstring(ses.writepayload, SSH_SERVICE_CONNECTION, 
+			SSH_SERVICE_CONNECTION_LEN);
+
+	/* method */
+	buf_putstring(ses.writepayload, AUTH_METHOD_INTERACT,
+			AUTH_METHOD_INTERACT_LEN);
+
+	/* empty language tag */
+	buf_putstring(ses.writepayload, "", 0);
+
+	/* empty submethods */
+	buf_putstring(ses.writepayload, "", 0);
+
+	encrypt_packet();
+	cli_ses.interact_request_received = 0;
+
+	TRACE(("leave cli_auth_interactive"))
+
+}
+#endif	/* ENABLE_CLI_INTERACT_AUTH */
diff --git a/cli-authpasswd.c b/cli-authpasswd.c
new file mode 100644
index 0000000..5dffac4
--- /dev/null
+++ b/cli-authpasswd.c
@@ -0,0 +1,150 @@
+/*
+ * Dropbear SSH
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "buffer.h"
+#include "dbutil.h"
+#include "session.h"
+#include "ssh.h"
+#include "runopts.h"
+
+#ifdef ENABLE_CLI_PASSWORD_AUTH
+
+#ifdef ENABLE_CLI_ASKPASS_HELPER
+/* Returns 1 if we want to use the askpass program, 0 otherwise */
+static int want_askpass()
+{
+	char* askpass_prog = NULL;
+
+	askpass_prog = getenv("SSH_ASKPASS");
+	return askpass_prog && !isatty(STDIN_FILENO) && getenv("DISPLAY");
+}
+
+/* returns a statically allocated password from a helper app, or NULL
+ * on failure */
+static char *gui_getpass(const char *prompt) {
+
+	pid_t pid;
+	int p[2], maxlen, len, status;
+	static char buf[DROPBEAR_MAX_CLI_PASS + 1];
+	char* helper = NULL;
+
+	TRACE(("enter gui_getpass"))
+
+	helper = getenv("SSH_ASKPASS");
+	if (!helper)
+	{
+		TRACE(("leave gui_getpass: no askpass program"))
+		return NULL;
+	}
+
+	if (pipe(p) < 0) {
+		TRACE(("error creating child pipe"))
+		return NULL;
+	}
+
+	pid = fork();
+
+	if (pid < 0) {
+		TRACE(("fork error"))
+		return NULL;
+	}
+
+	if (!pid) {
+		/* child */
+		close(p[0]);
+		if (dup2(p[1], STDOUT_FILENO) < 0) {
+			TRACE(("error redirecting stdout"))
+			exit(1);
+		}
+		close(p[1]);
+		execlp(helper, helper, prompt, (char *)0);
+		TRACE(("execlp error"))
+		exit(1);
+	}
+
+	close(p[1]);
+	maxlen = sizeof(buf);
+	while (maxlen > 0) {
+		len = read(p[0], buf + sizeof(buf) - maxlen, maxlen);
+		if (len > 0) {
+			maxlen -= len;
+		} else {
+			if (errno != EINTR)
+				break;
+		}
+	}
+
+	close(p[0]);
+
+	while (waitpid(pid, &status, 0) < 0 && errno == EINTR)
+		;
+	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
+		return(NULL);
+
+	len = sizeof(buf) - maxlen;
+	buf[len] = '\0';
+	if (len > 0 && buf[len - 1] == '\n')
+		buf[len - 1] = '\0';
+
+	TRACE(("leave gui_getpass"))
+	return(buf);
+}
+#endif /* ENABLE_CLI_ASKPASS_HELPER */
+
+void cli_auth_password() {
+
+	char* password = NULL;
+
+	TRACE(("enter cli_auth_password"))
+	CHECKCLEARTOWRITE();
+
+#ifdef ENABLE_CLI_ASKPASS_HELPER
+	if (want_askpass())
+		password = gui_getpass("Password: ");
+	else
+#endif
+		password = getpass_or_cancel("Password: ");
+
+	buf_putbyte(ses.writepayload, SSH_MSG_USERAUTH_REQUEST);
+
+	buf_putstring(ses.writepayload, cli_opts.username,
+			strlen(cli_opts.username));
+
+	buf_putstring(ses.writepayload, SSH_SERVICE_CONNECTION, 
+			SSH_SERVICE_CONNECTION_LEN);
+
+	buf_putstring(ses.writepayload, AUTH_METHOD_PASSWORD, 
+			AUTH_METHOD_PASSWORD_LEN);
+
+	buf_putbyte(ses.writepayload, 0); /* FALSE - so says the spec */
+
+	buf_putstring(ses.writepayload, password, strlen(password));
+
+	encrypt_packet();
+	m_burn(password, strlen(password));
+
+	TRACE(("leave cli_auth_password"))
+}
+#endif	/* ENABLE_CLI_PASSWORD_AUTH */
diff --git a/cli-authpubkey.c b/cli-authpubkey.c
new file mode 100644
index 0000000..9d36bc3
--- /dev/null
+++ b/cli-authpubkey.c
@@ -0,0 +1,187 @@
+/*
+ * Dropbear SSH
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * Copyright (c) 2004 by Mihnea Stoenescu
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "buffer.h"
+#include "dbutil.h"
+#include "session.h"
+#include "ssh.h"
+#include "runopts.h"
+#include "auth.h"
+
+#ifdef ENABLE_CLI_PUBKEY_AUTH
+static void send_msg_userauth_pubkey(sign_key *key, int type, int realsign);
+
+/* Called when we receive a SSH_MSG_USERAUTH_FAILURE for a pubkey request.
+ * We use it to remove the key we tried from the list */
+void cli_pubkeyfail() {
+
+	struct SignKeyList *keyitem;
+	struct SignKeyList **previtem;
+
+	TRACE(("enter cli_pubkeyfail"))
+	previtem = &cli_opts.privkeys;
+
+	/* Find the key we failed with, and remove it */
+	for (keyitem = cli_opts.privkeys; keyitem != NULL; keyitem = keyitem->next) {
+		if (keyitem == cli_ses.lastprivkey) {
+			*previtem = keyitem->next;
+		}
+		previtem = &keyitem;
+	}
+
+	sign_key_free(cli_ses.lastprivkey->key); /* It won't be used again */
+	m_free(cli_ses.lastprivkey);
+
+	TRACE(("leave cli_pubkeyfail"))
+}
+
+void recv_msg_userauth_pk_ok() {
+
+	struct SignKeyList *keyitem;
+	buffer* keybuf;
+	char* algotype = NULL;
+	unsigned int algolen;
+	int keytype;
+	unsigned int remotelen;
+
+	TRACE(("enter recv_msg_userauth_pk_ok"))
+
+	algotype = buf_getstring(ses.payload, &algolen);
+	keytype = signkey_type_from_name(algotype, algolen);
+	TRACE(("recv_msg_userauth_pk_ok: type %d", keytype))
+	m_free(algotype);
+
+	keybuf = buf_new(MAX_PUBKEY_SIZE);
+
+	remotelen = buf_getint(ses.payload);
+
+	/* Iterate through our keys, find which one it was that matched, and
+	 * send a real request with that key */
+	for (keyitem = cli_opts.privkeys; keyitem != NULL; keyitem = keyitem->next) {
+
+		if (keyitem->type != keytype) {
+			/* Types differed */
+			TRACE(("types differed"))
+			continue;
+		}
+
+		/* Now we compare the contents of the key */
+		keybuf->pos = keybuf->len = 0;
+		buf_put_pub_key(keybuf, keyitem->key, keytype);
+		buf_setpos(keybuf, 0);
+		buf_incrpos(keybuf, 4); /* first int is the length of the remainder (ie
+								   remotelen) which has already been taken from
+								   the remote buffer */
+
+
+		if (keybuf->len-4 != remotelen) {
+			TRACE(("lengths differed: localh %d remote %d", keybuf->len, remotelen))
+			/* Lengths differed */
+			continue;
+		}
+		if (memcmp(buf_getptr(keybuf, remotelen),
+					buf_getptr(ses.payload, remotelen), remotelen) != 0) {
+			/* Data didn't match this key */
+			TRACE(("data differed"))
+			continue;
+		}
+
+		/* Success */
+		break;
+	}
+
+	if (keyitem != NULL) {
+		TRACE(("matching key"))
+		/* XXX TODO: if it's an encrypted key, here we ask for their
+		 * password */
+		send_msg_userauth_pubkey(keyitem->key, keytype, 1);
+	} else {
+		TRACE(("That was whacky. We got told that a key was valid, but it didn't match our list. Sounds like dodgy code on Dropbear's part"))
+	}
+
+	TRACE(("leave recv_msg_userauth_pk_ok"))
+}
+
+/* TODO: make it take an agent reference to use as well */
+static void send_msg_userauth_pubkey(sign_key *key, int type, int realsign) {
+
+	const char *algoname = NULL;
+	int algolen;
+	buffer* sigbuf = NULL;
+
+	TRACE(("enter send_msg_userauth_pubkey"))
+	CHECKCLEARTOWRITE();
+
+	buf_putbyte(ses.writepayload, SSH_MSG_USERAUTH_REQUEST);
+
+	buf_putstring(ses.writepayload, cli_opts.username,
+			strlen(cli_opts.username));
+
+	buf_putstring(ses.writepayload, SSH_SERVICE_CONNECTION, 
+			SSH_SERVICE_CONNECTION_LEN);
+
+	buf_putstring(ses.writepayload, AUTH_METHOD_PUBKEY, 
+			AUTH_METHOD_PUBKEY_LEN);
+
+	buf_putbyte(ses.writepayload, realsign);
+
+	algoname = signkey_name_from_type(type, &algolen);
+
+	buf_putstring(ses.writepayload, algoname, algolen);
+	buf_put_pub_key(ses.writepayload, key, type);
+
+	if (realsign) {
+		TRACE(("realsign"))
+		/* We put the signature as well - this contains string(session id), then
+		 * the contents of the write payload to this point */
+		sigbuf = buf_new(4 + SHA1_HASH_SIZE + ses.writepayload->len);
+		buf_putstring(sigbuf, ses.session_id, SHA1_HASH_SIZE);
+		buf_putbytes(sigbuf, ses.writepayload->data, ses.writepayload->len);
+		buf_put_sign(ses.writepayload, key, type, sigbuf->data, sigbuf->len);
+		buf_free(sigbuf); /* Nothing confidential in the buffer */
+	}
+
+	encrypt_packet();
+	TRACE(("leave send_msg_userauth_pubkey"))
+}
+
+int cli_auth_pubkey() {
+
+	TRACE(("enter cli_auth_pubkey"))
+
+	if (cli_opts.privkeys != NULL) {
+		/* Send a trial request */
+		send_msg_userauth_pubkey(cli_opts.privkeys->key,
+				cli_opts.privkeys->type, 0);
+		cli_ses.lastprivkey = cli_opts.privkeys;
+		TRACE(("leave cli_auth_pubkey-success"))
+		return 1;
+	} else {
+		TRACE(("leave cli_auth_pubkey-failure"))
+		return 0;
+	}
+}
+#endif /* Pubkey auth */
diff --git a/cli-channel.c b/cli-channel.c
new file mode 100644
index 0000000..1bd49ab
--- /dev/null
+++ b/cli-channel.c
@@ -0,0 +1,62 @@
+/*
+ * Dropbear SSH
+ * 
+ * Copyright (c) 2002-2004 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "channel.h"
+#include "buffer.h"
+#include "circbuffer.h"
+#include "dbutil.h"
+#include "session.h"
+#include "ssh.h"
+
+/* We receive channel data - only used by the client chansession code*/
+void recv_msg_channel_extended_data() {
+
+	struct Channel *channel;
+	unsigned int datatype;
+
+	TRACE(("enter recv_msg_channel_extended_data"))
+
+	channel = getchannel();
+	if (channel == NULL) {
+		dropbear_exit("Unknown channel");
+	}
+
+	if (channel->type != &clichansess) {
+		TRACE(("leave recv_msg_channel_extended_data: chantype is wrong"))
+		return; /* we just ignore it */
+	}
+
+	datatype = buf_getint(ses.payload);
+	
+	if (datatype != SSH_EXTENDED_DATA_STDERR) {
+		TRACE(("leave recv_msg_channel_extended_data: wrong datatype: %d",
+					datatype))
+		return;	
+	}
+
+	common_recv_msg_channel_data(channel, channel->errfd, channel->extrabuf);
+
+	TRACE(("leave recv_msg_channel_extended_data"))
+}
diff --git a/cli-chansession.c b/cli-chansession.c
new file mode 100644
index 0000000..6d358b7
--- /dev/null
+++ b/cli-chansession.c
@@ -0,0 +1,380 @@
+/*
+ * Dropbear SSH
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * Copyright (c) 2004 by Mihnea Stoenescu
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "packet.h"
+#include "buffer.h"
+#include "session.h"
+#include "dbutil.h"
+#include "channel.h"
+#include "ssh.h"
+#include "runopts.h"
+#include "termcodes.h"
+#include "chansession.h"
+
+static void cli_closechansess(struct Channel *channel);
+static int cli_initchansess(struct Channel *channel);
+static void cli_chansessreq(struct Channel *channel);
+
+static void start_channel_request(struct Channel *channel, unsigned char *type);
+
+static void send_chansess_pty_req(struct Channel *channel);
+static void send_chansess_shell_req(struct Channel *channel);
+
+static void cli_tty_setup();
+
+const struct ChanType clichansess = {
+	0, /* sepfds */
+	"session", /* name */
+	cli_initchansess, /* inithandler */
+	NULL, /* checkclosehandler */
+	cli_chansessreq, /* reqhandler */
+	cli_closechansess, /* closehandler */
+};
+
+static void cli_chansessreq(struct Channel *channel) {
+
+	unsigned char* type = NULL;
+	int wantreply;
+
+	TRACE(("enter cli_chansessreq"))
+
+	type = buf_getstring(ses.payload, NULL);
+	wantreply = buf_getbool(ses.payload);
+
+	if (strcmp(type, "exit-status") != 0) {
+		TRACE(("unknown request '%s'", type))
+		send_msg_channel_failure(channel);
+		goto out;
+	}
+		
+	/* We'll just trust what they tell us */
+	cli_ses.retval = buf_getint(ses.payload);
+	TRACE(("got exit-status of '%d'", cli_ses.retval))
+
+out:
+	m_free(type);
+}
+	
+
+/* If the main session goes, we close it up */
+static void cli_closechansess(struct Channel *UNUSED(channel)) {
+
+	/* This channel hasn't gone yet, so we have > 1 */
+	if (ses.chancount > 1) {
+		dropbear_log(LOG_INFO, "Waiting for other channels to close...");
+	}
+
+	cli_tty_cleanup(); /* Restore tty modes etc */
+
+}
+
+static void start_channel_request(struct Channel *channel, 
+		unsigned char *type) {
+
+	CHECKCLEARTOWRITE();
+	buf_putbyte(ses.writepayload, SSH_MSG_CHANNEL_REQUEST);
+	buf_putint(ses.writepayload, channel->remotechan);
+
+	buf_putstring(ses.writepayload, type, strlen(type));
+
+}
+
+/* Taken from OpenSSH's sshtty.c:
+ * RCSID("OpenBSD: sshtty.c,v 1.5 2003/09/19 17:43:35 markus Exp "); */
+static void cli_tty_setup() {
+
+	struct termios tio;
+
+	TRACE(("enter cli_pty_setup"))
+
+	if (cli_ses.tty_raw_mode == 1) {
+		TRACE(("leave cli_tty_setup: already in raw mode!"))
+		return;
+	}
+
+	if (tcgetattr(STDIN_FILENO, &tio) == -1) {
+		dropbear_exit("Failed to set raw TTY mode");
+	}
+
+	/* make a copy */
+	cli_ses.saved_tio = tio;
+
+	tio.c_iflag |= IGNPAR;
+	tio.c_iflag &= ~(ISTRIP | INLCR | IGNCR | ICRNL | IXON | IXANY | IXOFF);
+#ifdef IUCLC
+	tio.c_iflag &= ~IUCLC;
+#endif
+	tio.c_lflag &= ~(ISIG | ICANON | ECHO | ECHOE | ECHOK | ECHONL);
+#ifdef IEXTEN
+	tio.c_lflag &= ~IEXTEN;
+#endif
+	tio.c_oflag &= ~OPOST;
+	tio.c_cc[VMIN] = 1;
+	tio.c_cc[VTIME] = 0;
+	if (tcsetattr(STDIN_FILENO, TCSADRAIN, &tio) == -1) {
+		dropbear_exit("Failed to set raw TTY mode");
+	}
+
+	cli_ses.tty_raw_mode = 1;
+	TRACE(("leave cli_tty_setup"))
+}
+
+void cli_tty_cleanup() {
+
+	TRACE(("enter cli_tty_cleanup"))
+
+	if (cli_ses.tty_raw_mode == 0) {
+		TRACE(("leave cli_tty_cleanup: not in raw mode"))
+		return;
+	}
+
+	if (tcsetattr(STDIN_FILENO, TCSADRAIN, &cli_ses.saved_tio) == -1) {
+		dropbear_log(LOG_WARNING, "Failed restoring TTY");
+	} else {
+		cli_ses.tty_raw_mode = 0; 
+	}
+
+	TRACE(("leave cli_tty_cleanup"))
+}
+
+static void put_termcodes() {
+
+	TRACE(("enter put_termcodes"))
+
+	struct termios tio;
+	unsigned int sshcode;
+	const struct TermCode *termcode;
+	unsigned int value;
+	unsigned int mapcode;
+
+	unsigned int bufpos1, bufpos2;
+
+	if (tcgetattr(STDIN_FILENO, &tio) == -1) {
+		dropbear_log(LOG_WARNING, "Failed reading termmodes");
+		buf_putint(ses.writepayload, 1); /* Just the terminator */
+		buf_putbyte(ses.writepayload, 0); /* TTY_OP_END */
+		return;
+	}
+
+	bufpos1 = ses.writepayload->pos;
+	buf_putint(ses.writepayload, 0); /* A placeholder for the final length */
+
+	/* As with Dropbear server, we ignore baud rates for now */
+	for (sshcode = 1; sshcode < MAX_TERMCODE; sshcode++) {
+
+		termcode = &termcodes[sshcode];
+		mapcode = termcode->mapcode;
+
+		switch (termcode->type) {
+
+			case TERMCODE_NONE:
+				continue;
+
+			case TERMCODE_CONTROLCHAR:
+				value = tio.c_cc[mapcode];
+				break;
+
+			case TERMCODE_INPUT:
+				value = tio.c_iflag & mapcode;
+				break;
+
+			case TERMCODE_OUTPUT:
+				value = tio.c_oflag & mapcode;
+				break;
+
+			case TERMCODE_LOCAL:
+				value = tio.c_lflag & mapcode;
+				break;
+
+			case TERMCODE_CONTROL:
+				value = tio.c_cflag & mapcode;
+				break;
+
+			default:
+				continue;
+
+		}
+
+		/* If we reach here, we have something to say */
+		buf_putbyte(ses.writepayload, sshcode);
+		buf_putint(ses.writepayload, value);
+	}
+
+	buf_putbyte(ses.writepayload, 0); /* THE END, aka TTY_OP_END */
+
+	/* Put the string length at the start of the buffer */
+	bufpos2 = ses.writepayload->pos;
+
+	buf_setpos(ses.writepayload, bufpos1); /* Jump back */
+	buf_putint(ses.writepayload, bufpos2 - bufpos1 - 4); /* len(termcodes) */
+	buf_setpos(ses.writepayload, bufpos2); /* Back where we were */
+
+	TRACE(("leave put_termcodes"))
+}
+
+static void put_winsize() {
+
+	struct winsize ws;
+
+	if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) < 0) {
+		/* Some sane defaults */
+		ws.ws_row = 25;
+		ws.ws_col = 80;
+		ws.ws_xpixel = 0;
+		ws.ws_ypixel = 0;
+	}
+
+	buf_putint(ses.writepayload, ws.ws_col); /* Cols */
+	buf_putint(ses.writepayload, ws.ws_row); /* Rows */
+	buf_putint(ses.writepayload, ws.ws_xpixel); /* Width */
+	buf_putint(ses.writepayload, ws.ws_ypixel); /* Height */
+
+}
+
+static void sigwinch_handler(int UNUSED(unused)) {
+
+	cli_ses.winchange = 1;
+
+}
+
+void cli_chansess_winchange() {
+
+	unsigned int i;
+	struct Channel *channel = NULL;
+
+	for (i = 0; i < ses.chansize; i++) {
+		channel = ses.channels[i];
+		if (channel != NULL && channel->type == &clichansess) {
+			CHECKCLEARTOWRITE();
+			buf_putbyte(ses.writepayload, SSH_MSG_CHANNEL_REQUEST);
+			buf_putint(ses.writepayload, channel->remotechan);
+			buf_putstring(ses.writepayload, "window-change", 13);
+			buf_putbyte(ses.writepayload, 0); /* FALSE says the spec */
+			put_winsize();
+			encrypt_packet();
+		}
+	}
+	cli_ses.winchange = 0;
+}
+
+static void send_chansess_pty_req(struct Channel *channel) {
+
+	unsigned char* term = NULL;
+
+	TRACE(("enter send_chansess_pty_req"))
+
+	start_channel_request(channel, "pty-req");
+
+	/* Don't want replies */
+	buf_putbyte(ses.writepayload, 0);
+
+	/* Get the terminal */
+	term = getenv("TERM");
+	if (term == NULL) {
+		term = "vt100"; /* Seems a safe default */
+	}
+	buf_putstring(ses.writepayload, term, strlen(term));
+
+	/* Window size */
+	put_winsize();
+
+	/* Terminal mode encoding */
+	put_termcodes();
+
+	encrypt_packet();
+
+	/* Set up a window-change handler */
+	if (signal(SIGWINCH, sigwinch_handler) == SIG_ERR) {
+		dropbear_exit("signal error");
+	}
+	TRACE(("leave send_chansess_pty_req"))
+}
+
+static void send_chansess_shell_req(struct Channel *channel) {
+
+	unsigned char* reqtype = NULL;
+
+	TRACE(("enter send_chansess_shell_req"))
+
+	if (cli_opts.cmd) {
+		reqtype = "exec";
+	} else {
+		reqtype = "shell";
+	}
+
+	start_channel_request(channel, reqtype);
+
+	/* XXX TODO */
+	buf_putbyte(ses.writepayload, 0); /* Don't want replies */
+	if (cli_opts.cmd) {
+		buf_putstring(ses.writepayload, cli_opts.cmd, strlen(cli_opts.cmd));
+	}
+
+	encrypt_packet();
+	TRACE(("leave send_chansess_shell_req"))
+}
+
+static int cli_initchansess(struct Channel *channel) {
+
+
+	channel->writefd = STDOUT_FILENO;
+	setnonblocking(STDOUT_FILENO);
+
+	channel->readfd = STDIN_FILENO;
+	setnonblocking(STDIN_FILENO);
+
+	channel->errfd = STDERR_FILENO;
+	setnonblocking(STDERR_FILENO);
+
+	channel->extrabuf = cbuf_new(RECV_MAXWINDOW);
+
+	if (cli_opts.wantpty) {
+		send_chansess_pty_req(channel);
+	}
+
+	send_chansess_shell_req(channel);
+
+	if (cli_opts.wantpty) {
+		cli_tty_setup();
+	}
+
+	return 0; /* Success */
+
+}
+
+void cli_send_chansess_request() {
+
+	TRACE(("enter cli_send_chansess_request"))
+	if (send_msg_channel_open_init(STDIN_FILENO, &clichansess) 
+			== DROPBEAR_FAILURE) {
+		dropbear_exit("Couldn't open initial channel");
+	}
+
+	/* No special channel request data */
+	encrypt_packet();
+	TRACE(("leave cli_send_chansess_request"))
+
+}
diff --git a/cli-kex.c b/cli-kex.c
new file mode 100644
index 0000000..40d4e95
--- /dev/null
+++ b/cli-kex.c
@@ -0,0 +1,288 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002-2004 Matt Johnston
+ * Copyright (c) 2004 by Mihnea Stoenescu
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "session.h"
+#include "dbutil.h"
+#include "algo.h"
+#include "buffer.h"
+#include "session.h"
+#include "kex.h"
+#include "ssh.h"
+#include "packet.h"
+#include "bignum.h"
+#include "random.h"
+#include "runopts.h"
+#include "signkey.h"
+
+
+static void checkhostkey(unsigned char* keyblob, unsigned int keybloblen);
+#define MAX_KNOWNHOSTS_LINE 4500
+
+void send_msg_kexdh_init() {
+
+	cli_ses.dh_e = (mp_int*)m_malloc(sizeof(mp_int));
+	cli_ses.dh_x = (mp_int*)m_malloc(sizeof(mp_int));
+	m_mp_init_multi(cli_ses.dh_e, cli_ses.dh_x, NULL);
+
+	gen_kexdh_vals(cli_ses.dh_e, cli_ses.dh_x);
+
+	CHECKCLEARTOWRITE();
+	buf_putbyte(ses.writepayload, SSH_MSG_KEXDH_INIT);
+	buf_putmpint(ses.writepayload, cli_ses.dh_e);
+	encrypt_packet();
+	ses.requirenext = SSH_MSG_KEXDH_REPLY;
+}
+
+/* Handle a diffie-hellman key exchange reply. */
+void recv_msg_kexdh_reply() {
+
+	DEF_MP_INT(dh_f);
+	sign_key *hostkey = NULL;
+	unsigned int type, keybloblen;
+	unsigned char* keyblob = NULL;
+
+
+	TRACE(("enter recv_msg_kexdh_reply"))
+
+	if (cli_ses.kex_state != KEXDH_INIT_SENT) {
+		dropbear_exit("Received out-of-order kexdhreply");
+	}
+	m_mp_init(&dh_f);
+	type = ses.newkeys->algo_hostkey;
+	TRACE(("type is %d", type))
+
+	hostkey = new_sign_key();
+	keybloblen = buf_getint(ses.payload);
+
+	keyblob = buf_getptr(ses.payload, keybloblen);
+	if (!ses.kexstate.donefirstkex) {
+		/* Only makes sense the first time */
+		checkhostkey(keyblob, keybloblen);
+	}
+
+	if (buf_get_pub_key(ses.payload, hostkey, &type) != DROPBEAR_SUCCESS) {
+		TRACE(("failed getting pubkey"))
+		dropbear_exit("Bad KEX packet");
+	}
+
+	if (buf_getmpint(ses.payload, &dh_f) != DROPBEAR_SUCCESS) {
+		TRACE(("failed getting mpint"))
+		dropbear_exit("Bad KEX packet");
+	}
+
+	kexdh_comb_key(cli_ses.dh_e, cli_ses.dh_x, &dh_f, hostkey);
+	mp_clear(&dh_f);
+	mp_clear_multi(cli_ses.dh_e, cli_ses.dh_x, NULL);
+	m_free(cli_ses.dh_e);
+	m_free(cli_ses.dh_x);
+
+	if (buf_verify(ses.payload, hostkey, ses.hash, SHA1_HASH_SIZE) 
+			!= DROPBEAR_SUCCESS) {
+		dropbear_exit("Bad hostkey signature");
+	}
+
+	sign_key_free(hostkey);
+	hostkey = NULL;
+
+	send_msg_newkeys();
+	ses.requirenext = SSH_MSG_NEWKEYS;
+	TRACE(("leave recv_msg_kexdh_init"))
+}
+
+static void ask_to_confirm(unsigned char* keyblob, unsigned int keybloblen) {
+
+	char* fp = NULL;
+	FILE *tty = NULL;
+	char response = 'z';
+
+	fp = sign_key_fingerprint(keyblob, keybloblen);
+	fprintf(stderr, "\nHost '%s' is not in the trusted hosts file.\n(fingerprint %s)\nDo you want to continue connecting? (y/n)\n", 
+			cli_opts.remotehost, 
+			fp);
+
+	tty = fopen(_PATH_TTY, "r");
+	if (tty) {
+		response = getc(tty);
+		fclose(tty);
+	} else {
+		response = getc(stdin);
+	}
+
+	if (response == 'y') {
+		m_free(fp);
+		return;
+	}
+
+	dropbear_exit("Didn't validate host key");
+}
+
+static void checkhostkey(unsigned char* keyblob, unsigned int keybloblen) {
+
+	char * filename = NULL;
+	FILE *hostsfile = NULL;
+	int readonly = 0;
+	struct passwd *pw = NULL;
+	unsigned int hostlen, algolen;
+	unsigned long len;
+	const char *algoname = NULL;
+	buffer * line = NULL;
+	int ret;
+	
+	pw = getpwuid(getuid());
+
+	if (pw == NULL) {
+		dropbear_exit("Failed to get homedir");
+	}
+
+	len = strlen(pw->pw_dir);
+	filename = m_malloc(len + 18); /* "/.ssh/known_hosts" and null-terminator*/
+
+	snprintf(filename, len+18, "%s/.ssh", pw->pw_dir);
+	/* Check that ~/.ssh exists - easiest way is just to mkdir */
+	if (mkdir(filename, S_IRWXU) != 0) {
+		if (errno != EEXIST) {
+			dropbear_log(LOG_INFO, "Warning: failed creating ~/.ssh: %s",
+					strerror(errno));
+			TRACE(("mkdir didn't work: %s", strerror(errno)))
+			ask_to_confirm(keyblob, keybloblen);
+			goto out; /* only get here on success */
+		}
+	}
+
+	snprintf(filename, len+18, "%s/.ssh/known_hosts", pw->pw_dir);
+	hostsfile = fopen(filename, "a+");
+	
+	if (hostsfile != NULL) {
+		fseek(hostsfile, 0, SEEK_SET);
+	} else {
+		/* We mightn't have been able to open it if it was read-only */
+		if (errno == EACCES || errno == EROFS) {
+				TRACE(("trying readonly: %s", strerror(errno)))
+				readonly = 1;
+				hostsfile = fopen(filename, "r");
+		}
+	}
+
+	if (hostsfile == NULL) {
+		TRACE(("hostsfile didn't open: %s", strerror(errno)))
+		ask_to_confirm(keyblob, keybloblen);
+		goto out; /* We only get here on success */
+	}
+
+	line = buf_new(MAX_KNOWNHOSTS_LINE);
+	hostlen = strlen(cli_opts.remotehost);
+	algoname = signkey_name_from_type(ses.newkeys->algo_hostkey, &algolen);
+
+	do {
+		if (buf_getline(line, hostsfile) == DROPBEAR_FAILURE) {
+			TRACE(("failed reading line: prob EOF"))
+			break;
+		}
+
+		/* The line is too short to be sensible */
+		/* "30" is 'enough to hold ssh-dss plus the spaces, ie so we don't
+		 * buf_getfoo() past the end and die horribly - the base64 parsing
+		 * code is what tiptoes up to the end nicely */
+		if (line->len < (hostlen+30) ) {
+			TRACE(("line is too short to be sensible"))
+			continue;
+		}
+
+		/* Compare hostnames */
+		if (strncmp(cli_opts.remotehost, buf_getptr(line, hostlen),
+					hostlen) != 0) {
+			TRACE(("hosts don't match"))
+			continue;
+		}
+
+		buf_incrpos(line, hostlen);
+		if (buf_getbyte(line) != ' ') {
+			/* there wasn't a space after the hostname, something dodgy */
+			TRACE(("missing space afte matching hostname"))
+			continue;
+		}
+
+		if ( strncmp(buf_getptr(line, algolen), algoname, algolen) != 0) {
+			TRACE(("algo doesn't match"))
+			continue;
+		}
+
+		buf_incrpos(line, algolen);
+		if (buf_getbyte(line) != ' ') {
+			TRACE(("missing space after algo"))
+			continue;
+		}
+
+		/* Now we're at the interesting hostkey */
+		ret = cmp_base64_key(keyblob, keybloblen, algoname, algolen, line);
+
+		if (ret == DROPBEAR_SUCCESS) {
+			/* Good matching key */
+			TRACE(("good matching key"))
+			goto out;
+		}
+
+		/* The keys didn't match. eep. */
+	} while (1); /* keep going 'til something happens */
+
+	/* Key doesn't exist yet */
+	ask_to_confirm(keyblob, keybloblen);
+
+	/* If we get here, they said yes */
+
+	if (readonly) {
+		TRACE(("readonly"))
+		goto out;
+	}
+
+	/* put the new entry in the file */
+	fseek(hostsfile, 0, SEEK_END); /* In case it wasn't opened append */
+	buf_setpos(line, 0);
+	buf_setlen(line, 0);
+	buf_putbytes(line, ses.remotehost, hostlen);
+	buf_putbyte(line, ' ');
+	buf_putbytes(line, algoname, algolen);
+	buf_putbyte(line, ' ');
+	len = line->size - line->pos;
+	TRACE(("keybloblen %d, len %d", keybloblen, len))
+	/* The only failure with base64 is buffer_overflow, but buf_getwriteptr
+	 * will die horribly in the case anyway */
+	base64_encode(keyblob, keybloblen, buf_getwriteptr(line, len), &len);
+	buf_incrwritepos(line, len);
+	buf_putbyte(line, '\n');
+	buf_setpos(line, 0);
+	fwrite(buf_getptr(line, line->len), line->len, 1, hostsfile);
+	/* We ignore errors, since there's not much we can do about them */
+
+out:
+	if (hostsfile != NULL) {
+		fclose(hostsfile);
+	}
+	m_free(filename);
+	if (line != NULL) {
+		buf_free(line);
+	}
+}
diff --git a/cli-main.c b/cli-main.c
new file mode 100644
index 0000000..3f767c9
--- /dev/null
+++ b/cli-main.c
@@ -0,0 +1,112 @@
+/*
+ * Dropbear - a SSH2 server
+ * SSH client implementation
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * Copyright (c) 2004 by Mihnea Stoenescu
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "dbutil.h"
+#include "runopts.h"
+#include "session.h"
+
+static void cli_dropbear_exit(int exitcode, const char* format, va_list param);
+static void cli_dropbear_log(int priority, const char* format, va_list param);
+
+#if defined(DBMULTI_dbclient) || !defined(DROPBEAR_MULTI)
+#if defined(DBMULTI_dbclient) && defined(DROPBEAR_MULTI)
+int cli_main(int argc, char ** argv) {
+#else
+int main(int argc, char ** argv) {
+#endif
+
+	int sock;
+	char* error = NULL;
+	char* hostandport;
+	int len;
+
+	_dropbear_exit = cli_dropbear_exit;
+	_dropbear_log = cli_dropbear_log;
+
+	cli_getopts(argc, argv);
+
+	TRACE(("user='%s' host='%s' port='%s'", cli_opts.username,
+				cli_opts.remotehost, cli_opts.remoteport))
+
+	if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) {
+		dropbear_exit("signal() error");
+	}
+
+	sock = connect_remote(cli_opts.remotehost, cli_opts.remoteport, 
+			0, &error);
+
+	if (sock < 0) {
+		dropbear_exit("%s", error);
+	}
+
+	/* Set up the host:port log */
+	len = strlen(cli_opts.remotehost);
+	len += 10; /* 16 bit port and leeway*/
+	hostandport = (char*)m_malloc(len);
+	snprintf(hostandport, len, "%s:%s", 
+			cli_opts.remotehost, cli_opts.remoteport);
+
+	cli_session(sock, hostandport);
+
+	/* not reached */
+	return -1;
+}
+#endif /* DBMULTI stuff */
+
+static void cli_dropbear_exit(int exitcode, const char* format, va_list param) {
+
+	char fmtbuf[300];
+
+	if (!sessinitdone) {
+		snprintf(fmtbuf, sizeof(fmtbuf), "exited: %s",
+				format);
+	} else {
+		snprintf(fmtbuf, sizeof(fmtbuf), 
+				"connection to %s@%s:%s exited: %s", 
+				cli_opts.username, cli_opts.remotehost, 
+				cli_opts.remoteport, format);
+	}
+
+	/* Do the cleanup first, since then the terminal will be reset */
+	cli_session_cleanup();
+	common_session_cleanup();
+
+	_dropbear_log(LOG_INFO, fmtbuf, param);
+
+	exit(exitcode);
+}
+
+static void cli_dropbear_log(int UNUSED(priority), 
+		const char* format, va_list param) {
+
+	char printbuf[1024];
+
+	vsnprintf(printbuf, sizeof(printbuf), format, param);
+
+	fprintf(stderr, "%s: %s\n", cli_opts.progname, printbuf);
+
+}
diff --git a/cli-runopts.c b/cli-runopts.c
new file mode 100644
index 0000000..54d4875
--- /dev/null
+++ b/cli-runopts.c
@@ -0,0 +1,414 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "runopts.h"
+#include "signkey.h"
+#include "buffer.h"
+#include "dbutil.h"
+#include "algo.h"
+#include "tcpfwd.h"
+
+cli_runopts cli_opts; /* GLOBAL */
+
+static void printhelp();
+static void parsehostname(char* userhostarg);
+#ifdef ENABLE_CLI_PUBKEY_AUTH
+static void loadidentityfile(const char* filename);
+#endif
+#ifdef ENABLE_CLI_ANYTCPFWD
+static void addforward(char* str, struct TCPFwdList** fwdlist);
+#endif
+
+static void printhelp() {
+
+	fprintf(stderr, "Dropbear client v%s\n"
+					"Usage: %s [options] [user@]host\n"
+					"Options are:\n"
+					"-p <remoteport>\n"
+					"-l <username>\n"
+					"-t    Allocate a pty\n"
+					"-T    Don't allocate a pty\n"
+#ifdef ENABLE_CLI_PUBKEY_AUTH
+					"-i <identityfile>   (multiple allowed)\n"
+#endif
+#ifdef ENABLE_CLI_LOCALTCPFWD
+					"-L <listenport:remotehost:remoteport> Local port forwarding\n"
+					"-g    Allow remote hosts to connect to forwarded ports\n"
+#endif
+#ifdef ENABLE_CLI_REMOTETCPFWD
+					"-R <listenport:remotehost:remoteport> Remote port forwarding\n"
+#endif
+#ifdef DEBUG_TRACE
+					"-v    verbose\n"
+#endif
+					,DROPBEAR_VERSION, cli_opts.progname);
+}
+
+void cli_getopts(int argc, char ** argv) {
+
+	unsigned int i, j;
+	char ** next = 0;
+	unsigned int cmdlen;
+#ifdef ENABLE_CLI_PUBKEY_AUTH
+	int nextiskey = 0; /* A flag if the next argument is a keyfile */
+#endif
+#ifdef ENABLE_CLI_LOCALTCPFWD
+	int nextislocal = 0;
+#endif
+#ifdef ENABLE_CLI_REMOTETCPFWD
+	int nextisremote = 0;
+#endif
+	char* dummy = NULL; /* Not used for anything real */
+
+	/* see printhelp() for options */
+	cli_opts.progname = argv[0];
+	cli_opts.remotehost = NULL;
+	cli_opts.remoteport = NULL;
+	cli_opts.username = NULL;
+	cli_opts.cmd = NULL;
+	cli_opts.wantpty = 9; /* 9 means "it hasn't been touched", gets set later */
+#ifdef ENABLE_CLI_PUBKEY_AUTH
+	cli_opts.privkeys = NULL;
+#endif
+#ifdef ENABLE_CLI_LOCALTCPFWD
+	cli_opts.localfwds = NULL;
+	opts.listen_fwd_all = 0;
+#endif
+#ifdef ENABLE_CLI_REMOTETCPFWD
+	cli_opts.remotefwds = NULL;
+#endif
+	/* not yet
+	opts.ipv4 = 1;
+	opts.ipv6 = 1;
+	*/
+
+	/* Iterate all the arguments */
+	for (i = 1; i < (unsigned int)argc; i++) {
+#ifdef ENABLE_CLI_PUBKEY_AUTH
+		if (nextiskey) {
+			/* Load a hostkey since the previous argument was "-i" */
+			loadidentityfile(argv[i]);
+			nextiskey = 0;
+			continue;
+		}
+#endif
+#ifdef ENABLE_CLI_REMOTETCPFWD
+		if (nextisremote) {
+			TRACE(("nextisremote true"))
+			addforward(argv[i], &cli_opts.remotefwds);
+			nextisremote = 0;
+			continue;
+		}
+#endif
+#ifdef ENABLE_CLI_LOCALTCPFWD
+		if (nextislocal) {
+			TRACE(("nextislocal true"))
+			addforward(argv[i], &cli_opts.localfwds);
+			nextislocal = 0;
+			continue;
+		}
+#endif
+		if (next) {
+			/* The previous flag set a value to assign */
+			*next = argv[i];
+			if (*next == NULL) {
+				dropbear_exit("Invalid null argument");
+			}
+			next = NULL;
+			continue;
+		}
+
+		if (argv[i][0] == '-') {
+			/* A flag *waves* */
+
+			switch (argv[i][1]) {
+				case 'p': /* remoteport */
+					next = &cli_opts.remoteport;
+					break;
+#ifdef ENABLE_CLI_PUBKEY_AUTH
+				case 'i': /* an identityfile */
+					/* Keep scp happy when it changes "-i file" to "-ifile" */
+					if (strlen(argv[i]) > 2) {
+						loadidentityfile(&argv[i][2]);
+					} else  {
+						nextiskey = 1;
+					}
+					break;
+#endif
+				case 't': /* we want a pty */
+					cli_opts.wantpty = 1;
+					break;
+				case 'T': /* don't want a pty */
+					cli_opts.wantpty = 0;
+					break;
+#ifdef ENABLE_CLI_LOCALTCPFWD
+				case 'L':
+					nextislocal = 1;
+					break;
+				case 'g':
+					opts.listen_fwd_all = 1;
+					break;
+#endif
+#ifdef ENABLE_CLI_REMOTETCPFWD
+				case 'R':
+					nextisremote = 1;
+					break;
+#endif
+				case 'l':
+					next = &cli_opts.username;
+					break;
+				case 'h':
+					printhelp();
+					exit(EXIT_SUCCESS);
+					break;
+#ifdef DEBUG_TRACE
+				case 'v':
+					debug_trace = 1;
+					break;
+#endif
+				case 'F':
+				case 'e':
+				case 'c':
+				case 'm':
+				case 'D':
+#ifndef ENABLE_CLI_REMOTETCPFWD
+				case 'R':
+#endif
+#ifndef ENABLE_CLI_LOCALTCPFWD
+				case 'L':
+#endif
+				case 'o':
+				case 'b':
+					next = &dummy;
+				default:
+					fprintf(stderr, 
+						"WARNING: Ignoring unknown argument '%s'\n", argv[i]);
+					break;
+			} /* Switch */
+			
+			/* Now we handle args where they might be "-luser" (no spaces)*/
+			if (next && strlen(argv[i]) > 2) {
+				*next = &argv[i][2];
+				next = NULL;
+			}
+
+			continue; /* next argument */
+
+		} else {
+			TRACE(("non-flag arg: '%s'", argv[i]))
+
+			/* Either the hostname or commands */
+
+			if (cli_opts.remotehost == NULL) {
+
+				parsehostname(argv[i]);
+
+			} else {
+
+				/* this is part of the commands to send - after this we
+				 * don't parse any more options, and flags are sent as the
+				 * command */
+				cmdlen = 0;
+				for (j = i; j < (unsigned int)argc; j++) {
+					cmdlen += strlen(argv[j]) + 1; /* +1 for spaces */
+				}
+				/* Allocate the space */
+				cli_opts.cmd = (char*)m_malloc(cmdlen);
+				cli_opts.cmd[0] = '\0';
+
+				/* Append all the bits */
+				for (j = i; j < (unsigned int)argc; j++) {
+					strlcat(cli_opts.cmd, argv[j], cmdlen);
+					strlcat(cli_opts.cmd, " ", cmdlen);
+				}
+				/* It'll be null-terminated here */
+
+				/* We've eaten all the options and flags */
+				break;
+			}
+		}
+	}
+
+	if (cli_opts.remotehost == NULL) {
+		printhelp();
+		exit(EXIT_FAILURE);
+	}
+
+	if (cli_opts.remoteport == NULL) {
+		cli_opts.remoteport = "22";
+	}
+
+	/* If not explicitly specified with -t or -T, we don't want a pty if
+	 * there's a command, but we do otherwise */
+	if (cli_opts.wantpty == 9) {
+		if (cli_opts.cmd == NULL) {
+			cli_opts.wantpty = 1;
+		} else {
+			cli_opts.wantpty = 0;
+		}
+	}
+}
+
+#ifdef ENABLE_CLI_PUBKEY_AUTH
+static void loadidentityfile(const char* filename) {
+
+	struct SignKeyList * nextkey;
+	sign_key *key;
+	int keytype;
+
+	key = new_sign_key();
+	keytype = DROPBEAR_SIGNKEY_ANY;
+	if ( readhostkey(filename, key, &keytype) != DROPBEAR_SUCCESS ) {
+
+		fprintf(stderr, "Failed loading keyfile '%s'\n", filename);
+		sign_key_free(key);
+
+	} else {
+
+		nextkey = (struct SignKeyList*)m_malloc(sizeof(struct SignKeyList));
+		nextkey->key = key;
+		nextkey->next = cli_opts.privkeys;
+		nextkey->type = keytype;
+		cli_opts.privkeys = nextkey;
+	}
+}
+#endif
+
+
+/* Parses a [user@]hostname argument. userhostarg is the argv[i] corresponding
+ * - note that it will be modified */
+static void parsehostname(char* orighostarg) {
+
+	uid_t uid;
+	struct passwd *pw = NULL; 
+	char *userhostarg = NULL;
+
+	/* We probably don't want to be editing argvs */
+	userhostarg = m_strdup(orighostarg);
+
+	cli_opts.remotehost = strchr(userhostarg, '@');
+	if (cli_opts.remotehost == NULL) {
+		/* no username portion, the cli-auth.c code can figure the
+		 * local user's name */
+		cli_opts.remotehost = userhostarg;
+	} else {
+		cli_opts.remotehost[0] = '\0'; /* Split the user/host */
+		cli_opts.remotehost++;
+		cli_opts.username = userhostarg;
+	}
+
+	if (cli_opts.username == NULL) {
+		uid = getuid();
+		
+		pw = getpwuid(uid);
+		if (pw == NULL || pw->pw_name == NULL) {
+			dropbear_exit("Unknown own user");
+		}
+
+		cli_opts.username = m_strdup(pw->pw_name);
+	}
+
+	if (cli_opts.remotehost[0] == '\0') {
+		dropbear_exit("Bad hostname");
+	}
+}
+
+#ifdef ENABLE_CLI_ANYTCPFWD
+/* Turn a "listenport:remoteaddr:remoteport" string into into a forwarding
+ * set, and add it to the forwarding list */
+static void addforward(char* origstr, struct TCPFwdList** fwdlist) {
+
+	char * listenport = NULL;
+	char * connectport = NULL;
+	char * connectaddr = NULL;
+	struct TCPFwdList* newfwd = NULL;
+	char * str = NULL;
+
+	TRACE(("enter addforward"))
+
+	/* We probably don't want to be editing argvs */
+	str = m_strdup(origstr);
+
+	listenport = str;
+
+	connectaddr = strchr(str, ':');
+	if (connectaddr == NULL) {
+		TRACE(("connectaddr == NULL"))
+		goto fail;
+	}
+
+	connectaddr[0] = '\0';
+	connectaddr++;
+
+	connectport = strchr(connectaddr, ':');
+	if (connectport == NULL) {
+		TRACE(("connectport == NULL"))
+		goto fail;
+	}
+
+	connectport[0] = '\0';
+	connectport++;
+
+	newfwd = (struct TCPFwdList*)m_malloc(sizeof(struct TCPFwdList));
+
+	/* Now we check the ports - note that the port ints are unsigned,
+	 * the check later only checks for >= MAX_PORT */
+	newfwd->listenport = strtol(listenport, NULL, 10);
+	if (errno != 0) {
+		TRACE(("bad listenport strtol"))
+		goto fail;
+	}
+
+	newfwd->connectport = strtol(connectport, NULL, 10);
+	if (errno != 0) {
+		TRACE(("bad connectport strtol"))
+		goto fail;
+	}
+
+	newfwd->connectaddr = connectaddr;
+
+	if (newfwd->listenport > 65535) {
+		TRACE(("listenport > 65535"))
+		goto badport;
+	}
+		
+	if (newfwd->connectport > 65535) {
+		TRACE(("connectport > 65535"))
+		goto badport;
+	}
+
+	newfwd->next = *fwdlist;
+	*fwdlist = newfwd;
+
+	TRACE(("leave addforward: done"))
+	return;
+
+fail:
+	dropbear_exit("Bad TCP forward '%s'", origstr);
+
+badport:
+	dropbear_exit("Bad TCP port in '%s'", origstr);
+}
+#endif
diff --git a/cli-service.c b/cli-service.c
new file mode 100644
index 0000000..87b6ed2
--- /dev/null
+++ b/cli-service.c
@@ -0,0 +1,87 @@
+/*
+ * Dropbear SSH
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * Copyright (c) 2004 by Mihnea Stoenescu
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "service.h"
+#include "dbutil.h"
+#include "packet.h"
+#include "buffer.h"
+#include "session.h"
+#include "ssh.h"
+
+void send_msg_service_request(char* servicename) {
+
+	TRACE(("enter send_msg_service_request: servicename='%s'", servicename))
+
+	CHECKCLEARTOWRITE();
+
+	buf_putbyte(ses.writepayload, SSH_MSG_SERVICE_REQUEST);
+	buf_putstring(ses.writepayload, servicename, strlen(servicename));
+
+	encrypt_packet();
+	TRACE(("leave send_msg_service_request"))
+}
+
+/* This just sets up the state variables right for the main client session loop
+ * to deal with */
+void recv_msg_service_accept() {
+
+	unsigned char* servicename;
+	unsigned int len;
+
+	TRACE(("enter recv_msg_service_accept"))
+
+	servicename = buf_getstring(ses.payload, &len);
+
+	/* ssh-userauth */
+	if (cli_ses.state == SERVICE_AUTH_REQ_SENT
+			&& len == SSH_SERVICE_USERAUTH_LEN
+			&& strncmp(SSH_SERVICE_USERAUTH, servicename, len) == 0) {
+
+		cli_ses.state = SERVICE_AUTH_ACCEPT_RCVD;
+		m_free(servicename);
+		TRACE(("leave recv_msg_service_accept: done ssh-userauth"))
+		return;
+	}
+
+	/* ssh-connection */
+	if (cli_ses.state == SERVICE_CONN_REQ_SENT
+			&& len == SSH_SERVICE_CONNECTION_LEN 
+			&& strncmp(SSH_SERVICE_CONNECTION, servicename, len) == 0) {
+
+		if (ses.authstate.authdone != 1) {
+			dropbear_exit("request for connection before auth");
+		}
+
+		cli_ses.state = SERVICE_CONN_ACCEPT_RCVD;
+		m_free(servicename);
+		TRACE(("leave recv_msg_service_accept: done ssh-connection"))
+		return;
+	}
+
+	dropbear_exit("unrecognised service accept");
+	/* m_free(servicename); not reached */
+
+}
diff --git a/cli-session.c b/cli-session.c
new file mode 100644
index 0000000..35510fa
--- /dev/null
+++ b/cli-session.c
@@ -0,0 +1,303 @@
+/*
+ * Dropbear SSH
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * Copyright (c) 2004 by Mihnea Stoenescu
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "session.h"
+#include "dbutil.h"
+#include "kex.h"
+#include "ssh.h"
+#include "packet.h"
+#include "tcpfwd.h"
+#include "channel.h"
+#include "random.h"
+#include "service.h"
+#include "runopts.h"
+#include "chansession.h"
+
+static void cli_remoteclosed();
+static void cli_sessionloop();
+static void cli_session_init();
+static void cli_finished();
+
+struct clientsession cli_ses; /* GLOBAL */
+
+/* Sorted in decreasing frequency will be more efficient - data and window
+ * should be first */
+static const packettype cli_packettypes[] = {
+	/* TYPE, FUNCTION */
+	{SSH_MSG_CHANNEL_DATA, recv_msg_channel_data},
+	{SSH_MSG_CHANNEL_EXTENDED_DATA, recv_msg_channel_extended_data},
+	{SSH_MSG_CHANNEL_WINDOW_ADJUST, recv_msg_channel_window_adjust},
+	{SSH_MSG_USERAUTH_FAILURE, recv_msg_userauth_failure}, /* client */
+	{SSH_MSG_USERAUTH_SUCCESS, recv_msg_userauth_success}, /* client */
+	{SSH_MSG_KEXINIT, recv_msg_kexinit},
+	{SSH_MSG_KEXDH_REPLY, recv_msg_kexdh_reply}, /* client */
+	{SSH_MSG_NEWKEYS, recv_msg_newkeys},
+	{SSH_MSG_SERVICE_ACCEPT, recv_msg_service_accept}, /* client */
+	{SSH_MSG_CHANNEL_REQUEST, recv_msg_channel_request},
+	{SSH_MSG_CHANNEL_OPEN, recv_msg_channel_open},
+	{SSH_MSG_CHANNEL_EOF, recv_msg_channel_eof},
+	{SSH_MSG_CHANNEL_CLOSE, recv_msg_channel_close},
+	{SSH_MSG_CHANNEL_OPEN_CONFIRMATION, recv_msg_channel_open_confirmation},
+	{SSH_MSG_CHANNEL_OPEN_FAILURE, recv_msg_channel_open_failure},
+	{SSH_MSG_USERAUTH_BANNER, recv_msg_userauth_banner}, /* client */
+	{SSH_MSG_USERAUTH_SPECIFIC_60, recv_msg_userauth_specific_60}, /* client */
+	{0, 0} /* End */
+};
+
+static const struct ChanType *cli_chantypes[] = {
+#ifdef ENABLE_CLI_REMOTETCPFWD
+	&cli_chan_tcpremote,
+#endif
+	NULL /* Null termination */
+};
+
+void cli_session(int sock, char* remotehost) {
+
+	seedrandom();
+
+	crypto_init();
+
+	common_session_init(sock, remotehost);
+
+	chaninitialise(cli_chantypes);
+
+	/* Set up cli_ses vars */
+	cli_session_init();
+
+	/* Ready to go */
+	sessinitdone = 1;
+
+	/* Exchange identification */
+	session_identification();
+
+	send_msg_kexinit();
+
+	session_loop(cli_sessionloop);
+
+	/* Not reached */
+
+}
+
+static void cli_session_init() {
+
+	cli_ses.state = STATE_NOTHING;
+	cli_ses.kex_state = KEX_NOTHING;
+
+	cli_ses.tty_raw_mode = 0;
+	cli_ses.winchange = 0;
+
+	/* We store std{in,out,err}'s flags, so we can set them back on exit
+	 * (otherwise busybox's ash isn't happy */
+	cli_ses.stdincopy = dup(STDIN_FILENO);
+	cli_ses.stdinflags = fcntl(STDIN_FILENO, F_GETFL, 0);
+	cli_ses.stdoutcopy = dup(STDOUT_FILENO);
+	cli_ses.stdoutflags = fcntl(STDOUT_FILENO, F_GETFL, 0);
+	cli_ses.stderrcopy = dup(STDERR_FILENO);
+	cli_ses.stderrflags = fcntl(STDERR_FILENO, F_GETFL, 0);
+
+	cli_ses.retval = EXIT_SUCCESS; /* Assume it's clean if we don't get a
+									  specific exit status */
+
+	/* Auth */
+	cli_ses.lastprivkey = NULL;
+	cli_ses.lastauthtype = 0;
+
+	/* For printing "remote host closed" for the user */
+	ses.remoteclosed = cli_remoteclosed;
+	ses.buf_match_algo = cli_buf_match_algo;
+
+	/* packet handlers */
+	ses.packettypes = cli_packettypes;
+
+	ses.isserver = 0;
+}
+
+/* This function drives the progress of the session - it initiates KEX,
+ * service, userauth and channel requests */
+static void cli_sessionloop() {
+
+	TRACE(("enter cli_sessionloop"))
+
+	if (ses.lastpacket == SSH_MSG_KEXINIT && cli_ses.kex_state == KEX_NOTHING) {
+		cli_ses.kex_state = KEXINIT_RCVD;
+	}
+
+	if (cli_ses.kex_state == KEXINIT_RCVD) {
+
+		/* We initiate the KEXDH. If DH wasn't the correct type, the KEXINIT
+		 * negotiation would have failed. */
+		send_msg_kexdh_init();
+		cli_ses.kex_state = KEXDH_INIT_SENT;
+		TRACE(("leave cli_sessionloop: done with KEXINIT_RCVD"))
+		return;
+	}
+
+	/* A KEX has finished, so we should go back to our KEX_NOTHING state */
+	if (cli_ses.kex_state != KEX_NOTHING && ses.kexstate.recvkexinit == 0
+			&& ses.kexstate.sentkexinit == 0) {
+		cli_ses.kex_state = KEX_NOTHING;
+	}
+
+	/* We shouldn't do anything else if a KEX is in progress */
+	if (cli_ses.kex_state != KEX_NOTHING) {
+		TRACE(("leave cli_sessionloop: kex_state != KEX_NOTHING"))
+		return;
+	}
+
+	/* We should exit if we haven't donefirstkex: we shouldn't reach here
+	 * in normal operation */
+	if (ses.kexstate.donefirstkex == 0) {
+		TRACE(("XXX XXX might be bad! leave cli_sessionloop: haven't donefirstkex"))
+		return;
+	}
+
+	switch (cli_ses.state) {
+
+		case STATE_NOTHING:
+			/* We've got the transport layer sorted, we now need to request
+			 * userauth */
+			send_msg_service_request(SSH_SERVICE_USERAUTH);
+			cli_ses.state = SERVICE_AUTH_REQ_SENT;
+			TRACE(("leave cli_sessionloop: sent userauth service req"))
+			return;
+
+		/* userauth code */
+		case SERVICE_AUTH_ACCEPT_RCVD:
+			cli_auth_getmethods();
+			cli_ses.state = USERAUTH_REQ_SENT;
+			TRACE(("leave cli_sessionloop: sent userauth methods req"))
+			return;
+			
+		case USERAUTH_FAIL_RCVD:
+			cli_auth_try();
+			cli_ses.state = USERAUTH_REQ_SENT;
+			TRACE(("leave cli_sessionloop: cli_auth_try"))
+			return;
+
+			/*
+		case USERAUTH_SUCCESS_RCVD:
+			send_msg_service_request(SSH_SERVICE_CONNECTION);
+			cli_ses.state = SERVICE_CONN_REQ_SENT;
+			TRACE(("leave cli_sessionloop: sent ssh-connection service req"))
+			return;
+
+		case SERVICE_CONN_ACCEPT_RCVD:
+			cli_send_chansess_request();
+			TRACE(("leave cli_sessionloop: cli_send_chansess_request"))
+			cli_ses.state = SESSION_RUNNING;
+			return;
+			*/
+
+		case USERAUTH_SUCCESS_RCVD:
+#ifdef ENABLE_CLI_LOCALTCPFWD
+			setup_localtcp();
+#endif
+#ifdef ENABLE_CLI_REMOTETCPFWD
+			setup_remotetcp();
+#endif
+			cli_send_chansess_request();
+			TRACE(("leave cli_sessionloop: cli_send_chansess_request"))
+			cli_ses.state = SESSION_RUNNING;
+			return;
+
+		case SESSION_RUNNING:
+			if (ses.chancount < 1) {
+				cli_finished();
+			}
+
+			if (cli_ses.winchange) {
+				cli_chansess_winchange();
+			}
+			return;
+
+		/* XXX more here needed */
+
+
+	default:
+		break;
+	}
+
+	TRACE(("leave cli_sessionloop: fell out"))
+
+}
+
+void cli_session_cleanup() {
+
+	if (!sessinitdone) {
+		return;
+	}
+
+	/* Set std{in,out,err} back to non-blocking - busybox ash dies nastily if
+	 * we don't revert the flags */
+	fcntl(cli_ses.stdincopy, F_SETFL, cli_ses.stdinflags);
+	fcntl(cli_ses.stdoutcopy, F_SETFL, cli_ses.stdoutflags);
+	fcntl(cli_ses.stderrcopy, F_SETFL, cli_ses.stderrflags);
+
+	cli_tty_cleanup();
+
+}
+
+static void cli_finished() {
+
+	cli_session_cleanup();
+	common_session_cleanup();
+	fprintf(stderr, "Connection to %s@%s:%s closed.\n", cli_opts.username,
+			cli_opts.remotehost, cli_opts.remoteport);
+	exit(cli_ses.retval);
+}
+
+
+/* called when the remote side closes the connection */
+static void cli_remoteclosed() {
+
+	/* XXX TODO perhaps print a friendlier message if we get this but have
+	 * already sent/received disconnect message(s) ??? */
+	close(ses.sock);
+	ses.sock = -1;
+	dropbear_exit("remote closed the connection");
+}
+
+/* Operates in-place turning dirty (untrusted potentially containing control
+ * characters) text into clean text. 
+ * Note: this is safe only with ascii - other charsets could have problems. */
+void cleantext(unsigned char* dirtytext) {
+
+	unsigned int i, j;
+	unsigned char c;
+
+	j = 0;
+	for (i = 0; dirtytext[i] != '\0'; i++) {
+
+		c = dirtytext[i];
+		/* We can ignore '\r's */
+		if ( (c >= ' ' && c <= '~') || c == '\n' || c == '\t') {
+			dirtytext[j] = c;
+			j++;
+		}
+	}
+	/* Null terminate */
+	dirtytext[j] = '\0';
+}
diff --git a/cli-tcpfwd.c b/cli-tcpfwd.c
new file mode 100644
index 0000000..c3bfd4d
--- /dev/null
+++ b/cli-tcpfwd.c
@@ -0,0 +1,216 @@
+/*
+ * Dropbear SSH
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "options.h"
+#include "dbutil.h"
+#include "tcpfwd.h"
+#include "channel.h"
+#include "runopts.h"
+#include "session.h"
+#include "ssh.h"
+
+#ifdef ENABLE_CLI_REMOTETCPFWD
+static int newtcpforwarded(struct Channel * channel);
+
+const struct ChanType cli_chan_tcpremote = {
+	1, /* sepfds */
+	"forwarded-tcpip",
+	newtcpforwarded,
+	NULL,
+	NULL,
+	NULL
+};
+#endif
+
+#ifdef ENABLE_CLI_LOCALTCPFWD
+static int cli_localtcp(unsigned int listenport, const char* remoteaddr,
+		unsigned int remoteport);
+static const struct ChanType cli_chan_tcplocal = {
+	1, /* sepfds */
+	"direct-tcpip",
+	NULL,
+	NULL,
+	NULL,
+	NULL
+};
+#endif
+
+#ifdef ENABLE_CLI_LOCALTCPFWD
+void setup_localtcp() {
+
+	int ret;
+
+	TRACE(("enter setup_localtcp"))
+
+	if (cli_opts.localfwds == NULL) {
+		TRACE(("cli_opts.localfwds == NULL"))
+	}
+
+	while (cli_opts.localfwds != NULL) {
+		ret = cli_localtcp(cli_opts.localfwds->listenport,
+				cli_opts.localfwds->connectaddr,
+				cli_opts.localfwds->connectport);
+		if (ret == DROPBEAR_FAILURE) {
+			dropbear_log(LOG_WARNING, "Failed local port forward %d:%s:%d",
+					cli_opts.localfwds->listenport,
+					cli_opts.localfwds->connectaddr,
+					cli_opts.localfwds->connectport);
+		}
+
+		cli_opts.localfwds = cli_opts.localfwds->next;
+	}
+	TRACE(("leave setup_localtcp"))
+
+}
+
+static int cli_localtcp(unsigned int listenport, const char* remoteaddr,
+		unsigned int remoteport) {
+
+	struct TCPListener* tcpinfo = NULL;
+	int ret;
+
+	TRACE(("enter cli_localtcp: %d %s %d", listenport, remoteaddr,
+				remoteport));
+
+	tcpinfo = (struct TCPListener*)m_malloc(sizeof(struct TCPListener));
+
+	tcpinfo->sendaddr = m_strdup(remoteaddr);
+	tcpinfo->sendport = remoteport;
+
+	if (opts.listen_fwd_all) {
+		tcpinfo->listenaddr = m_strdup("");
+	} else {
+		tcpinfo->listenaddr = m_strdup("localhost");
+	}
+	tcpinfo->listenport = listenport;
+
+	tcpinfo->chantype = &cli_chan_tcplocal;
+	tcpinfo->tcp_type = direct;
+
+	ret = listen_tcpfwd(tcpinfo);
+
+	if (ret == DROPBEAR_FAILURE) {
+		m_free(tcpinfo);
+	}
+	TRACE(("leave cli_localtcp: %d", ret))
+	return ret;
+}
+#endif /* ENABLE_CLI_LOCALTCPFWD */
+
+#ifdef  ENABLE_CLI_REMOTETCPFWD
+static void send_msg_global_request_remotetcp(int port) {
+
+	char* listenspec = NULL;
+	TRACE(("enter send_msg_global_request_remotetcp"))
+
+	CHECKCLEARTOWRITE();
+	buf_putbyte(ses.writepayload, SSH_MSG_GLOBAL_REQUEST);
+	buf_putstring(ses.writepayload, "tcpip-forward", 13);
+	buf_putbyte(ses.writepayload, 0);
+	if (opts.listen_fwd_all) {
+		listenspec = "";
+	} else {
+		listenspec = "localhost";
+	}
+	/* TODO: IPv6? */;
+	buf_putstring(ses.writepayload, listenspec, strlen(listenspec));
+	buf_putint(ses.writepayload, port);
+
+	encrypt_packet();
+
+	TRACE(("leave send_msg_global_request_remotetcp"))
+}
+
+void setup_remotetcp() {
+
+	struct TCPFwdList * iter = NULL;
+
+	TRACE(("enter setup_remotetcp"))
+
+	if (cli_opts.remotefwds == NULL) {
+		TRACE(("cli_opts.remotefwds == NULL"))
+	}
+
+	iter = cli_opts.remotefwds;
+
+	while (iter != NULL) {
+		send_msg_global_request_remotetcp(iter->listenport);
+		iter = iter->next;
+	}
+	TRACE(("leave setup_remotetcp"))
+}
+
+static int newtcpforwarded(struct Channel * channel) {
+
+	unsigned int origport;
+	struct TCPFwdList * iter = NULL;
+	char portstring[NI_MAXSERV];
+	int sock;
+	int err = SSH_OPEN_ADMINISTRATIVELY_PROHIBITED;
+
+	/* We don't care what address they connected to */
+	buf_eatstring(ses.payload);
+
+	origport = buf_getint(ses.payload);
+
+	/* Find which port corresponds */
+	iter = cli_opts.remotefwds;
+
+	while (iter != NULL) {
+		if (origport == iter->listenport) {
+			break;
+		}
+		iter = iter->next;
+	}
+
+	if (iter == NULL) {
+		/* We didn't request forwarding on that port */
+		dropbear_log(LOG_INFO, "Server send unrequested port, from port %d", 
+										origport);
+		goto out;
+	}
+	
+	snprintf(portstring, sizeof(portstring), "%d", iter->connectport);
+	sock = connect_remote(iter->connectaddr, portstring, 1, NULL);
+	if (sock < 0) {
+		TRACE(("leave newtcpdirect: sock failed"))
+		err = SSH_OPEN_CONNECT_FAILED;
+		goto out;
+	}
+
+	ses.maxfd = MAX(ses.maxfd, sock);
+
+	/* We don't set readfd, that will get set after the connection's
+	 * progress succeeds */
+	channel->writefd = sock;
+	channel->initconn = 1;
+	
+	err = SSH_OPEN_IN_PROGRESS;
+
+out:
+	TRACE(("leave newtcpdirect: err %d", err))
+	return err;
+}
+#endif /* ENABLE_CLI_REMOTETCPFWD */
diff --git a/common-algo.c b/common-algo.c
new file mode 100644
index 0000000..ae2102a
--- /dev/null
+++ b/common-algo.c
@@ -0,0 +1,228 @@
+/*
+ * Dropbear SSH
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * Copyright (c) 2004 by Mihnea Stoenescu
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "algo.h"
+#include "dbutil.h"
+
+/* This file (algo.c) organises the ciphers which can be used, and is used to
+ * decide which ciphers/hashes/compression/signing to use during key exchange*/
+
+/* Mappings for ciphers, parameters are
+   {&cipher_desc, keysize, blocksize} */
+
+#ifdef DROPBEAR_AES256_CBC
+static const struct dropbear_cipher dropbear_aes256 = 
+	{&aes_desc, 32, 16};
+#endif
+#ifdef DROPBEAR_AES128_CBC
+static const struct dropbear_cipher dropbear_aes128 = 
+	{&aes_desc, 16, 16};
+#endif
+#ifdef DROPBEAR_BLOWFISH_CBC
+static const struct dropbear_cipher dropbear_blowfish = 
+	{&blowfish_desc, 16, 8};
+#endif
+#ifdef DROPBEAR_TWOFISH256_CBC
+static const struct dropbear_cipher dropbear_twofish256 = 
+	{&twofish_desc, 32, 16};
+#endif
+#ifdef DROPBEAR_TWOFISH128_CBC
+static const struct dropbear_cipher dropbear_twofish128 = 
+	{&twofish_desc, 16, 16};
+#endif
+#ifdef DROPBEAR_3DES_CBC
+static const struct dropbear_cipher dropbear_3des = 
+	{&des3_desc, 24, 8};
+#endif
+
+/* used to indicate no encryption, as defined in rfc2410 */
+const struct dropbear_cipher dropbear_nocipher =
+	{NULL, 16, 8}; 
+
+/* Mapping of ssh hashes to libtomcrypt hashes, including keysize etc.
+   {&hash_desc, keysize, hashsize} */
+
+#ifdef DROPBEAR_SHA1_HMAC
+static const struct dropbear_hash dropbear_sha1 = 
+	{&sha1_desc, 20, 20};
+#endif
+#ifdef DROPBEAR_SHA1_96_HMAC
+static const struct dropbear_hash dropbear_sha1_96 = 
+	{&sha1_desc, 20, 12};
+#endif
+#ifdef DROPBEAR_MD5_HMAC
+static const struct dropbear_hash dropbear_md5 = 
+	{&md5_desc, 16, 16};
+#endif
+
+const struct dropbear_hash dropbear_nohash =
+	{NULL, 16, 0}; /* used initially */
+	
+
+/* The following map ssh names to internal values */
+
+algo_type sshciphers[] = {
+#ifdef DROPBEAR_AES128_CBC
+	{"aes128-cbc", 0, (void*)&dropbear_aes128, 1},
+#endif
+#ifdef DROPBEAR_3DES_CBC
+	{"3des-cbc", 0, (void*)&dropbear_3des, 1},
+#endif
+#ifdef DROPBEAR_AES256_CBC
+	{"aes256-cbc", 0, (void*)&dropbear_aes256, 1},
+#endif
+#ifdef DROPBEAR_TWOFISH256_CBC
+	{"twofish256-cbc", 0, (void*)&dropbear_twofish256, 1},
+	{"twofish-cbc", 0, (void*)&dropbear_twofish256, 1},
+#endif
+#ifdef DROPBEAR_TWOFISH128_CBC
+	{"twofish128-cbc", 0, (void*)&dropbear_twofish128, 1},
+#endif
+#ifdef DROPBEAR_BLOWFISH_CBC
+	{"blowfish-cbc", 0, (void*)&dropbear_blowfish, 1},
+#endif
+	{NULL, 0, NULL, 0}
+};
+
+algo_type sshhashes[] = {
+#ifdef DROPBEAR_SHA1_96_HMAC
+	{"hmac-sha1-96", 0, (void*)&dropbear_sha1_96, 1},
+#endif
+#ifdef DROPBEAR_SHA1_HMAC
+	{"hmac-sha1", 0, (void*)&dropbear_sha1, 1},
+#endif
+#ifdef DROPBEAR_MD5_HMAC
+	{"hmac-md5", 0, (void*)&dropbear_md5, 1},
+#endif
+	{NULL, 0, NULL, 0}
+};
+
+algo_type sshcompress[] = {
+#ifndef DISABLE_ZLIB
+	{"zlib", DROPBEAR_COMP_ZLIB, NULL, 1},
+#endif
+	{"none", DROPBEAR_COMP_NONE, NULL, 1},
+	{NULL, 0, NULL, 0}
+};
+
+algo_type sshhostkey[] = {
+#ifdef DROPBEAR_RSA
+	{"ssh-rsa", DROPBEAR_SIGNKEY_RSA, NULL, 1},
+#endif
+#ifdef DROPBEAR_DSS
+	{"ssh-dss", DROPBEAR_SIGNKEY_DSS, NULL, 1},
+#endif
+	{NULL, 0, NULL, 0}
+};
+
+algo_type sshkex[] = {
+	{"diffie-hellman-group1-sha1", DROPBEAR_KEX_DH_GROUP1, NULL, 1},
+	{NULL, 0, NULL, 0}
+};
+
+
+/* Register the compiled in ciphers.
+ * This should be run before using any of the ciphers/hashes */
+void crypto_init() {
+
+	const struct ltc_cipher_descriptor *regciphers[] = {
+#ifdef DROPBEAR_AES_CBC
+		&aes_desc,
+#endif
+#ifdef DROPBEAR_BLOWFISH_CBC
+		&blowfish_desc,
+#endif
+#ifdef DROPBEAR_TWOFISH_CBC
+		&twofish_desc,
+#endif
+#ifdef DROPBEAR_3DES_CBC
+		&des3_desc,
+#endif
+		NULL
+	};
+
+	const struct ltc_hash_descriptor *reghashes[] = {
+		/* we need sha1 for hostkey stuff regardless */
+		&sha1_desc,
+#ifdef DROPBEAR_MD5_HMAC
+		&md5_desc,
+#endif
+		NULL
+	};	
+	int i;
+	
+	for (i = 0; regciphers[i] != NULL; i++) {
+		if (register_cipher(regciphers[i]) == -1) {
+			dropbear_exit("error registering crypto");
+		}
+	}
+
+	for (i = 0; reghashes[i] != NULL; i++) {
+		if (register_hash(reghashes[i]) == -1) {
+			dropbear_exit("error registering crypto");
+		}
+	}
+}
+
+/* algolen specifies the length of algo, algos is our local list to match
+ * against.
+ * Returns DROPBEAR_SUCCESS if we have a match for algo, DROPBEAR_FAILURE
+ * otherwise */
+int have_algo(char* algo, size_t algolen, algo_type algos[]) {
+
+	int i;
+
+	for (i = 0; algos[i].name != NULL; i++) {
+		if (strlen(algos[i].name) == algolen
+				&& (strncmp(algos[i].name, algo, algolen) == 0)) {
+			return DROPBEAR_SUCCESS;
+		}
+	}
+
+	return DROPBEAR_FAILURE;
+}
+
+
+
+/* Output a comma separated list of algorithms to a buffer */
+void buf_put_algolist(buffer * buf, algo_type localalgos[]) {
+
+	unsigned int i, len;
+	unsigned int donefirst = 0;
+	buffer *algolist = NULL;
+
+	algolist = buf_new(100);
+	for (i = 0; localalgos[i].name != NULL; i++) {
+		if (localalgos[i].usable) {
+			if (donefirst)
+				buf_putbyte(algolist, ',');
+			donefirst = 1;
+			len = strlen(localalgos[i].name);
+			buf_putbytes(algolist, localalgos[i].name, len);
+		}
+	}
+	buf_putstring(buf, algolist->data, algolist->len);
+	buf_free(algolist);
+}
diff --git a/common-channel.c b/common-channel.c
new file mode 100644
index 0000000..68d2b48
--- /dev/null
+++ b/common-channel.c
@@ -0,0 +1,1038 @@
+/*
+ * Dropbear SSH
+ * 
+ * Copyright (c) 2002-2004 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+/* Handle the multiplexed channels, such as sessions, x11, agent connections */
+
+#include "includes.h"
+#include "session.h"
+#include "packet.h"
+#include "ssh.h"
+#include "buffer.h"
+#include "circbuffer.h"
+#include "dbutil.h"
+#include "channel.h"
+#include "ssh.h"
+#include "listener.h"
+
+static void send_msg_channel_open_failure(unsigned int remotechan, int reason,
+		const unsigned char *text, const unsigned char *lang);
+static void send_msg_channel_open_confirmation(struct Channel* channel,
+		unsigned int recvwindow, 
+		unsigned int recvmaxpacket);
+static void writechannel(struct Channel* channel, int fd, circbuffer *cbuf);
+static void send_msg_channel_window_adjust(struct Channel *channel, 
+		unsigned int incr);
+static void send_msg_channel_data(struct Channel *channel, int isextended,
+		unsigned int exttype);
+static void send_msg_channel_eof(struct Channel *channel);
+static void send_msg_channel_close(struct Channel *channel);
+static void removechannel(struct Channel *channel);
+static void deletechannel(struct Channel *channel);
+static void checkinitdone(struct Channel *channel);
+static void checkclose(struct Channel *channel);
+
+static void closewritefd(struct Channel * channel);
+static void closereadfd(struct Channel * channel, int fd);
+static void closechanfd(struct Channel *channel, int fd, int how);
+
+#define FD_UNINIT (-2)
+#define FD_CLOSED (-1)
+
+/* Initialise all the channels */
+void chaninitialise(const struct ChanType *chantypes[]) {
+
+	/* may as well create space for a single channel */
+	ses.channels = (struct Channel**)m_malloc(sizeof(struct Channel*));
+	ses.chansize = 1;
+	ses.channels[0] = NULL;
+	ses.chancount = 0;
+
+	ses.chantypes = chantypes;
+
+#ifdef USING_LISTENERS
+	listeners_initialise();
+#endif
+
+}
+
+/* Clean up channels, freeing allocated memory */
+void chancleanup() {
+
+	unsigned int i;
+
+	TRACE(("enter chancleanup"))
+	for (i = 0; i < ses.chansize; i++) {
+		if (ses.channels[i] != NULL) {
+			TRACE(("channel %d closing", i))
+			removechannel(ses.channels[i]);
+		}
+	}
+	m_free(ses.channels);
+	TRACE(("leave chancleanup"))
+}
+
+/* Create a new channel entry, send a reply confirm or failure */
+/* If remotechan, transwindow and transmaxpacket are not know (for a new
+ * outgoing connection, with them to be filled on confirmation), they should
+ * all be set to 0 */
+struct Channel* newchannel(unsigned int remotechan, 
+		const struct ChanType *type, 
+		unsigned int transwindow, unsigned int transmaxpacket) {
+
+	struct Channel * newchan;
+	unsigned int i, j;
+
+	TRACE(("enter newchannel"))
+	
+	/* first see if we can use existing channels */
+	for (i = 0; i < ses.chansize; i++) {
+		if (ses.channels[i] == NULL) {
+			break;
+		}
+	}
+
+	/* otherwise extend the list */
+	if (i == ses.chansize) {
+		if (ses.chansize >= MAX_CHANNELS) {
+			TRACE(("leave newchannel: max chans reached"))
+			return NULL;
+		}
+
+		/* extend the channels */
+		ses.channels = (struct Channel**)m_realloc(ses.channels,
+				(ses.chansize+CHAN_EXTEND_SIZE)*sizeof(struct Channel*));
+
+		ses.chansize += CHAN_EXTEND_SIZE;
+
+		/* set the new channels to null */
+		for (j = i; j < ses.chansize; j++) {
+			ses.channels[j] = NULL;
+		}
+
+	}
+	
+	newchan = (struct Channel*)m_malloc(sizeof(struct Channel));
+	newchan->type = type;
+	newchan->index = i;
+	newchan->sentclosed = newchan->recvclosed = 0;
+	newchan->senteof = newchan->recveof = 0;
+
+	newchan->remotechan = remotechan;
+	newchan->transwindow = transwindow;
+	newchan->transmaxpacket = transmaxpacket;
+	
+	newchan->typedata = NULL;
+	newchan->writefd = FD_UNINIT;
+	newchan->readfd = FD_UNINIT;
+	newchan->errfd = FD_CLOSED; /* this isn't always set to start with */
+	newchan->initconn = 0;
+	newchan->await_open = 0;
+
+	newchan->writebuf = cbuf_new(RECV_MAXWINDOW);
+	newchan->extrabuf = NULL; /* The user code can set it up */
+	newchan->recvwindow = RECV_MAXWINDOW;
+	newchan->recvdonelen = 0;
+	newchan->recvmaxpacket = RECV_MAXPACKET;
+
+	ses.channels[i] = newchan;
+	ses.chancount++;
+
+	TRACE(("leave newchannel"))
+
+	return newchan;
+}
+
+/* Returns the channel structure corresponding to the channel in the current
+ * data packet (ses.payload must be positioned appropriately) */
+struct Channel* getchannel() {
+
+	unsigned int chan;
+
+	chan = buf_getint(ses.payload);
+	if (chan >= ses.chansize || ses.channels[chan] == NULL) {
+		return NULL;
+	}
+	return ses.channels[chan];
+}
+
+/* Iterate through the channels, performing IO if available */
+void channelio(fd_set *readfds, fd_set *writefds) {
+
+	struct Channel *channel;
+	unsigned int i;
+
+	/* iterate through all the possible channels */
+	for (i = 0; i < ses.chansize; i++) {
+
+		channel = ses.channels[i];
+		if (channel == NULL) {
+			/* only process in-use channels */
+			continue;
+		}
+
+		/* read data and send it over the wire */
+		if (channel->readfd >= 0 && FD_ISSET(channel->readfd, readfds)) {
+			send_msg_channel_data(channel, 0, 0);
+		}
+
+		/* read stderr data and send it over the wire */
+		if (channel->extrabuf == NULL &&
+				channel->errfd >= 0 && FD_ISSET(channel->errfd, readfds)) {
+				send_msg_channel_data(channel, 1, SSH_EXTENDED_DATA_STDERR);
+		}
+
+		/* write to program/pipe stdin */
+		if (channel->writefd >= 0 && FD_ISSET(channel->writefd, writefds)) {
+			if (channel->initconn) {
+				checkinitdone(channel);
+				continue; /* Important not to use the channel after
+							 checkinitdone(), as it may be NULL */
+			}
+			writechannel(channel, channel->writefd, channel->writebuf);
+		}
+		
+		/* stderr for client mode */
+		if (channel->extrabuf != NULL 
+				&& channel->errfd >= 0 && FD_ISSET(channel->errfd, writefds)) {
+			writechannel(channel, channel->errfd, channel->extrabuf);
+		}
+	
+		/* now handle any of the channel-closing type stuff */
+		checkclose(channel);
+
+	} /* foreach channel */
+
+	/* Listeners such as TCP, X11, agent-auth */
+#ifdef USING_LISTENERS
+	handle_listeners(readfds);
+#endif
+}
+
+
+/* do all the EOF/close type stuff checking for a channel */
+static void checkclose(struct Channel *channel) {
+
+	TRACE(("checkclose: writefd %d, readfd %d, errfd %d, sentclosed %d, recvclosed %d",
+				channel->writefd, channel->readfd,
+				channel->errfd, channel->sentclosed, channel->recvclosed))
+	TRACE(("writebuf %d extrabuf %s extrabuf %d",
+				cbuf_getused(channel->writebuf),
+				channel->writebuf,
+				channel->writebuf ? 0 : cbuf_getused(channel->extrabuf)))
+
+	if (!channel->sentclosed) {
+
+		/* check for exited - currently only used for server sessions,
+		 * if the shell has exited etc */
+		if (channel->type->checkclose) {
+			if (channel->type->checkclose(channel)) {
+				closewritefd(channel);
+			}
+		}
+
+		if (!channel->senteof
+			&& channel->readfd == FD_CLOSED 
+			&& (channel->extrabuf != NULL || channel->errfd == FD_CLOSED)) {
+			send_msg_channel_eof(channel);
+		}
+
+		if (channel->writefd == FD_CLOSED
+			&& channel->readfd == FD_CLOSED
+			&& (channel->extrabuf != NULL || channel->errfd == FD_CLOSED)) {
+			send_msg_channel_close(channel);
+		}
+	}
+
+	/* When either party wishes to terminate the channel, it sends
+	 * SSH_MSG_CHANNEL_CLOSE.  Upon receiving this message, a party MUST
+	 * send back a SSH_MSG_CHANNEL_CLOSE unless it has already sent this
+	 * message for the channel.  The channel is considered closed for a
+	 * party when it has both sent and received SSH_MSG_CHANNEL_CLOSE, and
+	 * the party may then reuse the channel number.  A party MAY send
+	 * SSH_MSG_CHANNEL_CLOSE without having sent or received
+	 * SSH_MSG_CHANNEL_EOF. 
+	 * (from draft-ietf-secsh-connect)
+	 */
+	if (channel->recvclosed) {
+		if (! channel->sentclosed) {
+			TRACE(("Sending MSG_CHANNEL_CLOSE in response to same."))
+			send_msg_channel_close(channel);
+		}
+		removechannel(channel);
+	}
+}
+
+
+/* Check whether a deferred (EINPROGRESS) connect() was successful, and
+ * if so, set up the channel properly. Otherwise, the channel is cleaned up, so
+ * it is important that the channel reference isn't used after a call to this
+ * function */
+static void checkinitdone(struct Channel *channel) {
+
+	int val;
+	socklen_t vallen = sizeof(val);
+
+	TRACE(("enter checkinitdone"))
+
+	if (getsockopt(channel->writefd, SOL_SOCKET, SO_ERROR, &val, &vallen)
+			|| val != 0) {
+		send_msg_channel_open_failure(channel->remotechan,
+				SSH_OPEN_CONNECT_FAILED, "", "");
+		close(channel->writefd);
+		deletechannel(channel);
+		TRACE(("leave checkinitdone: fail"))
+	} else {
+		send_msg_channel_open_confirmation(channel, channel->recvwindow,
+				channel->recvmaxpacket);
+		channel->readfd = channel->writefd;
+		channel->initconn = 0;
+		TRACE(("leave checkinitdone: success"))
+	}
+}
+
+
+
+/* Send the close message and set the channel as closed */
+static void send_msg_channel_close(struct Channel *channel) {
+
+	TRACE(("enter send_msg_channel_close"))
+	/* XXX server */
+	if (channel->type->closehandler) {
+		channel->type->closehandler(channel);
+	}
+	
+	CHECKCLEARTOWRITE();
+
+	buf_putbyte(ses.writepayload, SSH_MSG_CHANNEL_CLOSE);
+	buf_putint(ses.writepayload, channel->remotechan);
+
+	encrypt_packet();
+
+	channel->senteof = 1;
+	channel->sentclosed = 1;
+	TRACE(("leave send_msg_channel_close"))
+}
+
+/* call this when trans/eof channels are closed */
+static void send_msg_channel_eof(struct Channel *channel) {
+
+	TRACE(("enter send_msg_channel_eof"))
+	CHECKCLEARTOWRITE();
+
+	buf_putbyte(ses.writepayload, SSH_MSG_CHANNEL_EOF);
+	buf_putint(ses.writepayload, channel->remotechan);
+
+	encrypt_packet();
+
+	channel->senteof = 1;
+
+	TRACE(("leave send_msg_channel_eof"))
+}
+
+/* Called to write data out to the local side of the channel. 
+ * Only called when we know we can write to a channel, writes as much as
+ * possible */
+static void writechannel(struct Channel* channel, int fd, circbuffer *cbuf) {
+
+	int len, maxlen;
+
+	TRACE(("enter writechannel"))
+
+	maxlen = cbuf_readlen(cbuf);
+
+	/* Write the data out */
+	len = write(fd, cbuf_readptr(cbuf, maxlen), maxlen);
+	if (len <= 0) {
+		if (len < 0 && errno != EINTR) {
+			/* no more to write - we close it even if the fd was stderr, since
+			 * that's a nasty failure too */
+			closewritefd(channel);
+		}
+		TRACE(("leave writechannel: len <= 0"))
+		return;
+	}
+
+	cbuf_incrread(cbuf, len);
+	channel->recvdonelen += len;
+
+	if (fd == channel->writefd && cbuf_getused(cbuf) == 0 && channel->recveof) { 
+		/* Check if we're closing up */
+		closewritefd(channel);
+		TRACE(("leave writechannel: recveof set"))
+		return;
+	}
+
+	/* Window adjust handling */
+	if (channel->recvdonelen >= RECV_WINDOWEXTEND) {
+		/* Set it back to max window */
+		send_msg_channel_window_adjust(channel, channel->recvdonelen);
+		channel->recvwindow += channel->recvdonelen;
+		channel->recvdonelen = 0;
+	}
+
+	dropbear_assert(channel->recvwindow <= RECV_MAXWINDOW);
+	dropbear_assert(channel->recvwindow <= cbuf_getavail(channel->writebuf));
+	dropbear_assert(channel->extrabuf == NULL ||
+			channel->recvwindow <= cbuf_getavail(channel->extrabuf));
+	
+	
+	TRACE(("leave writechannel"))
+}
+
+/* Set the file descriptors for the main select in session.c
+ * This avoid channels which don't have any window available, are closed, etc*/
+void setchannelfds(fd_set *readfds, fd_set *writefds) {
+	
+	unsigned int i;
+	struct Channel * channel;
+	
+	for (i = 0; i < ses.chansize; i++) {
+
+		channel = ses.channels[i];
+		if (channel == NULL) {
+			continue;
+		}
+
+		/* Stuff to put over the wire */
+		if (channel->transwindow > 0) {
+
+			if (channel->readfd >= 0) {
+				FD_SET(channel->readfd, readfds);
+			}
+			
+			if (channel->extrabuf == NULL && channel->errfd >= 0) {
+					FD_SET(channel->errfd, readfds);
+			}
+		}
+
+		/* Stuff from the wire */
+		if ((channel->writefd >= 0 && cbuf_getused(channel->writebuf) > 0 )
+				|| channel->initconn) {
+
+				FD_SET(channel->writefd, writefds);
+		}
+
+		if (channel->extrabuf != NULL && channel->errfd >= 0 
+				&& cbuf_getused(channel->extrabuf) > 0 ) {
+				FD_SET(channel->errfd, writefds);
+		}
+
+	} /* foreach channel */
+
+#ifdef USING_LISTENERS
+	set_listener_fds(readfds);
+#endif
+
+}
+
+/* handle the channel EOF event, by closing the channel filedescriptor. The
+ * channel isn't closed yet, it is left until the incoming (from the program
+ * etc) FD is also EOF */
+void recv_msg_channel_eof() {
+
+	struct Channel * channel;
+
+	TRACE(("enter recv_msg_channel_eof"))
+
+	channel = getchannel();
+	if (channel == NULL) {
+		dropbear_exit("EOF for unknown channel");
+	}
+
+	channel->recveof = 1;
+	if (cbuf_getused(channel->writebuf) == 0
+			&& (channel->extrabuf == NULL 
+					|| cbuf_getused(channel->extrabuf) == 0)) {
+		closewritefd(channel);
+	}
+
+	TRACE(("leave recv_msg_channel_eof"))
+}
+
+
+/* Handle channel closure(), respond in kind and close the channels */
+void recv_msg_channel_close() {
+
+	struct Channel * channel;
+
+	TRACE(("enter recv_msg_channel_close"))
+
+	channel = getchannel();
+	if (channel == NULL) {
+		/* disconnect ? */
+		dropbear_exit("Close for unknown channel");
+	}
+
+	channel->recveof = 1;
+	channel->recvclosed = 1;
+
+	if (channel->sentclosed) {
+		removechannel(channel);
+	}
+
+	TRACE(("leave recv_msg_channel_close"))
+}
+
+/* Remove a channel entry, this is only executed after both sides have sent
+ * channel close */
+static void removechannel(struct Channel * channel) {
+
+	TRACE(("enter removechannel"))
+	TRACE(("channel index is %d", channel->index))
+
+	cbuf_free(channel->writebuf);
+	channel->writebuf = NULL;
+
+	if (channel->extrabuf) {
+		cbuf_free(channel->extrabuf);
+		channel->extrabuf = NULL;
+	}
+
+
+	/* close the FDs in case they haven't been done
+	 * yet (ie they were shutdown etc */
+	close(channel->writefd);
+	close(channel->readfd);
+	close(channel->errfd);
+
+	channel->typedata = NULL;
+
+	deletechannel(channel);
+
+	TRACE(("leave removechannel"))
+}
+
+/* Remove a channel entry */
+static void deletechannel(struct Channel *channel) {
+
+	ses.channels[channel->index] = NULL;
+	m_free(channel);
+	ses.chancount--;
+	
+}
+
+
+/* Handle channel specific requests, passing off to corresponding handlers
+ * such as chansession or x11fwd */
+void recv_msg_channel_request() {
+
+	struct Channel *channel;
+
+	TRACE(("enter recv_msg_channel_request"))
+	
+	channel = getchannel();
+	if (channel == NULL) {
+		/* disconnect ? */
+		dropbear_exit("Unknown channel");
+	}
+
+	if (channel->type->reqhandler) {
+		channel->type->reqhandler(channel);
+	} else {
+		send_msg_channel_failure(channel);
+	}
+
+	TRACE(("leave recv_msg_channel_request"))
+
+}
+
+/* Reads data from the server's program/shell/etc, and puts it in a
+ * channel_data packet to send.
+ * chan is the remote channel, isextended is 0 if it is normal data, 1
+ * if it is extended data. if it is extended, then the type is in
+ * exttype */
+static void send_msg_channel_data(struct Channel *channel, int isextended,
+		unsigned int exttype) {
+
+	buffer *buf;
+	int len;
+	unsigned int maxlen;
+	int fd;
+
+/*	TRACE(("enter send_msg_channel_data"))
+	TRACE(("extended = %d type = %d", isextended, exttype))*/
+
+	CHECKCLEARTOWRITE();
+
+	dropbear_assert(!channel->sentclosed);
+
+	if (isextended) {
+		fd = channel->errfd;
+	} else {
+		fd = channel->readfd;
+	}
+	dropbear_assert(fd >= 0);
+
+	maxlen = MIN(channel->transwindow, channel->transmaxpacket);
+	/* -(1+4+4) is SSH_MSG_CHANNEL_DATA, channel number, string length, and 
+	 * exttype if is extended */
+	maxlen = MIN(maxlen, 
+			ses.writepayload->size - 1 - 4 - 4 - (isextended ? 4 : 0));
+	if (maxlen == 0) {
+		TRACE(("leave send_msg_channel_data: no window"))
+		return; /* the data will get written later */
+	}
+
+	/* read the data */
+	TRACE(("maxlen %d", maxlen))
+	buf = buf_new(maxlen);
+	TRACE(("buf pos %d data %x", buf->pos, buf->data))
+	len = read(fd, buf_getwriteptr(buf, maxlen), maxlen);
+	if (len <= 0) {
+		/* on error/eof, send eof */
+		if (len == 0 || errno != EINTR) {
+			closereadfd(channel, fd);
+		}
+		buf_free(buf);
+		buf = NULL;
+		TRACE(("leave send_msg_channel_data: read err or EOF for fd %d", 
+					channel->index));
+		return;
+	}
+	buf_incrlen(buf, len);
+
+	buf_putbyte(ses.writepayload, 
+			isextended ? SSH_MSG_CHANNEL_EXTENDED_DATA : SSH_MSG_CHANNEL_DATA);
+	buf_putint(ses.writepayload, channel->remotechan);
+
+	if (isextended) {
+		buf_putint(ses.writepayload, exttype);
+	}
+
+	buf_putstring(ses.writepayload, buf_getptr(buf, len), len);
+	buf_free(buf);
+	buf = NULL;
+
+	channel->transwindow -= len;
+
+	encrypt_packet();
+	TRACE(("leave send_msg_channel_data"))
+}
+
+/* We receive channel data */
+void recv_msg_channel_data() {
+
+	struct Channel *channel;
+
+	channel = getchannel();
+	if (channel == NULL) {
+		dropbear_exit("Unknown channel");
+	}
+
+	common_recv_msg_channel_data(channel, channel->writefd, channel->writebuf);
+}
+
+/* Shared for data and stderr data - when we receive data, put it in a buffer
+ * for writing to the local file descriptor */
+void common_recv_msg_channel_data(struct Channel *channel, int fd, 
+		circbuffer * cbuf) {
+
+	unsigned int datalen;
+	unsigned int maxdata;
+	unsigned int buflen;
+	unsigned int len;
+
+	TRACE(("enter recv_msg_channel_data"))
+
+	if (channel->recveof) {
+		dropbear_exit("received data after eof");
+	}
+
+ 	if (fd < 0) {
+		dropbear_exit("received data with bad writefd");
+	}
+
+	datalen = buf_getint(ses.payload);
+
+
+	maxdata = cbuf_getavail(cbuf);
+
+	/* Whilst the spec says we "MAY ignore data past the end" this could
+	 * lead to corrupted file transfers etc (chunks missed etc). It's better to
+	 * just die horribly */
+	if (datalen > maxdata) {
+		dropbear_exit("Oversized packet");
+	}
+
+	/* We may have to run throught twice, if the buffer wraps around. Can't
+	 * just "leave it for next time" like with writechannel, since this
+	 * is payload data */
+	len = datalen;
+	while (len > 0) {
+		buflen = cbuf_writelen(cbuf);
+		buflen = MIN(buflen, len);
+
+		memcpy(cbuf_writeptr(cbuf, buflen), 
+				buf_getptr(ses.payload, buflen), buflen);
+		cbuf_incrwrite(cbuf, buflen);
+		buf_incrpos(ses.payload, buflen);
+		len -= buflen;
+	}
+
+	dropbear_assert(channel->recvwindow >= datalen);
+	channel->recvwindow -= datalen;
+	dropbear_assert(channel->recvwindow <= RECV_MAXWINDOW);
+
+	TRACE(("leave recv_msg_channel_data"))
+}
+
+/* Increment the outgoing data window for a channel - the remote end limits
+ * the amount of data which may be transmitted, this window is decremented
+ * as data is sent, and incremented upon receiving window-adjust messages */
+void recv_msg_channel_window_adjust() {
+
+	struct Channel * channel;
+	unsigned int incr;
+	
+	channel = getchannel();
+	if (channel == NULL) {
+		dropbear_exit("Unknown channel");
+	}
+	
+	incr = buf_getint(ses.payload);
+	TRACE(("received window increment %d", incr))
+	incr = MIN(incr, MAX_TRANS_WIN_INCR);
+	
+	channel->transwindow += incr;
+	channel->transwindow = MIN(channel->transwindow, MAX_TRANS_WINDOW);
+
+}
+
+/* Increment the incoming data window for a channel, and let the remote
+ * end know */
+static void send_msg_channel_window_adjust(struct Channel* channel, 
+		unsigned int incr) {
+
+	TRACE(("sending window adjust %d", incr))
+	CHECKCLEARTOWRITE();
+
+	buf_putbyte(ses.writepayload, SSH_MSG_CHANNEL_WINDOW_ADJUST);
+	buf_putint(ses.writepayload, channel->remotechan);
+	buf_putint(ses.writepayload, incr);
+
+	encrypt_packet();
+}
+	
+/* Handle a new channel request, performing any channel-type-specific setup */
+/* XXX server */
+void recv_msg_channel_open() {
+
+	unsigned char *type;
+	unsigned int typelen;
+	unsigned int remotechan, transwindow, transmaxpacket;
+	struct Channel *channel;
+	const struct ChanType **cp;
+	const struct ChanType *chantype;
+	unsigned int errtype = SSH_OPEN_UNKNOWN_CHANNEL_TYPE;
+	int ret;
+
+
+	TRACE(("enter recv_msg_channel_open"))
+
+	/* get the packet contents */
+	type = buf_getstring(ses.payload, &typelen);
+
+	remotechan = buf_getint(ses.payload);
+	transwindow = buf_getint(ses.payload);
+	transwindow = MIN(transwindow, MAX_TRANS_WINDOW);
+	transmaxpacket = buf_getint(ses.payload);
+	transmaxpacket = MIN(transmaxpacket, MAX_TRANS_PAYLOAD_LEN);
+
+	/* figure what type of packet it is */
+	if (typelen > MAX_NAME_LEN) {
+		goto failure;
+	}
+
+	/* Get the channel type. Client and server style invokation will set up a
+	 * different list for ses.chantypes at startup. We just iterate through
+	 * this list and find the matching name */
+	for (cp = &ses.chantypes[0], chantype = (*cp); 
+			chantype != NULL;
+			cp++, chantype = (*cp)) {
+		if (strcmp(type, chantype->name) == 0) {
+			break;
+		}
+	}
+
+	if (chantype == NULL) {
+		TRACE(("No matching type for '%s'", type))
+		goto failure;
+	}
+
+	TRACE(("matched type '%s'", type))
+
+	/* create the channel */
+	channel = newchannel(remotechan, chantype, transwindow, transmaxpacket);
+
+	if (channel == NULL) {
+		TRACE(("newchannel returned NULL"))
+		goto failure;
+	}
+
+	if (channel->type->inithandler) {
+		ret = channel->type->inithandler(channel);
+		if (ret > 0) {
+			if (ret == SSH_OPEN_IN_PROGRESS) {
+				/* We'll send the confirmation later */
+				goto cleanup;
+			}
+			errtype = ret;
+			deletechannel(channel);
+			TRACE(("inithandler returned failure %d", ret))
+			goto failure;
+		}
+	}
+
+	/* success */
+	send_msg_channel_open_confirmation(channel, channel->recvwindow,
+			channel->recvmaxpacket);
+	goto cleanup;
+
+failure:
+	TRACE(("recv_msg_channel_open failure"))
+	send_msg_channel_open_failure(remotechan, errtype, "", "");
+
+cleanup:
+	m_free(type);
+
+	TRACE(("leave recv_msg_channel_open"))
+}
+
+/* Send a failure message */
+void send_msg_channel_failure(struct Channel *channel) {
+
+	TRACE(("enter send_msg_channel_failure"))
+	CHECKCLEARTOWRITE();
+
+	buf_putbyte(ses.writepayload, SSH_MSG_CHANNEL_FAILURE);
+	buf_putint(ses.writepayload, channel->remotechan);
+
+	encrypt_packet();
+	TRACE(("leave send_msg_channel_failure"))
+}
+
+/* Send a success message */
+void send_msg_channel_success(struct Channel *channel) {
+
+	TRACE(("enter send_msg_channel_success"))
+	CHECKCLEARTOWRITE();
+
+	buf_putbyte(ses.writepayload, SSH_MSG_CHANNEL_SUCCESS);
+	buf_putint(ses.writepayload, channel->remotechan);
+
+	encrypt_packet();
+	TRACE(("leave send_msg_channel_success"))
+}
+
+/* Send a channel open failure message, with a corresponding reason
+ * code (usually resource shortage or unknown chan type) */
+static void send_msg_channel_open_failure(unsigned int remotechan, 
+		int reason, const unsigned char *text, const unsigned char *lang) {
+
+	TRACE(("enter send_msg_channel_open_failure"))
+	CHECKCLEARTOWRITE();
+	
+	buf_putbyte(ses.writepayload, SSH_MSG_CHANNEL_OPEN_FAILURE);
+	buf_putint(ses.writepayload, remotechan);
+	buf_putint(ses.writepayload, reason);
+	buf_putstring(ses.writepayload, text, strlen((char*)text));
+	buf_putstring(ses.writepayload, lang, strlen((char*)lang));
+
+	encrypt_packet();
+	TRACE(("leave send_msg_channel_open_failure"))
+}
+
+/* Confirm a channel open, and let the remote end know what number we've
+ * allocated and the receive parameters */
+static void send_msg_channel_open_confirmation(struct Channel* channel,
+		unsigned int recvwindow, 
+		unsigned int recvmaxpacket) {
+
+	TRACE(("enter send_msg_channel_open_confirmation"))
+	CHECKCLEARTOWRITE();
+
+	buf_putbyte(ses.writepayload, SSH_MSG_CHANNEL_OPEN_CONFIRMATION);
+	buf_putint(ses.writepayload, channel->remotechan);
+	buf_putint(ses.writepayload, channel->index);
+	buf_putint(ses.writepayload, recvwindow);
+	buf_putint(ses.writepayload, recvmaxpacket);
+
+	encrypt_packet();
+	TRACE(("leave send_msg_channel_open_confirmation"))
+}
+
+#if defined(USING_LISTENERS) || defined(DROPBEAR_CLIENT)
+/* Create a new channel, and start the open request. This is intended
+ * for X11, agent, tcp forwarding, and should be filled with channel-specific
+ * options, with the calling function calling encrypt_packet() after
+ * completion. It is mandatory for the caller to encrypt_packet() if
+ * DROPBEAR_SUCCESS is returned */
+int send_msg_channel_open_init(int fd, const struct ChanType *type) {
+
+	struct Channel* chan;
+
+	TRACE(("enter send_msg_channel_open_init()"))
+	chan = newchannel(0, type, 0, 0);
+	if (!chan) {
+		TRACE(("leave send_msg_channel_open_init() - FAILED in newchannel()"))
+		return DROPBEAR_FAILURE;
+	}
+
+	/* set fd non-blocking */
+	setnonblocking(fd);
+
+	chan->writefd = chan->readfd = fd;
+	ses.maxfd = MAX(ses.maxfd, fd);
+
+	chan->await_open = 1;
+
+	/* now open the channel connection */
+	CHECKCLEARTOWRITE();
+
+	buf_putbyte(ses.writepayload, SSH_MSG_CHANNEL_OPEN);
+	buf_putstring(ses.writepayload, type->name, strlen(type->name));
+	buf_putint(ses.writepayload, chan->index);
+	buf_putint(ses.writepayload, RECV_MAXWINDOW);
+	buf_putint(ses.writepayload, RECV_MAXPACKET);
+
+	TRACE(("leave send_msg_channel_open_init()"))
+	return DROPBEAR_SUCCESS;
+}
+
+/* Confirmation that our channel open request (for forwardings) was 
+ * successful*/
+void recv_msg_channel_open_confirmation() {
+
+	struct Channel * channel;
+	int ret;
+
+	TRACE(("enter recv_msg_channel_open_confirmation"))
+
+	channel = getchannel();
+	if (channel == NULL) {
+		dropbear_exit("Unknown channel");
+	}
+
+	if (!channel->await_open) {
+		dropbear_exit("unexpected channel reply");
+	}
+	channel->await_open = 0;
+
+	channel->remotechan =  buf_getint(ses.payload);
+	channel->transwindow = buf_getint(ses.payload);
+	channel->transmaxpacket = buf_getint(ses.payload);
+	
+	TRACE(("new chan remote %d local %d", 
+				channel->remotechan, channel->index))
+
+	/* Run the inithandler callback */
+	if (channel->type->inithandler) {
+		ret = channel->type->inithandler(channel);
+		if (ret > 0) {
+			removechannel(channel);
+			TRACE(("inithandler returned failure %d", ret))
+		}
+	}
+
+	
+	TRACE(("leave recv_msg_channel_open_confirmation"))
+}
+
+/* Notification that our channel open request failed */
+void recv_msg_channel_open_failure() {
+
+	struct Channel * channel;
+
+	channel = getchannel();
+	if (channel == NULL) {
+		dropbear_exit("Unknown channel");
+	}
+
+	if (!channel->await_open) {
+		dropbear_exit("unexpected channel reply");
+	}
+	channel->await_open = 0;
+
+	removechannel(channel);
+}
+#endif /* USING_LISTENERS */
+
+/* close a stdout/stderr fd */
+static void closereadfd(struct Channel * channel, int fd) {
+
+	/* don't close it if it is the same as writefd,
+	 * unless writefd is already set -1 */
+	TRACE(("enter closereadfd"))
+	closechanfd(channel, fd, 0);
+	TRACE(("leave closereadfd"))
+}
+
+/* close a stdin fd */
+static void closewritefd(struct Channel * channel) {
+
+	TRACE(("enter closewritefd"))
+	closechanfd(channel, channel->writefd, 1);
+	TRACE(("leave closewritefd"))
+}
+
+/* close a fd, how is 0 for stdout/stderr, 1 for stdin */
+static void closechanfd(struct Channel *channel, int fd, int how) {
+
+	int closein = 0, closeout = 0;
+
+	/* XXX server */
+	if (channel->type->sepfds) {
+		TRACE(("shutdown((%d), %d)", fd, how))
+		shutdown(fd, how);
+		if (how == 0) {
+			closeout = 1;
+		} else {
+			closein = 1;
+		}
+	} else {
+		close(fd);
+		closein = closeout = 1;
+	}
+
+	if (closeout && fd == channel->readfd) {
+		channel->readfd = FD_CLOSED;
+	}
+	if (closeout && (channel->extrabuf == NULL) && (fd == channel->errfd)) {
+		channel->errfd = FD_CLOSED;
+	}
+
+	if (closein && fd == channel->writefd) {
+		channel->writefd = FD_CLOSED;
+	}
+	if (closein && (channel->extrabuf != NULL) && (fd == channel->errfd)) {
+		channel->errfd = FD_CLOSED;
+	}
+
+	/* if we called shutdown on it and all references are gone, then we 
+	 * need to close() it to stop it lingering */
+	if (channel->type->sepfds && channel->readfd == FD_CLOSED 
+		&& channel->writefd == FD_CLOSED && channel->errfd == FD_CLOSED) {
+		close(fd);
+	}
+}
diff --git a/common-chansession.c b/common-chansession.c
new file mode 100644
index 0000000..b350c6c
--- /dev/null
+++ b/common-chansession.c
@@ -0,0 +1,43 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "chansession.h"
+
+/* Mapping of signal values to ssh signal strings */
+const struct SigMap signames[] = {
+	{SIGABRT, "ABRT"},
+	{SIGALRM, "ALRM"},
+	{SIGFPE, "FPE"},
+	{SIGHUP, "HUP"},
+	{SIGILL, "ILL"},
+	{SIGINT, "INT"},
+	{SIGKILL, "KILL"},
+	{SIGPIPE, "PIPE"},
+	{SIGQUIT, "QUIT"},
+	{SIGSEGV, "SEGV"},
+	{SIGTERM, "TERM"},
+	{SIGUSR1, "USR1"},
+	{SIGUSR2, "USR2"},
+	{0, NULL}
+};
diff --git a/common-kex.c b/common-kex.c
new file mode 100644
index 0000000..5db8e52
--- /dev/null
+++ b/common-kex.c
@@ -0,0 +1,718 @@
+/*
+ * Dropbear SSH
+ * 
+ * Copyright (c) 2002-2004 Matt Johnston
+ * Portions Copyright (c) 2004 by Mihnea Stoenescu
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "dbutil.h"
+#include "algo.h"
+#include "buffer.h"
+#include "session.h"
+#include "kex.h"
+#include "ssh.h"
+#include "packet.h"
+#include "bignum.h"
+#include "random.h"
+
+/* diffie-hellman-group1-sha1 value for p */
+static const unsigned char dh_p_val[] = {
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC9, 0x0F, 0xDA, 0xA2,
+    0x21, 0x68, 0xC2, 0x34, 0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
+	0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74, 0x02, 0x0B, 0xBE, 0xA6,
+	0x3B, 0x13, 0x9B, 0x22, 0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
+	0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B, 0x30, 0x2B, 0x0A, 0x6D,
+	0xF2, 0x5F, 0x14, 0x37, 0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
+	0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6, 0xF4, 0x4C, 0x42, 0xE9,
+	0xA6, 0x37, 0xED, 0x6B, 0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
+	0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5, 0xAE, 0x9F, 0x24, 0x11,
+	0x7C, 0x4B, 0x1F, 0xE6, 0x49, 0x28, 0x66, 0x51, 0xEC, 0xE6, 0x53, 0x81,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
+#define DH_P_LEN sizeof(dh_p_val)
+
+static const int DH_G_VAL = 2;
+
+static void kexinitialise();
+void gen_new_keys();
+#ifndef DISABLE_ZLIB
+static void gen_new_zstreams();
+#endif
+static void read_kex_algos();
+/* helper function for gen_new_keys */
+static void hashkeys(unsigned char *out, int outlen, 
+		const hash_state * hs, unsigned const char X);
+
+
+/* Send our list of algorithms we can use */
+void send_msg_kexinit() {
+
+	CHECKCLEARTOWRITE();
+	buf_putbyte(ses.writepayload, SSH_MSG_KEXINIT);
+
+	/* cookie */
+	genrandom(buf_getwriteptr(ses.writepayload, 16), 16);
+	buf_incrwritepos(ses.writepayload, 16);
+
+	/* kex algos */
+	buf_put_algolist(ses.writepayload, sshkex);
+
+	/* server_host_key_algorithms */
+	buf_put_algolist(ses.writepayload, sshhostkey);
+
+	/* encryption_algorithms_client_to_server */
+	buf_put_algolist(ses.writepayload, sshciphers);
+
+	/* encryption_algorithms_server_to_client */
+	buf_put_algolist(ses.writepayload, sshciphers);
+
+	/* mac_algorithms_client_to_server */
+	buf_put_algolist(ses.writepayload, sshhashes);
+
+	/* mac_algorithms_server_to_client */
+	buf_put_algolist(ses.writepayload, sshhashes);
+
+	/* compression_algorithms_client_to_server */
+	buf_put_algolist(ses.writepayload, sshcompress);
+
+	/* compression_algorithms_server_to_client */
+	buf_put_algolist(ses.writepayload, sshcompress);
+
+	/* languages_client_to_server */
+	buf_putstring(ses.writepayload, "", 0);
+
+	/* languages_server_to_client */
+	buf_putstring(ses.writepayload, "", 0);
+
+	/* first_kex_packet_follows - unimplemented for now */
+	buf_putbyte(ses.writepayload, 0x00);
+
+	/* reserved unit32 */
+	buf_putint(ses.writepayload, 0);
+
+	/* set up transmitted kex packet buffer for hashing. 
+	 * This is freed after the end of the kex */
+	ses.transkexinit = buf_newcopy(ses.writepayload);
+
+	encrypt_packet();
+	ses.dataallowed = 0; /* don't send other packets during kex */
+
+	TRACE(("DATAALLOWED=0"))
+	TRACE(("-> KEXINIT"))
+	ses.kexstate.sentkexinit = 1;
+}
+
+/* *** NOTE regarding (send|recv)_msg_newkeys *** 
+ * Changed by mihnea from the original kex.c to set dataallowed after a 
+ * completed key exchange, no matter the order in which it was performed.
+ * This enables client mode without affecting server functionality.
+ */
+
+/* Bring new keys into use after a key exchange, and let the client know*/
+void send_msg_newkeys() {
+
+	TRACE(("enter send_msg_newkeys"))
+
+	/* generate the kexinit request */
+	CHECKCLEARTOWRITE();
+	buf_putbyte(ses.writepayload, SSH_MSG_NEWKEYS);
+	encrypt_packet();
+	
+
+	/* set up our state */
+	if (ses.kexstate.recvnewkeys) {
+		TRACE(("while RECVNEWKEYS=1"))
+		gen_new_keys();
+		kexinitialise(); /* we've finished with this kex */
+		TRACE((" -> DATAALLOWED=1"))
+		ses.dataallowed = 1; /* we can send other packets again now */
+		ses.kexstate.donefirstkex = 1;
+	} else {
+		ses.kexstate.sentnewkeys = 1;
+		TRACE(("SENTNEWKEYS=1"))
+	}
+
+	TRACE(("-> MSG_NEWKEYS"))
+	TRACE(("leave send_msg_newkeys"))
+}
+
+/* Bring the new keys into use after a key exchange */
+void recv_msg_newkeys() {
+
+	TRACE(("<- MSG_NEWKEYS"))
+	TRACE(("enter recv_msg_newkeys"))
+
+	/* simply check if we've sent SSH_MSG_NEWKEYS, and if so,
+	 * switch to the new keys */
+	if (ses.kexstate.sentnewkeys) {
+		TRACE(("while SENTNEWKEYS=1"))
+		gen_new_keys();
+		kexinitialise(); /* we've finished with this kex */
+	    TRACE((" -> DATAALLOWED=1"))
+	    ses.dataallowed = 1; /* we can send other packets again now */
+		ses.kexstate.donefirstkex = 1;
+	} else {
+		TRACE(("RECVNEWKEYS=1"))
+		ses.kexstate.recvnewkeys = 1;
+	}
+	
+	TRACE(("leave recv_msg_newkeys"))
+}
+
+
+/* Set up the kex for the first time */
+void kexfirstinitialise() {
+
+	ses.kexstate.donefirstkex = 0;
+	kexinitialise();
+}
+
+/* Reset the kex state, ready for a new negotiation */
+static void kexinitialise() {
+
+	struct timeval tv;
+
+	TRACE(("kexinitialise()"))
+
+	/* sent/recv'd MSG_KEXINIT */
+	ses.kexstate.sentkexinit = 0;
+	ses.kexstate.recvkexinit = 0;
+
+	/* sent/recv'd MSG_NEWKEYS */
+	ses.kexstate.recvnewkeys = 0;
+	ses.kexstate.sentnewkeys = 0;
+
+	/* first_packet_follows */
+	ses.kexstate.firstfollows = 0;
+
+	ses.kexstate.datatrans = 0;
+	ses.kexstate.datarecv = 0;
+
+	if (gettimeofday(&tv, 0) < 0) {
+		dropbear_exit("Error getting time");
+	}
+	ses.kexstate.lastkextime = tv.tv_sec;
+
+}
+
+/* Helper function for gen_new_keys, creates a hash. It makes a copy of the
+ * already initialised hash_state hs, which should already have processed
+ * the dh_K and hash, since these are common. X is the letter 'A', 'B' etc.
+ * out must have at least min(SHA1_HASH_SIZE, outlen) bytes allocated.
+ * The output will only be expanded once, since that is all that is required
+ * (for 3DES and SHA, with 24 and 20 bytes respectively). 
+ *
+ * See Section 5.2 of the IETF secsh Transport Draft for details */
+
+/* Duplicated verbatim from kex.c --mihnea */
+static void hashkeys(unsigned char *out, int outlen, 
+		const hash_state * hs, const unsigned char X) {
+
+	hash_state hs2;
+	unsigned char k2[SHA1_HASH_SIZE]; /* used to extending */
+
+	memcpy(&hs2, hs, sizeof(hash_state));
+	sha1_process(&hs2, &X, 1);
+	sha1_process(&hs2, ses.session_id, SHA1_HASH_SIZE);
+	sha1_done(&hs2, out);
+	if (SHA1_HASH_SIZE < outlen) {
+		/* need to extend */
+		memcpy(&hs2, hs, sizeof(hash_state));
+		sha1_process(&hs2, out, SHA1_HASH_SIZE);
+		sha1_done(&hs2, k2);
+		memcpy(&out[SHA1_HASH_SIZE], k2, outlen - SHA1_HASH_SIZE);
+	}
+}
+
+/* Generate the actual encryption/integrity keys, using the results of the
+ * key exchange, as specified in section 5.2 of the IETF secsh-transport
+ * draft. This occurs after the DH key-exchange.
+ *
+ * ses.newkeys is the new set of keys which are generated, these are only
+ * taken into use after both sides have sent a newkeys message */
+
+/* Originally from kex.c, generalized for cli/svr mode --mihnea */
+void gen_new_keys() {
+
+	unsigned char C2S_IV[MAX_IV_LEN];
+	unsigned char C2S_key[MAX_KEY_LEN];
+	unsigned char S2C_IV[MAX_IV_LEN];
+	unsigned char S2C_key[MAX_KEY_LEN];
+	/* unsigned char key[MAX_KEY_LEN]; */
+	unsigned char *trans_IV, *trans_key, *recv_IV, *recv_key;
+
+	hash_state hs;
+	unsigned int C2S_keysize, S2C_keysize;
+	char mactransletter, macrecvletter; /* Client or server specific */
+
+	TRACE(("enter gen_new_keys"))
+	/* the dh_K and hash are the start of all hashes, we make use of that */
+
+	sha1_init(&hs);
+	sha1_process_mp(&hs, ses.dh_K);
+	mp_clear(ses.dh_K);
+	m_free(ses.dh_K);
+	sha1_process(&hs, ses.hash, SHA1_HASH_SIZE);
+	m_burn(ses.hash, SHA1_HASH_SIZE);
+
+	if (IS_DROPBEAR_CLIENT) {
+	    trans_IV	= C2S_IV;
+	    recv_IV		= S2C_IV;
+	    trans_key	= C2S_key;
+	    recv_key	= S2C_key;
+	    C2S_keysize = ses.newkeys->trans_algo_crypt->keysize;
+	    S2C_keysize = ses.newkeys->recv_algo_crypt->keysize;
+		mactransletter = 'E';
+		macrecvletter = 'F';
+	} else {
+	    trans_IV	= S2C_IV;
+	    recv_IV		= C2S_IV;
+	    trans_key	= S2C_key;
+	    recv_key	= C2S_key;
+	    C2S_keysize = ses.newkeys->recv_algo_crypt->keysize;
+	    S2C_keysize = ses.newkeys->trans_algo_crypt->keysize;
+		mactransletter = 'F';
+		macrecvletter = 'E';
+	}
+
+	hashkeys(C2S_IV, SHA1_HASH_SIZE, &hs, 'A');
+	hashkeys(S2C_IV, SHA1_HASH_SIZE, &hs, 'B');
+	hashkeys(C2S_key, C2S_keysize, &hs, 'C');
+	hashkeys(S2C_key, S2C_keysize, &hs, 'D');
+
+	if (cbc_start(
+		find_cipher(ses.newkeys->recv_algo_crypt->cipherdesc->name),
+			recv_IV, recv_key, 
+			ses.newkeys->recv_algo_crypt->keysize, 0, 
+			&ses.newkeys->recv_symmetric_struct) != CRYPT_OK) {
+		dropbear_exit("crypto error");
+	}
+
+	if (cbc_start(
+		find_cipher(ses.newkeys->trans_algo_crypt->cipherdesc->name),
+			trans_IV, trans_key, 
+			ses.newkeys->trans_algo_crypt->keysize, 0, 
+			&ses.newkeys->trans_symmetric_struct) != CRYPT_OK) {
+		dropbear_exit("crypto error");
+	}
+	
+	/* MAC keys */
+	hashkeys(ses.newkeys->transmackey, 
+			ses.newkeys->trans_algo_mac->keysize, &hs, mactransletter);
+	hashkeys(ses.newkeys->recvmackey, 
+			ses.newkeys->recv_algo_mac->keysize, &hs, macrecvletter);
+
+#ifndef DISABLE_ZLIB
+	gen_new_zstreams();
+#endif
+	
+	/* Switch over to the new keys */
+	m_burn(ses.keys, sizeof(struct key_context));
+	m_free(ses.keys);
+	ses.keys = ses.newkeys;
+	ses.newkeys = NULL;
+
+	TRACE(("leave gen_new_keys"))
+}
+
+#ifndef DISABLE_ZLIB
+/* Set up new zlib compression streams, close the old ones. Only
+ * called from gen_new_keys() */
+static void gen_new_zstreams() {
+
+	/* create new zstreams */
+	if (ses.newkeys->recv_algo_comp == DROPBEAR_COMP_ZLIB) {
+		ses.newkeys->recv_zstream = (z_streamp)m_malloc(sizeof(z_stream));
+		ses.newkeys->recv_zstream->zalloc = Z_NULL;
+		ses.newkeys->recv_zstream->zfree = Z_NULL;
+		
+		if (inflateInit(ses.newkeys->recv_zstream) != Z_OK) {
+			dropbear_exit("zlib error");
+		}
+	} else {
+		ses.newkeys->recv_zstream = NULL;
+	}
+
+	if (ses.newkeys->trans_algo_comp == DROPBEAR_COMP_ZLIB) {
+		ses.newkeys->trans_zstream = (z_streamp)m_malloc(sizeof(z_stream));
+		ses.newkeys->trans_zstream->zalloc = Z_NULL;
+		ses.newkeys->trans_zstream->zfree = Z_NULL;
+	
+		if (deflateInit(ses.newkeys->trans_zstream, Z_DEFAULT_COMPRESSION) 
+				!= Z_OK) {
+			dropbear_exit("zlib error");
+		}
+	} else {
+		ses.newkeys->trans_zstream = NULL;
+	}
+	
+	/* clean up old keys */
+	if (ses.keys->recv_zstream != NULL) {
+		if (inflateEnd(ses.keys->recv_zstream) == Z_STREAM_ERROR) {
+			/* Z_DATA_ERROR is ok, just means that stream isn't ended */
+			dropbear_exit("crypto error");
+		}
+		m_free(ses.keys->recv_zstream);
+	}
+	if (ses.keys->trans_zstream != NULL) {
+		if (deflateEnd(ses.keys->trans_zstream) == Z_STREAM_ERROR) {
+			/* Z_DATA_ERROR is ok, just means that stream isn't ended */
+			dropbear_exit("crypto error");
+		}
+		m_free(ses.keys->trans_zstream);
+	}
+}
+#endif
+
+
+/* Executed upon receiving a kexinit message from the client to initiate
+ * key exchange. If we haven't already done so, we send the list of our
+ * preferred algorithms. The client's requested algorithms are processed,
+ * and we calculate the first portion of the key-exchange-hash for used
+ * later in the key exchange. No response is sent, as the client should
+ * initiate the diffie-hellman key exchange */
+
+/* Originally from kex.c, generalized for cli/svr mode --mihnea  */
+/* Belongs in common_kex.c where it should be moved after review */
+void recv_msg_kexinit() {
+	
+	unsigned int kexhashbuf_len = 0;
+	unsigned int remote_ident_len = 0;
+	unsigned int local_ident_len = 0;
+
+	TRACE(("<- KEXINIT"))
+	TRACE(("enter recv_msg_kexinit"))
+	
+	if (!ses.kexstate.sentkexinit) {
+		/* we need to send a kex packet */
+		send_msg_kexinit();
+		TRACE(("continue recv_msg_kexinit: sent kexinit"))
+	}
+
+	/* start the kex hash */
+	local_ident_len = strlen(LOCAL_IDENT);
+	remote_ident_len = strlen((char*)ses.remoteident);
+
+	kexhashbuf_len = local_ident_len + remote_ident_len
+		+ ses.transkexinit->len + ses.payload->len
+		+ KEXHASHBUF_MAX_INTS;
+
+	ses.kexhashbuf = buf_new(kexhashbuf_len);
+
+	if (IS_DROPBEAR_CLIENT) {
+
+		/* read the peer's choice of algos */
+		read_kex_algos();
+
+		/* V_C, the client's version string (CR and NL excluded) */
+	    buf_putstring(ses.kexhashbuf,
+			(unsigned char*)LOCAL_IDENT, local_ident_len);
+		/* V_S, the server's version string (CR and NL excluded) */
+	    buf_putstring(ses.kexhashbuf, ses.remoteident, remote_ident_len);
+
+		/* I_C, the payload of the client's SSH_MSG_KEXINIT */
+	    buf_putstring(ses.kexhashbuf,
+			ses.transkexinit->data, ses.transkexinit->len);
+		/* I_S, the payload of the server's SSH_MSG_KEXINIT */
+	    buf_setpos(ses.payload, 0);
+	    buf_putstring(ses.kexhashbuf, ses.payload->data, ses.payload->len);
+
+	} else {
+		/* SERVER */
+
+		/* read the peer's choice of algos */
+		read_kex_algos();
+		/* V_C, the client's version string (CR and NL excluded) */
+	    buf_putstring(ses.kexhashbuf, ses.remoteident, remote_ident_len);
+		/* V_S, the server's version string (CR and NL excluded) */
+	    buf_putstring(ses.kexhashbuf, 
+				(unsigned char*)LOCAL_IDENT, local_ident_len);
+
+		/* I_C, the payload of the client's SSH_MSG_KEXINIT */
+	    buf_setpos(ses.payload, 0);
+	    buf_putstring(ses.kexhashbuf, ses.payload->data, ses.payload->len);
+
+		/* I_S, the payload of the server's SSH_MSG_KEXINIT */
+	    buf_putstring(ses.kexhashbuf,
+			ses.transkexinit->data, ses.transkexinit->len);
+
+		ses.requirenext = SSH_MSG_KEXDH_INIT;
+	}
+
+	buf_free(ses.transkexinit);
+	ses.transkexinit = NULL;
+	/* the rest of ses.kexhashbuf will be done after DH exchange */
+
+	ses.kexstate.recvkexinit = 1;
+
+	TRACE(("leave recv_msg_kexinit"))
+}
+
+/* Initialises and generate one side of the diffie-hellman key exchange values.
+ * See the ietf-secsh-transport draft, section 6, for details */
+/* dh_pub and dh_priv MUST be already initialised */
+void gen_kexdh_vals(mp_int *dh_pub, mp_int *dh_priv) {
+
+	DEF_MP_INT(dh_p);
+	DEF_MP_INT(dh_q);
+	DEF_MP_INT(dh_g);
+
+	TRACE(("enter send_msg_kexdh_reply"))
+	
+	m_mp_init_multi(&dh_g, &dh_p, &dh_q, NULL);
+
+	/* read the prime and generator*/
+	bytes_to_mp(&dh_p, (unsigned char*)dh_p_val, DH_P_LEN);
+	
+	if (mp_set_int(&dh_g, DH_G_VAL) != MP_OKAY) {
+		dropbear_exit("Diffie-Hellman error");
+	}
+
+	/* calculate q = (p-1)/2 */
+	/* dh_priv is just a temp var here */
+	if (mp_sub_d(&dh_p, 1, dh_priv) != MP_OKAY) { 
+		dropbear_exit("Diffie-Hellman error");
+	}
+	if (mp_div_2(dh_priv, &dh_q) != MP_OKAY) {
+		dropbear_exit("Diffie-Hellman error");
+	}
+
+	/* Generate a private portion 0 < dh_priv < dh_q */
+	gen_random_mpint(&dh_q, dh_priv);
+
+	/* f = g^y mod p */
+	if (mp_exptmod(&dh_g, dh_priv, &dh_p, dh_pub) != MP_OKAY) {
+		dropbear_exit("Diffie-Hellman error");
+	}
+	mp_clear_multi(&dh_g, &dh_p, &dh_q, NULL);
+}
+
+/* This function is fairly common between client/server, with some substitution
+ * of dh_e/dh_f etc. Hence these arguments:
+ * dh_pub_us is 'e' for the client, 'f' for the server. dh_pub_them is 
+ * vice-versa. dh_priv is the x/y value corresponding to dh_pub_us */
+void kexdh_comb_key(mp_int *dh_pub_us, mp_int *dh_priv, mp_int *dh_pub_them,
+		sign_key *hostkey) {
+
+	mp_int dh_p;
+	mp_int *dh_e = NULL, *dh_f = NULL;
+	hash_state hs;
+
+	/* read the prime and generator*/
+	mp_init(&dh_p);
+	bytes_to_mp(&dh_p, dh_p_val, DH_P_LEN);
+
+	/* Check that dh_pub_them (dh_e or dh_f) is in the range [1, p-1] */
+	if (mp_cmp(dh_pub_them, &dh_p) != MP_LT 
+			|| mp_cmp_d(dh_pub_them, 0) != MP_GT) {
+		dropbear_exit("Diffie-Hellman error");
+	}
+	
+	/* K = e^y mod p = f^x mod p */
+	ses.dh_K = (mp_int*)m_malloc(sizeof(mp_int));
+	m_mp_init(ses.dh_K);
+	if (mp_exptmod(dh_pub_them, dh_priv, &dh_p, ses.dh_K) != MP_OKAY) {
+		dropbear_exit("Diffie-Hellman error");
+	}
+
+	/* clear no longer needed vars */
+	mp_clear_multi(&dh_p, NULL);
+
+	/* From here on, the code needs to work with the _same_ vars on each side,
+	 * not vice-versaing for client/server */
+	if (IS_DROPBEAR_CLIENT) {
+		dh_e = dh_pub_us;
+		dh_f = dh_pub_them;
+	} else {
+		dh_e = dh_pub_them;
+		dh_f = dh_pub_us;
+	} 
+
+	/* Create the remainder of the hash buffer, to generate the exchange hash */
+	/* K_S, the host key */
+	buf_put_pub_key(ses.kexhashbuf, hostkey, ses.newkeys->algo_hostkey);
+	/* e, exchange value sent by the client */
+	buf_putmpint(ses.kexhashbuf, dh_e);
+	/* f, exchange value sent by the server */
+	buf_putmpint(ses.kexhashbuf, dh_f);
+	/* K, the shared secret */
+	buf_putmpint(ses.kexhashbuf, ses.dh_K);
+
+	/* calculate the hash H to sign */
+	sha1_init(&hs);
+	buf_setpos(ses.kexhashbuf, 0);
+	sha1_process(&hs, buf_getptr(ses.kexhashbuf, ses.kexhashbuf->len),
+			ses.kexhashbuf->len);
+	sha1_done(&hs, ses.hash);
+
+	buf_burn(ses.kexhashbuf);
+	buf_free(ses.kexhashbuf);
+	ses.kexhashbuf = NULL;
+	
+	/* first time around, we set the session_id to H */
+	if (ses.session_id == NULL) {
+		/* create the session_id, this never needs freeing */
+		ses.session_id = (unsigned char*)m_malloc(SHA1_HASH_SIZE);
+		memcpy(ses.session_id, ses.hash, SHA1_HASH_SIZE);
+	}
+}
+
+/* read the other side's algo list. buf_match_algo is a callback to match
+ * algos for the client or server. */
+static void read_kex_algos() {
+
+	/* for asymmetry */
+	algo_type * c2s_hash_algo = NULL;
+	algo_type * s2c_hash_algo = NULL;
+	algo_type * c2s_cipher_algo = NULL;
+	algo_type * s2c_cipher_algo = NULL;
+	algo_type * c2s_comp_algo = NULL;
+	algo_type * s2c_comp_algo = NULL;
+	/* the generic one */
+	algo_type * algo = NULL;
+
+	/* which algo couldn't match */
+	char * erralgo = NULL;
+
+	int goodguess = 0;
+	int allgood = 1; /* we AND this with each goodguess and see if its still
+						true after */
+
+	buf_incrpos(ses.payload, 16); /* start after the cookie */
+
+	ses.newkeys = (struct key_context*)m_malloc(sizeof(struct key_context));
+
+	/* kex_algorithms */
+	algo = ses.buf_match_algo(ses.payload, sshkex, &goodguess);
+	allgood &= goodguess;
+	if (algo == NULL) {
+		erralgo = "kex";
+		goto error;
+	}
+	TRACE(("kex algo %s", algo->name))
+	ses.newkeys->algo_kex = algo->val;
+
+	/* server_host_key_algorithms */
+	algo = ses.buf_match_algo(ses.payload, sshhostkey, &goodguess);
+	allgood &= goodguess;
+	if (algo == NULL) {
+		erralgo = "hostkey";
+		goto error;
+	}
+	TRACE(("hostkey algo %s", algo->name))
+	ses.newkeys->algo_hostkey = algo->val;
+
+	/* encryption_algorithms_client_to_server */
+	c2s_cipher_algo = ses.buf_match_algo(ses.payload, sshciphers, &goodguess);
+	if (c2s_cipher_algo == NULL) {
+		erralgo = "enc c->s";
+		goto error;
+	}
+	TRACE(("enc c2s is  %s", c2s_cipher_algo->name))
+
+	/* encryption_algorithms_server_to_client */
+	s2c_cipher_algo = ses.buf_match_algo(ses.payload, sshciphers, &goodguess);
+	if (s2c_cipher_algo == NULL) {
+		erralgo = "enc s->c";
+		goto error;
+	}
+	TRACE(("enc s2c is  %s", s2c_cipher_algo->name))
+
+	/* mac_algorithms_client_to_server */
+	c2s_hash_algo = ses.buf_match_algo(ses.payload, sshhashes, &goodguess);
+	if (c2s_hash_algo == NULL) {
+		erralgo = "mac c->s";
+		goto error;
+	}
+	TRACE(("hash c2s is  %s", c2s_hash_algo->name))
+
+	/* mac_algorithms_server_to_client */
+	s2c_hash_algo = ses.buf_match_algo(ses.payload, sshhashes, &goodguess);
+	if (s2c_hash_algo == NULL) {
+		erralgo = "mac s->c";
+		goto error;
+	}
+	TRACE(("hash s2c is  %s", s2c_hash_algo->name))
+
+	/* compression_algorithms_client_to_server */
+	c2s_comp_algo = ses.buf_match_algo(ses.payload, sshcompress, &goodguess);
+	if (c2s_comp_algo == NULL) {
+		erralgo = "comp c->s";
+		goto error;
+	}
+	TRACE(("hash c2s is  %s", c2s_comp_algo->name))
+
+	/* compression_algorithms_server_to_client */
+	s2c_comp_algo = ses.buf_match_algo(ses.payload, sshcompress, &goodguess);
+	if (s2c_comp_algo == NULL) {
+		erralgo = "comp s->c";
+		goto error;
+	}
+	TRACE(("hash s2c is  %s", s2c_comp_algo->name))
+
+	/* languages_client_to_server */
+	buf_eatstring(ses.payload);
+
+	/* languages_server_to_client */
+	buf_eatstring(ses.payload);
+
+	/* first_kex_packet_follows */
+	if (buf_getbool(ses.payload)) {
+		ses.kexstate.firstfollows = 1;
+		/* if the guess wasn't good, we ignore the packet sent */
+		if (!allgood) {
+			ses.ignorenext = 1;
+		}
+	}
+
+	/* Handle the asymmetry */
+	if (IS_DROPBEAR_CLIENT) {
+		ses.newkeys->recv_algo_crypt = 
+			(struct dropbear_cipher*)s2c_cipher_algo->data;
+		ses.newkeys->trans_algo_crypt = 
+			(struct dropbear_cipher*)c2s_cipher_algo->data;
+		ses.newkeys->recv_algo_mac = 
+			(struct dropbear_hash*)s2c_hash_algo->data;
+		ses.newkeys->trans_algo_mac = 
+			(struct dropbear_hash*)c2s_hash_algo->data;
+		ses.newkeys->recv_algo_comp = s2c_comp_algo->val;
+		ses.newkeys->trans_algo_comp = c2s_comp_algo->val;
+	} else {
+		/* SERVER */
+		ses.newkeys->recv_algo_crypt = 
+			(struct dropbear_cipher*)c2s_cipher_algo->data;
+		ses.newkeys->trans_algo_crypt = 
+			(struct dropbear_cipher*)s2c_cipher_algo->data;
+		ses.newkeys->recv_algo_mac = 
+			(struct dropbear_hash*)c2s_hash_algo->data;
+		ses.newkeys->trans_algo_mac = 
+			(struct dropbear_hash*)s2c_hash_algo->data;
+		ses.newkeys->recv_algo_comp = c2s_comp_algo->val;
+		ses.newkeys->trans_algo_comp = s2c_comp_algo->val;
+	}
+
+	/* reserved for future extensions */
+	buf_getint(ses.payload);
+	return;
+
+error:
+	dropbear_exit("no matching algo %s", erralgo);
+}
diff --git a/common-runopts.c b/common-runopts.c
new file mode 100644
index 0000000..2de036e
--- /dev/null
+++ b/common-runopts.c
@@ -0,0 +1,57 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "runopts.h"
+#include "signkey.h"
+#include "buffer.h"
+#include "dbutil.h"
+#include "auth.h"
+
+runopts opts; /* GLOBAL */
+
+/* returns success or failure, and the keytype in *type. If we want
+ * to restrict the type, type can contain a type to return */
+int readhostkey(const char * filename, sign_key * hostkey, int *type) {
+
+	int ret = DROPBEAR_FAILURE;
+	buffer *buf;
+
+	buf = buf_new(MAX_PRIVKEY_SIZE);
+
+	if (buf_readfile(buf, filename) == DROPBEAR_FAILURE) {
+		goto out;
+	}
+	buf_setpos(buf, 0);
+	if (buf_get_priv_key(buf, hostkey, type) == DROPBEAR_FAILURE) {
+		goto out;
+	}
+
+	ret = DROPBEAR_SUCCESS;
+out:
+
+	buf_burn(buf);
+	buf_free(buf);
+	return ret;
+}
diff --git a/common-session.c b/common-session.c
new file mode 100644
index 0000000..4c15391
--- /dev/null
+++ b/common-session.c
@@ -0,0 +1,373 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "session.h"
+#include "dbutil.h"
+#include "packet.h"
+#include "algo.h"
+#include "buffer.h"
+#include "dss.h"
+#include "ssh.h"
+#include "random.h"
+#include "kex.h"
+#include "channel.h"
+#include "atomicio.h"
+
+static void checktimeouts();
+static int ident_readln(int fd, char* buf, int count);
+
+struct sshsession ses; /* GLOBAL */
+
+/* need to know if the session struct has been initialised, this way isn't the
+ * cleanest, but works OK */
+int sessinitdone = 0; /* GLOBAL */
+
+/* this is set when we get SIGINT or SIGTERM, the handler is in main.c */
+int exitflag = 0; /* GLOBAL */
+
+
+
+/* called only at the start of a session, set up initial state */
+void common_session_init(int sock, char* remotehost) {
+
+	TRACE(("enter session_init"))
+
+	ses.remotehost = remotehost;
+
+	ses.sock = sock;
+	ses.maxfd = sock;
+
+	ses.connecttimeout = 0;
+	
+	kexfirstinitialise(); /* initialise the kex state */
+
+	ses.writepayload = buf_new(MAX_TRANS_PAYLOAD_LEN);
+	ses.transseq = 0;
+
+	ses.readbuf = NULL;
+	ses.decryptreadbuf = NULL;
+	ses.payload = NULL;
+	ses.recvseq = 0;
+
+	initqueue(&ses.writequeue);
+
+	ses.requirenext = SSH_MSG_KEXINIT;
+	ses.dataallowed = 0; /* don't send data yet, we'll wait until after kex */
+	ses.ignorenext = 0;
+	ses.lastpacket = 0;
+
+	/* set all the algos to none */
+	ses.keys = (struct key_context*)m_malloc(sizeof(struct key_context));
+	ses.newkeys = NULL;
+	ses.keys->recv_algo_crypt = &dropbear_nocipher;
+	ses.keys->trans_algo_crypt = &dropbear_nocipher;
+	
+	ses.keys->recv_algo_mac = &dropbear_nohash;
+	ses.keys->trans_algo_mac = &dropbear_nohash;
+
+	ses.keys->algo_kex = -1;
+	ses.keys->algo_hostkey = -1;
+	ses.keys->recv_algo_comp = DROPBEAR_COMP_NONE;
+	ses.keys->trans_algo_comp = DROPBEAR_COMP_NONE;
+
+#ifndef DISABLE_ZLIB
+	ses.keys->recv_zstream = NULL;
+	ses.keys->trans_zstream = NULL;
+#endif
+
+	/* key exchange buffers */
+	ses.session_id = NULL;
+	ses.kexhashbuf = NULL;
+	ses.transkexinit = NULL;
+	ses.dh_K = NULL;
+	ses.remoteident = NULL;
+
+	ses.chantypes = NULL;
+
+	ses.allowprivport = 0;
+
+
+	TRACE(("leave session_init"))
+}
+
+void session_loop(void(*loophandler)()) {
+
+	fd_set readfd, writefd;
+	struct timeval timeout;
+	int val;
+
+	/* main loop, select()s for all sockets in use */
+	for(;;) {
+
+		timeout.tv_sec = SELECT_TIMEOUT;
+		timeout.tv_usec = 0;
+		FD_ZERO(&writefd);
+		FD_ZERO(&readfd);
+		dropbear_assert(ses.payload == NULL);
+		if (ses.sock != -1) {
+			FD_SET(ses.sock, &readfd);
+			if (!isempty(&ses.writequeue)) {
+				FD_SET(ses.sock, &writefd);
+			}
+		}
+
+		/* set up for channels which require reading/writing */
+		if (ses.dataallowed) {
+			setchannelfds(&readfd, &writefd);
+		}
+		val = select(ses.maxfd+1, &readfd, &writefd, NULL, &timeout);
+
+		if (exitflag) {
+			dropbear_exit("Terminated by signal");
+		}
+		
+		if (val < 0) {
+			if (errno == EINTR) {
+				/* This must happen even if we've been interrupted, so that
+				 * changed signal-handler vars can take effect etc */
+				if (loophandler) {
+					loophandler();
+				}
+				continue;
+			} else {
+				dropbear_exit("Error in select");
+			}
+		}
+
+		/* check for auth timeout, rekeying required etc */
+		checktimeouts();
+		
+		if (val == 0) {
+			/* timeout */
+			TRACE(("select timeout"))
+			continue;
+		}
+
+		/* process session socket's incoming/outgoing data */
+		if (ses.sock != -1) {
+			if (FD_ISSET(ses.sock, &writefd) && !isempty(&ses.writequeue)) {
+				write_packet();
+			}
+
+			if (FD_ISSET(ses.sock, &readfd)) {
+				read_packet();
+			}
+			
+			/* Process the decrypted packet. After this, the read buffer
+			 * will be ready for a new packet */
+			if (ses.payload != NULL) {
+				process_packet();
+			}
+		}
+
+		/* process pipes etc for the channels, ses.dataallowed == 0
+		 * during rekeying ) */
+		if (ses.dataallowed) {
+			channelio(&readfd, &writefd);
+		}
+
+		if (loophandler) {
+			loophandler();
+		}
+
+	} /* for(;;) */
+	
+	/* Not reached */
+}
+
+/* clean up a session on exit */
+void common_session_cleanup() {
+	
+	TRACE(("enter session_cleanup"))
+	
+	/* we can't cleanup if we don't know the session state */
+	if (!sessinitdone) {
+		TRACE(("leave session_cleanup: !sessinitdone"))
+		return;
+	}
+	
+	m_free(ses.session_id);
+	m_burn(ses.keys, sizeof(struct key_context));
+	m_free(ses.keys);
+
+	chancleanup();
+
+	TRACE(("leave session_cleanup"))
+}
+
+
+void session_identification() {
+
+	/* max length of 255 chars */
+	char linebuf[256];
+	int len = 0;
+	char done = 0;
+	int i;
+
+	/* write our version string, this blocks */
+	if (atomicio(write, ses.sock, LOCAL_IDENT "\r\n",
+				strlen(LOCAL_IDENT "\r\n")) == DROPBEAR_FAILURE) {
+		dropbear_exit("Error writing ident string");
+	}
+
+    /* If they send more than 50 lines, something is wrong */
+	for (i = 0; i < 50; i++) {
+		len = ident_readln(ses.sock, linebuf, sizeof(linebuf));
+
+		if (len < 0 && errno != EINTR) {
+			/* It failed */
+			break;
+		}
+
+		if (len >= 4 && memcmp(linebuf, "SSH-", 4) == 0) {
+			/* start of line matches */
+			done = 1;
+			break;
+		}
+	}
+
+	if (!done) {
+		TRACE(("err: %s for '%s'\n", strerror(errno), linebuf))
+		dropbear_exit("Failed to get remote version");
+	} else {
+		/* linebuf is already null terminated */
+		ses.remoteident = m_malloc(len);
+		memcpy(ses.remoteident, linebuf, len);
+	}
+
+    /* Shall assume that 2.x will be backwards compatible. */
+    if (strncmp(ses.remoteident, "SSH-2.", 6) != 0
+            && strncmp(ses.remoteident, "SSH-1.99-", 9) != 0) {
+        dropbear_exit("Incompatible remote version '%s'", ses.remoteident);
+    }
+
+	TRACE(("remoteident: %s", ses.remoteident))
+
+}
+
+/* returns the length including null-terminating zero on success,
+ * or -1 on failure */
+static int ident_readln(int fd, char* buf, int count) {
+	
+	char in;
+	int pos = 0;
+	int num = 0;
+	fd_set fds;
+	struct timeval timeout;
+
+	TRACE(("enter ident_readln"))
+
+	if (count < 1) {
+		return -1;
+	}
+
+	FD_ZERO(&fds);
+
+	/* select since it's a non-blocking fd */
+	
+	/* leave space to null-terminate */
+	while (pos < count-1) {
+
+		FD_SET(fd, &fds);
+
+		timeout.tv_sec = 1;
+		timeout.tv_usec = 0;
+		if (select(fd+1, &fds, NULL, NULL, &timeout) < 0) {
+			if (errno == EINTR) {
+				continue;
+			}
+			TRACE(("leave ident_readln: select error"))
+			return -1;
+		}
+
+		checktimeouts();
+		
+		/* Have to go one byte at a time, since we don't want to read past
+		 * the end, and have to somehow shove bytes back into the normal
+		 * packet reader */
+		if (FD_ISSET(fd, &fds)) {
+			num = read(fd, &in, 1);
+			/* a "\n" is a newline, "\r" we want to read in and keep going
+			 * so that it won't be read as part of the next line */
+			if (num < 0) {
+				/* error */
+				if (errno == EINTR) {
+					continue; /* not a real error */
+				}
+				TRACE(("leave ident_readln: read error"))
+				return -1;
+			}
+			if (num == 0) {
+				/* EOF */
+				TRACE(("leave ident_readln: EOF"))
+				return -1;
+			}
+			if (in == '\n') {
+				/* end of ident string */
+				break;
+			}
+			/* we don't want to include '\r's */
+			if (in != '\r') {
+				buf[pos] = in;
+				pos++;
+			}
+		}
+	}
+
+	buf[pos] = '\0';
+	TRACE(("leave ident_readln: return %d", pos+1))
+	return pos+1;
+}
+
+/* Check all timeouts which are required. Currently these are the time for
+ * user authentication, and the automatic rekeying. */
+static void checktimeouts() {
+
+	struct timeval tv;
+	long secs;
+
+	if (gettimeofday(&tv, 0) < 0) {
+		dropbear_exit("Error getting time");
+	}
+
+	secs = tv.tv_sec;
+	
+	if (ses.connecttimeout != 0 && secs > ses.connecttimeout) {
+			dropbear_close("Timeout before auth");
+	}
+
+	/* we can't rekey if we haven't done remote ident exchange yet */
+	if (ses.remoteident == NULL) {
+		return;
+	}
+
+	if (!ses.kexstate.sentkexinit
+			&& (secs - ses.kexstate.lastkextime >= KEX_REKEY_TIMEOUT
+			|| ses.kexstate.datarecv+ses.kexstate.datatrans >= KEX_REKEY_DATA)){
+		TRACE(("rekeying after timeout or max data reached"))
+		send_msg_kexinit();
+	}
+}
+
diff --git a/compat.c b/compat.c
new file mode 100644
index 0000000..7e0c1ac
--- /dev/null
+++ b/compat.c
@@ -0,0 +1,281 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * strlcat() is copyright as follows:
+ * Copyright (c) 1998 Todd C. Miller <Todd.Miller@courtesan.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * daemon() and getusershell() is copyright as follows:
+ *
+ * Copyright (c) 1990, 1993
+ *      The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+		* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+* SUCH DAMAGE.
+*
+* Modifications for Dropbear to getusershell() are by Paul Marinceu
+*/
+
+#include "includes.h"
+
+#ifndef HAVE_GETUSERSHELL
+static char **curshell, **shells, *strings;
+static char **initshells();
+#endif
+
+#ifndef HAVE_STRLCPY
+/* Implemented by matt as specified in freebsd 4.7 manpage.
+ * We don't require great speed, is simply for use with sshpty code */
+size_t strlcpy(char *dst, const char *src, size_t size) {
+
+	size_t i;
+
+	/* this is undefined, though size==0 -> return 0 */
+	if (size < 1) {
+		return 0;
+	}
+
+	for (i = 0; i < size-1; i++) {
+		if (src[i] == '\0') {
+			break;
+		} else {
+			dst[i] = src[i];
+		}
+	}
+
+	dst[i] = '\0';
+	return strlen(src);
+
+}
+#endif /* HAVE_STRLCPY */
+
+#ifndef HAVE_STRLCAT
+/* taken from openbsd-compat for OpenSSH 3.6.1p1 */
+/* "$OpenBSD: strlcat.c,v 1.8 2001/05/13 15:40:15 deraadt Exp $"
+ *
+ * Appends src to string dst of size siz (unlike strncat, siz is the
+ * full size of dst, not space left).  At most siz-1 characters
+ * will be copied.  Always NUL terminates (unless siz <= strlen(dst)).
+ * Returns strlen(src) + MIN(siz, strlen(initial dst)).
+ * If retval >= siz, truncation occurred.
+ */
+	size_t
+strlcat(dst, src, siz)
+	char *dst;
+	const char *src;
+	size_t siz;
+{
+	register char *d = dst;
+	register const char *s = src;
+	register size_t n = siz;
+	size_t dlen;
+
+	/* Find the end of dst and adjust bytes left but don't go past end */
+	while (n-- != 0 && *d != '\0')
+		d++;
+	dlen = d - dst;
+	n = siz - dlen;
+
+	if (n == 0)
+		return(dlen + strlen(s));
+	while (*s != '\0') {
+		if (n != 1) {
+			*d++ = *s;
+			n--;
+		}
+		s++;
+	}
+	*d = '\0';
+
+	return(dlen + (s - src));	/* count does not include NUL */
+}
+#endif /* HAVE_STRLCAT */
+
+#ifndef HAVE_DAEMON
+/* From NetBSD - daemonise a process */
+
+int daemon(int nochdir, int noclose) {
+
+	int fd;
+
+	switch (fork()) {
+		case -1:
+			return (-1);
+		case 0:
+			break;
+		default:
+			_exit(0);
+	}
+
+	if (setsid() == -1)
+		return -1;
+
+	if (!nochdir)
+		(void)chdir("/");
+
+	if (!noclose && (fd = open(_PATH_DEVNULL, O_RDWR, 0)) != -1) {
+		(void)dup2(fd, STDIN_FILENO);
+		(void)dup2(fd, STDOUT_FILENO);
+		(void)dup2(fd, STDERR_FILENO);
+		if (fd > STDERR_FILENO)
+			(void)close(fd);
+	}
+	return 0;
+}
+#endif /* HAVE_DAEMON */
+
+#ifndef HAVE_BASENAME
+
+char *basename(const char *path) {
+
+	char *foo = strrchr(path, '/');
+	return ++foo;
+}
+
+#endif /* HAVE_BASENAME */
+
+#ifndef HAVE_GETUSERSHELL
+
+/*
+ * Get a list of shells from /etc/shells, if it exists.
+ */
+char * getusershell() {
+	char *ret;
+
+	if (curshell == NULL)
+		curshell = initshells();
+	ret = *curshell;
+	if (ret != NULL)
+		curshell++;
+	return (ret);
+}
+
+void endusershell() {
+
+	if (shells != NULL)
+		free(shells);
+	shells = NULL;
+	if (strings != NULL)
+		free(strings);
+	strings = NULL;
+	curshell = NULL;
+}
+
+void setusershell() {
+	curshell = initshells();
+}
+
+static char **initshells() {
+	/* don't touch this list. */
+	const char *okshells[] = { "/bin/sh", "/bin/csh", NULL };
+	register char **sp, *cp;
+	register FILE *fp;
+	struct stat statb;
+	int flen;
+
+	if (shells != NULL)
+		free(shells);
+	shells = NULL;
+	if (strings != NULL)
+		free(strings);
+	strings = NULL;
+	if ((fp = fopen("/etc/shells", "rc")) == NULL)
+		return (char **) okshells;
+	if (fstat(fileno(fp), &statb) == -1) {
+		(void)fclose(fp);
+		return (char **) okshells;
+	}
+	if ((strings = malloc((u_int)statb.st_size + 1)) == NULL) {
+		(void)fclose(fp);
+		return (char **) okshells;
+	}
+	shells = calloc((unsigned)statb.st_size / 3, sizeof (char *));
+	if (shells == NULL) {
+		(void)fclose(fp);
+		free(strings);
+		strings = NULL;
+		return (char **) okshells;
+	}
+	sp = shells;
+	cp = strings;
+	flen = statb.st_size;
+	while (fgets(cp, flen - (cp - strings), fp) != NULL) {
+		while (*cp != '#' && *cp != '/' && *cp != '\0')
+			cp++;
+		if (*cp == '#' || *cp == '\0')
+			continue;
+		*sp++ = cp;
+		while (!isspace(*cp) && *cp != '#' && *cp != '\0')
+			cp++;
+		*cp++ = '\0';
+	}
+	*sp = NULL;
+	(void)fclose(fp);
+	return (shells);
+}
+
+#endif /* HAVE_GETUSERSHELL */
diff --git a/compat.h b/compat.h
new file mode 100644
index 0000000..1ab344f
--- /dev/null
+++ b/compat.h
@@ -0,0 +1,56 @@
+/*
+ * Dropbear SSH
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _COMPAT_H_
+#define _COMPAT_H_
+
+#include "includes.h"
+
+#ifndef HAVE_STRLCPY
+size_t strlcpy(char *dst, const char *src, size_t size);
+#endif
+
+#ifndef HAVE_STRLCAT
+size_t strlcat(char *dst, const char *src, size_t siz);
+#endif
+
+#ifndef HAVE_DAEMON
+int daemon(int nochdir, int noclose);
+#endif
+
+#ifndef HAVE_BASENAME
+char *basename(const char* path);
+#endif
+
+#ifndef HAVE_GETUSERSHELL
+char *getusershell();
+void setusershell();
+void endusershell();
+#endif
+
+#ifndef _PATH_DEVNULL
+#define _PATH_DEVNULL "/dev/null"
+#endif
+
+#endif /* _COMPAT_H_ */
diff --git a/config.guess b/config.guess
new file mode 100644
index 0000000..9e55e1a
--- /dev/null
+++ b/config.guess
@@ -0,0 +1,1391 @@
+#! /bin/sh
+# Attempt to guess a canonical system name.
+#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
+#   2000, 2001, 2002, 2003 Free Software Foundation, Inc.
+
+timestamp='2003-05-19'
+
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# Originally written by Per Bothner <per@bothner.com>.
+# Please send patches to <config-patches@gnu.org>.  Submit a context
+# diff and a properly formatted ChangeLog entry.
+#
+# This script attempts to guess a canonical system name similar to
+# config.sub.  If it succeeds, it prints the system name on stdout, and
+# exits with 0.  Otherwise, it exits with 1.
+#
+# The plan is that this can be called by configure scripts if you
+# don't specify an explicit build system type.
+
+me=`echo "$0" | sed -e 's,.*/,,'`
+
+usage="\
+Usage: $0 [OPTION]
+
+Output the configuration name of the system \`$me' is run on.
+
+Operation modes:
+  -h, --help         print this help, then exit
+  -t, --time-stamp   print date of last modification, then exit
+  -v, --version      print version number, then exit
+
+Report bugs and patches to <config-patches@gnu.org>."
+
+version="\
+GNU config.guess ($timestamp)
+
+Originally written by Per Bothner.
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001
+Free Software Foundation, Inc.
+
+This is free software; see the source for copying conditions.  There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
+
+help="
+Try \`$me --help' for more information."
+
+# Parse command line
+while test $# -gt 0 ; do
+  case $1 in
+    --time-stamp | --time* | -t )
+       echo "$timestamp" ; exit 0 ;;
+    --version | -v )
+       echo "$version" ; exit 0 ;;
+    --help | --h* | -h )
+       echo "$usage"; exit 0 ;;
+    -- )     # Stop option processing
+       shift; break ;;
+    - )	# Use stdin as input.
+       break ;;
+    -* )
+       echo "$me: invalid option $1$help" >&2
+       exit 1 ;;
+    * )
+       break ;;
+  esac
+done
+
+if test $# != 0; then
+  echo "$me: too many arguments$help" >&2
+  exit 1
+fi
+
+trap 'exit 1' 1 2 15
+
+# CC_FOR_BUILD -- compiler used by this script. Note that the use of a
+# compiler to aid in system detection is discouraged as it requires
+# temporary files to be created and, as you can see below, it is a
+# headache to deal with in a portable fashion.
+
+# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still
+# use `HOST_CC' if defined, but it is deprecated.
+
+# Portable tmp directory creation inspired by the Autoconf team.
+
+set_cc_for_build='
+trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ;
+trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ;
+: ${TMPDIR=/tmp} ;
+ { tmp=`(umask 077 && mktemp -d -q "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } ||
+ { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } ||
+ { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ;
+dummy=$tmp/dummy ;
+tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ;
+case $CC_FOR_BUILD,$HOST_CC,$CC in
+ ,,)    echo "int x;" > $dummy.c ;
+	for c in cc gcc c89 c99 ; do
+	  if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then
+	     CC_FOR_BUILD="$c"; break ;
+	  fi ;
+	done ;
+	if test x"$CC_FOR_BUILD" = x ; then
+	  CC_FOR_BUILD=no_compiler_found ;
+	fi
+	;;
+ ,,*)   CC_FOR_BUILD=$CC ;;
+ ,*,*)  CC_FOR_BUILD=$HOST_CC ;;
+esac ;'
+
+# This is needed to find uname on a Pyramid OSx when run in the BSD universe.
+# (ghazi@noc.rutgers.edu 1994-08-24)
+if (test -f /.attbin/uname) >/dev/null 2>&1 ; then
+	PATH=$PATH:/.attbin ; export PATH
+fi
+
+UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown
+UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown
+UNAME_SYSTEM=`(uname -s) 2>/dev/null`  || UNAME_SYSTEM=unknown
+UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
+
+# Note: order is significant - the case branches are not exclusive.
+
+case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
+    *:NetBSD:*:*)
+	# NetBSD (nbsd) targets should (where applicable) match one or
+	# more of the tupples: *-*-netbsdelf*, *-*-netbsdaout*,
+	# *-*-netbsdecoff* and *-*-netbsd*.  For targets that recently
+	# switched to ELF, *-*-netbsd* would select the old
+	# object file format.  This provides both forward
+	# compatibility and a consistent mechanism for selecting the
+	# object file format.
+	#
+	# Note: NetBSD doesn't particularly care about the vendor
+	# portion of the name.  We always set it to "unknown".
+	sysctl="sysctl -n hw.machine_arch"
+	UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \
+	    /usr/sbin/$sysctl 2>/dev/null || echo unknown)`
+	case "${UNAME_MACHINE_ARCH}" in
+	    armeb) machine=armeb-unknown ;;
+	    arm*) machine=arm-unknown ;;
+	    sh3el) machine=shl-unknown ;;
+	    sh3eb) machine=sh-unknown ;;
+	    *) machine=${UNAME_MACHINE_ARCH}-unknown ;;
+	esac
+	# The Operating System including object format, if it has switched
+	# to ELF recently, or will in the future.
+	case "${UNAME_MACHINE_ARCH}" in
+	    arm*|i386|m68k|ns32k|sh3*|sparc|vax)
+		eval $set_cc_for_build
+		if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
+			| grep __ELF__ >/dev/null
+		then
+		    # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout).
+		    # Return netbsd for either.  FIX?
+		    os=netbsd
+		else
+		    os=netbsdelf
+		fi
+		;;
+	    *)
+	        os=netbsd
+		;;
+	esac
+	# The OS release
+	# Debian GNU/NetBSD machines have a different userland, and
+	# thus, need a distinct triplet. However, they do not need
+	# kernel version information, so it can be replaced with a
+	# suitable tag, in the style of linux-gnu.
+	case "${UNAME_VERSION}" in
+	    Debian*)
+		release='-gnu'
+		;;
+	    *)
+		release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
+		;;
+	esac
+	# Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
+	# contains redundant information, the shorter form:
+	# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
+	echo "${machine}-${os}${release}"
+	exit 0 ;;
+    amiga:OpenBSD:*:*)
+	echo m68k-unknown-openbsd${UNAME_RELEASE}
+	exit 0 ;;
+    arc:OpenBSD:*:*)
+	echo mipsel-unknown-openbsd${UNAME_RELEASE}
+	exit 0 ;;
+    hp300:OpenBSD:*:*)
+	echo m68k-unknown-openbsd${UNAME_RELEASE}
+	exit 0 ;;
+    mac68k:OpenBSD:*:*)
+	echo m68k-unknown-openbsd${UNAME_RELEASE}
+	exit 0 ;;
+    macppc:OpenBSD:*:*)
+	echo powerpc-unknown-openbsd${UNAME_RELEASE}
+	exit 0 ;;
+    mvme68k:OpenBSD:*:*)
+	echo m68k-unknown-openbsd${UNAME_RELEASE}
+	exit 0 ;;
+    mvme88k:OpenBSD:*:*)
+	echo m88k-unknown-openbsd${UNAME_RELEASE}
+	exit 0 ;;
+    mvmeppc:OpenBSD:*:*)
+	echo powerpc-unknown-openbsd${UNAME_RELEASE}
+	exit 0 ;;
+    pmax:OpenBSD:*:*)
+	echo mipsel-unknown-openbsd${UNAME_RELEASE}
+	exit 0 ;;
+    sgi:OpenBSD:*:*)
+	echo mipseb-unknown-openbsd${UNAME_RELEASE}
+	exit 0 ;;
+    sun3:OpenBSD:*:*)
+	echo m68k-unknown-openbsd${UNAME_RELEASE}
+	exit 0 ;;
+    wgrisc:OpenBSD:*:*)
+	echo mipsel-unknown-openbsd${UNAME_RELEASE}
+	exit 0 ;;
+    *:OpenBSD:*:*)
+	echo ${UNAME_MACHINE}-unknown-openbsd${UNAME_RELEASE}
+	exit 0 ;;
+    alpha:OSF1:*:*)
+	if test $UNAME_RELEASE = "V4.0"; then
+		UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'`
+	fi
+	# According to Compaq, /usr/sbin/psrinfo has been available on
+	# OSF/1 and Tru64 systems produced since 1995.  I hope that
+	# covers most systems running today.  This code pipes the CPU
+	# types through head -n 1, so we only detect the type of CPU 0.
+	ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^  The alpha \(.*\) processor.*$/\1/p' | head -n 1`
+	case "$ALPHA_CPU_TYPE" in
+	    "EV4 (21064)")
+		UNAME_MACHINE="alpha" ;;
+	    "EV4.5 (21064)")
+		UNAME_MACHINE="alpha" ;;
+	    "LCA4 (21066/21068)")
+		UNAME_MACHINE="alpha" ;;
+	    "EV5 (21164)")
+		UNAME_MACHINE="alphaev5" ;;
+	    "EV5.6 (21164A)")
+		UNAME_MACHINE="alphaev56" ;;
+	    "EV5.6 (21164PC)")
+		UNAME_MACHINE="alphapca56" ;;
+	    "EV5.7 (21164PC)")
+		UNAME_MACHINE="alphapca57" ;;
+	    "EV6 (21264)")
+		UNAME_MACHINE="alphaev6" ;;
+	    "EV6.7 (21264A)")
+		UNAME_MACHINE="alphaev67" ;;
+	    "EV6.8CB (21264C)")
+		UNAME_MACHINE="alphaev68" ;;
+	    "EV6.8AL (21264B)")
+		UNAME_MACHINE="alphaev68" ;;
+	    "EV6.8CX (21264D)")
+		UNAME_MACHINE="alphaev68" ;;
+	    "EV6.9A (21264/EV69A)")
+		UNAME_MACHINE="alphaev69" ;;
+	    "EV7 (21364)")
+		UNAME_MACHINE="alphaev7" ;;
+	    "EV7.9 (21364A)")
+		UNAME_MACHINE="alphaev79" ;;
+	esac
+	# A Vn.n version is a released version.
+	# A Tn.n version is a released field test version.
+	# A Xn.n version is an unreleased experimental baselevel.
+	# 1.2 uses "1.2" for uname -r.
+	echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[VTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
+	exit 0 ;;
+    Alpha\ *:Windows_NT*:*)
+	# How do we know it's Interix rather than the generic POSIX subsystem?
+	# Should we change UNAME_MACHINE based on the output of uname instead
+	# of the specific Alpha model?
+	echo alpha-pc-interix
+	exit 0 ;;
+    21064:Windows_NT:50:3)
+	echo alpha-dec-winnt3.5
+	exit 0 ;;
+    Amiga*:UNIX_System_V:4.0:*)
+	echo m68k-unknown-sysv4
+	exit 0;;
+    *:[Aa]miga[Oo][Ss]:*:*)
+	echo ${UNAME_MACHINE}-unknown-amigaos
+	exit 0 ;;
+    *:[Mm]orph[Oo][Ss]:*:*)
+	echo ${UNAME_MACHINE}-unknown-morphos
+	exit 0 ;;
+    *:OS/390:*:*)
+	echo i370-ibm-openedition
+	exit 0 ;;
+    arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
+	echo arm-acorn-riscix${UNAME_RELEASE}
+	exit 0;;
+    SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
+	echo hppa1.1-hitachi-hiuxmpp
+	exit 0;;
+    Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*)
+	# akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE.
+	if test "`(/bin/universe) 2>/dev/null`" = att ; then
+		echo pyramid-pyramid-sysv3
+	else
+		echo pyramid-pyramid-bsd
+	fi
+	exit 0 ;;
+    NILE*:*:*:dcosx)
+	echo pyramid-pyramid-svr4
+	exit 0 ;;
+    DRS?6000:UNIX_SV:4.2*:7*)
+	case `/usr/bin/uname -p` in
+	    sparc) echo sparc-icl-nx7 && exit 0 ;;
+	esac ;;
+    sun4H:SunOS:5.*:*)
+	echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit 0 ;;
+    sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
+	echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit 0 ;;
+    i86pc:SunOS:5.*:*)
+	echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit 0 ;;
+    sun4*:SunOS:6*:*)
+	# According to config.sub, this is the proper way to canonicalize
+	# SunOS6.  Hard to guess exactly what SunOS6 will be like, but
+	# it's likely to be more like Solaris than SunOS4.
+	echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit 0 ;;
+    sun4*:SunOS:*:*)
+	case "`/usr/bin/arch -k`" in
+	    Series*|S4*)
+		UNAME_RELEASE=`uname -v`
+		;;
+	esac
+	# Japanese Language versions have a version number like `4.1.3-JL'.
+	echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'`
+	exit 0 ;;
+    sun3*:SunOS:*:*)
+	echo m68k-sun-sunos${UNAME_RELEASE}
+	exit 0 ;;
+    sun*:*:4.2BSD:*)
+	UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
+	test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3
+	case "`/bin/arch`" in
+	    sun3)
+		echo m68k-sun-sunos${UNAME_RELEASE}
+		;;
+	    sun4)
+		echo sparc-sun-sunos${UNAME_RELEASE}
+		;;
+	esac
+	exit 0 ;;
+    aushp:SunOS:*:*)
+	echo sparc-auspex-sunos${UNAME_RELEASE}
+	exit 0 ;;
+    # The situation for MiNT is a little confusing.  The machine name
+    # can be virtually everything (everything which is not
+    # "atarist" or "atariste" at least should have a processor
+    # > m68000).  The system name ranges from "MiNT" over "FreeMiNT"
+    # to the lowercase version "mint" (or "freemint").  Finally
+    # the system name "TOS" denotes a system which is actually not
+    # MiNT.  But MiNT is downward compatible to TOS, so this should
+    # be no problem.
+    atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*)
+        echo m68k-atari-mint${UNAME_RELEASE}
+	exit 0 ;;
+    atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*)
+	echo m68k-atari-mint${UNAME_RELEASE}
+        exit 0 ;;
+    *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*)
+        echo m68k-atari-mint${UNAME_RELEASE}
+	exit 0 ;;
+    milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*)
+        echo m68k-milan-mint${UNAME_RELEASE}
+        exit 0 ;;
+    hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*)
+        echo m68k-hades-mint${UNAME_RELEASE}
+        exit 0 ;;
+    *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*)
+        echo m68k-unknown-mint${UNAME_RELEASE}
+        exit 0 ;;
+    powerpc:machten:*:*)
+	echo powerpc-apple-machten${UNAME_RELEASE}
+	exit 0 ;;
+    RISC*:Mach:*:*)
+	echo mips-dec-mach_bsd4.3
+	exit 0 ;;
+    RISC*:ULTRIX:*:*)
+	echo mips-dec-ultrix${UNAME_RELEASE}
+	exit 0 ;;
+    VAX*:ULTRIX*:*:*)
+	echo vax-dec-ultrix${UNAME_RELEASE}
+	exit 0 ;;
+    2020:CLIX:*:* | 2430:CLIX:*:*)
+	echo clipper-intergraph-clix${UNAME_RELEASE}
+	exit 0 ;;
+    mips:*:*:UMIPS | mips:*:*:RISCos)
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+#ifdef __cplusplus
+#include <stdio.h>  /* for printf() prototype */
+	int main (int argc, char *argv[]) {
+#else
+	int main (argc, argv) int argc; char *argv[]; {
+#endif
+	#if defined (host_mips) && defined (MIPSEB)
+	#if defined (SYSTYPE_SYSV)
+	  printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0);
+	#endif
+	#if defined (SYSTYPE_SVR4)
+	  printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0);
+	#endif
+	#if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD)
+	  printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0);
+	#endif
+	#endif
+	  exit (-1);
+	}
+EOF
+	$CC_FOR_BUILD -o $dummy $dummy.c \
+	  && $dummy `echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` \
+	  && exit 0
+	echo mips-mips-riscos${UNAME_RELEASE}
+	exit 0 ;;
+    Motorola:PowerMAX_OS:*:*)
+	echo powerpc-motorola-powermax
+	exit 0 ;;
+    Motorola:*:4.3:PL8-*)
+	echo powerpc-harris-powermax
+	exit 0 ;;
+    Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*)
+	echo powerpc-harris-powermax
+	exit 0 ;;
+    Night_Hawk:Power_UNIX:*:*)
+	echo powerpc-harris-powerunix
+	exit 0 ;;
+    m88k:CX/UX:7*:*)
+	echo m88k-harris-cxux7
+	exit 0 ;;
+    m88k:*:4*:R4*)
+	echo m88k-motorola-sysv4
+	exit 0 ;;
+    m88k:*:3*:R3*)
+	echo m88k-motorola-sysv3
+	exit 0 ;;
+    AViiON:dgux:*:*)
+        # DG/UX returns AViiON for all architectures
+        UNAME_PROCESSOR=`/usr/bin/uname -p`
+	if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ]
+	then
+	    if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \
+	       [ ${TARGET_BINARY_INTERFACE}x = x ]
+	    then
+		echo m88k-dg-dgux${UNAME_RELEASE}
+	    else
+		echo m88k-dg-dguxbcs${UNAME_RELEASE}
+	    fi
+	else
+	    echo i586-dg-dgux${UNAME_RELEASE}
+	fi
+ 	exit 0 ;;
+    M88*:DolphinOS:*:*)	# DolphinOS (SVR3)
+	echo m88k-dolphin-sysv3
+	exit 0 ;;
+    M88*:*:R3*:*)
+	# Delta 88k system running SVR3
+	echo m88k-motorola-sysv3
+	exit 0 ;;
+    XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3)
+	echo m88k-tektronix-sysv3
+	exit 0 ;;
+    Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD)
+	echo m68k-tektronix-bsd
+	exit 0 ;;
+    *:IRIX*:*:*)
+	echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'`
+	exit 0 ;;
+    ????????:AIX?:[12].1:2)   # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX.
+	echo romp-ibm-aix      # uname -m gives an 8 hex-code CPU id
+	exit 0 ;;              # Note that: echo "'`uname -s`'" gives 'AIX '
+    i*86:AIX:*:*)
+	echo i386-ibm-aix
+	exit 0 ;;
+    ia64:AIX:*:*)
+	if [ -x /usr/bin/oslevel ] ; then
+		IBM_REV=`/usr/bin/oslevel`
+	else
+		IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
+	fi
+	echo ${UNAME_MACHINE}-ibm-aix${IBM_REV}
+	exit 0 ;;
+    *:AIX:2:3)
+	if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then
+		eval $set_cc_for_build
+		sed 's/^		//' << EOF >$dummy.c
+		#include <sys/systemcfg.h>
+
+		main()
+			{
+			if (!__power_pc())
+				exit(1);
+			puts("powerpc-ibm-aix3.2.5");
+			exit(0);
+			}
+EOF
+		$CC_FOR_BUILD -o $dummy $dummy.c && $dummy && exit 0
+		echo rs6000-ibm-aix3.2.5
+	elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then
+		echo rs6000-ibm-aix3.2.4
+	else
+		echo rs6000-ibm-aix3.2
+	fi
+	exit 0 ;;
+    *:AIX:*:[45])
+	IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'`
+	if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then
+		IBM_ARCH=rs6000
+	else
+		IBM_ARCH=powerpc
+	fi
+	if [ -x /usr/bin/oslevel ] ; then
+		IBM_REV=`/usr/bin/oslevel`
+	else
+		IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
+	fi
+	echo ${IBM_ARCH}-ibm-aix${IBM_REV}
+	exit 0 ;;
+    *:AIX:*:*)
+	echo rs6000-ibm-aix
+	exit 0 ;;
+    ibmrt:4.4BSD:*|romp-ibm:BSD:*)
+	echo romp-ibm-bsd4.4
+	exit 0 ;;
+    ibmrt:*BSD:*|romp-ibm:BSD:*)            # covers RT/PC BSD and
+	echo romp-ibm-bsd${UNAME_RELEASE}   # 4.3 with uname added to
+	exit 0 ;;                           # report: romp-ibm BSD 4.3
+    *:BOSX:*:*)
+	echo rs6000-bull-bosx
+	exit 0 ;;
+    DPX/2?00:B.O.S.:*:*)
+	echo m68k-bull-sysv3
+	exit 0 ;;
+    9000/[34]??:4.3bsd:1.*:*)
+	echo m68k-hp-bsd
+	exit 0 ;;
+    hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*)
+	echo m68k-hp-bsd4.4
+	exit 0 ;;
+    9000/[34678]??:HP-UX:*:*)
+	HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
+	case "${UNAME_MACHINE}" in
+	    9000/31? )            HP_ARCH=m68000 ;;
+	    9000/[34]?? )         HP_ARCH=m68k ;;
+	    9000/[678][0-9][0-9])
+		if [ -x /usr/bin/getconf ]; then
+		    sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null`
+                    sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
+                    case "${sc_cpu_version}" in
+                      523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0
+                      528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1
+                      532)                      # CPU_PA_RISC2_0
+                        case "${sc_kernel_bits}" in
+                          32) HP_ARCH="hppa2.0n" ;;
+                          64) HP_ARCH="hppa2.0w" ;;
+			  '') HP_ARCH="hppa2.0" ;;   # HP-UX 10.20
+                        esac ;;
+                    esac
+		fi
+		if [ "${HP_ARCH}" = "" ]; then
+		    eval $set_cc_for_build
+		    sed 's/^              //' << EOF >$dummy.c
+
+              #define _HPUX_SOURCE
+              #include <stdlib.h>
+              #include <unistd.h>
+
+              int main ()
+              {
+              #if defined(_SC_KERNEL_BITS)
+                  long bits = sysconf(_SC_KERNEL_BITS);
+              #endif
+                  long cpu  = sysconf (_SC_CPU_VERSION);
+
+                  switch (cpu)
+              	{
+              	case CPU_PA_RISC1_0: puts ("hppa1.0"); break;
+              	case CPU_PA_RISC1_1: puts ("hppa1.1"); break;
+              	case CPU_PA_RISC2_0:
+              #if defined(_SC_KERNEL_BITS)
+              	    switch (bits)
+              		{
+              		case 64: puts ("hppa2.0w"); break;
+              		case 32: puts ("hppa2.0n"); break;
+              		default: puts ("hppa2.0"); break;
+              		} break;
+              #else  /* !defined(_SC_KERNEL_BITS) */
+              	    puts ("hppa2.0"); break;
+              #endif
+              	default: puts ("hppa1.0"); break;
+              	}
+                  exit (0);
+              }
+EOF
+		    (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
+		    test -z "$HP_ARCH" && HP_ARCH=hppa
+		fi ;;
+	esac
+	if [ ${HP_ARCH} = "hppa2.0w" ]
+	then
+	    # avoid double evaluation of $set_cc_for_build
+	    test -n "$CC_FOR_BUILD" || eval $set_cc_for_build
+	    if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E -) | grep __LP64__ >/dev/null
+	    then
+		HP_ARCH="hppa2.0w"
+	    else
+		HP_ARCH="hppa64"
+	    fi
+	fi
+	echo ${HP_ARCH}-hp-hpux${HPUX_REV}
+	exit 0 ;;
+    ia64:HP-UX:*:*)
+	HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
+	echo ia64-hp-hpux${HPUX_REV}
+	exit 0 ;;
+    3050*:HI-UX:*:*)
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#include <unistd.h>
+	int
+	main ()
+	{
+	  long cpu = sysconf (_SC_CPU_VERSION);
+	  /* The order matters, because CPU_IS_HP_MC68K erroneously returns
+	     true for CPU_PA_RISC1_0.  CPU_IS_PA_RISC returns correct
+	     results, however.  */
+	  if (CPU_IS_PA_RISC (cpu))
+	    {
+	      switch (cpu)
+		{
+		  case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break;
+		  case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break;
+		  case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break;
+		  default: puts ("hppa-hitachi-hiuxwe2"); break;
+		}
+	    }
+	  else if (CPU_IS_HP_MC68K (cpu))
+	    puts ("m68k-hitachi-hiuxwe2");
+	  else puts ("unknown-hitachi-hiuxwe2");
+	  exit (0);
+	}
+EOF
+	$CC_FOR_BUILD -o $dummy $dummy.c && $dummy && exit 0
+	echo unknown-hitachi-hiuxwe2
+	exit 0 ;;
+    9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* )
+	echo hppa1.1-hp-bsd
+	exit 0 ;;
+    9000/8??:4.3bsd:*:*)
+	echo hppa1.0-hp-bsd
+	exit 0 ;;
+    *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*)
+	echo hppa1.0-hp-mpeix
+	exit 0 ;;
+    hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* )
+	echo hppa1.1-hp-osf
+	exit 0 ;;
+    hp8??:OSF1:*:*)
+	echo hppa1.0-hp-osf
+	exit 0 ;;
+    i*86:OSF1:*:*)
+	if [ -x /usr/sbin/sysversion ] ; then
+	    echo ${UNAME_MACHINE}-unknown-osf1mk
+	else
+	    echo ${UNAME_MACHINE}-unknown-osf1
+	fi
+	exit 0 ;;
+    parisc*:Lites*:*:*)
+	echo hppa1.1-hp-lites
+	exit 0 ;;
+    C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*)
+	echo c1-convex-bsd
+        exit 0 ;;
+    C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*)
+	if getsysinfo -f scalar_acc
+	then echo c32-convex-bsd
+	else echo c2-convex-bsd
+	fi
+        exit 0 ;;
+    C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*)
+	echo c34-convex-bsd
+        exit 0 ;;
+    C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*)
+	echo c38-convex-bsd
+        exit 0 ;;
+    C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*)
+	echo c4-convex-bsd
+        exit 0 ;;
+    CRAY*Y-MP:*:*:*)
+	echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit 0 ;;
+    CRAY*[A-Z]90:*:*:*)
+	echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \
+	| sed -e 's/CRAY.*\([A-Z]90\)/\1/' \
+	      -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \
+	      -e 's/\.[^.]*$/.X/'
+	exit 0 ;;
+    CRAY*TS:*:*:*)
+	echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit 0 ;;
+    CRAY*T3E:*:*:*)
+	echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit 0 ;;
+    CRAY*SV1:*:*:*)
+	echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+	exit 0 ;;
+    *:UNICOS/mp:*:*)
+	echo nv1-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' 
+	exit 0 ;;
+    F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
+	FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
+        FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
+        FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
+        echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
+        exit 0 ;;
+    i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
+	echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE}
+	exit 0 ;;
+    sparc*:BSD/OS:*:*)
+	echo sparc-unknown-bsdi${UNAME_RELEASE}
+	exit 0 ;;
+    *:BSD/OS:*:*)
+	echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE}
+	exit 0 ;;
+    *:FreeBSD:*:*|*:GNU/FreeBSD:*:*)
+	# Determine whether the default compiler uses glibc.
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#include <features.h>
+	#if __GLIBC__ >= 2
+	LIBC=gnu
+	#else
+	LIBC=
+	#endif
+EOF
+	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep ^LIBC=`
+	echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`${LIBC:+-$LIBC}
+	exit 0 ;;
+    i*:CYGWIN*:*)
+	echo ${UNAME_MACHINE}-pc-cygwin
+	exit 0 ;;
+    i*:MINGW*:*)
+	echo ${UNAME_MACHINE}-pc-mingw32
+	exit 0 ;;
+    i*:PW*:*)
+	echo ${UNAME_MACHINE}-pc-pw32
+	exit 0 ;;
+    x86:Interix*:3*)
+	echo i586-pc-interix3
+	exit 0 ;;
+    [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*)
+	echo i${UNAME_MACHINE}-pc-mks
+	exit 0 ;;
+    i*:Windows_NT*:* | Pentium*:Windows_NT*:*)
+	# How do we know it's Interix rather than the generic POSIX subsystem?
+	# It also conflicts with pre-2.0 versions of AT&T UWIN. Should we
+	# UNAME_MACHINE based on the output of uname instead of i386?
+	echo i586-pc-interix
+	exit 0 ;;
+    i*:UWIN*:*)
+	echo ${UNAME_MACHINE}-pc-uwin
+	exit 0 ;;
+    p*:CYGWIN*:*)
+	echo powerpcle-unknown-cygwin
+	exit 0 ;;
+    prep*:SunOS:5.*:*)
+	echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit 0 ;;
+    *:GNU:*:*)
+	echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
+	exit 0 ;;
+    i*86:Minix:*:*)
+	echo ${UNAME_MACHINE}-pc-minix
+	exit 0 ;;
+    arm*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit 0 ;;
+    cris:Linux:*:*)
+	echo cris-axis-linux-gnu
+	exit 0 ;;
+    ia64:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit 0 ;;
+    m68*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit 0 ;;
+    mips:Linux:*:*)
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#undef CPU
+	#undef mips
+	#undef mipsel
+	#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
+	CPU=mipsel
+	#else
+	#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
+	CPU=mips
+	#else
+	CPU=
+	#endif
+	#endif
+EOF
+	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep ^CPU=`
+	test x"${CPU}" != x && echo "${CPU}-unknown-linux-gnu" && exit 0
+	;;
+    mips64:Linux:*:*)
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#undef CPU
+	#undef mips64
+	#undef mips64el
+	#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
+	CPU=mips64el
+	#else
+	#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
+	CPU=mips64
+	#else
+	CPU=
+	#endif
+	#endif
+EOF
+	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep ^CPU=`
+	test x"${CPU}" != x && echo "${CPU}-unknown-linux-gnu" && exit 0
+	;;
+    ppc:Linux:*:*)
+	echo powerpc-unknown-linux-gnu
+	exit 0 ;;
+    ppc64:Linux:*:*)
+	echo powerpc64-unknown-linux-gnu
+	exit 0 ;;
+    alpha:Linux:*:*)
+	case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
+	  EV5)   UNAME_MACHINE=alphaev5 ;;
+	  EV56)  UNAME_MACHINE=alphaev56 ;;
+	  PCA56) UNAME_MACHINE=alphapca56 ;;
+	  PCA57) UNAME_MACHINE=alphapca56 ;;
+	  EV6)   UNAME_MACHINE=alphaev6 ;;
+	  EV67)  UNAME_MACHINE=alphaev67 ;;
+	  EV68*) UNAME_MACHINE=alphaev68 ;;
+        esac
+	objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null
+	if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
+	echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
+	exit 0 ;;
+    parisc:Linux:*:* | hppa:Linux:*:*)
+	# Look for CPU level
+	case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in
+	  PA7*) echo hppa1.1-unknown-linux-gnu ;;
+	  PA8*) echo hppa2.0-unknown-linux-gnu ;;
+	  *)    echo hppa-unknown-linux-gnu ;;
+	esac
+	exit 0 ;;
+    parisc64:Linux:*:* | hppa64:Linux:*:*)
+	echo hppa64-unknown-linux-gnu
+	exit 0 ;;
+    s390:Linux:*:* | s390x:Linux:*:*)
+	echo ${UNAME_MACHINE}-ibm-linux
+	exit 0 ;;
+    sh*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit 0 ;;
+    sparc:Linux:*:* | sparc64:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit 0 ;;
+    x86_64:Linux:*:*)
+	echo x86_64-unknown-linux-gnu
+	exit 0 ;;
+    i*86:Linux:*:*)
+	# The BFD linker knows what the default object file format is, so
+	# first see if it will tell us. cd to the root directory to prevent
+	# problems with other programs or directories called `ld' in the path.
+	# Set LC_ALL=C to ensure ld outputs messages in English.
+	ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \
+			 | sed -ne '/supported targets:/!d
+				    s/[ 	][ 	]*/ /g
+				    s/.*supported targets: *//
+				    s/ .*//
+				    p'`
+        case "$ld_supported_targets" in
+	  elf32-i386)
+		TENTATIVE="${UNAME_MACHINE}-pc-linux-gnu"
+		;;
+	  a.out-i386-linux)
+		echo "${UNAME_MACHINE}-pc-linux-gnuaout"
+		exit 0 ;;
+	  coff-i386)
+		echo "${UNAME_MACHINE}-pc-linux-gnucoff"
+		exit 0 ;;
+	  "")
+		# Either a pre-BFD a.out linker (linux-gnuoldld) or
+		# one that does not give us useful --help.
+		echo "${UNAME_MACHINE}-pc-linux-gnuoldld"
+		exit 0 ;;
+	esac
+	# Determine whether the default compiler is a.out or elf
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#include <features.h>
+	#ifdef __ELF__
+	# ifdef __GLIBC__
+	#  if __GLIBC__ >= 2
+	LIBC=gnu
+	#  else
+	LIBC=gnulibc1
+	#  endif
+	# else
+	LIBC=gnulibc1
+	# endif
+	#else
+	#ifdef __INTEL_COMPILER
+	LIBC=gnu
+	#else
+	LIBC=gnuaout
+	#endif
+	#endif
+EOF
+	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep ^LIBC=`
+	test x"${LIBC}" != x && echo "${UNAME_MACHINE}-pc-linux-${LIBC}" && exit 0
+	test x"${TENTATIVE}" != x && echo "${TENTATIVE}" && exit 0
+	;;
+    i*86:DYNIX/ptx:4*:*)
+	# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
+	# earlier versions are messed up and put the nodename in both
+	# sysname and nodename.
+	echo i386-sequent-sysv4
+	exit 0 ;;
+    i*86:UNIX_SV:4.2MP:2.*)
+        # Unixware is an offshoot of SVR4, but it has its own version
+        # number series starting with 2...
+        # I am not positive that other SVR4 systems won't match this,
+	# I just have to hope.  -- rms.
+        # Use sysv4.2uw... so that sysv4* matches it.
+	echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION}
+	exit 0 ;;
+    i*86:OS/2:*:*)
+	# If we were able to find `uname', then EMX Unix compatibility
+	# is probably installed.
+	echo ${UNAME_MACHINE}-pc-os2-emx
+	exit 0 ;;
+    i*86:XTS-300:*:STOP)
+	echo ${UNAME_MACHINE}-unknown-stop
+	exit 0 ;;
+    i*86:atheos:*:*)
+	echo ${UNAME_MACHINE}-unknown-atheos
+	exit 0 ;;
+    i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*)
+	echo i386-unknown-lynxos${UNAME_RELEASE}
+	exit 0 ;;
+    i*86:*DOS:*:*)
+	echo ${UNAME_MACHINE}-pc-msdosdjgpp
+	exit 0 ;;
+    i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*)
+	UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'`
+	if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
+		echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL}
+	else
+		echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL}
+	fi
+	exit 0 ;;
+    i*86:*:5:[78]*)
+	case `/bin/uname -X | grep "^Machine"` in
+	    *486*)	     UNAME_MACHINE=i486 ;;
+	    *Pentium)	     UNAME_MACHINE=i586 ;;
+	    *Pent*|*Celeron) UNAME_MACHINE=i686 ;;
+	esac
+	echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION}
+	exit 0 ;;
+    i*86:*:3.2:*)
+	if test -f /usr/options/cb.name; then
+		UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name`
+		echo ${UNAME_MACHINE}-pc-isc$UNAME_REL
+	elif /bin/uname -X 2>/dev/null >/dev/null ; then
+		UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')`
+		(/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486
+		(/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \
+			&& UNAME_MACHINE=i586
+		(/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \
+			&& UNAME_MACHINE=i686
+		(/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \
+			&& UNAME_MACHINE=i686
+		echo ${UNAME_MACHINE}-pc-sco$UNAME_REL
+	else
+		echo ${UNAME_MACHINE}-pc-sysv32
+	fi
+	exit 0 ;;
+    pc:*:*:*)
+	# Left here for compatibility:
+        # uname -m prints for DJGPP always 'pc', but it prints nothing about
+        # the processor, so we play safe by assuming i386.
+	echo i386-pc-msdosdjgpp
+        exit 0 ;;
+    Intel:Mach:3*:*)
+	echo i386-pc-mach3
+	exit 0 ;;
+    paragon:*:*:*)
+	echo i860-intel-osf1
+	exit 0 ;;
+    i860:*:4.*:*) # i860-SVR4
+	if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
+	  echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4
+	else # Add other i860-SVR4 vendors below as they are discovered.
+	  echo i860-unknown-sysv${UNAME_RELEASE}  # Unknown i860-SVR4
+	fi
+	exit 0 ;;
+    mini*:CTIX:SYS*5:*)
+	# "miniframe"
+	echo m68010-convergent-sysv
+	exit 0 ;;
+    mc68k:UNIX:SYSTEM5:3.51m)
+	echo m68k-convergent-sysv
+	exit 0 ;;
+    M680?0:D-NIX:5.3:*)
+	echo m68k-diab-dnix
+	exit 0 ;;
+    M68*:*:R3V[567]*:*)
+	test -r /sysV68 && echo 'm68k-motorola-sysv' && exit 0 ;;
+    3[34]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0)
+	OS_REL=''
+	test -r /etc/.relid \
+	&& OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
+	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+	  && echo i486-ncr-sysv4.3${OS_REL} && exit 0
+	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
+	  && echo i586-ncr-sysv4.3${OS_REL} && exit 0 ;;
+    3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
+        /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+          && echo i486-ncr-sysv4 && exit 0 ;;
+    m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*)
+	echo m68k-unknown-lynxos${UNAME_RELEASE}
+	exit 0 ;;
+    mc68030:UNIX_System_V:4.*:*)
+	echo m68k-atari-sysv4
+	exit 0 ;;
+    TSUNAMI:LynxOS:2.*:*)
+	echo sparc-unknown-lynxos${UNAME_RELEASE}
+	exit 0 ;;
+    rs6000:LynxOS:2.*:*)
+	echo rs6000-unknown-lynxos${UNAME_RELEASE}
+	exit 0 ;;
+    PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*)
+	echo powerpc-unknown-lynxos${UNAME_RELEASE}
+	exit 0 ;;
+    SM[BE]S:UNIX_SV:*:*)
+	echo mips-dde-sysv${UNAME_RELEASE}
+	exit 0 ;;
+    RM*:ReliantUNIX-*:*:*)
+	echo mips-sni-sysv4
+	exit 0 ;;
+    RM*:SINIX-*:*:*)
+	echo mips-sni-sysv4
+	exit 0 ;;
+    *:SINIX-*:*:*)
+	if uname -p 2>/dev/null >/dev/null ; then
+		UNAME_MACHINE=`(uname -p) 2>/dev/null`
+		echo ${UNAME_MACHINE}-sni-sysv4
+	else
+		echo ns32k-sni-sysv
+	fi
+	exit 0 ;;
+    PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort
+                      # says <Richard.M.Bartel@ccMail.Census.GOV>
+        echo i586-unisys-sysv4
+        exit 0 ;;
+    *:UNIX_System_V:4*:FTX*)
+	# From Gerald Hewes <hewes@openmarket.com>.
+	# How about differentiating between stratus architectures? -djm
+	echo hppa1.1-stratus-sysv4
+	exit 0 ;;
+    *:*:*:FTX*)
+	# From seanf@swdc.stratus.com.
+	echo i860-stratus-sysv4
+	exit 0 ;;
+    *:VOS:*:*)
+	# From Paul.Green@stratus.com.
+	echo hppa1.1-stratus-vos
+	exit 0 ;;
+    mc68*:A/UX:*:*)
+	echo m68k-apple-aux${UNAME_RELEASE}
+	exit 0 ;;
+    news*:NEWS-OS:6*:*)
+	echo mips-sony-newsos6
+	exit 0 ;;
+    R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*)
+	if [ -d /usr/nec ]; then
+	        echo mips-nec-sysv${UNAME_RELEASE}
+	else
+	        echo mips-unknown-sysv${UNAME_RELEASE}
+	fi
+        exit 0 ;;
+    BeBox:BeOS:*:*)	# BeOS running on hardware made by Be, PPC only.
+	echo powerpc-be-beos
+	exit 0 ;;
+    BeMac:BeOS:*:*)	# BeOS running on Mac or Mac clone, PPC only.
+	echo powerpc-apple-beos
+	exit 0 ;;
+    BePC:BeOS:*:*)	# BeOS running on Intel PC compatible.
+	echo i586-pc-beos
+	exit 0 ;;
+    SX-4:SUPER-UX:*:*)
+	echo sx4-nec-superux${UNAME_RELEASE}
+	exit 0 ;;
+    SX-5:SUPER-UX:*:*)
+	echo sx5-nec-superux${UNAME_RELEASE}
+	exit 0 ;;
+    SX-6:SUPER-UX:*:*)
+	echo sx6-nec-superux${UNAME_RELEASE}
+	exit 0 ;;
+    Power*:Rhapsody:*:*)
+	echo powerpc-apple-rhapsody${UNAME_RELEASE}
+	exit 0 ;;
+    *:Rhapsody:*:*)
+	echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE}
+	exit 0 ;;
+    *:Darwin:*:*)
+	case `uname -p` in
+	    *86) UNAME_PROCESSOR=i686 ;;
+	    powerpc) UNAME_PROCESSOR=powerpc ;;
+	esac
+	echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
+	exit 0 ;;
+    *:procnto*:*:* | *:QNX:[0123456789]*:*)
+	UNAME_PROCESSOR=`uname -p`
+	if test "$UNAME_PROCESSOR" = "x86"; then
+		UNAME_PROCESSOR=i386
+		UNAME_MACHINE=pc
+	fi
+	echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE}
+	exit 0 ;;
+    *:QNX:*:4*)
+	echo i386-pc-qnx
+	exit 0 ;;
+    NSR-[DGKLNPTVW]:NONSTOP_KERNEL:*:*)
+	echo nsr-tandem-nsk${UNAME_RELEASE}
+	exit 0 ;;
+    *:NonStop-UX:*:*)
+	echo mips-compaq-nonstopux
+	exit 0 ;;
+    BS2000:POSIX*:*:*)
+	echo bs2000-siemens-sysv
+	exit 0 ;;
+    DS/*:UNIX_System_V:*:*)
+	echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE}
+	exit 0 ;;
+    *:Plan9:*:*)
+	# "uname -m" is not consistent, so use $cputype instead. 386
+	# is converted to i386 for consistency with other x86
+	# operating systems.
+	if test "$cputype" = "386"; then
+	    UNAME_MACHINE=i386
+	else
+	    UNAME_MACHINE="$cputype"
+	fi
+	echo ${UNAME_MACHINE}-unknown-plan9
+	exit 0 ;;
+    *:TOPS-10:*:*)
+	echo pdp10-unknown-tops10
+	exit 0 ;;
+    *:TENEX:*:*)
+	echo pdp10-unknown-tenex
+	exit 0 ;;
+    KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*)
+	echo pdp10-dec-tops20
+	exit 0 ;;
+    XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*)
+	echo pdp10-xkl-tops20
+	exit 0 ;;
+    *:TOPS-20:*:*)
+	echo pdp10-unknown-tops20
+	exit 0 ;;
+    *:ITS:*:*)
+	echo pdp10-unknown-its
+	exit 0 ;;
+esac
+
+#echo '(No uname command or uname output not recognized.)' 1>&2
+#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2
+
+eval $set_cc_for_build
+cat >$dummy.c <<EOF
+#ifdef _SEQUENT_
+# include <sys/types.h>
+# include <sys/utsname.h>
+#endif
+main ()
+{
+#if defined (sony)
+#if defined (MIPSEB)
+  /* BFD wants "bsd" instead of "newsos".  Perhaps BFD should be changed,
+     I don't know....  */
+  printf ("mips-sony-bsd\n"); exit (0);
+#else
+#include <sys/param.h>
+  printf ("m68k-sony-newsos%s\n",
+#ifdef NEWSOS4
+          "4"
+#else
+	  ""
+#endif
+         ); exit (0);
+#endif
+#endif
+
+#if defined (__arm) && defined (__acorn) && defined (__unix)
+  printf ("arm-acorn-riscix"); exit (0);
+#endif
+
+#if defined (hp300) && !defined (hpux)
+  printf ("m68k-hp-bsd\n"); exit (0);
+#endif
+
+#if defined (NeXT)
+#if !defined (__ARCHITECTURE__)
+#define __ARCHITECTURE__ "m68k"
+#endif
+  int version;
+  version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
+  if (version < 4)
+    printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
+  else
+    printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
+  exit (0);
+#endif
+
+#if defined (MULTIMAX) || defined (n16)
+#if defined (UMAXV)
+  printf ("ns32k-encore-sysv\n"); exit (0);
+#else
+#if defined (CMU)
+  printf ("ns32k-encore-mach\n"); exit (0);
+#else
+  printf ("ns32k-encore-bsd\n"); exit (0);
+#endif
+#endif
+#endif
+
+#if defined (__386BSD__)
+  printf ("i386-pc-bsd\n"); exit (0);
+#endif
+
+#if defined (sequent)
+#if defined (i386)
+  printf ("i386-sequent-dynix\n"); exit (0);
+#endif
+#if defined (ns32000)
+  printf ("ns32k-sequent-dynix\n"); exit (0);
+#endif
+#endif
+
+#if defined (_SEQUENT_)
+    struct utsname un;
+
+    uname(&un);
+
+    if (strncmp(un.version, "V2", 2) == 0) {
+	printf ("i386-sequent-ptx2\n"); exit (0);
+    }
+    if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
+	printf ("i386-sequent-ptx1\n"); exit (0);
+    }
+    printf ("i386-sequent-ptx\n"); exit (0);
+
+#endif
+
+#if defined (vax)
+# if !defined (ultrix)
+#  include <sys/param.h>
+#  if defined (BSD)
+#   if BSD == 43
+      printf ("vax-dec-bsd4.3\n"); exit (0);
+#   else
+#    if BSD == 199006
+      printf ("vax-dec-bsd4.3reno\n"); exit (0);
+#    else
+      printf ("vax-dec-bsd\n"); exit (0);
+#    endif
+#   endif
+#  else
+    printf ("vax-dec-bsd\n"); exit (0);
+#  endif
+# else
+    printf ("vax-dec-ultrix\n"); exit (0);
+# endif
+#endif
+
+#if defined (alliant) && defined (i860)
+  printf ("i860-alliant-bsd\n"); exit (0);
+#endif
+
+  exit (1);
+}
+EOF
+
+$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && $dummy && exit 0
+
+# Apollos put the system type in the environment.
+
+test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit 0; }
+
+# Convex versions that predate uname can use getsysinfo(1)
+
+if [ -x /usr/convex/getsysinfo ]
+then
+    case `getsysinfo -f cpu_type` in
+    c1*)
+	echo c1-convex-bsd
+	exit 0 ;;
+    c2*)
+	if getsysinfo -f scalar_acc
+	then echo c32-convex-bsd
+	else echo c2-convex-bsd
+	fi
+	exit 0 ;;
+    c34*)
+	echo c34-convex-bsd
+	exit 0 ;;
+    c38*)
+	echo c38-convex-bsd
+	exit 0 ;;
+    c4*)
+	echo c4-convex-bsd
+	exit 0 ;;
+    esac
+fi
+
+cat >&2 <<EOF
+$0: unable to guess system type
+
+This script, last modified $timestamp, has failed to recognize
+the operating system you are using. It is advised that you
+download the most up to date version of the config scripts from
+
+    ftp://ftp.gnu.org/pub/gnu/config/
+
+If the version you run ($0) is already up to date, please
+send the following data and any information you think might be
+pertinent to <config-patches@gnu.org> in order to provide the needed
+information to handle your system.
+
+config.guess timestamp = $timestamp
+
+uname -m = `(uname -m) 2>/dev/null || echo unknown`
+uname -r = `(uname -r) 2>/dev/null || echo unknown`
+uname -s = `(uname -s) 2>/dev/null || echo unknown`
+uname -v = `(uname -v) 2>/dev/null || echo unknown`
+
+/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null`
+/bin/uname -X     = `(/bin/uname -X) 2>/dev/null`
+
+hostinfo               = `(hostinfo) 2>/dev/null`
+/bin/universe          = `(/bin/universe) 2>/dev/null`
+/usr/bin/arch -k       = `(/usr/bin/arch -k) 2>/dev/null`
+/bin/arch              = `(/bin/arch) 2>/dev/null`
+/usr/bin/oslevel       = `(/usr/bin/oslevel) 2>/dev/null`
+/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null`
+
+UNAME_MACHINE = ${UNAME_MACHINE}
+UNAME_RELEASE = ${UNAME_RELEASE}
+UNAME_SYSTEM  = ${UNAME_SYSTEM}
+UNAME_VERSION = ${UNAME_VERSION}
+EOF
+
+exit 1
+
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "timestamp='"
+# time-stamp-format: "%:y-%02m-%02d"
+# time-stamp-end: "'"
+# End:
diff --git a/config.sub b/config.sub
new file mode 100644
index 0000000..fe4f1ed
--- /dev/null
+++ b/config.sub
@@ -0,0 +1,1492 @@
+#! /bin/sh
+# Configuration validation subroutine script.
+#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
+#   2000, 2001, 2002, 2003 Free Software Foundation, Inc.
+
+timestamp='2003-05-09'
+
+# This file is (in principle) common to ALL GNU software.
+# The presence of a machine in this file suggests that SOME GNU software
+# can handle that machine.  It does not imply ALL GNU software can.
+#
+# This file is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330,
+# Boston, MA 02111-1307, USA.
+
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# Please send patches to <config-patches@gnu.org>.  Submit a context
+# diff and a properly formatted ChangeLog entry.
+#
+# Configuration subroutine to validate and canonicalize a configuration type.
+# Supply the specified configuration type as an argument.
+# If it is invalid, we print an error message on stderr and exit with code 1.
+# Otherwise, we print the canonical config type on stdout and succeed.
+
+# This file is supposed to be the same for all GNU packages
+# and recognize all the CPU types, system types and aliases
+# that are meaningful with *any* GNU software.
+# Each package is responsible for reporting which valid configurations
+# it does not support.  The user should be able to distinguish
+# a failure to support a valid configuration from a meaningless
+# configuration.
+
+# The goal of this file is to map all the various variations of a given
+# machine specification into a single specification in the form:
+#	CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM
+# or in some cases, the newer four-part form:
+#	CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM
+# It is wrong to echo any other type of specification.
+
+me=`echo "$0" | sed -e 's,.*/,,'`
+
+usage="\
+Usage: $0 [OPTION] CPU-MFR-OPSYS
+       $0 [OPTION] ALIAS
+
+Canonicalize a configuration name.
+
+Operation modes:
+  -h, --help         print this help, then exit
+  -t, --time-stamp   print date of last modification, then exit
+  -v, --version      print version number, then exit
+
+Report bugs and patches to <config-patches@gnu.org>."
+
+version="\
+GNU config.sub ($timestamp)
+
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001
+Free Software Foundation, Inc.
+
+This is free software; see the source for copying conditions.  There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
+
+help="
+Try \`$me --help' for more information."
+
+# Parse command line
+while test $# -gt 0 ; do
+  case $1 in
+    --time-stamp | --time* | -t )
+       echo "$timestamp" ; exit 0 ;;
+    --version | -v )
+       echo "$version" ; exit 0 ;;
+    --help | --h* | -h )
+       echo "$usage"; exit 0 ;;
+    -- )     # Stop option processing
+       shift; break ;;
+    - )	# Use stdin as input.
+       break ;;
+    -* )
+       echo "$me: invalid option $1$help"
+       exit 1 ;;
+
+    *local*)
+       # First pass through any local machine types.
+       echo $1
+       exit 0;;
+
+    * )
+       break ;;
+  esac
+done
+
+case $# in
+ 0) echo "$me: missing argument$help" >&2
+    exit 1;;
+ 1) ;;
+ *) echo "$me: too many arguments$help" >&2
+    exit 1;;
+esac
+
+# Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any).
+# Here we must recognize all the valid KERNEL-OS combinations.
+maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
+case $maybe_os in
+  nto-qnx* | linux-gnu* | freebsd*-gnu* | netbsd*-gnu* | storm-chaos* | os2-emx* | rtmk-nova*)
+    os=-$maybe_os
+    basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`
+    ;;
+  *)
+    basic_machine=`echo $1 | sed 's/-[^-]*$//'`
+    if [ $basic_machine != $1 ]
+    then os=`echo $1 | sed 's/.*-/-/'`
+    else os=; fi
+    ;;
+esac
+
+### Let's recognize common machines as not being operating systems so
+### that things like config.sub decstation-3100 work.  We also
+### recognize some manufacturers as not being operating systems, so we
+### can provide default operating systems below.
+case $os in
+	-sun*os*)
+		# Prevent following clause from handling this invalid input.
+		;;
+	-dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \
+	-att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \
+	-unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \
+	-convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\
+	-c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \
+	-harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \
+	-apple | -axis)
+		os=
+		basic_machine=$1
+		;;
+	-sim | -cisco | -oki | -wec | -winbond)
+		os=
+		basic_machine=$1
+		;;
+	-scout)
+		;;
+	-wrs)
+		os=-vxworks
+		basic_machine=$1
+		;;
+	-chorusos*)
+		os=-chorusos
+		basic_machine=$1
+		;;
+ 	-chorusrdb)
+ 		os=-chorusrdb
+		basic_machine=$1
+ 		;;
+	-hiux*)
+		os=-hiuxwe2
+		;;
+	-sco5)
+		os=-sco3.2v5
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-sco4)
+		os=-sco3.2v4
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-sco3.2.[4-9]*)
+		os=`echo $os | sed -e 's/sco3.2./sco3.2v/'`
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-sco3.2v[4-9]*)
+		# Don't forget version if it is 3.2v4 or newer.
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-sco*)
+		os=-sco3.2v2
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-udk*)
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-isc)
+		os=-isc2.2
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-clix*)
+		basic_machine=clipper-intergraph
+		;;
+	-isc*)
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-lynx*)
+		os=-lynxos
+		;;
+	-ptx*)
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'`
+		;;
+	-windowsnt*)
+		os=`echo $os | sed -e 's/windowsnt/winnt/'`
+		;;
+	-psos*)
+		os=-psos
+		;;
+	-mint | -mint[0-9]*)
+		basic_machine=m68k-atari
+		os=-mint
+		;;
+esac
+
+# Decode aliases for certain CPU-COMPANY combinations.
+case $basic_machine in
+	# Recognize the basic CPU types without company name.
+	# Some are omitted here because they have special meanings below.
+	1750a | 580 \
+	| a29k \
+	| alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \
+	| alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \
+	| arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr \
+	| clipper \
+	| d10v | d30v | dlx | dsp16xx \
+	| fr30 | frv \
+	| h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
+	| i370 | i860 | i960 | ia64 \
+	| ip2k \
+	| m32r | m68000 | m68k | m88k | mcore \
+	| mips | mipsbe | mipseb | mipsel | mipsle \
+	| mips16 \
+	| mips64 | mips64el \
+	| mips64vr | mips64vrel \
+	| mips64orion | mips64orionel \
+	| mips64vr4100 | mips64vr4100el \
+	| mips64vr4300 | mips64vr4300el \
+	| mips64vr5000 | mips64vr5000el \
+	| mipsisa32 | mipsisa32el \
+	| mipsisa32r2 | mipsisa32r2el \
+	| mipsisa64 | mipsisa64el \
+	| mipsisa64sb1 | mipsisa64sb1el \
+	| mipsisa64sr71k | mipsisa64sr71kel \
+	| mipstx39 | mipstx39el \
+	| mn10200 | mn10300 \
+	| msp430 \
+	| ns16k | ns32k \
+	| openrisc | or32 \
+	| pdp10 | pdp11 | pj | pjl \
+	| powerpc | powerpc64 | powerpc64le | powerpcle | ppcbe \
+	| pyramid \
+	| sh | sh[1234] | sh[23]e | sh[34]eb | shbe | shle | sh[1234]le | sh3ele \
+	| sh64 | sh64le \
+	| sparc | sparc64 | sparc86x | sparclet | sparclite | sparcv9 | sparcv9b \
+	| strongarm \
+	| tahoe | thumb | tic80 | tron \
+	| v850 | v850e \
+	| we32k \
+	| x86 | xscale | xstormy16 | xtensa \
+	| z8k)
+		basic_machine=$basic_machine-unknown
+		;;
+	m6811 | m68hc11 | m6812 | m68hc12)
+		# Motorola 68HC11/12.
+		basic_machine=$basic_machine-unknown
+		os=-none
+		;;
+	m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k)
+		;;
+
+	# We use `pc' rather than `unknown'
+	# because (1) that's what they normally are, and
+	# (2) the word "unknown" tends to confuse beginning users.
+	i*86 | x86_64)
+	  basic_machine=$basic_machine-pc
+	  ;;
+	# Object if more than one company name word.
+	*-*-*)
+		echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
+		exit 1
+		;;
+	# Recognize the basic CPU types with company name.
+	580-* \
+	| a29k-* \
+	| alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \
+	| alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \
+	| alphapca5[67]-* | alpha64pca5[67]-* | arc-* \
+	| arm-*  | armbe-* | armle-* | armeb-* | armv*-* \
+	| avr-* \
+	| bs2000-* \
+	| c[123]* | c30-* | [cjt]90-* | c4x-* | c54x-* | c55x-* | c6x-* \
+	| clipper-* | cydra-* \
+	| d10v-* | d30v-* | dlx-* \
+	| elxsi-* \
+	| f30[01]-* | f700-* | fr30-* | frv-* | fx80-* \
+	| h8300-* | h8500-* \
+	| hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \
+	| i*86-* | i860-* | i960-* | ia64-* \
+	| ip2k-* \
+	| m32r-* \
+	| m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \
+	| m88110-* | m88k-* | mcore-* \
+	| mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \
+	| mips16-* \
+	| mips64-* | mips64el-* \
+	| mips64vr-* | mips64vrel-* \
+	| mips64orion-* | mips64orionel-* \
+	| mips64vr4100-* | mips64vr4100el-* \
+	| mips64vr4300-* | mips64vr4300el-* \
+	| mips64vr5000-* | mips64vr5000el-* \
+	| mipsisa32-* | mipsisa32el-* \
+	| mipsisa32r2-* | mipsisa32r2el-* \
+	| mipsisa64-* | mipsisa64el-* \
+	| mipsisa64sb1-* | mipsisa64sb1el-* \
+	| mipsisa64sr71k-* | mipsisa64sr71kel-* \
+	| mipstx39-* | mipstx39el-* \
+	| msp430-* \
+	| none-* | np1-* | nv1-* | ns16k-* | ns32k-* \
+	| orion-* \
+	| pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \
+	| powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* | ppcbe-* \
+	| pyramid-* \
+	| romp-* | rs6000-* \
+	| sh-* | sh[1234]-* | sh[23]e-* | sh[34]eb-* | shbe-* \
+	| shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \
+	| sparc-* | sparc64-* | sparc86x-* | sparclet-* | sparclite-* \
+	| sparcv9-* | sparcv9b-* | strongarm-* | sv1-* | sx?-* \
+	| tahoe-* | thumb-* \
+	| tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \
+	| tron-* \
+	| v850-* | v850e-* | vax-* \
+	| we32k-* \
+	| x86-* | x86_64-* | xps100-* | xscale-* | xstormy16-* \
+	| xtensa-* \
+	| ymp-* \
+	| z8k-*)
+		;;
+	# Recognize the various machine names and aliases which stand
+	# for a CPU type and a company and sometimes even an OS.
+	386bsd)
+		basic_machine=i386-unknown
+		os=-bsd
+		;;
+	3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc)
+		basic_machine=m68000-att
+		;;
+	3b*)
+		basic_machine=we32k-att
+		;;
+	a29khif)
+		basic_machine=a29k-amd
+		os=-udi
+		;;
+	adobe68k)
+		basic_machine=m68010-adobe
+		os=-scout
+		;;
+	alliant | fx80)
+		basic_machine=fx80-alliant
+		;;
+	altos | altos3068)
+		basic_machine=m68k-altos
+		;;
+	am29k)
+		basic_machine=a29k-none
+		os=-bsd
+		;;
+	amd64)
+		basic_machine=x86_64-pc
+		;;
+	amdahl)
+		basic_machine=580-amdahl
+		os=-sysv
+		;;
+	amiga | amiga-*)
+		basic_machine=m68k-unknown
+		;;
+	amigaos | amigados)
+		basic_machine=m68k-unknown
+		os=-amigaos
+		;;
+	amigaunix | amix)
+		basic_machine=m68k-unknown
+		os=-sysv4
+		;;
+	apollo68)
+		basic_machine=m68k-apollo
+		os=-sysv
+		;;
+	apollo68bsd)
+		basic_machine=m68k-apollo
+		os=-bsd
+		;;
+	aux)
+		basic_machine=m68k-apple
+		os=-aux
+		;;
+	balance)
+		basic_machine=ns32k-sequent
+		os=-dynix
+		;;
+	c90)
+		basic_machine=c90-cray
+		os=-unicos
+		;;
+	convex-c1)
+		basic_machine=c1-convex
+		os=-bsd
+		;;
+	convex-c2)
+		basic_machine=c2-convex
+		os=-bsd
+		;;
+	convex-c32)
+		basic_machine=c32-convex
+		os=-bsd
+		;;
+	convex-c34)
+		basic_machine=c34-convex
+		os=-bsd
+		;;
+	convex-c38)
+		basic_machine=c38-convex
+		os=-bsd
+		;;
+	cray | j90)
+		basic_machine=j90-cray
+		os=-unicos
+		;;
+	crds | unos)
+		basic_machine=m68k-crds
+		;;
+	cris | cris-* | etrax*)
+		basic_machine=cris-axis
+		;;
+	da30 | da30-*)
+		basic_machine=m68k-da30
+		;;
+	decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn)
+		basic_machine=mips-dec
+		;;
+	decsystem10* | dec10*)
+		basic_machine=pdp10-dec
+		os=-tops10
+		;;
+	decsystem20* | dec20*)
+		basic_machine=pdp10-dec
+		os=-tops20
+		;;
+	delta | 3300 | motorola-3300 | motorola-delta \
+	      | 3300-motorola | delta-motorola)
+		basic_machine=m68k-motorola
+		;;
+	delta88)
+		basic_machine=m88k-motorola
+		os=-sysv3
+		;;
+	dpx20 | dpx20-*)
+		basic_machine=rs6000-bull
+		os=-bosx
+		;;
+	dpx2* | dpx2*-bull)
+		basic_machine=m68k-bull
+		os=-sysv3
+		;;
+	ebmon29k)
+		basic_machine=a29k-amd
+		os=-ebmon
+		;;
+	elxsi)
+		basic_machine=elxsi-elxsi
+		os=-bsd
+		;;
+	encore | umax | mmax)
+		basic_machine=ns32k-encore
+		;;
+	es1800 | OSE68k | ose68k | ose | OSE)
+		basic_machine=m68k-ericsson
+		os=-ose
+		;;
+	fx2800)
+		basic_machine=i860-alliant
+		;;
+	genix)
+		basic_machine=ns32k-ns
+		;;
+	gmicro)
+		basic_machine=tron-gmicro
+		os=-sysv
+		;;
+	go32)
+		basic_machine=i386-pc
+		os=-go32
+		;;
+	h3050r* | hiux*)
+		basic_machine=hppa1.1-hitachi
+		os=-hiuxwe2
+		;;
+	h8300hms)
+		basic_machine=h8300-hitachi
+		os=-hms
+		;;
+	h8300xray)
+		basic_machine=h8300-hitachi
+		os=-xray
+		;;
+	h8500hms)
+		basic_machine=h8500-hitachi
+		os=-hms
+		;;
+	harris)
+		basic_machine=m88k-harris
+		os=-sysv3
+		;;
+	hp300-*)
+		basic_machine=m68k-hp
+		;;
+	hp300bsd)
+		basic_machine=m68k-hp
+		os=-bsd
+		;;
+	hp300hpux)
+		basic_machine=m68k-hp
+		os=-hpux
+		;;
+	hp3k9[0-9][0-9] | hp9[0-9][0-9])
+		basic_machine=hppa1.0-hp
+		;;
+	hp9k2[0-9][0-9] | hp9k31[0-9])
+		basic_machine=m68000-hp
+		;;
+	hp9k3[2-9][0-9])
+		basic_machine=m68k-hp
+		;;
+	hp9k6[0-9][0-9] | hp6[0-9][0-9])
+		basic_machine=hppa1.0-hp
+		;;
+	hp9k7[0-79][0-9] | hp7[0-79][0-9])
+		basic_machine=hppa1.1-hp
+		;;
+	hp9k78[0-9] | hp78[0-9])
+		# FIXME: really hppa2.0-hp
+		basic_machine=hppa1.1-hp
+		;;
+	hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893)
+		# FIXME: really hppa2.0-hp
+		basic_machine=hppa1.1-hp
+		;;
+	hp9k8[0-9][13679] | hp8[0-9][13679])
+		basic_machine=hppa1.1-hp
+		;;
+	hp9k8[0-9][0-9] | hp8[0-9][0-9])
+		basic_machine=hppa1.0-hp
+		;;
+	hppa-next)
+		os=-nextstep3
+		;;
+	hppaosf)
+		basic_machine=hppa1.1-hp
+		os=-osf
+		;;
+	hppro)
+		basic_machine=hppa1.1-hp
+		os=-proelf
+		;;
+	i370-ibm* | ibm*)
+		basic_machine=i370-ibm
+		;;
+# I'm not sure what "Sysv32" means.  Should this be sysv3.2?
+	i*86v32)
+		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+		os=-sysv32
+		;;
+	i*86v4*)
+		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+		os=-sysv4
+		;;
+	i*86v)
+		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+		os=-sysv
+		;;
+	i*86sol2)
+		basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+		os=-solaris2
+		;;
+	i386mach)
+		basic_machine=i386-mach
+		os=-mach
+		;;
+	i386-vsta | vsta)
+		basic_machine=i386-unknown
+		os=-vsta
+		;;
+	iris | iris4d)
+		basic_machine=mips-sgi
+		case $os in
+		    -irix*)
+			;;
+		    *)
+			os=-irix4
+			;;
+		esac
+		;;
+	isi68 | isi)
+		basic_machine=m68k-isi
+		os=-sysv
+		;;
+	m88k-omron*)
+		basic_machine=m88k-omron
+		;;
+	magnum | m3230)
+		basic_machine=mips-mips
+		os=-sysv
+		;;
+	merlin)
+		basic_machine=ns32k-utek
+		os=-sysv
+		;;
+	mingw32)
+		basic_machine=i386-pc
+		os=-mingw32
+		;;
+	miniframe)
+		basic_machine=m68000-convergent
+		;;
+	*mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*)
+		basic_machine=m68k-atari
+		os=-mint
+		;;
+	mips3*-*)
+		basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`
+		;;
+	mips3*)
+		basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown
+		;;
+	mmix*)
+		basic_machine=mmix-knuth
+		os=-mmixware
+		;;
+	monitor)
+		basic_machine=m68k-rom68k
+		os=-coff
+		;;
+	morphos)
+		basic_machine=powerpc-unknown
+		os=-morphos
+		;;
+	msdos)
+		basic_machine=i386-pc
+		os=-msdos
+		;;
+	mvs)
+		basic_machine=i370-ibm
+		os=-mvs
+		;;
+	ncr3000)
+		basic_machine=i486-ncr
+		os=-sysv4
+		;;
+	netbsd386)
+		basic_machine=i386-unknown
+		os=-netbsd
+		;;
+	netwinder)
+		basic_machine=armv4l-rebel
+		os=-linux
+		;;
+	news | news700 | news800 | news900)
+		basic_machine=m68k-sony
+		os=-newsos
+		;;
+	news1000)
+		basic_machine=m68030-sony
+		os=-newsos
+		;;
+	news-3600 | risc-news)
+		basic_machine=mips-sony
+		os=-newsos
+		;;
+	necv70)
+		basic_machine=v70-nec
+		os=-sysv
+		;;
+	next | m*-next )
+		basic_machine=m68k-next
+		case $os in
+		    -nextstep* )
+			;;
+		    -ns2*)
+		      os=-nextstep2
+			;;
+		    *)
+		      os=-nextstep3
+			;;
+		esac
+		;;
+	nh3000)
+		basic_machine=m68k-harris
+		os=-cxux
+		;;
+	nh[45]000)
+		basic_machine=m88k-harris
+		os=-cxux
+		;;
+	nindy960)
+		basic_machine=i960-intel
+		os=-nindy
+		;;
+	mon960)
+		basic_machine=i960-intel
+		os=-mon960
+		;;
+	nonstopux)
+		basic_machine=mips-compaq
+		os=-nonstopux
+		;;
+	np1)
+		basic_machine=np1-gould
+		;;
+	nv1)
+		basic_machine=nv1-cray
+		os=-unicosmp
+		;;
+	nsr-tandem)
+		basic_machine=nsr-tandem
+		;;
+	op50n-* | op60c-*)
+		basic_machine=hppa1.1-oki
+		os=-proelf
+		;;
+	or32 | or32-*)
+		basic_machine=or32-unknown
+		os=-coff
+		;;
+	OSE68000 | ose68000)
+		basic_machine=m68000-ericsson
+		os=-ose
+		;;
+	os68k)
+		basic_machine=m68k-none
+		os=-os68k
+		;;
+	pa-hitachi)
+		basic_machine=hppa1.1-hitachi
+		os=-hiuxwe2
+		;;
+	paragon)
+		basic_machine=i860-intel
+		os=-osf
+		;;
+	pbd)
+		basic_machine=sparc-tti
+		;;
+	pbb)
+		basic_machine=m68k-tti
+		;;
+	pc532 | pc532-*)
+		basic_machine=ns32k-pc532
+		;;
+	pentium | p5 | k5 | k6 | nexgen | viac3)
+		basic_machine=i586-pc
+		;;
+	pentiumpro | p6 | 6x86 | athlon | athlon_*)
+		basic_machine=i686-pc
+		;;
+	pentiumii | pentium2)
+		basic_machine=i686-pc
+		;;
+	pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*)
+		basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	pentiumpro-* | p6-* | 6x86-* | athlon-*)
+		basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	pentiumii-* | pentium2-*)
+		basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	pn)
+		basic_machine=pn-gould
+		;;
+	power)	basic_machine=power-ibm
+		;;
+	ppc)	basic_machine=powerpc-unknown
+		;;
+	ppc-*)	basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	ppcle | powerpclittle | ppc-le | powerpc-little)
+		basic_machine=powerpcle-unknown
+		;;
+	ppcle-* | powerpclittle-*)
+		basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	ppc64)	basic_machine=powerpc64-unknown
+		;;
+	ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	ppc64le | powerpc64little | ppc64-le | powerpc64-little)
+		basic_machine=powerpc64le-unknown
+		;;
+	ppc64le-* | powerpc64little-*)
+		basic_machine=powerpc64le-`echo $basic_machine | sed 's/^[^-]*-//'`
+		;;
+	ps2)
+		basic_machine=i386-ibm
+		;;
+	pw32)
+		basic_machine=i586-unknown
+		os=-pw32
+		;;
+	rom68k)
+		basic_machine=m68k-rom68k
+		os=-coff
+		;;
+	rm[46]00)
+		basic_machine=mips-siemens
+		;;
+	rtpc | rtpc-*)
+		basic_machine=romp-ibm
+		;;
+	s390 | s390-*)
+		basic_machine=s390-ibm
+		;;
+	s390x | s390x-*)
+		basic_machine=s390x-ibm
+		;;
+	sa29200)
+		basic_machine=a29k-amd
+		os=-udi
+		;;
+	sb1)
+		basic_machine=mipsisa64sb1-unknown
+		;;
+	sb1el)
+		basic_machine=mipsisa64sb1el-unknown
+		;;
+	sequent)
+		basic_machine=i386-sequent
+		;;
+	sh)
+		basic_machine=sh-hitachi
+		os=-hms
+		;;
+	sparclite-wrs | simso-wrs)
+		basic_machine=sparclite-wrs
+		os=-vxworks
+		;;
+	sps7)
+		basic_machine=m68k-bull
+		os=-sysv2
+		;;
+	spur)
+		basic_machine=spur-unknown
+		;;
+	st2000)
+		basic_machine=m68k-tandem
+		;;
+	stratus)
+		basic_machine=i860-stratus
+		os=-sysv4
+		;;
+	sun2)
+		basic_machine=m68000-sun
+		;;
+	sun2os3)
+		basic_machine=m68000-sun
+		os=-sunos3
+		;;
+	sun2os4)
+		basic_machine=m68000-sun
+		os=-sunos4
+		;;
+	sun3os3)
+		basic_machine=m68k-sun
+		os=-sunos3
+		;;
+	sun3os4)
+		basic_machine=m68k-sun
+		os=-sunos4
+		;;
+	sun4os3)
+		basic_machine=sparc-sun
+		os=-sunos3
+		;;
+	sun4os4)
+		basic_machine=sparc-sun
+		os=-sunos4
+		;;
+	sun4sol2)
+		basic_machine=sparc-sun
+		os=-solaris2
+		;;
+	sun3 | sun3-*)
+		basic_machine=m68k-sun
+		;;
+	sun4)
+		basic_machine=sparc-sun
+		;;
+	sun386 | sun386i | roadrunner)
+		basic_machine=i386-sun
+		;;
+	sv1)
+		basic_machine=sv1-cray
+		os=-unicos
+		;;
+	symmetry)
+		basic_machine=i386-sequent
+		os=-dynix
+		;;
+	t3e)
+		basic_machine=alphaev5-cray
+		os=-unicos
+		;;
+	t90)
+		basic_machine=t90-cray
+		os=-unicos
+		;;
+        tic4x | c4x*)
+		basic_machine=tic4x-unknown
+		os=-coff
+		;;
+	tic54x | c54x*)
+		basic_machine=tic54x-unknown
+		os=-coff
+		;;
+	tic55x | c55x*)
+		basic_machine=tic55x-unknown
+		os=-coff
+		;;
+	tic6x | c6x*)
+		basic_machine=tic6x-unknown
+		os=-coff
+		;;
+	tx39)
+		basic_machine=mipstx39-unknown
+		;;
+	tx39el)
+		basic_machine=mipstx39el-unknown
+		;;
+	toad1)
+		basic_machine=pdp10-xkl
+		os=-tops20
+		;;
+	tower | tower-32)
+		basic_machine=m68k-ncr
+		;;
+	udi29k)
+		basic_machine=a29k-amd
+		os=-udi
+		;;
+	ultra3)
+		basic_machine=a29k-nyu
+		os=-sym1
+		;;
+	v810 | necv810)
+		basic_machine=v810-nec
+		os=-none
+		;;
+	vaxv)
+		basic_machine=vax-dec
+		os=-sysv
+		;;
+	vms)
+		basic_machine=vax-dec
+		os=-vms
+		;;
+	vpp*|vx|vx-*)
+		basic_machine=f301-fujitsu
+		;;
+	vxworks960)
+		basic_machine=i960-wrs
+		os=-vxworks
+		;;
+	vxworks68)
+		basic_machine=m68k-wrs
+		os=-vxworks
+		;;
+	vxworks29k)
+		basic_machine=a29k-wrs
+		os=-vxworks
+		;;
+	w65*)
+		basic_machine=w65-wdc
+		os=-none
+		;;
+	w89k-*)
+		basic_machine=hppa1.1-winbond
+		os=-proelf
+		;;
+	xps | xps100)
+		basic_machine=xps100-honeywell
+		;;
+	ymp)
+		basic_machine=ymp-cray
+		os=-unicos
+		;;
+	z8k-*-coff)
+		basic_machine=z8k-unknown
+		os=-sim
+		;;
+	none)
+		basic_machine=none-none
+		os=-none
+		;;
+
+# Here we handle the default manufacturer of certain CPU types.  It is in
+# some cases the only manufacturer, in others, it is the most popular.
+	w89k)
+		basic_machine=hppa1.1-winbond
+		;;
+	op50n)
+		basic_machine=hppa1.1-oki
+		;;
+	op60c)
+		basic_machine=hppa1.1-oki
+		;;
+	romp)
+		basic_machine=romp-ibm
+		;;
+	rs6000)
+		basic_machine=rs6000-ibm
+		;;
+	vax)
+		basic_machine=vax-dec
+		;;
+	pdp10)
+		# there are many clones, so DEC is not a safe bet
+		basic_machine=pdp10-unknown
+		;;
+	pdp11)
+		basic_machine=pdp11-dec
+		;;
+	we32k)
+		basic_machine=we32k-att
+		;;
+	sh3 | sh4 | sh[34]eb | sh[1234]le | sh[23]ele)
+		basic_machine=sh-unknown
+		;;
+	sh64)
+		basic_machine=sh64-unknown
+		;;
+	sparc | sparcv9 | sparcv9b)
+		basic_machine=sparc-sun
+		;;
+	cydra)
+		basic_machine=cydra-cydrome
+		;;
+	orion)
+		basic_machine=orion-highlevel
+		;;
+	orion105)
+		basic_machine=clipper-highlevel
+		;;
+	mac | mpw | mac-mpw)
+		basic_machine=m68k-apple
+		;;
+	pmac | pmac-mpw)
+		basic_machine=powerpc-apple
+		;;
+	*-unknown)
+		# Make sure to match an already-canonicalized machine name.
+		;;
+	*)
+		echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
+		exit 1
+		;;
+esac
+
+# Here we canonicalize certain aliases for manufacturers.
+case $basic_machine in
+	*-digital*)
+		basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'`
+		;;
+	*-commodore*)
+		basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'`
+		;;
+	*)
+		;;
+esac
+
+# Decode manufacturer-specific aliases for certain operating systems.
+
+if [ x"$os" != x"" ]
+then
+case $os in
+        # First match some system type aliases
+        # that might get confused with valid system types.
+	# -solaris* is a basic system type, with this one exception.
+	-solaris1 | -solaris1.*)
+		os=`echo $os | sed -e 's|solaris1|sunos4|'`
+		;;
+	-solaris)
+		os=-solaris2
+		;;
+	-svr4*)
+		os=-sysv4
+		;;
+	-unixware*)
+		os=-sysv4.2uw
+		;;
+	-gnu/linux*)
+		os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'`
+		;;
+	# First accept the basic system types.
+	# The portable systems comes first.
+	# Each alternative MUST END IN A *, to match a version number.
+	# -sysv* is not here because it comes later, after sysvr4.
+	-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \
+	      | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\
+	      | -hpux* | -unos* | -osf* | -luna* | -dgux* | -solaris* | -sym* \
+	      | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
+	      | -aos* \
+	      | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
+	      | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
+	      | -hiux* | -386bsd* | -netbsd* | -openbsd* | -freebsd* | -riscix* \
+	      | -lynxos* | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \
+	      | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
+	      | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \
+	      | -chorusos* | -chorusrdb* \
+	      | -cygwin* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
+	      | -mingw32* | -linux-gnu* | -uxpv* | -beos* | -mpeix* | -udk* \
+	      | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \
+	      | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \
+	      | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \
+	      | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \
+	      | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \
+	      | -powermax* | -dnix*)
+	# Remember, each alternative MUST END IN *, to match a version number.
+		;;
+	-qnx*)
+		case $basic_machine in
+		    x86-* | i*86-*)
+			;;
+		    *)
+			os=-nto$os
+			;;
+		esac
+		;;
+	-nto-qnx*)
+		;;
+	-nto*)
+		os=`echo $os | sed -e 's|nto|nto-qnx|'`
+		;;
+	-sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \
+	      | -windows* | -osx | -abug | -netware* | -os9* | -beos* \
+	      | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*)
+		;;
+	-mac*)
+		os=`echo $os | sed -e 's|mac|macos|'`
+		;;
+	-linux*)
+		os=`echo $os | sed -e 's|linux|linux-gnu|'`
+		;;
+	-sunos5*)
+		os=`echo $os | sed -e 's|sunos5|solaris2|'`
+		;;
+	-sunos6*)
+		os=`echo $os | sed -e 's|sunos6|solaris3|'`
+		;;
+	-opened*)
+		os=-openedition
+		;;
+	-wince*)
+		os=-wince
+		;;
+	-osfrose*)
+		os=-osfrose
+		;;
+	-osf*)
+		os=-osf
+		;;
+	-utek*)
+		os=-bsd
+		;;
+	-dynix*)
+		os=-bsd
+		;;
+	-acis*)
+		os=-aos
+		;;
+	-atheos*)
+		os=-atheos
+		;;
+	-386bsd)
+		os=-bsd
+		;;
+	-ctix* | -uts*)
+		os=-sysv
+		;;
+	-nova*)
+		os=-rtmk-nova
+		;;
+	-ns2 )
+		os=-nextstep2
+		;;
+	-nsk*)
+		os=-nsk
+		;;
+	# Preserve the version number of sinix5.
+	-sinix5.*)
+		os=`echo $os | sed -e 's|sinix|sysv|'`
+		;;
+	-sinix*)
+		os=-sysv4
+		;;
+	-triton*)
+		os=-sysv3
+		;;
+	-oss*)
+		os=-sysv3
+		;;
+	-svr4)
+		os=-sysv4
+		;;
+	-svr3)
+		os=-sysv3
+		;;
+	-sysvr4)
+		os=-sysv4
+		;;
+	# This must come after -sysvr4.
+	-sysv*)
+		;;
+	-ose*)
+		os=-ose
+		;;
+	-es1800*)
+		os=-ose
+		;;
+	-xenix)
+		os=-xenix
+		;;
+	-*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*)
+		os=-mint
+		;;
+	-aros*)
+		os=-aros
+		;;
+	-kaos*)
+		os=-kaos
+		;;
+	-none)
+		;;
+	*)
+		# Get rid of the `-' at the beginning of $os.
+		os=`echo $os | sed 's/[^-]*-//'`
+		echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2
+		exit 1
+		;;
+esac
+else
+
+# Here we handle the default operating systems that come with various machines.
+# The value should be what the vendor currently ships out the door with their
+# machine or put another way, the most popular os provided with the machine.
+
+# Note that if you're going to try to match "-MANUFACTURER" here (say,
+# "-sun"), then you have to tell the case statement up towards the top
+# that MANUFACTURER isn't an operating system.  Otherwise, code above
+# will signal an error saying that MANUFACTURER isn't an operating
+# system, and we'll never get to this point.
+
+case $basic_machine in
+	*-acorn)
+		os=-riscix1.2
+		;;
+	arm*-rebel)
+		os=-linux
+		;;
+	arm*-semi)
+		os=-aout
+		;;
+	# This must come before the *-dec entry.
+	pdp10-*)
+		os=-tops20
+		;;
+	pdp11-*)
+		os=-none
+		;;
+	*-dec | vax-*)
+		os=-ultrix4.2
+		;;
+	m68*-apollo)
+		os=-domain
+		;;
+	i386-sun)
+		os=-sunos4.0.2
+		;;
+	m68000-sun)
+		os=-sunos3
+		# This also exists in the configure program, but was not the
+		# default.
+		# os=-sunos4
+		;;
+	m68*-cisco)
+		os=-aout
+		;;
+	mips*-cisco)
+		os=-elf
+		;;
+	mips*-*)
+		os=-elf
+		;;
+	or32-*)
+		os=-coff
+		;;
+	*-tti)	# must be before sparc entry or we get the wrong os.
+		os=-sysv3
+		;;
+	sparc-* | *-sun)
+		os=-sunos4.1.1
+		;;
+	*-be)
+		os=-beos
+		;;
+	*-ibm)
+		os=-aix
+		;;
+	*-wec)
+		os=-proelf
+		;;
+	*-winbond)
+		os=-proelf
+		;;
+	*-oki)
+		os=-proelf
+		;;
+	*-hp)
+		os=-hpux
+		;;
+	*-hitachi)
+		os=-hiux
+		;;
+	i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent)
+		os=-sysv
+		;;
+	*-cbm)
+		os=-amigaos
+		;;
+	*-dg)
+		os=-dgux
+		;;
+	*-dolphin)
+		os=-sysv3
+		;;
+	m68k-ccur)
+		os=-rtu
+		;;
+	m88k-omron*)
+		os=-luna
+		;;
+	*-next )
+		os=-nextstep
+		;;
+	*-sequent)
+		os=-ptx
+		;;
+	*-crds)
+		os=-unos
+		;;
+	*-ns)
+		os=-genix
+		;;
+	i370-*)
+		os=-mvs
+		;;
+	*-next)
+		os=-nextstep3
+		;;
+	*-gould)
+		os=-sysv
+		;;
+	*-highlevel)
+		os=-bsd
+		;;
+	*-encore)
+		os=-bsd
+		;;
+	*-sgi)
+		os=-irix
+		;;
+	*-siemens)
+		os=-sysv4
+		;;
+	*-masscomp)
+		os=-rtu
+		;;
+	f30[01]-fujitsu | f700-fujitsu)
+		os=-uxpv
+		;;
+	*-rom68k)
+		os=-coff
+		;;
+	*-*bug)
+		os=-coff
+		;;
+	*-apple)
+		os=-macos
+		;;
+	*-atari*)
+		os=-mint
+		;;
+	*)
+		os=-none
+		;;
+esac
+fi
+
+# Here we handle the case where we know the os, and the CPU type, but not the
+# manufacturer.  We pick the logical manufacturer.
+vendor=unknown
+case $basic_machine in
+	*-unknown)
+		case $os in
+			-riscix*)
+				vendor=acorn
+				;;
+			-sunos*)
+				vendor=sun
+				;;
+			-aix*)
+				vendor=ibm
+				;;
+			-beos*)
+				vendor=be
+				;;
+			-hpux*)
+				vendor=hp
+				;;
+			-mpeix*)
+				vendor=hp
+				;;
+			-hiux*)
+				vendor=hitachi
+				;;
+			-unos*)
+				vendor=crds
+				;;
+			-dgux*)
+				vendor=dg
+				;;
+			-luna*)
+				vendor=omron
+				;;
+			-genix*)
+				vendor=ns
+				;;
+			-mvs* | -opened*)
+				vendor=ibm
+				;;
+			-ptx*)
+				vendor=sequent
+				;;
+			-vxsim* | -vxworks* | -windiss*)
+				vendor=wrs
+				;;
+			-aux*)
+				vendor=apple
+				;;
+			-hms*)
+				vendor=hitachi
+				;;
+			-mpw* | -macos*)
+				vendor=apple
+				;;
+			-*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*)
+				vendor=atari
+				;;
+			-vos*)
+				vendor=stratus
+				;;
+		esac
+		basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"`
+		;;
+esac
+
+echo $basic_machine$os
+exit 0
+
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "timestamp='"
+# time-stamp-format: "%:y-%02m-%02d"
+# time-stamp-end: "'"
+# End:
diff --git a/configure.in b/configure.in
new file mode 100644
index 0000000..e860831
--- /dev/null
+++ b/configure.in
@@ -0,0 +1,617 @@
+#                                               -*- Autoconf -*-
+# Process this file with autoconf and autoheader to produce a configure script.
+
+# This Autoconf file was cobbled from various locations. In particular, a bunch
+# of the platform checks have been taken straight from OpenSSH's configure.ac
+# Huge thanks to them for dealing with the horrible platform-specifics :)
+
+AC_PREREQ(2.50)
+AC_INIT(buffer.c)
+
+OLDCFLAGS=$CFLAGS
+# Checks for programs.
+AC_PROG_CC
+AC_PROG_MAKE_SET
+
+if test -z "$LD" ; then
+	LD=$CC
+fi
+AC_SUBST(LD)	
+
+if test -z "$OLDCFLAGS" && test "$GCC" = "yes"; then
+	AC_MSG_RESULT(No \$CFLAGS set... using "-Os -W -Wall" for GCC)
+	CFLAGS="-Os -W -Wall"
+fi
+
+# Host specific options
+# this isn't a definitive list of hosts, they are just added as required
+AC_CANONICAL_HOST
+
+case "$host" in
+
+*-*-linux*)
+	no_ptmx_check=1
+	;;
+
+*-*-solaris*)
+	CFLAGS="$CFLAGS -I/usr/local/include"
+	LDFLAGS="$LDFLAGS -L/usr/local/lib -R/usr/local/lib"
+	conf_lastlog_location="/var/adm/lastlog"
+	AC_MSG_CHECKING(for obsolete utmp and wtmp in solaris2.x)
+	sol2ver=`echo "$host"| sed -e 's/.*[[0-9]]\.//'`
+	if test "$sol2ver" -ge 8; then
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(DISABLE_UTMP,,Disable utmp)
+		AC_DEFINE(DISABLE_WTMP,,Disable wtmp)
+	else
+		AC_MSG_RESULT(no)
+	fi
+	AC_CHECK_LIB(socket, socket, LIBS="$LIBS -lsocket")
+	AC_CHECK_LIB(nsl, yp_match, LIBS="$LIBS -lnsl")
+	;;
+
+*-*-aix*)
+	AC_DEFINE(AIX,,Using AIX)
+	# OpenSSH thinks it's broken. If it isn't, let me know.
+	AC_DEFINE(BROKEN_GETADDRINFO,,Broken getaddrinfo)
+	;;
+	
+*-*-hpux*)
+	LIBS="$LIBS -lsec"
+	# It's probably broken.
+	AC_DEFINE(BROKEN_GETADDRINFO,,Broken getaddrinfo)
+	;;
+*-dec-osf*)
+	AC_DEFINE(BROKEN_GETADDRINFO,,Broken getaddrinfo)
+	;;
+esac
+
+AC_CHECK_TOOL(AR, ar, :)
+AC_CHECK_TOOL(RANLIB, ranlib, :)
+AC_CHECK_TOOL(STRIP, strip, :)
+AC_CHECK_TOOL(INSTALL, install, :)
+
+dnl Can't use login() or logout() with uclibc
+AC_CHECK_DECL(__UCLIBC__, 
+	[
+	no_loginfunc_check=1
+	AC_MSG_RESULT(Using uClibc - login() and logout() probably don't work, so we won't use them.)
+	],,,)
+
+# Checks for libraries.
+AC_CHECK_LIB(crypt, crypt, LIBS="$LIBS -lcrypt")
+
+# Check if zlib is needed
+AC_ARG_WITH(zlib,
+	[  --with-zlib=PATH        Use zlib in PATH],
+	[
+		# option is given
+		if test -d "$withval/lib"; then
+			LDFLAGS="-L${withval}/lib ${LDFLAGS}"
+		else
+			LDFLAGS="-L${withval} ${LDFLAGS}"
+		fi
+		if test -d "$withval/include"; then
+			CPPFLAGS="-I${withval}/include ${CPPFLAGS}"
+		else
+			CPPFLAGS="-I${withval} ${CPPFLAGS}"
+		fi
+	]
+)
+
+AC_ARG_ENABLE(zlib,
+	[  --disable-zlib          Don't include zlib support],
+	[
+		if test "x$enableval" = "xno"; then
+			AC_DEFINE(DISABLE_ZLIB,, Use zlib)
+			AC_MSG_RESULT(Disabling zlib)
+		else
+			AC_CHECK_LIB(z, deflate, , AC_MSG_ERROR([*** zlib missing - install first or check config.log ***]))
+			AC_MSG_RESULT(Enabling zlib)
+		fi
+	],
+	[
+		# if not disabled, check for zlib
+		AC_CHECK_LIB(z, deflate, , AC_MSG_ERROR([*** zlib missing - install first or check config.log ***]))
+		AC_MSG_RESULT(Enabling zlib)
+	]
+)
+
+# Check if pam is needed
+AC_ARG_WITH(pam,
+	[  --with-pam=PATH        Use pam in PATH],
+	[
+		# option is given
+		if test -d "$withval/lib"; then
+			LDFLAGS="-L${withval}/lib ${LDFLAGS}"
+		else
+			LDFLAGS="-L${withval} ${LDFLAGS}"
+		fi
+		if test -d "$withval/include"; then
+			CPPFLAGS="-I${withval}/include ${CPPFLAGS}"
+		else
+			CPPFLAGS="-I${withval} ${CPPFLAGS}"
+		fi
+	]
+)
+
+
+AC_ARG_ENABLE(pam,
+	[  --enable-pam          Try to include PAM support],
+	[
+		if test "x$enableval" = "xyes"; then
+			AC_CHECK_LIB(pam, pam_authenticate, , AC_MSG_ERROR([*** PAM missing - install first or check config.log ***]))
+			AC_MSG_RESULT(Enabling PAM)
+		else
+			AC_DEFINE(DISABLE_PAM,, Use PAM)
+			AC_MSG_RESULT(Disabling PAM)
+		fi
+	],
+	[
+		# disable it by default
+		AC_DEFINE(DISABLE_PAM,, Use PAM)
+		AC_MSG_RESULT(Disabling PAM)
+	]
+)
+
+AC_ARG_ENABLE(openpty,
+	[  --disable-openpty       Don't use openpty, use alternative method],
+	[
+		if test "x$enableval" = "xno"; then
+			AC_MSG_RESULT(Not using openpty)
+		else
+			AC_MSG_RESULT(Using openpty if available)
+			AC_SEARCH_LIBS(openpty, util, [AC_DEFINE(HAVE_OPENPTY,,Have openpty() function)])
+		fi
+	],
+	[
+		AC_MSG_RESULT(Using openpty if available)
+		AC_SEARCH_LIBS(openpty, util, [AC_DEFINE(HAVE_OPENPTY)])
+	]
+)
+		
+
+AC_ARG_ENABLE(syslog,
+	[  --disable-syslog        Don't include syslog support],
+	[
+		if test "x$enableval" = "xno"; then
+			AC_DEFINE(DISABLE_SYSLOG,, Using syslog)
+			AC_MSG_RESULT(Disabling syslog)
+		else
+			AC_MSG_RESULT(Enabling syslog)
+		fi
+	],
+	[
+		AC_MSG_RESULT(Enabling syslog)
+	]
+)
+
+AC_ARG_ENABLE(shadow,
+	[  --disable-shadow        Don't use shadow passwords (if available)],
+	[
+		if test "x$enableval" = "xno"; then
+			AC_MSG_RESULT(Not using shadow passwords)
+		else
+			AC_CHECK_HEADERS([shadow.h])
+			AC_MSG_RESULT(Using shadow passwords if available)
+		fi
+	],
+	[
+		AC_CHECK_HEADERS([shadow.h])
+		AC_MSG_RESULT(Using shadow passwords if available)
+	]
+)
+			
+
+# Checks for header files.
+AC_HEADER_STDC
+AC_HEADER_SYS_WAIT
+AC_CHECK_HEADERS([fcntl.h limits.h netinet/in.h netinet/tcp.h stdlib.h string.h sys/socket.h sys/time.h termios.h unistd.h crypt.h pty.h ioctl.h libutil.h libgen.h inttypes.h stropts.h utmp.h utmpx.h lastlog.h paths.h util.h netdb.h security/pam_appl.h pam/pam_appl.h netinet/in_systm.h])
+
+# Checks for typedefs, structures, and compiler characteristics.
+AC_C_CONST
+AC_TYPE_UID_T
+AC_TYPE_MODE_T
+AC_TYPE_PID_T
+AC_TYPE_SIZE_T
+AC_HEADER_TIME
+
+AC_CHECK_TYPES([uint16_t, u_int16_t, struct sockaddr_storage])
+AC_CHECK_TYPE([socklen_t], ,[
+	AC_MSG_CHECKING([for socklen_t equivalent])
+	AC_CACHE_VAL([curl_cv_socklen_t_equiv],
+	[
+	# Systems have either "struct sockaddr *" or
+	# "void *" as the second argument to getpeername
+	curl_cv_socklen_t_equiv=
+	for arg2 in "struct sockaddr" void; do
+		for t in int size_t unsigned long "unsigned long"; do
+		AC_TRY_COMPILE([
+			#include <sys/types.h>
+			#include <sys/socket.h>
+
+			int getpeername (int, $arg2 *, $t *);
+		],[
+			$t len;
+			getpeername(0,0,&len);
+		],[
+			curl_cv_socklen_t_equiv="$t"
+			break
+		])
+		done
+	done
+
+	if test "x$curl_cv_socklen_t_equiv" = x; then
+		AC_MSG_ERROR([Cannot find a type to use in place of socklen_t])
+	fi
+	])
+	AC_MSG_RESULT($curl_cv_socklen_t_equiv)
+	AC_DEFINE_UNQUOTED(socklen_t, $curl_cv_socklen_t_equiv,
+			[type to use in place of socklen_t if not defined])],
+	[#include <sys/types.h>
+	#include <sys/socket.h>])
+
+# for the fake-rfc2553 stuff - straight from OpenSSH
+
+AC_CACHE_CHECK([for struct sockaddr_storage], ac_cv_have_struct_sockaddr_storage, [
+	AC_TRY_COMPILE(
+		[
+#include <sys/types.h>
+#include <sys/socket.h>
+		],
+		[ struct sockaddr_storage s; ],
+		[ ac_cv_have_struct_sockaddr_storage="yes" ],
+		[ ac_cv_have_struct_sockaddr_storage="no" ]
+	)
+])
+if test "x$ac_cv_have_struct_sockaddr_storage" = "xyes" ; then
+	AC_DEFINE(HAVE_STRUCT_SOCKADDR_STORAGE)
+fi
+
+AC_CACHE_CHECK([for struct sockaddr_in6], ac_cv_have_struct_sockaddr_in6, [
+	AC_TRY_COMPILE(
+		[
+#include <sys/types.h>
+#include <netinet/in.h>
+		],
+		[ struct sockaddr_in6 s; s.sin6_family = 0; ],
+		[ ac_cv_have_struct_sockaddr_in6="yes" ],
+		[ ac_cv_have_struct_sockaddr_in6="no" ]
+	)
+])
+if test "x$ac_cv_have_struct_sockaddr_in6" = "xyes" ; then
+	AC_DEFINE(HAVE_STRUCT_SOCKADDR_IN6,,Have struct sockaddr_in6)
+fi
+
+AC_CACHE_CHECK([for struct in6_addr], ac_cv_have_struct_in6_addr, [
+	AC_TRY_COMPILE(
+		[
+#include <sys/types.h>
+#include <netinet/in.h>
+		],
+		[ struct in6_addr s; s.s6_addr[0] = 0; ],
+		[ ac_cv_have_struct_in6_addr="yes" ],
+		[ ac_cv_have_struct_in6_addr="no" ]
+	)
+])
+if test "x$ac_cv_have_struct_in6_addr" = "xyes" ; then
+	AC_DEFINE(HAVE_STRUCT_IN6_ADDR,,Have struct in6_addr)
+fi
+
+AC_CACHE_CHECK([for struct addrinfo], ac_cv_have_struct_addrinfo, [
+	AC_TRY_COMPILE(
+		[
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+		],
+		[ struct addrinfo s; s.ai_flags = AI_PASSIVE; ],
+		[ ac_cv_have_struct_addrinfo="yes" ],
+		[ ac_cv_have_struct_addrinfo="no" ]
+	)
+])
+if test "x$ac_cv_have_struct_addrinfo" = "xyes" ; then
+	AC_DEFINE(HAVE_STRUCT_ADDRINFO,,Have struct addrinfo)
+fi
+
+
+# IRIX has a const char return value for gai_strerror()
+AC_CHECK_FUNCS(gai_strerror,[
+	AC_DEFINE(HAVE_GAI_STRERROR)
+	AC_TRY_COMPILE([
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+
+const char *gai_strerror(int);],[
+char *str;
+
+str = gai_strerror(0);],[
+		AC_DEFINE(HAVE_CONST_GAI_STRERROR_PROTO, 1,
+		[Define if gai_strerror() returns const char *])])])
+
+# for loginrec.c
+
+AC_CHECK_MEMBERS([struct utmp.ut_host, struct utmp.ut_pid, struct utmp.ut_type, struct utmp.ut_tv, struct utmp.ut_id, struct utmp.ut_addr, struct utmp.ut_addr_v6, struct utmp.ut_exit, struct utmp.ut_time],,,[
+#include <sys/types.h>
+#if HAVE_UTMP_H
+#include <utmp.h>
+#endif
+])
+
+AC_CHECK_MEMBERS([struct utmpx.ut_host, struct utmpx.ut_syslen, struct utmpx.ut_type, struct utmpx.ut_id, struct utmpx.ut_addr, struct utmpx.ut_addr_v6, struct utmpx.ut_time, struct utmpx.ut_tv, struct sockaddr_storage.ss_family, struct sockadd_storage.__family],,,[
+#include <sys/types.h>
+#include <sys/socket.h>
+#if HAVE_UTMPX_H
+#include <utmpx.h>
+#endif
+])
+
+AC_CHECK_FUNCS(endutent getutent getutid getutline pututline setutent)
+AC_CHECK_FUNCS(utmpname)
+AC_CHECK_FUNCS(endutxent getutxent getutxid getutxline pututxline )
+AC_CHECK_FUNCS(setutxent utmpxname)
+AC_CHECK_FUNCS(logout updwtmp logwtmp)
+
+dnl Added from OpenSSH 3.6.1p2's configure.ac
+
+dnl allow user to disable some login recording features
+AC_ARG_ENABLE(lastlog,
+	[  --disable-lastlog       Disable use of lastlog even if detected [no]],
+	[ AC_DEFINE(DISABLE_LASTLOG,,Disable use of lastlog()) ]
+)
+AC_ARG_ENABLE(utmp,
+	[  --disable-utmp          Disable use of utmp even if detected [no]],
+	[ AC_DEFINE(DISABLE_UTMP,,Disable use of utmp) ]
+)
+AC_ARG_ENABLE(utmpx,
+	[  --disable-utmpx         Disable use of utmpx even if detected [no]],
+	[ AC_DEFINE(DISABLE_UTMPX,,Disable use of utmpx) ]
+)
+AC_ARG_ENABLE(wtmp,
+	[  --disable-wtmp          Disable use of wtmp even if detected [no]],
+	[ AC_DEFINE(DISABLE_WTMP,,Disable use of wtmp) ]
+)
+AC_ARG_ENABLE(wtmpx,
+	[  --disable-wtmpx         Disable use of wtmpx even if detected [no]],
+	[ AC_DEFINE(DISABLE_WTMPX,,Disable use of wtmpx) ]
+)
+AC_ARG_ENABLE(loginfunc,
+	[  --disable-loginfunc     Disable use of login() etc. [no]],
+	[ no_loginfunc_check=1
+	AC_MSG_RESULT(Not using login() etc) ]
+)
+AC_ARG_ENABLE(pututline,
+	[  --disable-pututline     Disable use of pututline() etc. ([uw]tmp) [no]],
+	[ AC_DEFINE(DISABLE_PUTUTLINE,,Disable use of pututline()) ]
+)
+AC_ARG_ENABLE(pututxline,
+	[  --disable-pututxline    Disable use of pututxline() etc. ([uw]tmpx) [no]],
+	[ AC_DEFINE(DISABLE_PUTUTXLINE,,Disable use of pututxline()) ]
+)
+AC_ARG_WITH(lastlog,
+  [  --with-lastlog=FILE|DIR specify lastlog location [common locations]],
+	[
+		if test "x$withval" = "xno" ; then	
+			AC_DEFINE(DISABLE_LASTLOG)
+		else
+			conf_lastlog_location=$withval
+		fi
+	]
+)
+
+if test -z "$no_loginfunc_check"; then
+	dnl    Checks for libutil functions (login(), logout() etc, not openpty() )
+	AC_SEARCH_LIBS(login, util bsd, [AC_DEFINE(HAVE_LOGIN,,Have login() function)])
+	AC_CHECK_FUNCS(logout updwtmp logwtmp)
+fi
+
+dnl lastlog, [uw]tmpx? detection
+dnl  NOTE: set the paths in the platform section to avoid the
+dnl   need for command-line parameters
+dnl lastlog and [uw]tmp are subject to a file search if all else fails
+
+dnl lastlog detection
+dnl  NOTE: the code itself will detect if lastlog is a directory
+AC_MSG_CHECKING([if your system defines LASTLOG_FILE])
+AC_TRY_COMPILE([
+#include <sys/types.h>
+#include <utmp.h>
+#ifdef HAVE_LASTLOG_H
+#  include <lastlog.h>
+#endif
+#ifdef HAVE_PATHS_H
+#  include <paths.h>
+#endif
+#ifdef HAVE_LOGIN_H
+# include <login.h>
+#endif
+	],
+	[ char *lastlog = LASTLOG_FILE; ],
+	[ AC_MSG_RESULT(yes) ],
+	[
+		AC_MSG_RESULT(no)
+		AC_MSG_CHECKING([if your system defines _PATH_LASTLOG])
+		AC_TRY_COMPILE([
+#include <sys/types.h>
+#include <utmp.h>
+#ifdef HAVE_LASTLOG_H
+#  include <lastlog.h>
+#endif
+#ifdef HAVE_PATHS_H
+#  include <paths.h>
+#endif
+		],
+		[ char *lastlog = _PATH_LASTLOG; ],
+		[ AC_MSG_RESULT(yes) ],
+		[
+			AC_MSG_RESULT(no)
+			system_lastlog_path=no
+		])
+	]
+)
+
+if test -z "$conf_lastlog_location"; then
+	if test x"$system_lastlog_path" = x"no" ; then
+		for f in /var/log/lastlog /usr/adm/lastlog /var/adm/lastlog /etc/security/lastlog ; do
+				if (test -d "$f" || test -f "$f") ; then
+					conf_lastlog_location=$f
+				fi
+		done
+		if test -z "$conf_lastlog_location"; then
+			AC_MSG_WARN([** Cannot find lastlog **])
+			dnl Don't define DISABLE_LASTLOG - that means we don't try wtmp/wtmpx
+		fi
+	fi
+fi
+
+if test -n "$conf_lastlog_location"; then
+	AC_DEFINE_UNQUOTED(CONF_LASTLOG_FILE, "$conf_lastlog_location", lastlog file location)
+fi	
+
+dnl utmp detection
+AC_MSG_CHECKING([if your system defines UTMP_FILE])
+AC_TRY_COMPILE([
+#include <sys/types.h>
+#include <utmp.h>
+#ifdef HAVE_PATHS_H
+#  include <paths.h>
+#endif
+	],
+	[ char *utmp = UTMP_FILE; ],
+	[ AC_MSG_RESULT(yes) ],
+	[ AC_MSG_RESULT(no)
+	  system_utmp_path=no ]
+)
+if test -z "$conf_utmp_location"; then
+	if test x"$system_utmp_path" = x"no" ; then
+		for f in /etc/utmp /usr/adm/utmp /var/run/utmp; do
+			if test -f $f ; then
+				conf_utmp_location=$f
+			fi
+		done
+		if test -z "$conf_utmp_location"; then
+			AC_DEFINE(DISABLE_UTMP)
+		fi
+	fi
+fi
+if test -n "$conf_utmp_location"; then
+	AC_DEFINE_UNQUOTED(CONF_UTMP_FILE, "$conf_utmp_location", utmp file location)
+fi	
+
+dnl wtmp detection
+AC_MSG_CHECKING([if your system defines WTMP_FILE])
+AC_TRY_COMPILE([
+#include <sys/types.h>
+#include <utmp.h>
+#ifdef HAVE_PATHS_H
+#  include <paths.h>
+#endif
+	],
+	[ char *wtmp = WTMP_FILE; ],
+	[ AC_MSG_RESULT(yes) ],
+	[ AC_MSG_RESULT(no)
+	  system_wtmp_path=no ]
+)
+if test -z "$conf_wtmp_location"; then
+	if test x"$system_wtmp_path" = x"no" ; then
+		for f in /usr/adm/wtmp /var/log/wtmp; do
+			if test -f $f ; then
+				conf_wtmp_location=$f
+			fi
+		done
+		if test -z "$conf_wtmp_location"; then
+			AC_DEFINE(DISABLE_WTMP)
+		fi
+	fi
+fi
+if test -n "$conf_wtmp_location"; then
+	AC_DEFINE_UNQUOTED(CONF_WTMP_FILE, "$conf_wtmp_location", wtmp file location)
+fi	
+
+
+dnl utmpx detection - I don't know any system so perverse as to require
+dnl  utmpx, but not define UTMPX_FILE (ditto wtmpx.) No doubt it's out
+dnl  there, though.
+AC_MSG_CHECKING([if your system defines UTMPX_FILE])
+AC_TRY_COMPILE([
+#include <sys/types.h>
+#include <utmp.h>
+#ifdef HAVE_UTMPX_H
+#include <utmpx.h>
+#endif
+#ifdef HAVE_PATHS_H
+#  include <paths.h>
+#endif
+	],
+	[ char *utmpx = UTMPX_FILE; ],
+	[ AC_MSG_RESULT(yes) ],
+	[ AC_MSG_RESULT(no)
+	  system_utmpx_path=no ]
+)
+if test -z "$conf_utmpx_location"; then
+	if test x"$system_utmpx_path" = x"no" ; then
+		AC_DEFINE(DISABLE_UTMPX)
+	fi
+else
+	AC_DEFINE_UNQUOTED(CONF_UTMPX_FILE, "$conf_utmpx_location", utmpx file location)
+fi	
+
+dnl wtmpx detection
+AC_MSG_CHECKING([if your system defines WTMPX_FILE])
+AC_TRY_COMPILE([
+#include <sys/types.h>
+#include <utmp.h>
+#ifdef HAVE_UTMPX_H
+#include <utmpx.h>
+#endif
+#ifdef HAVE_PATHS_H
+#  include <paths.h>
+#endif
+	],
+	[ char *wtmpx = WTMPX_FILE; ],
+	[ AC_MSG_RESULT(yes) ],
+	[ AC_MSG_RESULT(no)
+	  system_wtmpx_path=no ]
+)
+if test -z "$conf_wtmpx_location"; then
+	if test x"$system_wtmpx_path" = x"no" ; then
+		AC_DEFINE(DISABLE_WTMPX)
+	fi
+else
+	AC_DEFINE_UNQUOTED(CONF_WTMPX_FILE, "$conf_wtmpx_location", wtmpx file location)
+fi	
+
+# Checks for library functions.
+AC_PROG_GCC_TRADITIONAL
+AC_FUNC_MEMCMP
+AC_FUNC_SELECT_ARGTYPES
+AC_TYPE_SIGNAL
+AC_CHECK_FUNCS([dup2 getspnam getusershell memset putenv select socket strdup clearenv strlcpy strlcat daemon basename _getpty getaddrinfo freeaddrinfo getnameinfo])
+
+AC_SEARCH_LIBS(basename, gen, AC_DEFINE(HAVE_BASENAME))
+
+# Solaris needs ptmx
+if test -z "$no_ptmx_check" ; then
+	if test x"$cross_compiling" = x"no" ; then
+		AC_CHECK_FILE("/dev/ptmx", AC_DEFINE(USE_DEV_PTMX,,Use /dev/ptmx))
+	else
+		AC_MSG_RESULT(Not checking for /dev/ptmx, we're cross-compiling)
+	fi
+fi
+
+if test -z "$no_ptc_check" ; then
+	if test x"$cross_compiling" = x"no" ; then
+		AC_CHECK_FILE("/dev/ptc", AC_DEFINE(HAVE_DEV_PTS_AND_PTC,,Use /dev/ptc & /dev/pts))
+	else
+		AC_MSG_RESULT(Not checking for /dev/ptc & /dev/pts\, we're cross-compiling)
+	fi
+fi
+
+AC_EXEEXT
+AC_CONFIG_HEADER(config.h)
+AC_OUTPUT(Makefile)
+AC_OUTPUT(libtomcrypt/Makefile)
+AC_OUTPUT(libtommath/Makefile)
+AC_MSG_RESULT()
+AC_MSG_RESULT(Now edit options.h to choose features.)
diff --git a/dbclient.1 b/dbclient.1
new file mode 100644
index 0000000..4d7cc3c
--- /dev/null
+++ b/dbclient.1
@@ -0,0 +1,74 @@
+.TH dbclient 1
+.SH NAME
+dbclient \- lightweight SSH2 client
+.SH SYNOPSIS
+.B dbclient
+[\-Tt] [\-p
+.I port\fR] [\-i
+.I id\fR] [\-L
+.I l\fR:\fIh\fR:\fIr\fR] [\-R
+.I l\fR:\fIh\fR:\fIr\fR] [\-l
+.IR user ]
+.I host
+.SH DESCRIPTION
+.B dbclient
+is a SSH 2 client designed to be small enough to be used in small memory
+environments, while still being functional and secure enough for general use.
+.SH OPTIONS
+.TP
+.B \-p \fIport
+Remote port.
+Connect to port
+.I port
+on the remote host.
+Default is 22.
+.TP
+.B \-i \fIidfile
+Identity file.
+Read the identity from file
+.I idfile
+(multiple allowed).
+.TP
+.B \-L \fIlocalport\fR:\fIremotehost\fR:\fIremoteport\fR
+Local port forwarding.
+Forward the port
+.I localport
+on the local host to port
+.I remoteport
+on the remote host
+.IR remotehost .
+.TP
+.B \-R \fIlocalport\fR:\fIremotehost\fR:\fIremoteport\fR
+Remote port forwarding.
+Forward the port
+.I remoteport
+on the remote host
+.I remotehost
+to port
+.I localport
+on the local host.
+.TP
+.B \-l \fIuser
+Username.
+Login as
+.I user
+on the remote host.
+.TP
+.B \-t
+Allocate a pty.
+.TP
+.B \-T
+Don't allocate a pty.
+.TP
+.B \-g
+Allow non-local hosts to connect to forwarded ports. Applies to -L and -R
+forwarded ports, though remote connections to -R forwarded ports may be limited
+by the ssh server.
+.SH AUTHOR
+Matt Johnston (matt@ucc.asn.au).
+.br
+Gerrit Pape (pape@smarden.org) wrote this manual page.
+.SH SEE ALSO
+dropbear(8), dropbearkey(8)
+.P
+http://matt.ucc.asn.au/dropbear/dropbear.html
diff --git a/dbmulti.c b/dbmulti.c
new file mode 100644
index 0000000..2b8df3f
--- /dev/null
+++ b/dbmulti.c
@@ -0,0 +1,90 @@
+/*
+ * Dropbear SSH
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+
+/* definitions are cleanest if we just put them here */
+int dropbear_main(int argc, char ** argv);
+int dropbearkey_main(int argc, char ** argv);
+int dropbearconvert_main(int argc, char ** argv);
+int scp_main(int argc, char ** argv);
+
+int main(int argc, char ** argv) {
+
+	char * progname;
+
+	if (argc > 0) {
+		/* figure which form we're being called as */
+		progname = basename(argv[0]);
+
+#ifdef DBMULTI_dropbear
+		if (strcmp(progname, "dropbear") == 0) {
+			return dropbear_main(argc, argv);
+		}
+#endif
+#ifdef DBMULTI_dbclient
+		if (strcmp(progname, "dbclient") == 0
+				|| strcmp(progname, "ssh") == 0) {
+			return cli_main(argc, argv);
+		}
+#endif
+#ifdef DBMULTI_dropbearkey
+		if (strcmp(progname, "dropbearkey") == 0) {
+			return dropbearkey_main(argc, argv);
+		}
+#endif
+#ifdef DBMULTI_dropbearconvert
+		if (strcmp(progname, "dropbearconvert") == 0) {
+			return dropbearconvert_main(argc, argv);
+		}
+#endif
+#ifdef DBMULTI_scp
+		if (strcmp(progname, "scp") == 0) {
+			return scp_main(argc, argv);
+		}
+#endif
+	}
+
+	fprintf(stderr, "Dropbear multi-purpose version %s\n"
+			"Make a symlink pointing at this binary with one of the following names:\n"
+#ifdef DBMULTI_dropbear
+			"'dropbear' - the Dropbear server\n"
+#endif
+#ifdef DBMULTI_dbclient
+			"'dbclient' or 'ssh' - the Dropbear client\n"
+#endif
+#ifdef DBMULTI_dropbearkey
+			"'dropbearkey' - the key generator\n"
+#endif
+#ifdef DBMULTI_dropbearconvert
+			"'dropbearconvert' - the key converter\n"
+#endif
+#ifdef DBMULTI_scp
+			"'scp' - secure copy\n"
+#endif
+			,
+			DROPBEAR_VERSION);
+	exit(1);
+
+}
diff --git a/dbutil.c b/dbutil.c
new file mode 100644
index 0000000..15f51ba
--- /dev/null
+++ b/dbutil.c
@@ -0,0 +1,679 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * strlcat() is copyright as follows:
+ * Copyright (c) 1998 Todd C. Miller <Todd.Miller@courtesan.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+#include "includes.h"
+#include "dbutil.h"
+#include "buffer.h"
+#include "session.h"
+#include "atomicio.h"
+
+#define MAX_FMT 100
+
+static void generic_dropbear_exit(int exitcode, const char* format, 
+		va_list param);
+static void generic_dropbear_log(int priority, const char* format, 
+		va_list param);
+
+void (*_dropbear_exit)(int exitcode, const char* format, va_list param) 
+						= generic_dropbear_exit;
+void (*_dropbear_log)(int priority, const char* format, va_list param)
+						= generic_dropbear_log;
+
+#ifdef DEBUG_TRACE
+int debug_trace = 0;
+#endif
+
+#ifndef DISABLE_SYSLOG
+void startsyslog() {
+
+	openlog(PROGNAME, LOG_PID, LOG_AUTHPRIV);
+
+}
+#endif /* DISABLE_SYSLOG */
+
+/* the "format" string must be <= 100 characters */
+void dropbear_close(const char* format, ...) {
+
+	va_list param;
+
+	va_start(param, format);
+	_dropbear_exit(EXIT_SUCCESS, format, param);
+	va_end(param);
+
+}
+
+void dropbear_exit(const char* format, ...) {
+
+	va_list param;
+
+	va_start(param, format);
+	_dropbear_exit(EXIT_FAILURE, format, param);
+	va_end(param);
+}
+
+static void generic_dropbear_exit(int exitcode, const char* format, 
+		va_list param) {
+
+	char fmtbuf[300];
+
+	snprintf(fmtbuf, sizeof(fmtbuf), "Exited: %s", format);
+
+	_dropbear_log(LOG_INFO, fmtbuf, param);
+
+	exit(exitcode);
+}
+
+void fail_assert(const char* expr, const char* file, int line) {
+	dropbear_exit("failed assertion (%s:%d): `%s'", file, line, expr);
+}
+
+static void generic_dropbear_log(int UNUSED(priority), const char* format, 
+		va_list param) {
+
+	char printbuf[1024];
+
+	vsnprintf(printbuf, sizeof(printbuf), format, param);
+
+	fprintf(stderr, "%s\n", printbuf);
+
+}
+
+/* this is what can be called to write arbitrary log messages */
+void dropbear_log(int priority, const char* format, ...) {
+
+	va_list param;
+
+	va_start(param, format);
+	_dropbear_log(priority, format, param);
+	va_end(param);
+}
+
+
+#ifdef DEBUG_TRACE
+void dropbear_trace(const char* format, ...) {
+
+	va_list param;
+
+	if (!debug_trace) {
+		return;
+	}
+
+	va_start(param, format);
+	fprintf(stderr, "TRACE: ");
+	vfprintf(stderr, format, param);
+	fprintf(stderr, "\n");
+	va_end(param);
+}
+#endif /* DEBUG_TRACE */
+
+static void set_sock_priority(int sock) {
+
+	int val;
+
+	/* disable nagle */
+	val = 1;
+	setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, (void*)&val, sizeof(val));
+
+	/* set the TOS bit. note that this will fail for ipv6, I can't find any
+	 * equivalent. */
+#ifdef IPTOS_LOWDELAY
+	val = IPTOS_LOWDELAY;
+	setsockopt(sock, IPPROTO_IP, IP_TOS, (void*)&val, sizeof(val));
+#endif
+
+#ifdef SO_PRIORITY
+	/* linux specific, sets QoS class.
+	 * 6 looks to be optimal for interactive traffic (see tc-prio(8) ). */
+	val = 6;
+	setsockopt(sock, SOL_SOCKET, SO_PRIORITY, (void*) &val, sizeof(val));
+#endif
+
+}
+
+/* Listen on address:port. 
+ * Special cases are address of "" listening on everything,
+ * and address of NULL listening on localhost only.
+ * Returns the number of sockets bound on success, or -1 on failure. On
+ * failure, if errstring wasn't NULL, it'll be a newly malloced error
+ * string.*/
+int dropbear_listen(const char* address, const char* port,
+		int *socks, unsigned int sockcount, char **errstring, int *maxfd) {
+
+	struct addrinfo hints, *res = NULL, *res0 = NULL;
+	int err;
+	unsigned int nsock;
+	struct linger linger;
+	int val;
+	int sock;
+
+	TRACE(("enter dropbear_listen"))
+	
+	memset(&hints, 0, sizeof(hints));
+	hints.ai_family = AF_UNSPEC; /* TODO: let them flag v4 only etc */
+	hints.ai_socktype = SOCK_STREAM;
+
+	// for calling getaddrinfo:
+	// address == NULL and !AI_PASSIVE: local loopback
+	// address == NULL and AI_PASSIVE: all interfaces
+	// address != NULL: whatever the address says
+	if (!address) {
+		TRACE(("dropbear_listen: local loopback"))
+	} else {
+		if (address[0] == '\0') {
+			TRACE(("dropbear_listen: all interfaces"))
+			address = NULL;
+		}
+		hints.ai_flags = AI_PASSIVE;
+	}
+	err = getaddrinfo(address, port, &hints, &res0);
+
+	if (err) {
+		if (errstring != NULL && *errstring == NULL) {
+			int len;
+			len = 20 + strlen(gai_strerror(err));
+			*errstring = (char*)m_malloc(len);
+			snprintf(*errstring, len, "Error resolving: %s", gai_strerror(err));
+		}
+		if (res0) {
+			freeaddrinfo(res0);
+			res0 = NULL;
+		}
+		TRACE(("leave dropbear_listen: failed resolving"))
+		return -1;
+	}
+
+
+	nsock = 0;
+	for (res = res0; res != NULL && nsock < sockcount;
+			res = res->ai_next) {
+
+		/* Get a socket */
+		socks[nsock] = socket(res->ai_family, res->ai_socktype,
+				res->ai_protocol);
+
+		sock = socks[nsock]; /* For clarity */
+
+		if (sock < 0) {
+			err = errno;
+			TRACE(("socket() failed"))
+			continue;
+		}
+
+		/* Various useful socket options */
+		val = 1;
+		/* set to reuse, quick timeout */
+		setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (void*) &val, sizeof(val));
+		linger.l_onoff = 1;
+		linger.l_linger = 5;
+		setsockopt(sock, SOL_SOCKET, SO_LINGER, (void*)&linger, sizeof(linger));
+
+		set_sock_priority(sock);
+
+		if (bind(sock, res->ai_addr, res->ai_addrlen) < 0) {
+			err = errno;
+			close(sock);
+			TRACE(("bind(%s) failed", port))
+			continue;
+		}
+
+		if (listen(sock, 20) < 0) {
+			err = errno;
+			close(sock);
+			TRACE(("listen() failed"))
+			continue;
+		}
+
+		*maxfd = MAX(*maxfd, sock);
+
+		nsock++;
+	}
+
+	if (res0) {
+		freeaddrinfo(res0);
+		res0 = NULL;
+	}
+
+	if (nsock == 0) {
+		if (errstring != NULL && *errstring == NULL) {
+			int len;
+			len = 20 + strlen(strerror(err));
+			*errstring = (char*)m_malloc(len);
+			snprintf(*errstring, len, "Error listening: %s", strerror(err));
+			TRACE(("leave dropbear_listen: failure, %s", strerror(err)))
+			return -1;
+		}
+	}
+
+	TRACE(("leave dropbear_listen: success, %d socks bound", nsock))
+	return nsock;
+}
+
+/* Connect via TCP to a host. Connection will try ipv4 or ipv6, will
+ * return immediately if nonblocking is set. On failure, if errstring
+ * wasn't null, it will be a newly malloced error message */
+
+/* TODO: maxfd */
+int connect_remote(const char* remotehost, const char* remoteport,
+		int nonblocking, char ** errstring) {
+
+	struct addrinfo *res0 = NULL, *res = NULL, hints;
+	int sock;
+	int err;
+
+	TRACE(("enter connect_remote"))
+
+	if (errstring != NULL) {
+		*errstring = NULL;
+	}
+
+	memset(&hints, 0, sizeof(hints));
+	hints.ai_socktype = SOCK_STREAM;
+	hints.ai_family = PF_UNSPEC;
+
+	err = getaddrinfo(remotehost, remoteport, &hints, &res0);
+	if (err) {
+		if (errstring != NULL && *errstring == NULL) {
+			int len;
+			len = 20 + strlen(gai_strerror(err));
+			*errstring = (char*)m_malloc(len);
+			snprintf(*errstring, len, "Error resolving: %s", gai_strerror(err));
+		}
+		TRACE(("Error resolving: %s", gai_strerror(err)))
+		return -1;
+	}
+
+	sock = -1;
+	err = EADDRNOTAVAIL;
+	for (res = res0; res; res = res->ai_next) {
+
+		sock = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
+		if (sock < 0) {
+			err = errno;
+			continue;
+		}
+
+		if (nonblocking) {
+			if (fcntl(sock, F_SETFL, O_NONBLOCK) < 0) {
+				close(sock);
+				sock = -1;
+				if (errstring != NULL && *errstring == NULL) {
+					*errstring = m_strdup("Failed non-blocking");
+				}
+				TRACE(("Failed non-blocking: %s", strerror(errno)))
+				continue;
+			}
+		}
+
+		if (connect(sock, res->ai_addr, res->ai_addrlen) < 0) {
+			if (errno == EINPROGRESS && nonblocking) {
+				TRACE(("Connect in progress"))
+				break;
+			} else {
+				err = errno;
+				close(sock);
+				sock = -1;
+				continue;
+			}
+		}
+
+		break; /* Success */
+	}
+
+	if (sock < 0 && !(errno == EINPROGRESS && nonblocking)) {
+		/* Failed */
+		if (errstring != NULL && *errstring == NULL) {
+			int len;
+			len = 20 + strlen(strerror(err));
+			*errstring = (char*)m_malloc(len);
+			snprintf(*errstring, len, "Error connecting: %s", strerror(err));
+		}
+		TRACE(("Error connecting: %s", strerror(err)))
+	} else {
+		/* Success */
+		set_sock_priority(sock);
+	}
+
+	freeaddrinfo(res0);
+	if (sock > 0 && errstring != NULL && *errstring != NULL) {
+		m_free(*errstring);
+	}
+
+	TRACE(("leave connect_remote: sock %d\n", sock))
+	return sock;
+}
+
+/* Return a string representation of the socket address passed. The return
+ * value is allocated with malloc() */
+unsigned char * getaddrstring(struct sockaddr_storage* addr, int withport) {
+
+	char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];
+	char *retstring = NULL;
+	int ret;
+	unsigned int len;
+
+	len = sizeof(struct sockaddr_storage);
+	/* Some platforms such as Solaris 8 require that len is the length
+	 * of the specific structure. */
+	if (addr->ss_family == AF_INET) {
+		len = sizeof(struct sockaddr_in);
+	}
+#ifdef AF_INET6
+	if (addr->ss_family == AF_INET6) {
+		len = sizeof(struct sockaddr_in6);
+	}
+#endif
+
+	ret = getnameinfo((struct sockaddr*)addr, len, hbuf, sizeof(hbuf), 
+			sbuf, sizeof(sbuf), NI_NUMERICSERV | NI_NUMERICHOST);
+
+	if (ret != 0) {
+		/* This is a fairly bad failure - it'll fallback to IP if it
+		 * just can't resolve */
+		dropbear_exit("failed lookup (%d, %d)", ret, errno);
+	}
+
+	if (withport) {
+		len = strlen(hbuf) + 2 + strlen(sbuf);
+		retstring = (char*)m_malloc(len);
+		snprintf(retstring, len, "%s:%s", hbuf, sbuf);
+	} else {
+		retstring = m_strdup(hbuf);
+	}
+
+	return retstring;
+
+}
+
+/* Get the hostname corresponding to the address addr. On failure, the IP
+ * address is returned. The return value is allocated with strdup() */
+char* getaddrhostname(struct sockaddr_storage * addr) {
+
+	char hbuf[NI_MAXHOST];
+	char sbuf[NI_MAXSERV];
+	int ret;
+	unsigned int len;
+#ifdef DO_HOST_LOOKUP
+	const int flags = NI_NUMERICSERV;
+#else
+	const int flags = NI_NUMERICHOST | NI_NUMERICSERV;
+#endif
+
+	len = sizeof(struct sockaddr_storage);
+	/* Some platforms such as Solaris 8 require that len is the length
+	 * of the specific structure. */
+	if (addr->ss_family == AF_INET) {
+		len = sizeof(struct sockaddr_in);
+	}
+#ifdef AF_INET6
+	if (addr->ss_family == AF_INET6) {
+		len = sizeof(struct sockaddr_in6);
+	}
+#endif
+
+
+	ret = getnameinfo((struct sockaddr*)addr, len, hbuf, sizeof(hbuf),
+			sbuf, sizeof(sbuf), flags);
+
+	if (ret != 0) {
+		/* On some systems (Darwin does it) we get EINTR from getnameinfo
+		 * somehow. Eew. So we'll just return the IP, since that doesn't seem
+		 * to exhibit that behaviour. */
+		return getaddrstring(addr, 0);
+	}
+
+	return m_strdup(hbuf);
+}
+
+#ifdef DEBUG_TRACE
+void printhex(const char * label, const unsigned char * buf, int len) {
+
+	int i;
+
+	fprintf(stderr, "%s\n", label);
+	for (i = 0; i < len; i++) {
+		fprintf(stderr, "%02x", buf[i]);
+		if (i % 16 == 15) {
+			fprintf(stderr, "\n");
+		}
+		else if (i % 2 == 1) {
+			fprintf(stderr, " ");
+		}
+	}
+	fprintf(stderr, "\n");
+}
+#endif
+
+/* Strip all control characters from text (a null-terminated string), except
+ * for '\n', '\r' and '\t'.
+ * The result returned is a newly allocated string, this must be free()d after
+ * use */
+char * stripcontrol(const char * text) {
+
+	char * ret;
+	int len, pos;
+	int i;
+	
+	len = strlen(text);
+	ret = m_malloc(len+1);
+
+	pos = 0;
+	for (i = 0; i < len; i++) {
+		if ((text[i] <= '~' && text[i] >= ' ') /* normal printable range */
+				|| text[i] == '\n' || text[i] == '\r' || text[i] == '\t') {
+			ret[pos] = text[i];
+			pos++;
+		}
+	}
+	ret[pos] = 0x0;
+	return ret;
+}
+			
+
+/* reads the contents of filename into the buffer buf, from the current
+ * position, either to the end of the file, or the buffer being full.
+ * Returns DROPBEAR_SUCCESS or DROPBEAR_FAILURE */
+int buf_readfile(buffer* buf, const char* filename) {
+
+	int fd;
+	int len;
+	int maxlen;
+
+	fd = open(filename, O_RDONLY);
+
+	if (fd < 0) {
+		close(fd);
+		return DROPBEAR_FAILURE;
+	}
+	
+	do {
+		maxlen = buf->size - buf->pos;
+		len = read(fd, buf_getwriteptr(buf, maxlen),
+				maxlen);
+		buf_incrwritepos(buf, len);
+	} while (len < maxlen && len > 0);
+
+	close(fd);
+	return DROPBEAR_SUCCESS;
+}
+
+/* get a line from the file into buffer in the style expected for an
+ * authkeys file.
+ * Will return DROPBEAR_SUCCESS if data is read, or DROPBEAR_FAILURE on EOF.*/
+/* Only used for ~/.ssh/known_hosts and ~/.ssh/authorized_keys */
+#if defined(DROPBEAR_CLIENT) || defined(ENABLE_SVR_PUBKEY_AUTH)
+int buf_getline(buffer * line, FILE * authfile) {
+
+	int c = EOF;
+
+	TRACE(("enter buf_getline"))
+
+	buf_setpos(line, 0);
+	buf_setlen(line, 0);
+
+	while (line->pos < line->size) {
+
+		c = fgetc(authfile); /*getc() is weird with some uClibc systems*/
+		if (c == EOF || c == '\n' || c == '\r') {
+			goto out;
+		}
+
+		buf_putbyte(line, (unsigned char)c);
+	}
+
+	TRACE(("leave getauthline: line too long"))
+	/* We return success, but the line length will be zeroed - ie we just
+	 * ignore that line */
+	buf_setlen(line, 0);
+
+out:
+
+
+	/* if we didn't read anything before EOF or error, exit */
+	if (c == EOF && line->pos == 0) {
+		TRACE(("leave buf_getline: failure"))
+		return DROPBEAR_FAILURE;
+	} else {
+		TRACE(("leave buf_getline: success"))
+		buf_setpos(line, 0);
+		return DROPBEAR_SUCCESS;
+	}
+
+}	
+#endif
+
+/* make sure that the socket closes */
+void m_close(int fd) {
+
+	int val;
+	do {
+		val = close(fd);
+	} while (val < 0 && errno == EINTR);
+
+	if (val < 0 && errno != EBADF) {
+		/* Linux says EIO can happen */
+		dropbear_exit("Error closing fd %d, %s", fd, strerror(errno));
+	}
+}
+	
+void * m_malloc(size_t size) {
+
+	void* ret;
+
+	if (size == 0) {
+		dropbear_exit("m_malloc failed");
+	}
+	ret = calloc(1, size);
+	if (ret == NULL) {
+		dropbear_exit("m_malloc failed");
+	}
+	return ret;
+
+}
+
+void * m_strdup(const char * str) {
+	char* ret;
+
+	ret = strdup(str);
+	if (ret == NULL) {
+		dropbear_exit("m_strdup failed");
+	}
+	return ret;
+}
+
+void __m_free(void* ptr) {
+	if (ptr != NULL) {
+		free(ptr);
+	}
+}
+
+void * m_realloc(void* ptr, size_t size) {
+
+	void *ret;
+
+	if (size == 0) {
+		dropbear_exit("m_realloc failed");
+	}
+	ret = realloc(ptr, size);
+	if (ret == NULL) {
+		dropbear_exit("m_realloc failed");
+	}
+	return ret;
+}
+
+/* Clear the data, based on the method in David Wheeler's
+ * "Secure Programming for Linux and Unix HOWTO" */
+/* Beware of calling this from within dbutil.c - things might get
+ * optimised away */
+void m_burn(void *data, unsigned int len) {
+	volatile char *p = data;
+
+	if (data == NULL)
+		return;
+	while (len--) {
+		*p++ = 0x66;
+	}
+}
+
+
+void setnonblocking(int fd) {
+
+	TRACE(("setnonblocking: %d", fd))
+
+	if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) {
+		if (errno == ENODEV) {
+			/* Some devices (like /dev/null redirected in)
+			 * can't be set to non-blocking */
+			TRACE(("ignoring ENODEV for setnonblocking"))
+		} else {
+			dropbear_exit("Couldn't set nonblocking");
+		}
+	}
+	TRACE(("leave setnonblocking"))
+}
diff --git a/dbutil.h b/dbutil.h
new file mode 100644
index 0000000..d74e17e
--- /dev/null
+++ b/dbutil.h
@@ -0,0 +1,73 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _DBUTIL_H_
+
+#define _DBUTIL_H_
+
+#include "includes.h"
+#include "buffer.h"
+
+#ifndef DISABLE_SYSLOG
+void startsyslog();
+#endif
+
+extern void (*_dropbear_exit)(int exitcode, const char* format, va_list param);
+extern void (*_dropbear_log)(int priority, const char* format, va_list param);
+
+void dropbear_exit(const char* format, ...);
+void dropbear_close(const char* format, ...);
+void dropbear_log(int priority, const char* format, ...);
+void fail_assert(const char* expr, const char* file, int line);
+#ifdef DEBUG_TRACE
+void dropbear_trace(const char* format, ...);
+void printhex(const char * label, const unsigned char * buf, int len);
+extern int debug_trace;
+#endif
+char * stripcontrol(const char * text);
+unsigned char * getaddrstring(struct sockaddr_storage* addr, int withport);
+int dropbear_listen(const char* address, const char* port,
+		int *socks, unsigned int sockcount, char **errstring, int *maxfd);
+int connect_remote(const char* remotehost, const char* remoteport,
+		int nonblocking, char ** errstring);
+char* getaddrhostname(struct sockaddr_storage * addr);
+int buf_readfile(buffer* buf, const char* filename);
+int buf_getline(buffer * line, FILE * authfile);
+
+void m_close(int fd);
+void * m_malloc(size_t size);
+void * m_strdup(const char * str);
+void * m_realloc(void* ptr, size_t size);
+#define m_free(X) __m_free(X); (X) = NULL;
+void __m_free(void* ptr);
+void m_burn(void* data, unsigned int len);
+void setnonblocking(int fd);
+
+/* Used to force mp_ints to be initialised */
+#define DEF_MP_INT(X) mp_int X = {0, 0, 0, NULL}
+
+/* Dropbear assertion */
+#define dropbear_assert(X) do { if (!(X)) { fail_assert(#X, __FILE__, __LINE__); } } while (0)
+
+#endif /* _DBUTIL_H_ */
diff --git a/debian/README.Debian b/debian/README.Debian
new file mode 100644
index 0000000..8cdac38
--- /dev/null
+++ b/debian/README.Debian
@@ -0,0 +1,41 @@
+Dropbear for Debian
+-------------------
+
+This package will attempt to listen on port 22. If the OpenSSH 
+package ("ssh") is installed, the file /etc/default/dropbear 
+will be set up so that the server does not start by default.
+
+You can run Dropbear concurrently with OpenSSH 'sshd' by 
+modifying /etc/default/dropbear so that "NO_START" is set to 
+"0" and changing the port number that Dropbear runs on. Follow 
+the instructions in the file.
+
+This package suggests you install the "ssh" package. This package 
+provides the "ssh" client program, as well as the "/usr/bin/scp" 
+binary you will need to be able to retrieve files from a server 
+running Dropbear via SCP.
+
+Replacing OpenSSH "sshd" with Dropbear
+--------------------------------------
+
+You will still want to have the "ssh" package installed, as it 
+provides the "ssh" and "scp" binaries. When you install this 
+package, it checks for existing OpenSSH host keys and if found, 
+converts them to the Dropbear format.
+
+If this appears to have worked, you should be able to change over 
+by following these steps:
+
+1. Stop the OpenSSH server
+   % /etc/init.d/ssh stop
+2. Prevent the OpenSSH server from starting in the future
+   % touch /etc/ssh/sshd_not_to_be_run
+3. Modify the Dropbear defaults file, set NO_START to 0 and 
+   ensure DROPBEAR_PORT is set to 22.
+   % editor /etc/default/dropbear
+4. Restart the Dropbear server.
+   % /etc/init.d/dropbear restart
+
+See the Dropbear homepage for more information:
+  http://matt.ucc.asn.au/dropbear/dropbear.html
+
diff --git a/debian/README.Debian.diet b/debian/README.Debian.diet
new file mode 100644
index 0000000..bd0cb5c
--- /dev/null
+++ b/debian/README.Debian.diet
@@ -0,0 +1,15 @@
+Building with the diet libc
+---------------------------
+
+This package optionally can be built with the diet libc instead of the
+glibc to provide small statically linked programs.  The resulting package
+has no dependency on any other package.
+
+To use the diet libc, make sure the latest versions of the dietlibc-dev
+package is installed, and set DEB_BUILD_OPTIONS=diet in the environment
+when building the package, e.g.:
+
+ # apt-get install dietlibc-dev
+ $ DEB_BUILD_OPTIONS=diet fakeroot apt-get source -b dropbear
+
+ -- Gerrit Pape <pape@smarden.org>, Sat, 17 Jul 2004 19:09:34 +0000
diff --git a/debian/README.runit b/debian/README.runit
new file mode 100644
index 0000000..4ac2814
--- /dev/null
+++ b/debian/README.runit
@@ -0,0 +1,46 @@
+Using the dropbear SSH server with runit's services supervision
+---------------------------------------------------------------
+
+The dropbear SSH server is perfectly suited to be run under runit's
+service supervision, and this package already has prepared an adequate
+service directory.  Follow these steps to enable the dropbear service
+using the runit package.
+
+If not yet installed on your system, install the runit package, and make
+sure its service supervision is enabled (it's by default)
+
+ # apt-get install runit
+
+Make sure the dropbear service normally handled through the sysv init
+script is stopped
+
+ # /etc/init.d/dropbear stop
+
+Create the system user ``dropbearlog'' which will run the logger service,
+and own the logs
+
+ # adduser --system --home /var/log/dropbear --no-create-home dropbearlog
+
+Create the log directory and make the newly created system user the owner
+of this directory
+
+ # mkdir -p /var/log/dropbear && chown dropbearlog /var/log/dropbear
+
+Optionally adjust the configuration of the dropbear service by editing the
+run script
+
+ # vi /etc/dropbear/run
+
+Finally enable the service by linking dropbear's service directory to
+/var/service/.  The service will be started within five seconds, and
+automatically at boot time.  The sysv init script is disabled; see the
+runsvctrl(8) program for information on how to control services handled by
+runit.  See the svlogd(8) program on how to configure the log service.
+
+ # ln -s /etc/dropbear /var/service/
+
+Optionally check the status of the service a few seconds later
+
+ # runsvstat -l /var/service/dropbear
+
+ -- Gerrit Pape <pape@smarden.org>, Sun, 16 May 2004 15:52:34 +0000
diff --git a/debian/changelog b/debian/changelog
new file mode 100644
index 0000000..89c5875
--- /dev/null
+++ b/debian/changelog
@@ -0,0 +1,194 @@
+dropbear (0.47-0.1) unstable; urgency=high
+
+  * New upstream release.
+  * SECURITY: Fix incorrect buffer sizing.
+
+ -- Matt Johnston <matt@ucc.asn.au>  Thu, 8 Dec 2005 19:20:21 +0800
+
+dropbear (0.46-2) unstable; urgency=low
+
+  * debian/control: Standards-Version: 3.6.2.1; update descriptions to
+    mention included server and client (thx Tino Keitel).
+  * debian/dropbear.init: allow '/etc/init.d/dropbear stop' even though
+    'NO_START is not set to zero.' (closes: #336723).
+
+ -- Gerrit Pape <pape@smarden.org>  Tue,  6 Dec 2005 13:30:49 +0000
+
+dropbear (0.46-1) unstable; urgency=medium
+
+  * New upstream release, various fixes.
+  * debian/diff/dbclient-usage-typo.diff, debian/diff/manpages.diff: remove;
+    obsolete.
+  * debian/dbclient.1: move to ./dbclient.1.
+
+ -- Matt Johnston <matt@ucc.asn.au>  Fri, 8 July 2005 21:32:55 +0800
+
+dropbear (0.45-3) unstable; urgency=low
+
+  * debian/dropbear.init: init script prints human readable message in case
+    it's disabled (closes: #309099).
+  * debian/dropbear.postinst: configure: restart service through init script
+    instead of start.
+  * debian/dropbear.prerm: set -u -> set -e.
+
+ -- Gerrit Pape <pape@smarden.org>  Wed, 25 May 2005 22:38:17 +0000
+
+dropbear (0.45-2) unstable; urgency=low
+
+  * Matt Johnston:
+    * New upstream release, various fixes.
+
+ -- Gerrit Pape <pape@smarden.org>  Sat, 12 Mar 2005 15:17:55 +0000
+
+dropbear (0.44-1) unstable; urgency=low
+
+  * New upstream release.
+  * debian/rules: install /usr/bin/dbclient; handle possible patches more
+    gracefully; install debian/dbclient.1 man page; enable target patch;
+    minor.
+  * debian/implicit: update to revision 1.10.
+  * debian/dbclient.1: new; man page.
+  * debian/diff/dbclient-usage-typo.diff: new; fix typo.
+  * debian/diff/manpages.diff: new; add references to dbclient man page.
+
+ -- Gerrit Pape <pape@smarden.org>  Sat,  8 Jan 2005 22:50:43 +0000
+
+dropbear (0.43-2) unstable; urgency=high
+
+  * Matt Johnston:
+    * New upstream release 0.43
+    * SECURITY: Don't attempt to free uninitialised buffers in DSS verification
+      code
+    * Handle portforwarding to servers which don't send any initial data
+      (Closes: #258426)
+  * debian/dropbear.postinst: remove code causing bothersome warning on
+    package install (closes: #256752).
+  * debian/README.Debian.diet: new; how to build with the diet libc.
+  * debian/dropbear.docs: add debian/README.Debian.diet.
+  * debian/rules: support "diet" in DEB_BUILD_OPTIONS; minor cleanup.
+
+ -- Gerrit Pape <pape@smarden.org>  Sat, 17 Jul 2004 19:31:19 +0000
+
+dropbear (0.42-1) unstable; urgency=low
+
+  * New upstream release 0.42.
+  * debian/diff/cvs-20040520.diff: remove; obsolete.
+  * debian/rules: disable target patch.
+
+ -- Matt Johnston <matt@ucc.asn.au>  Wed, 16 June 2004 12:44:54 +0800
+
+dropbear (0.41-3) unstable; urgency=low
+
+  * 1st upload to the Debian archive (closes: #216553).
+  * debian/diff/cvs-20040520.diff: new; stable cvs snapshot.
+  * debian/rules: new target patch: apply diffs in debian/diff/, reverse
+    apply in target clean; install man pages.
+  * debian/control: Priority: optional.
+
+ -- Gerrit Pape <pape@smarden.org>  Sun, 23 May 2004 08:32:37 +0000
+
+dropbear (0.41-2) unstable; urgency=low
+
+  * new maintainer.
+  * debian/control: no longer Build-Depends: debhelper; Build-Depends:
+    libz-dev; Standards-Version: 3.6.1.0; Suggests: runit; update
+    descriptions.
+  * debian/rules: stop using debhelper, use implicit rules; cleanup;
+    install dropbearconvert into /usr/lib/dropbear/.
+  * debian/impicit: new; implicit rules.
+  * debian/copyright.in: adapt.
+  * debian/dropbear.init: minor adaptions; test for dropbear service
+    directory.
+  * debian/README.runit: new; how to use dropbear with runit.
+  * debian/README.Debian, debian/docs: rename to debian/dropbear.*.
+  * debian/dropbear.docs: add debian/README.runit
+  * debian/conffiles: rename to debian/dropbear.conffiles; add init
+    script, and run scripts.
+  * debian/postinst: rename to debian/dropbear.postinst; adapt; use
+    invloke-rc.d dropbear start.
+  * debian/dropbear.prerm: new; invoke-rc.d dropbear stop.
+  * debian/postrm: rename to debian/dropbear.postrm; adapt; clean up
+    service directories.
+  * debian/compat, debian/dirs, dropbear.default: remove; obsolete.
+
+ -- Gerrit Pape <pape@smarden.org>  Sun, 16 May 2004 16:50:55 +0000
+
+dropbear (0.41-1) unstable; urgency=low
+
+  * Updated to 0.41 release.
+  * Various minor fixes
+
+ -- Matt Johnston <matt@ucc.asn.au>  Mon, 19 Jan 2004 23:20:54 +0800
+
+dropbear (0.39-1) unstable; urgency=low
+
+  * updated to 0.39 release. Some new features, some bugfixes.
+
+ -- Matt Johnston <matt@ucc.asn.au>  Tue, 16 Dec 2003 16:20:54 +0800
+
+dropbear (0.38-1) unstable; urgency=medium
+
+  * updated to 0.38 release - various important bugfixes
+
+ -- Matt Johnston <matt@ucc.asn.au>  Sat, 11 Oct 2003 16:28:54 +0800
+
+dropbear (0.37-1) unstable; urgency=medium
+
+  * updated to 0.37 release - various important bugfixes
+
+ -- Matt Johnston <matt@ucc.asn.au>  Wed, 24 Sept 2003 19:43:54 +0800
+
+dropbear (0.36-1) unstable; urgency=high
+
+  * updated to 0.36 release - various important bugfixes
+
+ -- Matt Johnston <matt@ucc.asn.au>  Tues, 19 Aug 2003 12:20:54 +0800
+
+dropbear (0.35-1) unstable; urgency=high
+
+  * updated to 0.35 release - contains fix for remotely exploitable
+    vulnerability.
+
+ -- Matt Johnston <matt@ucc.asn.au>  Sun, 17 Aug 2003 05:37:47 +0800
+
+dropbear (0.34-1) unstable; urgency=medium
+
+  * updated to 0.34 release
+
+ -- Matt Johnston <matt@ucc.asn.au>  Fri, 15 Aug 2003 15:10:00 +0800
+
+dropbear (0.33-1) unstable; urgency=medium
+
+  * updated to 0.33 release
+
+ -- Matt Johnston <matt@ucc.asn.au>  Sun, 22 Jun 2003 22:22:00 +0800
+
+dropbear (0.32cvs-1) unstable; urgency=medium
+
+  * now maintained in UCC CVS
+  * debian/copyright.in file added, generated from LICENSE
+
+ -- Grahame Bowland <grahame@angrygoats.net>  Tue, 21 Jun 2003 17:57:02 +0800
+
+dropbear (0.32cvs-1) unstable; urgency=medium
+
+  * sync with CVS
+  * fixes X crash bug
+
+ -- Grahame Bowland <grahame@angrygoats.net>  Tue, 20 Jun 2003 15:04:47 +0800
+
+dropbear (0.32-2) unstable; urgency=low
+
+  * fix creation of host keys to use correct names in /etc/dropbear
+  * init script "restart" function fixed
+  * purging this package now deletes the host keys and /etc/dropbear
+  * change priority in debian/control to 'standard'
+
+ -- Grahame Bowland <grahame@angrygoats.net>  Tue, 17 Jun 2003 15:04:47 +0800
+
+dropbear (0.32-1) unstable; urgency=low
+
+  * Initial Release.
+
+ -- Grahame Bowland <grahame@angrygoats.net>  Tue, 17 Jun 2003 15:04:47 +0800
+
diff --git a/debian/control b/debian/control
new file mode 100644
index 0000000..81835b3
--- /dev/null
+++ b/debian/control
@@ -0,0 +1,20 @@
+Source: dropbear
+Section: net
+Priority: optional
+Maintainer: Gerrit Pape <pape@smarden.org>
+Build-Depends: libz-dev
+Standards-Version: 3.6.2.1
+
+Package: dropbear
+Architecture: any
+Depends: ${shlibs:Depends}
+Suggests: ssh, runit
+Description: lightweight SSH2 server and client
+ dropbear is a SSH 2 server and client designed to be small enough to
+ be used in small memory environments, while still being functional and
+ secure enough for general use.
+ .
+ It implements most required features of the SSH 2 protocol, and other
+ features such as X11 and authentication agent forwarding.
+ .
+ See http://matt.ucc.asn.au/dropbear/dropbear.html
diff --git a/debian/copyright.in b/debian/copyright.in
new file mode 100644
index 0000000..79526d3
--- /dev/null
+++ b/debian/copyright.in
@@ -0,0 +1,11 @@
+This package was debianized by Grahame Bowland <grahame.angrygoats.net> on
+Tue, 17 Jun 2003 15:04:47 +0800, maintained temporarily by Matt Johnston
+<matt@ucc.asn.au>, and was adopted by Gerrit Pape <pape@smarden.org> on
+Sun, 16 May 2004 14:38:33 +0000.
+
+It was downloaded from http://matt.ucc.asn.au/dropbear/
+
+Upstream Author: Matt Johnston <matt@ucc.asn.au>
+
+Copyright: 
+
diff --git a/debian/dropbear.README.Debian b/debian/dropbear.README.Debian
new file mode 100644
index 0000000..8cdac38
--- /dev/null
+++ b/debian/dropbear.README.Debian
@@ -0,0 +1,41 @@
+Dropbear for Debian
+-------------------
+
+This package will attempt to listen on port 22. If the OpenSSH 
+package ("ssh") is installed, the file /etc/default/dropbear 
+will be set up so that the server does not start by default.
+
+You can run Dropbear concurrently with OpenSSH 'sshd' by 
+modifying /etc/default/dropbear so that "NO_START" is set to 
+"0" and changing the port number that Dropbear runs on. Follow 
+the instructions in the file.
+
+This package suggests you install the "ssh" package. This package 
+provides the "ssh" client program, as well as the "/usr/bin/scp" 
+binary you will need to be able to retrieve files from a server 
+running Dropbear via SCP.
+
+Replacing OpenSSH "sshd" with Dropbear
+--------------------------------------
+
+You will still want to have the "ssh" package installed, as it 
+provides the "ssh" and "scp" binaries. When you install this 
+package, it checks for existing OpenSSH host keys and if found, 
+converts them to the Dropbear format.
+
+If this appears to have worked, you should be able to change over 
+by following these steps:
+
+1. Stop the OpenSSH server
+   % /etc/init.d/ssh stop
+2. Prevent the OpenSSH server from starting in the future
+   % touch /etc/ssh/sshd_not_to_be_run
+3. Modify the Dropbear defaults file, set NO_START to 0 and 
+   ensure DROPBEAR_PORT is set to 22.
+   % editor /etc/default/dropbear
+4. Restart the Dropbear server.
+   % /etc/init.d/dropbear restart
+
+See the Dropbear homepage for more information:
+  http://matt.ucc.asn.au/dropbear/dropbear.html
+
diff --git a/debian/dropbear.conffiles b/debian/dropbear.conffiles
new file mode 100644
index 0000000..6919006
--- /dev/null
+++ b/debian/dropbear.conffiles
@@ -0,0 +1,3 @@
+/etc/init.d/dropbear
+/etc/dropbear/run
+/etc/dropbear/log/run
diff --git a/debian/dropbear.default b/debian/dropbear.default
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/debian/dropbear.default
diff --git a/debian/dropbear.docs b/debian/dropbear.docs
new file mode 100644
index 0000000..94fec74
--- /dev/null
+++ b/debian/dropbear.docs
@@ -0,0 +1,4 @@
+README
+TODO
+debian/README.runit
+debian/README.Debian.diet
diff --git a/debian/dropbear.init b/debian/dropbear.init
new file mode 100644
index 0000000..7979c8d
--- /dev/null
+++ b/debian/dropbear.init
@@ -0,0 +1,61 @@
+#!/bin/sh
+#
+# Do not configure this file. Edit /etc/default/dropbear instead!
+#
+
+PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
+DAEMON=/usr/sbin/dropbear
+NAME=dropbear
+DESC="Dropbear SSH server"
+
+DROPBEAR_PORT=22
+DROPBEAR_EXTRA_ARGS=
+NO_START=0
+
+set -e
+
+cancel() { echo "$1" >&2; exit 0; };
+test ! -r /etc/default/dropbear || . /etc/default/dropbear
+test -x "$DAEMON" || cancel "$DAEMON does not exist or is not executable."
+test ! -h /var/service/dropbear || \
+  cancel '/var/service/dropbear exists, service is controlled through runit.'
+
+test -z "$DROPBEAR_BANNER" || \
+  DROPBEAR_EXTRA_ARGS="$DROPBEAR_EXTRA_ARGS -b $DROPBEAR_BANNER"
+test -n "$DROPBEAR_RSAKEY" || \
+  DROPBEAR_RSAKEY="/etc/dropbear/dropbear_rsa_host_key"
+test -n "$DROPBEAR_DSSKEY" || \
+  DROPBEAR_DSSKEY="/etc/dropbear/dropbear_dss_host_key"
+
+case "$1" in
+  start)
+	test "$NO_START" = "0" || cancel 'NO_START is not set to zero.'
+	echo -n "Starting $DESC: "
+	start-stop-daemon --start --quiet --pidfile /var/run/"$NAME".pid \
+	  --exec "$DAEMON" -- -d "$DROPBEAR_DSSKEY" -r "$DROPBEAR_RSAKEY" \
+	    -p "$DROPBEAR_PORT" $DROPBEAR_EXTRA_ARGS
+	echo "$NAME."
+	;;
+  stop)
+	echo -n "Stopping $DESC: "
+	start-stop-daemon --stop --quiet --oknodo --pidfile /var/run/"$NAME".pid
+	echo "$NAME."
+	;;
+  restart|force-reload)
+	test "$NO_START" = "0" || cancel 'NO_START is not set to zero.'
+	echo -n "Restarting $DESC: "
+	start-stop-daemon --stop --quiet --oknodo --pidfile /var/run/"$NAME".pid
+	sleep 1
+	start-stop-daemon --start --quiet --pidfile /var/run/"$NAME".pid \
+	  --exec "$DAEMON" -- -d "$DROPBEAR_DSSKEY" -r "$DROPBEAR_RSAKEY" \
+	    -p "$DROPBEAR_PORT" $DROPBEAR_EXTRA_ARGS
+	echo "$NAME."
+	;;
+  *)
+	N=/etc/init.d/$NAME
+	echo "Usage: $N {start|stop|restart|force-reload}" >&2
+	exit 1
+	;;
+esac
+
+exit 0
diff --git a/debian/dropbear.postinst b/debian/dropbear.postinst
new file mode 100644
index 0000000..312eb05
--- /dev/null
+++ b/debian/dropbear.postinst
@@ -0,0 +1,67 @@
+#!/bin/sh
+set -e
+
+test "$1" = 'configure' || exit 0
+
+if test ! -e /etc/dropbear/dropbear_rsa_host_key; then
+  if test -f /etc/ssh/ssh_host_rsa_key; then
+    echo "Converting existing OpenSSH RSA host key to Dropbear format."
+    /usr/lib/dropbear/dropbearconvert openssh dropbear \
+      /etc/ssh/ssh_host_rsa_key /etc/dropbear/dropbear_rsa_host_key
+  else
+    echo "Generating Dropbear RSA key. Please wait."
+    dropbearkey -t rsa -f /etc/dropbear/dropbear_rsa_host_key
+  fi
+fi
+if test ! -e /etc/dropbear/dropbear_dss_host_key; then
+  if test -f /etc/ssh/ssh_host_dsa_key; then
+    echo "Converting existing OpenSSH RSA host key to Dropbear format."
+    /usr/lib/dropbear/dropbearconvert openssh dropbear \
+      /etc/ssh/ssh_host_dsa_key /etc/dropbear/dropbear_dss_host_key
+  else
+    echo "Generating Dropbear DSS key. Please wait."
+    dropbearkey -t dss -f /etc/dropbear/dropbear_dss_host_key
+  fi
+fi
+if test ! -s /etc/default/dropbear; then 
+  # check whether OpenSSH seems to be installed.
+  if test -x /usr/sbin/sshd; then
+    cat <<EOT
+OpenSSH appears to be installed.  Setting /etc/default/dropbear so that
+Dropbear will not start by default.  Edit this file to change this behaviour.
+
+EOT
+    cat >>/etc/default/dropbear <<EOT
+# disabled because OpenSSH is installed
+# change to NO_START=0 to enable Dropbear
+NO_START=1
+
+EOT
+  fi
+  cat >>/etc/default/dropbear <<EOT
+# the TCP port that Dropbear listens on
+DROPBEAR_PORT=22
+
+# any additional arguments for Dropbear
+DROPBEAR_EXTRA_ARGS=
+
+# specify an optional banner file containing a message to be
+# sent to clients before they connect, such as "/etc/issue.net"
+DROPBEAR_BANNER=""
+
+# RSA hostkey file (default: /etc/dropbear/dropbear_rsa_host_key)
+#DROPBEAR_RSAKEY="/etc/dropbear/dropbear_rsa_host_key"
+
+# DSS hostkey file (default: /etc/dropbear/dropbear_dss_host_key)
+#DROPBEAR_DSSKEY="/etc/dropbear/dropbear_dss_host_key"
+EOT
+fi
+
+if test -x /etc/init.d/dropbear; then
+  update-rc.d dropbear defaults >/dev/null
+  if test -x /usr/sbin/invoke-rc.d; then
+    invoke-rc.d dropbear restart
+  else
+    /etc/init.d/dropbear restart
+  fi
+fi
diff --git a/debian/dropbear.postrm b/debian/dropbear.postrm
new file mode 100644
index 0000000..d09dab0
--- /dev/null
+++ b/debian/dropbear.postrm
@@ -0,0 +1,12 @@
+#! /bin/sh
+set -e
+
+test "$1" = 'purge' || exit 0
+if test -e /etc/dropbear; then
+  rm -f /etc/dropbear/dropbear_rsa_host_key
+  rm -f /etc/dropbear/dropbear_dss_host_key
+  rmdir --ignore-fail-on-non-empty /etc/dropbear
+fi
+update-rc.d dropbear remove >/dev/null
+rm -f /etc/default/dropbear
+rm -rf /etc/dropbear/supervise /etc/dropbear/log/supervise
diff --git a/debian/dropbear.prerm b/debian/dropbear.prerm
new file mode 100644
index 0000000..e63cdb8
--- /dev/null
+++ b/debian/dropbear.prerm
@@ -0,0 +1,11 @@
+#!/bin/sh
+set -e
+
+test "$1" = 'remove' || test "$1" = 'deconfigure' || exit 0
+if test -x /etc/init.d/dropbear; then
+  if test -x /usr/sbin/invoke-rc.d; then
+    invoke-rc.d dropbear stop
+  else
+    /etc/init.d/dropbear stop
+  fi
+fi
diff --git a/debian/implicit b/debian/implicit
new file mode 100644
index 0000000..57a444a
--- /dev/null
+++ b/debian/implicit
@@ -0,0 +1,85 @@
+# $Id: implicit,v 1.10 2004/07/03 15:20:00 pape Exp $
+
+.PHONY: deb-checkdir deb-checkuid
+
+deb-checkdir:
+	@test -e debian/control || sh -cx '! : wrong directory'
+deb-checkuid:
+	@test "`id -u`" -eq 0 || sh -cx '! : need root privileges'
+
+%.deb: %.deb-docs %.deb-DEBIAN
+	@rm -f $*.deb $*.deb-checkdir $*.deb-docs $*.deb-docs-base \
+	  $*.deb-docs-docs $*.deb-docs-examples $*.deb-DEBIAN \
+	  $*.deb-DEBIAN-dir $*.deb-DEBIAN-scripts $*.deb-DEBIAN-md5sums
+
+%.udeb: %.deb-DEBIAN
+	@rm -f $*.deb $*.deb-checkdir $*.deb-DEBIAN $*.deb-DEBIAN-dir \
+	  $*.deb-DEBIAN-scripts $*.deb-DEBIAN-md5sums
+
+%.deb-checkdir:
+	@test -d debian/$* || sh -cx '! : directory debian/$* missing'
+	@test "`id -u`" -eq 0 || sh -cx '! : need root privileges'
+
+%.deb-docs-base:
+	: implicit
+	@rm -f debian/$*/usr/share/doc/$*/* || :
+	@install -d -m0755 debian/$*/usr/share/doc/$*
+	: debian/$*/usr/share/doc/$*/
+	@sh -cx 'install -m0644 debian/copyright debian/$*/usr/share/doc/$*/'
+	@sh -cx 'install -m0644 debian/changelog \
+	  debian/$*/usr/share/doc/$*/changelog.Debian'
+	@test ! -r changelog || \
+	  sh -cx 'install -m0644 changelog debian/$*/usr/share/doc/$*/'
+	@test -r debian/$*/usr/share/doc/$*/changelog || \
+	  sh -cx 'mv debian/$*/usr/share/doc/$*/changelog.Debian \
+	    debian/$*/usr/share/doc/$*/changelog'
+	@test -s debian/$*/usr/share/doc/$*/changelog || \
+	  sh -cx 'rm -f debian/$*/usr/share/doc/$*/changelog'
+	@gzip -9 debian/$*/usr/share/doc/$*/changelog*
+%.deb-docs-docs:
+	@for i in `cat debian/$*.docs 2>/dev/null || :`; do \
+	  sh -cx "install -m0644 $$i debian/$*/usr/share/doc/$*/" || exit 1; \
+	done
+	@test ! -r debian/$*.README.Debian || \
+	  sh -cx 'install -m0644 debian/$*.README.Debian \
+	    debian/$*/usr/share/doc/$*/README.Debian'
+	@if test -r debian/$*.NEWS.Debian; then \
+	  sh -cx 'install -m0644 debian/$*.NEWS.Debian \
+	    debian/$*/usr/share/doc/$*/NEWS.Debian && \
+	      gzip -9 debian/$*/usr/share/doc/$*/NEWS.Debian'; \
+	fi
+%.deb-docs-examples:
+	@rm -rf debian/$*/usr/share/doc/$*/examples
+	: debian/$*/usr/share/doc/$*/examples/
+	@test ! -r debian/$*.examples || \
+	  install -d -m0755 debian/$*/usr/share/doc/$*/examples
+	@for i in `cat debian/$*.examples 2>/dev/null || :`; do \
+	  sh -cx "install -m0644 $$i debian/$*/usr/share/doc/$*/examples/" \
+	    || exit 1; \
+	done
+%.deb-docs: %.deb-checkdir %.deb-docs-base %.deb-docs-docs %.deb-docs-examples
+	: debian/$*/usr/share/doc/$*/ ok
+
+%.deb-DEBIAN-base:
+	@rm -rf debian/$*/DEBIAN
+	: debian/$*/DEBIAN/
+	@install -d -m0755 debian/$*/DEBIAN
+	@for i in conffiles shlibs templates; do \
+	  test ! -r debian/$*.$$i || \
+	    sh -cx "install -m0644 debian/$*.$$i debian/$*/DEBIAN/$$i" \
+	      || exit 1; \
+	done
+%.deb-DEBIAN-scripts:
+	@for i in preinst prerm postinst postrm config; do \
+	  test ! -r debian/$*.$$i || \
+	    sh -cx "install -m0755 debian/$*.$$i debian/$*/DEBIAN/$$i" \
+	      || exit 1; \
+	done
+%.deb-DEBIAN-md5sums:
+	: debian/$*/DEBIAN/md5sums
+	@rm -f debian/$*/DEBIAN/md5sums
+	@cd debian/$* && find * -path 'DEBIAN' -prune -o \
+	  -type f -exec md5sum {} >>DEBIAN/md5sums \;
+%.deb-DEBIAN: %.deb-checkdir %.deb-DEBIAN-base %.deb-DEBIAN-scripts \
+	  %.deb-DEBIAN-md5sums
+	: debian/$*/DEBIAN/ ok
diff --git a/debian/rules b/debian/rules
new file mode 100755
index 0000000..52c3ea8
--- /dev/null
+++ b/debian/rules
@@ -0,0 +1,107 @@
+#!/usr/bin/make -f
+
+#export DH_OPTIONS
+DEB_HOST_GNU_TYPE ?=$(shell dpkg-architecture -qDEB_HOST_GNU_TYPE)
+DEB_BUILD_GNU_TYPE ?=$(shell dpkg-architecture -qDEB_BUILD_GNU_TYPE)
+
+STRIP =strip
+ifneq (,$(findstring nostrip,$(DEB_BUILD_OPTIONS)))
+  STRIP =: nostrip
+endif
+
+CFLAGS =-Wall -g
+ifneq (,$(findstring noopt,$(DEB_BUILD_OPTIONS)))
+  CFLAGS +=-O0
+else
+  CFLAGS +=-O2
+endif
+
+CONFFLAGS =
+CC =gcc
+ifneq (,$(findstring diet,$(DEB_BUILD_OPTIONS)))
+  CONFFLAGS =--disable-zlib
+  CC =diet -v -Os gcc -nostdinc
+endif
+
+DIR =$(shell pwd)/debian/dropbear
+
+patch: deb-checkdir patch-stamp
+patch-stamp:
+	for i in `ls -1 debian/diff/*.diff || :`; do \
+	  patch -p0 <$$i || exit 1; \
+	done
+	touch patch-stamp
+
+config.status: patch-stamp configure
+	CC='$(CC)' \
+	CFLAGS='$(CFLAGS)'' -DSFTPSERVER_PATH="\"/usr/lib/sftp-server\""' \
+	  ./configure --host='$(DEB_HOST_GNU_TYPE)' \
+	    --build='$(DEB_BUILD_GNU_TYPE)' --prefix=/usr \
+	    --mandir=\$${prefix}/share/man --infodir=\$${prefix}/share/info \
+	    $(CONFFLAGS)
+
+build: deb-checkdir build-stamp
+build-stamp: config.status
+	$(MAKE) CC='$(CC)' LD='$(CC)'
+	touch build-stamp
+
+clean: deb-checkdir deb-checkuid
+	-$(MAKE) distclean
+	test ! -e patch-stamp || \
+	  for i in `ls -1r debian/diff/*.diff || :`; do \
+	    patch -p0 -R <$$i; \
+	  done
+	rm -f patch-stamp build-stamp config.log config.status
+	rm -rf '$(DIR)'
+	rm -f debian/files debian/substvars debian/copyright changelog
+
+install: deb-checkdir deb-checkuid build-stamp
+	rm -rf '$(DIR)'
+	install -d -m0755 '$(DIR)'/etc/dropbear
+	# programs
+	install -d -m0755 '$(DIR)'/usr/sbin
+	install -m0755 dropbear '$(DIR)'/usr/sbin/dropbear
+	install -d -m0755 '$(DIR)'/usr/bin
+	install -m0755 dbclient '$(DIR)'/usr/bin/dbclient
+	install -m0755 dropbearkey '$(DIR)'/usr/bin/dropbearkey
+	install -d -m0755 '$(DIR)'/usr/lib/dropbear
+	install -m0755 dropbearconvert \
+	  '$(DIR)'/usr/lib/dropbear/dropbearconvert
+	$(STRIP) -R .comment -R .note '$(DIR)'/usr/sbin/* \
+	  '$(DIR)'/usr/bin/* '$(DIR)'/usr/lib/dropbear/*
+	# init and run scripts
+	install -d -m0755 '$(DIR)'/etc/init.d
+	install -m0755 debian/dropbear.init '$(DIR)'/etc/init.d/dropbear
+	install -m0755 debian/service/run '$(DIR)'/etc/dropbear/run
+	install -d -m0755 '$(DIR)'/etc/dropbear/log
+	install -m0755 debian/service/log '$(DIR)'/etc/dropbear/log/run
+	ln -s /var/log/dropbear '$(DIR)'/etc/dropbear/log/main
+	ln -s /var/run/dropbear '$(DIR)'/etc/dropbear/supervise
+	ln -s /var/run/dropbear.log '$(DIR)'/etc/dropbear/log/supervise
+	# man pages
+	install -d -m0755 '$(DIR)'/usr/share/man/man8
+	for i in dropbear.8 dropbearkey.8; do \
+	  install -m644 $$i '$(DIR)'/usr/share/man/man8/ || exit 1; \
+	done
+	gzip -9 '$(DIR)'/usr/share/man/man8/*.8
+	install -d -m0755 '$(DIR)'/usr/share/man/man1
+	install -m644 dbclient.1 '$(DIR)'/usr/share/man/man1/
+	gzip -9 '$(DIR)'/usr/share/man/man1/*.1
+	# copyright, changelog
+	cat debian/copyright.in LICENSE >debian/copyright
+	test -r changelog || ln -s CHANGES changelog
+
+binary-indep:
+
+binary-arch: install dropbear.deb
+	test '$(CC)' != 'gcc' || \
+	  dpkg-shlibdeps '$(DIR)'/usr/sbin/* '$(DIR)'/usr/bin/* \
+	    '$(DIR)'/usr/lib/dropbear/*
+	dpkg-gencontrol -isp -pdropbear -P'$(DIR)'
+	dpkg -b '$(DIR)' ..
+
+binary: binary-arch binary-indep
+
+.PHONY: patch build clean install binary-indep binary-arch binary
+
+include debian/implicit
diff --git a/debian/service/log b/debian/service/log
new file mode 100644
index 0000000..2ffb13d
--- /dev/null
+++ b/debian/service/log
@@ -0,0 +1,2 @@
+#!/bin/sh
+exec chpst -udropbearlog svlogd -tt ./main
diff --git a/debian/service/run b/debian/service/run
new file mode 100644
index 0000000..f208085
--- /dev/null
+++ b/debian/service/run
@@ -0,0 +1,3 @@
+#!/bin/sh
+exec 2>&1
+exec dropbear -d ./dropbear_dss_host_key -r ./dropbear_rsa_host_key -F -E -p 22
diff --git a/debug.h b/debug.h
new file mode 100644
index 0000000..93cb891
--- /dev/null
+++ b/debug.h
@@ -0,0 +1,74 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _DEBUG_H_
+#define _DEBUG_H_
+
+#include "includes.h"
+
+/* Debugging */
+
+/* Work well for valgrind - don't clear environment, be nicer with signals
+ * etc. Don't use this normally, it might cause problems */
+/* #define DEBUG_VALGRIND */
+
+/* Define this to compile in trace debugging printf()s. 
+ * You'll need to run programs with "-v" to turn this on.
+ *
+ * Caution: Don't use this in an unfriendly environment (ie unfirewalled),
+ * since the printing may not sanitise strings etc. This will add a reasonable
+ * amount to your executable size. */
+/*#define DEBUG_TRACE */
+
+/* All functions writing to the cleartext payload buffer call
+ * CHECKCLEARTOWRITE() before writing. This is only really useful if you're
+ * attempting to track down a problem */
+#define CHECKCLEARTOWRITE() assert(ses.writepayload->len == 0 && \
+		ses.writepayload->pos == 0)
+
+/* Define this, compile with -pg and set GMON_OUT_PREFIX=gmon to get gmon
+ * output when Dropbear forks. This will allow it gprof to be used.
+ * It's useful to run dropbear -F, so you don't fork as much */
+/* (This is Linux specific) */
+/*#define DEBUG_FORKGPROF*/
+
+/* A couple of flags, not usually useful, and mightn't do anything */
+
+/*#define DEBUG_KEXHASH*/
+/*#define DEBUG_RSA*/
+
+/* you don't need to touch this block */
+#ifdef DEBUG_TRACE
+#define TRACE(X) dropbear_trace X;
+#else /*DEBUG_TRACE*/
+#define TRACE(X)
+#endif /*DEBUG_TRACE*/
+
+/* For testing as non-root on shadowed systems, include the crypt of a password
+ * here. You can then log in as any user with this password. Ensure that you
+ * make your own password, and are careful about using this. This will also
+ * disable some of the chown pty code etc*/
+/* #define DEBUG_HACKCRYPT "hL8nrFDt0aJ3E" */ /* this is crypt("password") */
+
+#endif
diff --git a/dropbear.8 b/dropbear.8
new file mode 100644
index 0000000..38cf7e2
--- /dev/null
+++ b/dropbear.8
@@ -0,0 +1,84 @@
+.TH dropbear 8
+.SH NAME
+dropbear \- lightweight SSH2 server
+.SH SYNOPSIS
+.B dropbear
+[\-FEmwsgjki] [\-b
+.I banner\fR] [\-d
+.I dsskey\fR] [\-r
+.I rsakey\fR] [\-p
+.IR port ]
+.SH DESCRIPTION
+.B dropbear
+is a SSH 2 server designed to be small enough to be used in small memory
+environments, while still being functional and secure enough for general use.
+.SH OPTIONS
+.TP
+.B \-b \fIbanner
+bannerfile.
+Display the contents of the file
+.I banner
+before user login (default: none).
+.TP
+.B \-d \fIdsskey
+dsskeyfile.
+Use the contents of the file
+.I dsskey
+for the dss host key (default: /etc/dropbear/dropbear_dss_host_key).
+This file is generated with
+.BR dropbearkey (8).
+.TP
+.B \-r \fIrsakey
+rsakeyfile.
+Use the contents of the file
+.I rsakey
+for the rsa host key (default: /etc/dropbear/dropbear_rsa_host_key).
+This file is generated with
+.BR dropbearkey (8).
+.TP
+.B \-F
+Don't fork into background.
+.TP
+.B \-E
+Log to standard error rather than syslog.
+.TP
+.B \-m
+Don't display the message of the day on login.
+.TP
+.B \-w
+Disallow root logins.
+.TP
+.B \-s
+Disable password logins.
+.TP
+.B \-g
+Disable password logins for root.
+.TP
+.B \-j
+Disable local port forwarding.
+.TP
+.B \-k
+Disable remote port forwarding.
+.TP
+.B \-p \fIport
+Listen on specified tcp port
+.IR port ;
+up to 10 can be specified (default 22 if none specified).
+.TP
+.B \-i
+Service program mode.
+Use this option to run
+.B dropbear
+under TCP/IP servers like inetd, tcpsvd, or tcpserver.
+In program mode the \-F option is implied, and \-p options are ignored.
+.TP
+.B \-a
+Allow remote hosts to connect to forwarded ports.
+.SH AUTHOR
+Matt Johnston (matt@ucc.asn.au).
+.br
+Gerrit Pape (pape@smarden.org) wrote this manual page.
+.SH SEE ALSO
+dropbearkey(8), dbclient(1)
+.P
+http://matt.ucc.asn.au/dropbear/dropbear.html
diff --git a/dropbearconvert.c b/dropbearconvert.c
new file mode 100644
index 0000000..9e16fe7
--- /dev/null
+++ b/dropbearconvert.c
@@ -0,0 +1,149 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+/* This program converts to/from Dropbear and OpenSSH private-key formats */
+#include "includes.h"
+#include "signkey.h"
+#include "buffer.h"
+#include "dbutil.h"
+#include "keyimport.h"
+
+
+static int do_convert(int intype, const char* infile, int outtype,
+		const char* outfile);
+
+static void printhelp(char * progname);
+
+static void printhelp(char * progname) {
+
+	fprintf(stderr, "Usage: %s <inputtype> <outputtype> <inputfile> <outputfile>\n\n"
+					"CAUTION: This program is for convenience only, and is not secure if used on\n"
+					"untrusted input files, ie it could allow arbitrary code execution.\n"
+					"All parameters must be specified in order.\n"
+					"\n"
+					"The input and output types are one of:\n"
+					"openssh\n"
+					"dropbear\n"
+					"\n"
+					"Example:\n"
+					"dropbearconvert openssh dropbear /etc/ssh/ssh_host_rsa_key /etc/dropbear_rsa_host_key\n",
+					progname);
+}
+
+#if defined(DBMULTI_dropbearconvert) || !defined(DROPBEAR_MULTI)
+#if defined(DBMULTI_dropbearconvert) && defined(DROPBEAR_MULTI)
+int dropbearconvert_main(int argc, char ** argv) {
+#else 
+int main(int argc, char ** argv) {
+#endif
+
+	int intype, outtype;
+	const char* infile;
+	const char* outfile;
+
+#ifdef DEBUG_TRACE
+	/* It's hard for it to get in the way _too_ much */
+	debug_trace = 1;
+#endif
+
+	/* get the commandline options */
+	if (argc != 5) {
+		fprintf(stderr, "All arguments must be specified\n");
+		goto usage;
+	}
+
+	/* input type */
+	if (argv[1][0] == 'd') {
+		intype = KEYFILE_DROPBEAR;
+	} else if (argv[1][0] == 'o') {
+		intype = KEYFILE_OPENSSH;
+	} else {
+		fprintf(stderr, "Invalid input key type\n");
+		goto usage;
+	}
+
+	/* output type */
+	if (argv[2][0] == 'd') {
+		outtype = KEYFILE_DROPBEAR;
+	} else if (argv[2][0] == 'o') {
+		outtype = KEYFILE_OPENSSH;
+	} else {
+		fprintf(stderr, "Invalid output key type\n");
+		goto usage;
+	}
+
+	/* we don't want output readable by others */
+	umask(077);
+
+	infile = argv[3];
+	outfile = argv[4];
+
+	return do_convert(intype, infile, outtype, outfile);
+
+usage:
+	printhelp(argv[0]);
+	return 1;
+}
+#endif
+
+static int do_convert(int intype, const char* infile, int outtype,
+		const char* outfile) {
+
+	sign_key * key = NULL;
+	char * keytype = NULL;
+	int ret = 1;
+
+	key = import_read(infile, NULL, intype);
+	if (!key) {
+		fprintf(stderr, "Error reading key from '%s'\n",
+				infile);
+		goto out;
+	}
+
+#ifdef DROPBEAR_RSA
+	if (key->rsakey != NULL) {
+		keytype = "RSA";
+	}
+#endif
+#ifdef DROPBEAR_DSS
+	if (key->dsskey != NULL) {
+		keytype = "DSS";
+	}
+#endif
+
+	fprintf(stderr, "Key is a %s key\n", keytype);
+
+	if (import_write(outfile, key, NULL, outtype) != 1) {
+		fprintf(stderr, "Error writing key to '%s'\n", outfile);
+	} else {
+		fprintf(stderr, "Wrote key to '%s'\n", outfile);
+		ret = 0;
+	}
+
+out:
+	if (key) {
+		sign_key_free(key);
+	}
+	return ret;
+}
diff --git a/dropbearkey.8 b/dropbearkey.8
new file mode 100644
index 0000000..a093d85
--- /dev/null
+++ b/dropbearkey.8
@@ -0,0 +1,47 @@
+.TH dropbearkey 8
+.SH NAME
+dropbearkey \- create private keys for the use with dropbear(8)
+.SH SYNOPSIS
+.B dropbearkey
+\-t
+.I type
+\-f
+.I file
+[\-s
+.IR bits ]
+.SH DESCRIPTION
+.B dropbearkey
+generates a type
+.I rsa
+or
+.I dss
+SSH private key, and saves it to a file for the use with the
+.BR dropbear (8)
+SSH 2 server.
+.SH OPTIONS
+.TP
+.B \-t \fItype
+Type of key to generate.
+Must be one of
+.I rsa
+or
+.IR dss .
+.TP
+.B \-f \fIfile
+Write the secret key to the file
+.IR file .
+.TP
+.B \-s \fIbits
+Set the key size to
+.I bits
+bits, should be multiple of 8 (optional).
+.SH EXAMPLE
+ # dropbearkey -t rsa -f /etc/dropbear/dropbear_rsa_host_key
+.SH AUTHOR
+Matt Johnston (matt@ucc.asn.au).
+.br
+Gerrit Pape (pape@smarden.org) wrote this manual page.
+.SH SEE ALSO
+dropbear(8), dbclient(1)
+.P
+http://matt.ucc.asn.au/dropbear/dropbear.html
diff --git a/dropbearkey.c b/dropbearkey.c
new file mode 100644
index 0000000..8ceefdc
--- /dev/null
+++ b/dropbearkey.c
@@ -0,0 +1,355 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+/* The format of the keyfiles is basically a raw dump of the buffer. Data types
+ * are specified in the transport draft - string is a 32-bit len then the
+ * non-null-terminated string, mp_int is a 32-bit len then the bignum data.
+ * The actual functions are buf_put_rsa_priv_key() and buf_put_dss_priv_key()
+
+ * RSA:
+ * string	"ssh-rsa"
+ * mp_int	e
+ * mp_int	n
+ * mp_int	d
+ * mp_int	p (newer versions only)
+ * mp_int	q (newer versions only) 
+ *
+ * DSS:
+ * string	"ssh-dss"
+ * mp_int	p
+ * mp_int	q
+ * mp_int	g
+ * mp_int	y
+ * mp_int	x
+ *
+ */
+#include "includes.h"
+#include "signkey.h"
+#include "buffer.h"
+#include "dbutil.h"
+
+#include "genrsa.h"
+#include "gendss.h"
+
+static void printhelp(char * progname);
+
+#define RSA_SIZE (1024/8) /* 1024 bit */
+#define DSS_SIZE (1024/8) /* 1024 bit */
+
+static void buf_writefile(buffer * buf, const char * filename);
+static void printpubkey(sign_key * key, int keytype);
+static void justprintpub(const char* filename);
+
+/* Print a help message */
+static void printhelp(char * progname) {
+
+	fprintf(stderr, "Usage: %s -t <type> -f <filename> [-s bits]\n"
+					"Options are:\n"
+					"-t type	Type of key to generate. One of:\n"
+#ifdef DROPBEAR_RSA
+					"		rsa\n"
+#endif
+#ifdef DROPBEAR_DSS
+					"		dss\n"
+#endif
+					"-f filename	Use filename for the secret key\n"
+					"-s bits	Key size in bits, should be a multiple of 8 (optional)\n"
+					"-y		Just print the publickey and fingerprint for the\n		private key in <filename>.\n"
+#ifdef DEBUG_TRACE
+					"-v		verbose\n"
+#endif
+					,progname);
+}
+
+#if defined(DBMULTI_dropbearkey) || !defined(DROPBEAR_MULTI)
+#if defined(DBMULTI_dropbearkey) && defined(DROPBEAR_MULTI)
+int dropbearkey_main(int argc, char ** argv) {
+#else
+int main(int argc, char ** argv) {
+#endif
+
+	int i;
+	char ** next = 0;
+	sign_key *key = NULL;
+	buffer *buf = NULL;
+	char * filename = NULL;
+	int keytype = -1;
+	char * typetext = NULL;
+	char * sizetext = NULL;
+	unsigned int bits;
+	unsigned int keysize;
+	int printpub = 0;
+
+	/* get the commandline options */
+	for (i = 1; i < argc; i++) {
+		if (argv[i] == NULL) {
+			continue; /* Whack */
+		} 
+		if (next) {
+			*next = argv[i];
+			next = NULL;
+			continue;
+		}
+
+		if (argv[i][0] == '-') {
+			switch (argv[i][1]) {
+				case 'f':
+					next = &filename;
+					break;
+				case 't':
+					next = &typetext;
+					break;
+				case 's':
+					next = &sizetext;
+					break;
+				case 'y':
+					printpub = 1;
+					break;
+				case 'h':
+					printhelp(argv[0]);
+					exit(EXIT_SUCCESS);
+					break;
+#ifdef DEBUG_TRACE
+				case 'v':
+					debug_trace = 1;
+					break;
+#endif
+				default:
+					fprintf(stderr, "Unknown argument %s\n", argv[i]);
+					printhelp(argv[0]);
+					exit(EXIT_FAILURE);
+					break;
+			}
+		}
+	}
+
+	if (!filename) {
+		fprintf(stderr, "Must specify a key filename\n");
+		printhelp(argv[0]);
+		exit(EXIT_FAILURE);
+	}
+
+	if (printpub) {
+		justprintpub(filename);
+		/* Not reached */
+	}
+
+	/* check/parse args */
+	if (!typetext) {
+		fprintf(stderr, "Must specify key type\n");
+		printhelp(argv[0]);
+		exit(EXIT_FAILURE);
+	}
+
+	if (strlen(typetext) == 3) {
+#ifdef DROPBEAR_RSA
+		if (strncmp(typetext, "rsa", 3) == 0) {
+			keytype = DROPBEAR_SIGNKEY_RSA;
+			TRACE(("type is rsa"))
+		}
+#endif
+#ifdef DROPBEAR_DSS
+		if (strncmp(typetext, "dss", 3) == 0) {
+			keytype = DROPBEAR_SIGNKEY_DSS;
+			TRACE(("type is dss"))
+		}
+#endif
+	}
+	if (keytype == -1) {
+		fprintf(stderr, "Unknown key type '%s'\n", typetext);
+		printhelp(argv[0]);
+		exit(EXIT_FAILURE);
+	}
+
+	if (sizetext) {
+		if (sscanf(sizetext, "%u", &bits) != 1) {
+			fprintf(stderr, "Bits must be an integer\n");
+			exit(EXIT_FAILURE);
+		}
+	
+		if (bits < 512 || bits > 4096 || (bits % 8 != 0)) {
+			fprintf(stderr, "Bits must satisfy 512 <= bits <= 4096, and be a"
+					" multiple of 8\n");
+			exit(EXIT_FAILURE);
+		}
+
+		keysize = bits / 8;
+	} else {
+		if (keytype == DROPBEAR_SIGNKEY_DSS) {
+			keysize = DSS_SIZE;
+		} else if (keytype == DROPBEAR_SIGNKEY_RSA) {
+			keysize = RSA_SIZE;
+		} else {
+			exit(EXIT_FAILURE); /* not reached */
+		}
+	}
+
+
+	fprintf(stderr, "Will output %d bit %s secret key to '%s'\n", keysize*8,
+			typetext, filename);
+
+	/* don't want the file readable by others */
+	umask(077);
+
+	/* now we can generate the key */
+	key = new_sign_key();
+	
+	fprintf(stderr, "Generating key, this may take a while...\n");
+	switch(keytype) {
+#ifdef DROPBEAR_RSA
+		case DROPBEAR_SIGNKEY_RSA:
+			key->rsakey = gen_rsa_priv_key(keysize); /* 128 bytes = 1024 bit */
+			break;
+#endif
+#ifdef DROPBEAR_DSS
+		case DROPBEAR_SIGNKEY_DSS:
+			key->dsskey = gen_dss_priv_key(keysize); /* 128 bytes = 1024 bit */
+			break;
+#endif
+		default:
+			fprintf(stderr, "Internal error, bad key type\n");
+			exit(EXIT_FAILURE);
+	}
+
+	buf = buf_new(MAX_PRIVKEY_SIZE); 
+
+	buf_put_priv_key(buf, key, keytype);
+	buf_setpos(buf, 0);
+	buf_writefile(buf, filename);
+
+	buf_burn(buf);
+	buf_free(buf);
+
+	printpubkey(key, keytype);
+
+	sign_key_free(key);
+
+	return EXIT_SUCCESS;
+}
+#endif
+
+static void justprintpub(const char* filename) {
+
+	buffer *buf = NULL;
+	sign_key *key = NULL;
+	int keytype;
+	int ret;
+	int err = DROPBEAR_FAILURE;
+
+	buf = buf_new(MAX_PRIVKEY_SIZE);
+	ret = buf_readfile(buf, filename);
+
+	if (ret != DROPBEAR_SUCCESS) {
+		fprintf(stderr, "Failed reading '%s'\n", filename);
+		goto out;
+	}
+
+	key = new_sign_key();
+	keytype = DROPBEAR_SIGNKEY_ANY;
+
+	buf_setpos(buf, 0);
+	ret = buf_get_priv_key(buf, key, &keytype);
+	if (ret == DROPBEAR_FAILURE) {
+		fprintf(stderr, "Bad key in '%s'\n", filename);
+		goto out;
+	}
+
+	printpubkey(key, keytype);
+
+	err = DROPBEAR_SUCCESS;
+
+out:
+	buf_burn(buf);
+	buf_free(buf);
+	buf = NULL;
+	sign_key_free(key);
+	key = NULL;
+	exit(err);
+}
+
+static void printpubkey(sign_key * key, int keytype) {
+
+	buffer * buf = NULL;
+	unsigned char base64key[MAX_PUBKEY_SIZE*2];
+	unsigned long base64len;
+	int err;
+	const char * typestring = NULL;
+	char *fp = NULL;
+	int len;
+
+	buf = buf_new(MAX_PUBKEY_SIZE);
+	buf_put_pub_key(buf, key, keytype);
+	buf_setpos(buf, 4);
+
+	len = buf->len - buf->pos;
+
+	base64len = sizeof(base64key);
+	err = base64_encode(buf_getptr(buf, len), len, base64key, &base64len);
+
+	if (err != CRYPT_OK) {
+		fprintf(stderr, "base64 failed");
+	}
+
+	typestring = signkey_name_from_type(keytype, &err);
+
+	fp = sign_key_fingerprint(buf_getptr(buf, len), len);
+
+	printf("Public key portion is:\n%s %s\nFingerprint: %s\n",
+			typestring, base64key, fp);
+
+	m_free(fp);
+	buf_free(buf);
+}
+
+/* Write a buffer to a file specified, failing if the file exists */
+static void buf_writefile(buffer * buf, const char * filename) {
+
+	int fd;
+	int len;
+
+	fd = open(filename, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR);
+	if (fd < 0) {
+		fprintf(stderr, "Couldn't create new file %s\n", filename);
+		perror("Reason");
+		buf_burn(buf);
+		exit(EXIT_FAILURE);
+	}
+
+	/* write the file now */
+	while (buf->pos != buf->len) {
+		len = write(fd, buf_getptr(buf, buf->len - buf->pos),
+				buf->len - buf->pos);
+		if (errno == EINTR) {
+			continue;
+		}
+		if (len <= 0) {
+			fprintf(stderr, "Failed writing file '%s'\n",filename);
+			perror("Reason");
+			exit(EXIT_FAILURE);
+		}
+		buf_incrpos(buf, len);
+	}
+
+	close(fd);
+}
diff --git a/dss.c b/dss.c
new file mode 100644
index 0000000..84a093c
--- /dev/null
+++ b/dss.c
@@ -0,0 +1,416 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "dbutil.h"
+#include "bignum.h"
+#include "dss.h"
+#include "buffer.h"
+#include "ssh.h"
+#include "random.h"
+
+/* Handle DSS (Digital Signature Standard), aka DSA (D.S. Algorithm),
+ * operations, such as key reading, signing, verification. Key generation
+ * is in gendss.c, since it isn't required in the server itself.
+ *
+ * See FIPS186 or the Handbook of Applied Cryptography for details of the
+ * algorithm */
+
+#ifdef DROPBEAR_DSS 
+
+/* Load a dss key from a buffer, initialising the values.
+ * The key will have the same format as buf_put_dss_key.
+ * These should be freed with dss_key_free.
+ * Returns DROPBEAR_SUCCESS or DROPBEAR_FAILURE */
+int buf_get_dss_pub_key(buffer* buf, dss_key *key) {
+
+	TRACE(("enter buf_get_dss_pub_key"))
+	dropbear_assert(key != NULL);
+	key->p = m_malloc(sizeof(mp_int));
+	key->q = m_malloc(sizeof(mp_int));
+	key->g = m_malloc(sizeof(mp_int));
+	key->y = m_malloc(sizeof(mp_int));
+	m_mp_init_multi(key->p, key->q, key->g, key->y, NULL);
+	key->x = NULL;
+
+	buf_incrpos(buf, 4+SSH_SIGNKEY_DSS_LEN); /* int + "ssh-dss" */
+	if (buf_getmpint(buf, key->p) == DROPBEAR_FAILURE
+	 || buf_getmpint(buf, key->q) == DROPBEAR_FAILURE
+	 || buf_getmpint(buf, key->g) == DROPBEAR_FAILURE
+	 || buf_getmpint(buf, key->y) == DROPBEAR_FAILURE) {
+		TRACE(("leave buf_get_dss_pub_key: failed reading mpints"))
+		return DROPBEAR_FAILURE;
+	}
+
+	if (mp_count_bits(key->p) < MIN_DSS_KEYLEN) {
+		dropbear_log(LOG_WARNING, "DSS key too short");
+		TRACE(("leave buf_get_dss_pub_key: short key"))
+		return DROPBEAR_FAILURE;
+	}
+
+	TRACE(("leave buf_get_dss_pub_key: success"))
+	return DROPBEAR_SUCCESS;
+}
+
+/* Same as buf_get_dss_pub_key, but reads a private "x" key at the end.
+ * Loads a private dss key from a buffer
+ * Returns DROPBEAR_SUCCESS or DROPBEAR_FAILURE */
+int buf_get_dss_priv_key(buffer* buf, dss_key *key) {
+
+	int ret = DROPBEAR_FAILURE;
+
+	dropbear_assert(key != NULL);
+
+	ret = buf_get_dss_pub_key(buf, key);
+	if (ret == DROPBEAR_FAILURE) {
+		return DROPBEAR_FAILURE;
+	}
+
+	key->x = m_malloc(sizeof(mp_int));
+	m_mp_init(key->x);
+	ret = buf_getmpint(buf, key->x);
+
+	return ret;
+}
+	
+
+/* Clear and free the memory used by a public or private key */
+void dss_key_free(dss_key *key) {
+
+	TRACE(("enter dsa_key_free"))
+	if (key == NULL) {
+		TRACE(("enter dsa_key_free: key == NULL"))
+		return;
+	}
+	if (key->p) {
+		mp_clear(key->p);
+		m_free(key->p);
+	}
+	if (key->q) {
+		mp_clear(key->q);
+		m_free(key->q);
+	}
+	if (key->g) {
+		mp_clear(key->g);
+		m_free(key->g);
+	}
+	if (key->y) {
+		mp_clear(key->y);
+		m_free(key->y);
+	}
+	if (key->x) {
+		mp_clear(key->x);
+		m_free(key->x);
+	}
+	m_free(key);
+	TRACE(("leave dsa_key_free"))
+}
+
+/* put the dss public key into the buffer in the required format:
+ *
+ * string	"ssh-dss"
+ * mpint	p
+ * mpint	q
+ * mpint	g
+ * mpint	y
+ */
+void buf_put_dss_pub_key(buffer* buf, dss_key *key) {
+
+	dropbear_assert(key != NULL);
+	buf_putstring(buf, SSH_SIGNKEY_DSS, SSH_SIGNKEY_DSS_LEN);
+	buf_putmpint(buf, key->p);
+	buf_putmpint(buf, key->q);
+	buf_putmpint(buf, key->g);
+	buf_putmpint(buf, key->y);
+
+}
+
+/* Same as buf_put_dss_pub_key, but with the private "x" key appended */
+void buf_put_dss_priv_key(buffer* buf, dss_key *key) {
+
+	dropbear_assert(key != NULL);
+	buf_put_dss_pub_key(buf, key);
+	buf_putmpint(buf, key->x);
+
+}
+
+#ifdef DROPBEAR_SIGNKEY_VERIFY
+/* Verify a DSS signature (in buf) made on data by the key given. 
+ * returns DROPBEAR_SUCCESS or DROPBEAR_FAILURE */
+int buf_dss_verify(buffer* buf, dss_key *key, const unsigned char* data,
+		unsigned int len) {
+
+	unsigned char msghash[SHA1_HASH_SIZE];
+	hash_state hs;
+	int ret = DROPBEAR_FAILURE;
+	DEF_MP_INT(val1);
+	DEF_MP_INT(val2);
+	DEF_MP_INT(val3);
+	DEF_MP_INT(val4);
+	char * string = NULL;
+	int stringlen;
+
+	TRACE(("enter buf_dss_verify"))
+	dropbear_assert(key != NULL);
+
+	m_mp_init_multi(&val1, &val2, &val3, &val4, NULL);
+
+	/* get blob, check length */
+	string = buf_getstring(buf, &stringlen);
+	if (stringlen != 2*SHA1_HASH_SIZE) {
+		goto out;
+	}
+
+	/* hash the data */
+	sha1_init(&hs);
+	sha1_process(&hs, data, len);
+	sha1_done(&hs, msghash);
+
+	/* create the signature - s' and r' are the received signatures in buf */
+	/* w = (s')-1 mod q */
+	/* let val1 = s' */
+	bytes_to_mp(&val1, &string[SHA1_HASH_SIZE], SHA1_HASH_SIZE);
+
+	if (mp_cmp(&val1, key->q) != MP_LT) {
+		TRACE(("verify failed, s' >= q"))
+		goto out;
+	}
+	/* let val2 = w = (s')^-1 mod q*/
+	if (mp_invmod(&val1, key->q, &val2) != MP_OKAY) {
+		goto out;
+	}
+
+	/* u1 = ((SHA(M')w) mod q */
+	/* let val1 = SHA(M') = msghash */
+	bytes_to_mp(&val1, msghash, SHA1_HASH_SIZE);
+
+	/* let val3 = u1 = ((SHA(M')w) mod q */
+	if (mp_mulmod(&val1, &val2, key->q, &val3) != MP_OKAY) {
+		goto out;
+	}
+
+	/* u2 = ((r')w) mod q */
+	/* let val1 = r' */
+	bytes_to_mp(&val1, &string[0], SHA1_HASH_SIZE);
+	if (mp_cmp(&val1, key->q) != MP_LT) {
+		TRACE(("verify failed, r' >= q"))
+		goto out;
+	}
+	/* let val4 = u2 = ((r')w) mod q */
+	if (mp_mulmod(&val1, &val2, key->q, &val4) != MP_OKAY) {
+		goto out;
+	}
+
+	/* v = (((g)^u1 (y)^u2) mod p) mod q */
+	/* val2 = g^u1 mod p */
+	if (mp_exptmod(key->g, &val3, key->p, &val2) != MP_OKAY) {
+		goto out;
+	}
+	/* val3 = y^u2 mod p */
+	if (mp_exptmod(key->y, &val4, key->p, &val3) != MP_OKAY) {
+		goto out;
+	}
+	/* val4 = ((g)^u1 (y)^u2) mod p */
+	if (mp_mulmod(&val2, &val3, key->p, &val4) != MP_OKAY) {
+		goto out;
+	}
+	/* val2 = v = (((g)^u1 (y)^u2) mod p) mod q */
+	if (mp_mod(&val4, key->q, &val2) != MP_OKAY) {
+		goto out;
+	}
+	
+	/* check whether signatures verify */
+	if (mp_cmp(&val2, &val1) == MP_EQ) {
+		/* good sig */
+		ret = DROPBEAR_SUCCESS;
+	}
+
+out:
+	mp_clear_multi(&val1, &val2, &val3, &val4, NULL);
+	m_free(string);
+
+	return ret;
+
+}
+#endif /* DROPBEAR_SIGNKEY_VERIFY */
+
+#ifdef DSS_PROTOK	
+/* convert an unsigned mp into an array of bytes, malloced.
+ * This array must be freed after use, len contains the length of the array,
+ * if len != NULL */
+static unsigned char* mptobytes(mp_int *mp, int *len) {
+	
+	unsigned char* ret;
+	int size;
+
+	size = mp_unsigned_bin_size(mp);
+	ret = m_malloc(size);
+	if (mp_to_unsigned_bin(mp, ret) != MP_OKAY) {
+		dropbear_exit("mem alloc error");
+	}
+	if (len != NULL) {
+		*len = size;
+	}
+	return ret;
+}
+#endif
+
+/* Sign the data presented with key, writing the signature contents
+ * to the buffer
+ *
+ * When DSS_PROTOK is #defined:
+ * The alternate k generation method is based on the method used in PuTTY. 
+ * In particular to avoid being vulnerable to attacks using flaws in random
+ * generation of k, we use the following:
+ *
+ * proto_k = SHA512 ( SHA512(x) || SHA160(message) )
+ * k = proto_k mod q
+ *
+ * Now we aren't relying on the random number generation to protect the private
+ * key x, which is a long term secret */
+void buf_put_dss_sign(buffer* buf, dss_key *key, const unsigned char* data,
+		unsigned int len) {
+
+	unsigned char msghash[SHA1_HASH_SIZE];
+	unsigned int writelen;
+	unsigned int i;
+#ifdef DSS_PROTOK
+	unsigned char privkeyhash[SHA512_HASH_SIZE];
+	unsigned char *privkeytmp;
+	unsigned char proto_k[SHA512_HASH_SIZE];
+	DEF_MP_INT(dss_protok);
+#endif
+	DEF_MP_INT(dss_k);
+	DEF_MP_INT(dss_m);
+	DEF_MP_INT(dss_temp1);
+	DEF_MP_INT(dss_temp2);
+	DEF_MP_INT(dss_r);
+	DEF_MP_INT(dss_s);
+	hash_state hs;
+	
+	TRACE(("enter buf_put_dss_sign"))
+	dropbear_assert(key != NULL);
+	
+	/* hash the data */
+	sha1_init(&hs);
+	sha1_process(&hs, data, len);
+	sha1_done(&hs, msghash);
+
+	m_mp_init_multi(&dss_k, &dss_temp1, &dss_temp2, &dss_r, &dss_s,
+			&dss_m, NULL);
+#ifdef DSS_PROTOK	
+	/* hash the privkey */
+	privkeytmp = mptobytes(key->x, &i);
+	sha512_init(&hs);
+	sha512_process(&hs, "the quick brown fox jumped over the lazy dog", 44);
+	sha512_process(&hs, privkeytmp, i);
+	sha512_done(&hs, privkeyhash);
+	m_burn(privkeytmp, i);
+	m_free(privkeytmp);
+
+	/* calculate proto_k */
+	sha512_init(&hs);
+	sha512_process(&hs, privkeyhash, SHA512_HASH_SIZE);
+	sha512_process(&hs, msghash, SHA1_HASH_SIZE);
+	sha512_done(&hs, proto_k);
+
+	/* generate k */
+	m_mp_init(&dss_protok);
+	bytes_to_mp(&dss_protok, proto_k, SHA512_HASH_SIZE);
+	mp_mod(&dss_protok, key->q, &dss_k);
+	mp_clear(&dss_protok);
+	m_burn(proto_k, SHA512_HASH_SIZE);
+#else /* DSS_PROTOK not defined*/
+	gen_random_mpint(key->q, &dss_k);
+#endif
+
+	/* now generate the actual signature */
+	bytes_to_mp(&dss_m, msghash, SHA1_HASH_SIZE);
+
+	/* g^k mod p */
+	if (mp_exptmod(key->g, &dss_k, key->p, &dss_temp1) !=  MP_OKAY) {
+		dropbear_exit("dss error");
+	}
+	/* r = (g^k mod p) mod q */
+	if (mp_mod(&dss_temp1, key->q, &dss_r) != MP_OKAY) {
+		dropbear_exit("dss error");
+	}
+
+	/* x*r mod q */
+	if (mp_mulmod(&dss_r, key->x, key->q, &dss_temp1) != MP_OKAY) {
+		dropbear_exit("dss error");
+	}
+	/* (SHA1(M) + xr) mod q) */
+	if (mp_addmod(&dss_m, &dss_temp1, key->q, &dss_temp2) != MP_OKAY) {
+		dropbear_exit("dss error");
+	}
+	
+	/* (k^-1) mod q */
+	if (mp_invmod(&dss_k, key->q, &dss_temp1) != MP_OKAY) {
+		dropbear_exit("dss error");
+	}
+
+	/* s = (k^-1(SHA1(M) + xr)) mod q */
+	if (mp_mulmod(&dss_temp1, &dss_temp2, key->q, &dss_s) != MP_OKAY) {
+		dropbear_exit("dss error");
+	}
+
+	buf_putstring(buf, SSH_SIGNKEY_DSS, SSH_SIGNKEY_DSS_LEN);
+	buf_putint(buf, 2*SHA1_HASH_SIZE);
+
+	writelen = mp_unsigned_bin_size(&dss_r);
+	dropbear_assert(writelen <= SHA1_HASH_SIZE);
+	/* need to pad to 160 bits with leading zeros */
+	for (i = 0; i < SHA1_HASH_SIZE - writelen; i++) {
+		buf_putbyte(buf, 0);
+	}
+	if (mp_to_unsigned_bin(&dss_r, buf_getwriteptr(buf, writelen)) 
+			!= MP_OKAY) {
+		dropbear_exit("dss error");
+	}
+	mp_clear(&dss_r);
+	buf_incrwritepos(buf, writelen);
+
+	writelen = mp_unsigned_bin_size(&dss_s);
+	dropbear_assert(writelen <= SHA1_HASH_SIZE);
+	/* need to pad to 160 bits with leading zeros */
+	for (i = 0; i < SHA1_HASH_SIZE - writelen; i++) {
+		buf_putbyte(buf, 0);
+	}
+	if (mp_to_unsigned_bin(&dss_s, buf_getwriteptr(buf, writelen)) 
+			!= MP_OKAY) {
+		dropbear_exit("dss error");
+	}
+	mp_clear(&dss_s);
+	buf_incrwritepos(buf, writelen);
+
+	mp_clear_multi(&dss_k, &dss_temp1, &dss_temp2, &dss_r, &dss_s,
+			&dss_m, NULL);
+	
+	/* create the signature to return */
+
+	TRACE(("leave buf_put_dss_sign"))
+}
+
+#endif /* DROPBEAR_DSS */
diff --git a/dss.h b/dss.h
new file mode 100644
index 0000000..0b69256
--- /dev/null
+++ b/dss.h
@@ -0,0 +1,61 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _DSS_H_
+#define _DSS_H_
+
+#include "includes.h"
+#include "buffer.h"
+
+#ifdef DROPBEAR_DSS 
+
+#define DSS_SIGNATURE_SIZE 4+SSH_SIGNKEY_DSS_LEN+4+2*SHA1_HASH_SIZE
+
+struct DSS_key {
+
+	mp_int* p;
+	mp_int* q;
+	mp_int* g;
+	mp_int* y;
+	mp_int* x;
+
+};
+
+typedef struct DSS_key dss_key;
+
+void buf_put_dss_sign(buffer* buf, dss_key *key, const unsigned char* data,
+		unsigned int len);
+#ifdef DROPBEAR_SIGNKEY_VERIFY
+int buf_dss_verify(buffer* buf, dss_key *key, const unsigned char* data,
+		unsigned int len);
+#endif
+int buf_get_dss_pub_key(buffer* buf, dss_key *key);
+int buf_get_dss_priv_key(buffer* buf, dss_key *key);
+void buf_put_dss_pub_key(buffer* buf, dss_key *key);
+void buf_put_dss_priv_key(buffer* buf, dss_key *key);
+void dss_key_free(dss_key *key);
+
+#endif /* DROPBEAR_DSS */
+
+#endif /* _DSS_H_ */
diff --git a/fake-rfc2553.c b/fake-rfc2553.c
new file mode 100644
index 0000000..afbea88
--- /dev/null
+++ b/fake-rfc2553.c
@@ -0,0 +1,227 @@
+/*
+ *
+ * Taken from OpenSSH 3.8.1p1
+ * 
+ * Copyright (C) 2000-2003 Damien Miller.  All rights reserved.
+ * Copyright (C) 1999 WIDE Project.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Pseudo-implementation of RFC2553 name / address resolution functions
+ *
+ * But these functions are not implemented correctly. The minimum subset
+ * is implemented for ssh use only. For example, this routine assumes
+ * that ai_family is AF_INET. Don't use it for another purpose.
+ */
+
+#include "includes.h"
+
+/* RCSID("$.Id: fake-rfc2553.c,v 1.5 2003/09/22 02:08:23 dtucker Exp $");*/
+
+#ifndef HAVE_GETNAMEINFO
+int getnameinfo(const struct sockaddr *sa, size_t salen, char *host, 
+                size_t hostlen, char *serv, size_t servlen, int flags)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *)sa;
+	struct hostent *hp;
+	char tmpserv[16];
+
+	if (serv != NULL) {
+		snprintf(tmpserv, sizeof(tmpserv), "%d", ntohs(sin->sin_port));
+		if (strlcpy(serv, tmpserv, servlen) >= servlen)
+			return (EAI_MEMORY);
+	}
+
+	if (host != NULL) {
+		if (flags & NI_NUMERICHOST) {
+			if (strlcpy(host, inet_ntoa(sin->sin_addr),
+			    hostlen) >= hostlen)
+				return (EAI_MEMORY);
+			else
+				return (0);
+		} else {
+			hp = gethostbyaddr((char *)&sin->sin_addr, 
+			    sizeof(struct in_addr), AF_INET);
+			if (hp == NULL)
+				return (EAI_NODATA);
+			
+			if (strlcpy(host, hp->h_name, hostlen) >= hostlen)
+				return (EAI_MEMORY);
+			else
+				return (0);
+		}
+	}
+	return (0);
+}
+#endif /* !HAVE_GETNAMEINFO */
+
+#ifndef HAVE_GAI_STRERROR
+#ifdef HAVE_CONST_GAI_STRERROR_PROTO
+const char *
+#else
+char *
+#endif
+gai_strerror(int err)
+{
+	switch (err) {
+	case EAI_NODATA:
+		return ("no address associated with name");
+	case EAI_MEMORY:
+		return ("memory allocation failure.");
+	case EAI_NONAME:
+		return ("nodename nor servname provided, or not known");
+	default:
+		return ("unknown/invalid error.");
+	}
+}    
+#endif /* !HAVE_GAI_STRERROR */
+
+#ifndef HAVE_FREEADDRINFO
+void
+freeaddrinfo(struct addrinfo *ai)
+{
+	struct addrinfo *next;
+
+	for(; ai != NULL;) {
+		next = ai->ai_next;
+		free(ai);
+		ai = next;
+	}
+}
+#endif /* !HAVE_FREEADDRINFO */
+
+#ifndef HAVE_GETADDRINFO
+static struct
+addrinfo *malloc_ai(int port, u_long addr, const struct addrinfo *hints)
+{
+	struct addrinfo *ai;
+
+	ai = malloc(sizeof(*ai) + sizeof(struct sockaddr_in));
+	if (ai == NULL)
+		return (NULL);
+	
+	memset(ai, '\0', sizeof(*ai) + sizeof(struct sockaddr_in));
+	
+	ai->ai_addr = (struct sockaddr *)(ai + 1);
+	/* XXX -- ssh doesn't use sa_len */
+	ai->ai_addrlen = sizeof(struct sockaddr_in);
+	ai->ai_addr->sa_family = ai->ai_family = AF_INET;
+
+	((struct sockaddr_in *)(ai)->ai_addr)->sin_port = port;
+	((struct sockaddr_in *)(ai)->ai_addr)->sin_addr.s_addr = addr;
+	
+	/* XXX: the following is not generally correct, but does what we want */
+	if (hints->ai_socktype)
+		ai->ai_socktype = hints->ai_socktype;
+	else
+		ai->ai_socktype = SOCK_STREAM;
+
+	if (hints->ai_protocol)
+		ai->ai_protocol = hints->ai_protocol;
+
+	return (ai);
+}
+
+int
+getaddrinfo(const char *hostname, const char *servname, 
+    const struct addrinfo *hints, struct addrinfo **res)
+{
+	struct hostent *hp;
+	struct servent *sp;
+	struct in_addr in;
+	int i;
+	long int port;
+	u_long addr;
+
+	port = 0;
+	if (servname != NULL) {
+		char *cp;
+
+		port = strtol(servname, &cp, 10);
+		if (port > 0 && port <= 65535 && *cp == '\0')
+			port = htons(port);
+		else if ((sp = getservbyname(servname, NULL)) != NULL)
+			port = sp->s_port;
+		else
+			port = 0;
+	}
+
+	if (hints && hints->ai_flags & AI_PASSIVE) {
+		addr = htonl(0x00000000);
+		if (hostname && inet_aton(hostname, &in) != 0)
+			addr = in.s_addr;
+		*res = malloc_ai(port, addr, hints);
+		if (*res == NULL) 
+			return (EAI_MEMORY);
+		return (0);
+	}
+		
+	if (!hostname) {
+		*res = malloc_ai(port, htonl(0x7f000001), hints);
+		if (*res == NULL) 
+			return (EAI_MEMORY);
+		return (0);
+	}
+	
+	if (inet_aton(hostname, &in)) {
+		*res = malloc_ai(port, in.s_addr, hints);
+		if (*res == NULL) 
+			return (EAI_MEMORY);
+		return (0);
+	}
+	
+	/* Don't try DNS if AI_NUMERICHOST is set */
+	if (hints && hints->ai_flags & AI_NUMERICHOST)
+		return (EAI_NONAME);
+	
+	hp = gethostbyname(hostname);
+	if (hp && hp->h_name && hp->h_name[0] && hp->h_addr_list[0]) {
+		struct addrinfo *cur, *prev;
+
+		cur = prev = *res = NULL;
+		for (i = 0; hp->h_addr_list[i]; i++) {
+			struct in_addr *in = (struct in_addr *)hp->h_addr_list[i];
+
+			cur = malloc_ai(port, in->s_addr, hints);
+			if (cur == NULL) {
+				if (*res != NULL)
+					freeaddrinfo(*res);
+				return (EAI_MEMORY);
+			}
+			if (prev)
+				prev->ai_next = cur;
+			else
+				*res = cur;
+
+			prev = cur;
+		}
+		return (0);
+	}
+	
+	return (EAI_NODATA);
+}
+#endif /* !HAVE_GETADDRINFO */
diff --git a/fake-rfc2553.h b/fake-rfc2553.h
new file mode 100644
index 0000000..053e6a6
--- /dev/null
+++ b/fake-rfc2553.h
@@ -0,0 +1,162 @@
+/* Taken from OpenSSH 3.8.1p1 */
+
+/* $.Id: fake-rfc2553.h,v 1.9 2004/03/10 10:06:33 dtucker Exp $ */
+
+/*
+ * Copyright (C) 2000-2003 Damien Miller.  All rights reserved.
+ * Copyright (C) 1999 WIDE Project.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Pseudo-implementation of RFC2553 name / address resolution functions
+ *
+ * But these functions are not implemented correctly. The minimum subset
+ * is implemented for ssh use only. For example, this routine assumes
+ * that ai_family is AF_INET. Don't use it for another purpose.
+ */
+
+#ifndef _FAKE_RFC2553_H
+#define _FAKE_RFC2553_H
+
+#include "includes.h"
+
+/*
+ * First, socket and INET6 related definitions 
+ */
+#ifndef HAVE_STRUCT_SOCKADDR_STORAGE
+# define	_SS_MAXSIZE	128	/* Implementation specific max size */
+# define       _SS_PADSIZE     (_SS_MAXSIZE - sizeof (struct sockaddr))
+struct sockaddr_storage {
+	struct sockaddr	ss_sa;
+	char		__ss_pad2[_SS_PADSIZE];
+};
+# define ss_family ss_sa.sa_family
+#endif /* !HAVE_STRUCT_SOCKADDR_STORAGE */
+
+#ifndef IN6_IS_ADDR_LOOPBACK
+# define IN6_IS_ADDR_LOOPBACK(a) \
+	(((u_int32_t *)(a))[0] == 0 && ((u_int32_t *)(a))[1] == 0 && \
+	 ((u_int32_t *)(a))[2] == 0 && ((u_int32_t *)(a))[3] == htonl(1))
+#endif /* !IN6_IS_ADDR_LOOPBACK */
+
+#ifndef HAVE_STRUCT_IN6_ADDR
+struct in6_addr {
+	u_int8_t	s6_addr[16];
+};
+#endif /* !HAVE_STRUCT_IN6_ADDR */
+
+#ifndef HAVE_STRUCT_SOCKADDR_IN6
+struct sockaddr_in6 {
+	unsigned short	sin6_family;
+	u_int16_t	sin6_port;
+	u_int32_t	sin6_flowinfo;
+	struct in6_addr	sin6_addr;
+};
+#endif /* !HAVE_STRUCT_SOCKADDR_IN6 */
+
+#ifndef AF_INET6
+/* Define it to something that should never appear */
+#define AF_INET6 AF_MAX
+#endif
+
+/*
+ * Next, RFC2553 name / address resolution API
+ */
+
+#ifndef NI_NUMERICHOST
+# define NI_NUMERICHOST    (1)
+#endif
+#ifndef NI_NAMEREQD
+# define NI_NAMEREQD       (1<<1)
+#endif
+#ifndef NI_NUMERICSERV
+# define NI_NUMERICSERV    (1<<2)
+#endif
+
+#ifndef AI_PASSIVE
+# define AI_PASSIVE		(1)
+#endif
+#ifndef AI_CANONNAME
+# define AI_CANONNAME		(1<<1)
+#endif
+#ifndef AI_NUMERICHOST
+# define AI_NUMERICHOST		(1<<2)
+#endif
+
+#ifndef NI_MAXSERV
+# define NI_MAXSERV 32
+#endif /* !NI_MAXSERV */
+#ifndef NI_MAXHOST
+# define NI_MAXHOST 1025
+#endif /* !NI_MAXHOST */
+
+#ifndef EAI_NODATA
+# define EAI_NODATA	1
+# define EAI_MEMORY	2
+# define EAI_NONAME	3
+#endif
+
+#ifndef HAVE_STRUCT_ADDRINFO
+struct addrinfo {
+	int	ai_flags;	/* AI_PASSIVE, AI_CANONNAME */
+	int	ai_family;	/* PF_xxx */
+	int	ai_socktype;	/* SOCK_xxx */
+	int	ai_protocol;	/* 0 or IPPROTO_xxx for IPv4 and IPv6 */
+	size_t	ai_addrlen;	/* length of ai_addr */
+	char	*ai_canonname;	/* canonical name for hostname */
+	struct sockaddr *ai_addr;	/* binary address */
+	struct addrinfo *ai_next;	/* next structure in linked list */
+};
+#endif /* !HAVE_STRUCT_ADDRINFO */
+
+#ifndef HAVE_GETADDRINFO
+#ifdef getaddrinfo
+# undef getaddrinfo
+#endif
+#define getaddrinfo(a,b,c,d)	(ssh_getaddrinfo(a,b,c,d))
+int getaddrinfo(const char *, const char *, 
+    const struct addrinfo *, struct addrinfo **);
+#endif /* !HAVE_GETADDRINFO */
+
+#if !defined(HAVE_GAI_STRERROR) && !defined(HAVE_CONST_GAI_STRERROR_PROTO)
+#define gai_strerror(a)		(ssh_gai_strerror(a))
+char *gai_strerror(int);
+#endif /* !HAVE_GAI_STRERROR */
+
+#ifndef HAVE_FREEADDRINFO
+#define freeaddrinfo(a)		(ssh_freeaddrinfo(a))
+void freeaddrinfo(struct addrinfo *);
+#endif /* !HAVE_FREEADDRINFO */
+
+#ifndef HAVE_GETNAMEINFO
+#define getnameinfo(a,b,c,d,e,f,g) (ssh_getnameinfo(a,b,c,d,e,f,g))
+int getnameinfo(const struct sockaddr *, size_t, char *, size_t, 
+    char *, size_t, int);
+#endif /* !HAVE_GETNAMEINFO */
+
+#endif /* !_FAKE_RFC2553_H */
+
diff --git a/filelist.txt b/filelist.txt
new file mode 100644
index 0000000..8281c14
--- /dev/null
+++ b/filelist.txt
@@ -0,0 +1,117 @@
+This file is out of date - it remains here in case it is still of use.
+The basic naming convention is svr- and cli-  for seperate parts,
+then common- for common parts. Some files have no prefix.
+
+A brief rundown on which files do what, and their corresponding sections
+in the IETF drafts. The .c files usually have corresponding .h files.
+
+Transport layer  draft-ietf-secsh-transport-16.txt
+===============
+
+session.c		Contains the main select() loop, and handles setting
+			up/closing down ssh connections
+
+algo.c			Framework for handling various ciphers/hashes/algos,
+			and choosing between the lists of client/server
+			preferred ones
+
+kex.c			Key exchange routines, used at startup to negotiate
+			which algorithms to use, and also to obtain session
+			keys. This also runs when rekeying during the
+			connection.
+
+packet.c		Handles the basic packet encryption/decryption,
+			and switching to the appropriate packet handlers.
+			Called from session.c's main select loop.
+
+service.c		Handles service requests (userauth or connection)
+
+
+Authentication  draft-ietf-secsh-userauth-17.txt
+==============
+
+auth.c			General auth handling, including user checking etc,
+			passes different auth types to auth{passwd,pubkey}
+
+authpasswd.c		Handles /etc/passwd or /etc/shadow auth
+
+authpubkey.c		Handles ~/.ssh/authorized_keys auth
+
+
+Connection  draft-ietf-secsh-connect-17.txt
+==========
+
+channel.c		Channel handling routines - each shell/tcp conn/agent
+			etc is a channel.
+
+chansession.c		Handles shell/exec requests
+
+sshpty.c		From OpenSSH, allocates PTYs etc
+
+termcodes.c		Mapping of POSIX terminal codes to SSH terminal codes
+
+loginrec.c		From OpenSSH, handles utmp/wtmp logging
+
+x11fwd.c		Handles X11 forwarding
+
+agentfwd.c		Handles auth-agent forwarding requests
+
+localtcpfwd.c		Handles -L style tcp forwarding requests, setting
+			up the listening port and also handling connections
+			to that port (and subsequent channels)
+
+
+Program-related
+===============
+
+dbmulti.c		Combination binary chooser main() function
+
+dbutil.c		Various utility functions, incl logging, memory etc
+
+dropbearconvert.c	Conversion from dropbear<->openssh keys, uses
+			keyimport.c to do most of the work
+
+dropbearkey.c		Generates keys, calling gen{dss,rsa}
+
+keyimport.c		Modified from PuTTY, converts between key types
+
+main.c			dropbear's main(), handles listening, forking for
+			new connections, child-process limits
+
+runopts.c		Parses commandline options
+
+options.h		Compile-time feature selection
+
+config.h		Features selected from configure
+
+debug.h			Compile-time selection of debug features
+
+includes.h		Included system headers etc
+
+
+Generic Routines
+================
+
+signkey.c		A generic handler for pubkeys, switches to dss or rsa
+			depending on the key type
+
+rsa.c			RSA asymmetric crypto routines
+
+dss.c			DSS asymmetric crypto routines
+
+gendss.c		DSS key generation
+
+genrsa.c		RSA key generation
+
+bignum.c		Some bignum helper functions
+
+queue.c			A queue, used to enqueue encrypted packets to send
+
+random.c		PRNG, based on /dev/urandom or prngd
+
+atomicio.c		From OpenSSH, does `blocking' IO on non-blocking fds
+
+buffer.c		Buffer-usage routines, with size checking etc
+
+
+vim:set ts=8:
diff --git a/gendss.c b/gendss.c
new file mode 100644
index 0000000..bf46d3d
--- /dev/null
+++ b/gendss.c
@@ -0,0 +1,198 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "dbutil.h"
+#include "signkey.h"
+#include "bignum.h"
+#include "random.h"
+#include "buffer.h"
+#include "gendss.h"
+#include "dss.h"
+
+#define QSIZE 20 /* 160 bit */
+
+/* This is just a test */
+
+#ifdef DROPBEAR_DSS
+
+static void getq(dss_key *key);
+static void getp(dss_key *key, unsigned int size);
+static void getg(dss_key *key);
+static void getx(dss_key *key);
+static void gety(dss_key *key);
+
+dss_key * gen_dss_priv_key(unsigned int size) {
+
+	dss_key *key;
+
+	key = (dss_key*)m_malloc(sizeof(dss_key));
+
+	key->p = (mp_int*)m_malloc(sizeof(mp_int));
+	key->q = (mp_int*)m_malloc(sizeof(mp_int));
+	key->g = (mp_int*)m_malloc(sizeof(mp_int));
+	key->y = (mp_int*)m_malloc(sizeof(mp_int));
+	key->x = (mp_int*)m_malloc(sizeof(mp_int));
+	m_mp_init_multi(key->p, key->q, key->g, key->y, key->x, NULL);
+	
+	seedrandom();
+	
+	getq(key);
+	getp(key, size);
+	getg(key);
+	getx(key);
+	gety(key);
+
+	return key;
+	
+}
+
+static void getq(dss_key *key) {
+
+	char buf[QSIZE];
+
+	/* 160 bit prime */
+	genrandom(buf, QSIZE);
+	buf[0] |= 0x80; /* top bit high */
+	buf[QSIZE-1] |= 0x01; /* bottom bit high */
+
+	bytes_to_mp(key->q, buf, QSIZE);
+
+	/* 18 rounds are required according to HAC */
+	if (mp_prime_next_prime(key->q, 18, 0) != MP_OKAY) {
+		fprintf(stderr, "dss key generation failed\n");
+		exit(1);
+	}
+}
+
+static void getp(dss_key *key, unsigned int size) {
+
+	DEF_MP_INT(tempX);
+	DEF_MP_INT(tempC);
+	DEF_MP_INT(tempP);
+	DEF_MP_INT(temp2q);
+	int result;
+	unsigned char *buf;
+
+	m_mp_init_multi(&tempX, &tempC, &tempP, &temp2q, NULL);
+
+
+	/* 2*q */
+	if (mp_mul_d(key->q, 2, &temp2q) != MP_OKAY) {
+		fprintf(stderr, "dss key generation failed\n");
+		exit(1);
+	}
+	
+	buf = (unsigned char*)m_malloc(size);
+
+	result = 0;
+	do {
+		
+		genrandom(buf, size);
+		buf[0] |= 0x80; /* set the top bit high */
+
+		/* X is a random mp_int */
+		bytes_to_mp(&tempX, buf, size);
+
+		/* C = X mod 2q */
+		if (mp_mod(&tempX, &temp2q, &tempC) != MP_OKAY) {
+			fprintf(stderr, "dss key generation failed\n");
+			exit(1);
+		}
+
+		/* P = X - (C - 1) = X - C + 1*/
+		if (mp_sub(&tempX, &tempC, &tempP) != MP_OKAY) {
+			fprintf(stderr, "dss key generation failed\n");
+			exit(1);
+		}
+		
+		if (mp_add_d(&tempP, 1, key->p) != MP_OKAY) {
+			fprintf(stderr, "dss key generation failed\n");
+			exit(1);
+		}
+
+		/* now check for prime, 5 rounds is enough according to HAC */
+		/* result == 1  =>  p is prime */
+		if (mp_prime_is_prime(key->p, 5, &result) != MP_OKAY) {
+			fprintf(stderr, "dss key generation failed\n");
+			exit(1);
+		}
+	} while (!result);
+
+	mp_clear_multi(&tempX, &tempC, &tempP, &temp2q, NULL);
+	m_burn(buf, size);
+	m_free(buf);
+}
+
+static void getg(dss_key * key) {
+
+	DEF_MP_INT(div);
+	DEF_MP_INT(h);
+	DEF_MP_INT(val);
+
+	m_mp_init_multi(&div, &h, &val, NULL);
+
+	/* get div=(p-1)/q */
+	if (mp_sub_d(key->p, 1, &val) != MP_OKAY) {
+		fprintf(stderr, "dss key generation failed\n");
+		exit(1);
+	}
+	if (mp_div(&val, key->q, &div, NULL) != MP_OKAY) {
+		fprintf(stderr, "dss key generation failed\n");
+		exit(1);
+	}
+
+	/* initialise h=1 */
+	mp_set(&h, 1);
+	do {
+		/* now keep going with g=h^div mod p, until g > 1 */
+		if (mp_exptmod(&h, &div, key->p, key->g) != MP_OKAY) {
+			fprintf(stderr, "dss key generation failed\n");
+			exit(1);
+		}
+
+		if (mp_add_d(&h, 1, &h) != MP_OKAY) {
+			fprintf(stderr, "dss key generation failed\n");
+			exit(1);
+		}
+	
+	} while (mp_cmp_d(key->g, 1) != MP_GT);
+
+	mp_clear_multi(&div, &h, &val, NULL);
+}
+
+static void getx(dss_key *key) {
+
+	gen_random_mpint(key->q, key->x);
+}
+
+static void gety(dss_key *key) {
+
+	if (mp_exptmod(key->g, key->x, key->p, key->y) != MP_OKAY) {
+		fprintf(stderr, "dss key generation failed\n");
+		exit(1);
+	}
+}
+
+#endif /* DROPBEAR_DSS */
diff --git a/gendss.h b/gendss.h
new file mode 100644
index 0000000..246dae3
--- /dev/null
+++ b/gendss.h
@@ -0,0 +1,36 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _GENDSS_H_
+#define _GENDSS_H_
+
+#include "dss.h"
+
+#ifdef DROPBEAR_DSS
+
+dss_key * gen_dss_priv_key(unsigned int size);
+
+#endif /* DROPBEAR_DSS */
+
+#endif /* _GENDSS_H_ */
diff --git a/genrsa.c b/genrsa.c
new file mode 100644
index 0000000..73a7984
--- /dev/null
+++ b/genrsa.c
@@ -0,0 +1,137 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "dbutil.h"
+#include "bignum.h"
+#include "random.h"
+#include "rsa.h"
+#include "genrsa.h"
+
+#define RSA_E 65537
+
+#ifdef DROPBEAR_RSA
+
+static void getrsaprime(mp_int* prime, mp_int *primeminus, 
+		mp_int* rsa_e, unsigned int size);
+
+/* mostly taken from libtomcrypt's rsa key generation routine */
+rsa_key * gen_rsa_priv_key(unsigned int size) {
+
+	rsa_key * key;
+	DEF_MP_INT(pminus);
+	DEF_MP_INT(qminus);
+	DEF_MP_INT(lcm);
+
+	key = (rsa_key*)m_malloc(sizeof(rsa_key));
+
+	key->e = (mp_int*)m_malloc(sizeof(mp_int));
+	key->n = (mp_int*)m_malloc(sizeof(mp_int));
+	key->d = (mp_int*)m_malloc(sizeof(mp_int));
+	key->p = (mp_int*)m_malloc(sizeof(mp_int));
+	key->q = (mp_int*)m_malloc(sizeof(mp_int));
+
+	m_mp_init_multi(key->e, key->n, key->d, key->p, key->q,
+			&pminus, &lcm, &qminus, NULL);
+
+	seedrandom();
+
+	if (mp_set_int(key->e, RSA_E) != MP_OKAY) {
+		fprintf(stderr, "rsa generation failed\n");
+		exit(1);
+	}
+
+	/* PuTTY doesn't like it if the modulus isn't a multiple of 8 bits,
+	 * so we just generate them until we get one which is OK */
+	getrsaprime(key->p, &pminus, key->e, size/2);
+	do {
+		getrsaprime(key->q, &qminus, key->e, size/2);
+
+		if (mp_mul(key->p, key->q, key->n) != MP_OKAY) {
+			fprintf(stderr, "rsa generation failed\n");
+			exit(1);
+		}
+	} while (mp_count_bits(key->n) % 8 != 0);
+
+	/* lcm(p-1, q-1) */
+	if (mp_lcm(&pminus, &qminus, &lcm) != MP_OKAY) {
+		fprintf(stderr, "rsa generation failed\n");
+		exit(1);
+	}
+
+	/* de = 1 mod lcm(p-1,q-1) */
+	/* therefore d = (e^-1) mod lcm(p-1,q-1) */
+	if (mp_invmod(key->e, &lcm, key->d) != MP_OKAY) {
+		fprintf(stderr, "rsa generation failed\n");
+		exit(1);
+	}
+
+	mp_clear_multi(&pminus, &qminus, &lcm, NULL);
+
+	return key;
+}	
+
+/* return a prime suitable for p or q */
+static void getrsaprime(mp_int* prime, mp_int *primeminus, 
+		mp_int* rsa_e, unsigned int size) {
+
+	unsigned char *buf;
+	DEF_MP_INT(temp_gcd);
+
+	buf = (unsigned char*)m_malloc(size+1);
+
+	m_mp_init(&temp_gcd);
+	do {
+		/* generate a random odd number with MSB set, then find the
+		   the next prime above it */
+		genrandom(buf, size+1);
+		buf[0] |= 0x80; /* MSB set */
+
+		bytes_to_mp(prime, buf, size+1);
+
+		/* find the next integer which is prime, 8 round of miller-rabin */
+		if (mp_prime_next_prime(prime, 8, 0) != MP_OKAY) {
+			fprintf(stderr, "rsa generation failed\n");
+			exit(1);
+		}
+
+		/* subtract one to get p-1 */
+		if (mp_sub_d(prime, 1, primeminus) != MP_OKAY) {
+			fprintf(stderr, "rsa generation failed\n");
+			exit(1);
+		}
+		/* check relative primality to e */
+		if (mp_gcd(primeminus, rsa_e, &temp_gcd) != MP_OKAY) {
+			fprintf(stderr, "rsa generation failed\n");
+			exit(1);
+		}
+	} while (mp_cmp_d(&temp_gcd, 1) != MP_EQ); /* while gcd(p-1, e) != 1 */
+
+	/* now we have a good value for result */
+	mp_clear(&temp_gcd);
+	m_burn(buf, size+1);
+	m_free(buf);
+}
+
+#endif /* DROPBEAR_RSA */
diff --git a/genrsa.h b/genrsa.h
new file mode 100644
index 0000000..ef9f579
--- /dev/null
+++ b/genrsa.h
@@ -0,0 +1,36 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _GENRSA_H_
+#define _GENRSA_H_
+
+#include "rsa.h"
+
+#ifdef DROPBEAR_RSA
+
+rsa_key * gen_rsa_priv_key(unsigned int size);
+
+#endif /* DROPBEAR_RSA */
+
+#endif /* _GENRSA_H_ */
diff --git a/includes.h b/includes.h
new file mode 100644
index 0000000..06c9692
--- /dev/null
+++ b/includes.h
@@ -0,0 +1,155 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _INCLUDES_H_
+#define _INCLUDES_H_
+
+
+#include "config.h"
+#include "options.h"
+#include "debug.h"
+
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/param.h> /* required for BSD4_4 define */
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/un.h>
+#include <sys/wait.h>
+
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <grp.h>
+#include <limits.h>
+#include <pwd.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <termios.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <netdb.h>
+#include <ctype.h>
+#include <stdarg.h>
+#include <dirent.h>
+
+#ifdef HAVE_UTMP_H
+#include <utmp.h>
+#endif
+
+#ifdef HAVE_UTMPX_H
+#include <utmpx.h>
+#endif
+
+#ifdef HAVE_PATHS_H
+#include <paths.h>
+#endif
+
+#ifdef HAVE_LASTLOG_H
+#include <lastlog.h>
+#endif
+
+#include <arpa/inet.h>
+
+#ifdef HAVE_NETINET_IN_H
+#include <netinet/in.h>
+#endif
+
+/* netbsd 1.6 needs this to be included before netinet/ip.h for some
+ * undocumented reason */
+#ifdef HAVE_NETINET_IN_SYSTM_H
+#include <netinet/in_systm.h>
+#endif
+
+#include <netinet/ip.h>
+
+#ifdef HAVE_NETINET_TCP_H
+#include <netinet/tcp.h>
+#endif
+
+#ifdef HAVE_INTTYPES_H
+#include <inttypes.h>
+#endif
+
+#ifdef HAVE_LIBUTIL_H
+#include <libutil.h>
+#endif
+
+#ifdef HAVE_CRYPT_H
+#include <crypt.h>
+#endif
+
+#ifndef DISABLE_ZLIB
+#include <zlib.h>
+#endif
+
+#ifdef HAVE_UTIL_H
+#include <util.h>
+#endif
+
+#ifdef HAVE_SHADOW_H
+#include <shadow.h>
+#endif
+
+#ifdef HAVE_LIBGEN_H
+#include <libgen.h>
+#endif
+
+#include "libtomcrypt/src/headers/tomcrypt.h"
+#include "libtommath/tommath.h"
+
+#include "compat.h"
+#include "fake-rfc2553.h"
+
+#ifndef HAVE_UINT16_T
+#ifndef HAVE_U_INT16_T
+typedef unsigned short u_int16_t;
+#endif /* HAVE_U_INT16_T */
+typedef u_int16_t uint16_t;
+#endif /* HAVE_UINT16_T */
+
+#ifndef LOG_AUTHPRIV
+#define LOG_AUTHPRIV LOG_AUTH
+#endif
+
+/* glibc 2.1.3 systems have sockaddr_storage.__ss_family rather than
+ * sockaddr_storage.ss_family */
+#if !defined(HAVE_STRUCT_SOCKADDR_STORAGE_SS_FAMILY) \
+    && defined(HAVE_STRUCT_SOCKADDR_STORAGE___SS_FAMILY)
+#define ss_family __ss_family
+#endif
+
+/* so we can avoid warnings about unused params (ie in signal handlers etc) */
+#ifdef UNUSED 
+#elif defined(__GNUC__) 
+# define UNUSED(x) UNUSED_ ## x __attribute__((unused)) 
+#elif defined(__LCLINT__) 
+# define UNUSED(x) /*@unused@*/ x 
+#else 
+# define UNUSED(x) x 
+#endif
+
+#endif /* _INCLUDES_H_ */
diff --git a/install-sh b/install-sh
new file mode 100644
index 0000000..e9de238
--- /dev/null
+++ b/install-sh
@@ -0,0 +1,251 @@
+#!/bin/sh
+#
+# install - install a program, script, or datafile
+# This comes from X11R5 (mit/util/scripts/install.sh).
+#
+# Copyright 1991 by the Massachusetts Institute of Technology
+#
+# Permission to use, copy, modify, distribute, and sell this software and its
+# documentation for any purpose is hereby granted without fee, provided that
+# the above copyright notice appear in all copies and that both that
+# copyright notice and this permission notice appear in supporting
+# documentation, and that the name of M.I.T. not be used in advertising or
+# publicity pertaining to distribution of the software without specific,
+# written prior permission.  M.I.T. makes no representations about the
+# suitability of this software for any purpose.  It is provided "as is"
+# without express or implied warranty.
+#
+# Calling this script install-sh is preferred over install.sh, to prevent
+# `make' implicit rules from creating a file called install from it
+# when there is no Makefile.
+#
+# This script is compatible with the BSD install script, but was written
+# from scratch.  It can only install one file at a time, a restriction
+# shared with many OS's install programs.
+
+
+# set DOITPROG to echo to test this script
+
+# Don't use :- since 4.3BSD and earlier shells don't like it.
+doit="${DOITPROG-}"
+
+
+# put in absolute paths if you don't have them in your path; or use env. vars.
+
+mvprog="${MVPROG-mv}"
+cpprog="${CPPROG-cp}"
+chmodprog="${CHMODPROG-chmod}"
+chownprog="${CHOWNPROG-chown}"
+chgrpprog="${CHGRPPROG-chgrp}"
+stripprog="${STRIPPROG-strip}"
+rmprog="${RMPROG-rm}"
+mkdirprog="${MKDIRPROG-mkdir}"
+
+transformbasename=""
+transform_arg=""
+instcmd="$mvprog"
+chmodcmd="$chmodprog 0755"
+chowncmd=""
+chgrpcmd=""
+stripcmd=""
+rmcmd="$rmprog -f"
+mvcmd="$mvprog"
+src=""
+dst=""
+dir_arg=""
+
+while [ x"$1" != x ]; do
+    case $1 in
+	-c) instcmd="$cpprog"
+	    shift
+	    continue;;
+
+	-d) dir_arg=true
+	    shift
+	    continue;;
+
+	-m) chmodcmd="$chmodprog $2"
+	    shift
+	    shift
+	    continue;;
+
+	-o) chowncmd="$chownprog $2"
+	    shift
+	    shift
+	    continue;;
+
+	-g) chgrpcmd="$chgrpprog $2"
+	    shift
+	    shift
+	    continue;;
+
+	-s) stripcmd="$stripprog"
+	    shift
+	    continue;;
+
+	-t=*) transformarg=`echo $1 | sed 's/-t=//'`
+	    shift
+	    continue;;
+
+	-b=*) transformbasename=`echo $1 | sed 's/-b=//'`
+	    shift
+	    continue;;
+
+	*)  if [ x"$src" = x ]
+	    then
+		src=$1
+	    else
+		# this colon is to work around a 386BSD /bin/sh bug
+		:
+		dst=$1
+	    fi
+	    shift
+	    continue;;
+    esac
+done
+
+if [ x"$src" = x ]
+then
+	echo "install:	no input file specified"
+	exit 1
+else
+	true
+fi
+
+if [ x"$dir_arg" != x ]; then
+	dst=$src
+	src=""
+	
+	if [ -d $dst ]; then
+		instcmd=:
+		chmodcmd=""
+	else
+		instcmd=mkdir
+	fi
+else
+
+# Waiting for this to be detected by the "$instcmd $src $dsttmp" command
+# might cause directories to be created, which would be especially bad 
+# if $src (and thus $dsttmp) contains '*'.
+
+	if [ -f $src -o -d $src ]
+	then
+		true
+	else
+		echo "install:  $src does not exist"
+		exit 1
+	fi
+	
+	if [ x"$dst" = x ]
+	then
+		echo "install:	no destination specified"
+		exit 1
+	else
+		true
+	fi
+
+# If destination is a directory, append the input filename; if your system
+# does not like double slashes in filenames, you may need to add some logic
+
+	if [ -d $dst ]
+	then
+		dst="$dst"/`basename $src`
+	else
+		true
+	fi
+fi
+
+## this sed command emulates the dirname command
+dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'`
+
+# Make sure that the destination directory exists.
+#  this part is taken from Noah Friedman's mkinstalldirs script
+
+# Skip lots of stat calls in the usual case.
+if [ ! -d "$dstdir" ]; then
+defaultIFS='	
+'
+IFS="${IFS-${defaultIFS}}"
+
+oIFS="${IFS}"
+# Some sh's can't handle IFS=/ for some reason.
+IFS='%'
+set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'`
+IFS="${oIFS}"
+
+pathcomp=''
+
+while [ $# -ne 0 ] ; do
+	pathcomp="${pathcomp}${1}"
+	shift
+
+	if [ ! -d "${pathcomp}" ] ;
+        then
+		$mkdirprog "${pathcomp}"
+	else
+		true
+	fi
+
+	pathcomp="${pathcomp}/"
+done
+fi
+
+if [ x"$dir_arg" != x ]
+then
+	$doit $instcmd $dst &&
+
+	if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi &&
+	if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi &&
+	if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi &&
+	if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi
+else
+
+# If we're going to rename the final executable, determine the name now.
+
+	if [ x"$transformarg" = x ] 
+	then
+		dstfile=`basename $dst`
+	else
+		dstfile=`basename $dst $transformbasename | 
+			sed $transformarg`$transformbasename
+	fi
+
+# don't allow the sed command to completely eliminate the filename
+
+	if [ x"$dstfile" = x ] 
+	then
+		dstfile=`basename $dst`
+	else
+		true
+	fi
+
+# Make a temp file name in the proper directory.
+
+	dsttmp=$dstdir/#inst.$$#
+
+# Move or copy the file name to the temp name
+
+	$doit $instcmd $src $dsttmp &&
+
+	trap "rm -f ${dsttmp}" 0 &&
+
+# and set any options; do chmod last to preserve setuid bits
+
+# If any of these fail, we abort the whole thing.  If we want to
+# ignore errors from any of these, just make sure not to ignore
+# errors from the above "$doit $instcmd $src $dsttmp" command.
+
+	if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi &&
+	if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi &&
+	if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi &&
+	if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi &&
+
+# Now rename the file to the real destination.
+
+	$doit $rmcmd -f $dstdir/$dstfile &&
+	$doit $mvcmd $dsttmp $dstdir/$dstfile 
+
+fi &&
+
+
+exit 0
diff --git a/kex.h b/kex.h
new file mode 100644
index 0000000..448ad1b
--- /dev/null
+++ b/kex.h
@@ -0,0 +1,64 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _KEX_H_
+#define _KEX_H_
+
+#include "includes.h"
+#include "algo.h"
+
+void send_msg_kexinit();
+void recv_msg_kexinit();
+void send_msg_newkeys();
+void recv_msg_newkeys();
+void kexfirstinitialise();
+void gen_kexdh_vals(mp_int *dh_pub, mp_int *dh_priv);
+void kexdh_comb_key(mp_int *dh_pub_us, mp_int *dh_priv, mp_int *dh_pub_them,
+		sign_key *hostkey);
+
+void recv_msg_kexdh_init(); /* server */
+
+void send_msg_kexdh_init(); /* client */
+void recv_msg_kexdh_reply(); /* client */
+
+struct KEXState {
+
+	unsigned sentkexinit : 1; /*set when we've sent/recv kexinit packet */
+	unsigned recvkexinit : 1;
+	unsigned firstfollows : 1; /* true when first_kex_packet_follows is set */
+	unsigned sentnewkeys : 1; /* set once we've send/recv'ed MSG_NEWKEYS*/
+	unsigned recvnewkeys : 1;
+
+	unsigned donefirstkex : 1; /* Set to 1 after the first kex has completed,
+								  ie the transport layer has been set up */
+
+	long lastkextime; /* time of the last kex */
+	unsigned int datatrans; /* data transmitted since last kex */
+	unsigned int datarecv; /* data received since last kex */
+
+};
+
+#define MAX_KEXHASHBUF 2000
+
+#endif /* _KEX_H_ */
diff --git a/keyimport.c b/keyimport.c
new file mode 100644
index 0000000..a0474f3
--- /dev/null
+++ b/keyimport.c
@@ -0,0 +1,1730 @@
+/*
+ * Based on PuTTY's import.c for importing/exporting OpenSSH and SSH.com
+ * keyfiles.
+ *
+ * The horribleness of the code is probably mine (matt).
+ *
+ * Modifications copyright 2003 Matt Johnston
+ *
+ * PuTTY is copyright 1997-2003 Simon Tatham.
+ * 
+ * Portions copyright Robert de Bath, Joris van Rantwijk, Delian
+ * Delchev, Andreas Schultz, Jeroen Massar, Wez Furlong, Nicolas Barry,
+ * Justin Bradford, and CORE SDI S.A.
+ * 
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation files
+ * (the "Software"), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge,
+ * publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE
+ * FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+ * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "keyimport.h"
+#include "bignum.h"
+#include "buffer.h"
+#include "dbutil.h"
+
+#define PUT_32BIT(cp, value) do { \
+  (cp)[3] = (unsigned char)(value); \
+  (cp)[2] = (unsigned char)((value) >> 8); \
+  (cp)[1] = (unsigned char)((value) >> 16); \
+  (cp)[0] = (unsigned char)((value) >> 24); } while (0)
+
+#define GET_32BIT(cp) \
+	(((unsigned long)(unsigned char)(cp)[0] << 24) | \
+	((unsigned long)(unsigned char)(cp)[1] << 16) | \
+	((unsigned long)(unsigned char)(cp)[2] << 8) | \
+	((unsigned long)(unsigned char)(cp)[3]))
+
+static int openssh_encrypted(const char *filename);
+static sign_key *openssh_read(const char *filename, char *passphrase);
+static int openssh_write(const char *filename, sign_key *key,
+				  char *passphrase);
+
+static int dropbear_write(const char*filename, sign_key * key);
+static sign_key *dropbear_read(const char* filename);
+
+#if 0
+static int sshcom_encrypted(const char *filename, char **comment);
+static struct ssh2_userkey *sshcom_read(const char *filename, char *passphrase);
+static int sshcom_write(const char *filename, struct ssh2_userkey *key,
+				 char *passphrase);
+#endif
+
+int import_encrypted(const char* filename, int filetype) {
+
+	if (filetype == KEYFILE_OPENSSH) {
+		return openssh_encrypted(filename);
+#if 0
+	} else if (filetype == KEYFILE_SSHCOM) {
+		return sshcom_encrypted(filename, NULL);
+#endif
+	}
+	return 0;
+}
+
+sign_key *import_read(const char *filename, char *passphrase, int filetype) {
+
+	if (filetype == KEYFILE_OPENSSH) {
+		return openssh_read(filename, passphrase);
+	} else if (filetype == KEYFILE_DROPBEAR) {
+		return dropbear_read(filename);
+#if 0
+	} else if (filetype == KEYFILE_SSHCOM) {
+		return sshcom_read(filename, passphrase);
+#endif
+	}
+	return NULL;
+}
+
+int import_write(const char *filename, sign_key *key, char *passphrase,
+		int filetype) {
+
+	if (filetype == KEYFILE_OPENSSH) {
+		return openssh_write(filename, key, passphrase);
+	} else if (filetype == KEYFILE_DROPBEAR) {
+		return dropbear_write(filename, key);
+#if 0
+	} else if (filetype == KEYFILE_SSHCOM) {
+		return sshcom_write(filename, key, passphrase);
+#endif
+	}
+	return 0;
+}
+
+static sign_key *dropbear_read(const char* filename) {
+
+	buffer * buf = NULL;
+	sign_key *ret = NULL;
+	int type;
+
+	buf = buf_new(MAX_PRIVKEY_SIZE);
+	if (buf_readfile(buf, filename) == DROPBEAR_FAILURE) {
+		goto error;
+	}
+
+	buf_setpos(buf, 0);
+	ret = new_sign_key();
+
+	type = DROPBEAR_SIGNKEY_ANY;
+	if (buf_get_priv_key(buf, ret, &type) == DROPBEAR_FAILURE){
+		goto error;
+	}
+	buf_free(buf);
+
+	return ret;
+
+error:
+	if (buf) {
+		buf_free(buf);
+	}
+	if (ret) {
+		sign_key_free(ret);
+	}
+	return NULL;
+}
+
+/* returns 0 on fail, 1 on success */
+static int dropbear_write(const char*filename, sign_key * key) {
+
+	int keytype = -1;
+	buffer * buf;
+	FILE*fp;
+	int len;
+	int ret;
+
+#ifdef DROPBEAR_RSA
+	if (key->rsakey != NULL) {
+		keytype = DROPBEAR_SIGNKEY_RSA;
+	}
+#endif
+#ifdef DROPBEAR_DSS
+	if (key->dsskey != NULL) {
+		keytype = DROPBEAR_SIGNKEY_DSS;
+	}
+#endif
+
+	buf = buf_new(MAX_PRIVKEY_SIZE);
+	buf_put_priv_key(buf, key, keytype);
+
+	fp = fopen(filename, "w");
+	if (!fp) {
+		ret = 0;
+		goto out;
+	}
+
+	buf_setpos(buf, 0);
+	do {
+		len = fwrite(buf_getptr(buf, buf->len - buf->pos),
+				1, buf->len - buf->pos, fp);
+		buf_incrpos(buf, len);
+	} while (len > 0 && buf->len != buf->pos);
+
+	fclose(fp);
+
+	if (buf->pos != buf->len) {
+		ret = 0;
+	} else {
+		ret = 1;
+	}
+out:
+	buf_free(buf);
+	return ret;
+}
+
+
+/* ----------------------------------------------------------------------
+ * Helper routines. (The base64 ones are defined in sshpubk.c.)
+ */
+
+#define isbase64(c) (	((c) >= 'A' && (c) <= 'Z') || \
+						 ((c) >= 'a' && (c) <= 'z') || \
+						 ((c) >= '0' && (c) <= '9') || \
+						 (c) == '+' || (c) == '/' || (c) == '=' \
+						 )
+
+/* cpl has to be less than 100 */
+static void base64_encode_fp(FILE * fp, unsigned char *data,
+		int datalen, int cpl)
+{
+    char out[100];
+    int n;
+	unsigned long outlen;
+	int rawcpl;
+	rawcpl = cpl * 3 / 4;
+	dropbear_assert((unsigned int)cpl < sizeof(out));
+
+    while (datalen > 0) {
+		n = (datalen < rawcpl ? datalen : rawcpl);
+		outlen = sizeof(out);
+		base64_encode(data, n, out, &outlen);
+		data += n;
+		datalen -= n;
+		fwrite(out, 1, outlen, fp);
+		fputc('\n', fp);
+    }
+}
+/*
+ * Read an ASN.1/BER identifier and length pair.
+ * 
+ * Flags are a combination of the #defines listed below.
+ * 
+ * Returns -1 if unsuccessful; otherwise returns the number of
+ * bytes used out of the source data.
+ */
+
+/* ASN.1 tag classes. */
+#define ASN1_CLASS_UNIVERSAL		(0 << 6)
+#define ASN1_CLASS_APPLICATION	  (1 << 6)
+#define ASN1_CLASS_CONTEXT_SPECIFIC (2 << 6)
+#define ASN1_CLASS_PRIVATE		  (3 << 6)
+#define ASN1_CLASS_MASK			 (3 << 6)
+
+/* Primitive versus constructed bit. */
+#define ASN1_CONSTRUCTED			(1 << 5)
+
+static int ber_read_id_len(void *source, int sourcelen,
+						   int *id, int *length, int *flags)
+{
+	unsigned char *p = (unsigned char *) source;
+
+	if (sourcelen == 0)
+		return -1;
+
+	*flags = (*p & 0xE0);
+	if ((*p & 0x1F) == 0x1F) {
+		*id = 0;
+		while (*p & 0x80) {
+			*id = (*id << 7) | (*p & 0x7F);
+			p++, sourcelen--;
+			if (sourcelen == 0)
+				return -1;
+		}
+		*id = (*id << 7) | (*p & 0x7F);
+		p++, sourcelen--;
+	} else {
+		*id = *p & 0x1F;
+		p++, sourcelen--;
+	}
+
+	if (sourcelen == 0)
+		return -1;
+
+	if (*p & 0x80) {
+		int n = *p & 0x7F;
+		p++, sourcelen--;
+		if (sourcelen < n)
+			return -1;
+		*length = 0;
+		while (n--)
+			*length = (*length << 8) | (*p++);
+		sourcelen -= n;
+	} else {
+		*length = *p;
+		p++, sourcelen--;
+	}
+
+	return p - (unsigned char *) source;
+}
+
+/*
+ * Write an ASN.1/BER identifier and length pair. Returns the
+ * number of bytes consumed. Assumes dest contains enough space.
+ * Will avoid writing anything if dest is NULL, but still return
+ * amount of space required.
+ */
+static int ber_write_id_len(void *dest, int id, int length, int flags)
+{
+	unsigned char *d = (unsigned char *)dest;
+	int len = 0;
+
+	if (id <= 30) {
+		/*
+		 * Identifier is one byte.
+		 */
+		len++;
+		if (d) *d++ = id | flags;
+	} else {
+		int n;
+		/*
+		 * Identifier is multiple bytes: the first byte is 11111
+		 * plus the flags, and subsequent bytes encode the value of
+		 * the identifier, 7 bits at a time, with the top bit of
+		 * each byte 1 except the last one which is 0.
+		 */
+		len++;
+		if (d) *d++ = 0x1F | flags;
+		for (n = 1; (id >> (7*n)) > 0; n++)
+			continue;					   /* count the bytes */
+		while (n--) {
+			len++;
+			if (d) *d++ = (n ? 0x80 : 0) | ((id >> (7*n)) & 0x7F);
+		}
+	}
+
+	if (length < 128) {
+		/*
+		 * Length is one byte.
+		 */
+		len++;
+		if (d) *d++ = length;
+	} else {
+		int n;
+		/*
+		 * Length is multiple bytes. The first is 0x80 plus the
+		 * number of subsequent bytes, and the subsequent bytes
+		 * encode the actual length.
+		 */
+		for (n = 1; (length >> (8*n)) > 0; n++)
+			continue;					   /* count the bytes */
+		len++;
+		if (d) *d++ = 0x80 | n;
+		while (n--) {
+			len++;
+			if (d) *d++ = (length >> (8*n)) & 0xFF;
+		}
+	}
+
+	return len;
+}
+
+
+/* Simple structure to point to an mp-int within a blob. */
+struct mpint_pos { void *start; int bytes; };
+
+/* ----------------------------------------------------------------------
+ * Code to read and write OpenSSH private keys.
+ */
+
+enum { OSSH_DSA, OSSH_RSA };
+struct openssh_key {
+	int type;
+	int encrypted;
+	char iv[32];
+	unsigned char *keyblob;
+	unsigned int keyblob_len, keyblob_size;
+};
+
+static struct openssh_key *load_openssh_key(const char *filename)
+{
+	struct openssh_key *ret;
+	FILE *fp;
+	char buffer[256];
+	char *errmsg = NULL, *p = NULL;
+	int headers_done;
+	unsigned long len, outlen;
+
+	ret = (struct openssh_key*)m_malloc(sizeof(struct openssh_key));
+	ret->keyblob = NULL;
+	ret->keyblob_len = ret->keyblob_size = 0;
+	ret->encrypted = 0;
+	memset(ret->iv, 0, sizeof(ret->iv));
+
+	if (strlen(filename) == 1 && filename[0] == '-') {
+		fp = stdin;
+	} else {
+		fp = fopen(filename, "r");
+	}
+	if (!fp) {
+		errmsg = "Unable to open key file";
+		goto error;
+	}
+	if (!fgets(buffer, sizeof(buffer), fp) ||
+		0 != strncmp(buffer, "-----BEGIN ", 11) ||
+		0 != strcmp(buffer+strlen(buffer)-17, "PRIVATE KEY-----\n")) {
+		errmsg = "File does not begin with OpenSSH key header";
+		goto error;
+	}
+	if (!strcmp(buffer, "-----BEGIN RSA PRIVATE KEY-----\n"))
+		ret->type = OSSH_RSA;
+	else if (!strcmp(buffer, "-----BEGIN DSA PRIVATE KEY-----\n"))
+		ret->type = OSSH_DSA;
+	else {
+		errmsg = "Unrecognised key type";
+		goto error;
+	}
+
+	headers_done = 0;
+	while (1) {
+		if (!fgets(buffer, sizeof(buffer), fp)) {
+			errmsg = "Unexpected end of file";
+			goto error;
+		}
+		if (0 == strncmp(buffer, "-----END ", 9) &&
+			0 == strcmp(buffer+strlen(buffer)-17, "PRIVATE KEY-----\n"))
+			break;					   /* done */
+		if ((p = strchr(buffer, ':')) != NULL) {
+			if (headers_done) {
+				errmsg = "Header found in body of key data";
+				goto error;
+			}
+			*p++ = '\0';
+			while (*p && isspace((unsigned char)*p)) p++;
+			if (!strcmp(buffer, "Proc-Type")) {
+				if (p[0] != '4' || p[1] != ',') {
+					errmsg = "Proc-Type is not 4 (only 4 is supported)";
+					goto error;
+				}
+				p += 2;
+				if (!strcmp(p, "ENCRYPTED\n"))
+					ret->encrypted = 1;
+			} else if (!strcmp(buffer, "DEK-Info")) {
+				int i, j;
+
+				if (strncmp(p, "DES-EDE3-CBC,", 13)) {
+					errmsg = "Ciphers other than DES-EDE3-CBC not supported";
+					goto error;
+				}
+				p += 13;
+				for (i = 0; i < 8; i++) {
+					if (1 != sscanf(p, "%2x", &j))
+						break;
+					ret->iv[i] = j;
+					p += 2;
+				}
+				if (i < 8) {
+					errmsg = "Expected 16-digit iv in DEK-Info";
+					goto error;
+				}
+			}
+		} else {
+			headers_done = 1;
+			len = strlen(buffer);
+			outlen = len*4/3;
+			if (ret->keyblob_len + outlen > ret->keyblob_size) {
+				ret->keyblob_size = ret->keyblob_len + outlen + 256;
+				ret->keyblob = (unsigned char*)m_realloc(ret->keyblob,
+						ret->keyblob_size);
+			}
+			outlen = ret->keyblob_size - ret->keyblob_len;
+			if (base64_decode(buffer, len, 
+						ret->keyblob + ret->keyblob_len, &outlen) != CRYPT_OK){
+				errmsg = "Error decoding base64";
+				goto error;
+			}
+			ret->keyblob_len += outlen;
+		}
+	}
+
+	if (ret->keyblob_len == 0 || !ret->keyblob) {
+		errmsg = "Key body not present";
+		goto error;
+	}
+
+	if (ret->encrypted && ret->keyblob_len % 8 != 0) {
+		errmsg = "Encrypted key blob is not a multiple of cipher block size";
+		goto error;
+	}
+
+	memset(buffer, 0, sizeof(buffer));
+	return ret;
+
+	error:
+	memset(buffer, 0, sizeof(buffer));
+	if (ret) {
+		if (ret->keyblob) {
+			memset(ret->keyblob, 0, ret->keyblob_size);
+			m_free(ret->keyblob);
+		}
+		memset(&ret, 0, sizeof(ret));
+		m_free(ret);
+	}
+	if (errmsg) {
+		fprintf(stderr, "Error: %s\n", errmsg);
+	}
+	return NULL;
+}
+
+static int openssh_encrypted(const char *filename)
+{
+	struct openssh_key *key = load_openssh_key(filename);
+	int ret;
+
+	if (!key)
+		return 0;
+	ret = key->encrypted;
+	memset(key->keyblob, 0, key->keyblob_size);
+	m_free(key->keyblob);
+	memset(&key, 0, sizeof(key));
+	m_free(key);
+	return ret;
+}
+
+static sign_key *openssh_read(const char *filename, char *passphrase)
+{
+	struct openssh_key *key;
+	unsigned char *p;
+	int ret, id, len, flags;
+	int i, num_integers = 0;
+	sign_key *retval = NULL;
+	char *errmsg;
+	char *modptr = NULL;
+	int modlen = -9999;
+	int type;
+
+	sign_key *retkey;
+	buffer * blobbuf = NULL;
+
+	key = load_openssh_key(filename);
+
+	if (!key)
+		return NULL;
+
+	if (key->encrypted) {
+		errmsg = "encrypted keys not supported currently";
+		goto error;
+#if 0
+		/* matt TODO */
+		/*
+		 * Derive encryption key from passphrase and iv/salt:
+		 * 
+		 *  - let block A equal MD5(passphrase || iv)
+		 *  - let block B equal MD5(A || passphrase || iv)
+		 *  - block C would be MD5(B || passphrase || iv) and so on
+		 *  - encryption key is the first N bytes of A || B
+		 */
+		struct MD5Context md5c;
+		unsigned char keybuf[32];
+
+		MD5Init(&md5c);
+		MD5Update(&md5c, (unsigned char *)passphrase, strlen(passphrase));
+		MD5Update(&md5c, (unsigned char *)key->iv, 8);
+		MD5Final(keybuf, &md5c);
+
+		MD5Init(&md5c);
+		MD5Update(&md5c, keybuf, 16);
+		MD5Update(&md5c, (unsigned char *)passphrase, strlen(passphrase));
+		MD5Update(&md5c, (unsigned char *)key->iv, 8);
+		MD5Final(keybuf+16, &md5c);
+
+		/*
+		 * Now decrypt the key blob.
+		 */
+		des3_decrypt_pubkey_ossh(keybuf, (unsigned char *)key->iv,
+								 key->keyblob, key->keyblob_len);
+
+		memset(&md5c, 0, sizeof(md5c));
+		memset(keybuf, 0, sizeof(keybuf));
+#endif 
+	}
+
+	/*
+	 * Now we have a decrypted key blob, which contains an ASN.1
+	 * encoded private key. We must now untangle the ASN.1.
+	 *
+	 * We expect the whole key blob to be formatted as a SEQUENCE
+	 * (0x30 followed by a length code indicating that the rest of
+	 * the blob is part of the sequence). Within that SEQUENCE we
+	 * expect to see a bunch of INTEGERs. What those integers mean
+	 * depends on the key type:
+	 *
+	 *  - For RSA, we expect the integers to be 0, n, e, d, p, q,
+	 *	dmp1, dmq1, iqmp in that order. (The last three are d mod
+	 *	(p-1), d mod (q-1), inverse of q mod p respectively.)
+	 *
+	 *  - For DSA, we expect them to be 0, p, q, g, y, x in that
+	 *	order.
+	 */
+	
+	p = key->keyblob;
+
+	/* Expect the SEQUENCE header. Take its absence as a failure to decrypt. */
+	ret = ber_read_id_len(p, key->keyblob_len, &id, &len, &flags);
+	p += ret;
+	if (ret < 0 || id != 16) {
+		errmsg = "ASN.1 decoding failure - wrong password?";
+		goto error;
+	}
+
+	/* Expect a load of INTEGERs. */
+	if (key->type == OSSH_RSA)
+		num_integers = 9;
+	else if (key->type == OSSH_DSA)
+		num_integers = 6;
+
+	/*
+	 * Space to create key blob in.
+	 */
+	blobbuf = buf_new(3000);
+
+	if (key->type == OSSH_DSA) {
+		buf_putstring(blobbuf, "ssh-dss", 7);
+	} else if (key->type == OSSH_RSA) {
+		buf_putstring(blobbuf, "ssh-rsa", 7);
+	}
+
+	for (i = 0; i < num_integers; i++) {
+		ret = ber_read_id_len(p, key->keyblob+key->keyblob_len-p,
+							  &id, &len, &flags);
+		p += ret;
+		if (ret < 0 || id != 2 ||
+			key->keyblob+key->keyblob_len-p < len) {
+			errmsg = "ASN.1 decoding failure";
+			goto error;
+		}
+
+		if (i == 0) {
+			/*
+			 * The first integer should be zero always (I think
+			 * this is some sort of version indication).
+			 */
+			if (len != 1 || p[0] != 0) {
+				errmsg = "Version number mismatch";
+				goto error;
+			}
+		} else if (key->type == OSSH_RSA) {
+			/*
+			 * OpenSSH key order is n, e, d, p, q, dmp1, dmq1, iqmp
+			 * but we want e, n, d, p, q
+			 */
+			if (i == 1) {
+				/* Save the details for after we deal with number 2. */
+				modptr = (char *)p;
+				modlen = len;
+			} else if (i >= 2 && i <= 5) {
+				buf_putstring(blobbuf, p, len);
+				if (i == 2) {
+					buf_putstring(blobbuf, modptr, modlen);
+				}
+			}
+		} else if (key->type == OSSH_DSA) {
+			/*
+			 * OpenSSH key order is p, q, g, y, x,
+			 * we want the same.
+			 */
+			buf_putstring(blobbuf, p, len);
+		}
+
+		/* Skip past the number. */
+		p += len;
+	}
+
+	/*
+	 * Now put together the actual key. Simplest way to do this is
+	 * to assemble our own key blobs and feed them to the createkey
+	 * functions; this is a bit faffy but it does mean we get all
+	 * the sanity checks for free.
+	 */
+	retkey = new_sign_key();
+	buf_setpos(blobbuf, 0);
+	type = DROPBEAR_SIGNKEY_ANY;
+	if (buf_get_priv_key(blobbuf, retkey, &type)
+			!= DROPBEAR_SUCCESS) {
+		errmsg = "unable to create key structure";
+		sign_key_free(retkey);
+		retkey = NULL;
+		goto error;
+	}
+
+	errmsg = NULL;					 /* no error */
+	retval = retkey;
+
+	error:
+	if (blobbuf) {
+		buf_burn(blobbuf);
+		buf_free(blobbuf);
+	}
+	m_burn(key->keyblob, key->keyblob_size);
+	m_free(key->keyblob);
+	m_burn(key, sizeof(key));
+	m_free(key);
+	if (errmsg) {
+		fprintf(stderr, "Error: %s\n", errmsg);
+	}
+	return retval;
+}
+
+static int openssh_write(const char *filename, sign_key *key,
+				  char *passphrase)
+{
+	buffer * keyblob = NULL;
+	buffer * extrablob = NULL; /* used for calculated values to write */
+	unsigned char *outblob = NULL;
+	int outlen = -9999;
+	struct mpint_pos numbers[9];
+	int nnumbers = -1, pos, len, seqlen, i;
+	char *header = NULL, *footer = NULL;
+	char zero[1];
+	unsigned char iv[8];
+	int ret = 0;
+	FILE *fp;
+	int keytype = -1;
+
+#ifdef DROPBEAR_RSA
+	mp_int dmp1, dmq1, iqmp, tmpval; /* for rsa */
+
+	if (key->rsakey != NULL) {
+		keytype = DROPBEAR_SIGNKEY_RSA;
+	}
+#endif
+#ifdef DROPBEAR_DSS
+	if (key->dsskey != NULL) {
+		keytype = DROPBEAR_SIGNKEY_DSS;
+	}
+#endif
+
+	dropbear_assert(keytype != -1);
+
+	/*
+	 * Fetch the key blobs.
+	 */
+	keyblob = buf_new(3000);
+	buf_put_priv_key(keyblob, key, keytype);
+
+	buf_setpos(keyblob, 0);
+	/* skip the "ssh-rsa" or "ssh-dss" header */
+	buf_incrpos(keyblob, buf_getint(keyblob));
+
+	/*
+	 * Find the sequence of integers to be encoded into the OpenSSH
+	 * key blob, and also decide on the header line.
+	 */
+	numbers[0].start = zero; numbers[0].bytes = 1; zero[0] = '\0';
+
+#ifdef DROPBEAR_RSA
+	if (keytype == DROPBEAR_SIGNKEY_RSA) {
+
+		if (key->rsakey->p == NULL || key->rsakey->q == NULL) {
+			fprintf(stderr, "Pre-0.33 Dropbear keys cannot be converted to OpenSSH keys.\n");
+			goto error;
+		}
+
+		/* e */
+		numbers[2].bytes = buf_getint(keyblob);
+		numbers[2].start = buf_getptr(keyblob, numbers[2].bytes);
+		buf_incrpos(keyblob, numbers[2].bytes);
+		
+		/* n */
+		numbers[1].bytes = buf_getint(keyblob);
+		numbers[1].start = buf_getptr(keyblob, numbers[1].bytes);
+		buf_incrpos(keyblob, numbers[1].bytes);
+		
+		/* d */
+		numbers[3].bytes = buf_getint(keyblob);
+		numbers[3].start = buf_getptr(keyblob, numbers[3].bytes);
+		buf_incrpos(keyblob, numbers[3].bytes);
+		
+		/* p */
+		numbers[4].bytes = buf_getint(keyblob);
+		numbers[4].start = buf_getptr(keyblob, numbers[4].bytes);
+		buf_incrpos(keyblob, numbers[4].bytes);
+		
+		/* q */
+		numbers[5].bytes = buf_getint(keyblob);
+		numbers[5].start = buf_getptr(keyblob, numbers[5].bytes);
+		buf_incrpos(keyblob, numbers[5].bytes);
+
+		/* now calculate some extra parameters: */
+		m_mp_init(&tmpval);
+		m_mp_init(&dmp1);
+		m_mp_init(&dmq1);
+		m_mp_init(&iqmp);
+
+		/* dmp1 = d mod (p-1) */
+		if (mp_sub_d(key->rsakey->p, 1, &tmpval) != MP_OKAY) {
+			fprintf(stderr, "Bignum error for p-1\n");
+			goto error;
+		}
+		if (mp_mod(key->rsakey->d, &tmpval, &dmp1) != MP_OKAY) {
+			fprintf(stderr, "Bignum error for dmp1\n");
+			goto error;
+		}
+
+		/* dmq1 = d mod (q-1) */
+		if (mp_sub_d(key->rsakey->q, 1, &tmpval) != MP_OKAY) {
+			fprintf(stderr, "Bignum error for q-1\n");
+			goto error;
+		}
+		if (mp_mod(key->rsakey->d, &tmpval, &dmq1) != MP_OKAY) {
+			fprintf(stderr, "Bignum error for dmq1\n");
+			goto error;
+		}
+
+		/* iqmp = (q^-1) mod p */
+		if (mp_invmod(key->rsakey->q, key->rsakey->p, &iqmp) != MP_OKAY) {
+			fprintf(stderr, "Bignum error for iqmp\n");
+			goto error;
+		}
+
+		extrablob = buf_new(2000);
+		buf_putmpint(extrablob, &dmp1);
+		buf_putmpint(extrablob, &dmq1);
+		buf_putmpint(extrablob, &iqmp);
+		buf_setpos(extrablob, 0);
+		mp_clear(&dmp1);
+		mp_clear(&dmq1);
+		mp_clear(&iqmp);
+		mp_clear(&tmpval);
+		
+		/* dmp1 */
+		numbers[6].bytes = buf_getint(extrablob);
+		numbers[6].start = buf_getptr(extrablob, numbers[6].bytes);
+		buf_incrpos(extrablob, numbers[6].bytes);
+		
+		/* dmq1 */
+		numbers[7].bytes = buf_getint(extrablob);
+		numbers[7].start = buf_getptr(extrablob, numbers[7].bytes);
+		buf_incrpos(extrablob, numbers[7].bytes);
+		
+		/* iqmp */
+		numbers[8].bytes = buf_getint(extrablob);
+		numbers[8].start = buf_getptr(extrablob, numbers[8].bytes);
+		buf_incrpos(extrablob, numbers[8].bytes);
+
+		nnumbers = 9;
+		header = "-----BEGIN RSA PRIVATE KEY-----\n";
+		footer = "-----END RSA PRIVATE KEY-----\n";
+	}
+#endif /* DROPBEAR_RSA */
+
+#ifdef DROPBEAR_DSS
+	if (keytype == DROPBEAR_SIGNKEY_DSS) {
+
+		/* p */
+		numbers[1].bytes = buf_getint(keyblob);
+		numbers[1].start = buf_getptr(keyblob, numbers[1].bytes);
+		buf_incrpos(keyblob, numbers[1].bytes);
+
+		/* q */
+		numbers[2].bytes = buf_getint(keyblob);
+		numbers[2].start = buf_getptr(keyblob, numbers[2].bytes);
+		buf_incrpos(keyblob, numbers[2].bytes);
+
+		/* g */
+		numbers[3].bytes = buf_getint(keyblob);
+		numbers[3].start = buf_getptr(keyblob, numbers[3].bytes);
+		buf_incrpos(keyblob, numbers[3].bytes);
+
+		/* y */
+		numbers[4].bytes = buf_getint(keyblob);
+		numbers[4].start = buf_getptr(keyblob, numbers[4].bytes);
+		buf_incrpos(keyblob, numbers[4].bytes);
+
+		/* x */
+		numbers[5].bytes = buf_getint(keyblob);
+		numbers[5].start = buf_getptr(keyblob, numbers[5].bytes);
+		buf_incrpos(keyblob, numbers[5].bytes);
+
+		nnumbers = 6;
+		header = "-----BEGIN DSA PRIVATE KEY-----\n";
+		footer = "-----END DSA PRIVATE KEY-----\n";
+	}
+#endif /* DROPBEAR_DSS */
+
+	/*
+	 * Now count up the total size of the ASN.1 encoded integers,
+	 * so as to determine the length of the containing SEQUENCE.
+	 */
+	len = 0;
+	for (i = 0; i < nnumbers; i++) {
+		len += ber_write_id_len(NULL, 2, numbers[i].bytes, 0);
+		len += numbers[i].bytes;
+	}
+	seqlen = len;
+	/* Now add on the SEQUENCE header. */
+	len += ber_write_id_len(NULL, 16, seqlen, ASN1_CONSTRUCTED);
+	/* Round up to the cipher block size, ensuring we have at least one
+	 * byte of padding (see below). */
+	outlen = len;
+	if (passphrase)
+		outlen = (outlen+8) &~ 7;
+
+	/*
+	 * Now we know how big outblob needs to be. Allocate it.
+	 */
+	outblob = (unsigned char*)m_malloc(outlen);
+
+	/*
+	 * And write the data into it.
+	 */
+	pos = 0;
+	pos += ber_write_id_len(outblob+pos, 16, seqlen, ASN1_CONSTRUCTED);
+	for (i = 0; i < nnumbers; i++) {
+		pos += ber_write_id_len(outblob+pos, 2, numbers[i].bytes, 0);
+		memcpy(outblob+pos, numbers[i].start, numbers[i].bytes);
+		pos += numbers[i].bytes;
+	}
+
+	/*
+	 * Padding on OpenSSH keys is deterministic. The number of
+	 * padding bytes is always more than zero, and always at most
+	 * the cipher block length. The value of each padding byte is
+	 * equal to the number of padding bytes. So a plaintext that's
+	 * an exact multiple of the block size will be padded with 08
+	 * 08 08 08 08 08 08 08 (assuming a 64-bit block cipher); a
+	 * plaintext one byte less than a multiple of the block size
+	 * will be padded with just 01.
+	 * 
+	 * This enables the OpenSSL key decryption function to strip
+	 * off the padding algorithmically and return the unpadded
+	 * plaintext to the next layer: it looks at the final byte, and
+	 * then expects to find that many bytes at the end of the data
+	 * with the same value. Those are all removed and the rest is
+	 * returned.
+	 */
+	dropbear_assert(pos == len);
+	while (pos < outlen) {
+		outblob[pos++] = outlen - len;
+	}
+
+	/*
+	 * Encrypt the key.
+	 */
+	if (passphrase) {
+		fprintf(stderr, "Encrypted keys aren't supported currently\n");
+		goto error;
+#if 0
+		/*
+		 * Invent an iv. Then derive encryption key from passphrase
+		 * and iv/salt:
+		 * 
+		 *  - let block A equal MD5(passphrase || iv)
+		 *  - let block B equal MD5(A || passphrase || iv)
+		 *  - block C would be MD5(B || passphrase || iv) and so on
+		 *  - encryption key is the first N bytes of A || B
+		 */
+		struct MD5Context md5c;
+		unsigned char keybuf[32];
+
+		for (i = 0; i < 8; i++) iv[i] = random_byte();
+
+		MD5Init(&md5c);
+		MD5Update(&md5c, (unsigned char *)passphrase, strlen(passphrase));
+		MD5Update(&md5c, iv, 8);
+		MD5Final(keybuf, &md5c);
+
+		MD5Init(&md5c);
+		MD5Update(&md5c, keybuf, 16);
+		MD5Update(&md5c, (unsigned char *)passphrase, strlen(passphrase));
+		MD5Update(&md5c, iv, 8);
+		MD5Final(keybuf+16, &md5c);
+
+		/*
+		 * Now encrypt the key blob.
+		 */
+		des3_encrypt_pubkey_ossh(keybuf, iv, outblob, outlen);
+
+		memset(&md5c, 0, sizeof(md5c));
+		memset(keybuf, 0, sizeof(keybuf));
+#endif
+	}
+
+	/*
+	 * And save it. We'll use Unix line endings just in case it's
+	 * subsequently transferred in binary mode.
+	 */
+	if (strlen(filename) == 1 && filename[0] == '-') {
+		fp = stdout;
+	} else {
+		fp = fopen(filename, "wb");	  /* ensure Unix line endings */
+	}
+	if (!fp) {
+		fprintf(stderr, "Failed opening output file\n");
+		goto error;
+	}
+	fputs(header, fp);
+	if (passphrase) {
+		fprintf(fp, "Proc-Type: 4,ENCRYPTED\nDEK-Info: DES-EDE3-CBC,");
+		for (i = 0; i < 8; i++)
+			fprintf(fp, "%02X", iv[i]);
+		fprintf(fp, "\n\n");
+	}
+	base64_encode_fp(fp, outblob, outlen, 64);
+	fputs(footer, fp);
+	fclose(fp);
+	ret = 1;
+
+	error:
+	if (outblob) {
+		memset(outblob, 0, outlen);
+		m_free(outblob);
+	}
+	if (keyblob) {
+		buf_burn(keyblob);
+		buf_free(keyblob);
+	}
+	if (extrablob) {
+		buf_burn(extrablob);
+		buf_free(extrablob);
+	}
+	return ret;
+}
+
+#if 0
+/* XXX TODO ssh.com stuff isn't going yet */
+
+/* ----------------------------------------------------------------------
+ * Code to read ssh.com private keys.
+ */
+
+/*
+ * The format of the base64 blob is largely ssh2-packet-formatted,
+ * except that mpints are a bit different: they're more like the
+ * old ssh1 mpint. You have a 32-bit bit count N, followed by
+ * (N+7)/8 bytes of data.
+ * 
+ * So. The blob contains:
+ * 
+ *  - uint32 0x3f6ff9eb	   (magic number)
+ *  - uint32 size			 (total blob size)
+ *  - string key-type		 (see below)
+ *  - string cipher-type	  (tells you if key is encrypted)
+ *  - string encrypted-blob
+ * 
+ * (The first size field includes the size field itself and the
+ * magic number before it. All other size fields are ordinary ssh2
+ * strings, so the size field indicates how much data is to
+ * _follow_.)
+ * 
+ * The encrypted blob, once decrypted, contains a single string
+ * which in turn contains the payload. (This allows padding to be
+ * added after that string while still making it clear where the
+ * real payload ends. Also it probably makes for a reasonable
+ * decryption check.)
+ * 
+ * The payload blob, for an RSA key, contains:
+ *  - mpint e
+ *  - mpint d
+ *  - mpint n  (yes, the public and private stuff is intermixed)
+ *  - mpint u  (presumably inverse of p mod q)
+ *  - mpint p  (p is the smaller prime)
+ *  - mpint q  (q is the larger)
+ * 
+ * For a DSA key, the payload blob contains:
+ *  - uint32 0
+ *  - mpint p
+ *  - mpint g
+ *  - mpint q
+ *  - mpint y
+ *  - mpint x
+ * 
+ * Alternatively, if the parameters are `predefined', that
+ * (0,p,g,q) sequence can be replaced by a uint32 1 and a string
+ * containing some predefined parameter specification. *shudder*,
+ * but I doubt we'll encounter this in real life.
+ * 
+ * The key type strings are ghastly. The RSA key I looked at had a
+ * type string of
+ * 
+ *   `if-modn{sign{rsa-pkcs1-sha1},encrypt{rsa-pkcs1v2-oaep}}'
+ * 
+ * and the DSA key wasn't much better:
+ * 
+ *   `dl-modp{sign{dsa-nist-sha1},dh{plain}}'
+ * 
+ * It isn't clear that these will always be the same. I think it
+ * might be wise just to look at the `if-modn{sign{rsa' and
+ * `dl-modp{sign{dsa' prefixes.
+ * 
+ * Finally, the encryption. The cipher-type string appears to be
+ * either `none' or `3des-cbc'. Looks as if this is SSH2-style
+ * 3des-cbc (i.e. outer cbc rather than inner). The key is created
+ * from the passphrase by means of yet another hashing faff:
+ * 
+ *  - first 16 bytes are MD5(passphrase)
+ *  - next 16 bytes are MD5(passphrase || first 16 bytes)
+ *  - if there were more, they'd be MD5(passphrase || first 32),
+ *	and so on.
+ */
+
+#define SSHCOM_MAGIC_NUMBER 0x3f6ff9eb
+
+struct sshcom_key {
+	char comment[256];				 /* allowing any length is overkill */
+	unsigned char *keyblob;
+	int keyblob_len, keyblob_size;
+};
+
+static struct sshcom_key *load_sshcom_key(const char *filename)
+{
+	struct sshcom_key *ret;
+	FILE *fp;
+	char buffer[256];
+	int len;
+	char *errmsg, *p;
+	int headers_done;
+	char base64_bit[4];
+	int base64_chars = 0;
+
+	ret = snew(struct sshcom_key);
+	ret->comment[0] = '\0';
+	ret->keyblob = NULL;
+	ret->keyblob_len = ret->keyblob_size = 0;
+
+	fp = fopen(filename, "r");
+	if (!fp) {
+		errmsg = "Unable to open key file";
+		goto error;
+	}
+	if (!fgets(buffer, sizeof(buffer), fp) ||
+		0 != strcmp(buffer, "---- BEGIN SSH2 ENCRYPTED PRIVATE KEY ----\n")) {
+		errmsg = "File does not begin with ssh.com key header";
+		goto error;
+	}
+
+	headers_done = 0;
+	while (1) {
+		if (!fgets(buffer, sizeof(buffer), fp)) {
+			errmsg = "Unexpected end of file";
+			goto error;
+		}
+		if (!strcmp(buffer, "---- END SSH2 ENCRYPTED PRIVATE KEY ----\n"))
+			break;					 /* done */
+		if ((p = strchr(buffer, ':')) != NULL) {
+			if (headers_done) {
+				errmsg = "Header found in body of key data";
+				goto error;
+			}
+			*p++ = '\0';
+			while (*p && isspace((unsigned char)*p)) p++;
+			/*
+			 * Header lines can end in a trailing backslash for
+			 * continuation.
+			 */
+			while ((len = strlen(p)) > (int)(sizeof(buffer) - (p-buffer) -1) ||
+				   p[len-1] != '\n' || p[len-2] == '\\') {
+				if (len > (int)((p-buffer) + sizeof(buffer)-2)) {
+					errmsg = "Header line too long to deal with";
+					goto error;
+				}
+				if (!fgets(p+len-2, sizeof(buffer)-(p-buffer)-(len-2), fp)) {
+					errmsg = "Unexpected end of file";
+					goto error;
+				}
+			}
+			p[strcspn(p, "\n")] = '\0';
+			if (!strcmp(buffer, "Comment")) {
+				/* Strip quotes in comment if present. */
+				if (p[0] == '"' && p[strlen(p)-1] == '"') {
+					p++;
+					p[strlen(p)-1] = '\0';
+				}
+				strncpy(ret->comment, p, sizeof(ret->comment));
+				ret->comment[sizeof(ret->comment)-1] = '\0';
+			}
+		} else {
+			headers_done = 1;
+
+			p = buffer;
+			while (isbase64(*p)) {
+				base64_bit[base64_chars++] = *p;
+				if (base64_chars == 4) {
+					unsigned char out[3];
+
+					base64_chars = 0;
+
+					len = base64_decode_atom(base64_bit, out);
+
+					if (len <= 0) {
+						errmsg = "Invalid base64 encoding";
+						goto error;
+					}
+
+					if (ret->keyblob_len + len > ret->keyblob_size) {
+						ret->keyblob_size = ret->keyblob_len + len + 256;
+						ret->keyblob = sresize(ret->keyblob, ret->keyblob_size,
+											   unsigned char);
+					}
+
+					memcpy(ret->keyblob + ret->keyblob_len, out, len);
+					ret->keyblob_len += len;
+				}
+
+				p++;
+			}
+		}
+	}
+
+	if (ret->keyblob_len == 0 || !ret->keyblob) {
+		errmsg = "Key body not present";
+		goto error;
+	}
+
+	return ret;
+
+	error:
+	if (ret) {
+		if (ret->keyblob) {
+			memset(ret->keyblob, 0, ret->keyblob_size);
+			m_free(ret->keyblob);
+		}
+		memset(&ret, 0, sizeof(ret));
+		m_free(ret);
+	}
+	return NULL;
+}
+
+int sshcom_encrypted(const char *filename, char **comment)
+{
+	struct sshcom_key *key = load_sshcom_key(filename);
+	int pos, len, answer;
+
+	*comment = NULL;
+	if (!key)
+		return 0;
+
+	/*
+	 * Check magic number.
+	 */
+	if (GET_32BIT(key->keyblob) != 0x3f6ff9eb)
+		return 0;					  /* key is invalid */
+
+	/*
+	 * Find the cipher-type string.
+	 */
+	answer = 0;
+	pos = 8;
+	if (key->keyblob_len < pos+4)
+		goto done;					 /* key is far too short */
+	pos += 4 + GET_32BIT(key->keyblob + pos);   /* skip key type */
+	if (key->keyblob_len < pos+4)
+		goto done;					 /* key is far too short */
+	len = GET_32BIT(key->keyblob + pos);   /* find cipher-type length */
+	if (key->keyblob_len < pos+4+len)
+		goto done;					 /* cipher type string is incomplete */
+	if (len != 4 || 0 != memcmp(key->keyblob + pos + 4, "none", 4))
+		answer = 1;
+
+	done:
+	*comment = dupstr(key->comment);
+	memset(key->keyblob, 0, key->keyblob_size);
+	m_free(key->keyblob);
+	memset(&key, 0, sizeof(key));
+	m_free(key);
+	return answer;
+}
+
+static int sshcom_read_mpint(void *data, int len, struct mpint_pos *ret)
+{
+	int bits;
+	int bytes;
+	unsigned char *d = (unsigned char *) data;
+
+	if (len < 4)
+		goto error;
+	bits = GET_32BIT(d);
+
+	bytes = (bits + 7) / 8;
+	if (len < 4+bytes)
+		goto error;
+
+	ret->start = d + 4;
+	ret->bytes = bytes;
+	return bytes+4;
+
+	error:
+	ret->start = NULL;
+	ret->bytes = -1;
+	return len;						/* ensure further calls fail as well */
+}
+
+static int sshcom_put_mpint(void *target, void *data, int len)
+{
+	unsigned char *d = (unsigned char *)target;
+	unsigned char *i = (unsigned char *)data;
+	int bits = len * 8 - 1;
+
+	while (bits > 0) {
+		if (*i & (1 << (bits & 7)))
+			break;
+		if (!(bits-- & 7))
+			i++, len--;
+	}
+
+	PUT_32BIT(d, bits+1);
+	memcpy(d+4, i, len);
+	return len+4;
+}
+
+sign_key *sshcom_read(const char *filename, char *passphrase)
+{
+	struct sshcom_key *key = load_sshcom_key(filename);
+	char *errmsg;
+	int pos, len;
+	const char prefix_rsa[] = "if-modn{sign{rsa";
+	const char prefix_dsa[] = "dl-modp{sign{dsa";
+	enum { RSA, DSA } type;
+	int encrypted;
+	char *ciphertext;
+	int cipherlen;
+	struct ssh2_userkey *ret = NULL, *retkey;
+	const struct ssh_signkey *alg;
+	unsigned char *blob = NULL;
+	int blobsize, publen, privlen;
+
+	if (!key)
+		return NULL;
+
+	/*
+	 * Check magic number.
+	 */
+	if (GET_32BIT(key->keyblob) != SSHCOM_MAGIC_NUMBER) {
+		errmsg = "Key does not begin with magic number";
+		goto error;
+	}
+
+	/*
+	 * Determine the key type.
+	 */
+	pos = 8;
+	if (key->keyblob_len < pos+4 ||
+		(len = GET_32BIT(key->keyblob + pos)) > key->keyblob_len - pos - 4) {
+		errmsg = "Key blob does not contain a key type string";
+		goto error;
+	}
+	if (len > sizeof(prefix_rsa) - 1 &&
+		!memcmp(key->keyblob+pos+4, prefix_rsa, sizeof(prefix_rsa) - 1)) {
+		type = RSA;
+	} else if (len > sizeof(prefix_dsa) - 1 &&
+		!memcmp(key->keyblob+pos+4, prefix_dsa, sizeof(prefix_dsa) - 1)) {
+		type = DSA;
+	} else {
+		errmsg = "Key is of unknown type";
+		goto error;
+	}
+	pos += 4+len;
+
+	/*
+	 * Determine the cipher type.
+	 */
+	if (key->keyblob_len < pos+4 ||
+		(len = GET_32BIT(key->keyblob + pos)) > key->keyblob_len - pos - 4) {
+		errmsg = "Key blob does not contain a cipher type string";
+		goto error;
+	}
+	if (len == 4 && !memcmp(key->keyblob+pos+4, "none", 4))
+		encrypted = 0;
+	else if (len == 8 && !memcmp(key->keyblob+pos+4, "3des-cbc", 8))
+		encrypted = 1;
+	else {
+		errmsg = "Key encryption is of unknown type";
+		goto error;
+	}
+	pos += 4+len;
+
+	/*
+	 * Get hold of the encrypted part of the key.
+	 */
+	if (key->keyblob_len < pos+4 ||
+		(len = GET_32BIT(key->keyblob + pos)) > key->keyblob_len - pos - 4) {
+		errmsg = "Key blob does not contain actual key data";
+		goto error;
+	}
+	ciphertext = (char *)key->keyblob + pos + 4;
+	cipherlen = len;
+	if (cipherlen == 0) {
+		errmsg = "Length of key data is zero";
+		goto error;
+	}
+
+	/*
+	 * Decrypt it if necessary.
+	 */
+	if (encrypted) {
+		/*
+		 * Derive encryption key from passphrase and iv/salt:
+		 * 
+		 *  - let block A equal MD5(passphrase)
+		 *  - let block B equal MD5(passphrase || A)
+		 *  - block C would be MD5(passphrase || A || B) and so on
+		 *  - encryption key is the first N bytes of A || B
+		 */
+		struct MD5Context md5c;
+		unsigned char keybuf[32], iv[8];
+
+		if (cipherlen % 8 != 0) {
+			errmsg = "Encrypted part of key is not a multiple of cipher block"
+				" size";
+			goto error;
+		}
+
+		MD5Init(&md5c);
+		MD5Update(&md5c, (unsigned char *)passphrase, strlen(passphrase));
+		MD5Final(keybuf, &md5c);
+
+		MD5Init(&md5c);
+		MD5Update(&md5c, (unsigned char *)passphrase, strlen(passphrase));
+		MD5Update(&md5c, keybuf, 16);
+		MD5Final(keybuf+16, &md5c);
+
+		/*
+		 * Now decrypt the key blob.
+		 */
+		memset(iv, 0, sizeof(iv));
+		des3_decrypt_pubkey_ossh(keybuf, iv, (unsigned char *)ciphertext,
+								 cipherlen);
+
+		memset(&md5c, 0, sizeof(md5c));
+		memset(keybuf, 0, sizeof(keybuf));
+
+		/*
+		 * Hereafter we return WRONG_PASSPHRASE for any parsing
+		 * error. (But only if we've just tried to decrypt it!
+		 * Returning WRONG_PASSPHRASE for an unencrypted key is
+		 * automatic doom.)
+		 */
+		if (encrypted)
+			ret = SSH2_WRONG_PASSPHRASE;
+	}
+
+	/*
+	 * Strip away the containing string to get to the real meat.
+	 */
+	len = GET_32BIT(ciphertext);
+	if (len > cipherlen-4) {
+		errmsg = "containing string was ill-formed";
+		goto error;
+	}
+	ciphertext += 4;
+	cipherlen = len;
+
+	/*
+	 * Now we break down into RSA versus DSA. In either case we'll
+	 * construct public and private blobs in our own format, and
+	 * end up feeding them to alg->createkey().
+	 */
+	blobsize = cipherlen + 256;
+	blob = snewn(blobsize, unsigned char);
+	privlen = 0;
+	if (type == RSA) {
+		struct mpint_pos n, e, d, u, p, q;
+		int pos = 0;
+		pos += sshcom_read_mpint(ciphertext+pos, cipherlen-pos, &e);
+		pos += sshcom_read_mpint(ciphertext+pos, cipherlen-pos, &d);
+		pos += sshcom_read_mpint(ciphertext+pos, cipherlen-pos, &n);
+		pos += sshcom_read_mpint(ciphertext+pos, cipherlen-pos, &u);
+		pos += sshcom_read_mpint(ciphertext+pos, cipherlen-pos, &p);
+		pos += sshcom_read_mpint(ciphertext+pos, cipherlen-pos, &q);
+		if (!q.start) {
+			errmsg = "key data did not contain six integers";
+			goto error;
+		}
+
+		alg = &ssh_rsa;
+		pos = 0;
+		pos += put_string(blob+pos, "ssh-rsa", 7);
+		pos += put_mp(blob+pos, e.start, e.bytes);
+		pos += put_mp(blob+pos, n.start, n.bytes);
+		publen = pos;
+		pos += put_string(blob+pos, d.start, d.bytes);
+		pos += put_mp(blob+pos, q.start, q.bytes);
+		pos += put_mp(blob+pos, p.start, p.bytes);
+		pos += put_mp(blob+pos, u.start, u.bytes);
+		privlen = pos - publen;
+	} else if (type == DSA) {
+		struct mpint_pos p, q, g, x, y;
+		int pos = 4;
+		if (GET_32BIT(ciphertext) != 0) {
+			errmsg = "predefined DSA parameters not supported";
+			goto error;
+		}
+		pos += sshcom_read_mpint(ciphertext+pos, cipherlen-pos, &p);
+		pos += sshcom_read_mpint(ciphertext+pos, cipherlen-pos, &g);
+		pos += sshcom_read_mpint(ciphertext+pos, cipherlen-pos, &q);
+		pos += sshcom_read_mpint(ciphertext+pos, cipherlen-pos, &y);
+		pos += sshcom_read_mpint(ciphertext+pos, cipherlen-pos, &x);
+		if (!x.start) {
+			errmsg = "key data did not contain five integers";
+			goto error;
+		}
+
+		alg = &ssh_dss;
+		pos = 0;
+		pos += put_string(blob+pos, "ssh-dss", 7);
+		pos += put_mp(blob+pos, p.start, p.bytes);
+		pos += put_mp(blob+pos, q.start, q.bytes);
+		pos += put_mp(blob+pos, g.start, g.bytes);
+		pos += put_mp(blob+pos, y.start, y.bytes);
+		publen = pos;
+		pos += put_mp(blob+pos, x.start, x.bytes);
+		privlen = pos - publen;
+	}
+
+	dropbear_assert(privlen > 0);			   /* should have bombed by now if not */
+
+	retkey = snew(struct ssh2_userkey);
+	retkey->alg = alg;
+	retkey->data = alg->createkey(blob, publen, blob+publen, privlen);
+	if (!retkey->data) {
+		m_free(retkey);
+		errmsg = "unable to create key data structure";
+		goto error;
+	}
+	retkey->comment = dupstr(key->comment);
+
+	errmsg = NULL; /* no error */
+	ret = retkey;
+
+	error:
+	if (blob) {
+		memset(blob, 0, blobsize);
+		m_free(blob);
+	}
+	memset(key->keyblob, 0, key->keyblob_size);
+	m_free(key->keyblob);
+	memset(&key, 0, sizeof(key));
+	m_free(key);
+	return ret;
+}
+
+int sshcom_write(const char *filename, sign_key *key,
+				 char *passphrase)
+{
+	unsigned char *pubblob, *privblob;
+	int publen, privlen;
+	unsigned char *outblob;
+	int outlen;
+	struct mpint_pos numbers[6];
+	int nnumbers, initial_zero, pos, lenpos, i;
+	char *type;
+	char *ciphertext;
+	int cipherlen;
+	int ret = 0;
+	FILE *fp;
+
+	/*
+	 * Fetch the key blobs.
+	 */
+	pubblob = key->alg->public_blob(key->data, &publen);
+	privblob = key->alg->private_blob(key->data, &privlen);
+	outblob = NULL;
+
+	/*
+	 * Find the sequence of integers to be encoded into the OpenSSH
+	 * key blob, and also decide on the header line.
+	 */
+	if (key->alg == &ssh_rsa) {
+		int pos;
+		struct mpint_pos n, e, d, p, q, iqmp;
+
+		pos = 4 + GET_32BIT(pubblob);
+		pos += ssh2_read_mpint(pubblob+pos, publen-pos, &e);
+		pos += ssh2_read_mpint(pubblob+pos, publen-pos, &n);
+		pos = 0;
+		pos += ssh2_read_mpint(privblob+pos, privlen-pos, &d);
+		pos += ssh2_read_mpint(privblob+pos, privlen-pos, &p);
+		pos += ssh2_read_mpint(privblob+pos, privlen-pos, &q);
+		pos += ssh2_read_mpint(privblob+pos, privlen-pos, &iqmp);
+
+		dropbear_assert(e.start && iqmp.start); /* can't go wrong */
+
+		numbers[0] = e;
+		numbers[1] = d;
+		numbers[2] = n;
+		numbers[3] = iqmp;
+		numbers[4] = q;
+		numbers[5] = p;
+
+		nnumbers = 6;
+		initial_zero = 0;
+		type = "if-modn{sign{rsa-pkcs1-sha1},encrypt{rsa-pkcs1v2-oaep}}";
+	} else if (key->alg == &ssh_dss) {
+		int pos;
+		struct mpint_pos p, q, g, y, x;
+
+		pos = 4 + GET_32BIT(pubblob);
+		pos += ssh2_read_mpint(pubblob+pos, publen-pos, &p);
+		pos += ssh2_read_mpint(pubblob+pos, publen-pos, &q);
+		pos += ssh2_read_mpint(pubblob+pos, publen-pos, &g);
+		pos += ssh2_read_mpint(pubblob+pos, publen-pos, &y);
+		pos = 0;
+		pos += ssh2_read_mpint(privblob+pos, privlen-pos, &x);
+
+		dropbear_assert(y.start && x.start); /* can't go wrong */
+
+		numbers[0] = p;
+		numbers[1] = g;
+		numbers[2] = q;
+		numbers[3] = y;
+		numbers[4] = x;
+
+		nnumbers = 5;
+		initial_zero = 1;
+		type = "dl-modp{sign{dsa-nist-sha1},dh{plain}}";
+	} else {
+		dropbear_assert(0);					 /* zoinks! */
+	}
+
+	/*
+	 * Total size of key blob will be somewhere under 512 plus
+	 * combined length of integers. We'll calculate the more
+	 * precise size as we construct the blob.
+	 */
+	outlen = 512;
+	for (i = 0; i < nnumbers; i++)
+		outlen += 4 + numbers[i].bytes;
+	outblob = snewn(outlen, unsigned char);
+
+	/*
+	 * Create the unencrypted key blob.
+	 */
+	pos = 0;
+	PUT_32BIT(outblob+pos, SSHCOM_MAGIC_NUMBER); pos += 4;
+	pos += 4;							   /* length field, fill in later */
+	pos += put_string(outblob+pos, type, strlen(type));
+	{
+		char *ciphertype = passphrase ? "3des-cbc" : "none";
+		pos += put_string(outblob+pos, ciphertype, strlen(ciphertype));
+	}
+	lenpos = pos;					   /* remember this position */
+	pos += 4;							   /* encrypted-blob size */
+	pos += 4;							   /* encrypted-payload size */
+	if (initial_zero) {
+		PUT_32BIT(outblob+pos, 0);
+		pos += 4;
+	}
+	for (i = 0; i < nnumbers; i++)
+		pos += sshcom_put_mpint(outblob+pos,
+								numbers[i].start, numbers[i].bytes);
+	/* Now wrap up the encrypted payload. */
+	PUT_32BIT(outblob+lenpos+4, pos - (lenpos+8));
+	/* Pad encrypted blob to a multiple of cipher block size. */
+	if (passphrase) {
+		int padding = -(pos - (lenpos+4)) & 7;
+		while (padding--)
+			outblob[pos++] = random_byte();
+	}
+	ciphertext = (char *)outblob+lenpos+4;
+	cipherlen = pos - (lenpos+4);
+	dropbear_assert(!passphrase || cipherlen % 8 == 0);
+	/* Wrap up the encrypted blob string. */
+	PUT_32BIT(outblob+lenpos, cipherlen);
+	/* And finally fill in the total length field. */
+	PUT_32BIT(outblob+4, pos);
+
+	dropbear_assert(pos < outlen);
+
+	/*
+	 * Encrypt the key.
+	 */
+	if (passphrase) {
+		/*
+		 * Derive encryption key from passphrase and iv/salt:
+		 * 
+		 *  - let block A equal MD5(passphrase)
+		 *  - let block B equal MD5(passphrase || A)
+		 *  - block C would be MD5(passphrase || A || B) and so on
+		 *  - encryption key is the first N bytes of A || B
+		 */
+		struct MD5Context md5c;
+		unsigned char keybuf[32], iv[8];
+
+		MD5Init(&md5c);
+		MD5Update(&md5c, (unsigned char *)passphrase, strlen(passphrase));
+		MD5Final(keybuf, &md5c);
+
+		MD5Init(&md5c);
+		MD5Update(&md5c, (unsigned char *)passphrase, strlen(passphrase));
+		MD5Update(&md5c, keybuf, 16);
+		MD5Final(keybuf+16, &md5c);
+
+		/*
+		 * Now decrypt the key blob.
+		 */
+		memset(iv, 0, sizeof(iv));
+		des3_encrypt_pubkey_ossh(keybuf, iv, (unsigned char *)ciphertext,
+								 cipherlen);
+
+		memset(&md5c, 0, sizeof(md5c));
+		memset(keybuf, 0, sizeof(keybuf));
+	}
+
+	/*
+	 * And save it. We'll use Unix line endings just in case it's
+	 * subsequently transferred in binary mode.
+	 */
+	fp = fopen(filename, "wb");	  /* ensure Unix line endings */
+	if (!fp)
+		goto error;
+	fputs("---- BEGIN SSH2 ENCRYPTED PRIVATE KEY ----\n", fp);
+	fprintf(fp, "Comment: \"");
+	/*
+	 * Comment header is broken with backslash-newline if it goes
+	 * over 70 chars. Although it's surrounded by quotes, it
+	 * _doesn't_ escape backslashes or quotes within the string.
+	 * Don't ask me, I didn't design it.
+	 */
+	{
+		int slen = 60;					   /* starts at 60 due to "Comment: " */
+		char *c = key->comment;
+		while ((int)strlen(c) > slen) {
+			fprintf(fp, "%.*s\\\n", slen, c);
+			c += slen;
+			slen = 70;					   /* allow 70 chars on subsequent lines */
+		}
+		fprintf(fp, "%s\"\n", c);
+	}
+	base64_encode_fp(fp, outblob, pos, 70);
+	fputs("---- END SSH2 ENCRYPTED PRIVATE KEY ----\n", fp);
+	fclose(fp);
+	ret = 1;
+
+	error:
+	if (outblob) {
+		memset(outblob, 0, outlen);
+		m_free(outblob);
+	}
+	if (privblob) {
+		memset(privblob, 0, privlen);
+		m_free(privblob);
+	}
+	if (pubblob) {
+		memset(pubblob, 0, publen);
+		m_free(pubblob);
+	}
+	return ret;
+}
+#endif /* ssh.com stuff disabled */
diff --git a/keyimport.h b/keyimport.h
new file mode 100644
index 0000000..19f212f
--- /dev/null
+++ b/keyimport.h
@@ -0,0 +1,42 @@
+/*
+ * Dropbear SSH
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _KEYIMPORT_H_
+#define _KEYIMPORT_H_
+
+#include "includes.h"
+#include "signkey.h"
+
+enum {
+	KEYFILE_DROPBEAR,
+	KEYFILE_OPENSSH,
+	KEYFILE_SSHCOM
+};
+
+int import_write(const char *filename, sign_key *key, char *passphrase,
+		int filetype);
+sign_key *import_read(const char *filename, char *passphrase, int filetype);
+int import_encrypted(const char* filename, int filetype);
+
+#endif /* _KEYIMPORT_H_ */
diff --git a/Doxyfile b/libtomcrypt/Doxyfile
index a8de8a9..a8de8a9 100644
--- a/Doxyfile
+++ b/libtomcrypt/Doxyfile
diff --git a/libtomcrypt/LICENSE b/libtomcrypt/LICENSE
new file mode 100644
index 0000000..5d678c5
--- /dev/null
+++ b/libtomcrypt/LICENSE
@@ -0,0 +1,5 @@
+LibTomCrypt is public domain.  As should all quality software be.
+
+Tom St Denis
+
+
diff --git a/libtomcrypt/Makefile.in b/libtomcrypt/Makefile.in
new file mode 100644
index 0000000..e1bee57
--- /dev/null
+++ b/libtomcrypt/Makefile.in
@@ -0,0 +1,282 @@
+# MAKEFILE for linux GCC
+#
+# Tom St Denis
+# Modified by Clay Culver
+
+# The version
+VERSION=1.05
+
+VPATH=@srcdir@
+srcdir=@srcdir@
+
+# Compiler and Linker Names
+#CC=gcc
+#LD=ld
+
+# Archiver [makes .a files]
+#AR=ar
+#ARFLAGS=r
+
+# Compilation flags. Note the += does not write over the user's CFLAGS!
+# The rest of the flags come from the parent Dropbear makefile
+CFLAGS += -c -I$(srcdir)/src/headers/ -I$(srcdir)/../
+
+# additional warnings (newer GCC 3.4 and higher)
+#CFLAGS += -Wsystem-headers -Wdeclaration-after-statement -Wbad-function-cast -Wcast-align -Wstrict-prototypes -Wmissing-prototypes \
+#		  -Wmissing-declarations -Wpointer-arith 
+
+# optimize for SPEED
+#CFLAGS += -O3 -funroll-loops
+
+# add -fomit-frame-pointer.  hinders debugging!
+#CFLAGS += -fomit-frame-pointer
+
+# optimize for SIZE
+#CFLAGS += -Os -DLTC_SMALL_CODE
+
+# older GCCs can't handle the "rotate with immediate" ROLc/RORc/etc macros
+# define this to help
+#CFLAGS += -DLTC_NO_ROLC
+
+# compile for DEBUGING (required for ccmalloc checking!!!)
+#CFLAGS += -g3 -DLTC_NO_ASM
+
+#Output filenames for various targets.
+LIBNAME=libtomcrypt.a
+LIBTEST=testprof/libtomcrypt_prof.a
+HASH=hashsum
+CRYPT=encrypt
+SMALL=small
+PROF=x86_prof
+TV=tv_gen
+MULTI=multi
+TIMING=timing
+TEST=test
+
+#LIBPATH-The directory for libtomcrypt to be installed to.
+#INCPATH-The directory to install the header files for libtomcrypt.
+#DATAPATH-The directory to install the pdf docs.
+DESTDIR=
+LIBPATH=/usr/lib
+INCPATH=/usr/include
+DATAPATH=/usr/share/doc/libtomcrypt/pdf
+
+#Who do we install as?
+ifdef INSTALL_USER
+USER=$(INSTALL_USER)
+else
+USER=root
+endif
+
+ifdef INSTALL_GROUP
+GROUP=$(INSTALL_GROUP)
+else
+GROUP=wheel
+endif
+
+#List of objects to compile.
+
+#Leave MPI built-in or force developer to link against libtommath?
+#MPIOBJECT=src/misc/mpi/mpi.o
+#Dropbear uses libtommath
+MPIOBJECT=
+
+OBJECTS=src/ciphers/aes/aes_enc.o $(MPIOBJECT) src/ciphers/aes/aes.o src/ciphers/anubis.o \
+src/ciphers/blowfish.o src/ciphers/cast5.o src/ciphers/des.o src/ciphers/khazad.o src/ciphers/noekeon.o \
+src/ciphers/rc2.o src/ciphers/rc5.o src/ciphers/rc6.o src/ciphers/safer/safer.o \
+src/ciphers/safer/safer_tab.o src/ciphers/safer/saferp.o src/ciphers/skipjack.o \
+src/ciphers/twofish/twofish.o src/ciphers/xtea.o src/encauth/ccm/ccm_memory.o \
+src/encauth/ccm/ccm_test.o src/encauth/eax/eax_addheader.o src/encauth/eax/eax_decrypt.o \
+src/encauth/eax/eax_decrypt_verify_memory.o src/encauth/eax/eax_done.o src/encauth/eax/eax_encrypt.o \
+src/encauth/eax/eax_encrypt_authenticate_memory.o src/encauth/eax/eax_init.o \
+src/encauth/eax/eax_test.o src/encauth/gcm/gcm_add_aad.o src/encauth/gcm/gcm_add_iv.o \
+src/encauth/gcm/gcm_done.o src/encauth/gcm/gcm_gf_mult.o src/encauth/gcm/gcm_init.o \
+src/encauth/gcm/gcm_memory.o src/encauth/gcm/gcm_process.o src/encauth/gcm/gcm_reset.o \
+src/encauth/gcm/gcm_test.o src/encauth/ocb/ocb_decrypt.o src/encauth/ocb/ocb_decrypt_verify_memory.o \
+src/encauth/ocb/ocb_done_decrypt.o src/encauth/ocb/ocb_done_encrypt.o src/encauth/ocb/ocb_encrypt.o \
+src/encauth/ocb/ocb_encrypt_authenticate_memory.o src/encauth/ocb/ocb_init.o src/encauth/ocb/ocb_ntz.o \
+src/encauth/ocb/ocb_shift_xor.o src/encauth/ocb/ocb_test.o src/encauth/ocb/s_ocb_done.o \
+src/hashes/chc/chc.o src/hashes/helper/hash_file.o src/hashes/helper/hash_filehandle.o \
+src/hashes/helper/hash_memory.o src/hashes/helper/hash_memory_multi.o src/hashes/md2.o src/hashes/md4.o \
+src/hashes/md5.o src/hashes/rmd128.o src/hashes/rmd160.o src/hashes/sha1.o src/hashes/sha2/sha256.o \
+src/hashes/sha2/sha512.o src/hashes/tiger.o src/hashes/whirl/whirl.o src/mac/hmac/hmac_done.o \
+src/mac/hmac/hmac_file.o src/mac/hmac/hmac_init.o src/mac/hmac/hmac_memory.o \
+src/mac/hmac/hmac_memory_multi.o src/mac/hmac/hmac_process.o src/mac/hmac/hmac_test.o \
+src/mac/omac/omac_done.o src/mac/omac/omac_file.o src/mac/omac/omac_init.o src/mac/omac/omac_memory.o \
+src/mac/omac/omac_memory_multi.o src/mac/omac/omac_process.o src/mac/omac/omac_test.o \
+src/mac/pelican/pelican.o src/mac/pelican/pelican_memory.o src/mac/pelican/pelican_test.o \
+src/mac/pmac/pmac_done.o src/mac/pmac/pmac_file.o src/mac/pmac/pmac_init.o src/mac/pmac/pmac_memory.o \
+src/mac/pmac/pmac_memory_multi.o src/mac/pmac/pmac_ntz.o src/mac/pmac/pmac_process.o \
+src/mac/pmac/pmac_shift_xor.o src/mac/pmac/pmac_test.o src/misc/base64/base64_decode.o \
+src/misc/base64/base64_encode.o src/misc/burn_stack.o src/misc/crypt/crypt.o \
+src/misc/crypt/crypt_argchk.o src/misc/crypt/crypt_cipher_descriptor.o \
+src/misc/crypt/crypt_cipher_is_valid.o src/misc/crypt/crypt_find_cipher.o \
+src/misc/crypt/crypt_find_cipher_any.o src/misc/crypt/crypt_find_cipher_id.o \
+src/misc/crypt/crypt_find_hash.o src/misc/crypt/crypt_find_hash_any.o \
+src/misc/crypt/crypt_find_hash_id.o src/misc/crypt/crypt_find_prng.o \
+src/misc/crypt/crypt_hash_descriptor.o src/misc/crypt/crypt_hash_is_valid.o \
+src/misc/crypt/crypt_prng_descriptor.o src/misc/crypt/crypt_prng_is_valid.o \
+src/misc/crypt/crypt_register_cipher.o src/misc/crypt/crypt_register_hash.o \
+src/misc/crypt/crypt_register_prng.o src/misc/crypt/crypt_unregister_cipher.o \
+src/misc/crypt/crypt_unregister_hash.o src/misc/crypt/crypt_unregister_prng.o \
+src/misc/error_to_string.o src/misc/mpi/is_prime.o src/misc/mpi/mpi_to_ltc_error.o \
+src/misc/mpi/rand_prime.o src/misc/pkcs5/pkcs_5_1.o src/misc/pkcs5/pkcs_5_2.o src/misc/zeromem.o \
+src/modes/cbc/cbc_decrypt.o src/modes/cbc/cbc_done.o src/modes/cbc/cbc_encrypt.o \
+src/modes/cbc/cbc_getiv.o src/modes/cbc/cbc_setiv.o src/modes/cbc/cbc_start.o \
+src/modes/cfb/cfb_decrypt.o src/modes/cfb/cfb_done.o src/modes/cfb/cfb_encrypt.o \
+src/modes/cfb/cfb_getiv.o src/modes/cfb/cfb_setiv.o src/modes/cfb/cfb_start.o \
+src/modes/ctr/ctr_decrypt.o src/modes/ctr/ctr_done.o src/modes/ctr/ctr_encrypt.o \
+src/modes/ctr/ctr_getiv.o src/modes/ctr/ctr_setiv.o src/modes/ctr/ctr_start.o \
+src/modes/ecb/ecb_decrypt.o src/modes/ecb/ecb_done.o src/modes/ecb/ecb_encrypt.o \
+src/modes/ecb/ecb_start.o src/modes/ofb/ofb_decrypt.o src/modes/ofb/ofb_done.o \
+src/modes/ofb/ofb_encrypt.o src/modes/ofb/ofb_getiv.o src/modes/ofb/ofb_setiv.o \
+src/modes/ofb/ofb_start.o 
+
+HEADERS=src/headers/tommath_superclass.h src/headers/tomcrypt_cfg.h src/headers/tomcrypt_mac.h \
+src/headers/tomcrypt_macros.h src/headers/tomcrypt_custom.h src/headers/tomcrypt_argchk.h \
+src/headers/tomcrypt_cipher.h src/headers/tomcrypt_pk.h src/headers/tommath_class.h \
+src/headers/ltc_tommath.h src/headers/tomcrypt_hash.h src/headers/tomcrypt_misc.h \
+src/headers/tomcrypt.h src/headers/tomcrypt_pkcs.h src/headers/tomcrypt_prng.h testprof/tomcrypt_test.h
+
+TESTOBJECTS=demos/test.o
+HASHOBJECTS=demos/hashsum.o
+CRYPTOBJECTS=demos/encrypt.o
+SMALLOBJECTS=demos/small.o
+TVS=demos/tv_gen.o
+MULTIS=demos/multi.o
+TIMINGS=demos/timing.o
+TESTS=demos/test.o
+
+#Files left over from making the crypt.pdf.
+LEFTOVERS=*.dvi *.log *.aux *.toc *.idx *.ilg *.ind *.out
+
+#Compressed filenames
+COMPRESSED=crypt-$(VERSION).tar.bz2 crypt-$(VERSION).zip
+
+#The default rule for make builds the libtomcrypt library.
+default:library
+
+#ciphers come in two flavours... enc+dec and enc 
+src/ciphers/aes/aes_enc.o: src/ciphers/aes/aes.c src/ciphers/aes/aes_tab.c
+	$(CC) $(CFLAGS) -DENCRYPT_ONLY -c src/ciphers/aes/aes.c -o src/ciphers/aes/aes_enc.o
+
+#These are the rules to make certain object files.
+src/ciphers/aes/aes.o: src/ciphers/aes/aes.c src/ciphers/aes/aes_tab.c
+src/ciphers/twofish/twofish.o: src/ciphers/twofish/twofish.c src/ciphers/twofish/twofish_tab.c
+src/hashes/whirl/whirl.o: src/hashes/whirl/whirl.c src/hashes/whirl/whirltab.c
+src/pk/ecc/ecc.o: src/pk/ecc/ecc.c src/pk/ecc/ecc_sys.c
+src/pk/dh/dh.o: src/pk/dh/dh.c src/pk/dh/dh_sys.c
+src/hashes/sha2/sha512.o: src/hashes/sha2/sha512.c src/hashes/sha2/sha384.c
+src/hashes/sha2/sha256.o: src/hashes/sha2/sha256.c src/hashes/sha2/sha224.c
+
+#This rule makes the libtomcrypt library.
+library: $(LIBNAME)
+
+$(LIBTEST): 
+	cd testprof ; CFLAGS="$(CFLAGS)" make 
+
+$(LIBNAME): $(OBJECTS)
+	$(AR) $(ARFLAGS) $@ $(OBJECTS) 
+	$(RANLIB) $(LIBNAME)
+
+#This rule makes the hash program included with libtomcrypt
+hashsum: library $(HASHOBJECTS)
+	$(CC) $(HASHOBJECTS) $(LIBNAME) -o $(HASH) $(WARN)
+
+#makes the crypt program
+crypt: library $(CRYPTOBJECTS)
+	$(CC) $(CRYPTOBJECTS) $(LIBNAME) -o $(CRYPT) $(WARN)
+
+#makes the small program
+small: library $(SMALLOBJECTS)
+	$(CC) $(SMALLOBJECTS) $(LIBNAME) -o $(SMALL) $(WARN)
+	
+tv_gen: library $(TVS)
+	$(CC) $(TVS) $(LIBNAME) -o $(TV)
+
+multi: library $(MULTIS)
+	$(CC) $(MULTIS) $(LIBNAME) -o $(MULTI)
+
+timing: library $(LIBTEST) $(TIMINGS)
+	$(CC) $(TIMINGS) $(LIBTEST) $(LIBNAME) $(EXTRALIBS) -o $(TIMING)
+
+test: library $(LIBTEST) $(TESTS)
+	$(CC) $(TESTS) $(LIBTEST) $(LIBNAME) -o $(TEST)
+
+
+#This rule installs the library and the header files. This must be run
+#as root in order to have a high enough permission to write to the correct
+#directories and to set the owner and group to root.
+install: library docs
+	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(LIBPATH)
+	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
+	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(DATAPATH)
+	install -g $(GROUP) -o $(USER) $(LIBNAME) $(DESTDIR)$(LIBPATH)
+	install -g $(GROUP) -o $(USER) $(HEADERS) $(DESTDIR)$(INCPATH)
+	install -g $(GROUP) -o $(USER) doc/crypt.pdf $(DESTDIR)$(DATAPATH)
+
+install_test: $(LIBTEST)
+	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(LIBPATH)
+	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
+	install -g $(GROUP) -o $(USER) $(LIBTEST) $(DESTDIR)$(LIBPATH)
+
+profile:
+	CFLAGS="$(CFLAGS) -fprofile-generate" make timing EXTRALIBS=-lgcov
+	./timing
+	rm -f timing `find . -type f | grep [.][ao] | xargs`
+	CFLAGS="$(CFLAGS) -fprofile-use" make timing EXTRALIBS=-lgcov
+
+
+#This rule cleans the source tree of all compiled code, not including the pdf
+#documentation.
+clean:
+	-rm -f $(OBJECTS)
+	-rm -f libtomcrypt.a
+
+#build the doxy files (requires Doxygen, tetex and patience)
+doxy:
+	doxygen
+	cd doc/doxygen/latex ; make ; mv -f refman.pdf ../../.
+	echo The huge doxygen PDF should be available as doc/refman.pdf
+	
+#This builds the crypt.pdf file. Note that the rm -f *.pdf has been removed
+#from the clean command! This is because most people would like to keep the
+#nice pre-compiled crypt.pdf that comes with libtomcrypt! We only need to
+#delete it if we are rebuilding it.
+docs: crypt.tex
+	rm -f doc/crypt.pdf $(LEFTOVERS)
+	echo "hello" > crypt.ind
+	latex crypt > /dev/null
+	latex crypt > /dev/null
+	makeindex crypt.idx > /dev/null
+	latex crypt > /dev/null
+	dvipdf crypt
+	mv -ivf crypt.pdf doc/crypt.pdf
+	rm -f $(LEFTOVERS)
+
+docdvi: crypt.tex
+	echo hello > crypt.ind
+	latex crypt > /dev/null
+	latex crypt > /dev/null
+	makeindex crypt.idx
+	latex crypt > /dev/null
+
+#zipup the project (take that!)
+no_oops: clean
+	cd .. ; cvs commit 
+	echo Scanning for scratch/dirty files
+	find . -type f | grep -v CVS | xargs -n 1 bash mess.sh
+
+zipup: no_oops docs
+	cd .. ; rm -rf crypt* libtomcrypt-$(VERSION) ; mkdir libtomcrypt-$(VERSION) ; \
+	cp -R ./libtomcrypt/* ./libtomcrypt-$(VERSION)/ ; \
+	cd libtomcrypt-$(VERSION) ; rm -rf `find . -type d | grep CVS | xargs` ; cd .. ; \
+	tar -cjvf crypt-$(VERSION).tar.bz2 libtomcrypt-$(VERSION) ; \
+	zip -9r crypt-$(VERSION).zip libtomcrypt-$(VERSION) ; \
+	gpg -b -a crypt-$(VERSION).tar.bz2 ; gpg -b -a crypt-$(VERSION).zip ; \
+	mv -fv crypt* ~ ; rm -rf libtomcrypt-$(VERSION)
+
+
+# $Source: /cvs/libtom/libtomcrypt/makefile,v $ 
+# $Revision: 1.70 $ 
+# $Date: 2005/06/19 18:03:24 $ 
diff --git a/libtomcrypt/TODO b/libtomcrypt/TODO
new file mode 100644
index 0000000..f4f0665
--- /dev/null
+++ b/libtomcrypt/TODO
@@ -0,0 +1,10 @@
+For 1.06
+
+1. export ECC functions globally [e.g. mulmod and the sets]
+   - goal is tv_gen module and test vectors
+2. ASN.1 SET and T61String
+3. phase out DH code [RSA/ECC/DSA is enough]
+4. Some ASN.1 demo programs [for now read the source code!]
+5. Start working towards making the bignum code plugable
+6. Look into other ECC point muls and consider a "precomp" interface 
+7. Add OID for ciphers and PRNGs to their descriptors
diff --git a/build.sh b/libtomcrypt/build.sh
index a018aac..a018aac 100644
--- a/build.sh
+++ b/libtomcrypt/build.sh
diff --git a/changes b/libtomcrypt/changes
index 65ccd16..65ccd16 100644
--- a/changes
+++ b/libtomcrypt/changes
diff --git a/crypt.tex b/libtomcrypt/crypt.tex
index cf1c37e..cf1c37e 100644
--- a/crypt.tex
+++ b/libtomcrypt/crypt.tex
diff --git a/demos/encrypt.c b/libtomcrypt/demos/encrypt.c
index d8eb293..d8eb293 100644
--- a/demos/encrypt.c
+++ b/libtomcrypt/demos/encrypt.c
diff --git a/demos/hashsum.c b/libtomcrypt/demos/hashsum.c
index 653b6ef..653b6ef 100644
--- a/demos/hashsum.c
+++ b/libtomcrypt/demos/hashsum.c
diff --git a/demos/multi.c b/libtomcrypt/demos/multi.c
index af4d6b6..af4d6b6 100644
--- a/demos/multi.c
+++ b/libtomcrypt/demos/multi.c
diff --git a/demos/small.c b/libtomcrypt/demos/small.c
index 6bdd842..6bdd842 100644
--- a/demos/small.c
+++ b/libtomcrypt/demos/small.c
diff --git a/demos/test.c b/libtomcrypt/demos/test.c
index f6c7170..f6c7170 100644
--- a/demos/test.c
+++ b/libtomcrypt/demos/test.c
diff --git a/demos/timing.c b/libtomcrypt/demos/timing.c
index 368d6e4..368d6e4 100644
--- a/demos/timing.c
+++ b/libtomcrypt/demos/timing.c
diff --git a/demos/tv_gen.c b/libtomcrypt/demos/tv_gen.c
index edaae3e..edaae3e 100644
--- a/demos/tv_gen.c
+++ b/libtomcrypt/demos/tv_gen.c
diff --git a/doc/footer.html b/libtomcrypt/doc/footer.html
index a1895ac..a1895ac 100644
--- a/doc/footer.html
+++ b/libtomcrypt/doc/footer.html
diff --git a/doc/header.html b/libtomcrypt/doc/header.html
index 231475d..231475d 100644
--- a/doc/header.html
+++ b/libtomcrypt/doc/header.html
diff --git a/genlist.sh b/libtomcrypt/genlist.sh
index 832e8e7..832e8e7 100644
--- a/genlist.sh
+++ b/libtomcrypt/genlist.sh
diff --git a/makefile.icc b/libtomcrypt/makefile.icc
index a9ab38f..a9ab38f 100644
--- a/makefile.icc
+++ b/libtomcrypt/makefile.icc
diff --git a/makefile.msvc b/libtomcrypt/makefile.msvc
index ae01079..ae01079 100644
--- a/makefile.msvc
+++ b/libtomcrypt/makefile.msvc
diff --git a/makefile.shared b/libtomcrypt/makefile.shared
index 62cc87b..62cc87b 100644
--- a/makefile.shared
+++ b/libtomcrypt/makefile.shared
diff --git a/mess.sh b/libtomcrypt/mess.sh
index bf639ce..bf639ce 100644
--- a/mess.sh
+++ b/libtomcrypt/mess.sh
diff --git a/notes/base64_tv.txt b/libtomcrypt/notes/base64_tv.txt
index 01c8a4e..01c8a4e 100644
--- a/notes/base64_tv.txt
+++ b/libtomcrypt/notes/base64_tv.txt
diff --git a/notes/ccm_tv.txt b/libtomcrypt/notes/ccm_tv.txt
index 3ff4b77..3ff4b77 100644
--- a/notes/ccm_tv.txt
+++ b/libtomcrypt/notes/ccm_tv.txt
diff --git a/notes/cipher_tv.txt b/libtomcrypt/notes/cipher_tv.txt
index c649d26..c649d26 100644
--- a/notes/cipher_tv.txt
+++ b/libtomcrypt/notes/cipher_tv.txt
diff --git a/notes/eax_tv.txt b/libtomcrypt/notes/eax_tv.txt
index 95cd7c1..95cd7c1 100644
--- a/notes/eax_tv.txt
+++ b/libtomcrypt/notes/eax_tv.txt
diff --git a/notes/etc/saferp_optimizer.c b/libtomcrypt/notes/etc/saferp_optimizer.c
index 32de878..32de878 100644
--- a/notes/etc/saferp_optimizer.c
+++ b/libtomcrypt/notes/etc/saferp_optimizer.c
diff --git a/notes/etc/whirlgen.c b/libtomcrypt/notes/etc/whirlgen.c
index c06687e..c06687e 100644
--- a/notes/etc/whirlgen.c
+++ b/libtomcrypt/notes/etc/whirlgen.c
diff --git a/notes/etc/whirltest.c b/libtomcrypt/notes/etc/whirltest.c
index 226e012..226e012 100644
--- a/notes/etc/whirltest.c
+++ b/libtomcrypt/notes/etc/whirltest.c
diff --git a/notes/gcm_tv.txt b/libtomcrypt/notes/gcm_tv.txt
index 79d3b8d..79d3b8d 100644
--- a/notes/gcm_tv.txt
+++ b/libtomcrypt/notes/gcm_tv.txt
diff --git a/notes/hash_tv.txt b/libtomcrypt/notes/hash_tv.txt
index 4f2714d..4f2714d 100644
--- a/notes/hash_tv.txt
+++ b/libtomcrypt/notes/hash_tv.txt
diff --git a/notes/hmac_tv.txt b/libtomcrypt/notes/hmac_tv.txt
index 18edd70..18edd70 100644
--- a/notes/hmac_tv.txt
+++ b/libtomcrypt/notes/hmac_tv.txt
diff --git a/notes/ocb_tv.txt b/libtomcrypt/notes/ocb_tv.txt
index 6429228..6429228 100644
--- a/notes/ocb_tv.txt
+++ b/libtomcrypt/notes/ocb_tv.txt
diff --git a/notes/omac_tv.txt b/libtomcrypt/notes/omac_tv.txt
index 56d8da6..56d8da6 100644
--- a/notes/omac_tv.txt
+++ b/libtomcrypt/notes/omac_tv.txt
diff --git a/notes/pmac_tv.txt b/libtomcrypt/notes/pmac_tv.txt
index e0a1900..e0a1900 100644
--- a/notes/pmac_tv.txt
+++ b/libtomcrypt/notes/pmac_tv.txt
diff --git a/notes/tech0001.txt b/libtomcrypt/notes/tech0001.txt
index daf7e57..daf7e57 100644
--- a/notes/tech0001.txt
+++ b/libtomcrypt/notes/tech0001.txt
diff --git a/notes/tech0002.txt b/libtomcrypt/notes/tech0002.txt
index b9990e0..b9990e0 100644
--- a/notes/tech0002.txt
+++ b/libtomcrypt/notes/tech0002.txt
diff --git a/notes/tech0003.txt b/libtomcrypt/notes/tech0003.txt
index 1a21867..1a21867 100644
--- a/notes/tech0003.txt
+++ b/libtomcrypt/notes/tech0003.txt
diff --git a/notes/tech0004.txt b/libtomcrypt/notes/tech0004.txt
index 2acd378..2acd378 100644
--- a/notes/tech0004.txt
+++ b/libtomcrypt/notes/tech0004.txt
diff --git a/notes/tech0005.txt b/libtomcrypt/notes/tech0005.txt
index 8f393d5..8f393d5 100644
--- a/notes/tech0005.txt
+++ b/libtomcrypt/notes/tech0005.txt
diff --git a/notes/tech0006.txt b/libtomcrypt/notes/tech0006.txt
index ecbe8b0..ecbe8b0 100644
--- a/notes/tech0006.txt
+++ b/libtomcrypt/notes/tech0006.txt
diff --git a/parsenames.pl b/libtomcrypt/parsenames.pl
index 761f036..761f036 100644
--- a/parsenames.pl
+++ b/libtomcrypt/parsenames.pl
diff --git a/run.sh b/libtomcrypt/run.sh
index b652110..b652110 100644
--- a/run.sh
+++ b/libtomcrypt/run.sh
diff --git a/src/ciphers/aes/aes.c b/libtomcrypt/src/ciphers/aes/aes.c
index 0e4933f..0e4933f 100644
--- a/src/ciphers/aes/aes.c
+++ b/libtomcrypt/src/ciphers/aes/aes.c
diff --git a/src/ciphers/aes/aes_tab.c b/libtomcrypt/src/ciphers/aes/aes_tab.c
index 31d276e..31d276e 100644
--- a/src/ciphers/aes/aes_tab.c
+++ b/libtomcrypt/src/ciphers/aes/aes_tab.c
diff --git a/src/ciphers/anubis.c b/libtomcrypt/src/ciphers/anubis.c
index 511eac2..511eac2 100644
--- a/src/ciphers/anubis.c
+++ b/libtomcrypt/src/ciphers/anubis.c
diff --git a/src/ciphers/blowfish.c b/libtomcrypt/src/ciphers/blowfish.c
index 3502983..3502983 100644
--- a/src/ciphers/blowfish.c
+++ b/libtomcrypt/src/ciphers/blowfish.c
diff --git a/src/ciphers/cast5.c b/libtomcrypt/src/ciphers/cast5.c
index f4b045c..f4b045c 100644
--- a/src/ciphers/cast5.c
+++ b/libtomcrypt/src/ciphers/cast5.c
diff --git a/src/ciphers/des.c b/libtomcrypt/src/ciphers/des.c
index 84258d0..84258d0 100644
--- a/src/ciphers/des.c
+++ b/libtomcrypt/src/ciphers/des.c
diff --git a/src/ciphers/khazad.c b/libtomcrypt/src/ciphers/khazad.c
index d07134c..d07134c 100644
--- a/src/ciphers/khazad.c
+++ b/libtomcrypt/src/ciphers/khazad.c
diff --git a/src/ciphers/noekeon.c b/libtomcrypt/src/ciphers/noekeon.c
index e502dfe..e502dfe 100644
--- a/src/ciphers/noekeon.c
+++ b/libtomcrypt/src/ciphers/noekeon.c
diff --git a/src/ciphers/rc2.c b/libtomcrypt/src/ciphers/rc2.c
index e91f83a..e91f83a 100644
--- a/src/ciphers/rc2.c
+++ b/libtomcrypt/src/ciphers/rc2.c
diff --git a/src/ciphers/rc5.c b/libtomcrypt/src/ciphers/rc5.c
index 9465ac8..9465ac8 100644
--- a/src/ciphers/rc5.c
+++ b/libtomcrypt/src/ciphers/rc5.c
diff --git a/src/ciphers/rc6.c b/libtomcrypt/src/ciphers/rc6.c
index ae6f114..ae6f114 100644
--- a/src/ciphers/rc6.c
+++ b/libtomcrypt/src/ciphers/rc6.c
diff --git a/src/ciphers/safer/safer.c b/libtomcrypt/src/ciphers/safer/safer.c
index bad00a0..bad00a0 100644
--- a/src/ciphers/safer/safer.c
+++ b/libtomcrypt/src/ciphers/safer/safer.c
diff --git a/src/ciphers/safer/safer_tab.c b/libtomcrypt/src/ciphers/safer/safer_tab.c
index 47fbd1c..47fbd1c 100644
--- a/src/ciphers/safer/safer_tab.c
+++ b/libtomcrypt/src/ciphers/safer/safer_tab.c
diff --git a/src/ciphers/safer/saferp.c b/libtomcrypt/src/ciphers/safer/saferp.c
index 6d0d589..6d0d589 100644
--- a/src/ciphers/safer/saferp.c
+++ b/libtomcrypt/src/ciphers/safer/saferp.c
diff --git a/src/ciphers/skipjack.c b/libtomcrypt/src/ciphers/skipjack.c
index 4cb3bf2..4cb3bf2 100644
--- a/src/ciphers/skipjack.c
+++ b/libtomcrypt/src/ciphers/skipjack.c
diff --git a/src/ciphers/twofish/twofish.c b/libtomcrypt/src/ciphers/twofish/twofish.c
index f147bab..f147bab 100644
--- a/src/ciphers/twofish/twofish.c
+++ b/libtomcrypt/src/ciphers/twofish/twofish.c
diff --git a/src/ciphers/twofish/twofish_tab.c b/libtomcrypt/src/ciphers/twofish/twofish_tab.c
index 47d4717..47d4717 100644
--- a/src/ciphers/twofish/twofish_tab.c
+++ b/libtomcrypt/src/ciphers/twofish/twofish_tab.c
diff --git a/src/ciphers/xtea.c b/libtomcrypt/src/ciphers/xtea.c
index a45a13a..a45a13a 100644
--- a/src/ciphers/xtea.c
+++ b/libtomcrypt/src/ciphers/xtea.c
diff --git a/src/encauth/ccm/ccm_memory.c b/libtomcrypt/src/encauth/ccm/ccm_memory.c
index 48dc4b7..48dc4b7 100644
--- a/src/encauth/ccm/ccm_memory.c
+++ b/libtomcrypt/src/encauth/ccm/ccm_memory.c
diff --git a/src/encauth/ccm/ccm_test.c b/libtomcrypt/src/encauth/ccm/ccm_test.c
index c33502a..c33502a 100644
--- a/src/encauth/ccm/ccm_test.c
+++ b/libtomcrypt/src/encauth/ccm/ccm_test.c
diff --git a/src/encauth/eax/eax_addheader.c b/libtomcrypt/src/encauth/eax/eax_addheader.c
index cfb325d..cfb325d 100644
--- a/src/encauth/eax/eax_addheader.c
+++ b/libtomcrypt/src/encauth/eax/eax_addheader.c
diff --git a/src/encauth/eax/eax_decrypt.c b/libtomcrypt/src/encauth/eax/eax_decrypt.c
index 13e23cc..13e23cc 100644
--- a/src/encauth/eax/eax_decrypt.c
+++ b/libtomcrypt/src/encauth/eax/eax_decrypt.c
diff --git a/src/encauth/eax/eax_decrypt_verify_memory.c b/libtomcrypt/src/encauth/eax/eax_decrypt_verify_memory.c
index d6d72ab..d6d72ab 100644
--- a/src/encauth/eax/eax_decrypt_verify_memory.c
+++ b/libtomcrypt/src/encauth/eax/eax_decrypt_verify_memory.c
diff --git a/src/encauth/eax/eax_done.c b/libtomcrypt/src/encauth/eax/eax_done.c
index 10e2c23..10e2c23 100644
--- a/src/encauth/eax/eax_done.c
+++ b/libtomcrypt/src/encauth/eax/eax_done.c
diff --git a/src/encauth/eax/eax_encrypt.c b/libtomcrypt/src/encauth/eax/eax_encrypt.c
index a646588..a646588 100644
--- a/src/encauth/eax/eax_encrypt.c
+++ b/libtomcrypt/src/encauth/eax/eax_encrypt.c
diff --git a/src/encauth/eax/eax_encrypt_authenticate_memory.c b/libtomcrypt/src/encauth/eax/eax_encrypt_authenticate_memory.c
index 9760616..9760616 100644
--- a/src/encauth/eax/eax_encrypt_authenticate_memory.c
+++ b/libtomcrypt/src/encauth/eax/eax_encrypt_authenticate_memory.c
diff --git a/src/encauth/eax/eax_init.c b/libtomcrypt/src/encauth/eax/eax_init.c
index 942b25f..942b25f 100644
--- a/src/encauth/eax/eax_init.c
+++ b/libtomcrypt/src/encauth/eax/eax_init.c
diff --git a/src/encauth/eax/eax_test.c b/libtomcrypt/src/encauth/eax/eax_test.c
index b08f5ab..b08f5ab 100644
--- a/src/encauth/eax/eax_test.c
+++ b/libtomcrypt/src/encauth/eax/eax_test.c
diff --git a/src/encauth/gcm/gcm_add_aad.c b/libtomcrypt/src/encauth/gcm/gcm_add_aad.c
index 03cb279..03cb279 100644
--- a/src/encauth/gcm/gcm_add_aad.c
+++ b/libtomcrypt/src/encauth/gcm/gcm_add_aad.c
diff --git a/src/encauth/gcm/gcm_add_iv.c b/libtomcrypt/src/encauth/gcm/gcm_add_iv.c
index 107cf35..107cf35 100644
--- a/src/encauth/gcm/gcm_add_iv.c
+++ b/libtomcrypt/src/encauth/gcm/gcm_add_iv.c
diff --git a/src/encauth/gcm/gcm_done.c b/libtomcrypt/src/encauth/gcm/gcm_done.c
index 96ef7f3..96ef7f3 100644
--- a/src/encauth/gcm/gcm_done.c
+++ b/libtomcrypt/src/encauth/gcm/gcm_done.c
diff --git a/src/encauth/gcm/gcm_gf_mult.c b/libtomcrypt/src/encauth/gcm/gcm_gf_mult.c
index 06db22b..06db22b 100644
--- a/src/encauth/gcm/gcm_gf_mult.c
+++ b/libtomcrypt/src/encauth/gcm/gcm_gf_mult.c
diff --git a/src/encauth/gcm/gcm_init.c b/libtomcrypt/src/encauth/gcm/gcm_init.c
index 249b61d..249b61d 100644
--- a/src/encauth/gcm/gcm_init.c
+++ b/libtomcrypt/src/encauth/gcm/gcm_init.c
diff --git a/src/encauth/gcm/gcm_memory.c b/libtomcrypt/src/encauth/gcm/gcm_memory.c
index ee10d3f..ee10d3f 100644
--- a/src/encauth/gcm/gcm_memory.c
+++ b/libtomcrypt/src/encauth/gcm/gcm_memory.c
diff --git a/src/encauth/gcm/gcm_process.c b/libtomcrypt/src/encauth/gcm/gcm_process.c
index f512e28..f512e28 100644
--- a/src/encauth/gcm/gcm_process.c
+++ b/libtomcrypt/src/encauth/gcm/gcm_process.c
diff --git a/src/encauth/gcm/gcm_reset.c b/libtomcrypt/src/encauth/gcm/gcm_reset.c
index b336a92..b336a92 100644
--- a/src/encauth/gcm/gcm_reset.c
+++ b/libtomcrypt/src/encauth/gcm/gcm_reset.c
diff --git a/src/encauth/gcm/gcm_test.c b/libtomcrypt/src/encauth/gcm/gcm_test.c
index 7106188..7106188 100644
--- a/src/encauth/gcm/gcm_test.c
+++ b/libtomcrypt/src/encauth/gcm/gcm_test.c
diff --git a/src/encauth/ocb/ocb_decrypt.c b/libtomcrypt/src/encauth/ocb/ocb_decrypt.c
index a0f2754..a0f2754 100644
--- a/src/encauth/ocb/ocb_decrypt.c
+++ b/libtomcrypt/src/encauth/ocb/ocb_decrypt.c
diff --git a/src/encauth/ocb/ocb_decrypt_verify_memory.c b/libtomcrypt/src/encauth/ocb/ocb_decrypt_verify_memory.c
index 0173eff..0173eff 100644
--- a/src/encauth/ocb/ocb_decrypt_verify_memory.c
+++ b/libtomcrypt/src/encauth/ocb/ocb_decrypt_verify_memory.c
diff --git a/src/encauth/ocb/ocb_done_decrypt.c b/libtomcrypt/src/encauth/ocb/ocb_done_decrypt.c
index fb149bd..fb149bd 100644
--- a/src/encauth/ocb/ocb_done_decrypt.c
+++ b/libtomcrypt/src/encauth/ocb/ocb_done_decrypt.c
diff --git a/src/encauth/ocb/ocb_done_encrypt.c b/libtomcrypt/src/encauth/ocb/ocb_done_encrypt.c
index 27126e7..27126e7 100644
--- a/src/encauth/ocb/ocb_done_encrypt.c
+++ b/libtomcrypt/src/encauth/ocb/ocb_done_encrypt.c
diff --git a/src/encauth/ocb/ocb_encrypt.c b/libtomcrypt/src/encauth/ocb/ocb_encrypt.c
index 2a48551..2a48551 100644
--- a/src/encauth/ocb/ocb_encrypt.c
+++ b/libtomcrypt/src/encauth/ocb/ocb_encrypt.c
diff --git a/src/encauth/ocb/ocb_encrypt_authenticate_memory.c b/libtomcrypt/src/encauth/ocb/ocb_encrypt_authenticate_memory.c
index beb72c8..beb72c8 100644
--- a/src/encauth/ocb/ocb_encrypt_authenticate_memory.c
+++ b/libtomcrypt/src/encauth/ocb/ocb_encrypt_authenticate_memory.c
diff --git a/src/encauth/ocb/ocb_init.c b/libtomcrypt/src/encauth/ocb/ocb_init.c
index 9266d5c..9266d5c 100644
--- a/src/encauth/ocb/ocb_init.c
+++ b/libtomcrypt/src/encauth/ocb/ocb_init.c
diff --git a/src/encauth/ocb/ocb_ntz.c b/libtomcrypt/src/encauth/ocb/ocb_ntz.c
index 9b83c53..9b83c53 100644
--- a/src/encauth/ocb/ocb_ntz.c
+++ b/libtomcrypt/src/encauth/ocb/ocb_ntz.c
diff --git a/src/encauth/ocb/ocb_shift_xor.c b/libtomcrypt/src/encauth/ocb/ocb_shift_xor.c
index b03de4b..b03de4b 100644
--- a/src/encauth/ocb/ocb_shift_xor.c
+++ b/libtomcrypt/src/encauth/ocb/ocb_shift_xor.c
diff --git a/src/encauth/ocb/ocb_test.c b/libtomcrypt/src/encauth/ocb/ocb_test.c
index 7a7fc77..7a7fc77 100644
--- a/src/encauth/ocb/ocb_test.c
+++ b/libtomcrypt/src/encauth/ocb/ocb_test.c
diff --git a/src/encauth/ocb/s_ocb_done.c b/libtomcrypt/src/encauth/ocb/s_ocb_done.c
index 7399b54..7399b54 100644
--- a/src/encauth/ocb/s_ocb_done.c
+++ b/libtomcrypt/src/encauth/ocb/s_ocb_done.c
diff --git a/src/hashes/chc/chc.c b/libtomcrypt/src/hashes/chc/chc.c
index 4ff39f0..4ff39f0 100644
--- a/src/hashes/chc/chc.c
+++ b/libtomcrypt/src/hashes/chc/chc.c
diff --git a/src/hashes/helper/hash_file.c b/libtomcrypt/src/hashes/helper/hash_file.c
index 50f726a..50f726a 100644
--- a/src/hashes/helper/hash_file.c
+++ b/libtomcrypt/src/hashes/helper/hash_file.c
diff --git a/src/hashes/helper/hash_filehandle.c b/libtomcrypt/src/hashes/helper/hash_filehandle.c
index 201ef83..201ef83 100644
--- a/src/hashes/helper/hash_filehandle.c
+++ b/libtomcrypt/src/hashes/helper/hash_filehandle.c
diff --git a/src/hashes/helper/hash_memory.c b/libtomcrypt/src/hashes/helper/hash_memory.c
index 7e849b4..7e849b4 100644
--- a/src/hashes/helper/hash_memory.c
+++ b/libtomcrypt/src/hashes/helper/hash_memory.c
diff --git a/src/hashes/helper/hash_memory_multi.c b/libtomcrypt/src/hashes/helper/hash_memory_multi.c
index 8fcb8e9..8fcb8e9 100644
--- a/src/hashes/helper/hash_memory_multi.c
+++ b/libtomcrypt/src/hashes/helper/hash_memory_multi.c
diff --git a/src/hashes/md2.c b/libtomcrypt/src/hashes/md2.c
index db60fbd..db60fbd 100644
--- a/src/hashes/md2.c
+++ b/libtomcrypt/src/hashes/md2.c
diff --git a/src/hashes/md4.c b/libtomcrypt/src/hashes/md4.c
index 17ef35e..17ef35e 100644
--- a/src/hashes/md4.c
+++ b/libtomcrypt/src/hashes/md4.c
diff --git a/src/hashes/md5.c b/libtomcrypt/src/hashes/md5.c
index fb8ff31..fb8ff31 100644
--- a/src/hashes/md5.c
+++ b/libtomcrypt/src/hashes/md5.c
diff --git a/src/hashes/rmd128.c b/libtomcrypt/src/hashes/rmd128.c
index 1c663e4..1c663e4 100644
--- a/src/hashes/rmd128.c
+++ b/libtomcrypt/src/hashes/rmd128.c
diff --git a/src/hashes/rmd160.c b/libtomcrypt/src/hashes/rmd160.c
index 6124f59..6124f59 100644
--- a/src/hashes/rmd160.c
+++ b/libtomcrypt/src/hashes/rmd160.c
diff --git a/src/hashes/sha1.c b/libtomcrypt/src/hashes/sha1.c
index a75d04d..a75d04d 100644
--- a/src/hashes/sha1.c
+++ b/libtomcrypt/src/hashes/sha1.c
diff --git a/src/hashes/sha2/sha224.c b/libtomcrypt/src/hashes/sha2/sha224.c
index bff2fdf..bff2fdf 100644
--- a/src/hashes/sha2/sha224.c
+++ b/libtomcrypt/src/hashes/sha2/sha224.c
diff --git a/src/hashes/sha2/sha256.c b/libtomcrypt/src/hashes/sha2/sha256.c
index c48c0f6..c48c0f6 100644
--- a/src/hashes/sha2/sha256.c
+++ b/libtomcrypt/src/hashes/sha2/sha256.c
diff --git a/src/hashes/sha2/sha384.c b/libtomcrypt/src/hashes/sha2/sha384.c
index 43f8fb6..43f8fb6 100644
--- a/src/hashes/sha2/sha384.c
+++ b/libtomcrypt/src/hashes/sha2/sha384.c
diff --git a/src/hashes/sha2/sha512.c b/libtomcrypt/src/hashes/sha2/sha512.c
index 7b6805b..7b6805b 100644
--- a/src/hashes/sha2/sha512.c
+++ b/libtomcrypt/src/hashes/sha2/sha512.c
diff --git a/src/hashes/tiger.c b/libtomcrypt/src/hashes/tiger.c
index 250c186..250c186 100644
--- a/src/hashes/tiger.c
+++ b/libtomcrypt/src/hashes/tiger.c
diff --git a/src/hashes/whirl/whirl.c b/libtomcrypt/src/hashes/whirl/whirl.c
index 1bd7983..1bd7983 100644
--- a/src/hashes/whirl/whirl.c
+++ b/libtomcrypt/src/hashes/whirl/whirl.c
diff --git a/src/hashes/whirl/whirltab.c b/libtomcrypt/src/hashes/whirl/whirltab.c
index c83d0b2..c83d0b2 100644
--- a/src/hashes/whirl/whirltab.c
+++ b/libtomcrypt/src/hashes/whirl/whirltab.c
diff --git a/src/headers/ltc_tommath.h b/libtomcrypt/src/headers/ltc_tommath.h
index 2d62b4e..2d62b4e 100644
--- a/src/headers/ltc_tommath.h
+++ b/libtomcrypt/src/headers/ltc_tommath.h
diff --git a/src/headers/tomcrypt.h b/libtomcrypt/src/headers/tomcrypt.h
index 4804194..4804194 100644
--- a/src/headers/tomcrypt.h
+++ b/libtomcrypt/src/headers/tomcrypt.h
diff --git a/src/headers/tomcrypt_argchk.h b/libtomcrypt/src/headers/tomcrypt_argchk.h
index ef344ee..ef344ee 100644
--- a/src/headers/tomcrypt_argchk.h
+++ b/libtomcrypt/src/headers/tomcrypt_argchk.h
diff --git a/src/headers/tomcrypt_cfg.h b/libtomcrypt/src/headers/tomcrypt_cfg.h
index 1d5bc6c..1d5bc6c 100644
--- a/src/headers/tomcrypt_cfg.h
+++ b/libtomcrypt/src/headers/tomcrypt_cfg.h
diff --git a/src/headers/tomcrypt_cipher.h b/libtomcrypt/src/headers/tomcrypt_cipher.h
index 4f00302..4f00302 100644
--- a/src/headers/tomcrypt_cipher.h
+++ b/libtomcrypt/src/headers/tomcrypt_cipher.h
diff --git a/src/headers/tomcrypt_custom.h b/libtomcrypt/src/headers/tomcrypt_custom.h
index ad61f52..ad61f52 100644
--- a/src/headers/tomcrypt_custom.h
+++ b/libtomcrypt/src/headers/tomcrypt_custom.h
diff --git a/src/headers/tomcrypt_hash.h b/libtomcrypt/src/headers/tomcrypt_hash.h
index a0534f9..a0534f9 100644
--- a/src/headers/tomcrypt_hash.h
+++ b/libtomcrypt/src/headers/tomcrypt_hash.h
diff --git a/src/headers/tomcrypt_mac.h b/libtomcrypt/src/headers/tomcrypt_mac.h
index 411b1bc..411b1bc 100644
--- a/src/headers/tomcrypt_mac.h
+++ b/libtomcrypt/src/headers/tomcrypt_mac.h
diff --git a/src/headers/tomcrypt_macros.h b/libtomcrypt/src/headers/tomcrypt_macros.h
index 488931f..488931f 100644
--- a/src/headers/tomcrypt_macros.h
+++ b/libtomcrypt/src/headers/tomcrypt_macros.h
diff --git a/src/headers/tomcrypt_misc.h b/libtomcrypt/src/headers/tomcrypt_misc.h
index 3b44795..3b44795 100644
--- a/src/headers/tomcrypt_misc.h
+++ b/libtomcrypt/src/headers/tomcrypt_misc.h
diff --git a/src/headers/tomcrypt_pk.h b/libtomcrypt/src/headers/tomcrypt_pk.h
index 4f047de..4f047de 100644
--- a/src/headers/tomcrypt_pk.h
+++ b/libtomcrypt/src/headers/tomcrypt_pk.h
diff --git a/src/headers/tomcrypt_pkcs.h b/libtomcrypt/src/headers/tomcrypt_pkcs.h
index 8d850de..8d850de 100644
--- a/src/headers/tomcrypt_pkcs.h
+++ b/libtomcrypt/src/headers/tomcrypt_pkcs.h
diff --git a/src/headers/tomcrypt_prng.h b/libtomcrypt/src/headers/tomcrypt_prng.h
index f96d408..f96d408 100644
--- a/src/headers/tomcrypt_prng.h
+++ b/libtomcrypt/src/headers/tomcrypt_prng.h
diff --git a/src/headers/tommath_class.h b/libtomcrypt/src/headers/tommath_class.h
index 02dd7cf..02dd7cf 100644
--- a/src/headers/tommath_class.h
+++ b/libtomcrypt/src/headers/tommath_class.h
diff --git a/src/headers/tommath_superclass.h b/libtomcrypt/src/headers/tommath_superclass.h
index 89f3f57..89f3f57 100644
--- a/src/headers/tommath_superclass.h
+++ b/libtomcrypt/src/headers/tommath_superclass.h
diff --git a/src/mac/hmac/hmac_done.c b/libtomcrypt/src/mac/hmac/hmac_done.c
index f64d044..f64d044 100644
--- a/src/mac/hmac/hmac_done.c
+++ b/libtomcrypt/src/mac/hmac/hmac_done.c
diff --git a/src/mac/hmac/hmac_file.c b/libtomcrypt/src/mac/hmac/hmac_file.c
index f88d692..f88d692 100644
--- a/src/mac/hmac/hmac_file.c
+++ b/libtomcrypt/src/mac/hmac/hmac_file.c
diff --git a/src/mac/hmac/hmac_init.c b/libtomcrypt/src/mac/hmac/hmac_init.c
index e64d47f..e64d47f 100644
--- a/src/mac/hmac/hmac_init.c
+++ b/libtomcrypt/src/mac/hmac/hmac_init.c
diff --git a/src/mac/hmac/hmac_memory.c b/libtomcrypt/src/mac/hmac/hmac_memory.c
index 99959f4..99959f4 100644
--- a/src/mac/hmac/hmac_memory.c
+++ b/libtomcrypt/src/mac/hmac/hmac_memory.c
diff --git a/src/mac/hmac/hmac_memory_multi.c b/libtomcrypt/src/mac/hmac/hmac_memory_multi.c
index b7a6d96..b7a6d96 100644
--- a/src/mac/hmac/hmac_memory_multi.c
+++ b/libtomcrypt/src/mac/hmac/hmac_memory_multi.c
diff --git a/src/mac/hmac/hmac_process.c b/libtomcrypt/src/mac/hmac/hmac_process.c
index 2919282..2919282 100644
--- a/src/mac/hmac/hmac_process.c
+++ b/libtomcrypt/src/mac/hmac/hmac_process.c
diff --git a/src/mac/hmac/hmac_test.c b/libtomcrypt/src/mac/hmac/hmac_test.c
index 87758f5..87758f5 100644
--- a/src/mac/hmac/hmac_test.c
+++ b/libtomcrypt/src/mac/hmac/hmac_test.c
diff --git a/src/mac/omac/omac_done.c b/libtomcrypt/src/mac/omac/omac_done.c
index 37292d5..37292d5 100644
--- a/src/mac/omac/omac_done.c
+++ b/libtomcrypt/src/mac/omac/omac_done.c
diff --git a/src/mac/omac/omac_file.c b/libtomcrypt/src/mac/omac/omac_file.c
index 63784b0..63784b0 100644
--- a/src/mac/omac/omac_file.c
+++ b/libtomcrypt/src/mac/omac/omac_file.c
diff --git a/src/mac/omac/omac_init.c b/libtomcrypt/src/mac/omac/omac_init.c
index 8866772..8866772 100644
--- a/src/mac/omac/omac_init.c
+++ b/libtomcrypt/src/mac/omac/omac_init.c
diff --git a/src/mac/omac/omac_memory.c b/libtomcrypt/src/mac/omac/omac_memory.c
index 3b1521c..3b1521c 100644
--- a/src/mac/omac/omac_memory.c
+++ b/libtomcrypt/src/mac/omac/omac_memory.c
diff --git a/src/mac/omac/omac_memory_multi.c b/libtomcrypt/src/mac/omac/omac_memory_multi.c
index 52f7323..52f7323 100644
--- a/src/mac/omac/omac_memory_multi.c
+++ b/libtomcrypt/src/mac/omac/omac_memory_multi.c
diff --git a/src/mac/omac/omac_process.c b/libtomcrypt/src/mac/omac/omac_process.c
index ca3ec6b..ca3ec6b 100644
--- a/src/mac/omac/omac_process.c
+++ b/libtomcrypt/src/mac/omac/omac_process.c
diff --git a/src/mac/omac/omac_test.c b/libtomcrypt/src/mac/omac/omac_test.c
index 1bc9ead..1bc9ead 100644
--- a/src/mac/omac/omac_test.c
+++ b/libtomcrypt/src/mac/omac/omac_test.c
diff --git a/src/mac/pelican/pelican.c b/libtomcrypt/src/mac/pelican/pelican.c
index 85bf9ee..85bf9ee 100644
--- a/src/mac/pelican/pelican.c
+++ b/libtomcrypt/src/mac/pelican/pelican.c
diff --git a/src/mac/pelican/pelican_memory.c b/libtomcrypt/src/mac/pelican/pelican_memory.c
index 093340d..093340d 100644
--- a/src/mac/pelican/pelican_memory.c
+++ b/libtomcrypt/src/mac/pelican/pelican_memory.c
diff --git a/src/mac/pelican/pelican_test.c b/libtomcrypt/src/mac/pelican/pelican_test.c
index 06ec3f0..06ec3f0 100644
--- a/src/mac/pelican/pelican_test.c
+++ b/libtomcrypt/src/mac/pelican/pelican_test.c
diff --git a/src/mac/pmac/pmac_done.c b/libtomcrypt/src/mac/pmac/pmac_done.c
index 09c430c..09c430c 100644
--- a/src/mac/pmac/pmac_done.c
+++ b/libtomcrypt/src/mac/pmac/pmac_done.c
diff --git a/src/mac/pmac/pmac_file.c b/libtomcrypt/src/mac/pmac/pmac_file.c
index 1034c6f..1034c6f 100644
--- a/src/mac/pmac/pmac_file.c
+++ b/libtomcrypt/src/mac/pmac/pmac_file.c
diff --git a/src/mac/pmac/pmac_init.c b/libtomcrypt/src/mac/pmac/pmac_init.c
index 5bd94a0..5bd94a0 100644
--- a/src/mac/pmac/pmac_init.c
+++ b/libtomcrypt/src/mac/pmac/pmac_init.c
diff --git a/src/mac/pmac/pmac_memory.c b/libtomcrypt/src/mac/pmac/pmac_memory.c
index fcdef99..fcdef99 100644
--- a/src/mac/pmac/pmac_memory.c
+++ b/libtomcrypt/src/mac/pmac/pmac_memory.c
diff --git a/src/mac/pmac/pmac_memory_multi.c b/libtomcrypt/src/mac/pmac/pmac_memory_multi.c
index 2cc8572..2cc8572 100644
--- a/src/mac/pmac/pmac_memory_multi.c
+++ b/libtomcrypt/src/mac/pmac/pmac_memory_multi.c
diff --git a/src/mac/pmac/pmac_ntz.c b/libtomcrypt/src/mac/pmac/pmac_ntz.c
index 7ec4550..7ec4550 100644
--- a/src/mac/pmac/pmac_ntz.c
+++ b/libtomcrypt/src/mac/pmac/pmac_ntz.c
diff --git a/src/mac/pmac/pmac_process.c b/libtomcrypt/src/mac/pmac/pmac_process.c
index 9ebd44e..9ebd44e 100644
--- a/src/mac/pmac/pmac_process.c
+++ b/libtomcrypt/src/mac/pmac/pmac_process.c
diff --git a/src/mac/pmac/pmac_shift_xor.c b/libtomcrypt/src/mac/pmac/pmac_shift_xor.c
index f24c22b..f24c22b 100644
--- a/src/mac/pmac/pmac_shift_xor.c
+++ b/libtomcrypt/src/mac/pmac/pmac_shift_xor.c
diff --git a/src/mac/pmac/pmac_test.c b/libtomcrypt/src/mac/pmac/pmac_test.c
index 6d88703..6d88703 100644
--- a/src/mac/pmac/pmac_test.c
+++ b/libtomcrypt/src/mac/pmac/pmac_test.c
diff --git a/src/misc/base64/base64_decode.c b/libtomcrypt/src/misc/base64/base64_decode.c
index 551add1..551add1 100644
--- a/src/misc/base64/base64_decode.c
+++ b/libtomcrypt/src/misc/base64/base64_decode.c
diff --git a/src/misc/base64/base64_encode.c b/libtomcrypt/src/misc/base64/base64_encode.c
index 5978643..5978643 100644
--- a/src/misc/base64/base64_encode.c
+++ b/libtomcrypt/src/misc/base64/base64_encode.c
diff --git a/src/misc/burn_stack.c b/libtomcrypt/src/misc/burn_stack.c
index 7ac6518..7ac6518 100644
--- a/src/misc/burn_stack.c
+++ b/libtomcrypt/src/misc/burn_stack.c
diff --git a/src/misc/crypt/crypt.c b/libtomcrypt/src/misc/crypt/crypt.c
index eb75b8d..eb75b8d 100644
--- a/src/misc/crypt/crypt.c
+++ b/libtomcrypt/src/misc/crypt/crypt.c
diff --git a/src/misc/crypt/crypt_argchk.c b/libtomcrypt/src/misc/crypt/crypt_argchk.c
index 699c6cf..699c6cf 100644
--- a/src/misc/crypt/crypt_argchk.c
+++ b/libtomcrypt/src/misc/crypt/crypt_argchk.c
diff --git a/src/misc/crypt/crypt_cipher_descriptor.c b/libtomcrypt/src/misc/crypt/crypt_cipher_descriptor.c
index 127e85c..127e85c 100644
--- a/src/misc/crypt/crypt_cipher_descriptor.c
+++ b/libtomcrypt/src/misc/crypt/crypt_cipher_descriptor.c
diff --git a/src/misc/crypt/crypt_cipher_is_valid.c b/libtomcrypt/src/misc/crypt/crypt_cipher_is_valid.c
index 7830303..7830303 100644
--- a/src/misc/crypt/crypt_cipher_is_valid.c
+++ b/libtomcrypt/src/misc/crypt/crypt_cipher_is_valid.c
diff --git a/src/misc/crypt/crypt_find_cipher.c b/libtomcrypt/src/misc/crypt/crypt_find_cipher.c
index e7fba73..e7fba73 100644
--- a/src/misc/crypt/crypt_find_cipher.c
+++ b/libtomcrypt/src/misc/crypt/crypt_find_cipher.c
diff --git a/src/misc/crypt/crypt_find_cipher_any.c b/libtomcrypt/src/misc/crypt/crypt_find_cipher_any.c
index 6dd0e6d..6dd0e6d 100644
--- a/src/misc/crypt/crypt_find_cipher_any.c
+++ b/libtomcrypt/src/misc/crypt/crypt_find_cipher_any.c
diff --git a/src/misc/crypt/crypt_find_cipher_id.c b/libtomcrypt/src/misc/crypt/crypt_find_cipher_id.c
index 5f5f874..5f5f874 100644
--- a/src/misc/crypt/crypt_find_cipher_id.c
+++ b/libtomcrypt/src/misc/crypt/crypt_find_cipher_id.c
diff --git a/src/misc/crypt/crypt_find_hash.c b/libtomcrypt/src/misc/crypt/crypt_find_hash.c
index d02a449..d02a449 100644
--- a/src/misc/crypt/crypt_find_hash.c
+++ b/libtomcrypt/src/misc/crypt/crypt_find_hash.c
diff --git a/src/misc/crypt/crypt_find_hash_any.c b/libtomcrypt/src/misc/crypt/crypt_find_hash_any.c
index 1172f22..1172f22 100644
--- a/src/misc/crypt/crypt_find_hash_any.c
+++ b/libtomcrypt/src/misc/crypt/crypt_find_hash_any.c
diff --git a/src/misc/crypt/crypt_find_hash_id.c b/libtomcrypt/src/misc/crypt/crypt_find_hash_id.c
index 8cd0d38..8cd0d38 100644
--- a/src/misc/crypt/crypt_find_hash_id.c
+++ b/libtomcrypt/src/misc/crypt/crypt_find_hash_id.c
diff --git a/src/misc/crypt/crypt_find_prng.c b/libtomcrypt/src/misc/crypt/crypt_find_prng.c
index 503813d..503813d 100644
--- a/src/misc/crypt/crypt_find_prng.c
+++ b/libtomcrypt/src/misc/crypt/crypt_find_prng.c
diff --git a/src/misc/crypt/crypt_hash_descriptor.c b/libtomcrypt/src/misc/crypt/crypt_hash_descriptor.c
index f8583d8..f8583d8 100644
--- a/src/misc/crypt/crypt_hash_descriptor.c
+++ b/libtomcrypt/src/misc/crypt/crypt_hash_descriptor.c
diff --git a/src/misc/crypt/crypt_hash_is_valid.c b/libtomcrypt/src/misc/crypt/crypt_hash_is_valid.c
index a8130d4..a8130d4 100644
--- a/src/misc/crypt/crypt_hash_is_valid.c
+++ b/libtomcrypt/src/misc/crypt/crypt_hash_is_valid.c
diff --git a/src/misc/crypt/crypt_prng_descriptor.c b/libtomcrypt/src/misc/crypt/crypt_prng_descriptor.c
index 7335f5d..7335f5d 100644
--- a/src/misc/crypt/crypt_prng_descriptor.c
+++ b/libtomcrypt/src/misc/crypt/crypt_prng_descriptor.c
diff --git a/src/misc/crypt/crypt_prng_is_valid.c b/libtomcrypt/src/misc/crypt/crypt_prng_is_valid.c
index a7a7ed6..a7a7ed6 100644
--- a/src/misc/crypt/crypt_prng_is_valid.c
+++ b/libtomcrypt/src/misc/crypt/crypt_prng_is_valid.c
diff --git a/src/misc/crypt/crypt_register_cipher.c b/libtomcrypt/src/misc/crypt/crypt_register_cipher.c
index c55d7c0..c55d7c0 100644
--- a/src/misc/crypt/crypt_register_cipher.c
+++ b/libtomcrypt/src/misc/crypt/crypt_register_cipher.c
diff --git a/src/misc/crypt/crypt_register_hash.c b/libtomcrypt/src/misc/crypt/crypt_register_hash.c
index 0ff521f..0ff521f 100644
--- a/src/misc/crypt/crypt_register_hash.c
+++ b/libtomcrypt/src/misc/crypt/crypt_register_hash.c
diff --git a/src/misc/crypt/crypt_register_prng.c b/libtomcrypt/src/misc/crypt/crypt_register_prng.c
index 5ab4a49..5ab4a49 100644
--- a/src/misc/crypt/crypt_register_prng.c
+++ b/libtomcrypt/src/misc/crypt/crypt_register_prng.c
diff --git a/src/misc/crypt/crypt_unregister_cipher.c b/libtomcrypt/src/misc/crypt/crypt_unregister_cipher.c
index 4081ca8..4081ca8 100644
--- a/src/misc/crypt/crypt_unregister_cipher.c
+++ b/libtomcrypt/src/misc/crypt/crypt_unregister_cipher.c
diff --git a/src/misc/crypt/crypt_unregister_hash.c b/libtomcrypt/src/misc/crypt/crypt_unregister_hash.c
index fdd25b2..fdd25b2 100644
--- a/src/misc/crypt/crypt_unregister_hash.c
+++ b/libtomcrypt/src/misc/crypt/crypt_unregister_hash.c
diff --git a/src/misc/crypt/crypt_unregister_prng.c b/libtomcrypt/src/misc/crypt/crypt_unregister_prng.c
index 4aff6de..4aff6de 100644
--- a/src/misc/crypt/crypt_unregister_prng.c
+++ b/libtomcrypt/src/misc/crypt/crypt_unregister_prng.c
diff --git a/src/misc/error_to_string.c b/libtomcrypt/src/misc/error_to_string.c
index 6167a70..6167a70 100644
--- a/src/misc/error_to_string.c
+++ b/libtomcrypt/src/misc/error_to_string.c
diff --git a/src/misc/mpi/is_prime.c b/libtomcrypt/src/misc/mpi/is_prime.c
index 9ee4ed2..9ee4ed2 100644
--- a/src/misc/mpi/is_prime.c
+++ b/libtomcrypt/src/misc/mpi/is_prime.c
diff --git a/src/misc/mpi/mpi_to_ltc_error.c b/libtomcrypt/src/misc/mpi/mpi_to_ltc_error.c
index bc39ea1..bc39ea1 100644
--- a/src/misc/mpi/mpi_to_ltc_error.c
+++ b/libtomcrypt/src/misc/mpi/mpi_to_ltc_error.c
diff --git a/src/misc/mpi/rand_prime.c b/libtomcrypt/src/misc/mpi/rand_prime.c
index 9c5921a..9c5921a 100644
--- a/src/misc/mpi/rand_prime.c
+++ b/libtomcrypt/src/misc/mpi/rand_prime.c
diff --git a/src/misc/pkcs5/pkcs_5_1.c b/libtomcrypt/src/misc/pkcs5/pkcs_5_1.c
index 90411b7..90411b7 100644
--- a/src/misc/pkcs5/pkcs_5_1.c
+++ b/libtomcrypt/src/misc/pkcs5/pkcs_5_1.c
diff --git a/src/misc/pkcs5/pkcs_5_2.c b/libtomcrypt/src/misc/pkcs5/pkcs_5_2.c
index 08d8a18..08d8a18 100644
--- a/src/misc/pkcs5/pkcs_5_2.c
+++ b/libtomcrypt/src/misc/pkcs5/pkcs_5_2.c
diff --git a/src/misc/zeromem.c b/libtomcrypt/src/misc/zeromem.c
index 1ab07dc..1ab07dc 100644
--- a/src/misc/zeromem.c
+++ b/libtomcrypt/src/misc/zeromem.c
diff --git a/src/modes/cbc/cbc_decrypt.c b/libtomcrypt/src/modes/cbc/cbc_decrypt.c
index fc3fdd5..fc3fdd5 100644
--- a/src/modes/cbc/cbc_decrypt.c
+++ b/libtomcrypt/src/modes/cbc/cbc_decrypt.c
diff --git a/src/modes/cbc/cbc_done.c b/libtomcrypt/src/modes/cbc/cbc_done.c
index afaa9bb..afaa9bb 100644
--- a/src/modes/cbc/cbc_done.c
+++ b/libtomcrypt/src/modes/cbc/cbc_done.c
diff --git a/src/modes/cbc/cbc_encrypt.c b/libtomcrypt/src/modes/cbc/cbc_encrypt.c
index a6b41b1..a6b41b1 100644
--- a/src/modes/cbc/cbc_encrypt.c
+++ b/libtomcrypt/src/modes/cbc/cbc_encrypt.c
diff --git a/src/modes/cbc/cbc_getiv.c b/libtomcrypt/src/modes/cbc/cbc_getiv.c
index ab418b1..ab418b1 100644
--- a/src/modes/cbc/cbc_getiv.c
+++ b/libtomcrypt/src/modes/cbc/cbc_getiv.c
diff --git a/src/modes/cbc/cbc_setiv.c b/libtomcrypt/src/modes/cbc/cbc_setiv.c
index c38e713..c38e713 100644
--- a/src/modes/cbc/cbc_setiv.c
+++ b/libtomcrypt/src/modes/cbc/cbc_setiv.c
diff --git a/src/modes/cbc/cbc_start.c b/libtomcrypt/src/modes/cbc/cbc_start.c
index 833bb87..833bb87 100644
--- a/src/modes/cbc/cbc_start.c
+++ b/libtomcrypt/src/modes/cbc/cbc_start.c
diff --git a/src/modes/cfb/cfb_decrypt.c b/libtomcrypt/src/modes/cfb/cfb_decrypt.c
index 3d51ba5..3d51ba5 100644
--- a/src/modes/cfb/cfb_decrypt.c
+++ b/libtomcrypt/src/modes/cfb/cfb_decrypt.c
diff --git a/src/modes/cfb/cfb_done.c b/libtomcrypt/src/modes/cfb/cfb_done.c
index 8924761..8924761 100644
--- a/src/modes/cfb/cfb_done.c
+++ b/libtomcrypt/src/modes/cfb/cfb_done.c
diff --git a/src/modes/cfb/cfb_encrypt.c b/libtomcrypt/src/modes/cfb/cfb_encrypt.c
index cca0116..cca0116 100644
--- a/src/modes/cfb/cfb_encrypt.c
+++ b/libtomcrypt/src/modes/cfb/cfb_encrypt.c
diff --git a/src/modes/cfb/cfb_getiv.c b/libtomcrypt/src/modes/cfb/cfb_getiv.c
index 5c5b4c4..5c5b4c4 100644
--- a/src/modes/cfb/cfb_getiv.c
+++ b/libtomcrypt/src/modes/cfb/cfb_getiv.c
diff --git a/src/modes/cfb/cfb_setiv.c b/libtomcrypt/src/modes/cfb/cfb_setiv.c
index d075a0d..d075a0d 100644
--- a/src/modes/cfb/cfb_setiv.c
+++ b/libtomcrypt/src/modes/cfb/cfb_setiv.c
diff --git a/src/modes/cfb/cfb_start.c b/libtomcrypt/src/modes/cfb/cfb_start.c
index 755e173..755e173 100644
--- a/src/modes/cfb/cfb_start.c
+++ b/libtomcrypt/src/modes/cfb/cfb_start.c
diff --git a/src/modes/ctr/ctr_decrypt.c b/libtomcrypt/src/modes/ctr/ctr_decrypt.c
index e1d1d51..e1d1d51 100644
--- a/src/modes/ctr/ctr_decrypt.c
+++ b/libtomcrypt/src/modes/ctr/ctr_decrypt.c
diff --git a/src/modes/ctr/ctr_done.c b/libtomcrypt/src/modes/ctr/ctr_done.c
index f2e79ba..f2e79ba 100644
--- a/src/modes/ctr/ctr_done.c
+++ b/libtomcrypt/src/modes/ctr/ctr_done.c
diff --git a/src/modes/ctr/ctr_encrypt.c b/libtomcrypt/src/modes/ctr/ctr_encrypt.c
index 79795ae..79795ae 100644
--- a/src/modes/ctr/ctr_encrypt.c
+++ b/libtomcrypt/src/modes/ctr/ctr_encrypt.c
diff --git a/src/modes/ctr/ctr_getiv.c b/libtomcrypt/src/modes/ctr/ctr_getiv.c
index 50ce6a0..50ce6a0 100644
--- a/src/modes/ctr/ctr_getiv.c
+++ b/libtomcrypt/src/modes/ctr/ctr_getiv.c
diff --git a/src/modes/ctr/ctr_setiv.c b/libtomcrypt/src/modes/ctr/ctr_setiv.c
index 64d4c43..64d4c43 100644
--- a/src/modes/ctr/ctr_setiv.c
+++ b/libtomcrypt/src/modes/ctr/ctr_setiv.c
diff --git a/src/modes/ctr/ctr_start.c b/libtomcrypt/src/modes/ctr/ctr_start.c
index 7c7eebb..7c7eebb 100644
--- a/src/modes/ctr/ctr_start.c
+++ b/libtomcrypt/src/modes/ctr/ctr_start.c
diff --git a/src/modes/ecb/ecb_decrypt.c b/libtomcrypt/src/modes/ecb/ecb_decrypt.c
index aa83661..aa83661 100644
--- a/src/modes/ecb/ecb_decrypt.c
+++ b/libtomcrypt/src/modes/ecb/ecb_decrypt.c
diff --git a/src/modes/ecb/ecb_done.c b/libtomcrypt/src/modes/ecb/ecb_done.c
index a072615..a072615 100644
--- a/src/modes/ecb/ecb_done.c
+++ b/libtomcrypt/src/modes/ecb/ecb_done.c
diff --git a/src/modes/ecb/ecb_encrypt.c b/libtomcrypt/src/modes/ecb/ecb_encrypt.c
index 21e0385..21e0385 100644
--- a/src/modes/ecb/ecb_encrypt.c
+++ b/libtomcrypt/src/modes/ecb/ecb_encrypt.c
diff --git a/src/modes/ecb/ecb_start.c b/libtomcrypt/src/modes/ecb/ecb_start.c
index f7baa81..f7baa81 100644
--- a/src/modes/ecb/ecb_start.c
+++ b/libtomcrypt/src/modes/ecb/ecb_start.c
diff --git a/src/modes/ofb/ofb_decrypt.c b/libtomcrypt/src/modes/ofb/ofb_decrypt.c
index cf5f19d..cf5f19d 100644
--- a/src/modes/ofb/ofb_decrypt.c
+++ b/libtomcrypt/src/modes/ofb/ofb_decrypt.c
diff --git a/src/modes/ofb/ofb_done.c b/libtomcrypt/src/modes/ofb/ofb_done.c
index 5e114f4..5e114f4 100644
--- a/src/modes/ofb/ofb_done.c
+++ b/libtomcrypt/src/modes/ofb/ofb_done.c
diff --git a/src/modes/ofb/ofb_encrypt.c b/libtomcrypt/src/modes/ofb/ofb_encrypt.c
index d66979a..d66979a 100644
--- a/src/modes/ofb/ofb_encrypt.c
+++ b/libtomcrypt/src/modes/ofb/ofb_encrypt.c
diff --git a/src/modes/ofb/ofb_getiv.c b/libtomcrypt/src/modes/ofb/ofb_getiv.c
index f945fff..f945fff 100644
--- a/src/modes/ofb/ofb_getiv.c
+++ b/libtomcrypt/src/modes/ofb/ofb_getiv.c
diff --git a/src/modes/ofb/ofb_setiv.c b/libtomcrypt/src/modes/ofb/ofb_setiv.c
index f678601..f678601 100644
--- a/src/modes/ofb/ofb_setiv.c
+++ b/libtomcrypt/src/modes/ofb/ofb_setiv.c
diff --git a/src/modes/ofb/ofb_start.c b/libtomcrypt/src/modes/ofb/ofb_start.c
index 083e381..083e381 100644
--- a/src/modes/ofb/ofb_start.c
+++ b/libtomcrypt/src/modes/ofb/ofb_start.c
diff --git a/src/pk/asn1/der/bit/der_decode_bit_string.c b/libtomcrypt/src/pk/asn1/der/bit/der_decode_bit_string.c
index da5b989..da5b989 100644
--- a/src/pk/asn1/der/bit/der_decode_bit_string.c
+++ b/libtomcrypt/src/pk/asn1/der/bit/der_decode_bit_string.c
diff --git a/src/pk/asn1/der/bit/der_encode_bit_string.c b/libtomcrypt/src/pk/asn1/der/bit/der_encode_bit_string.c
index 569c15b..569c15b 100644
--- a/src/pk/asn1/der/bit/der_encode_bit_string.c
+++ b/libtomcrypt/src/pk/asn1/der/bit/der_encode_bit_string.c
diff --git a/src/pk/asn1/der/bit/der_length_bit_string.c b/libtomcrypt/src/pk/asn1/der/bit/der_length_bit_string.c
index dd6ea6d..dd6ea6d 100644
--- a/src/pk/asn1/der/bit/der_length_bit_string.c
+++ b/libtomcrypt/src/pk/asn1/der/bit/der_length_bit_string.c
diff --git a/src/pk/asn1/der/choice/der_decode_choice.c b/libtomcrypt/src/pk/asn1/der/choice/der_decode_choice.c
index 61cba11..61cba11 100644
--- a/src/pk/asn1/der/choice/der_decode_choice.c
+++ b/libtomcrypt/src/pk/asn1/der/choice/der_decode_choice.c
diff --git a/src/pk/asn1/der/ia5/der_decode_ia5_string.c b/libtomcrypt/src/pk/asn1/der/ia5/der_decode_ia5_string.c
index ac0a4af..ac0a4af 100644
--- a/src/pk/asn1/der/ia5/der_decode_ia5_string.c
+++ b/libtomcrypt/src/pk/asn1/der/ia5/der_decode_ia5_string.c
diff --git a/src/pk/asn1/der/ia5/der_encode_ia5_string.c b/libtomcrypt/src/pk/asn1/der/ia5/der_encode_ia5_string.c
index a79b46e..a79b46e 100644
--- a/src/pk/asn1/der/ia5/der_encode_ia5_string.c
+++ b/libtomcrypt/src/pk/asn1/der/ia5/der_encode_ia5_string.c
diff --git a/src/pk/asn1/der/ia5/der_length_ia5_string.c b/libtomcrypt/src/pk/asn1/der/ia5/der_length_ia5_string.c
index d07d630..d07d630 100644
--- a/src/pk/asn1/der/ia5/der_length_ia5_string.c
+++ b/libtomcrypt/src/pk/asn1/der/ia5/der_length_ia5_string.c
diff --git a/src/pk/asn1/der/integer/der_decode_integer.c b/libtomcrypt/src/pk/asn1/der/integer/der_decode_integer.c
index e68b2c9..e68b2c9 100644
--- a/src/pk/asn1/der/integer/der_decode_integer.c
+++ b/libtomcrypt/src/pk/asn1/der/integer/der_decode_integer.c
diff --git a/src/pk/asn1/der/integer/der_encode_integer.c b/libtomcrypt/src/pk/asn1/der/integer/der_encode_integer.c
index f0f41be..f0f41be 100644
--- a/src/pk/asn1/der/integer/der_encode_integer.c
+++ b/libtomcrypt/src/pk/asn1/der/integer/der_encode_integer.c
diff --git a/src/pk/asn1/der/integer/der_length_integer.c b/libtomcrypt/src/pk/asn1/der/integer/der_length_integer.c
index 1bfee45..1bfee45 100644
--- a/src/pk/asn1/der/integer/der_length_integer.c
+++ b/libtomcrypt/src/pk/asn1/der/integer/der_length_integer.c
diff --git a/src/pk/asn1/der/object_identifier/der_decode_object_identifier.c b/libtomcrypt/src/pk/asn1/der/object_identifier/der_decode_object_identifier.c
index c69c9a3..c69c9a3 100644
--- a/src/pk/asn1/der/object_identifier/der_decode_object_identifier.c
+++ b/libtomcrypt/src/pk/asn1/der/object_identifier/der_decode_object_identifier.c
diff --git a/src/pk/asn1/der/object_identifier/der_encode_object_identifier.c b/libtomcrypt/src/pk/asn1/der/object_identifier/der_encode_object_identifier.c
index 16eb112..16eb112 100644
--- a/src/pk/asn1/der/object_identifier/der_encode_object_identifier.c
+++ b/libtomcrypt/src/pk/asn1/der/object_identifier/der_encode_object_identifier.c
diff --git a/src/pk/asn1/der/object_identifier/der_length_object_identifier.c b/libtomcrypt/src/pk/asn1/der/object_identifier/der_length_object_identifier.c
index d03d1a7..d03d1a7 100644
--- a/src/pk/asn1/der/object_identifier/der_length_object_identifier.c
+++ b/libtomcrypt/src/pk/asn1/der/object_identifier/der_length_object_identifier.c
diff --git a/src/pk/asn1/der/octet/der_decode_octet_string.c b/libtomcrypt/src/pk/asn1/der/octet/der_decode_octet_string.c
index 2d3800f..2d3800f 100644
--- a/src/pk/asn1/der/octet/der_decode_octet_string.c
+++ b/libtomcrypt/src/pk/asn1/der/octet/der_decode_octet_string.c
diff --git a/src/pk/asn1/der/octet/der_encode_octet_string.c b/libtomcrypt/src/pk/asn1/der/octet/der_encode_octet_string.c
index f34b708..f34b708 100644
--- a/src/pk/asn1/der/octet/der_encode_octet_string.c
+++ b/libtomcrypt/src/pk/asn1/der/octet/der_encode_octet_string.c
diff --git a/src/pk/asn1/der/octet/der_length_octet_string.c b/libtomcrypt/src/pk/asn1/der/octet/der_length_octet_string.c
index 9a0bed3..9a0bed3 100644
--- a/src/pk/asn1/der/octet/der_length_octet_string.c
+++ b/libtomcrypt/src/pk/asn1/der/octet/der_length_octet_string.c
diff --git a/src/pk/asn1/der/printable_string/der_decode_printable_string.c b/libtomcrypt/src/pk/asn1/der/printable_string/der_decode_printable_string.c
index cbc9239..cbc9239 100644
--- a/src/pk/asn1/der/printable_string/der_decode_printable_string.c
+++ b/libtomcrypt/src/pk/asn1/der/printable_string/der_decode_printable_string.c
diff --git a/src/pk/asn1/der/printable_string/der_encode_printable_string.c b/libtomcrypt/src/pk/asn1/der/printable_string/der_encode_printable_string.c
index ab4e8bf..ab4e8bf 100644
--- a/src/pk/asn1/der/printable_string/der_encode_printable_string.c
+++ b/libtomcrypt/src/pk/asn1/der/printable_string/der_encode_printable_string.c
diff --git a/src/pk/asn1/der/printable_string/der_length_printable_string.c b/libtomcrypt/src/pk/asn1/der/printable_string/der_length_printable_string.c
index 34065f2..34065f2 100644
--- a/src/pk/asn1/der/printable_string/der_length_printable_string.c
+++ b/libtomcrypt/src/pk/asn1/der/printable_string/der_length_printable_string.c
diff --git a/src/pk/asn1/der/sequence/der_decode_sequence.c b/libtomcrypt/src/pk/asn1/der/sequence/der_decode_sequence.c
index 335a882..335a882 100644
--- a/src/pk/asn1/der/sequence/der_decode_sequence.c
+++ b/libtomcrypt/src/pk/asn1/der/sequence/der_decode_sequence.c
diff --git a/src/pk/asn1/der/sequence/der_decode_sequence_multi.c b/libtomcrypt/src/pk/asn1/der/sequence/der_decode_sequence_multi.c
index a4a1038..a4a1038 100644
--- a/src/pk/asn1/der/sequence/der_decode_sequence_multi.c
+++ b/libtomcrypt/src/pk/asn1/der/sequence/der_decode_sequence_multi.c
diff --git a/src/pk/asn1/der/sequence/der_encode_sequence.c b/libtomcrypt/src/pk/asn1/der/sequence/der_encode_sequence.c
index d53153c..d53153c 100644
--- a/src/pk/asn1/der/sequence/der_encode_sequence.c
+++ b/libtomcrypt/src/pk/asn1/der/sequence/der_encode_sequence.c
diff --git a/src/pk/asn1/der/sequence/der_encode_sequence_multi.c b/libtomcrypt/src/pk/asn1/der/sequence/der_encode_sequence_multi.c
index 9ff48b9..9ff48b9 100644
--- a/src/pk/asn1/der/sequence/der_encode_sequence_multi.c
+++ b/libtomcrypt/src/pk/asn1/der/sequence/der_encode_sequence_multi.c
diff --git a/src/pk/asn1/der/sequence/der_length_sequence.c b/libtomcrypt/src/pk/asn1/der/sequence/der_length_sequence.c
index 9120451..9120451 100644
--- a/src/pk/asn1/der/sequence/der_length_sequence.c
+++ b/libtomcrypt/src/pk/asn1/der/sequence/der_length_sequence.c
diff --git a/src/pk/asn1/der/short_integer/der_decode_short_integer.c b/libtomcrypt/src/pk/asn1/der/short_integer/der_decode_short_integer.c
index 7d3d3b7..7d3d3b7 100644
--- a/src/pk/asn1/der/short_integer/der_decode_short_integer.c
+++ b/libtomcrypt/src/pk/asn1/der/short_integer/der_decode_short_integer.c
diff --git a/src/pk/asn1/der/short_integer/der_encode_short_integer.c b/libtomcrypt/src/pk/asn1/der/short_integer/der_encode_short_integer.c
index eb94d38..eb94d38 100644
--- a/src/pk/asn1/der/short_integer/der_encode_short_integer.c
+++ b/libtomcrypt/src/pk/asn1/der/short_integer/der_encode_short_integer.c
diff --git a/src/pk/asn1/der/short_integer/der_length_short_integer.c b/libtomcrypt/src/pk/asn1/der/short_integer/der_length_short_integer.c
index 7324b4a..7324b4a 100644
--- a/src/pk/asn1/der/short_integer/der_length_short_integer.c
+++ b/libtomcrypt/src/pk/asn1/der/short_integer/der_length_short_integer.c
diff --git a/src/pk/asn1/der/utctime/der_decode_utctime.c b/libtomcrypt/src/pk/asn1/der/utctime/der_decode_utctime.c
index 43ba033..43ba033 100644
--- a/src/pk/asn1/der/utctime/der_decode_utctime.c
+++ b/libtomcrypt/src/pk/asn1/der/utctime/der_decode_utctime.c
diff --git a/src/pk/asn1/der/utctime/der_encode_utctime.c b/libtomcrypt/src/pk/asn1/der/utctime/der_encode_utctime.c
index dad8c84..dad8c84 100644
--- a/src/pk/asn1/der/utctime/der_encode_utctime.c
+++ b/libtomcrypt/src/pk/asn1/der/utctime/der_encode_utctime.c
diff --git a/src/pk/asn1/der/utctime/der_length_utctime.c b/libtomcrypt/src/pk/asn1/der/utctime/der_length_utctime.c
index 84e654a..84e654a 100644
--- a/src/pk/asn1/der/utctime/der_length_utctime.c
+++ b/libtomcrypt/src/pk/asn1/der/utctime/der_length_utctime.c
diff --git a/src/pk/dh/dh.c b/libtomcrypt/src/pk/dh/dh.c
index a54f29b..a54f29b 100644
--- a/src/pk/dh/dh.c
+++ b/libtomcrypt/src/pk/dh/dh.c
diff --git a/src/pk/dh/dh_sys.c b/libtomcrypt/src/pk/dh/dh_sys.c
index 4f10556..4f10556 100644
--- a/src/pk/dh/dh_sys.c
+++ b/libtomcrypt/src/pk/dh/dh_sys.c
diff --git a/src/pk/dsa/dsa_export.c b/libtomcrypt/src/pk/dsa/dsa_export.c
index 5a093d9..5a093d9 100644
--- a/src/pk/dsa/dsa_export.c
+++ b/libtomcrypt/src/pk/dsa/dsa_export.c
diff --git a/src/pk/dsa/dsa_free.c b/libtomcrypt/src/pk/dsa/dsa_free.c
index 9157acb..9157acb 100644
--- a/src/pk/dsa/dsa_free.c
+++ b/libtomcrypt/src/pk/dsa/dsa_free.c
diff --git a/src/pk/dsa/dsa_import.c b/libtomcrypt/src/pk/dsa/dsa_import.c
index e81bac8..e81bac8 100644
--- a/src/pk/dsa/dsa_import.c
+++ b/libtomcrypt/src/pk/dsa/dsa_import.c
diff --git a/src/pk/dsa/dsa_make_key.c b/libtomcrypt/src/pk/dsa/dsa_make_key.c
index 02f69e0..02f69e0 100644
--- a/src/pk/dsa/dsa_make_key.c
+++ b/libtomcrypt/src/pk/dsa/dsa_make_key.c
diff --git a/src/pk/dsa/dsa_sign_hash.c b/libtomcrypt/src/pk/dsa/dsa_sign_hash.c
index 48d29a2..48d29a2 100644
--- a/src/pk/dsa/dsa_sign_hash.c
+++ b/libtomcrypt/src/pk/dsa/dsa_sign_hash.c
diff --git a/src/pk/dsa/dsa_verify_hash.c b/libtomcrypt/src/pk/dsa/dsa_verify_hash.c
index 11e5c33..11e5c33 100644
--- a/src/pk/dsa/dsa_verify_hash.c
+++ b/libtomcrypt/src/pk/dsa/dsa_verify_hash.c
diff --git a/src/pk/dsa/dsa_verify_key.c b/libtomcrypt/src/pk/dsa/dsa_verify_key.c
index b7be103..b7be103 100644
--- a/src/pk/dsa/dsa_verify_key.c
+++ b/libtomcrypt/src/pk/dsa/dsa_verify_key.c
diff --git a/src/pk/ecc/ecc.c b/libtomcrypt/src/pk/ecc/ecc.c
index 469d56d..469d56d 100644
--- a/src/pk/ecc/ecc.c
+++ b/libtomcrypt/src/pk/ecc/ecc.c
diff --git a/src/pk/ecc/ecc_sys.c b/libtomcrypt/src/pk/ecc/ecc_sys.c
index 65ead31..65ead31 100644
--- a/src/pk/ecc/ecc_sys.c
+++ b/libtomcrypt/src/pk/ecc/ecc_sys.c
diff --git a/src/pk/packet_store_header.c b/libtomcrypt/src/pk/packet_store_header.c
index 855ca70..855ca70 100644
--- a/src/pk/packet_store_header.c
+++ b/libtomcrypt/src/pk/packet_store_header.c
diff --git a/src/pk/packet_valid_header.c b/libtomcrypt/src/pk/packet_valid_header.c
index 59db0f7..59db0f7 100644
--- a/src/pk/packet_valid_header.c
+++ b/libtomcrypt/src/pk/packet_valid_header.c
diff --git a/src/pk/pkcs1/pkcs_1_i2osp.c b/libtomcrypt/src/pk/pkcs1/pkcs_1_i2osp.c
index 3e68eb8..3e68eb8 100644
--- a/src/pk/pkcs1/pkcs_1_i2osp.c
+++ b/libtomcrypt/src/pk/pkcs1/pkcs_1_i2osp.c
diff --git a/src/pk/pkcs1/pkcs_1_mgf1.c b/libtomcrypt/src/pk/pkcs1/pkcs_1_mgf1.c
index aac7494..aac7494 100644
--- a/src/pk/pkcs1/pkcs_1_mgf1.c
+++ b/libtomcrypt/src/pk/pkcs1/pkcs_1_mgf1.c
diff --git a/src/pk/pkcs1/pkcs_1_oaep_decode.c b/libtomcrypt/src/pk/pkcs1/pkcs_1_oaep_decode.c
index cd4931b..cd4931b 100644
--- a/src/pk/pkcs1/pkcs_1_oaep_decode.c
+++ b/libtomcrypt/src/pk/pkcs1/pkcs_1_oaep_decode.c
diff --git a/src/pk/pkcs1/pkcs_1_oaep_encode.c b/libtomcrypt/src/pk/pkcs1/pkcs_1_oaep_encode.c
index 12670b5..12670b5 100644
--- a/src/pk/pkcs1/pkcs_1_oaep_encode.c
+++ b/libtomcrypt/src/pk/pkcs1/pkcs_1_oaep_encode.c
diff --git a/src/pk/pkcs1/pkcs_1_os2ip.c b/libtomcrypt/src/pk/pkcs1/pkcs_1_os2ip.c
index a9a1858..a9a1858 100644
--- a/src/pk/pkcs1/pkcs_1_os2ip.c
+++ b/libtomcrypt/src/pk/pkcs1/pkcs_1_os2ip.c
diff --git a/src/pk/pkcs1/pkcs_1_pss_decode.c b/libtomcrypt/src/pk/pkcs1/pkcs_1_pss_decode.c
index e61c33f..e61c33f 100644
--- a/src/pk/pkcs1/pkcs_1_pss_decode.c
+++ b/libtomcrypt/src/pk/pkcs1/pkcs_1_pss_decode.c
diff --git a/src/pk/pkcs1/pkcs_1_pss_encode.c b/libtomcrypt/src/pk/pkcs1/pkcs_1_pss_encode.c
index 899605c..899605c 100644
--- a/src/pk/pkcs1/pkcs_1_pss_encode.c
+++ b/libtomcrypt/src/pk/pkcs1/pkcs_1_pss_encode.c
diff --git a/src/pk/rsa/rsa_decrypt_key.c b/libtomcrypt/src/pk/rsa/rsa_decrypt_key.c
index 7cea807..7cea807 100644
--- a/src/pk/rsa/rsa_decrypt_key.c
+++ b/libtomcrypt/src/pk/rsa/rsa_decrypt_key.c
diff --git a/src/pk/rsa/rsa_encrypt_key.c b/libtomcrypt/src/pk/rsa/rsa_encrypt_key.c
index d29aa83..d29aa83 100644
--- a/src/pk/rsa/rsa_encrypt_key.c
+++ b/libtomcrypt/src/pk/rsa/rsa_encrypt_key.c
diff --git a/src/pk/rsa/rsa_export.c b/libtomcrypt/src/pk/rsa/rsa_export.c
index f2e324d..f2e324d 100644
--- a/src/pk/rsa/rsa_export.c
+++ b/libtomcrypt/src/pk/rsa/rsa_export.c
diff --git a/src/pk/rsa/rsa_exptmod.c b/libtomcrypt/src/pk/rsa/rsa_exptmod.c
index 379404c..379404c 100644
--- a/src/pk/rsa/rsa_exptmod.c
+++ b/libtomcrypt/src/pk/rsa/rsa_exptmod.c
diff --git a/src/pk/rsa/rsa_free.c b/libtomcrypt/src/pk/rsa/rsa_free.c
index 69eeaec..69eeaec 100644
--- a/src/pk/rsa/rsa_free.c
+++ b/libtomcrypt/src/pk/rsa/rsa_free.c
diff --git a/src/pk/rsa/rsa_import.c b/libtomcrypt/src/pk/rsa/rsa_import.c
index 5706134..5706134 100644
--- a/src/pk/rsa/rsa_import.c
+++ b/libtomcrypt/src/pk/rsa/rsa_import.c
diff --git a/src/pk/rsa/rsa_make_key.c b/libtomcrypt/src/pk/rsa/rsa_make_key.c
index d874106..d874106 100644
--- a/src/pk/rsa/rsa_make_key.c
+++ b/libtomcrypt/src/pk/rsa/rsa_make_key.c
diff --git a/src/pk/rsa/rsa_sign_hash.c b/libtomcrypt/src/pk/rsa/rsa_sign_hash.c
index d31bda3..d31bda3 100644
--- a/src/pk/rsa/rsa_sign_hash.c
+++ b/libtomcrypt/src/pk/rsa/rsa_sign_hash.c
diff --git a/src/pk/rsa/rsa_verify_hash.c b/libtomcrypt/src/pk/rsa/rsa_verify_hash.c
index 690364d..690364d 100644
--- a/src/pk/rsa/rsa_verify_hash.c
+++ b/libtomcrypt/src/pk/rsa/rsa_verify_hash.c
diff --git a/src/prngs/fortuna.c b/libtomcrypt/src/prngs/fortuna.c
index 8a3b8ea..8a3b8ea 100644
--- a/src/prngs/fortuna.c
+++ b/libtomcrypt/src/prngs/fortuna.c
diff --git a/src/prngs/rc4.c b/libtomcrypt/src/prngs/rc4.c
index 4d29d9a..4d29d9a 100644
--- a/src/prngs/rc4.c
+++ b/libtomcrypt/src/prngs/rc4.c
diff --git a/src/prngs/rng_get_bytes.c b/libtomcrypt/src/prngs/rng_get_bytes.c
index a711dfa..a711dfa 100644
--- a/src/prngs/rng_get_bytes.c
+++ b/libtomcrypt/src/prngs/rng_get_bytes.c
diff --git a/src/prngs/rng_make_prng.c b/libtomcrypt/src/prngs/rng_make_prng.c
index 51b6e6c..51b6e6c 100644
--- a/src/prngs/rng_make_prng.c
+++ b/libtomcrypt/src/prngs/rng_make_prng.c
diff --git a/src/prngs/sober128.c b/libtomcrypt/src/prngs/sober128.c
index c89f01c..c89f01c 100644
--- a/src/prngs/sober128.c
+++ b/libtomcrypt/src/prngs/sober128.c
diff --git a/src/prngs/sober128tab.c b/libtomcrypt/src/prngs/sober128tab.c
index b50c77b..b50c77b 100644
--- a/src/prngs/sober128tab.c
+++ b/libtomcrypt/src/prngs/sober128tab.c
diff --git a/src/prngs/sprng.c b/libtomcrypt/src/prngs/sprng.c
index 4e657da..4e657da 100644
--- a/src/prngs/sprng.c
+++ b/libtomcrypt/src/prngs/sprng.c
diff --git a/src/prngs/yarrow.c b/libtomcrypt/src/prngs/yarrow.c
index 0e59c22..0e59c22 100644
--- a/src/prngs/yarrow.c
+++ b/libtomcrypt/src/prngs/yarrow.c
diff --git a/testbuild.sh b/libtomcrypt/testbuild.sh
index a17c677..a17c677 100644
--- a/testbuild.sh
+++ b/libtomcrypt/testbuild.sh
diff --git a/testme.sh b/libtomcrypt/testme.sh
index da0f97b..da0f97b 100644
--- a/testme.sh
+++ b/libtomcrypt/testme.sh
diff --git a/testprof/base64_test.c b/libtomcrypt/testprof/base64_test.c
index af20a67..af20a67 100644
--- a/testprof/base64_test.c
+++ b/libtomcrypt/testprof/base64_test.c
diff --git a/testprof/cipher_hash_test.c b/libtomcrypt/testprof/cipher_hash_test.c
index 27232e2..27232e2 100644
--- a/testprof/cipher_hash_test.c
+++ b/libtomcrypt/testprof/cipher_hash_test.c
diff --git a/testprof/der_tests.c b/libtomcrypt/testprof/der_tests.c
index 8119351..8119351 100644
--- a/testprof/der_tests.c
+++ b/libtomcrypt/testprof/der_tests.c
diff --git a/testprof/dh_tests.c b/libtomcrypt/testprof/dh_tests.c
index 3852e47..3852e47 100644
--- a/testprof/dh_tests.c
+++ b/libtomcrypt/testprof/dh_tests.c
diff --git a/testprof/dsa_test.c b/libtomcrypt/testprof/dsa_test.c
index aa4988c..aa4988c 100644
--- a/testprof/dsa_test.c
+++ b/libtomcrypt/testprof/dsa_test.c
diff --git a/testprof/ecc_test.c b/libtomcrypt/testprof/ecc_test.c
index d5f88ab..d5f88ab 100644
--- a/testprof/ecc_test.c
+++ b/libtomcrypt/testprof/ecc_test.c
diff --git a/testprof/mac_test.c b/libtomcrypt/testprof/mac_test.c
index 3c9e902..3c9e902 100644
--- a/testprof/mac_test.c
+++ b/libtomcrypt/testprof/mac_test.c
diff --git a/testprof/makefile b/libtomcrypt/testprof/makefile
index f4be577..f4be577 100644
--- a/testprof/makefile
+++ b/libtomcrypt/testprof/makefile
diff --git a/testprof/makefile.icc b/libtomcrypt/testprof/makefile.icc
index ff87660..ff87660 100644
--- a/testprof/makefile.icc
+++ b/libtomcrypt/testprof/makefile.icc
diff --git a/testprof/makefile.msvc b/libtomcrypt/testprof/makefile.msvc
index 6e15ffb..6e15ffb 100644
--- a/testprof/makefile.msvc
+++ b/libtomcrypt/testprof/makefile.msvc
diff --git a/testprof/makefile.shared b/libtomcrypt/testprof/makefile.shared
index b4219f0..b4219f0 100644
--- a/testprof/makefile.shared
+++ b/libtomcrypt/testprof/makefile.shared
diff --git a/testprof/modes_test.c b/libtomcrypt/testprof/modes_test.c
index d394267..d394267 100644
--- a/testprof/modes_test.c
+++ b/libtomcrypt/testprof/modes_test.c
diff --git a/testprof/pkcs_1_test.c b/libtomcrypt/testprof/pkcs_1_test.c
index 1a47d85..1a47d85 100644
--- a/testprof/pkcs_1_test.c
+++ b/libtomcrypt/testprof/pkcs_1_test.c
diff --git a/testprof/rsa_test.c b/libtomcrypt/testprof/rsa_test.c
index 4facdcd..4facdcd 100644
--- a/testprof/rsa_test.c
+++ b/libtomcrypt/testprof/rsa_test.c
diff --git a/testprof/store_test.c b/libtomcrypt/testprof/store_test.c
index 5a38d65..5a38d65 100644
--- a/testprof/store_test.c
+++ b/libtomcrypt/testprof/store_test.c
diff --git a/testprof/test.c b/libtomcrypt/testprof/test.c
index 5a38421..5a38421 100644
--- a/testprof/test.c
+++ b/libtomcrypt/testprof/test.c
diff --git a/testprof/test.key b/libtomcrypt/testprof/test.key
index e4996c3..e4996c3 100644
--- a/testprof/test.key
+++ b/libtomcrypt/testprof/test.key
diff --git a/testprof/tomcrypt_test.h b/libtomcrypt/testprof/tomcrypt_test.h
index 2c66bad..2c66bad 100644
--- a/testprof/tomcrypt_test.h
+++ b/libtomcrypt/testprof/tomcrypt_test.h
diff --git a/testprof/x86_prof.c b/libtomcrypt/testprof/x86_prof.c
index 997180c..997180c 100644
--- a/testprof/x86_prof.c
+++ b/libtomcrypt/testprof/x86_prof.c
diff --git a/libtommath/LICENSE b/libtommath/LICENSE
new file mode 100644
index 0000000..5baa792
--- /dev/null
+++ b/libtommath/LICENSE
@@ -0,0 +1,4 @@
+LibTomMath is hereby released into the Public Domain.  
+
+-- Tom St Denis
+
diff --git a/libtommath/Makefile.in b/libtommath/Makefile.in
new file mode 100644
index 0000000..e96173a
--- /dev/null
+++ b/libtommath/Makefile.in
@@ -0,0 +1,165 @@
+#Makefile for GCC
+#
+#Tom St Denis
+
+#version of library 
+VERSION=0.35
+
+VPATH=@srcdir@
+srcdir=@srcdir@
+
+# Dropbear takes flags from the toplevel makefile
+CFLAGS += -I$(srcdir)
+
+#CFLAGS  +=  -I./ -Wall -W -Wshadow -Wsign-compare
+
+#for speed 
+#CFLAGS += -O3 -funroll-all-loops
+
+#for size 
+#CFLAGS += -Os
+
+#x86 optimizations [should be valid for any GCC install though]
+#CFLAGS  += -fomit-frame-pointer
+
+#debug
+#CFLAGS += -g3
+
+#install as this user
+USER=root
+GROUP=root
+
+default: libtommath.a
+
+#default files to install
+LIBNAME=libtommath.a
+HEADERS=tommath.h tommath_class.h tommath_superclass.h
+
+#LIBPATH-The directory for libtommath to be installed to.
+#INCPATH-The directory to install the header files for libtommath.
+#DATAPATH-The directory to install the pdf docs.
+DESTDIR=
+LIBPATH=/usr/lib
+INCPATH=/usr/include
+DATAPATH=/usr/share/doc/libtommath/pdf
+
+OBJECTS=bncore.o bn_mp_init.o bn_mp_clear.o bn_mp_exch.o bn_mp_grow.o bn_mp_shrink.o \
+bn_mp_clamp.o bn_mp_zero.o  bn_mp_set.o bn_mp_set_int.o bn_mp_init_size.o bn_mp_copy.o \
+bn_mp_init_copy.o bn_mp_abs.o bn_mp_neg.o bn_mp_cmp_mag.o bn_mp_cmp.o bn_mp_cmp_d.o \
+bn_mp_rshd.o bn_mp_lshd.o bn_mp_mod_2d.o bn_mp_div_2d.o bn_mp_mul_2d.o bn_mp_div_2.o \
+bn_mp_mul_2.o bn_s_mp_add.o bn_s_mp_sub.o bn_fast_s_mp_mul_digs.o bn_s_mp_mul_digs.o \
+bn_fast_s_mp_mul_high_digs.o bn_s_mp_mul_high_digs.o bn_fast_s_mp_sqr.o bn_s_mp_sqr.o \
+bn_mp_add.o bn_mp_sub.o bn_mp_karatsuba_mul.o bn_mp_mul.o bn_mp_karatsuba_sqr.o \
+bn_mp_sqr.o bn_mp_div.o bn_mp_mod.o bn_mp_add_d.o bn_mp_sub_d.o bn_mp_mul_d.o \
+bn_mp_div_d.o bn_mp_mod_d.o bn_mp_expt_d.o bn_mp_addmod.o bn_mp_submod.o \
+bn_mp_mulmod.o bn_mp_sqrmod.o bn_mp_gcd.o bn_mp_lcm.o bn_fast_mp_invmod.o bn_mp_invmod.o \
+bn_mp_reduce.o bn_mp_montgomery_setup.o bn_fast_mp_montgomery_reduce.o bn_mp_montgomery_reduce.o \
+bn_mp_exptmod_fast.o bn_mp_exptmod.o bn_mp_2expt.o bn_mp_n_root.o bn_mp_jacobi.o bn_reverse.o \
+bn_mp_count_bits.o bn_mp_read_unsigned_bin.o bn_mp_read_signed_bin.o bn_mp_to_unsigned_bin.o \
+bn_mp_to_signed_bin.o bn_mp_unsigned_bin_size.o bn_mp_signed_bin_size.o  \
+bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o bn_mp_montgomery_calc_normalization.o \
+bn_mp_prime_is_divisible.o bn_prime_tab.o bn_mp_prime_fermat.o bn_mp_prime_miller_rabin.o \
+bn_mp_prime_is_prime.o bn_mp_prime_next_prime.o bn_mp_dr_reduce.o \
+bn_mp_dr_is_modulus.o bn_mp_dr_setup.o bn_mp_reduce_setup.o \
+bn_mp_toom_mul.o bn_mp_toom_sqr.o bn_mp_div_3.o bn_s_mp_exptmod.o \
+bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o \
+bn_mp_reduce_2k_l.o bn_mp_reduce_is_2k_l.o bn_mp_reduce_2k_setup_l.o \
+bn_mp_radix_smap.o bn_mp_read_radix.o bn_mp_toradix.o bn_mp_radix_size.o \
+bn_mp_fread.o bn_mp_fwrite.o bn_mp_cnt_lsb.o bn_error.o \
+bn_mp_init_multi.o bn_mp_clear_multi.o bn_mp_exteuclid.o bn_mp_toradix_n.o \
+bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o bn_mp_init_set.o \
+bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o \
+bn_mp_to_signed_bin_n.o bn_mp_to_unsigned_bin_n.o
+
+libtommath.a:  $(OBJECTS)
+	$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
+	$(RANLIB) libtommath.a
+
+#make a profiled library (takes a while!!!)
+#
+# This will build the library with profile generation
+# then run the test demo and rebuild the library.
+# 
+# So far I've seen improvements in the MP math
+profiled:
+	make CFLAGS="$(CFLAGS) -fprofile-arcs -DTESTING" timing
+	./ltmtest
+	rm -f *.a *.o ltmtest
+	make CFLAGS="$(CFLAGS) -fbranch-probabilities"
+
+#make a single object profiled library 
+profiled_single:
+	perl gen.pl
+	$(CC) $(CFLAGS) -fprofile-arcs -DTESTING -c mpi.c -o mpi.o
+	$(CC) $(CFLAGS) -DTESTING -DTIMER demo/timing.c mpi.o -o ltmtest
+	./ltmtest
+	rm -f *.o ltmtest
+	$(CC) $(CFLAGS) -fbranch-probabilities -DTESTING -c mpi.c -o mpi.o
+	$(AR) $(ARFLAGS) libtommath.a mpi.o
+	ranlib libtommath.a	
+
+install: libtommath.a
+	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(LIBPATH)
+	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
+	install -g $(GROUP) -o $(USER) $(LIBNAME) $(DESTDIR)$(LIBPATH)
+	install -g $(GROUP) -o $(USER) $(HEADERS) $(DESTDIR)$(INCPATH)
+
+test: libtommath.a demo/demo.o
+	$(CC) $(CFLAGS) demo/demo.o libtommath.a -o test
+	
+mtest: test	
+	cd mtest ; $(CC) $(CFLAGS) mtest.c -o mtest
+        
+timing: libtommath.a
+	$(CC) $(CFLAGS) -DTIMER demo/timing.c libtommath.a -o ltmtest
+
+# makes the LTM book DVI file, requires tetex, perl and makeindex [part of tetex I think]
+docdvi: tommath.src
+	cd pics ; make 
+	echo "hello" > tommath.ind
+	perl booker.pl
+	latex tommath > /dev/null
+	latex tommath > /dev/null
+	makeindex tommath
+	latex tommath > /dev/null
+
+# poster, makes the single page PDF poster
+poster: poster.tex
+	pdflatex poster
+	rm -f poster.aux poster.log 
+
+# makes the LTM book PDF file, requires tetex, cleans up the LaTeX temp files
+docs:   docdvi
+	dvipdf tommath
+	rm -f tommath.log tommath.aux tommath.dvi tommath.idx tommath.toc tommath.lof tommath.ind tommath.ilg
+	cd pics ; make clean
+	
+#LTM user manual
+mandvi: bn.tex
+	echo "hello" > bn.ind
+	latex bn > /dev/null
+	latex bn > /dev/null
+	makeindex bn
+	latex bn > /dev/null
+
+#LTM user manual [pdf]
+manual:	mandvi
+	pdflatex bn >/dev/null
+	rm -f bn.aux bn.dvi bn.log bn.idx bn.lof bn.out bn.toc
+
+pretty: 
+	perl pretty.build
+
+clean:
+	rm -f *.bat *.pdf *.o *.a *.obj *.lib *.exe *.dll etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest mtest/mtest.exe \
+        *.idx *.toc *.log *.aux *.dvi *.lof *.ind *.ilg *.ps *.log *.s mpi.c *.da *.dyn *.dpi tommath.tex *.lo *.la
+	rm -rf .libs
+	cd etc && make clean
+	cd pics && make clean
+
+zipup: clean manual poster docs
+	perl gen.pl ; mv mpi.c pre_gen/ ; \
+	cd .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
+	cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; \
+	tar -c libtommath-$(VERSION)/* | bzip2 -9vvc > ltm-$(VERSION).tar.bz2 ; \
+	zip -9 -r ltm-$(VERSION).zip libtommath-$(VERSION)/*
diff --git a/libtommath/TODO b/libtommath/TODO
new file mode 100644
index 0000000..deffba1
--- /dev/null
+++ b/libtommath/TODO
@@ -0,0 +1,16 @@
+things for book in order of importance...
+
+- Fix up pseudo-code [only] for combas that are not consistent with source
+- Start in chapter 3 [basics] and work up...
+   - re-write to prose [less abrupt]
+   - clean up pseudo code [spacing]
+   - more examples where appropriate and figures
+
+Goal:
+   - Get sync done by mid January [roughly 8-12 hours work]
+   - Finish ch3-6 by end of January [roughly 12-16 hours of work]
+   - Finish ch7-end by mid Feb [roughly 20-24 hours of work].
+
+Goal isn't "first edition" but merely cleaner to read.
+
+
diff --git a/libtommath/bn.tex b/libtommath/bn.tex
new file mode 100644
index 0000000..244bd6f
--- /dev/null
+++ b/libtommath/bn.tex
@@ -0,0 +1,1835 @@
+\documentclass[b5paper]{book}
+\usepackage{hyperref}
+\usepackage{makeidx}
+\usepackage{amssymb}
+\usepackage{color}
+\usepackage{alltt}
+\usepackage{graphicx}
+\usepackage{layout}
+\def\union{\cup}
+\def\intersect{\cap}
+\def\getsrandom{\stackrel{\rm R}{\gets}}
+\def\cross{\times}
+\def\cat{\hspace{0.5em} \| \hspace{0.5em}}
+\def\catn{$\|$}
+\def\divides{\hspace{0.3em} | \hspace{0.3em}}
+\def\nequiv{\not\equiv}
+\def\approx{\raisebox{0.2ex}{\mbox{\small $\sim$}}}
+\def\lcm{{\rm lcm}}
+\def\gcd{{\rm gcd}}
+\def\log{{\rm log}}
+\def\ord{{\rm ord}}
+\def\abs{{\mathit abs}}
+\def\rep{{\mathit rep}}
+\def\mod{{\mathit\ mod\ }}
+\renewcommand{\pmod}[1]{\ ({\rm mod\ }{#1})}
+\newcommand{\floor}[1]{\left\lfloor{#1}\right\rfloor}
+\newcommand{\ceil}[1]{\left\lceil{#1}\right\rceil}
+\def\Or{{\rm\ or\ }}
+\def\And{{\rm\ and\ }}
+\def\iff{\hspace{1em}\Longleftrightarrow\hspace{1em}}
+\def\implies{\Rightarrow}
+\def\undefined{{\rm ``undefined"}}
+\def\Proof{\vspace{1ex}\noindent {\bf Proof:}\hspace{1em}}
+\let\oldphi\phi
+\def\phi{\varphi}
+\def\Pr{{\rm Pr}}
+\newcommand{\str}[1]{{\mathbf{#1}}}
+\def\F{{\mathbb F}}
+\def\N{{\mathbb N}}
+\def\Z{{\mathbb Z}}
+\def\R{{\mathbb R}}
+\def\C{{\mathbb C}}
+\def\Q{{\mathbb Q}}
+\definecolor{DGray}{gray}{0.5}
+\newcommand{\emailaddr}[1]{\mbox{$<${#1}$>$}}
+\def\twiddle{\raisebox{0.3ex}{\mbox{\tiny $\sim$}}}
+\def\gap{\vspace{0.5ex}}
+\makeindex
+\begin{document}
+\frontmatter
+\pagestyle{empty}
+\title{LibTomMath User Manual \\ v0.35}
+\author{Tom St Denis \\ tomstdenis@iahu.ca}
+\maketitle
+This text, the library and the accompanying textbook are all hereby placed in the public domain.  This book has been 
+formatted for B5 [176x250] paper using the \LaTeX{} {\em book} macro package.
+
+\vspace{10cm}
+
+\begin{flushright}Open Source.  Open Academia.  Open Minds.
+
+\mbox{ }
+
+Tom St Denis,
+
+Ontario, Canada
+\end{flushright}
+
+\tableofcontents
+\listoffigures
+\mainmatter
+\pagestyle{headings}
+\chapter{Introduction}
+\section{What is LibTomMath?}
+LibTomMath is a library of source code which provides a series of efficient and carefully written functions for manipulating
+large integer numbers.  It was written in portable ISO C source code so that it will build on any platform with a conforming
+C compiler.  
+
+In a nutshell the library was written from scratch with verbose comments to help instruct computer science students how
+to implement ``bignum'' math.  However, the resulting code has proven to be very useful.  It has been used by numerous 
+universities, commercial and open source software developers.  It has been used on a variety of platforms ranging from
+Linux and Windows based x86 to ARM based Gameboys and PPC based MacOS machines.  
+
+\section{License}
+As of the v0.25 the library source code has been placed in the public domain with every new release.  As of the v0.28
+release the textbook ``Implementing Multiple Precision Arithmetic'' has been placed in the public domain with every new
+release as well.  This textbook is meant to compliment the project by providing a more solid walkthrough of the development
+algorithms used in the library.
+
+Since both\footnote{Note that the MPI files under mtest/ are copyrighted by Michael Fromberger.  They are not required to use LibTomMath.} are in the 
+public domain everyone is entitled to do with them as they see fit.
+
+\section{Building LibTomMath}
+
+LibTomMath is meant to be very ``GCC friendly'' as it comes with a makefile well suited for GCC.  However, the library will
+also build in MSVC, Borland C out of the box.  For any other ISO C compiler a makefile will have to be made by the end
+developer.  
+
+\subsection{Static Libraries}
+To build as a static library for GCC issue the following
+\begin{alltt}
+make
+\end{alltt}
+
+command.  This will build the library and archive the object files in ``libtommath.a''.  Now you link against 
+that and include ``tommath.h'' within your programs.  Alternatively to build with MSVC issue the following
+\begin{alltt}
+nmake -f makefile.msvc
+\end{alltt}
+
+This will build the library and archive the object files in ``tommath.lib''.  This has been tested with MSVC 
+version 6.00 with service pack 5.  
+
+\subsection{Shared Libraries}
+To build as a shared library for GCC issue the following
+\begin{alltt}
+make -f makefile.shared
+\end{alltt}
+This requires the ``libtool'' package (common on most Linux/BSD systems).  It will build LibTomMath as both shared
+and static then install (by default) into /usr/lib as well as install the header files in /usr/include.  The shared 
+library (resource) will be called ``libtommath.la'' while the static library called ``libtommath.a''.  Generally 
+you use libtool to link your application against the shared object.  
+
+There is limited support for making a ``DLL'' in windows via the ``makefile.cygwin\_dll'' makefile.  It requires 
+Cygwin to work with since it requires the auto-export/import functionality.  The resulting DLL and import library 
+``libtommath.dll.a'' can be used to link LibTomMath dynamically to any Windows program using Cygwin.
+
+\subsection{Testing}
+To build the library and the test harness type
+
+\begin{alltt}
+make test
+\end{alltt}
+
+This will build the library, ``test'' and ``mtest/mtest''.  The ``test'' program will accept test vectors and verify the
+results.  ``mtest/mtest'' will generate test vectors using the MPI library by Michael Fromberger\footnote{A copy of MPI
+is included in the package}.  Simply pipe mtest into test using
+
+\begin{alltt}
+mtest/mtest | test
+\end{alltt}
+
+If you do not have a ``/dev/urandom'' style RNG source you will have to write your own PRNG and simply pipe that into 
+mtest.  For example, if your PRNG program is called ``myprng'' simply invoke
+
+\begin{alltt}
+myprng | mtest/mtest | test
+\end{alltt}
+
+This will output a row of numbers that are increasing.  Each column is a different test (such as addition, multiplication, etc)
+that is being performed.  The numbers represent how many times the test was invoked.  If an error is detected the program
+will exit with a dump of the relevent numbers it was working with.
+
+\section{Build Configuration}
+LibTomMath can configured at build time in three phases we shall call ``depends'', ``tweaks'' and ``trims''.  
+Each phase changes how the library is built and they are applied one after another respectively.  
+
+To make the system more powerful you can tweak the build process.  Classes are defined in the file
+``tommath\_superclass.h''.  By default, the symbol ``LTM\_ALL'' shall be defined which simply 
+instructs the system to build all of the functions.  This is how LibTomMath used to be packaged.  This will give you 
+access to every function LibTomMath offers.
+
+However, there are cases where such a build is not optional.  For instance, you want to perform RSA operations.  You 
+don't need the vast majority of the library to perform these operations.  Aside from LTM\_ALL there is 
+another pre--defined class ``SC\_RSA\_1'' which works in conjunction with the RSA from LibTomCrypt.  Additional 
+classes can be defined base on the need of the user.
+
+\subsection{Build Depends}
+In the file tommath\_class.h you will see a large list of C ``defines'' followed by a series of ``ifdefs''
+which further define symbols.  All of the symbols (technically they're macros $\ldots$) represent a given C source
+file.  For instance, BN\_MP\_ADD\_C represents the file ``bn\_mp\_add.c''.  When a define has been enabled the
+function in the respective file will be compiled and linked into the library.  Accordingly when the define
+is absent the file will not be compiled and not contribute any size to the library.
+
+You will also note that the header tommath\_class.h is actually recursively included (it includes itself twice).  
+This is to help resolve as many dependencies as possible.  In the last pass the symbol LTM\_LAST will be defined.  
+This is useful for ``trims''.
+
+\subsection{Build Tweaks}
+A tweak is an algorithm ``alternative''.  For example, to provide tradeoffs (usually between size and space).
+They can be enabled at any pass of the configuration phase.
+
+\begin{small}
+\begin{center}
+\begin{tabular}{|l|l|}
+\hline \textbf{Define} & \textbf{Purpose} \\
+\hline BN\_MP\_DIV\_SMALL & Enables a slower, smaller and equally \\
+                          & functional mp\_div() function \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+
+\subsection{Build Trims}
+A trim is a manner of removing functionality from a function that is not required.  For instance, to perform
+RSA cryptography you only require exponentiation with odd moduli so even moduli support can be safely removed.  
+Build trims are meant to be defined on the last pass of the configuration which means they are to be defined
+only if LTM\_LAST has been defined.
+
+\subsubsection{Moduli Related}
+\begin{small}
+\begin{center}
+\begin{tabular}{|l|l|}
+\hline \textbf{Restriction} & \textbf{Undefine} \\
+\hline Exponentiation with odd moduli only & BN\_S\_MP\_EXPTMOD\_C \\
+                                           & BN\_MP\_REDUCE\_C \\
+                                           & BN\_MP\_REDUCE\_SETUP\_C \\
+                                           & BN\_S\_MP\_MUL\_HIGH\_DIGS\_C \\
+                                           & BN\_FAST\_S\_MP\_MUL\_HIGH\_DIGS\_C \\
+\hline Exponentiation with random odd moduli & (The above plus the following) \\
+                                           & BN\_MP\_REDUCE\_2K\_C \\
+                                           & BN\_MP\_REDUCE\_2K\_SETUP\_C \\
+                                           & BN\_MP\_REDUCE\_IS\_2K\_C \\
+                                           & BN\_MP\_DR\_IS\_MODULUS\_C \\
+                                           & BN\_MP\_DR\_REDUCE\_C \\
+                                           & BN\_MP\_DR\_SETUP\_C \\
+\hline Modular inverse odd moduli only     & BN\_MP\_INVMOD\_SLOW\_C \\
+\hline Modular inverse (both, smaller/slower) & BN\_FAST\_MP\_INVMOD\_C \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+
+\subsubsection{Operand Size Related}
+\begin{small}
+\begin{center}
+\begin{tabular}{|l|l|}
+\hline \textbf{Restriction} & \textbf{Undefine} \\
+\hline Moduli $\le 2560$ bits              & BN\_MP\_MONTGOMERY\_REDUCE\_C \\
+                                           & BN\_S\_MP\_MUL\_DIGS\_C \\
+                                           & BN\_S\_MP\_MUL\_HIGH\_DIGS\_C \\
+                                           & BN\_S\_MP\_SQR\_C \\
+\hline Polynomial Schmolynomial            & BN\_MP\_KARATSUBA\_MUL\_C \\
+                                           & BN\_MP\_KARATSUBA\_SQR\_C \\
+                                           & BN\_MP\_TOOM\_MUL\_C \\ 
+                                           & BN\_MP\_TOOM\_SQR\_C \\
+
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+
+
+\section{Purpose of LibTomMath}
+Unlike  GNU MP (GMP) Library, LIP, OpenSSL or various other commercial kits (Miracl), LibTomMath was not written with 
+bleeding edge performance in mind.  First and foremost LibTomMath was written to be entirely open.  Not only is the 
+source code public domain (unlike various other GPL/etc licensed code), not only is the code freely downloadable but the
+source code is also accessible for computer science students attempting to learn ``BigNum'' or multiple precision
+arithmetic techniques. 
+
+LibTomMath was written to be an instructive collection of source code.  This is why there are many comments, only one
+function per source file and often I use a ``middle-road'' approach where I don't cut corners for an extra 2\% speed
+increase.
+
+Source code alone cannot really teach how the algorithms work which is why I also wrote a textbook that accompanies
+the library (beat that!).
+
+So you may be thinking ``should I use LibTomMath?'' and the answer is a definite maybe.  Let me tabulate what I think
+are the pros and cons of LibTomMath by comparing it to the math routines from GnuPG\footnote{GnuPG v1.2.3 versus LibTomMath v0.28}.
+
+\newpage\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|l|c|c|l|}
+\hline \textbf{Criteria} & \textbf{Pro} & \textbf{Con} & \textbf{Notes} \\
+\hline Few lines of code per file & X & & GnuPG $ = 300.9$, LibTomMath  $ = 71.97$ \\
+\hline Commented function prototypes & X && GnuPG function names are cryptic. \\
+\hline Speed && X & LibTomMath is slower.  \\
+\hline Totally free & X & & GPL has unfavourable restrictions.\\
+\hline Large function base & X & & GnuPG is barebones. \\
+\hline Five modular reduction algorithms & X & & Faster modular exponentiation for a variety of moduli. \\
+\hline Portable & X & & GnuPG requires configuration to build. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{LibTomMath Valuation}
+\end{figure}
+
+It may seem odd to compare LibTomMath to GnuPG since the math in GnuPG is only a small portion of the entire application. 
+However, LibTomMath was written with cryptography in mind.  It provides essentially all of the functions a cryptosystem
+would require when working with large integers.  
+
+So it may feel tempting to just rip the math code out of GnuPG (or GnuMP where it was taken from originally) in your
+own application but I think there are reasons not to.  While LibTomMath is slower than libraries such as GnuMP it is
+not normally significantly slower.  On x86 machines the difference is normally a factor of two when performing modular
+exponentiations.  It depends largely on the processor, compiler and the moduli being used.
+
+Essentially the only time you wouldn't use LibTomMath is when blazing speed is the primary concern.  However,
+on the other side of the coin LibTomMath offers you a totally free (public domain) well structured math library
+that is very flexible, complete and performs well in resource contrained environments.  Fast RSA for example can
+be performed with as little as 8KB of ram for data (again depending on build options).  
+
+\chapter{Getting Started with LibTomMath}
+\section{Building Programs}
+In order to use LibTomMath you must include ``tommath.h'' and link against the appropriate library file (typically 
+libtommath.a).  There is no library initialization required and the entire library is thread safe.
+
+\section{Return Codes}
+There are three possible return codes a function may return.
+
+\index{MP\_OKAY}\index{MP\_YES}\index{MP\_NO}\index{MP\_VAL}\index{MP\_MEM}
+\begin{figure}[here!]
+\begin{center}
+\begin{small}
+\begin{tabular}{|l|l|}
+\hline \textbf{Code} & \textbf{Meaning} \\
+\hline MP\_OKAY & The function succeeded. \\
+\hline MP\_VAL  & The function input was invalid. \\
+\hline MP\_MEM  & Heap memory exhausted. \\
+\hline &\\
+\hline MP\_YES  & Response is yes. \\
+\hline MP\_NO   & Response is no. \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Return Codes}
+\end{figure}
+
+The last two codes listed are not actually ``return'ed'' by a function.  They are placed in an integer (the caller must
+provide the address of an integer it can store to) which the caller can access.  To convert one of the three return codes
+to a string use the following function.
+
+\index{mp\_error\_to\_string}
+\begin{alltt}
+char *mp_error_to_string(int code);
+\end{alltt}
+
+This will return a pointer to a string which describes the given error code.  It will not work for the return codes 
+MP\_YES and MP\_NO.  
+
+\section{Data Types}
+The basic ``multiple precision integer'' type is known as the ``mp\_int'' within LibTomMath.  This data type is used to
+organize all of the data required to manipulate the integer it represents.  Within LibTomMath it has been prototyped
+as the following.
+
+\index{mp\_int}
+\begin{alltt}
+typedef struct  \{
+    int used, alloc, sign;
+    mp_digit *dp;
+\} mp_int;
+\end{alltt}
+
+Where ``mp\_digit'' is a data type that represents individual digits of the integer.  By default, an mp\_digit is the
+ISO C ``unsigned long'' data type and each digit is $28-$bits long.  The mp\_digit type can be configured to suit other
+platforms by defining the appropriate macros.  
+
+All LTM functions that use the mp\_int type will expect a pointer to mp\_int structure.  You must allocate memory to
+hold the structure itself by yourself (whether off stack or heap it doesn't matter).  The very first thing that must be
+done to use an mp\_int is that it must be initialized.
+
+\section{Function Organization}
+
+The arithmetic functions of the library are all organized to have the same style prototype.  That is source operands
+are passed on the left and the destination is on the right.  For instance,
+
+\begin{alltt}
+mp_add(&a, &b, &c);       /* c = a + b */
+mp_mul(&a, &a, &c);       /* c = a * a */
+mp_div(&a, &b, &c, &d);   /* c = [a/b], d = a mod b */
+\end{alltt}
+
+Another feature of the way the functions have been implemented is that source operands can be destination operands as well.
+For instance,
+
+\begin{alltt}
+mp_add(&a, &b, &b);       /* b = a + b */
+mp_div(&a, &b, &a, &c);   /* a = [a/b], c = a mod b */
+\end{alltt}
+
+This allows operands to be re-used which can make programming simpler.
+
+\section{Initialization}
+\subsection{Single Initialization}
+A single mp\_int can be initialized with the ``mp\_init'' function. 
+
+\index{mp\_init}
+\begin{alltt}
+int mp_init (mp_int * a);
+\end{alltt}
+
+This function expects a pointer to an mp\_int structure and will initialize the members of the structure so the mp\_int
+represents the default integer which is zero.  If the functions returns MP\_OKAY then the mp\_int is ready to be used
+by the other LibTomMath functions.
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int number;
+   int result;
+
+   if ((result = mp_init(&number)) != MP_OKAY) \{
+      printf("Error initializing the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* use the number */
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+\subsection{Single Free}
+When you are finished with an mp\_int it is ideal to return the heap it used back to the system.  The following function 
+provides this functionality.
+
+\index{mp\_clear}
+\begin{alltt}
+void mp_clear (mp_int * a);
+\end{alltt}
+
+The function expects a pointer to a previously initialized mp\_int structure and frees the heap it uses.  It sets the 
+pointer\footnote{The ``dp'' member.} within the mp\_int to \textbf{NULL} which is used to prevent double free situations. 
+Is is legal to call mp\_clear() twice on the same mp\_int in a row.  
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int number;
+   int result;
+
+   if ((result = mp_init(&number)) != MP_OKAY) \{
+      printf("Error initializing the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* use the number */
+
+   /* We're done with it. */
+   mp_clear(&number);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+\subsection{Multiple Initializations}
+Certain algorithms require more than one large integer.  In these instances it is ideal to initialize all of the mp\_int
+variables in an ``all or nothing'' fashion.  That is, they are either all initialized successfully or they are all
+not initialized.
+
+The  mp\_init\_multi() function provides this functionality.
+
+\index{mp\_init\_multi} \index{mp\_clear\_multi}
+\begin{alltt}
+int mp_init_multi(mp_int *mp, ...);
+\end{alltt}
+
+It accepts a \textbf{NULL} terminated list of pointers to mp\_int structures.  It will attempt to initialize them all
+at once.  If the function returns MP\_OKAY then all of the mp\_int variables are ready to use, otherwise none of them
+are available for use.  A complementary mp\_clear\_multi() function allows multiple mp\_int variables to be free'd 
+from the heap at the same time.  
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int num1, num2, num3;
+   int result;
+
+   if ((result = mp_init_multi(&num1, 
+                               &num2,
+                               &num3, NULL)) != MP\_OKAY) \{      
+      printf("Error initializing the numbers.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* use the numbers */
+
+   /* We're done with them. */
+   mp_clear_multi(&num1, &num2, &num3, NULL);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+\subsection{Other Initializers}
+To initialized and make a copy of an mp\_int the mp\_init\_copy() function has been provided.  
+
+\index{mp\_init\_copy}
+\begin{alltt}
+int mp_init_copy (mp_int * a, mp_int * b);
+\end{alltt}
+
+This function will initialize $a$ and make it a copy of $b$ if all goes well.
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int num1, num2;
+   int result;
+
+   /* initialize and do work on num1 ... */
+
+   /* We want a copy of num1 in num2 now */
+   if ((result = mp_init_copy(&num2, &num1)) != MP_OKAY) \{
+     printf("Error initializing the copy.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* now num2 is ready and contains a copy of num1 */
+
+   /* We're done with them. */
+   mp_clear_multi(&num1, &num2, NULL);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+Another less common initializer is mp\_init\_size() which allows the user to initialize an mp\_int with a given
+default number of digits.  By default, all initializers allocate \textbf{MP\_PREC} digits.  This function lets
+you override this behaviour.
+
+\index{mp\_init\_size}
+\begin{alltt}
+int mp_init_size (mp_int * a, int size);
+\end{alltt}
+
+The $size$ parameter must be greater than zero.  If the function succeeds the mp\_int $a$ will be initialized
+to have $size$ digits (which are all initially zero).  
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int number;
+   int result;
+
+   /* we need a 60-digit number */
+   if ((result = mp_init_size(&number, 60)) != MP_OKAY) \{
+      printf("Error initializing the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* use the number */
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+\section{Maintenance Functions}
+
+\subsection{Reducing Memory Usage}
+When an mp\_int is in a state where it won't be changed again\footnote{A Diffie-Hellman modulus for instance.} excess
+digits can be removed to return memory to the heap with the mp\_shrink() function.
+
+\index{mp\_shrink}
+\begin{alltt}
+int mp_shrink (mp_int * a);
+\end{alltt}
+
+This will remove excess digits of the mp\_int $a$.  If the operation fails the mp\_int should be intact without the
+excess digits being removed.  Note that you can use a shrunk mp\_int in further computations, however, such operations
+will require heap operations which can be slow.  It is not ideal to shrink mp\_int variables that you will further
+modify in the system (unless you are seriously low on memory).  
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int number;
+   int result;
+
+   if ((result = mp_init(&number)) != MP_OKAY) \{
+      printf("Error initializing the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* use the number [e.g. pre-computation]  */
+
+   /* We're done with it for now. */
+   if ((result = mp_shrink(&number)) != MP_OKAY) \{
+      printf("Error shrinking the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* use it .... */
+
+
+   /* we're done with it. */ 
+   mp_clear(&number);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+\subsection{Adding additional digits}
+
+Within the mp\_int structure are two parameters which control the limitations of the array of digits that represent
+the integer the mp\_int is meant to equal.   The \textit{used} parameter dictates how many digits are significant, that is,
+contribute to the value of the mp\_int.  The \textit{alloc} parameter dictates how many digits are currently available in
+the array.  If you need to perform an operation that requires more digits you will have to mp\_grow() the mp\_int to
+your desired size.  
+
+\index{mp\_grow}
+\begin{alltt}
+int mp_grow (mp_int * a, int size);
+\end{alltt}
+
+This will grow the array of digits of $a$ to $size$.  If the \textit{alloc} parameter is already bigger than
+$size$ the function will not do anything.
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int number;
+   int result;
+
+   if ((result = mp_init(&number)) != MP_OKAY) \{
+      printf("Error initializing the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* use the number */
+
+   /* We need to add 20 digits to the number  */
+   if ((result = mp_grow(&number, number.alloc + 20)) != MP_OKAY) \{
+      printf("Error growing the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+
+   /* use the number */
+
+   /* we're done with it. */ 
+   mp_clear(&number);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+\chapter{Basic Operations}
+\section{Small Constants}
+Setting mp\_ints to small constants is a relatively common operation.  To accomodate these instances there are two
+small constant assignment functions.  The first function is used to set a single digit constant while the second sets
+an ISO C style ``unsigned long'' constant.  The reason for both functions is efficiency.  Setting a single digit is quick but the
+domain of a digit can change (it's always at least $0 \ldots 127$).  
+
+\subsection{Single Digit}
+
+Setting a single digit can be accomplished with the following function.
+
+\index{mp\_set}
+\begin{alltt}
+void mp_set (mp_int * a, mp_digit b);
+\end{alltt}
+
+This will zero the contents of $a$ and make it represent an integer equal to the value of $b$.  Note that this
+function has a return type of \textbf{void}.  It cannot cause an error so it is safe to assume the function
+succeeded.
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int number;
+   int result;
+
+   if ((result = mp_init(&number)) != MP_OKAY) \{
+      printf("Error initializing the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* set the number to 5 */
+   mp_set(&number, 5);
+
+   /* we're done with it. */ 
+   mp_clear(&number);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+\subsection{Long Constants}
+
+To set a constant that is the size of an ISO C ``unsigned long'' and larger than a single digit the following function 
+can be used.
+
+\index{mp\_set\_int}
+\begin{alltt}
+int mp_set_int (mp_int * a, unsigned long b);
+\end{alltt}
+
+This will assign the value of the 32-bit variable $b$ to the mp\_int $a$.  Unlike mp\_set() this function will always
+accept a 32-bit input regardless of the size of a single digit.  However, since the value may span several digits 
+this function can fail if it runs out of heap memory.
+
+To get the ``unsigned long'' copy of an mp\_int the following function can be used.
+
+\index{mp\_get\_int}
+\begin{alltt}
+unsigned long mp_get_int (mp_int * a);
+\end{alltt}
+
+This will return the 32 least significant bits of the mp\_int $a$.  
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int number;
+   int result;
+
+   if ((result = mp_init(&number)) != MP_OKAY) \{
+      printf("Error initializing the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* set the number to 654321 (note this is bigger than 127) */
+   if ((result = mp_set_int(&number, 654321)) != MP_OKAY) \{
+      printf("Error setting the value of the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   printf("number == \%lu", mp_get_int(&number));
+
+   /* we're done with it. */ 
+   mp_clear(&number);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+This should output the following if the program succeeds.
+
+\begin{alltt}
+number == 654321
+\end{alltt}
+
+\subsection{Initialize and Setting Constants}
+To both initialize and set small constants the following two functions are available.
+\index{mp\_init\_set} \index{mp\_init\_set\_int}
+\begin{alltt}
+int mp_init_set (mp_int * a, mp_digit b);
+int mp_init_set_int (mp_int * a, unsigned long b);
+\end{alltt}
+
+Both functions work like the previous counterparts except they first mp\_init $a$ before setting the values.  
+
+\begin{alltt}
+int main(void)
+\{
+   mp_int number1, number2;
+   int    result;
+
+   /* initialize and set a single digit */
+   if ((result = mp_init_set(&number1, 100)) != MP_OKAY) \{
+      printf("Error setting number1: \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}             
+
+   /* initialize and set a long */
+   if ((result = mp_init_set_int(&number2, 1023)) != MP_OKAY) \{
+      printf("Error setting number2: \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* display */
+   printf("Number1, Number2 == \%lu, \%lu",
+          mp_get_int(&number1), mp_get_int(&number2));
+
+   /* clear */
+   mp_clear_multi(&number1, &number2, NULL);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt}
+
+If this program succeeds it shall output.
+\begin{alltt}
+Number1, Number2 == 100, 1023
+\end{alltt}
+
+\section{Comparisons}
+
+Comparisons in LibTomMath are always performed in a ``left to right'' fashion.  There are three possible return codes
+for any comparison.
+
+\index{MP\_GT} \index{MP\_EQ} \index{MP\_LT}
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{|c|c|}
+\hline \textbf{Result Code} & \textbf{Meaning} \\
+\hline MP\_GT & $a > b$ \\
+\hline MP\_EQ & $a = b$ \\
+\hline MP\_LT & $a < b$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Comparison Codes for $a, b$}
+\label{fig:CMP}
+\end{figure}
+
+In figure \ref{fig:CMP} two integers $a$ and $b$ are being compared.  In this case $a$ is said to be ``to the left'' of 
+$b$.  
+
+\subsection{Unsigned comparison}
+
+An unsigned comparison considers only the digits themselves and not the associated \textit{sign} flag of the 
+mp\_int structures.  This is analogous to an absolute comparison.  The function mp\_cmp\_mag() will compare two
+mp\_int variables based on their digits only. 
+
+\index{mp\_cmp\_mag}
+\begin{alltt}
+int mp_cmp_mag(mp_int * a, mp_int * b);
+\end{alltt}
+This will compare $a$ to $b$ placing $a$ to the left of $b$.  This function cannot fail and will return one of the
+three compare codes listed in figure \ref{fig:CMP}.
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int number1, number2;
+   int result;
+
+   if ((result = mp_init_multi(&number1, &number2, NULL)) != MP_OKAY) \{
+      printf("Error initializing the numbers.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* set the number1 to 5 */
+   mp_set(&number1, 5);
+  
+   /* set the number2 to -6 */
+   mp_set(&number2, 6);
+   if ((result = mp_neg(&number2, &number2)) != MP_OKAY) \{
+      printf("Error negating number2.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   switch(mp_cmp_mag(&number1, &number2)) \{
+       case MP_GT:  printf("|number1| > |number2|"); break;
+       case MP_EQ:  printf("|number1| = |number2|"); break;
+       case MP_LT:  printf("|number1| < |number2|"); break;
+   \}
+
+   /* we're done with it. */ 
+   mp_clear_multi(&number1, &number2, NULL);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+If this program\footnote{This function uses the mp\_neg() function which is discussed in section \ref{sec:NEG}.} completes 
+successfully it should print the following.
+
+\begin{alltt}
+|number1| < |number2|
+\end{alltt}
+
+This is because $\vert -6 \vert = 6$ and obviously $5 < 6$.
+
+\subsection{Signed comparison}
+
+To compare two mp\_int variables based on their signed value the mp\_cmp() function is provided.
+
+\index{mp\_cmp}
+\begin{alltt}
+int mp_cmp(mp_int * a, mp_int * b);
+\end{alltt}
+
+This will compare $a$ to the left of $b$.  It will first compare the signs of the two mp\_int variables.  If they
+differ it will return immediately based on their signs.  If the signs are equal then it will compare the digits
+individually.  This function will return one of the compare conditions codes listed in figure \ref{fig:CMP}.
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int number1, number2;
+   int result;
+
+   if ((result = mp_init_multi(&number1, &number2, NULL)) != MP_OKAY) \{
+      printf("Error initializing the numbers.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* set the number1 to 5 */
+   mp_set(&number1, 5);
+  
+   /* set the number2 to -6 */
+   mp_set(&number2, 6);
+   if ((result = mp_neg(&number2, &number2)) != MP_OKAY) \{
+      printf("Error negating number2.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   switch(mp_cmp(&number1, &number2)) \{
+       case MP_GT:  printf("number1 > number2"); break;
+       case MP_EQ:  printf("number1 = number2"); break;
+       case MP_LT:  printf("number1 < number2"); break;
+   \}
+
+   /* we're done with it. */ 
+   mp_clear_multi(&number1, &number2, NULL);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+If this program\footnote{This function uses the mp\_neg() function which is discussed in section \ref{sec:NEG}.} completes 
+successfully it should print the following.
+
+\begin{alltt}
+number1 > number2
+\end{alltt}
+
+\subsection{Single Digit}
+
+To compare a single digit against an mp\_int the following function has been provided.
+
+\index{mp\_cmp\_d}
+\begin{alltt}
+int mp_cmp_d(mp_int * a, mp_digit b);
+\end{alltt}
+
+This will compare $a$ to the left of $b$ using a signed comparison.  Note that it will always treat $b$ as 
+positive.  This function is rather handy when you have to compare against small values such as $1$ (which often
+comes up in cryptography).  The function cannot fail and will return one of the tree compare condition codes
+listed in figure \ref{fig:CMP}.
+
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int number;
+   int result;
+
+   if ((result = mp_init(&number)) != MP_OKAY) \{
+      printf("Error initializing the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* set the number to 5 */
+   mp_set(&number, 5);
+
+   switch(mp_cmp_d(&number, 7)) \{
+       case MP_GT:  printf("number > 7"); break;
+       case MP_EQ:  printf("number = 7"); break;
+       case MP_LT:  printf("number < 7"); break;
+   \}
+
+   /* we're done with it. */ 
+   mp_clear(&number);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+If this program functions properly it will print out the following.
+
+\begin{alltt}
+number < 7
+\end{alltt}
+
+\section{Logical Operations}
+
+Logical operations are operations that can be performed either with simple shifts or boolean operators such as
+AND, XOR and OR directly.  These operations are very quick.
+
+\subsection{Multiplication by two}
+
+Multiplications and divisions by any power of two can be performed with quick logical shifts either left or
+right depending on the operation.  
+
+When multiplying or dividing by two a special case routine can be used which are as follows.
+\index{mp\_mul\_2} \index{mp\_div\_2}
+\begin{alltt}
+int mp_mul_2(mp_int * a, mp_int * b);
+int mp_div_2(mp_int * a, mp_int * b);
+\end{alltt}
+
+The former will assign twice $a$ to $b$ while the latter will assign half $a$ to $b$.  These functions are fast
+since the shift counts and maskes are hardcoded into the routines.
+
+\begin{small} \begin{alltt}
+int main(void)
+\{
+   mp_int number;
+   int result;
+
+   if ((result = mp_init(&number)) != MP_OKAY) \{
+      printf("Error initializing the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   /* set the number to 5 */
+   mp_set(&number, 5);
+
+   /* multiply by two */
+   if ((result = mp\_mul\_2(&number, &number)) != MP_OKAY) \{
+      printf("Error multiplying the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+   switch(mp_cmp_d(&number, 7)) \{
+       case MP_GT:  printf("2*number > 7"); break;
+       case MP_EQ:  printf("2*number = 7"); break;
+       case MP_LT:  printf("2*number < 7"); break;
+   \}
+
+   /* now divide by two */
+   if ((result = mp\_div\_2(&number, &number)) != MP_OKAY) \{
+      printf("Error dividing the number.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+   switch(mp_cmp_d(&number, 7)) \{
+       case MP_GT:  printf("2*number/2 > 7"); break;
+       case MP_EQ:  printf("2*number/2 = 7"); break;
+       case MP_LT:  printf("2*number/2 < 7"); break;
+   \}
+
+   /* we're done with it. */ 
+   mp_clear(&number);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} \end{small}
+
+If this program is successful it will print out the following text.
+
+\begin{alltt}
+2*number > 7
+2*number/2 < 7
+\end{alltt}
+
+Since $10 > 7$ and $5 < 7$.  To multiply by a power of two the following function can be used.
+
+\index{mp\_mul\_2d}
+\begin{alltt}
+int mp_mul_2d(mp_int * a, int b, mp_int * c);
+\end{alltt}
+
+This will multiply $a$ by $2^b$ and store the result in ``c''.  If the value of $b$ is less than or equal to 
+zero the function will copy $a$ to ``c'' without performing any further actions.  
+
+To divide by a power of two use the following.
+
+\index{mp\_div\_2d}
+\begin{alltt}
+int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d);
+\end{alltt}
+Which will divide $a$ by $2^b$, store the quotient in ``c'' and the remainder in ``d'.  If $b \le 0$ then the
+function simply copies $a$ over to ``c'' and zeroes $d$.  The variable $d$ may be passed as a \textbf{NULL}
+value to signal that the remainder is not desired.
+
+\subsection{Polynomial Basis Operations}
+
+Strictly speaking the organization of the integers within the mp\_int structures is what is known as a 
+``polynomial basis''.  This simply means a field element is stored by divisions of a radix.  For example, if
+$f(x) = \sum_{i=0}^{k} y_ix^k$ for any vector $\vec y$ then the array of digits in $\vec y$ are said to be 
+the polynomial basis representation of $z$ if $f(\beta) = z$ for a given radix $\beta$.  
+
+To multiply by the polynomial $g(x) = x$ all you have todo is shift the digits of the basis left one place.  The
+following function provides this operation.
+
+\index{mp\_lshd}
+\begin{alltt}
+int mp_lshd (mp_int * a, int b);
+\end{alltt}
+
+This will multiply $a$ in place by $x^b$ which is equivalent to shifting the digits left $b$ places and inserting zeroes
+in the least significant digits.  Similarly to divide by a power of $x$ the following function is provided.
+
+\index{mp\_rshd}
+\begin{alltt}
+void mp_rshd (mp_int * a, int b)
+\end{alltt}
+This will divide $a$ in place by $x^b$ and discard the remainder.  This function cannot fail as it performs the operations
+in place and no new digits are required to complete it.
+
+\subsection{AND, OR and XOR Operations}
+
+While AND, OR and XOR operations are not typical ``bignum functions'' they can be useful in several instances.  The
+three functions are prototyped as follows.
+
+\index{mp\_or} \index{mp\_and} \index{mp\_xor}
+\begin{alltt}
+int mp_or  (mp_int * a, mp_int * b, mp_int * c);
+int mp_and (mp_int * a, mp_int * b, mp_int * c);
+int mp_xor (mp_int * a, mp_int * b, mp_int * c);
+\end{alltt}
+
+Which compute $c = a \odot b$ where $\odot$ is one of OR, AND or XOR.  
+
+\section{Addition and Subtraction}
+
+To compute an addition or subtraction the following two functions can be used.
+
+\index{mp\_add} \index{mp\_sub}
+\begin{alltt}
+int mp_add (mp_int * a, mp_int * b, mp_int * c);
+int mp_sub (mp_int * a, mp_int * b, mp_int * c)
+\end{alltt}
+
+Which perform $c = a \odot b$ where $\odot$ is one of signed addition or subtraction.  The operations are fully sign
+aware.
+
+\section{Sign Manipulation}
+\subsection{Negation}
+\label{sec:NEG}
+Simple integer negation can be performed with the following.
+
+\index{mp\_neg}
+\begin{alltt}
+int mp_neg (mp_int * a, mp_int * b);
+\end{alltt}
+
+Which assigns $-a$ to $b$.  
+
+\subsection{Absolute}
+Simple integer absolutes can be performed with the following.
+
+\index{mp\_neg}
+\begin{alltt}
+int mp_abs (mp_int * a, mp_int * b);
+\end{alltt}
+
+Which assigns $\vert a \vert$ to $b$.  
+
+\section{Integer Division and Remainder}
+To perform a complete and general integer division with remainder use the following function.
+
+\index{mp\_div}
+\begin{alltt}
+int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d);
+\end{alltt}
+                                                        
+This divides $a$ by $b$ and stores the quotient in $c$ and $d$.  The signed quotient is computed such that 
+$bc + d = a$.  Note that either of $c$ or $d$ can be set to \textbf{NULL} if their value is not required.  If 
+$b$ is zero the function returns \textbf{MP\_VAL}.  
+
+
+\chapter{Multiplication and Squaring}
+\section{Multiplication}
+A full signed integer multiplication can be performed with the following.
+\index{mp\_mul}
+\begin{alltt}
+int mp_mul (mp_int * a, mp_int * b, mp_int * c);
+\end{alltt}
+Which assigns the full signed product $ab$ to $c$.  This function actually breaks into one of four cases which are 
+specific multiplication routines optimized for given parameters.  First there are the Toom-Cook multiplications which
+should only be used with very large inputs.  This is followed by the Karatsuba multiplications which are for moderate
+sized inputs.  Then followed by the Comba and baseline multipliers.
+
+Fortunately for the developer you don't really need to know this unless you really want to fine tune the system.  mp\_mul()
+will determine on its own\footnote{Some tweaking may be required.} what routine to use automatically when it is called.
+
+\begin{alltt}
+int main(void)
+\{
+   mp_int number1, number2;
+   int result;
+
+   /* Initialize the numbers */
+   if ((result = mp_init_multi(&number1, 
+                               &number2, NULL)) != MP_OKAY) \{
+      printf("Error initializing the numbers.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* set the terms */
+   if ((result = mp_set_int(&number, 257)) != MP_OKAY) \{
+      printf("Error setting number1.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   if ((result = mp_set_int(&number2, 1023)) != MP_OKAY) \{
+      printf("Error setting number2.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* multiply them */
+   if ((result = mp_mul(&number1, &number2,
+                        &number1)) != MP_OKAY) \{
+      printf("Error multiplying terms.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* display */
+   printf("number1 * number2 == \%lu", mp_get_int(&number1));
+
+   /* free terms and return */
+   mp_clear_multi(&number1, &number2, NULL);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt}   
+
+If this program succeeds it shall output the following.
+
+\begin{alltt}
+number1 * number2 == 262911
+\end{alltt}
+
+\section{Squaring}
+Since squaring can be performed faster than multiplication it is performed it's own function instead of just using
+mp\_mul().
+
+\index{mp\_sqr}
+\begin{alltt}
+int mp_sqr (mp_int * a, mp_int * b);
+\end{alltt}
+
+Will square $a$ and store it in $b$.  Like the case of multiplication there are four different squaring
+algorithms all which can be called from mp\_sqr().  It is ideal to use mp\_sqr over mp\_mul when squaring terms because
+of the speed difference.  
+
+\section{Tuning Polynomial Basis Routines}
+
+Both of the Toom-Cook and Karatsuba multiplication algorithms are faster than the traditional $O(n^2)$ approach that
+the Comba and baseline algorithms use.  At $O(n^{1.464973})$ and $O(n^{1.584962})$ running times respectively they require 
+considerably less work.  For example, a 10000-digit multiplication would take roughly 724,000 single precision
+multiplications with Toom-Cook or 100,000,000 single precision multiplications with the standard Comba (a factor
+of 138).
+
+So why not always use Karatsuba or Toom-Cook?   The simple answer is that they have so much overhead that they're not
+actually faster than Comba until you hit distinct  ``cutoff'' points.  For Karatsuba with the default configuration, 
+GCC 3.3.1 and an Athlon XP processor the cutoff point is roughly 110 digits (about 70 for the Intel P4).  That is, at 
+110 digits Karatsuba and Comba multiplications just about break even and for 110+ digits Karatsuba is faster.
+
+Toom-Cook has incredible overhead and is probably only useful for very large inputs.  So far no known cutoff points 
+exist and for the most part I just set the cutoff points very high to make sure they're not called.
+
+A demo program in the ``etc/'' directory of the project called ``tune.c'' can be used to find the cutoff points.  This
+can be built with GCC as follows
+
+\begin{alltt}
+make XXX
+\end{alltt}
+Where ``XXX'' is one of the following entries from the table \ref{fig:tuning}.
+
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+\begin{tabular}{|l|l|}
+\hline \textbf{Value of XXX} & \textbf{Meaning} \\
+\hline tune & Builds portable tuning application \\
+\hline tune86 & Builds x86 (pentium and up) program for COFF \\
+\hline tune86c & Builds x86 program for Cygwin \\
+\hline tune86l & Builds x86 program for Linux (ELF format) \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Build Names for Tuning Programs}
+\label{fig:tuning}
+\end{figure}
+
+When the program is running it will output a series of measurements for different cutoff points.  It will first find
+good Karatsuba squaring and multiplication points.  Then it proceeds to find Toom-Cook points.  Note that the Toom-Cook
+tuning takes a very long time as the cutoff points are likely to be very high.
+
+\chapter{Modular Reduction}
+
+Modular reduction is process of taking the remainder of one quantity divided by another.  Expressed 
+as (\ref{eqn:mod}) the modular reduction is equivalent to the remainder of $b$ divided by $c$.  
+
+\begin{equation}
+a \equiv b \mbox{ (mod }c\mbox{)}
+\label{eqn:mod}
+\end{equation}
+
+Of particular interest to cryptography are reductions where $b$ is limited to the range $0 \le b < c^2$ since particularly 
+fast reduction algorithms can be written for the limited range.  
+
+Note that one of the four optimized reduction algorithms are automatically chosen in the modular exponentiation
+algorithm mp\_exptmod when an appropriate modulus is detected.  
+
+\section{Straight Division}
+In order to effect an arbitrary modular reduction the following algorithm is provided.
+
+\index{mp\_mod}
+\begin{alltt}
+int mp_mod(mp_int *a, mp_int *b, mp_int *c);
+\end{alltt}
+
+This reduces $a$ modulo $b$ and stores the result in $c$.  The sign of $c$ shall agree with the sign 
+of $b$.  This algorithm accepts an input $a$ of any range and is not limited by $0 \le a < b^2$.
+
+\section{Barrett Reduction}
+
+Barrett reduction is a generic optimized reduction algorithm that requires pre--computation to achieve
+a decent speedup over straight division.  First a $\mu$ value must be precomputed with the following function.
+
+\index{mp\_reduce\_setup}
+\begin{alltt}
+int mp_reduce_setup(mp_int *a, mp_int *b);
+\end{alltt}
+
+Given a modulus in $b$ this produces the required $\mu$ value in $a$.  For any given modulus this only has to
+be computed once.  Modular reduction can now be performed with the following.
+
+\index{mp\_reduce}
+\begin{alltt}
+int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
+\end{alltt}
+
+This will reduce $a$ in place modulo $b$ with the precomputed $\mu$ value in $c$.  $a$ must be in the range
+$0 \le a < b^2$.
+
+\begin{alltt}
+int main(void)
+\{
+   mp_int   a, b, c, mu;
+   int      result;
+
+   /* initialize a,b to desired values, mp_init mu, 
+    * c and set c to 1...we want to compute a^3 mod b 
+    */
+
+   /* get mu value */
+   if ((result = mp_reduce_setup(&mu, b)) != MP_OKAY) \{
+      printf("Error getting mu.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* square a to get c = a^2 */
+   if ((result = mp_sqr(&a, &c)) != MP_OKAY) \{
+      printf("Error squaring.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* now reduce `c' modulo b */
+   if ((result = mp_reduce(&c, &b, &mu)) != MP_OKAY) \{
+      printf("Error reducing.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+   
+   /* multiply a to get c = a^3 */
+   if ((result = mp_mul(&a, &c, &c)) != MP_OKAY) \{
+      printf("Error reducing.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* now reduce `c' modulo b  */
+   if ((result = mp_reduce(&c, &b, &mu)) != MP_OKAY) \{
+      printf("Error reducing.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+  
+   /* c now equals a^3 mod b */
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} 
+
+This program will calculate $a^3 \mbox{ mod }b$ if all the functions succeed.  
+
+\section{Montgomery Reduction}
+
+Montgomery is a specialized reduction algorithm for any odd moduli.  Like Barrett reduction a pre--computation
+step is required.  This is accomplished with the following.
+
+\index{mp\_montgomery\_setup}
+\begin{alltt}
+int mp_montgomery_setup(mp_int *a, mp_digit *mp);
+\end{alltt}
+
+For the given odd moduli $a$ the precomputation value is placed in $mp$.  The reduction is computed with the 
+following.
+
+\index{mp\_montgomery\_reduce}
+\begin{alltt}
+int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
+\end{alltt}
+This reduces $a$ in place modulo $m$ with the pre--computed value $mp$.   $a$ must be in the range
+$0 \le a < b^2$.
+
+Montgomery reduction is faster than Barrett reduction for moduli smaller than the ``comba'' limit.  With the default
+setup for instance, the limit is $127$ digits ($3556$--bits).   Note that this function is not limited to
+$127$ digits just that it falls back to a baseline algorithm after that point.  
+
+An important observation is that this reduction does not return $a \mbox{ mod }m$ but $aR^{-1} \mbox{ mod }m$ 
+where $R = \beta^n$, $n$ is the n number of digits in $m$ and $\beta$ is radix used (default is $2^{28}$).  
+
+To quickly calculate $R$ the following function was provided.
+
+\index{mp\_montgomery\_calc\_normalization}
+\begin{alltt}
+int mp_montgomery_calc_normalization(mp_int *a, mp_int *b);
+\end{alltt}
+Which calculates $a = R$ for the odd moduli $b$ without using multiplication or division.  
+
+The normal modus operandi for Montgomery reductions is to normalize the integers before entering the system.  For
+example, to calculate $a^3 \mbox { mod }b$ using Montgomery reduction the value of $a$ can be normalized by
+multiplying it by $R$.  Consider the following code snippet.
+
+\begin{alltt}
+int main(void)
+\{
+   mp_int   a, b, c, R;
+   mp_digit mp;
+   int      result;
+
+   /* initialize a,b to desired values, 
+    * mp_init R, c and set c to 1.... 
+    */
+
+   /* get normalization */
+   if ((result = mp_montgomery_calc_normalization(&R, b)) != MP_OKAY) \{
+      printf("Error getting norm.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* get mp value */
+   if ((result = mp_montgomery_setup(&c, &mp)) != MP_OKAY) \{
+      printf("Error setting up montgomery.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* normalize `a' so now a is equal to aR */
+   if ((result = mp_mulmod(&a, &R, &b, &a)) != MP_OKAY) \{
+      printf("Error computing aR.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* square a to get c = a^2R^2 */
+   if ((result = mp_sqr(&a, &c)) != MP_OKAY) \{
+      printf("Error squaring.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* now reduce `c' back down to c = a^2R^2 * R^-1 == a^2R */
+   if ((result = mp_montgomery_reduce(&c, &b, mp)) != MP_OKAY) \{
+      printf("Error reducing.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+   
+   /* multiply a to get c = a^3R^2 */
+   if ((result = mp_mul(&a, &c, &c)) != MP_OKAY) \{
+      printf("Error reducing.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* now reduce `c' back down to c = a^3R^2 * R^-1 == a^3R */
+   if ((result = mp_montgomery_reduce(&c, &b, mp)) != MP_OKAY) \{
+      printf("Error reducing.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+   
+   /* now reduce (again) `c' back down to c = a^3R * R^-1 == a^3 */
+   if ((result = mp_montgomery_reduce(&c, &b, mp)) != MP_OKAY) \{
+      printf("Error reducing.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* c now equals a^3 mod b */
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} 
+
+This particular example does not look too efficient but it demonstrates the point of the algorithm.  By 
+normalizing the inputs the reduced results are always of the form $aR$ for some variable $a$.  This allows
+a single final reduction to correct for the normalization and the fast reduction used within the algorithm.
+
+For more details consider examining the file \textit{bn\_mp\_exptmod\_fast.c}.
+
+\section{Restricted Dimminished Radix}
+
+``Dimminished Radix'' reduction refers to reduction with respect to moduli that are ameniable to simple
+digit shifting and small multiplications.  In this case the ``restricted'' variant refers to moduli of the
+form $\beta^k - p$ for some $k \ge 0$ and $0 < p < \beta$ where $\beta$ is the radix (default to $2^{28}$).  
+
+As in the case of Montgomery reduction there is a pre--computation phase required for a given modulus.
+
+\index{mp\_dr\_setup}
+\begin{alltt}
+void mp_dr_setup(mp_int *a, mp_digit *d);
+\end{alltt}
+
+This computes the value required for the modulus $a$ and stores it in $d$.  This function cannot fail
+and does not return any error codes.  After the pre--computation a reduction can be performed with the
+following.
+
+\index{mp\_dr\_reduce}
+\begin{alltt}
+int mp_dr_reduce(mp_int *a, mp_int *b, mp_digit mp);
+\end{alltt}
+
+This reduces $a$ in place modulo $b$ with the pre--computed value $mp$.  $b$ must be of a restricted
+dimminished radix form and $a$ must be in the range $0 \le a < b^2$.  Dimminished radix reductions are 
+much faster than both Barrett and Montgomery reductions as they have a much lower asymtotic running time.  
+
+Since the moduli are restricted this algorithm is not particularly useful for something like Rabin, RSA or
+BBS cryptographic purposes.  This reduction algorithm is useful for Diffie-Hellman and ECC where fixed
+primes are acceptable.  
+
+Note that unlike Montgomery reduction there is no normalization process.  The result of this function is
+equal to the correct residue.
+
+\section{Unrestricted Dimminshed Radix}
+
+Unrestricted reductions work much like the restricted counterparts except in this case the moduli is of the 
+form $2^k - p$ for $0 < p < \beta$.  In this sense the unrestricted reductions are more flexible as they 
+can be applied to a wider range of numbers.  
+
+\index{mp\_reduce\_2k\_setup}
+\begin{alltt}
+int mp_reduce_2k_setup(mp_int *a, mp_digit *d);
+\end{alltt}
+
+This will compute the required $d$ value for the given moduli $a$.  
+
+\index{mp\_reduce\_2k}
+\begin{alltt}
+int mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d);
+\end{alltt}
+
+This will reduce $a$ in place modulo $n$ with the pre--computed value $d$.  From my experience this routine is 
+slower than mp\_dr\_reduce but faster for most moduli sizes than the Montgomery reduction.  
+
+\chapter{Exponentiation}
+\section{Single Digit Exponentiation}
+\index{mp\_expt\_d}
+\begin{alltt}
+int mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
+\end{alltt}
+This computes $c = a^b$ using a simple binary left-to-right algorithm.  It is faster than repeated multiplications by 
+$a$ for all values of $b$ greater than three.  
+
+\section{Modular Exponentiation}
+\index{mp\_exptmod}
+\begin{alltt}
+int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+\end{alltt}
+This computes $Y \equiv G^X \mbox{ (mod }P\mbox{)}$ using a variable width sliding window algorithm.  This function
+will automatically detect the fastest modular reduction technique to use during the operation.  For negative values of 
+$X$ the operation is performed as $Y \equiv (G^{-1} \mbox{ mod }P)^{\vert X \vert} \mbox{ (mod }P\mbox{)}$ provided that 
+$gcd(G, P) = 1$.
+
+This function is actually a shell around the two internal exponentiation functions.  This routine will automatically
+detect when Barrett, Montgomery, Restricted and Unrestricted Dimminished Radix based exponentiation can be used.  Generally
+moduli of the a ``restricted dimminished radix'' form lead to the fastest modular exponentiations.  Followed by Montgomery
+and the other two algorithms.
+
+\section{Root Finding}
+\index{mp\_n\_root}
+\begin{alltt}
+int mp_n_root (mp_int * a, mp_digit b, mp_int * c)
+\end{alltt}
+This computes $c = a^{1/b}$ such that $c^b \le a$ and $(c+1)^b > a$.  The implementation of this function is not 
+ideal for values of $b$ greater than three.  It will work but become very slow.  So unless you are working with very small
+numbers (less than 1000 bits) I'd avoid $b > 3$ situations.  Will return a positive root only for even roots and return
+a root with the sign of the input for odd roots.  For example, performing $4^{1/2}$ will return $2$ whereas $(-8)^{1/3}$ 
+will return $-2$.  
+
+This algorithm uses the ``Newton Approximation'' method and will converge on the correct root fairly quickly.  Since
+the algorithm requires raising $a$ to the power of $b$ it is not ideal to attempt to find roots for large
+values of $b$.  If particularly large roots are required then a factor method could be used instead.  For example,
+$a^{1/16}$ is equivalent to $\left (a^{1/4} \right)^{1/4}$ or simply 
+$\left ( \left ( \left ( a^{1/2} \right )^{1/2} \right )^{1/2} \right )^{1/2}$
+
+\chapter{Prime Numbers}
+\section{Trial Division}
+\index{mp\_prime\_is\_divisible}
+\begin{alltt}
+int mp_prime_is_divisible (mp_int * a, int *result)
+\end{alltt}
+This will attempt to evenly divide $a$ by a list of primes\footnote{Default is the first 256 primes.} and store the 
+outcome in ``result''.  That is if $result = 0$ then $a$ is not divisible by the primes, otherwise it is.  Note that 
+if the function does not return \textbf{MP\_OKAY} the value in ``result'' should be considered undefined\footnote{Currently
+the default is to set it to zero first.}.
+
+\section{Fermat Test}
+\index{mp\_prime\_fermat}
+\begin{alltt}
+int mp_prime_fermat (mp_int * a, mp_int * b, int *result)
+\end{alltt}
+Performs a Fermat primality test to the base $b$.  That is it computes $b^a \mbox{ mod }a$ and tests whether the value is
+equal to $b$ or not.  If the values are equal then $a$ is probably prime and $result$ is set to one.  Otherwise $result$
+is set to zero.
+
+\section{Miller-Rabin Test}
+\index{mp\_prime\_miller\_rabin}
+\begin{alltt}
+int mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
+\end{alltt}
+Performs a Miller-Rabin test to the base $b$ of $a$.  This test is much stronger than the Fermat test and is very hard to
+fool (besides with Carmichael numbers).  If $a$ passes the test (therefore is probably prime) $result$ is set to one.  
+Otherwise $result$ is set to zero.  
+
+Note that is suggested that you use the Miller-Rabin test instead of the Fermat test since all of the failures of 
+Miller-Rabin are a subset of the failures of the Fermat test.
+
+\subsection{Required Number of Tests}
+Generally to ensure a number is very likely to be prime you have to perform the Miller-Rabin with at least a half-dozen
+or so unique bases.  However, it has been proven that the probability of failure goes down as the size of the input goes up.
+This is why a simple function has been provided to help out.
+
+\index{mp\_prime\_rabin\_miller\_trials}
+\begin{alltt}
+int mp_prime_rabin_miller_trials(int size)
+\end{alltt}
+This returns the number of trials required for a $2^{-96}$ (or lower) probability of failure for a given ``size'' expressed
+in bits.  This comes in handy specially since larger numbers are slower to test.  For example, a 512-bit number would
+require ten tests whereas a 1024-bit number would only require four tests. 
+
+You should always still perform a trial division before a Miller-Rabin test though.
+
+\section{Primality Testing}
+\index{mp\_prime\_is\_prime}
+\begin{alltt}
+int mp_prime_is_prime (mp_int * a, int t, int *result)
+\end{alltt}
+This will perform a trial division followed by $t$ rounds of Miller-Rabin tests on $a$ and store the result in $result$.  
+If $a$ passes all of the tests $result$ is set to one, otherwise it is set to zero.  Note that $t$ is bounded by 
+$1 \le t < PRIME\_SIZE$ where $PRIME\_SIZE$ is the number of primes in the prime number table (by default this is $256$).
+
+\section{Next Prime}
+\index{mp\_prime\_next\_prime}
+\begin{alltt}
+int mp_prime_next_prime(mp_int *a, int t, int bbs_style)
+\end{alltt}
+This finds the next prime after $a$ that passes mp\_prime\_is\_prime() with $t$ tests.  Set $bbs\_style$ to one if you 
+want only the next prime congruent to $3 \mbox{ mod } 4$, otherwise set it to zero to find any next prime.  
+
+\section{Random Primes}
+\index{mp\_prime\_random}
+\begin{alltt}
+int mp_prime_random(mp_int *a, int t, int size, int bbs, 
+                    ltm_prime_callback cb, void *dat)
+\end{alltt}
+This will find a prime greater than $256^{size}$ which can be ``bbs\_style'' or not depending on $bbs$ and must pass
+$t$ rounds of tests.  The ``ltm\_prime\_callback'' is a typedef for 
+
+\begin{alltt}
+typedef int ltm_prime_callback(unsigned char *dst, int len, void *dat);
+\end{alltt}
+
+Which is a function that must read $len$ bytes (and return the amount stored) into $dst$.  The $dat$ variable is simply
+copied from the original input.  It can be used to pass RNG context data to the callback.  The function 
+mp\_prime\_random() is more suitable for generating primes which must be secret (as in the case of RSA) since there 
+is no skew on the least significant bits.
+
+\textit{Note:}  As of v0.30 of the LibTomMath library this function has been deprecated.  It is still available
+but users are encouraged to use the new mp\_prime\_random\_ex() function instead.
+
+\subsection{Extended Generation}
+\index{mp\_prime\_random\_ex}
+\begin{alltt}
+int mp_prime_random_ex(mp_int *a,    int t, 
+                       int     size, int flags, 
+                       ltm_prime_callback cb, void *dat);
+\end{alltt}
+This will generate a prime in $a$ using $t$ tests of the primality testing algorithms.  The variable $size$
+specifies the bit length of the prime desired.  The variable $flags$ specifies one of several options available
+(see fig. \ref{fig:primeopts}) which can be OR'ed together.  The callback parameters are used as in 
+mp\_prime\_random().
+
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+\begin{tabular}{|r|l|}
+\hline \textbf{Flag}         & \textbf{Meaning} \\
+\hline LTM\_PRIME\_BBS       & Make the prime congruent to $3$ modulo $4$ \\
+\hline LTM\_PRIME\_SAFE      & Make a prime $p$ such that $(p - 1)/2$ is also prime. \\
+                             & This option implies LTM\_PRIME\_BBS as well. \\
+\hline LTM\_PRIME\_2MSB\_OFF & Makes sure that the bit adjacent to the most significant bit \\
+                             & Is forced to zero.  \\
+\hline LTM\_PRIME\_2MSB\_ON  & Makes sure that the bit adjacent to the most significant bit \\
+                             & Is forced to one. \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Primality Generation Options}
+\label{fig:primeopts}
+\end{figure}
+
+\chapter{Input and Output}
+\section{ASCII Conversions}
+\subsection{To ASCII}
+\index{mp\_toradix}
+\begin{alltt}
+int mp_toradix (mp_int * a, char *str, int radix);
+\end{alltt}
+This still store $a$ in ``str'' as a base-``radix'' string of ASCII chars.  This function appends a NUL character
+to terminate the string.  Valid values of ``radix'' line in the range $[2, 64]$.  To determine the size (exact) required
+by the conversion before storing any data use the following function.
+
+\index{mp\_radix\_size}
+\begin{alltt}
+int mp_radix_size (mp_int * a, int radix, int *size)
+\end{alltt}
+This stores in ``size'' the number of characters (including space for the NUL terminator) required.  Upon error this 
+function returns an error code and ``size'' will be zero.  
+
+\subsection{From ASCII}
+\index{mp\_read\_radix}
+\begin{alltt}
+int mp_read_radix (mp_int * a, char *str, int radix);
+\end{alltt}
+This will read the base-``radix'' NUL terminated string from ``str'' into $a$.  It will stop reading when it reads a
+character it does not recognize (which happens to include th NUL char... imagine that...).  A single leading $-$ sign
+can be used to denote a negative number.
+
+\section{Binary Conversions}
+
+Converting an mp\_int to and from binary is another keen idea.
+
+\index{mp\_unsigned\_bin\_size}
+\begin{alltt}
+int mp_unsigned_bin_size(mp_int *a);
+\end{alltt}
+
+This will return the number of bytes (octets) required to store the unsigned copy of the integer $a$.
+
+\index{mp\_to\_unsigned\_bin}
+\begin{alltt}
+int mp_to_unsigned_bin(mp_int *a, unsigned char *b);
+\end{alltt}
+This will store $a$ into the buffer $b$ in big--endian format.  Fortunately this is exactly what DER (or is it ASN?)
+requires.  It does not store the sign of the integer.
+
+\index{mp\_read\_unsigned\_bin}
+\begin{alltt}
+int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c);
+\end{alltt}
+This will read in an unsigned big--endian array of bytes (octets) from $b$ of length $c$ into $a$.  The resulting
+integer $a$ will always be positive.
+
+For those who acknowledge the existence of negative numbers (heretic!) there are ``signed'' versions of the
+previous functions.
+
+\begin{alltt}
+int mp_signed_bin_size(mp_int *a);
+int mp_read_signed_bin(mp_int *a, unsigned char *b, int c);
+int mp_to_signed_bin(mp_int *a, unsigned char *b);
+\end{alltt}
+They operate essentially the same as the unsigned copies except they prefix the data with zero or non--zero
+byte depending on the sign.  If the sign is zpos (e.g. not negative) the prefix is zero, otherwise the prefix
+is non--zero.  
+
+\chapter{Algebraic Functions}
+\section{Extended Euclidean Algorithm}
+\index{mp\_exteuclid}
+\begin{alltt}
+int mp_exteuclid(mp_int *a, mp_int *b, 
+                 mp_int *U1, mp_int *U2, mp_int *U3);
+\end{alltt}
+
+This finds the triple U1/U2/U3 using the Extended Euclidean algorithm such that the following equation holds.
+
+\begin{equation}
+a \cdot U1 + b \cdot U2 = U3
+\end{equation}
+
+Any of the U1/U2/U3 paramters can be set to \textbf{NULL} if they are not desired.  
+
+\section{Greatest Common Divisor}
+\index{mp\_gcd}
+\begin{alltt}
+int mp_gcd (mp_int * a, mp_int * b, mp_int * c)
+\end{alltt}
+This will compute the greatest common divisor of $a$ and $b$ and store it in $c$.
+
+\section{Least Common Multiple}
+\index{mp\_lcm}
+\begin{alltt}
+int mp_lcm (mp_int * a, mp_int * b, mp_int * c)
+\end{alltt}
+This will compute the least common multiple of $a$ and $b$ and store it in $c$.
+
+\section{Jacobi Symbol}
+\index{mp\_jacobi}
+\begin{alltt}
+int mp_jacobi (mp_int * a, mp_int * p, int *c)
+\end{alltt}
+This will compute the Jacobi symbol for $a$ with respect to $p$.  If $p$ is prime this essentially computes the Legendre
+symbol.  The result is stored in $c$ and can take on one of three values $\lbrace -1, 0, 1 \rbrace$.  If $p$ is prime
+then the result will be $-1$ when $a$ is not a quadratic residue modulo $p$.  The result will be $0$ if $a$ divides $p$
+and the result will be $1$ if $a$ is a quadratic residue modulo $p$.  
+
+\section{Modular Inverse}
+\index{mp\_invmod}
+\begin{alltt}
+int mp_invmod (mp_int * a, mp_int * b, mp_int * c)
+\end{alltt}
+Computes the multiplicative inverse of $a$ modulo $b$ and stores the result in $c$ such that $ac \equiv 1 \mbox{ (mod }b\mbox{)}$.
+
+\section{Single Digit Functions}
+
+For those using small numbers (\textit{snicker snicker}) there are several ``helper'' functions
+
+\index{mp\_add\_d} \index{mp\_sub\_d} \index{mp\_mul\_d} \index{mp\_div\_d} \index{mp\_mod\_d}
+\begin{alltt}
+int mp_add_d(mp_int *a, mp_digit b, mp_int *c);
+int mp_sub_d(mp_int *a, mp_digit b, mp_int *c);
+int mp_mul_d(mp_int *a, mp_digit b, mp_int *c);
+int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d);
+int mp_mod_d(mp_int *a, mp_digit b, mp_digit *c);
+\end{alltt}
+
+These work like the full mp\_int capable variants except the second parameter $b$ is a mp\_digit.  These
+functions fairly handy if you have to work with relatively small numbers since you will not have to allocate
+an entire mp\_int to store a number like $1$ or $2$.
+
+\input{bn.ind}
+
+\end{document}
diff --git a/libtommath/bn_error.c b/libtommath/bn_error.c
new file mode 100644
index 0000000..1546784
--- /dev/null
+++ b/libtommath/bn_error.c
@@ -0,0 +1,43 @@
+#include <tommath.h>
+#ifdef BN_ERROR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+static const struct {
+     int code;
+     char *msg;
+} msgs[] = {
+     { MP_OKAY, "Successful" },
+     { MP_MEM,  "Out of heap" },
+     { MP_VAL,  "Value out of range" }
+};
+
+/* return a char * string for a given code */
+char *mp_error_to_string(int code)
+{
+   int x;
+
+   /* scan the lookup table for the given message */
+   for (x = 0; x < (int)(sizeof(msgs) / sizeof(msgs[0])); x++) {
+       if (msgs[x].code == code) {
+          return msgs[x].msg;
+       }
+   }
+
+   /* generic reply for invalid code */
+   return "Invalid error code";
+}
+
+#endif
diff --git a/libtommath/bn_fast_mp_invmod.c b/libtommath/bn_fast_mp_invmod.c
new file mode 100644
index 0000000..acc8364
--- /dev/null
+++ b/libtommath/bn_fast_mp_invmod.c
@@ -0,0 +1,144 @@
+#include <tommath.h>
+#ifdef BN_FAST_MP_INVMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* computes the modular inverse via binary extended euclidean algorithm, 
+ * that is c = 1/a mod b 
+ *
+ * Based on slow invmod except this is optimized for the case where b is 
+ * odd as per HAC Note 14.64 on pp. 610
+ */
+int fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  x, y, u, v, B, D;
+  int     res, neg;
+
+  /* 2. [modified] b must be odd   */
+  if (mp_iseven (b) == 1) {
+    return MP_VAL;
+  }
+
+  /* init all our temps */
+  if ((res = mp_init_multi(&x, &y, &u, &v, &B, &D, NULL)) != MP_OKAY) {
+     return res;
+  }
+
+  /* x == modulus, y == value to invert */
+  if ((res = mp_copy (b, &x)) != MP_OKAY) {
+    goto LBL_ERR;
+  }
+
+  /* we need y = |a| */
+  if ((res = mp_mod (a, b, &y)) != MP_OKAY) {
+    goto LBL_ERR;
+  }
+
+  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
+  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
+    goto LBL_ERR;
+  }
+  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
+    goto LBL_ERR;
+  }
+  mp_set (&D, 1);
+
+top:
+  /* 4.  while u is even do */
+  while (mp_iseven (&u) == 1) {
+    /* 4.1 u = u/2 */
+    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+    /* 4.2 if B is odd then */
+    if (mp_isodd (&B) == 1) {
+      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
+        goto LBL_ERR;
+      }
+    }
+    /* B = B/2 */
+    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+  }
+
+  /* 5.  while v is even do */
+  while (mp_iseven (&v) == 1) {
+    /* 5.1 v = v/2 */
+    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+    /* 5.2 if D is odd then */
+    if (mp_isodd (&D) == 1) {
+      /* D = (D-x)/2 */
+      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
+        goto LBL_ERR;
+      }
+    }
+    /* D = D/2 */
+    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+  }
+
+  /* 6.  if u >= v then */
+  if (mp_cmp (&u, &v) != MP_LT) {
+    /* u = u - v, B = B - D */
+    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+
+    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+  } else {
+    /* v - v - u, D = D - B */
+    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+
+    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+  }
+
+  /* if not zero goto step 4 */
+  if (mp_iszero (&u) == 0) {
+    goto top;
+  }
+
+  /* now a = C, b = D, gcd == g*v */
+
+  /* if v != 1 then there is no inverse */
+  if (mp_cmp_d (&v, 1) != MP_EQ) {
+    res = MP_VAL;
+    goto LBL_ERR;
+  }
+
+  /* b is now the inverse */
+  neg = a->sign;
+  while (D.sign == MP_NEG) {
+    if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+  }
+  mp_exch (&D, c);
+  c->sign = neg;
+  res = MP_OKAY;
+
+LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
+  return res;
+}
+#endif
diff --git a/libtommath/bn_fast_mp_montgomery_reduce.c b/libtommath/bn_fast_mp_montgomery_reduce.c
new file mode 100644
index 0000000..14f307f
--- /dev/null
+++ b/libtommath/bn_fast_mp_montgomery_reduce.c
@@ -0,0 +1,168 @@
+#include <tommath.h>
+#ifdef BN_FAST_MP_MONTGOMERY_REDUCE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* computes xR**-1 == x (mod N) via Montgomery Reduction
+ *
+ * This is an optimized implementation of montgomery_reduce
+ * which uses the comba method to quickly calculate the columns of the
+ * reduction.
+ *
+ * Based on Algorithm 14.32 on pp.601 of HAC.
+*/
+int fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
+{
+  int     ix, res, olduse;
+  mp_word W[MP_WARRAY];
+
+  /* get old used count */
+  olduse = x->used;
+
+  /* grow a as required */
+  if (x->alloc < n->used + 1) {
+    if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* first we have to get the digits of the input into
+   * an array of double precision words W[...]
+   */
+  {
+    register mp_word *_W;
+    register mp_digit *tmpx;
+
+    /* alias for the W[] array */
+    _W   = W;
+
+    /* alias for the digits of  x*/
+    tmpx = x->dp;
+
+    /* copy the digits of a into W[0..a->used-1] */
+    for (ix = 0; ix < x->used; ix++) {
+      *_W++ = *tmpx++;
+    }
+
+    /* zero the high words of W[a->used..m->used*2] */
+    for (; ix < n->used * 2 + 1; ix++) {
+      *_W++ = 0;
+    }
+  }
+
+  /* now we proceed to zero successive digits
+   * from the least significant upwards
+   */
+  for (ix = 0; ix < n->used; ix++) {
+    /* mu = ai * m' mod b
+     *
+     * We avoid a double precision multiplication (which isn't required)
+     * by casting the value down to a mp_digit.  Note this requires
+     * that W[ix-1] have  the carry cleared (see after the inner loop)
+     */
+    register mp_digit mu;
+    mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
+
+    /* a = a + mu * m * b**i
+     *
+     * This is computed in place and on the fly.  The multiplication
+     * by b**i is handled by offseting which columns the results
+     * are added to.
+     *
+     * Note the comba method normally doesn't handle carries in the
+     * inner loop In this case we fix the carry from the previous
+     * column since the Montgomery reduction requires digits of the
+     * result (so far) [see above] to work.  This is
+     * handled by fixing up one carry after the inner loop.  The
+     * carry fixups are done in order so after these loops the
+     * first m->used words of W[] have the carries fixed
+     */
+    {
+      register int iy;
+      register mp_digit *tmpn;
+      register mp_word *_W;
+
+      /* alias for the digits of the modulus */
+      tmpn = n->dp;
+
+      /* Alias for the columns set by an offset of ix */
+      _W = W + ix;
+
+      /* inner loop */
+      for (iy = 0; iy < n->used; iy++) {
+          *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
+      }
+    }
+
+    /* now fix carry for next digit, W[ix+1] */
+    W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
+  }
+
+  /* now we have to propagate the carries and
+   * shift the words downward [all those least
+   * significant digits we zeroed].
+   */
+  {
+    register mp_digit *tmpx;
+    register mp_word *_W, *_W1;
+
+    /* nox fix rest of carries */
+
+    /* alias for current word */
+    _W1 = W + ix;
+
+    /* alias for next word, where the carry goes */
+    _W = W + ++ix;
+
+    for (; ix <= n->used * 2 + 1; ix++) {
+      *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
+    }
+
+    /* copy out, A = A/b**n
+     *
+     * The result is A/b**n but instead of converting from an
+     * array of mp_word to mp_digit than calling mp_rshd
+     * we just copy them in the right order
+     */
+
+    /* alias for destination word */
+    tmpx = x->dp;
+
+    /* alias for shifted double precision result */
+    _W = W + n->used;
+
+    for (ix = 0; ix < n->used + 1; ix++) {
+      *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
+    }
+
+    /* zero oldused digits, if the input a was larger than
+     * m->used+1 we'll have to clear the digits
+     */
+    for (; ix < olduse; ix++) {
+      *tmpx++ = 0;
+    }
+  }
+
+  /* set the max used and clamp */
+  x->used = n->used + 1;
+  mp_clamp (x);
+
+  /* if A >= m then A = A - m */
+  if (mp_cmp_mag (x, n) != MP_LT) {
+    return s_mp_sub (x, n, x);
+  }
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_fast_s_mp_mul_digs.c b/libtommath/bn_fast_s_mp_mul_digs.c
new file mode 100644
index 0000000..df3da26
--- /dev/null
+++ b/libtommath/bn_fast_s_mp_mul_digs.c
@@ -0,0 +1,105 @@
+#include <tommath.h>
+#ifdef BN_FAST_S_MP_MUL_DIGS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* Fast (comba) multiplier
+ *
+ * This is the fast column-array [comba] multiplier.  It is 
+ * designed to compute the columns of the product first 
+ * then handle the carries afterwards.  This has the effect 
+ * of making the nested loops that compute the columns very
+ * simple and schedulable on super-scalar processors.
+ *
+ * This has been modified to produce a variable number of 
+ * digits of output so if say only a half-product is required 
+ * you don't have to compute the upper half (a feature 
+ * required for fast Barrett reduction).
+ *
+ * Based on Algorithm 14.12 on pp.595 of HAC.
+ *
+ */
+int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+{
+  int     olduse, res, pa, ix, iz;
+  mp_digit W[MP_WARRAY];
+  register mp_word  _W;
+
+  /* grow the destination as required */
+  if (c->alloc < digs) {
+    if ((res = mp_grow (c, digs)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* number of output digits to produce */
+  pa = MIN(digs, a->used + b->used);
+
+  /* clear the carry */
+  _W = 0;
+  for (ix = 0; ix < pa; ix++) { 
+      int      tx, ty;
+      int      iy;
+      mp_digit *tmpx, *tmpy;
+
+      /* get offsets into the two bignums */
+      ty = MIN(b->used-1, ix);
+      tx = ix - ty;
+
+      /* setup temp aliases */
+      tmpx = a->dp + tx;
+      tmpy = b->dp + ty;
+
+      /* this is the number of times the loop will iterrate, essentially 
+         while (tx++ < a->used && ty-- >= 0) { ... }
+       */
+      iy = MIN(a->used-tx, ty+1);
+
+      /* execute loop */
+      for (iz = 0; iz < iy; ++iz) {
+         _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
+      }
+
+      /* store term */
+      W[ix] = ((mp_digit)_W) & MP_MASK;
+
+      /* make next carry */
+      _W = _W >> ((mp_word)DIGIT_BIT);
+  }
+
+  /* store final carry */
+  W[ix] = (mp_digit)(_W & MP_MASK);
+
+  /* setup dest */
+  olduse  = c->used;
+  c->used = pa;
+
+  {
+    register mp_digit *tmpc;
+    tmpc = c->dp;
+    for (ix = 0; ix < pa+1; ix++) {
+      /* now extract the previous digit [below the carry] */
+      *tmpc++ = W[ix];
+    }
+
+    /* clear unused digits [that existed in the old copy of c] */
+    for (; ix < olduse; ix++) {
+      *tmpc++ = 0;
+    }
+  }
+  mp_clamp (c);
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_fast_s_mp_mul_high_digs.c b/libtommath/bn_fast_s_mp_mul_high_digs.c
new file mode 100644
index 0000000..ee657f9
--- /dev/null
+++ b/libtommath/bn_fast_s_mp_mul_high_digs.c
@@ -0,0 +1,97 @@
+#include <tommath.h>
+#ifdef BN_FAST_S_MP_MUL_HIGH_DIGS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* this is a modified version of fast_s_mul_digs that only produces
+ * output digits *above* digs.  See the comments for fast_s_mul_digs
+ * to see how it works.
+ *
+ * This is used in the Barrett reduction since for one of the multiplications
+ * only the higher digits were needed.  This essentially halves the work.
+ *
+ * Based on Algorithm 14.12 on pp.595 of HAC.
+ */
+int fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+{
+  int     olduse, res, pa, ix, iz;
+  mp_digit W[MP_WARRAY];
+  mp_word  _W;
+
+  /* grow the destination as required */
+  pa = a->used + b->used;
+  if (c->alloc < pa) {
+    if ((res = mp_grow (c, pa)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* number of output digits to produce */
+  pa = a->used + b->used;
+  _W = 0;
+  for (ix = digs; ix < pa; ix++) { 
+      int      tx, ty, iy;
+      mp_digit *tmpx, *tmpy;
+
+      /* get offsets into the two bignums */
+      ty = MIN(b->used-1, ix);
+      tx = ix - ty;
+
+      /* setup temp aliases */
+      tmpx = a->dp + tx;
+      tmpy = b->dp + ty;
+
+      /* this is the number of times the loop will iterrate, essentially its 
+         while (tx++ < a->used && ty-- >= 0) { ... }
+       */
+      iy = MIN(a->used-tx, ty+1);
+
+      /* execute loop */
+      for (iz = 0; iz < iy; iz++) {
+         _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
+      }
+
+      /* store term */
+      W[ix] = ((mp_digit)_W) & MP_MASK;
+
+      /* make next carry */
+      _W = _W >> ((mp_word)DIGIT_BIT);
+  }
+  
+  /* store final carry */
+  W[ix] = (mp_digit)(_W & MP_MASK);
+
+  /* setup dest */
+  olduse  = c->used;
+  c->used = pa;
+
+  {
+    register mp_digit *tmpc;
+
+    tmpc = c->dp + digs;
+    for (ix = digs; ix <= pa; ix++) {
+      /* now extract the previous digit [below the carry] */
+      *tmpc++ = W[ix];
+    }
+
+    /* clear unused digits [that existed in the old copy of c] */
+    for (; ix < olduse; ix++) {
+      *tmpc++ = 0;
+    }
+  }
+  mp_clamp (c);
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_fast_s_mp_sqr.c b/libtommath/bn_fast_s_mp_sqr.c
new file mode 100644
index 0000000..66a2942
--- /dev/null
+++ b/libtommath/bn_fast_s_mp_sqr.c
@@ -0,0 +1,110 @@
+#include <tommath.h>
+#ifdef BN_FAST_S_MP_SQR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* the jist of squaring...
+ * you do like mult except the offset of the tmpx [one that 
+ * starts closer to zero] can't equal the offset of tmpy.  
+ * So basically you set up iy like before then you min it with
+ * (ty-tx) so that it never happens.  You double all those 
+ * you add in the inner loop
+
+After that loop you do the squares and add them in.
+*/
+
+int fast_s_mp_sqr (mp_int * a, mp_int * b)
+{
+  int       olduse, res, pa, ix, iz;
+  mp_digit   W[MP_WARRAY], *tmpx;
+  mp_word   W1;
+
+  /* grow the destination as required */
+  pa = a->used + a->used;
+  if (b->alloc < pa) {
+    if ((res = mp_grow (b, pa)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* number of output digits to produce */
+  W1 = 0;
+  for (ix = 0; ix < pa; ix++) { 
+      int      tx, ty, iy;
+      mp_word  _W;
+      mp_digit *tmpy;
+
+      /* clear counter */
+      _W = 0;
+
+      /* get offsets into the two bignums */
+      ty = MIN(a->used-1, ix);
+      tx = ix - ty;
+
+      /* setup temp aliases */
+      tmpx = a->dp + tx;
+      tmpy = a->dp + ty;
+
+      /* this is the number of times the loop will iterrate, essentially
+         while (tx++ < a->used && ty-- >= 0) { ... }
+       */
+      iy = MIN(a->used-tx, ty+1);
+
+      /* now for squaring tx can never equal ty 
+       * we halve the distance since they approach at a rate of 2x
+       * and we have to round because odd cases need to be executed
+       */
+      iy = MIN(iy, (ty-tx+1)>>1);
+
+      /* execute loop */
+      for (iz = 0; iz < iy; iz++) {
+         _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
+      }
+
+      /* double the inner product and add carry */
+      _W = _W + _W + W1;
+
+      /* even columns have the square term in them */
+      if ((ix&1) == 0) {
+         _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
+      }
+
+      /* store it */
+      W[ix] = (mp_digit)(_W & MP_MASK);
+
+      /* make next carry */
+      W1 = _W >> ((mp_word)DIGIT_BIT);
+  }
+
+  /* setup dest */
+  olduse  = b->used;
+  b->used = a->used+a->used;
+
+  {
+    mp_digit *tmpb;
+    tmpb = b->dp;
+    for (ix = 0; ix < pa; ix++) {
+      *tmpb++ = W[ix] & MP_MASK;
+    }
+
+    /* clear unused digits [that existed in the old copy of c] */
+    for (; ix < olduse; ix++) {
+      *tmpb++ = 0;
+    }
+  }
+  mp_clamp (b);
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_2expt.c b/libtommath/bn_mp_2expt.c
new file mode 100644
index 0000000..45a6818
--- /dev/null
+++ b/libtommath/bn_mp_2expt.c
@@ -0,0 +1,44 @@
+#include <tommath.h>
+#ifdef BN_MP_2EXPT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* computes a = 2**b 
+ *
+ * Simple algorithm which zeroes the int, grows it then just sets one bit
+ * as required.
+ */
+int
+mp_2expt (mp_int * a, int b)
+{
+  int     res;
+
+  /* zero a as per default */
+  mp_zero (a);
+
+  /* grow a to accomodate the single bit */
+  if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) {
+    return res;
+  }
+
+  /* set the used count of where the bit will go */
+  a->used = b / DIGIT_BIT + 1;
+
+  /* put the single bit in its place */
+  a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
+
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_abs.c b/libtommath/bn_mp_abs.c
new file mode 100644
index 0000000..34f810f
--- /dev/null
+++ b/libtommath/bn_mp_abs.c
@@ -0,0 +1,39 @@
+#include <tommath.h>
+#ifdef BN_MP_ABS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* b = |a| 
+ *
+ * Simple function copies the input and fixes the sign to positive
+ */
+int
+mp_abs (mp_int * a, mp_int * b)
+{
+  int     res;
+
+  /* copy a to b */
+  if (a != b) {
+     if ((res = mp_copy (a, b)) != MP_OKAY) {
+       return res;
+     }
+  }
+
+  /* force the sign of b to positive */
+  b->sign = MP_ZPOS;
+
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_add.c b/libtommath/bn_mp_add.c
new file mode 100644
index 0000000..554b7f7
--- /dev/null
+++ b/libtommath/bn_mp_add.c
@@ -0,0 +1,49 @@
+#include <tommath.h>
+#ifdef BN_MP_ADD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* high level addition (handles signs) */
+int mp_add (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     sa, sb, res;
+
+  /* get sign of both inputs */
+  sa = a->sign;
+  sb = b->sign;
+
+  /* handle two cases, not four */
+  if (sa == sb) {
+    /* both positive or both negative */
+    /* add their magnitudes, copy the sign */
+    c->sign = sa;
+    res = s_mp_add (a, b, c);
+  } else {
+    /* one positive, the other negative */
+    /* subtract the one with the greater magnitude from */
+    /* the one of the lesser magnitude.  The result gets */
+    /* the sign of the one with the greater magnitude. */
+    if (mp_cmp_mag (a, b) == MP_LT) {
+      c->sign = sb;
+      res = s_mp_sub (b, a, c);
+    } else {
+      c->sign = sa;
+      res = s_mp_sub (a, b, c);
+    }
+  }
+  return res;
+}
+
+#endif
diff --git a/libtommath/bn_mp_add_d.c b/libtommath/bn_mp_add_d.c
new file mode 100644
index 0000000..bdd0280
--- /dev/null
+++ b/libtommath/bn_mp_add_d.c
@@ -0,0 +1,105 @@
+#include <tommath.h>
+#ifdef BN_MP_ADD_D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* single digit addition */
+int
+mp_add_d (mp_int * a, mp_digit b, mp_int * c)
+{
+  int     res, ix, oldused;
+  mp_digit *tmpa, *tmpc, mu;
+
+  /* grow c as required */
+  if (c->alloc < a->used + 1) {
+     if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
+        return res;
+     }
+  }
+
+  /* if a is negative and |a| >= b, call c = |a| - b */
+  if (a->sign == MP_NEG && (a->used > 1 || a->dp[0] >= b)) {
+     /* temporarily fix sign of a */
+     a->sign = MP_ZPOS;
+
+     /* c = |a| - b */
+     res = mp_sub_d(a, b, c);
+
+     /* fix sign  */
+     a->sign = c->sign = MP_NEG;
+
+     return res;
+  }
+
+  /* old number of used digits in c */
+  oldused = c->used;
+
+  /* sign always positive */
+  c->sign = MP_ZPOS;
+
+  /* source alias */
+  tmpa    = a->dp;
+
+  /* destination alias */
+  tmpc    = c->dp;
+
+  /* if a is positive */
+  if (a->sign == MP_ZPOS) {
+     /* add digit, after this we're propagating
+      * the carry.
+      */
+     *tmpc   = *tmpa++ + b;
+     mu      = *tmpc >> DIGIT_BIT;
+     *tmpc++ &= MP_MASK;
+
+     /* now handle rest of the digits */
+     for (ix = 1; ix < a->used; ix++) {
+        *tmpc   = *tmpa++ + mu;
+        mu      = *tmpc >> DIGIT_BIT;
+        *tmpc++ &= MP_MASK;
+     }
+     /* set final carry */
+     ix++;
+     *tmpc++  = mu;
+
+     /* setup size */
+     c->used = a->used + 1;
+  } else {
+     /* a was negative and |a| < b */
+     c->used  = 1;
+
+     /* the result is a single digit */
+     if (a->used == 1) {
+        *tmpc++  =  b - a->dp[0];
+     } else {
+        *tmpc++  =  b;
+     }
+
+     /* setup count so the clearing of oldused
+      * can fall through correctly
+      */
+     ix       = 1;
+  }
+
+  /* now zero to oldused */
+  while (ix++ < oldused) {
+     *tmpc++ = 0;
+  }
+  mp_clamp(c);
+
+  return MP_OKAY;
+}
+
+#endif
diff --git a/libtommath/bn_mp_addmod.c b/libtommath/bn_mp_addmod.c
new file mode 100644
index 0000000..13eb33f
--- /dev/null
+++ b/libtommath/bn_mp_addmod.c
@@ -0,0 +1,37 @@
+#include <tommath.h>
+#ifdef BN_MP_ADDMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* d = a + b (mod c) */
+int
+mp_addmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+  int     res;
+  mp_int  t;
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_add (a, b, &t)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+  res = mp_mod (&t, c, d);
+  mp_clear (&t);
+  return res;
+}
+#endif
diff --git a/libtommath/bn_mp_and.c b/libtommath/bn_mp_and.c
new file mode 100644
index 0000000..61dc386
--- /dev/null
+++ b/libtommath/bn_mp_and.c
@@ -0,0 +1,53 @@
+#include <tommath.h>
+#ifdef BN_MP_AND_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* AND two ints together */
+int
+mp_and (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     res, ix, px;
+  mp_int  t, *x;
+
+  if (a->used > b->used) {
+    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+      return res;
+    }
+    px = b->used;
+    x = b;
+  } else {
+    if ((res = mp_init_copy (&t, b)) != MP_OKAY) {
+      return res;
+    }
+    px = a->used;
+    x = a;
+  }
+
+  for (ix = 0; ix < px; ix++) {
+    t.dp[ix] &= x->dp[ix];
+  }
+
+  /* zero digits above the last from the smallest mp_int */
+  for (; ix < t.used; ix++) {
+    t.dp[ix] = 0;
+  }
+
+  mp_clamp (&t);
+  mp_exch (c, &t);
+  mp_clear (&t);
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_clamp.c b/libtommath/bn_mp_clamp.c
new file mode 100644
index 0000000..c172611
--- /dev/null
+++ b/libtommath/bn_mp_clamp.c
@@ -0,0 +1,40 @@
+#include <tommath.h>
+#ifdef BN_MP_CLAMP_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* trim unused digits 
+ *
+ * This is used to ensure that leading zero digits are
+ * trimed and the leading "used" digit will be non-zero
+ * Typically very fast.  Also fixes the sign if there
+ * are no more leading digits
+ */
+void
+mp_clamp (mp_int * a)
+{
+  /* decrease used while the most significant digit is
+   * zero.
+   */
+  while (a->used > 0 && a->dp[a->used - 1] == 0) {
+    --(a->used);
+  }
+
+  /* reset the sign flag if used == 0 */
+  if (a->used == 0) {
+    a->sign = MP_ZPOS;
+  }
+}
+#endif
diff --git a/libtommath/bn_mp_clear.c b/libtommath/bn_mp_clear.c
new file mode 100644
index 0000000..1dc053b
--- /dev/null
+++ b/libtommath/bn_mp_clear.c
@@ -0,0 +1,43 @@
+#include <tommath.h>
+#ifdef BN_MP_CLEAR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* clear one (frees)  */
+void
+mp_clear (mp_int * a)
+{
+  volatile mp_digit *p;
+  int len;
+
+  /* only do anything if a hasn't been freed previously */
+  if (a->dp != NULL) {
+    /* first zero the digits */
+	len = a->alloc;
+	p = a->dp;
+	while (len--) {
+		*p++ = 0;
+	}
+
+    /* free ram */
+    XFREE(a->dp);
+
+    /* reset members to make debugging easier */
+    a->dp    = NULL;
+    a->alloc = a->used = 0;
+    a->sign  = MP_ZPOS;
+  }
+}
+#endif
diff --git a/libtommath/bn_mp_clear_multi.c b/libtommath/bn_mp_clear_multi.c
new file mode 100644
index 0000000..24cbe73
--- /dev/null
+++ b/libtommath/bn_mp_clear_multi.c
@@ -0,0 +1,30 @@
+#include <tommath.h>
+#ifdef BN_MP_CLEAR_MULTI_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <stdarg.h>
+
+void mp_clear_multi(mp_int *mp, ...) 
+{
+    mp_int* next_mp = mp;
+    va_list args;
+    va_start(args, mp);
+    while (next_mp != NULL) {
+        mp_clear(next_mp);
+        next_mp = va_arg(args, mp_int*);
+    }
+    va_end(args);
+}
+#endif
diff --git a/libtommath/bn_mp_cmp.c b/libtommath/bn_mp_cmp.c
new file mode 100644
index 0000000..583b5f8
--- /dev/null
+++ b/libtommath/bn_mp_cmp.c
@@ -0,0 +1,39 @@
+#include <tommath.h>
+#ifdef BN_MP_CMP_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* compare two ints (signed)*/
+int
+mp_cmp (mp_int * a, mp_int * b)
+{
+  /* compare based on sign */
+  if (a->sign != b->sign) {
+     if (a->sign == MP_NEG) {
+        return MP_LT;
+     } else {
+        return MP_GT;
+     }
+  }
+  
+  /* compare digits */
+  if (a->sign == MP_NEG) {
+     /* if negative compare opposite direction */
+     return mp_cmp_mag(b, a);
+  } else {
+     return mp_cmp_mag(a, b);
+  }
+}
+#endif
diff --git a/libtommath/bn_mp_cmp_d.c b/libtommath/bn_mp_cmp_d.c
new file mode 100644
index 0000000..882b1c9
--- /dev/null
+++ b/libtommath/bn_mp_cmp_d.c
@@ -0,0 +1,40 @@
+#include <tommath.h>
+#ifdef BN_MP_CMP_D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* compare a digit */
+int mp_cmp_d(mp_int * a, mp_digit b)
+{
+  /* compare based on sign */
+  if (a->sign == MP_NEG) {
+    return MP_LT;
+  }
+
+  /* compare based on magnitude */
+  if (a->used > 1) {
+    return MP_GT;
+  }
+
+  /* compare the only digit of a to b */
+  if (a->dp[0] > b) {
+    return MP_GT;
+  } else if (a->dp[0] < b) {
+    return MP_LT;
+  } else {
+    return MP_EQ;
+  }
+}
+#endif
diff --git a/libtommath/bn_mp_cmp_mag.c b/libtommath/bn_mp_cmp_mag.c
new file mode 100644
index 0000000..a0f351c
--- /dev/null
+++ b/libtommath/bn_mp_cmp_mag.c
@@ -0,0 +1,51 @@
+#include <tommath.h>
+#ifdef BN_MP_CMP_MAG_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* compare maginitude of two ints (unsigned) */
+int mp_cmp_mag (mp_int * a, mp_int * b)
+{
+  int     n;
+  mp_digit *tmpa, *tmpb;
+
+  /* compare based on # of non-zero digits */
+  if (a->used > b->used) {
+    return MP_GT;
+  }
+  
+  if (a->used < b->used) {
+    return MP_LT;
+  }
+
+  /* alias for a */
+  tmpa = a->dp + (a->used - 1);
+
+  /* alias for b */
+  tmpb = b->dp + (a->used - 1);
+
+  /* compare based on digits  */
+  for (n = 0; n < a->used; ++n, --tmpa, --tmpb) {
+    if (*tmpa > *tmpb) {
+      return MP_GT;
+    }
+
+    if (*tmpa < *tmpb) {
+      return MP_LT;
+    }
+  }
+  return MP_EQ;
+}
+#endif
diff --git a/libtommath/bn_mp_cnt_lsb.c b/libtommath/bn_mp_cnt_lsb.c
new file mode 100644
index 0000000..571f03f
--- /dev/null
+++ b/libtommath/bn_mp_cnt_lsb.c
@@ -0,0 +1,49 @@
+#include <tommath.h>
+#ifdef BN_MP_CNT_LSB_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+static const int lnz[16] = { 
+   4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
+};
+
+/* Counts the number of lsbs which are zero before the first zero bit */
+int mp_cnt_lsb(mp_int *a)
+{
+   int x;
+   mp_digit q, qq;
+
+   /* easy out */
+   if (mp_iszero(a) == 1) {
+      return 0;
+   }
+
+   /* scan lower digits until non-zero */
+   for (x = 0; x < a->used && a->dp[x] == 0; x++);
+   q = a->dp[x];
+   x *= DIGIT_BIT;
+
+   /* now scan this digit until a 1 is found */
+   if ((q & 1) == 0) {
+      do {
+         qq  = q & 15;
+         x  += lnz[qq];
+         q >>= 4;
+      } while (qq == 0);
+   }
+   return x;
+}
+
+#endif
diff --git a/libtommath/bn_mp_copy.c b/libtommath/bn_mp_copy.c
new file mode 100644
index 0000000..183ec9b
--- /dev/null
+++ b/libtommath/bn_mp_copy.c
@@ -0,0 +1,64 @@
+#include <tommath.h>
+#ifdef BN_MP_COPY_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* copy, b = a */
+int
+mp_copy (mp_int * a, mp_int * b)
+{
+  int     res, n;
+
+  /* if dst == src do nothing */
+  if (a == b) {
+    return MP_OKAY;
+  }
+
+  /* grow dest */
+  if (b->alloc < a->used) {
+     if ((res = mp_grow (b, a->used)) != MP_OKAY) {
+        return res;
+     }
+  }
+
+  /* zero b and copy the parameters over */
+  {
+    register mp_digit *tmpa, *tmpb;
+
+    /* pointer aliases */
+
+    /* source */
+    tmpa = a->dp;
+
+    /* destination */
+    tmpb = b->dp;
+
+    /* copy all the digits */
+    for (n = 0; n < a->used; n++) {
+      *tmpb++ = *tmpa++;
+    }
+
+    /* clear high digits */
+    for (; n < b->used; n++) {
+      *tmpb++ = 0;
+    }
+  }
+
+  /* copy used count and sign */
+  b->used = a->used;
+  b->sign = a->sign;
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_count_bits.c b/libtommath/bn_mp_count_bits.c
new file mode 100644
index 0000000..f3f85ac
--- /dev/null
+++ b/libtommath/bn_mp_count_bits.c
@@ -0,0 +1,41 @@
+#include <tommath.h>
+#ifdef BN_MP_COUNT_BITS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* returns the number of bits in an int */
+int
+mp_count_bits (mp_int * a)
+{
+  int     r;
+  mp_digit q;
+
+  /* shortcut */
+  if (a->used == 0) {
+    return 0;
+  }
+
+  /* get number of digits and add that */
+  r = (a->used - 1) * DIGIT_BIT;
+  
+  /* take the last digit and count the bits in it */
+  q = a->dp[a->used - 1];
+  while (q > ((mp_digit) 0)) {
+    ++r;
+    q >>= ((mp_digit) 1);
+  }
+  return r;
+}
+#endif
diff --git a/libtommath/bn_mp_div.c b/libtommath/bn_mp_div.c
new file mode 100644
index 0000000..6b2b8f0
--- /dev/null
+++ b/libtommath/bn_mp_div.c
@@ -0,0 +1,288 @@
+#include <tommath.h>
+#ifdef BN_MP_DIV_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+#ifdef BN_MP_DIV_SMALL
+
+/* slower bit-bang division... also smaller */
+int mp_div(mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+   mp_int ta, tb, tq, q;
+   int    res, n, n2;
+
+  /* is divisor zero ? */
+  if (mp_iszero (b) == 1) {
+    return MP_VAL;
+  }
+
+  /* if a < b then q=0, r = a */
+  if (mp_cmp_mag (a, b) == MP_LT) {
+    if (d != NULL) {
+      res = mp_copy (a, d);
+    } else {
+      res = MP_OKAY;
+    }
+    if (c != NULL) {
+      mp_zero (c);
+    }
+    return res;
+  }
+	
+  /* init our temps */
+  if ((res = mp_init_multi(&ta, &tb, &tq, &q, NULL) != MP_OKAY)) {
+     return res;
+  }
+
+
+  mp_set(&tq, 1);
+  n = mp_count_bits(a) - mp_count_bits(b);
+  if (((res = mp_abs(a, &ta)) != MP_OKAY) ||
+      ((res = mp_abs(b, &tb)) != MP_OKAY) || 
+      ((res = mp_mul_2d(&tb, n, &tb)) != MP_OKAY) ||
+      ((res = mp_mul_2d(&tq, n, &tq)) != MP_OKAY)) {
+      goto LBL_ERR;
+  }
+
+  while (n-- >= 0) {
+     if (mp_cmp(&tb, &ta) != MP_GT) {
+        if (((res = mp_sub(&ta, &tb, &ta)) != MP_OKAY) ||
+            ((res = mp_add(&q, &tq, &q)) != MP_OKAY)) {
+           goto LBL_ERR;
+        }
+     }
+     if (((res = mp_div_2d(&tb, 1, &tb, NULL)) != MP_OKAY) ||
+         ((res = mp_div_2d(&tq, 1, &tq, NULL)) != MP_OKAY)) {
+           goto LBL_ERR;
+     }
+  }
+
+  /* now q == quotient and ta == remainder */
+  n  = a->sign;
+  n2 = (a->sign == b->sign ? MP_ZPOS : MP_NEG);
+  if (c != NULL) {
+     mp_exch(c, &q);
+     c->sign  = (mp_iszero(c) == MP_YES) ? MP_ZPOS : n2;
+  }
+  if (d != NULL) {
+     mp_exch(d, &ta);
+     d->sign = (mp_iszero(d) == MP_YES) ? MP_ZPOS : n;
+  }
+LBL_ERR:
+   mp_clear_multi(&ta, &tb, &tq, &q, NULL);
+   return res;
+}
+
+#else
+
+/* integer signed division. 
+ * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
+ * HAC pp.598 Algorithm 14.20
+ *
+ * Note that the description in HAC is horribly 
+ * incomplete.  For example, it doesn't consider 
+ * the case where digits are removed from 'x' in 
+ * the inner loop.  It also doesn't consider the 
+ * case that y has fewer than three digits, etc..
+ *
+ * The overall algorithm is as described as 
+ * 14.20 from HAC but fixed to treat these cases.
+*/
+int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+  mp_int  q, x, y, t1, t2;
+  int     res, n, t, i, norm, neg;
+
+  /* is divisor zero ? */
+  if (mp_iszero (b) == 1) {
+    return MP_VAL;
+  }
+
+  /* if a < b then q=0, r = a */
+  if (mp_cmp_mag (a, b) == MP_LT) {
+    if (d != NULL) {
+      res = mp_copy (a, d);
+    } else {
+      res = MP_OKAY;
+    }
+    if (c != NULL) {
+      mp_zero (c);
+    }
+    return res;
+  }
+
+  if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
+    return res;
+  }
+  q.used = a->used + 2;
+
+  if ((res = mp_init (&t1)) != MP_OKAY) {
+    goto LBL_Q;
+  }
+
+  if ((res = mp_init (&t2)) != MP_OKAY) {
+    goto LBL_T1;
+  }
+
+  if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
+    goto LBL_T2;
+  }
+
+  if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
+    goto LBL_X;
+  }
+
+  /* fix the sign */
+  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
+  x.sign = y.sign = MP_ZPOS;
+
+  /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
+  norm = mp_count_bits(&y) % DIGIT_BIT;
+  if (norm < (int)(DIGIT_BIT-1)) {
+     norm = (DIGIT_BIT-1) - norm;
+     if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
+       goto LBL_Y;
+     }
+     if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
+       goto LBL_Y;
+     }
+  } else {
+     norm = 0;
+  }
+
+  /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
+  n = x.used - 1;
+  t = y.used - 1;
+
+  /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
+  if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
+    goto LBL_Y;
+  }
+
+  while (mp_cmp (&x, &y) != MP_LT) {
+    ++(q.dp[n - t]);
+    if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
+      goto LBL_Y;
+    }
+  }
+
+  /* reset y by shifting it back down */
+  mp_rshd (&y, n - t);
+
+  /* step 3. for i from n down to (t + 1) */
+  for (i = n; i >= (t + 1); i--) {
+    if (i > x.used) {
+      continue;
+    }
+
+    /* step 3.1 if xi == yt then set q{i-t-1} to b-1, 
+     * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
+    if (x.dp[i] == y.dp[t]) {
+      q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
+    } else {
+      mp_word tmp;
+      tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
+      tmp |= ((mp_word) x.dp[i - 1]);
+      tmp /= ((mp_word) y.dp[t]);
+      if (tmp > (mp_word) MP_MASK)
+        tmp = MP_MASK;
+      q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
+    }
+
+    /* while (q{i-t-1} * (yt * b + y{t-1})) > 
+             xi * b**2 + xi-1 * b + xi-2 
+     
+       do q{i-t-1} -= 1; 
+    */
+    q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
+    do {
+      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
+
+      /* find left hand */
+      mp_zero (&t1);
+      t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
+      t1.dp[1] = y.dp[t];
+      t1.used = 2;
+      if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
+        goto LBL_Y;
+      }
+
+      /* find right hand */
+      t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
+      t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
+      t2.dp[2] = x.dp[i];
+      t2.used = 3;
+    } while (mp_cmp_mag(&t1, &t2) == MP_GT);
+
+    /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
+    if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
+      goto LBL_Y;
+    }
+
+    if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
+      goto LBL_Y;
+    }
+
+    if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
+      goto LBL_Y;
+    }
+
+    /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
+    if (x.sign == MP_NEG) {
+      if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
+        goto LBL_Y;
+      }
+      if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
+        goto LBL_Y;
+      }
+      if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
+        goto LBL_Y;
+      }
+
+      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
+    }
+  }
+
+  /* now q is the quotient and x is the remainder 
+   * [which we have to normalize] 
+   */
+  
+  /* get sign before writing to c */
+  x.sign = x.used == 0 ? MP_ZPOS : a->sign;
+
+  if (c != NULL) {
+    mp_clamp (&q);
+    mp_exch (&q, c);
+    c->sign = neg;
+  }
+
+  if (d != NULL) {
+    mp_div_2d (&x, norm, &x, NULL);
+    mp_exch (&x, d);
+  }
+
+  res = MP_OKAY;
+
+LBL_Y:mp_clear (&y);
+LBL_X:mp_clear (&x);
+LBL_T2:mp_clear (&t2);
+LBL_T1:mp_clear (&t1);
+LBL_Q:mp_clear (&q);
+  return res;
+}
+
+#endif
+
+#endif
diff --git a/libtommath/bn_mp_div_2.c b/libtommath/bn_mp_div_2.c
new file mode 100644
index 0000000..5777997
--- /dev/null
+++ b/libtommath/bn_mp_div_2.c
@@ -0,0 +1,64 @@
+#include <tommath.h>
+#ifdef BN_MP_DIV_2_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* b = a/2 */
+int mp_div_2(mp_int * a, mp_int * b)
+{
+  int     x, res, oldused;
+
+  /* copy */
+  if (b->alloc < a->used) {
+    if ((res = mp_grow (b, a->used)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  oldused = b->used;
+  b->used = a->used;
+  {
+    register mp_digit r, rr, *tmpa, *tmpb;
+
+    /* source alias */
+    tmpa = a->dp + b->used - 1;
+
+    /* dest alias */
+    tmpb = b->dp + b->used - 1;
+
+    /* carry */
+    r = 0;
+    for (x = b->used - 1; x >= 0; x--) {
+      /* get the carry for the next iteration */
+      rr = *tmpa & 1;
+
+      /* shift the current digit, add in carry and store */
+      *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
+
+      /* forward carry to next iteration */
+      r = rr;
+    }
+
+    /* zero excess digits */
+    tmpb = b->dp + b->used;
+    for (x = b->used; x < oldused; x++) {
+      *tmpb++ = 0;
+    }
+  }
+  b->sign = a->sign;
+  mp_clamp (b);
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_div_2d.c b/libtommath/bn_mp_div_2d.c
new file mode 100644
index 0000000..cf103f2
--- /dev/null
+++ b/libtommath/bn_mp_div_2d.c
@@ -0,0 +1,93 @@
+#include <tommath.h>
+#ifdef BN_MP_DIV_2D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* shift right by a certain bit count (store quotient in c, optional remainder in d) */
+int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
+{
+  mp_digit D, r, rr;
+  int     x, res;
+  mp_int  t;
+
+
+  /* if the shift count is <= 0 then we do no work */
+  if (b <= 0) {
+    res = mp_copy (a, c);
+    if (d != NULL) {
+      mp_zero (d);
+    }
+    return res;
+  }
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  /* get the remainder */
+  if (d != NULL) {
+    if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) {
+      mp_clear (&t);
+      return res;
+    }
+  }
+
+  /* copy */
+  if ((res = mp_copy (a, c)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+
+  /* shift by as many digits in the bit count */
+  if (b >= (int)DIGIT_BIT) {
+    mp_rshd (c, b / DIGIT_BIT);
+  }
+
+  /* shift any bit count < DIGIT_BIT */
+  D = (mp_digit) (b % DIGIT_BIT);
+  if (D != 0) {
+    register mp_digit *tmpc, mask, shift;
+
+    /* mask */
+    mask = (((mp_digit)1) << D) - 1;
+
+    /* shift for lsb */
+    shift = DIGIT_BIT - D;
+
+    /* alias */
+    tmpc = c->dp + (c->used - 1);
+
+    /* carry */
+    r = 0;
+    for (x = c->used - 1; x >= 0; x--) {
+      /* get the lower  bits of this word in a temp */
+      rr = *tmpc & mask;
+
+      /* shift the current word and mix in the carry bits from the previous word */
+      *tmpc = (*tmpc >> D) | (r << shift);
+      --tmpc;
+
+      /* set the carry to the carry bits of the current word found above */
+      r = rr;
+    }
+  }
+  mp_clamp (c);
+  if (d != NULL) {
+    mp_exch (&t, d);
+  }
+  mp_clear (&t);
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_div_3.c b/libtommath/bn_mp_div_3.c
new file mode 100644
index 0000000..7cbafc1
--- /dev/null
+++ b/libtommath/bn_mp_div_3.c
@@ -0,0 +1,75 @@
+#include <tommath.h>
+#ifdef BN_MP_DIV_3_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* divide by three (based on routine from MPI and the GMP manual) */
+int
+mp_div_3 (mp_int * a, mp_int *c, mp_digit * d)
+{
+  mp_int   q;
+  mp_word  w, t;
+  mp_digit b;
+  int      res, ix;
+  
+  /* b = 2**DIGIT_BIT / 3 */
+  b = (((mp_word)1) << ((mp_word)DIGIT_BIT)) / ((mp_word)3);
+
+  if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
+     return res;
+  }
+  
+  q.used = a->used;
+  q.sign = a->sign;
+  w = 0;
+  for (ix = a->used - 1; ix >= 0; ix--) {
+     w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
+
+     if (w >= 3) {
+        /* multiply w by [1/3] */
+        t = (w * ((mp_word)b)) >> ((mp_word)DIGIT_BIT);
+
+        /* now subtract 3 * [w/3] from w, to get the remainder */
+        w -= t+t+t;
+
+        /* fixup the remainder as required since
+         * the optimization is not exact.
+         */
+        while (w >= 3) {
+           t += 1;
+           w -= 3;
+        }
+      } else {
+        t = 0;
+      }
+      q.dp[ix] = (mp_digit)t;
+  }
+
+  /* [optional] store the remainder */
+  if (d != NULL) {
+     *d = (mp_digit)w;
+  }
+
+  /* [optional] store the quotient */
+  if (c != NULL) {
+     mp_clamp(&q);
+     mp_exch(&q, c);
+  }
+  mp_clear(&q);
+  
+  return res;
+}
+
+#endif
diff --git a/libtommath/bn_mp_div_d.c b/libtommath/bn_mp_div_d.c
new file mode 100644
index 0000000..9b58aa6
--- /dev/null
+++ b/libtommath/bn_mp_div_d.c
@@ -0,0 +1,106 @@
+#include <tommath.h>
+#ifdef BN_MP_DIV_D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+static int s_is_power_of_two(mp_digit b, int *p)
+{
+   int x;
+
+   for (x = 1; x < DIGIT_BIT; x++) {
+      if (b == (((mp_digit)1)<<x)) {
+         *p = x;
+         return 1;
+      }
+   }
+   return 0;
+}
+
+/* single digit division (based on routine from MPI) */
+int mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
+{
+  mp_int  q;
+  mp_word w;
+  mp_digit t;
+  int     res, ix;
+
+  /* cannot divide by zero */
+  if (b == 0) {
+     return MP_VAL;
+  }
+
+  /* quick outs */
+  if (b == 1 || mp_iszero(a) == 1) {
+     if (d != NULL) {
+        *d = 0;
+     }
+     if (c != NULL) {
+        return mp_copy(a, c);
+     }
+     return MP_OKAY;
+  }
+
+  /* power of two ? */
+  if (s_is_power_of_two(b, &ix) == 1) {
+     if (d != NULL) {
+        *d = a->dp[0] & ((((mp_digit)1)<<ix) - 1);
+     }
+     if (c != NULL) {
+        return mp_div_2d(a, ix, c, NULL);
+     }
+     return MP_OKAY;
+  }
+
+#ifdef BN_MP_DIV_3_C
+  /* three? */
+  if (b == 3) {
+     return mp_div_3(a, c, d);
+  }
+#endif
+
+  /* no easy answer [c'est la vie].  Just division */
+  if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
+     return res;
+  }
+  
+  q.used = a->used;
+  q.sign = a->sign;
+  w = 0;
+  for (ix = a->used - 1; ix >= 0; ix--) {
+     w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
+     
+     if (w >= b) {
+        t = (mp_digit)(w / b);
+        w -= ((mp_word)t) * ((mp_word)b);
+      } else {
+        t = 0;
+      }
+      q.dp[ix] = (mp_digit)t;
+  }
+  
+  if (d != NULL) {
+     *d = (mp_digit)w;
+  }
+  
+  if (c != NULL) {
+     mp_clamp(&q);
+     mp_exch(&q, c);
+  }
+  mp_clear(&q);
+  
+  return res;
+}
+
+#endif
diff --git a/libtommath/bn_mp_dr_is_modulus.c b/libtommath/bn_mp_dr_is_modulus.c
new file mode 100644
index 0000000..5ef78a3
--- /dev/null
+++ b/libtommath/bn_mp_dr_is_modulus.c
@@ -0,0 +1,39 @@
+#include <tommath.h>
+#ifdef BN_MP_DR_IS_MODULUS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* determines if a number is a valid DR modulus */
+int mp_dr_is_modulus(mp_int *a)
+{
+   int ix;
+
+   /* must be at least two digits */
+   if (a->used < 2) {
+      return 0;
+   }
+
+   /* must be of the form b**k - a [a <= b] so all
+    * but the first digit must be equal to -1 (mod b).
+    */
+   for (ix = 1; ix < a->used; ix++) {
+       if (a->dp[ix] != MP_MASK) {
+          return 0;
+       }
+   }
+   return 1;
+}
+
+#endif
diff --git a/libtommath/bn_mp_dr_reduce.c b/libtommath/bn_mp_dr_reduce.c
new file mode 100644
index 0000000..9bb7ad7
--- /dev/null
+++ b/libtommath/bn_mp_dr_reduce.c
@@ -0,0 +1,90 @@
+#include <tommath.h>
+#ifdef BN_MP_DR_REDUCE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
+ *
+ * Based on algorithm from the paper
+ *
+ * "Generating Efficient Primes for Discrete Log Cryptosystems"
+ *                 Chae Hoon Lim, Pil Joong Lee,
+ *          POSTECH Information Research Laboratories
+ *
+ * The modulus must be of a special format [see manual]
+ *
+ * Has been modified to use algorithm 7.10 from the LTM book instead
+ *
+ * Input x must be in the range 0 <= x <= (n-1)**2
+ */
+int
+mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
+{
+  int      err, i, m;
+  mp_word  r;
+  mp_digit mu, *tmpx1, *tmpx2;
+
+  /* m = digits in modulus */
+  m = n->used;
+
+  /* ensure that "x" has at least 2m digits */
+  if (x->alloc < m + m) {
+    if ((err = mp_grow (x, m + m)) != MP_OKAY) {
+      return err;
+    }
+  }
+
+/* top of loop, this is where the code resumes if
+ * another reduction pass is required.
+ */
+top:
+  /* aliases for digits */
+  /* alias for lower half of x */
+  tmpx1 = x->dp;
+
+  /* alias for upper half of x, or x/B**m */
+  tmpx2 = x->dp + m;
+
+  /* set carry to zero */
+  mu = 0;
+
+  /* compute (x mod B**m) + k * [x/B**m] inline and inplace */
+  for (i = 0; i < m; i++) {
+      r         = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu;
+      *tmpx1++  = (mp_digit)(r & MP_MASK);
+      mu        = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
+  }
+
+  /* set final carry */
+  *tmpx1++ = mu;
+
+  /* zero words above m */
+  for (i = m + 1; i < x->used; i++) {
+      *tmpx1++ = 0;
+  }
+
+  /* clamp, sub and return */
+  mp_clamp (x);
+
+  /* if x >= n then subtract and reduce again
+   * Each successive "recursion" makes the input smaller and smaller.
+   */
+  if (mp_cmp_mag (x, n) != MP_LT) {
+    s_mp_sub(x, n, x);
+    goto top;
+  }
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_dr_setup.c b/libtommath/bn_mp_dr_setup.c
new file mode 100644
index 0000000..029d310
--- /dev/null
+++ b/libtommath/bn_mp_dr_setup.c
@@ -0,0 +1,28 @@
+#include <tommath.h>
+#ifdef BN_MP_DR_SETUP_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* determines the setup value */
+void mp_dr_setup(mp_int *a, mp_digit *d)
+{
+   /* the casts are required if DIGIT_BIT is one less than
+    * the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
+    */
+   *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) - 
+        ((mp_word)a->dp[0]));
+}
+
+#endif
diff --git a/libtommath/bn_mp_exch.c b/libtommath/bn_mp_exch.c
new file mode 100644
index 0000000..0ef485a
--- /dev/null
+++ b/libtommath/bn_mp_exch.c
@@ -0,0 +1,30 @@
+#include <tommath.h>
+#ifdef BN_MP_EXCH_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* swap the elements of two integers, for cases where you can't simply swap the 
+ * mp_int pointers around
+ */
+void
+mp_exch (mp_int * a, mp_int * b)
+{
+  mp_int  t;
+
+  t  = *a;
+  *a = *b;
+  *b = t;
+}
+#endif
diff --git a/libtommath/bn_mp_expt_d.c b/libtommath/bn_mp_expt_d.c
new file mode 100644
index 0000000..fdb8bd9
--- /dev/null
+++ b/libtommath/bn_mp_expt_d.c
@@ -0,0 +1,53 @@
+#include <tommath.h>
+#ifdef BN_MP_EXPT_D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* calculate c = a**b  using a square-multiply algorithm */
+int mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
+{
+  int     res, x;
+  mp_int  g;
+
+  if ((res = mp_init_copy (&g, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* set initial result */
+  mp_set (c, 1);
+
+  for (x = 0; x < (int) DIGIT_BIT; x++) {
+    /* square */
+    if ((res = mp_sqr (c, c)) != MP_OKAY) {
+      mp_clear (&g);
+      return res;
+    }
+
+    /* if the bit is set multiply */
+    if ((b & (mp_digit) (((mp_digit)1) << (DIGIT_BIT - 1))) != 0) {
+      if ((res = mp_mul (c, &g, c)) != MP_OKAY) {
+         mp_clear (&g);
+         return res;
+      }
+    }
+
+    /* shift to next bit */
+    b <<= 1;
+  }
+
+  mp_clear (&g);
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_exptmod.c b/libtommath/bn_mp_exptmod.c
new file mode 100644
index 0000000..7c4e2f8
--- /dev/null
+++ b/libtommath/bn_mp_exptmod.c
@@ -0,0 +1,108 @@
+#include <tommath.h>
+#ifdef BN_MP_EXPTMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+
+/* this is a shell function that calls either the normal or Montgomery
+ * exptmod functions.  Originally the call to the montgomery code was
+ * embedded in the normal function but that wasted alot of stack space
+ * for nothing (since 99% of the time the Montgomery code would be called)
+ */
+int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+{
+  int dr;
+
+  /* modulus P must be positive */
+  if (P->sign == MP_NEG) {
+     return MP_VAL;
+  }
+
+  /* if exponent X is negative we have to recurse */
+  if (X->sign == MP_NEG) {
+#ifdef BN_MP_INVMOD_C
+     mp_int tmpG, tmpX;
+     int err;
+
+     /* first compute 1/G mod P */
+     if ((err = mp_init(&tmpG)) != MP_OKAY) {
+        return err;
+     }
+     if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) {
+        mp_clear(&tmpG);
+        return err;
+     }
+
+     /* now get |X| */
+     if ((err = mp_init(&tmpX)) != MP_OKAY) {
+        mp_clear(&tmpG);
+        return err;
+     }
+     if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
+        mp_clear_multi(&tmpG, &tmpX, NULL);
+        return err;
+     }
+
+     /* and now compute (1/G)**|X| instead of G**X [X < 0] */
+     err = mp_exptmod(&tmpG, &tmpX, P, Y);
+     mp_clear_multi(&tmpG, &tmpX, NULL);
+     return err;
+#else 
+     /* no invmod */
+     return MP_VAL;
+#endif
+  }
+
+/* modified diminished radix reduction */
+#if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C)
+  if (mp_reduce_is_2k_l(P) == MP_YES) {
+     return s_mp_exptmod(G, X, P, Y, 1);
+  }
+#endif
+
+#ifdef BN_MP_DR_IS_MODULUS_C
+  /* is it a DR modulus? */
+  dr = mp_dr_is_modulus(P);
+#else
+  /* default to no */
+  dr = 0;
+#endif
+
+#ifdef BN_MP_REDUCE_IS_2K_C
+  /* if not, is it a unrestricted DR modulus? */
+  if (dr == 0) {
+     dr = mp_reduce_is_2k(P) << 1;
+  }
+#endif
+    
+  /* if the modulus is odd or dr != 0 use the montgomery method */
+#ifdef BN_MP_EXPTMOD_FAST_C
+  if (mp_isodd (P) == 1 || dr !=  0) {
+    return mp_exptmod_fast (G, X, P, Y, dr);
+  } else {
+#endif
+#ifdef BN_S_MP_EXPTMOD_C
+    /* otherwise use the generic Barrett reduction technique */
+    return s_mp_exptmod (G, X, P, Y, 0);
+#else
+    /* no exptmod for evens */
+    return MP_VAL;
+#endif
+#ifdef BN_MP_EXPTMOD_FAST_C
+  }
+#endif
+}
+
+#endif
diff --git a/libtommath/bn_mp_exptmod_fast.c b/libtommath/bn_mp_exptmod_fast.c
new file mode 100644
index 0000000..82be9ac
--- /dev/null
+++ b/libtommath/bn_mp_exptmod_fast.c
@@ -0,0 +1,317 @@
+#include <tommath.h>
+#ifdef BN_MP_EXPTMOD_FAST_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
+ *
+ * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
+ * The value of k changes based on the size of the exponent.
+ *
+ * Uses Montgomery or Diminished Radix reduction [whichever appropriate]
+ */
+
+#ifdef MP_LOW_MEM
+   #define TAB_SIZE 32
+#else
+   #define TAB_SIZE 256
+#endif
+
+int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
+{
+  mp_int  M[TAB_SIZE], res;
+  mp_digit buf, mp;
+  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+
+  /* use a pointer to the reduction algorithm.  This allows us to use
+   * one of many reduction algorithms without modding the guts of
+   * the code with if statements everywhere.
+   */
+  int     (*redux)(mp_int*,mp_int*,mp_digit);
+
+  /* find window size */
+  x = mp_count_bits (X);
+  if (x <= 7) {
+    winsize = 2;
+  } else if (x <= 36) {
+    winsize = 3;
+  } else if (x <= 140) {
+    winsize = 4;
+  } else if (x <= 450) {
+    winsize = 5;
+  } else if (x <= 1303) {
+    winsize = 6;
+  } else if (x <= 3529) {
+    winsize = 7;
+  } else {
+    winsize = 8;
+  }
+
+#ifdef MP_LOW_MEM
+  if (winsize > 5) {
+     winsize = 5;
+  }
+#endif
+
+  /* init M array */
+  /* init first cell */
+  if ((err = mp_init(&M[1])) != MP_OKAY) {
+     return err;
+  }
+
+  /* now init the second half of the array */
+  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
+    if ((err = mp_init(&M[x])) != MP_OKAY) {
+      for (y = 1<<(winsize-1); y < x; y++) {
+        mp_clear (&M[y]);
+      }
+      mp_clear(&M[1]);
+      return err;
+    }
+  }
+
+  /* determine and setup reduction code */
+  if (redmode == 0) {
+#ifdef BN_MP_MONTGOMERY_SETUP_C     
+     /* now setup montgomery  */
+     if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
+        goto LBL_M;
+     }
+#else
+     err = MP_VAL;
+     goto LBL_M;
+#endif
+
+     /* automatically pick the comba one if available (saves quite a few calls/ifs) */
+#ifdef BN_FAST_MP_MONTGOMERY_REDUCE_C
+     if (((P->used * 2 + 1) < MP_WARRAY) &&
+          P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+        redux = fast_mp_montgomery_reduce;
+     } else 
+#endif
+     {
+#ifdef BN_MP_MONTGOMERY_REDUCE_C
+        /* use slower baseline Montgomery method */
+        redux = mp_montgomery_reduce;
+#else
+        err = MP_VAL;
+        goto LBL_M;
+#endif
+     }
+  } else if (redmode == 1) {
+#if defined(BN_MP_DR_SETUP_C) && defined(BN_MP_DR_REDUCE_C)
+     /* setup DR reduction for moduli of the form B**k - b */
+     mp_dr_setup(P, &mp);
+     redux = mp_dr_reduce;
+#else
+     err = MP_VAL;
+     goto LBL_M;
+#endif
+  } else {
+#if defined(BN_MP_REDUCE_2K_SETUP_C) && defined(BN_MP_REDUCE_2K_C)
+     /* setup DR reduction for moduli of the form 2**k - b */
+     if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
+        goto LBL_M;
+     }
+     redux = mp_reduce_2k;
+#else
+     err = MP_VAL;
+     goto LBL_M;
+#endif
+  }
+
+  /* setup result */
+  if ((err = mp_init (&res)) != MP_OKAY) {
+    goto LBL_M;
+  }
+
+  /* create M table
+   *
+
+   *
+   * The first half of the table is not computed though accept for M[0] and M[1]
+   */
+
+  if (redmode == 0) {
+#ifdef BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+     /* now we need R mod m */
+     if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
+       goto LBL_RES;
+     }
+#else 
+     err = MP_VAL;
+     goto LBL_RES;
+#endif
+
+     /* now set M[1] to G * R mod m */
+     if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
+       goto LBL_RES;
+     }
+  } else {
+     mp_set(&res, 1);
+     if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
+        goto LBL_RES;
+     }
+  }
+
+  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
+  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
+    goto LBL_RES;
+  }
+
+  for (x = 0; x < (winsize - 1); x++) {
+    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
+      goto LBL_RES;
+    }
+    if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
+      goto LBL_RES;
+    }
+  }
+
+  /* create upper table */
+  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
+    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
+      goto LBL_RES;
+    }
+    if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
+      goto LBL_RES;
+    }
+  }
+
+  /* set initial mode and bit cnt */
+  mode   = 0;
+  bitcnt = 1;
+  buf    = 0;
+  digidx = X->used - 1;
+  bitcpy = 0;
+  bitbuf = 0;
+
+  for (;;) {
+    /* grab next digit as required */
+    if (--bitcnt == 0) {
+      /* if digidx == -1 we are out of digits so break */
+      if (digidx == -1) {
+        break;
+      }
+      /* read next digit and reset bitcnt */
+      buf    = X->dp[digidx--];
+      bitcnt = (int)DIGIT_BIT;
+    }
+
+    /* grab the next msb from the exponent */
+    y     = (mp_digit)(buf >> (DIGIT_BIT - 1)) & 1;
+    buf <<= (mp_digit)1;
+
+    /* if the bit is zero and mode == 0 then we ignore it
+     * These represent the leading zero bits before the first 1 bit
+     * in the exponent.  Technically this opt is not required but it
+     * does lower the # of trivial squaring/reductions used
+     */
+    if (mode == 0 && y == 0) {
+      continue;
+    }
+
+    /* if the bit is zero and mode == 1 then we square */
+    if (mode == 1 && y == 0) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+      if ((err = redux (&res, P, mp)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+      continue;
+    }
+
+    /* else we add it to the window */
+    bitbuf |= (y << (winsize - ++bitcpy));
+    mode    = 2;
+
+    if (bitcpy == winsize) {
+      /* ok window is filled so square as required and multiply  */
+      /* square first */
+      for (x = 0; x < winsize; x++) {
+        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+          goto LBL_RES;
+        }
+        if ((err = redux (&res, P, mp)) != MP_OKAY) {
+          goto LBL_RES;
+        }
+      }
+
+      /* then multiply */
+      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+      if ((err = redux (&res, P, mp)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+
+      /* empty window and reset */
+      bitcpy = 0;
+      bitbuf = 0;
+      mode   = 1;
+    }
+  }
+
+  /* if bits remain then square/multiply */
+  if (mode == 2 && bitcpy > 0) {
+    /* square then multiply if the bit is set */
+    for (x = 0; x < bitcpy; x++) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+      if ((err = redux (&res, P, mp)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+
+      /* get next bit of the window */
+      bitbuf <<= 1;
+      if ((bitbuf & (1 << winsize)) != 0) {
+        /* then multiply */
+        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
+          goto LBL_RES;
+        }
+        if ((err = redux (&res, P, mp)) != MP_OKAY) {
+          goto LBL_RES;
+        }
+      }
+    }
+  }
+
+  if (redmode == 0) {
+     /* fixup result if Montgomery reduction is used
+      * recall that any value in a Montgomery system is
+      * actually multiplied by R mod n.  So we have
+      * to reduce one more time to cancel out the factor
+      * of R.
+      */
+     if ((err = redux(&res, P, mp)) != MP_OKAY) {
+       goto LBL_RES;
+     }
+  }
+
+  /* swap res with Y */
+  mp_exch (&res, Y);
+  err = MP_OKAY;
+LBL_RES:mp_clear (&res);
+LBL_M:
+  mp_clear(&M[1]);
+  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
+    mp_clear (&M[x]);
+  }
+  return err;
+}
+#endif
+
diff --git a/libtommath/bn_mp_exteuclid.c b/libtommath/bn_mp_exteuclid.c
new file mode 100644
index 0000000..c4ebab4
--- /dev/null
+++ b/libtommath/bn_mp_exteuclid.c
@@ -0,0 +1,78 @@
+#include <tommath.h>
+#ifdef BN_MP_EXTEUCLID_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* Extended euclidean algorithm of (a, b) produces 
+   a*u1 + b*u2 = u3
+ */
+int mp_exteuclid(mp_int *a, mp_int *b, mp_int *U1, mp_int *U2, mp_int *U3)
+{
+   mp_int u1,u2,u3,v1,v2,v3,t1,t2,t3,q,tmp;
+   int err;
+
+   if ((err = mp_init_multi(&u1, &u2, &u3, &v1, &v2, &v3, &t1, &t2, &t3, &q, &tmp, NULL)) != MP_OKAY) {
+      return err;
+   }
+
+   /* initialize, (u1,u2,u3) = (1,0,a) */
+   mp_set(&u1, 1);
+   if ((err = mp_copy(a, &u3)) != MP_OKAY)                                        { goto _ERR; }
+
+   /* initialize, (v1,v2,v3) = (0,1,b) */
+   mp_set(&v2, 1);
+   if ((err = mp_copy(b, &v3)) != MP_OKAY)                                        { goto _ERR; }
+
+   /* loop while v3 != 0 */
+   while (mp_iszero(&v3) == MP_NO) {
+       /* q = u3/v3 */
+       if ((err = mp_div(&u3, &v3, &q, NULL)) != MP_OKAY)                         { goto _ERR; }
+
+       /* (t1,t2,t3) = (u1,u2,u3) - (v1,v2,v3)q */
+       if ((err = mp_mul(&v1, &q, &tmp)) != MP_OKAY)                              { goto _ERR; }
+       if ((err = mp_sub(&u1, &tmp, &t1)) != MP_OKAY)                             { goto _ERR; }
+       if ((err = mp_mul(&v2, &q, &tmp)) != MP_OKAY)                              { goto _ERR; }
+       if ((err = mp_sub(&u2, &tmp, &t2)) != MP_OKAY)                             { goto _ERR; }
+       if ((err = mp_mul(&v3, &q, &tmp)) != MP_OKAY)                              { goto _ERR; }
+       if ((err = mp_sub(&u3, &tmp, &t3)) != MP_OKAY)                             { goto _ERR; }
+
+       /* (u1,u2,u3) = (v1,v2,v3) */
+       if ((err = mp_copy(&v1, &u1)) != MP_OKAY)                                  { goto _ERR; }
+       if ((err = mp_copy(&v2, &u2)) != MP_OKAY)                                  { goto _ERR; }
+       if ((err = mp_copy(&v3, &u3)) != MP_OKAY)                                  { goto _ERR; }
+
+       /* (v1,v2,v3) = (t1,t2,t3) */
+       if ((err = mp_copy(&t1, &v1)) != MP_OKAY)                                  { goto _ERR; }
+       if ((err = mp_copy(&t2, &v2)) != MP_OKAY)                                  { goto _ERR; }
+       if ((err = mp_copy(&t3, &v3)) != MP_OKAY)                                  { goto _ERR; }
+   }
+
+   /* make sure U3 >= 0 */
+   if (u3.sign == MP_NEG) {
+      mp_neg(&u1, &u1);
+      mp_neg(&u2, &u2);
+      mp_neg(&u3, &u3);
+   }
+
+   /* copy result out */
+   if (U1 != NULL) { mp_exch(U1, &u1); }
+   if (U2 != NULL) { mp_exch(U2, &u2); }
+   if (U3 != NULL) { mp_exch(U3, &u3); }
+
+   err = MP_OKAY;
+_ERR: mp_clear_multi(&u1, &u2, &u3, &v1, &v2, &v3, &t1, &t2, &t3, &q, &tmp, NULL);
+   return err;
+}
+#endif
diff --git a/libtommath/bn_mp_fread.c b/libtommath/bn_mp_fread.c
new file mode 100644
index 0000000..293df3f
--- /dev/null
+++ b/libtommath/bn_mp_fread.c
@@ -0,0 +1,63 @@
+#include <tommath.h>
+#ifdef BN_MP_FREAD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* read a bigint from a file stream in ASCII */
+int mp_fread(mp_int *a, int radix, FILE *stream)
+{
+   int err, ch, neg, y;
+   
+   /* clear a */
+   mp_zero(a);
+   
+   /* if first digit is - then set negative */
+   ch = fgetc(stream);
+   if (ch == '-') {
+      neg = MP_NEG;
+      ch = fgetc(stream);
+   } else {
+      neg = MP_ZPOS;
+   }
+   
+   for (;;) {
+      /* find y in the radix map */
+      for (y = 0; y < radix; y++) {
+          if (mp_s_rmap[y] == ch) {
+             break;
+          }
+      }
+      if (y == radix) {
+         break;
+      }
+      
+      /* shift up and add */
+      if ((err = mp_mul_d(a, radix, a)) != MP_OKAY) {
+         return err;
+      }
+      if ((err = mp_add_d(a, y, a)) != MP_OKAY) {
+         return err;
+      }
+      
+      ch = fgetc(stream);
+   }
+   if (mp_cmp_d(a, 0) != MP_EQ) {
+      a->sign = neg;
+   }
+   
+   return MP_OKAY;
+}
+
+#endif
diff --git a/libtommath/bn_mp_fwrite.c b/libtommath/bn_mp_fwrite.c
new file mode 100644
index 0000000..8fa3129
--- /dev/null
+++ b/libtommath/bn_mp_fwrite.c
@@ -0,0 +1,48 @@
+#include <tommath.h>
+#ifdef BN_MP_FWRITE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+int mp_fwrite(mp_int *a, int radix, FILE *stream)
+{
+   char *buf;
+   int err, len, x;
+   
+   if ((err = mp_radix_size(a, radix, &len)) != MP_OKAY) {
+      return err;
+   }
+
+   buf = OPT_CAST(char) XMALLOC (len);
+   if (buf == NULL) {
+      return MP_MEM;
+   }
+   
+   if ((err = mp_toradix(a, buf, radix)) != MP_OKAY) {
+      XFREE (buf);
+      return err;
+   }
+   
+   for (x = 0; x < len; x++) {
+       if (fputc(buf[x], stream) == EOF) {
+          XFREE (buf);
+          return MP_VAL;
+       }
+   }
+   
+   XFREE (buf);
+   return MP_OKAY;
+}
+
+#endif
diff --git a/libtommath/bn_mp_gcd.c b/libtommath/bn_mp_gcd.c
new file mode 100644
index 0000000..6265df1
--- /dev/null
+++ b/libtommath/bn_mp_gcd.c
@@ -0,0 +1,109 @@
+#include <tommath.h>
+#ifdef BN_MP_GCD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* Greatest Common Divisor using the binary method */
+int mp_gcd (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  u, v;
+  int     k, u_lsb, v_lsb, res;
+
+  /* either zero than gcd is the largest */
+  if (mp_iszero (a) == 1 && mp_iszero (b) == 0) {
+    return mp_abs (b, c);
+  }
+  if (mp_iszero (a) == 0 && mp_iszero (b) == 1) {
+    return mp_abs (a, c);
+  }
+
+  /* optimized.  At this point if a == 0 then
+   * b must equal zero too
+   */
+  if (mp_iszero (a) == 1) {
+    mp_zero(c);
+    return MP_OKAY;
+  }
+
+  /* get copies of a and b we can modify */
+  if ((res = mp_init_copy (&u, a)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init_copy (&v, b)) != MP_OKAY) {
+    goto LBL_U;
+  }
+
+  /* must be positive for the remainder of the algorithm */
+  u.sign = v.sign = MP_ZPOS;
+
+  /* B1.  Find the common power of two for u and v */
+  u_lsb = mp_cnt_lsb(&u);
+  v_lsb = mp_cnt_lsb(&v);
+  k     = MIN(u_lsb, v_lsb);
+
+  if (k > 0) {
+     /* divide the power of two out */
+     if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) {
+        goto LBL_V;
+     }
+
+     if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) {
+        goto LBL_V;
+     }
+  }
+
+  /* divide any remaining factors of two out */
+  if (u_lsb != k) {
+     if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) {
+        goto LBL_V;
+     }
+  }
+
+  if (v_lsb != k) {
+     if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) {
+        goto LBL_V;
+     }
+  }
+
+  while (mp_iszero(&v) == 0) {
+     /* make sure v is the largest */
+     if (mp_cmp_mag(&u, &v) == MP_GT) {
+        /* swap u and v to make sure v is >= u */
+        mp_exch(&u, &v);
+     }
+     
+     /* subtract smallest from largest */
+     if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) {
+        goto LBL_V;
+     }
+     
+     /* Divide out all factors of two */
+     if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) {
+        goto LBL_V;
+     } 
+  } 
+
+  /* multiply by 2**k which we divided out at the beginning */
+  if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) {
+     goto LBL_V;
+  }
+  c->sign = MP_ZPOS;
+  res = MP_OKAY;
+LBL_V:mp_clear (&u);
+LBL_U:mp_clear (&v);
+  return res;
+}
+#endif
diff --git a/libtommath/bn_mp_get_int.c b/libtommath/bn_mp_get_int.c
new file mode 100644
index 0000000..034467b
--- /dev/null
+++ b/libtommath/bn_mp_get_int.c
@@ -0,0 +1,41 @@
+#include <tommath.h>
+#ifdef BN_MP_GET_INT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* get the lower 32-bits of an mp_int */
+unsigned long mp_get_int(mp_int * a) 
+{
+  int i;
+  unsigned long res;
+
+  if (a->used == 0) {
+     return 0;
+  }
+
+  /* get number of digits of the lsb we have to read */
+  i = MIN(a->used,(int)((sizeof(unsigned long)*CHAR_BIT+DIGIT_BIT-1)/DIGIT_BIT))-1;
+
+  /* get most significant digit of result */
+  res = DIGIT(a,i);
+   
+  while (--i >= 0) {
+    res = (res << DIGIT_BIT) | DIGIT(a,i);
+  }
+
+  /* force result to 32-bits always so it is consistent on non 32-bit platforms */
+  return res & 0xFFFFFFFFUL;
+}
+#endif
diff --git a/libtommath/bn_mp_grow.c b/libtommath/bn_mp_grow.c
new file mode 100644
index 0000000..12a78a8
--- /dev/null
+++ b/libtommath/bn_mp_grow.c
@@ -0,0 +1,53 @@
+#include <tommath.h>
+#ifdef BN_MP_GROW_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* grow as required */
+int mp_grow (mp_int * a, int size)
+{
+  int     i;
+  mp_digit *tmp;
+
+  /* if the alloc size is smaller alloc more ram */
+  if (a->alloc < size) {
+    /* ensure there are always at least MP_PREC digits extra on top */
+    size += (MP_PREC * 2) - (size % MP_PREC);
+
+    /* reallocate the array a->dp
+     *
+     * We store the return in a temporary variable
+     * in case the operation failed we don't want
+     * to overwrite the dp member of a.
+     */
+    tmp = OPT_CAST(mp_digit) XREALLOC (a->dp, sizeof (mp_digit) * size);
+    if (tmp == NULL) {
+      /* reallocation failed but "a" is still valid [can be freed] */
+      return MP_MEM;
+    }
+
+    /* reallocation succeeded so set a->dp */
+    a->dp = tmp;
+
+    /* zero excess digits */
+    i        = a->alloc;
+    a->alloc = size;
+    for (; i < a->alloc; i++) {
+      a->dp[i] = 0;
+    }
+  }
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_init.c b/libtommath/bn_mp_init.c
new file mode 100644
index 0000000..9d70554
--- /dev/null
+++ b/libtommath/bn_mp_init.c
@@ -0,0 +1,42 @@
+#include <tommath.h>
+#ifdef BN_MP_INIT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* init a new mp_int */
+int mp_init (mp_int * a)
+{
+  int i;
+
+  /* allocate memory required and clear it */
+  a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * MP_PREC);
+  if (a->dp == NULL) {
+    return MP_MEM;
+  }
+
+  /* set the digits to zero */
+  for (i = 0; i < MP_PREC; i++) {
+      a->dp[i] = 0;
+  }
+
+  /* set the used to zero, allocated digits to the default precision
+   * and sign to positive */
+  a->used  = 0;
+  a->alloc = MP_PREC;
+  a->sign  = MP_ZPOS;
+
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_init_copy.c b/libtommath/bn_mp_init_copy.c
new file mode 100644
index 0000000..b1b0fa2
--- /dev/null
+++ b/libtommath/bn_mp_init_copy.c
@@ -0,0 +1,28 @@
+#include <tommath.h>
+#ifdef BN_MP_INIT_COPY_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* creates "a" then copies b into it */
+int mp_init_copy (mp_int * a, mp_int * b)
+{
+  int     res;
+
+  if ((res = mp_init (a)) != MP_OKAY) {
+    return res;
+  }
+  return mp_copy (b, a);
+}
+#endif
diff --git a/libtommath/bn_mp_init_multi.c b/libtommath/bn_mp_init_multi.c
new file mode 100644
index 0000000..8cb123a
--- /dev/null
+++ b/libtommath/bn_mp_init_multi.c
@@ -0,0 +1,55 @@
+#include <tommath.h>
+#ifdef BN_MP_INIT_MULTI_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <stdarg.h>
+
+int mp_init_multi(mp_int *mp, ...) 
+{
+    mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
+    int n = 0;                 /* Number of ok inits */
+    mp_int* cur_arg = mp;
+    va_list args;
+
+    va_start(args, mp);        /* init args to next argument from caller */
+    while (cur_arg != NULL) {
+        if (mp_init(cur_arg) != MP_OKAY) {
+            /* Oops - error! Back-track and mp_clear what we already
+               succeeded in init-ing, then return error.
+            */
+            va_list clean_args;
+            
+            /* end the current list */
+            va_end(args);
+            
+            /* now start cleaning up */            
+            cur_arg = mp;
+            va_start(clean_args, mp);
+            while (n--) {
+                mp_clear(cur_arg);
+                cur_arg = va_arg(clean_args, mp_int*);
+            }
+            va_end(clean_args);
+            res = MP_MEM;
+            break;
+        }
+        n++;
+        cur_arg = va_arg(args, mp_int*);
+    }
+    va_end(args);
+    return res;                /* Assumed ok, if error flagged above. */
+}
+
+#endif
diff --git a/libtommath/bn_mp_init_set.c b/libtommath/bn_mp_init_set.c
new file mode 100644
index 0000000..0251e61
--- /dev/null
+++ b/libtommath/bn_mp_init_set.c
@@ -0,0 +1,28 @@
+#include <tommath.h>
+#ifdef BN_MP_INIT_SET_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* initialize and set a digit */
+int mp_init_set (mp_int * a, mp_digit b)
+{
+  int err;
+  if ((err = mp_init(a)) != MP_OKAY) {
+     return err;
+  }
+  mp_set(a, b);
+  return err;
+}
+#endif
diff --git a/libtommath/bn_mp_init_set_int.c b/libtommath/bn_mp_init_set_int.c
new file mode 100644
index 0000000..f59fd19
--- /dev/null
+++ b/libtommath/bn_mp_init_set_int.c
@@ -0,0 +1,27 @@
+#include <tommath.h>
+#ifdef BN_MP_INIT_SET_INT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* initialize and set a digit */
+int mp_init_set_int (mp_int * a, unsigned long b)
+{
+  int err;
+  if ((err = mp_init(a)) != MP_OKAY) {
+     return err;
+  }
+  return mp_set_int(a, b);
+}
+#endif
diff --git a/libtommath/bn_mp_init_size.c b/libtommath/bn_mp_init_size.c
new file mode 100644
index 0000000..845ce2c
--- /dev/null
+++ b/libtommath/bn_mp_init_size.c
@@ -0,0 +1,44 @@
+#include <tommath.h>
+#ifdef BN_MP_INIT_SIZE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* init an mp_init for a given size */
+int mp_init_size (mp_int * a, int size)
+{
+  int x;
+
+  /* pad size so there are always extra digits */
+  size += (MP_PREC * 2) - (size % MP_PREC);	
+  
+  /* alloc mem */
+  a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * size);
+  if (a->dp == NULL) {
+    return MP_MEM;
+  }
+
+  /* set the members */
+  a->used  = 0;
+  a->alloc = size;
+  a->sign  = MP_ZPOS;
+
+  /* zero the digits */
+  for (x = 0; x < size; x++) {
+      a->dp[x] = 0;
+  }
+
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_invmod.c b/libtommath/bn_mp_invmod.c
new file mode 100644
index 0000000..46118ad
--- /dev/null
+++ b/libtommath/bn_mp_invmod.c
@@ -0,0 +1,39 @@
+#include <tommath.h>
+#ifdef BN_MP_INVMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* hac 14.61, pp608 */
+int mp_invmod (mp_int * a, mp_int * b, mp_int * c)
+{
+  /* b cannot be negative */
+  if (b->sign == MP_NEG || mp_iszero(b) == 1) {
+    return MP_VAL;
+  }
+
+#ifdef BN_FAST_MP_INVMOD_C
+  /* if the modulus is odd we can use a faster routine instead */
+  if (mp_isodd (b) == 1) {
+    return fast_mp_invmod (a, b, c);
+  }
+#endif
+
+#ifdef BN_MP_INVMOD_SLOW_C
+  return mp_invmod_slow(a, b, c);
+#endif
+
+  return MP_VAL;
+}
+#endif
diff --git a/libtommath/bn_mp_invmod_slow.c b/libtommath/bn_mp_invmod_slow.c
new file mode 100644
index 0000000..c048655
--- /dev/null
+++ b/libtommath/bn_mp_invmod_slow.c
@@ -0,0 +1,171 @@
+#include <tommath.h>
+#ifdef BN_MP_INVMOD_SLOW_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* hac 14.61, pp608 */
+int mp_invmod_slow (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  x, y, u, v, A, B, C, D;
+  int     res;
+
+  /* b cannot be negative */
+  if (b->sign == MP_NEG || mp_iszero(b) == 1) {
+    return MP_VAL;
+  }
+
+  /* init temps */
+  if ((res = mp_init_multi(&x, &y, &u, &v, 
+                           &A, &B, &C, &D, NULL)) != MP_OKAY) {
+     return res;
+  }
+
+  /* x = a, y = b */
+  if ((res = mp_mod(a, b, &x)) != MP_OKAY) {
+      goto LBL_ERR;
+  }
+  if ((res = mp_copy (b, &y)) != MP_OKAY) {
+    goto LBL_ERR;
+  }
+
+  /* 2. [modified] if x,y are both even then return an error! */
+  if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
+    res = MP_VAL;
+    goto LBL_ERR;
+  }
+
+  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
+  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
+    goto LBL_ERR;
+  }
+  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
+    goto LBL_ERR;
+  }
+  mp_set (&A, 1);
+  mp_set (&D, 1);
+
+top:
+  /* 4.  while u is even do */
+  while (mp_iseven (&u) == 1) {
+    /* 4.1 u = u/2 */
+    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+    /* 4.2 if A or B is odd then */
+    if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) {
+      /* A = (A+y)/2, B = (B-x)/2 */
+      if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
+         goto LBL_ERR;
+      }
+      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
+         goto LBL_ERR;
+      }
+    }
+    /* A = A/2, B = B/2 */
+    if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+  }
+
+  /* 5.  while v is even do */
+  while (mp_iseven (&v) == 1) {
+    /* 5.1 v = v/2 */
+    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+    /* 5.2 if C or D is odd then */
+    if (mp_isodd (&C) == 1 || mp_isodd (&D) == 1) {
+      /* C = (C+y)/2, D = (D-x)/2 */
+      if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
+         goto LBL_ERR;
+      }
+      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
+         goto LBL_ERR;
+      }
+    }
+    /* C = C/2, D = D/2 */
+    if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+  }
+
+  /* 6.  if u >= v then */
+  if (mp_cmp (&u, &v) != MP_LT) {
+    /* u = u - v, A = A - C, B = B - D */
+    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+
+    if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+
+    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+  } else {
+    /* v - v - u, C = C - A, D = D - B */
+    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+
+    if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+
+    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+  }
+
+  /* if not zero goto step 4 */
+  if (mp_iszero (&u) == 0)
+    goto top;
+
+  /* now a = C, b = D, gcd == g*v */
+
+  /* if v != 1 then there is no inverse */
+  if (mp_cmp_d (&v, 1) != MP_EQ) {
+    res = MP_VAL;
+    goto LBL_ERR;
+  }
+
+  /* if its too low */
+  while (mp_cmp_d(&C, 0) == MP_LT) {
+      if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
+         goto LBL_ERR;
+      }
+  }
+  
+  /* too big */
+  while (mp_cmp_mag(&C, b) != MP_LT) {
+      if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
+         goto LBL_ERR;
+      }
+  }
+  
+  /* C is now the inverse */
+  mp_exch (&C, c);
+  res = MP_OKAY;
+LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
+  return res;
+}
+#endif
diff --git a/libtommath/bn_mp_is_square.c b/libtommath/bn_mp_is_square.c
new file mode 100644
index 0000000..969d237
--- /dev/null
+++ b/libtommath/bn_mp_is_square.c
@@ -0,0 +1,105 @@
+#include <tommath.h>
+#ifdef BN_MP_IS_SQUARE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* Check if remainders are possible squares - fast exclude non-squares */
+static const char rem_128[128] = {
+ 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1
+};
+
+static const char rem_105[105] = {
+ 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
+ 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
+ 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
+ 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
+ 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1
+};
+
+/* Store non-zero to ret if arg is square, and zero if not */
+int mp_is_square(mp_int *arg,int *ret) 
+{
+  int           res;
+  mp_digit      c;
+  mp_int        t;
+  unsigned long r;
+
+  /* Default to Non-square :) */
+  *ret = MP_NO; 
+
+  if (arg->sign == MP_NEG) {
+    return MP_VAL;
+  }
+
+  /* digits used?  (TSD) */
+  if (arg->used == 0) {
+     return MP_OKAY;
+  }
+
+  /* First check mod 128 (suppose that DIGIT_BIT is at least 7) */
+  if (rem_128[127 & DIGIT(arg,0)] == 1) {
+     return MP_OKAY;
+  }
+
+  /* Next check mod 105 (3*5*7) */
+  if ((res = mp_mod_d(arg,105,&c)) != MP_OKAY) {
+     return res;
+  }
+  if (rem_105[c] == 1) {
+     return MP_OKAY;
+  }
+
+
+  if ((res = mp_init_set_int(&t,11L*13L*17L*19L*23L*29L*31L)) != MP_OKAY) {
+     return res;
+  }
+  if ((res = mp_mod(arg,&t,&t)) != MP_OKAY) {
+     goto ERR;
+  }
+  r = mp_get_int(&t);
+  /* Check for other prime modules, note it's not an ERROR but we must
+   * free "t" so the easiest way is to goto ERR.  We know that res
+   * is already equal to MP_OKAY from the mp_mod call 
+   */ 
+  if ( (1L<<(r%11)) & 0x5C4L )             goto ERR;
+  if ( (1L<<(r%13)) & 0x9E4L )             goto ERR;
+  if ( (1L<<(r%17)) & 0x5CE8L )            goto ERR;
+  if ( (1L<<(r%19)) & 0x4F50CL )           goto ERR;
+  if ( (1L<<(r%23)) & 0x7ACCA0L )          goto ERR;
+  if ( (1L<<(r%29)) & 0xC2EDD0CL )         goto ERR;
+  if ( (1L<<(r%31)) & 0x6DE2B848L )        goto ERR;
+
+  /* Final check - is sqr(sqrt(arg)) == arg ? */
+  if ((res = mp_sqrt(arg,&t)) != MP_OKAY) {
+     goto ERR;
+  }
+  if ((res = mp_sqr(&t,&t)) != MP_OKAY) {
+     goto ERR;
+  }
+
+  *ret = (mp_cmp_mag(&t,arg) == MP_EQ) ? MP_YES : MP_NO;
+ERR:mp_clear(&t);
+  return res;
+}
+#endif
diff --git a/libtommath/bn_mp_jacobi.c b/libtommath/bn_mp_jacobi.c
new file mode 100644
index 0000000..74cbbf3
--- /dev/null
+++ b/libtommath/bn_mp_jacobi.c
@@ -0,0 +1,101 @@
+#include <tommath.h>
+#ifdef BN_MP_JACOBI_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* computes the jacobi c = (a | n) (or Legendre if n is prime)
+ * HAC pp. 73 Algorithm 2.149
+ */
+int mp_jacobi (mp_int * a, mp_int * p, int *c)
+{
+  mp_int  a1, p1;
+  int     k, s, r, res;
+  mp_digit residue;
+
+  /* if p <= 0 return MP_VAL */
+  if (mp_cmp_d(p, 0) != MP_GT) {
+     return MP_VAL;
+  }
+
+  /* step 1.  if a == 0, return 0 */
+  if (mp_iszero (a) == 1) {
+    *c = 0;
+    return MP_OKAY;
+  }
+
+  /* step 2.  if a == 1, return 1 */
+  if (mp_cmp_d (a, 1) == MP_EQ) {
+    *c = 1;
+    return MP_OKAY;
+  }
+
+  /* default */
+  s = 0;
+
+  /* step 3.  write a = a1 * 2**k  */
+  if ((res = mp_init_copy (&a1, a)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init (&p1)) != MP_OKAY) {
+    goto LBL_A1;
+  }
+
+  /* divide out larger power of two */
+  k = mp_cnt_lsb(&a1);
+  if ((res = mp_div_2d(&a1, k, &a1, NULL)) != MP_OKAY) {
+     goto LBL_P1;
+  }
+
+  /* step 4.  if e is even set s=1 */
+  if ((k & 1) == 0) {
+    s = 1;
+  } else {
+    /* else set s=1 if p = 1/7 (mod 8) or s=-1 if p = 3/5 (mod 8) */
+    residue = p->dp[0] & 7;
+
+    if (residue == 1 || residue == 7) {
+      s = 1;
+    } else if (residue == 3 || residue == 5) {
+      s = -1;
+    }
+  }
+
+  /* step 5.  if p == 3 (mod 4) *and* a1 == 3 (mod 4) then s = -s */
+  if ( ((p->dp[0] & 3) == 3) && ((a1.dp[0] & 3) == 3)) {
+    s = -s;
+  }
+
+  /* if a1 == 1 we're done */
+  if (mp_cmp_d (&a1, 1) == MP_EQ) {
+    *c = s;
+  } else {
+    /* n1 = n mod a1 */
+    if ((res = mp_mod (p, &a1, &p1)) != MP_OKAY) {
+      goto LBL_P1;
+    }
+    if ((res = mp_jacobi (&p1, &a1, &r)) != MP_OKAY) {
+      goto LBL_P1;
+    }
+    *c = s * r;
+  }
+
+  /* done */
+  res = MP_OKAY;
+LBL_P1:mp_clear (&p1);
+LBL_A1:mp_clear (&a1);
+  return res;
+}
+#endif
diff --git a/libtommath/bn_mp_karatsuba_mul.c b/libtommath/bn_mp_karatsuba_mul.c
new file mode 100644
index 0000000..daa78c7
--- /dev/null
+++ b/libtommath/bn_mp_karatsuba_mul.c
@@ -0,0 +1,163 @@
+#include <tommath.h>
+#ifdef BN_MP_KARATSUBA_MUL_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* c = |a| * |b| using Karatsuba Multiplication using 
+ * three half size multiplications
+ *
+ * Let B represent the radix [e.g. 2**DIGIT_BIT] and 
+ * let n represent half of the number of digits in 
+ * the min(a,b)
+ *
+ * a = a1 * B**n + a0
+ * b = b1 * B**n + b0
+ *
+ * Then, a * b => 
+   a1b1 * B**2n + ((a1 - a0)(b1 - b0) + a0b0 + a1b1) * B + a0b0
+ *
+ * Note that a1b1 and a0b0 are used twice and only need to be 
+ * computed once.  So in total three half size (half # of 
+ * digit) multiplications are performed, a0b0, a1b1 and 
+ * (a1-b1)(a0-b0)
+ *
+ * Note that a multiplication of half the digits requires
+ * 1/4th the number of single precision multiplications so in 
+ * total after one call 25% of the single precision multiplications 
+ * are saved.  Note also that the call to mp_mul can end up back 
+ * in this function if the a0, a1, b0, or b1 are above the threshold.  
+ * This is known as divide-and-conquer and leads to the famous 
+ * O(N**lg(3)) or O(N**1.584) work which is asymptopically lower than 
+ * the standard O(N**2) that the baseline/comba methods use.  
+ * Generally though the overhead of this method doesn't pay off 
+ * until a certain size (N ~ 80) is reached.
+ */
+int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  x0, x1, y0, y1, t1, x0y0, x1y1;
+  int     B, err;
+
+  /* default the return code to an error */
+  err = MP_MEM;
+
+  /* min # of digits */
+  B = MIN (a->used, b->used);
+
+  /* now divide in two */
+  B = B >> 1;
+
+  /* init copy all the temps */
+  if (mp_init_size (&x0, B) != MP_OKAY)
+    goto ERR;
+  if (mp_init_size (&x1, a->used - B) != MP_OKAY)
+    goto X0;
+  if (mp_init_size (&y0, B) != MP_OKAY)
+    goto X1;
+  if (mp_init_size (&y1, b->used - B) != MP_OKAY)
+    goto Y0;
+
+  /* init temps */
+  if (mp_init_size (&t1, B * 2) != MP_OKAY)
+    goto Y1;
+  if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
+    goto T1;
+  if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
+    goto X0Y0;
+
+  /* now shift the digits */
+  x0.used = y0.used = B;
+  x1.used = a->used - B;
+  y1.used = b->used - B;
+
+  {
+    register int x;
+    register mp_digit *tmpa, *tmpb, *tmpx, *tmpy;
+
+    /* we copy the digits directly instead of using higher level functions
+     * since we also need to shift the digits
+     */
+    tmpa = a->dp;
+    tmpb = b->dp;
+
+    tmpx = x0.dp;
+    tmpy = y0.dp;
+    for (x = 0; x < B; x++) {
+      *tmpx++ = *tmpa++;
+      *tmpy++ = *tmpb++;
+    }
+
+    tmpx = x1.dp;
+    for (x = B; x < a->used; x++) {
+      *tmpx++ = *tmpa++;
+    }
+
+    tmpy = y1.dp;
+    for (x = B; x < b->used; x++) {
+      *tmpy++ = *tmpb++;
+    }
+  }
+
+  /* only need to clamp the lower words since by definition the 
+   * upper words x1/y1 must have a known number of digits
+   */
+  mp_clamp (&x0);
+  mp_clamp (&y0);
+
+  /* now calc the products x0y0 and x1y1 */
+  /* after this x0 is no longer required, free temp [x0==t2]! */
+  if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)  
+    goto X1Y1;          /* x0y0 = x0*y0 */
+  if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
+    goto X1Y1;          /* x1y1 = x1*y1 */
+
+  /* now calc x1-x0 and y1-y0 */
+  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = x1 - x0 */
+  if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
+    goto X1Y1;          /* t2 = y1 - y0 */
+  if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = (x1 - x0) * (y1 - y0) */
+
+  /* add x0y0 */
+  if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
+    goto X1Y1;          /* t2 = x0y0 + x1y1 */
+  if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
+
+  /* shift by B */
+  if (mp_lshd (&t1, B) != MP_OKAY)
+    goto X1Y1;          /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
+  if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
+    goto X1Y1;          /* x1y1 = x1y1 << 2*B */
+
+  if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = x0y0 + t1 */
+  if (mp_add (&t1, &x1y1, c) != MP_OKAY)
+    goto X1Y1;          /* t1 = x0y0 + t1 + x1y1 */
+
+  /* Algorithm succeeded set the return code to MP_OKAY */
+  err = MP_OKAY;
+
+X1Y1:mp_clear (&x1y1);
+X0Y0:mp_clear (&x0y0);
+T1:mp_clear (&t1);
+Y1:mp_clear (&y1);
+Y0:mp_clear (&y0);
+X1:mp_clear (&x1);
+X0:mp_clear (&x0);
+ERR:
+  return err;
+}
+#endif
diff --git a/libtommath/bn_mp_karatsuba_sqr.c b/libtommath/bn_mp_karatsuba_sqr.c
new file mode 100644
index 0000000..315ceab
--- /dev/null
+++ b/libtommath/bn_mp_karatsuba_sqr.c
@@ -0,0 +1,117 @@
+#include <tommath.h>
+#ifdef BN_MP_KARATSUBA_SQR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* Karatsuba squaring, computes b = a*a using three 
+ * half size squarings
+ *
+ * See comments of karatsuba_mul for details.  It 
+ * is essentially the same algorithm but merely 
+ * tuned to perform recursive squarings.
+ */
+int mp_karatsuba_sqr (mp_int * a, mp_int * b)
+{
+  mp_int  x0, x1, t1, t2, x0x0, x1x1;
+  int     B, err;
+
+  err = MP_MEM;
+
+  /* min # of digits */
+  B = a->used;
+
+  /* now divide in two */
+  B = B >> 1;
+
+  /* init copy all the temps */
+  if (mp_init_size (&x0, B) != MP_OKAY)
+    goto ERR;
+  if (mp_init_size (&x1, a->used - B) != MP_OKAY)
+    goto X0;
+
+  /* init temps */
+  if (mp_init_size (&t1, a->used * 2) != MP_OKAY)
+    goto X1;
+  if (mp_init_size (&t2, a->used * 2) != MP_OKAY)
+    goto T1;
+  if (mp_init_size (&x0x0, B * 2) != MP_OKAY)
+    goto T2;
+  if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY)
+    goto X0X0;
+
+  {
+    register int x;
+    register mp_digit *dst, *src;
+
+    src = a->dp;
+
+    /* now shift the digits */
+    dst = x0.dp;
+    for (x = 0; x < B; x++) {
+      *dst++ = *src++;
+    }
+
+    dst = x1.dp;
+    for (x = B; x < a->used; x++) {
+      *dst++ = *src++;
+    }
+  }
+
+  x0.used = B;
+  x1.used = a->used - B;
+
+  mp_clamp (&x0);
+
+  /* now calc the products x0*x0 and x1*x1 */
+  if (mp_sqr (&x0, &x0x0) != MP_OKAY)
+    goto X1X1;           /* x0x0 = x0*x0 */
+  if (mp_sqr (&x1, &x1x1) != MP_OKAY)
+    goto X1X1;           /* x1x1 = x1*x1 */
+
+  /* now calc (x1-x0)**2 */
+  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+    goto X1X1;           /* t1 = x1 - x0 */
+  if (mp_sqr (&t1, &t1) != MP_OKAY)
+    goto X1X1;           /* t1 = (x1 - x0) * (x1 - x0) */
+
+  /* add x0y0 */
+  if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
+    goto X1X1;           /* t2 = x0x0 + x1x1 */
+  if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
+    goto X1X1;           /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */
+
+  /* shift by B */
+  if (mp_lshd (&t1, B) != MP_OKAY)
+    goto X1X1;           /* t1 = (x0x0 + x1x1 - (x1-x0)*(x1-x0))<<B */
+  if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
+    goto X1X1;           /* x1x1 = x1x1 << 2*B */
+
+  if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
+    goto X1X1;           /* t1 = x0x0 + t1 */
+  if (mp_add (&t1, &x1x1, b) != MP_OKAY)
+    goto X1X1;           /* t1 = x0x0 + t1 + x1x1 */
+
+  err = MP_OKAY;
+
+X1X1:mp_clear (&x1x1);
+X0X0:mp_clear (&x0x0);
+T2:mp_clear (&t2);
+T1:mp_clear (&t1);
+X1:mp_clear (&x1);
+X0:mp_clear (&x0);
+ERR:
+  return err;
+}
+#endif
diff --git a/libtommath/bn_mp_lcm.c b/libtommath/bn_mp_lcm.c
new file mode 100644
index 0000000..8e3a759
--- /dev/null
+++ b/libtommath/bn_mp_lcm.c
@@ -0,0 +1,56 @@
+#include <tommath.h>
+#ifdef BN_MP_LCM_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* computes least common multiple as |a*b|/(a, b) */
+int mp_lcm (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     res;
+  mp_int  t1, t2;
+
+
+  if ((res = mp_init_multi (&t1, &t2, NULL)) != MP_OKAY) {
+    return res;
+  }
+
+  /* t1 = get the GCD of the two inputs */
+  if ((res = mp_gcd (a, b, &t1)) != MP_OKAY) {
+    goto LBL_T;
+  }
+
+  /* divide the smallest by the GCD */
+  if (mp_cmp_mag(a, b) == MP_LT) {
+     /* store quotient in t2 such that t2 * b is the LCM */
+     if ((res = mp_div(a, &t1, &t2, NULL)) != MP_OKAY) {
+        goto LBL_T;
+     }
+     res = mp_mul(b, &t2, c);
+  } else {
+     /* store quotient in t2 such that t2 * a is the LCM */
+     if ((res = mp_div(b, &t1, &t2, NULL)) != MP_OKAY) {
+        goto LBL_T;
+     }
+     res = mp_mul(a, &t2, c);
+  }
+
+  /* fix the sign to positive */
+  c->sign = MP_ZPOS;
+
+LBL_T:
+  mp_clear_multi (&t1, &t2, NULL);
+  return res;
+}
+#endif
diff --git a/libtommath/bn_mp_lshd.c b/libtommath/bn_mp_lshd.c
new file mode 100644
index 0000000..398b648
--- /dev/null
+++ b/libtommath/bn_mp_lshd.c
@@ -0,0 +1,63 @@
+#include <tommath.h>
+#ifdef BN_MP_LSHD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* shift left a certain amount of digits */
+int mp_lshd (mp_int * a, int b)
+{
+  int     x, res;
+
+  /* if its less than zero return */
+  if (b <= 0) {
+    return MP_OKAY;
+  }
+
+  /* grow to fit the new digits */
+  if (a->alloc < a->used + b) {
+     if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
+       return res;
+     }
+  }
+
+  {
+    register mp_digit *top, *bottom;
+
+    /* increment the used by the shift amount then copy upwards */
+    a->used += b;
+
+    /* top */
+    top = a->dp + a->used - 1;
+
+    /* base */
+    bottom = a->dp + a->used - 1 - b;
+
+    /* much like mp_rshd this is implemented using a sliding window
+     * except the window goes the otherway around.  Copying from
+     * the bottom to the top.  see bn_mp_rshd.c for more info.
+     */
+    for (x = a->used - 1; x >= b; x--) {
+      *top-- = *bottom--;
+    }
+
+    /* zero the lower digits */
+    top = a->dp;
+    for (x = 0; x < b; x++) {
+      *top++ = 0;
+    }
+  }
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_mod.c b/libtommath/bn_mp_mod.c
new file mode 100644
index 0000000..75779bb
--- /dev/null
+++ b/libtommath/bn_mp_mod.c
@@ -0,0 +1,44 @@
+#include <tommath.h>
+#ifdef BN_MP_MOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* c = a mod b, 0 <= c < b */
+int
+mp_mod (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  t;
+  int     res;
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_div (a, b, NULL, &t)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+
+  if (t.sign != b->sign) {
+    res = mp_add (b, &t, c);
+  } else {
+    res = MP_OKAY;
+    mp_exch (&t, c);
+  }
+
+  mp_clear (&t);
+  return res;
+}
+#endif
diff --git a/libtommath/bn_mp_mod_2d.c b/libtommath/bn_mp_mod_2d.c
new file mode 100644
index 0000000..589e4ba
--- /dev/null
+++ b/libtommath/bn_mp_mod_2d.c
@@ -0,0 +1,51 @@
+#include <tommath.h>
+#ifdef BN_MP_MOD_2D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* calc a value mod 2**b */
+int
+mp_mod_2d (mp_int * a, int b, mp_int * c)
+{
+  int     x, res;
+
+  /* if b is <= 0 then zero the int */
+  if (b <= 0) {
+    mp_zero (c);
+    return MP_OKAY;
+  }
+
+  /* if the modulus is larger than the value than return */
+  if (b >= (int) (a->used * DIGIT_BIT)) {
+    res = mp_copy (a, c);
+    return res;
+  }
+
+  /* copy */
+  if ((res = mp_copy (a, c)) != MP_OKAY) {
+    return res;
+  }
+
+  /* zero digits above the last digit of the modulus */
+  for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
+    c->dp[x] = 0;
+  }
+  /* clear the digit that is not completely outside/inside the modulus */
+  c->dp[b / DIGIT_BIT] &=
+    (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digit) 1));
+  mp_clamp (c);
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_mod_d.c b/libtommath/bn_mp_mod_d.c
new file mode 100644
index 0000000..8a2ad24
--- /dev/null
+++ b/libtommath/bn_mp_mod_d.c
@@ -0,0 +1,23 @@
+#include <tommath.h>
+#ifdef BN_MP_MOD_D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+int
+mp_mod_d (mp_int * a, mp_digit b, mp_digit * c)
+{
+  return mp_div_d(a, b, NULL, c);
+}
+#endif
diff --git a/libtommath/bn_mp_montgomery_calc_normalization.c b/libtommath/bn_mp_montgomery_calc_normalization.c
new file mode 100644
index 0000000..e2efc34
--- /dev/null
+++ b/libtommath/bn_mp_montgomery_calc_normalization.c
@@ -0,0 +1,55 @@
+#include <tommath.h>
+#ifdef BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/*
+ * shifts with subtractions when the result is greater than b.
+ *
+ * The method is slightly modified to shift B unconditionally upto just under
+ * the leading bit of b.  This saves alot of multiple precision shifting.
+ */
+int mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
+{
+  int     x, bits, res;
+
+  /* how many bits of last digit does b use */
+  bits = mp_count_bits (b) % DIGIT_BIT;
+
+  if (b->used > 1) {
+     if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
+        return res;
+     }
+  } else {
+     mp_set(a, 1);
+     bits = 1;
+  }
+
+
+  /* now compute C = A * B mod b */
+  for (x = bits - 1; x < (int)DIGIT_BIT; x++) {
+    if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
+      return res;
+    }
+    if (mp_cmp_mag (a, b) != MP_LT) {
+      if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
+        return res;
+      }
+    }
+  }
+
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_montgomery_reduce.c b/libtommath/bn_mp_montgomery_reduce.c
new file mode 100644
index 0000000..3095fa7
--- /dev/null
+++ b/libtommath/bn_mp_montgomery_reduce.c
@@ -0,0 +1,114 @@
+#include <tommath.h>
+#ifdef BN_MP_MONTGOMERY_REDUCE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* computes xR**-1 == x (mod N) via Montgomery Reduction */
+int
+mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
+{
+  int     ix, res, digs;
+  mp_digit mu;
+
+  /* can the fast reduction [comba] method be used?
+   *
+   * Note that unlike in mul you're safely allowed *less*
+   * than the available columns [255 per default] since carries
+   * are fixed up in the inner loop.
+   */
+  digs = n->used * 2 + 1;
+  if ((digs < MP_WARRAY) &&
+      n->used <
+      (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+    return fast_mp_montgomery_reduce (x, n, rho);
+  }
+
+  /* grow the input as required */
+  if (x->alloc < digs) {
+    if ((res = mp_grow (x, digs)) != MP_OKAY) {
+      return res;
+    }
+  }
+  x->used = digs;
+
+  for (ix = 0; ix < n->used; ix++) {
+    /* mu = ai * rho mod b
+     *
+     * The value of rho must be precalculated via
+     * montgomery_setup() such that
+     * it equals -1/n0 mod b this allows the
+     * following inner loop to reduce the
+     * input one digit at a time
+     */
+    mu = (mp_digit) (((mp_word)x->dp[ix]) * ((mp_word)rho) & MP_MASK);
+
+    /* a = a + mu * m * b**i */
+    {
+      register int iy;
+      register mp_digit *tmpn, *tmpx, u;
+      register mp_word r;
+
+      /* alias for digits of the modulus */
+      tmpn = n->dp;
+
+      /* alias for the digits of x [the input] */
+      tmpx = x->dp + ix;
+
+      /* set the carry to zero */
+      u = 0;
+
+      /* Multiply and add in place */
+      for (iy = 0; iy < n->used; iy++) {
+        /* compute product and sum */
+        r       = ((mp_word)mu) * ((mp_word)*tmpn++) +
+                  ((mp_word) u) + ((mp_word) * tmpx);
+
+        /* get carry */
+        u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+
+        /* fix digit */
+        *tmpx++ = (mp_digit)(r & ((mp_word) MP_MASK));
+      }
+      /* At this point the ix'th digit of x should be zero */
+
+
+      /* propagate carries upwards as required*/
+      while (u) {
+        *tmpx   += u;
+        u        = *tmpx >> DIGIT_BIT;
+        *tmpx++ &= MP_MASK;
+      }
+    }
+  }
+
+  /* at this point the n.used'th least
+   * significant digits of x are all zero
+   * which means we can shift x to the
+   * right by n.used digits and the
+   * residue is unchanged.
+   */
+
+  /* x = x/b**n.used */
+  mp_clamp(x);
+  mp_rshd (x, n->used);
+
+  /* if x >= n then x = x - n */
+  if (mp_cmp_mag (x, n) != MP_LT) {
+    return s_mp_sub (x, n, x);
+  }
+
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_montgomery_setup.c b/libtommath/bn_mp_montgomery_setup.c
new file mode 100644
index 0000000..9dfc087
--- /dev/null
+++ b/libtommath/bn_mp_montgomery_setup.c
@@ -0,0 +1,55 @@
+#include <tommath.h>
+#ifdef BN_MP_MONTGOMERY_SETUP_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* setups the montgomery reduction stuff */
+int
+mp_montgomery_setup (mp_int * n, mp_digit * rho)
+{
+  mp_digit x, b;
+
+/* fast inversion mod 2**k
+ *
+ * Based on the fact that
+ *
+ * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
+ *                    =>  2*X*A - X*X*A*A = 1
+ *                    =>  2*(1) - (1)     = 1
+ */
+  b = n->dp[0];
+
+  if ((b & 1) == 0) {
+    return MP_VAL;
+  }
+
+  x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
+  x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
+#if !defined(MP_8BIT)
+  x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
+#endif
+#if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT))
+  x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
+#endif
+#ifdef MP_64BIT
+  x *= 2 - b * x;               /* here x*a==1 mod 2**64 */
+#endif
+
+  /* rho = -1/m mod b */
+  *rho = (((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
+
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_mul.c b/libtommath/bn_mp_mul.c
new file mode 100644
index 0000000..f9cfa09
--- /dev/null
+++ b/libtommath/bn_mp_mul.c
@@ -0,0 +1,62 @@
+#include <tommath.h>
+#ifdef BN_MP_MUL_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* high level multiplication (handles sign) */
+int mp_mul (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     res, neg;
+  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
+
+  /* use Toom-Cook? */
+#ifdef BN_MP_TOOM_MUL_C
+  if (MIN (a->used, b->used) >= TOOM_MUL_CUTOFF) {
+    res = mp_toom_mul(a, b, c);
+  } else 
+#endif
+#ifdef BN_MP_KARATSUBA_MUL_C
+  /* use Karatsuba? */
+  if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) {
+    res = mp_karatsuba_mul (a, b, c);
+  } else 
+#endif
+  {
+    /* can we use the fast multiplier?
+     *
+     * The fast multiplier can be used if the output will 
+     * have less than MP_WARRAY digits and the number of 
+     * digits won't affect carry propagation
+     */
+    int     digs = a->used + b->used + 1;
+
+#ifdef BN_FAST_S_MP_MUL_DIGS_C
+    if ((digs < MP_WARRAY) &&
+        MIN(a->used, b->used) <= 
+        (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+      res = fast_s_mp_mul_digs (a, b, c, digs);
+    } else 
+#endif
+#ifdef BN_S_MP_MUL_DIGS_C
+      res = s_mp_mul (a, b, c); /* uses s_mp_mul_digs */
+#else
+      res = MP_VAL;
+#endif
+
+  }
+  c->sign = (c->used > 0) ? neg : MP_ZPOS;
+  return res;
+}
+#endif
diff --git a/libtommath/bn_mp_mul_2.c b/libtommath/bn_mp_mul_2.c
new file mode 100644
index 0000000..6936681
--- /dev/null
+++ b/libtommath/bn_mp_mul_2.c
@@ -0,0 +1,78 @@
+#include <tommath.h>
+#ifdef BN_MP_MUL_2_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* b = a*2 */
+int mp_mul_2(mp_int * a, mp_int * b)
+{
+  int     x, res, oldused;
+
+  /* grow to accomodate result */
+  if (b->alloc < a->used + 1) {
+    if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  oldused = b->used;
+  b->used = a->used;
+
+  {
+    register mp_digit r, rr, *tmpa, *tmpb;
+
+    /* alias for source */
+    tmpa = a->dp;
+    
+    /* alias for dest */
+    tmpb = b->dp;
+
+    /* carry */
+    r = 0;
+    for (x = 0; x < a->used; x++) {
+    
+      /* get what will be the *next* carry bit from the 
+       * MSB of the current digit 
+       */
+      rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
+      
+      /* now shift up this digit, add in the carry [from the previous] */
+      *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
+      
+      /* copy the carry that would be from the source 
+       * digit into the next iteration 
+       */
+      r = rr;
+    }
+
+    /* new leading digit? */
+    if (r != 0) {
+      /* add a MSB which is always 1 at this point */
+      *tmpb = 1;
+      ++(b->used);
+    }
+
+    /* now zero any excess digits on the destination 
+     * that we didn't write to 
+     */
+    tmpb = b->dp + b->used;
+    for (x = b->used; x < oldused; x++) {
+      *tmpb++ = 0;
+    }
+  }
+  b->sign = a->sign;
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_mul_2d.c b/libtommath/bn_mp_mul_2d.c
new file mode 100644
index 0000000..04cb8dd
--- /dev/null
+++ b/libtommath/bn_mp_mul_2d.c
@@ -0,0 +1,81 @@
+#include <tommath.h>
+#ifdef BN_MP_MUL_2D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* shift left by a certain bit count */
+int mp_mul_2d (mp_int * a, int b, mp_int * c)
+{
+  mp_digit d;
+  int      res;
+
+  /* copy */
+  if (a != c) {
+     if ((res = mp_copy (a, c)) != MP_OKAY) {
+       return res;
+     }
+  }
+
+  if (c->alloc < (int)(c->used + b/DIGIT_BIT + 1)) {
+     if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
+       return res;
+     }
+  }
+
+  /* shift by as many digits in the bit count */
+  if (b >= (int)DIGIT_BIT) {
+    if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* shift any bit count < DIGIT_BIT */
+  d = (mp_digit) (b % DIGIT_BIT);
+  if (d != 0) {
+    register mp_digit *tmpc, shift, mask, r, rr;
+    register int x;
+
+    /* bitmask for carries */
+    mask = (((mp_digit)1) << d) - 1;
+
+    /* shift for msbs */
+    shift = DIGIT_BIT - d;
+
+    /* alias */
+    tmpc = c->dp;
+
+    /* carry */
+    r    = 0;
+    for (x = 0; x < c->used; x++) {
+      /* get the higher bits of the current word */
+      rr = (*tmpc >> shift) & mask;
+
+      /* shift the current word and OR in the carry */
+      *tmpc = ((*tmpc << d) | r) & MP_MASK;
+      ++tmpc;
+
+      /* set the carry to the carry bits of the current word */
+      r = rr;
+    }
+    
+    /* set final carry */
+    if (r != 0) {
+       c->dp[(c->used)++] = r;
+    }
+  }
+  mp_clamp (c);
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_mul_d.c b/libtommath/bn_mp_mul_d.c
new file mode 100644
index 0000000..9e11eef
--- /dev/null
+++ b/libtommath/bn_mp_mul_d.c
@@ -0,0 +1,75 @@
+#include <tommath.h>
+#ifdef BN_MP_MUL_D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* multiply by a digit */
+int
+mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
+{
+  mp_digit u, *tmpa, *tmpc;
+  mp_word  r;
+  int      ix, res, olduse;
+
+  /* make sure c is big enough to hold a*b */
+  if (c->alloc < a->used + 1) {
+    if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* get the original destinations used count */
+  olduse = c->used;
+
+  /* set the sign */
+  c->sign = a->sign;
+
+  /* alias for a->dp [source] */
+  tmpa = a->dp;
+
+  /* alias for c->dp [dest] */
+  tmpc = c->dp;
+
+  /* zero carry */
+  u = 0;
+
+  /* compute columns */
+  for (ix = 0; ix < a->used; ix++) {
+    /* compute product and carry sum for this term */
+    r       = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
+
+    /* mask off higher bits to get a single digit */
+    *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
+
+    /* send carry into next iteration */
+    u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+  }
+
+  /* store final carry [if any] and increment ix offset  */
+  *tmpc++ = u;
+  ++ix;
+
+  /* now zero digits above the top */
+  while (ix++ < olduse) {
+     *tmpc++ = 0;
+  }
+
+  /* set used count */
+  c->used = a->used + 1;
+  mp_clamp(c);
+
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_mulmod.c b/libtommath/bn_mp_mulmod.c
new file mode 100644
index 0000000..d34e90a
--- /dev/null
+++ b/libtommath/bn_mp_mulmod.c
@@ -0,0 +1,37 @@
+#include <tommath.h>
+#ifdef BN_MP_MULMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* d = a * b (mod c) */
+int
+mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+  int     res;
+  mp_int  t;
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+  res = mp_mod (&t, c, d);
+  mp_clear (&t);
+  return res;
+}
+#endif
diff --git a/libtommath/bn_mp_n_root.c b/libtommath/bn_mp_n_root.c
new file mode 100644
index 0000000..7b11aa2
--- /dev/null
+++ b/libtommath/bn_mp_n_root.c
@@ -0,0 +1,128 @@
+#include <tommath.h>
+#ifdef BN_MP_N_ROOT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* find the n'th root of an integer 
+ *
+ * Result found such that (c)**b <= a and (c+1)**b > a 
+ *
+ * This algorithm uses Newton's approximation 
+ * x[i+1] = x[i] - f(x[i])/f'(x[i]) 
+ * which will find the root in log(N) time where 
+ * each step involves a fair bit.  This is not meant to 
+ * find huge roots [square and cube, etc].
+ */
+int mp_n_root (mp_int * a, mp_digit b, mp_int * c)
+{
+  mp_int  t1, t2, t3;
+  int     res, neg;
+
+  /* input must be positive if b is even */
+  if ((b & 1) == 0 && a->sign == MP_NEG) {
+    return MP_VAL;
+  }
+
+  if ((res = mp_init (&t1)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init (&t2)) != MP_OKAY) {
+    goto LBL_T1;
+  }
+
+  if ((res = mp_init (&t3)) != MP_OKAY) {
+    goto LBL_T2;
+  }
+
+  /* if a is negative fudge the sign but keep track */
+  neg     = a->sign;
+  a->sign = MP_ZPOS;
+
+  /* t2 = 2 */
+  mp_set (&t2, 2);
+
+  do {
+    /* t1 = t2 */
+    if ((res = mp_copy (&t2, &t1)) != MP_OKAY) {
+      goto LBL_T3;
+    }
+
+    /* t2 = t1 - ((t1**b - a) / (b * t1**(b-1))) */
+    
+    /* t3 = t1**(b-1) */
+    if ((res = mp_expt_d (&t1, b - 1, &t3)) != MP_OKAY) {   
+      goto LBL_T3;
+    }
+
+    /* numerator */
+    /* t2 = t1**b */
+    if ((res = mp_mul (&t3, &t1, &t2)) != MP_OKAY) {    
+      goto LBL_T3;
+    }
+
+    /* t2 = t1**b - a */
+    if ((res = mp_sub (&t2, a, &t2)) != MP_OKAY) {  
+      goto LBL_T3;
+    }
+
+    /* denominator */
+    /* t3 = t1**(b-1) * b  */
+    if ((res = mp_mul_d (&t3, b, &t3)) != MP_OKAY) {    
+      goto LBL_T3;
+    }
+
+    /* t3 = (t1**b - a)/(b * t1**(b-1)) */
+    if ((res = mp_div (&t2, &t3, &t3, NULL)) != MP_OKAY) {  
+      goto LBL_T3;
+    }
+
+    if ((res = mp_sub (&t1, &t3, &t2)) != MP_OKAY) {
+      goto LBL_T3;
+    }
+  }  while (mp_cmp (&t1, &t2) != MP_EQ);
+
+  /* result can be off by a few so check */
+  for (;;) {
+    if ((res = mp_expt_d (&t1, b, &t2)) != MP_OKAY) {
+      goto LBL_T3;
+    }
+
+    if (mp_cmp (&t2, a) == MP_GT) {
+      if ((res = mp_sub_d (&t1, 1, &t1)) != MP_OKAY) {
+         goto LBL_T3;
+      }
+    } else {
+      break;
+    }
+  }
+
+  /* reset the sign of a first */
+  a->sign = neg;
+
+  /* set the result */
+  mp_exch (&t1, c);
+
+  /* set the sign of the result */
+  c->sign = neg;
+
+  res = MP_OKAY;
+
+LBL_T3:mp_clear (&t3);
+LBL_T2:mp_clear (&t2);
+LBL_T1:mp_clear (&t1);
+  return res;
+}
+#endif
diff --git a/libtommath/bn_mp_neg.c b/libtommath/bn_mp_neg.c
new file mode 100644
index 0000000..159cd74
--- /dev/null
+++ b/libtommath/bn_mp_neg.c
@@ -0,0 +1,36 @@
+#include <tommath.h>
+#ifdef BN_MP_NEG_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* b = -a */
+int mp_neg (mp_int * a, mp_int * b)
+{
+  int     res;
+  if (a != b) {
+     if ((res = mp_copy (a, b)) != MP_OKAY) {
+        return res;
+     }
+  }
+
+  if (mp_iszero(b) != MP_YES) {
+     b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+  } else {
+     b->sign = MP_ZPOS;
+  }
+
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_or.c b/libtommath/bn_mp_or.c
new file mode 100644
index 0000000..dccee7e
--- /dev/null
+++ b/libtommath/bn_mp_or.c
@@ -0,0 +1,46 @@
+#include <tommath.h>
+#ifdef BN_MP_OR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* OR two ints together */
+int mp_or (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     res, ix, px;
+  mp_int  t, *x;
+
+  if (a->used > b->used) {
+    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+      return res;
+    }
+    px = b->used;
+    x = b;
+  } else {
+    if ((res = mp_init_copy (&t, b)) != MP_OKAY) {
+      return res;
+    }
+    px = a->used;
+    x = a;
+  }
+
+  for (ix = 0; ix < px; ix++) {
+    t.dp[ix] |= x->dp[ix];
+  }
+  mp_clamp (&t);
+  mp_exch (c, &t);
+  mp_clear (&t);
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_prime_fermat.c b/libtommath/bn_mp_prime_fermat.c
new file mode 100644
index 0000000..fd74dbe
--- /dev/null
+++ b/libtommath/bn_mp_prime_fermat.c
@@ -0,0 +1,58 @@
+#include <tommath.h>
+#ifdef BN_MP_PRIME_FERMAT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* performs one Fermat test.
+ * 
+ * If "a" were prime then b**a == b (mod a) since the order of
+ * the multiplicative sub-group would be phi(a) = a-1.  That means
+ * it would be the same as b**(a mod (a-1)) == b**1 == b (mod a).
+ *
+ * Sets result to 1 if the congruence holds, or zero otherwise.
+ */
+int mp_prime_fermat (mp_int * a, mp_int * b, int *result)
+{
+  mp_int  t;
+  int     err;
+
+  /* default to composite  */
+  *result = MP_NO;
+
+  /* ensure b > 1 */
+  if (mp_cmp_d(b, 1) != MP_GT) {
+     return MP_VAL;
+  }
+
+  /* init t */
+  if ((err = mp_init (&t)) != MP_OKAY) {
+    return err;
+  }
+
+  /* compute t = b**a mod a */
+  if ((err = mp_exptmod (b, a, a, &t)) != MP_OKAY) {
+    goto LBL_T;
+  }
+
+  /* is it equal to b? */
+  if (mp_cmp (&t, b) == MP_EQ) {
+    *result = MP_YES;
+  }
+
+  err = MP_OKAY;
+LBL_T:mp_clear (&t);
+  return err;
+}
+#endif
diff --git a/libtommath/bn_mp_prime_is_divisible.c b/libtommath/bn_mp_prime_is_divisible.c
new file mode 100644
index 0000000..f85fe7c
--- /dev/null
+++ b/libtommath/bn_mp_prime_is_divisible.c
@@ -0,0 +1,46 @@
+#include <tommath.h>
+#ifdef BN_MP_PRIME_IS_DIVISIBLE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* determines if an integers is divisible by one 
+ * of the first PRIME_SIZE primes or not
+ *
+ * sets result to 0 if not, 1 if yes
+ */
+int mp_prime_is_divisible (mp_int * a, int *result)
+{
+  int     err, ix;
+  mp_digit res;
+
+  /* default to not */
+  *result = MP_NO;
+
+  for (ix = 0; ix < PRIME_SIZE; ix++) {
+    /* what is a mod LBL_prime_tab[ix] */
+    if ((err = mp_mod_d (a, ltm_prime_tab[ix], &res)) != MP_OKAY) {
+      return err;
+    }
+
+    /* is the residue zero? */
+    if (res == 0) {
+      *result = MP_YES;
+      return MP_OKAY;
+    }
+  }
+
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_prime_is_prime.c b/libtommath/bn_mp_prime_is_prime.c
new file mode 100644
index 0000000..188053a
--- /dev/null
+++ b/libtommath/bn_mp_prime_is_prime.c
@@ -0,0 +1,79 @@
+#include <tommath.h>
+#ifdef BN_MP_PRIME_IS_PRIME_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* performs a variable number of rounds of Miller-Rabin
+ *
+ * Probability of error after t rounds is no more than
+
+ *
+ * Sets result to 1 if probably prime, 0 otherwise
+ */
+int mp_prime_is_prime (mp_int * a, int t, int *result)
+{
+  mp_int  b;
+  int     ix, err, res;
+
+  /* default to no */
+  *result = MP_NO;
+
+  /* valid value of t? */
+  if (t <= 0 || t > PRIME_SIZE) {
+    return MP_VAL;
+  }
+
+  /* is the input equal to one of the primes in the table? */
+  for (ix = 0; ix < PRIME_SIZE; ix++) {
+      if (mp_cmp_d(a, ltm_prime_tab[ix]) == MP_EQ) {
+         *result = 1;
+         return MP_OKAY;
+      }
+  }
+
+  /* first perform trial division */
+  if ((err = mp_prime_is_divisible (a, &res)) != MP_OKAY) {
+    return err;
+  }
+
+  /* return if it was trivially divisible */
+  if (res == MP_YES) {
+    return MP_OKAY;
+  }
+
+  /* now perform the miller-rabin rounds */
+  if ((err = mp_init (&b)) != MP_OKAY) {
+    return err;
+  }
+
+  for (ix = 0; ix < t; ix++) {
+    /* set the prime */
+    mp_set (&b, ltm_prime_tab[ix]);
+
+    if ((err = mp_prime_miller_rabin (a, &b, &res)) != MP_OKAY) {
+      goto LBL_B;
+    }
+
+    if (res == MP_NO) {
+      goto LBL_B;
+    }
+  }
+
+  /* passed the test */
+  *result = MP_YES;
+LBL_B:mp_clear (&b);
+  return err;
+}
+#endif
diff --git a/libtommath/bn_mp_prime_miller_rabin.c b/libtommath/bn_mp_prime_miller_rabin.c
new file mode 100644
index 0000000..758a2c3
--- /dev/null
+++ b/libtommath/bn_mp_prime_miller_rabin.c
@@ -0,0 +1,99 @@
+#include <tommath.h>
+#ifdef BN_MP_PRIME_MILLER_RABIN_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* Miller-Rabin test of "a" to the base of "b" as described in 
+ * HAC pp. 139 Algorithm 4.24
+ *
+ * Sets result to 0 if definitely composite or 1 if probably prime.
+ * Randomly the chance of error is no more than 1/4 and often 
+ * very much lower.
+ */
+int mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
+{
+  mp_int  n1, y, r;
+  int     s, j, err;
+
+  /* default */
+  *result = MP_NO;
+
+  /* ensure b > 1 */
+  if (mp_cmp_d(b, 1) != MP_GT) {
+     return MP_VAL;
+  }     
+
+  /* get n1 = a - 1 */
+  if ((err = mp_init_copy (&n1, a)) != MP_OKAY) {
+    return err;
+  }
+  if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) {
+    goto LBL_N1;
+  }
+
+  /* set 2**s * r = n1 */
+  if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
+    goto LBL_N1;
+  }
+
+  /* count the number of least significant bits
+   * which are zero
+   */
+  s = mp_cnt_lsb(&r);
+
+  /* now divide n - 1 by 2**s */
+  if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) {
+    goto LBL_R;
+  }
+
+  /* compute y = b**r mod a */
+  if ((err = mp_init (&y)) != MP_OKAY) {
+    goto LBL_R;
+  }
+  if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) {
+    goto LBL_Y;
+  }
+
+  /* if y != 1 and y != n1 do */
+  if (mp_cmp_d (&y, 1) != MP_EQ && mp_cmp (&y, &n1) != MP_EQ) {
+    j = 1;
+    /* while j <= s-1 and y != n1 */
+    while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) {
+      if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) {
+         goto LBL_Y;
+      }
+
+      /* if y == 1 then composite */
+      if (mp_cmp_d (&y, 1) == MP_EQ) {
+         goto LBL_Y;
+      }
+
+      ++j;
+    }
+
+    /* if y != n1 then composite */
+    if (mp_cmp (&y, &n1) != MP_EQ) {
+      goto LBL_Y;
+    }
+  }
+
+  /* probably prime now */
+  *result = MP_YES;
+LBL_Y:mp_clear (&y);
+LBL_R:mp_clear (&r);
+LBL_N1:mp_clear (&n1);
+  return err;
+}
+#endif
diff --git a/libtommath/bn_mp_prime_next_prime.c b/libtommath/bn_mp_prime_next_prime.c
new file mode 100644
index 0000000..24f93c4
--- /dev/null
+++ b/libtommath/bn_mp_prime_next_prime.c
@@ -0,0 +1,166 @@
+#include <tommath.h>
+#ifdef BN_MP_PRIME_NEXT_PRIME_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* finds the next prime after the number "a" using "t" trials
+ * of Miller-Rabin.
+ *
+ * bbs_style = 1 means the prime must be congruent to 3 mod 4
+ */
+int mp_prime_next_prime(mp_int *a, int t, int bbs_style)
+{
+   int      err, res, x, y;
+   mp_digit res_tab[PRIME_SIZE], step, kstep;
+   mp_int   b;
+
+   /* ensure t is valid */
+   if (t <= 0 || t > PRIME_SIZE) {
+      return MP_VAL;
+   }
+
+   /* force positive */
+   a->sign = MP_ZPOS;
+
+   /* simple algo if a is less than the largest prime in the table */
+   if (mp_cmp_d(a, ltm_prime_tab[PRIME_SIZE-1]) == MP_LT) {
+      /* find which prime it is bigger than */
+      for (x = PRIME_SIZE - 2; x >= 0; x--) {
+          if (mp_cmp_d(a, ltm_prime_tab[x]) != MP_LT) {
+             if (bbs_style == 1) {
+                /* ok we found a prime smaller or
+                 * equal [so the next is larger]
+                 *
+                 * however, the prime must be
+                 * congruent to 3 mod 4
+                 */
+                if ((ltm_prime_tab[x + 1] & 3) != 3) {
+                   /* scan upwards for a prime congruent to 3 mod 4 */
+                   for (y = x + 1; y < PRIME_SIZE; y++) {
+                       if ((ltm_prime_tab[y] & 3) == 3) {
+                          mp_set(a, ltm_prime_tab[y]);
+                          return MP_OKAY;
+                       }
+                   }
+                }
+             } else {
+                mp_set(a, ltm_prime_tab[x + 1]);
+                return MP_OKAY;
+             }
+          }
+      }
+      /* at this point a maybe 1 */
+      if (mp_cmp_d(a, 1) == MP_EQ) {
+         mp_set(a, 2);
+         return MP_OKAY;
+      }
+      /* fall through to the sieve */
+   }
+
+   /* generate a prime congruent to 3 mod 4 or 1/3 mod 4? */
+   if (bbs_style == 1) {
+      kstep   = 4;
+   } else {
+      kstep   = 2;
+   }
+
+   /* at this point we will use a combination of a sieve and Miller-Rabin */
+
+   if (bbs_style == 1) {
+      /* if a mod 4 != 3 subtract the correct value to make it so */
+      if ((a->dp[0] & 3) != 3) {
+         if ((err = mp_sub_d(a, (a->dp[0] & 3) + 1, a)) != MP_OKAY) { return err; };
+      }
+   } else {
+      if (mp_iseven(a) == 1) {
+         /* force odd */
+         if ((err = mp_sub_d(a, 1, a)) != MP_OKAY) {
+            return err;
+         }
+      }
+   }
+
+   /* generate the restable */
+   for (x = 1; x < PRIME_SIZE; x++) {
+      if ((err = mp_mod_d(a, ltm_prime_tab[x], res_tab + x)) != MP_OKAY) {
+         return err;
+      }
+   }
+
+   /* init temp used for Miller-Rabin Testing */
+   if ((err = mp_init(&b)) != MP_OKAY) {
+      return err;
+   }
+
+   for (;;) {
+      /* skip to the next non-trivially divisible candidate */
+      step = 0;
+      do {
+         /* y == 1 if any residue was zero [e.g. cannot be prime] */
+         y     =  0;
+
+         /* increase step to next candidate */
+         step += kstep;
+
+         /* compute the new residue without using division */
+         for (x = 1; x < PRIME_SIZE; x++) {
+             /* add the step to each residue */
+             res_tab[x] += kstep;
+
+             /* subtract the modulus [instead of using division] */
+             if (res_tab[x] >= ltm_prime_tab[x]) {
+                res_tab[x]  -= ltm_prime_tab[x];
+             }
+
+             /* set flag if zero */
+             if (res_tab[x] == 0) {
+                y = 1;
+             }
+         }
+      } while (y == 1 && step < ((((mp_digit)1)<<DIGIT_BIT) - kstep));
+
+      /* add the step */
+      if ((err = mp_add_d(a, step, a)) != MP_OKAY) {
+         goto LBL_ERR;
+      }
+
+      /* if didn't pass sieve and step == MAX then skip test */
+      if (y == 1 && step >= ((((mp_digit)1)<<DIGIT_BIT) - kstep)) {
+         continue;
+      }
+
+      /* is this prime? */
+      for (x = 0; x < t; x++) {
+          mp_set(&b, ltm_prime_tab[t]);
+          if ((err = mp_prime_miller_rabin(a, &b, &res)) != MP_OKAY) {
+             goto LBL_ERR;
+          }
+          if (res == MP_NO) {
+             break;
+          }
+      }
+
+      if (res == MP_YES) {
+         break;
+      }
+   }
+
+   err = MP_OKAY;
+LBL_ERR:
+   mp_clear(&b);
+   return err;
+}
+
+#endif
diff --git a/libtommath/bn_mp_prime_rabin_miller_trials.c b/libtommath/bn_mp_prime_rabin_miller_trials.c
new file mode 100644
index 0000000..d1d0867
--- /dev/null
+++ b/libtommath/bn_mp_prime_rabin_miller_trials.c
@@ -0,0 +1,48 @@
+#include <tommath.h>
+#ifdef BN_MP_PRIME_RABIN_MILLER_TRIALS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+
+static const struct {
+   int k, t;
+} sizes[] = {
+{   128,    28 },
+{   256,    16 },
+{   384,    10 },
+{   512,     7 },
+{   640,     6 },
+{   768,     5 },
+{   896,     4 },
+{  1024,     4 }
+};
+
+/* returns # of RM trials required for a given bit size */
+int mp_prime_rabin_miller_trials(int size)
+{
+   int x;
+
+   for (x = 0; x < (int)(sizeof(sizes)/(sizeof(sizes[0]))); x++) {
+       if (sizes[x].k == size) {
+          return sizes[x].t;
+       } else if (sizes[x].k > size) {
+          return (x == 0) ? sizes[0].t : sizes[x - 1].t;
+       }
+   }
+   return sizes[x-1].t + 1;
+}
+
+
+#endif
diff --git a/libtommath/bn_mp_prime_random_ex.c b/libtommath/bn_mp_prime_random_ex.c
new file mode 100644
index 0000000..78c0583
--- /dev/null
+++ b/libtommath/bn_mp_prime_random_ex.c
@@ -0,0 +1,123 @@
+#include <tommath.h>
+#ifdef BN_MP_PRIME_RANDOM_EX_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* makes a truly random prime of a given size (bits),
+ *
+ * Flags are as follows:
+ * 
+ *   LTM_PRIME_BBS      - make prime congruent to 3 mod 4
+ *   LTM_PRIME_SAFE     - make sure (p-1)/2 is prime as well (implies LTM_PRIME_BBS)
+ *   LTM_PRIME_2MSB_OFF - make the 2nd highest bit zero
+ *   LTM_PRIME_2MSB_ON  - make the 2nd highest bit one
+ *
+ * You have to supply a callback which fills in a buffer with random bytes.  "dat" is a parameter you can
+ * have passed to the callback (e.g. a state or something).  This function doesn't use "dat" itself
+ * so it can be NULL
+ *
+ */
+
+/* This is possibly the mother of all prime generation functions, muahahahahaha! */
+int mp_prime_random_ex(mp_int *a, int t, int size, int flags, ltm_prime_callback cb, void *dat)
+{
+   unsigned char *tmp, maskAND, maskOR_msb, maskOR_lsb;
+   int res, err, bsize, maskOR_msb_offset;
+
+   /* sanity check the input */
+   if (size <= 1 || t <= 0) {
+      return MP_VAL;
+   }
+
+   /* LTM_PRIME_SAFE implies LTM_PRIME_BBS */
+   if (flags & LTM_PRIME_SAFE) {
+      flags |= LTM_PRIME_BBS;
+   }
+
+   /* calc the byte size */
+   bsize = (size>>3) + ((size&7)?1:0);
+
+   /* we need a buffer of bsize bytes */
+   tmp = OPT_CAST(unsigned char) XMALLOC(bsize);
+   if (tmp == NULL) {
+      return MP_MEM;
+   }
+
+   /* calc the maskAND value for the MSbyte*/
+   maskAND = ((size&7) == 0) ? 0xFF : (0xFF >> (8 - (size & 7)));
+
+   /* calc the maskOR_msb */
+   maskOR_msb        = 0;
+   maskOR_msb_offset = ((size & 7) == 1) ? 1 : 0;
+   if (flags & LTM_PRIME_2MSB_ON) {
+      maskOR_msb     |= 1 << ((size - 2) & 7);
+   } else if (flags & LTM_PRIME_2MSB_OFF) {
+      maskAND        &= ~(1 << ((size - 2) & 7));
+   } 
+
+   /* get the maskOR_lsb */
+   maskOR_lsb         = 1;
+   if (flags & LTM_PRIME_BBS) {
+      maskOR_lsb     |= 3;
+   }
+
+   do {
+      /* read the bytes */
+      if (cb(tmp, bsize, dat) != bsize) {
+         err = MP_VAL;
+         goto error;
+      }
+ 
+      /* work over the MSbyte */
+      tmp[0]    &= maskAND;
+      tmp[0]    |= 1 << ((size - 1) & 7);
+
+      /* mix in the maskORs */
+      tmp[maskOR_msb_offset]   |= maskOR_msb;
+      tmp[bsize-1]             |= maskOR_lsb;
+
+      /* read it in */
+      if ((err = mp_read_unsigned_bin(a, tmp, bsize)) != MP_OKAY)     { goto error; }
+
+      /* is it prime? */
+      if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY)           { goto error; }
+      if (res == MP_NO) {  
+         continue;
+      }
+
+      if (flags & LTM_PRIME_SAFE) {
+         /* see if (a-1)/2 is prime */
+         if ((err = mp_sub_d(a, 1, a)) != MP_OKAY)                    { goto error; }
+         if ((err = mp_div_2(a, a)) != MP_OKAY)                       { goto error; }
+ 
+         /* is it prime? */
+         if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY)        { goto error; }
+      }
+   } while (res == MP_NO);
+
+   if (flags & LTM_PRIME_SAFE) {
+      /* restore a to the original value */
+      if ((err = mp_mul_2(a, a)) != MP_OKAY)                          { goto error; }
+      if ((err = mp_add_d(a, 1, a)) != MP_OKAY)                       { goto error; }
+   }
+
+   err = MP_OKAY;
+error:
+   XFREE(tmp);
+   return err;
+}
+
+
+#endif
diff --git a/libtommath/bn_mp_radix_size.c b/libtommath/bn_mp_radix_size.c
new file mode 100644
index 0000000..3d423ba
--- /dev/null
+++ b/libtommath/bn_mp_radix_size.c
@@ -0,0 +1,74 @@
+#include <tommath.h>
+#ifdef BN_MP_RADIX_SIZE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* returns size of ASCII reprensentation */
+int mp_radix_size (mp_int * a, int radix, int *size)
+{
+  int     res, digs;
+  mp_int  t;
+  mp_digit d;
+
+  *size = 0;
+
+  /* special case for binary */
+  if (radix == 2) {
+    *size = mp_count_bits (a) + (a->sign == MP_NEG ? 1 : 0) + 1;
+    return MP_OKAY;
+  }
+
+  /* make sure the radix is in range */
+  if (radix < 2 || radix > 64) {
+    return MP_VAL;
+  }
+
+  if (mp_iszero(a) == MP_YES) {
+     *size = 2;
+    return MP_OKAY;
+  }
+
+  /* digs is the digit count */
+  digs = 0;
+
+  /* if it's negative add one for the sign */
+  if (a->sign == MP_NEG) {
+    ++digs;
+  }
+
+  /* init a copy of the input */
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* force temp to positive */
+  t.sign = MP_ZPOS; 
+
+  /* fetch out all of the digits */
+  while (mp_iszero (&t) == MP_NO) {
+    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
+      mp_clear (&t);
+      return res;
+    }
+    ++digs;
+  }
+  mp_clear (&t);
+
+  /* return digs + 1, the 1 is for the NULL byte that would be required. */
+  *size = digs + 1;
+  return MP_OKAY;
+}
+
+#endif
diff --git a/libtommath/bn_mp_radix_smap.c b/libtommath/bn_mp_radix_smap.c
new file mode 100644
index 0000000..bc7517d
--- /dev/null
+++ b/libtommath/bn_mp_radix_smap.c
@@ -0,0 +1,20 @@
+#include <tommath.h>
+#ifdef BN_MP_RADIX_SMAP_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* chars used in radix conversions */
+const char *mp_s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
+#endif
diff --git a/libtommath/bn_mp_rand.c b/libtommath/bn_mp_rand.c
new file mode 100644
index 0000000..0dc7019
--- /dev/null
+++ b/libtommath/bn_mp_rand.c
@@ -0,0 +1,51 @@
+#include <tommath.h>
+#ifdef BN_MP_RAND_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* makes a pseudo-random int of a given size */
+int
+mp_rand (mp_int * a, int digits)
+{
+  int     res;
+  mp_digit d;
+
+  mp_zero (a);
+  if (digits <= 0) {
+    return MP_OKAY;
+  }
+
+  /* first place a random non-zero digit */
+  do {
+    d = ((mp_digit) abs (rand ())) & MP_MASK;
+  } while (d == 0);
+
+  if ((res = mp_add_d (a, d, a)) != MP_OKAY) {
+    return res;
+  }
+
+  while (--digits > 0) {
+    if ((res = mp_lshd (a, 1)) != MP_OKAY) {
+      return res;
+    }
+
+    if ((res = mp_add_d (a, ((mp_digit) abs (rand ())), a)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_read_radix.c b/libtommath/bn_mp_read_radix.c
new file mode 100644
index 0000000..1ec3937
--- /dev/null
+++ b/libtommath/bn_mp_read_radix.c
@@ -0,0 +1,78 @@
+#include <tommath.h>
+#ifdef BN_MP_READ_RADIX_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* read a string [ASCII] in a given radix */
+int mp_read_radix (mp_int * a, const char *str, int radix)
+{
+  int     y, res, neg;
+  char    ch;
+
+  /* make sure the radix is ok */
+  if (radix < 2 || radix > 64) {
+    return MP_VAL;
+  }
+
+  /* if the leading digit is a 
+   * minus set the sign to negative. 
+   */
+  if (*str == '-') {
+    ++str;
+    neg = MP_NEG;
+  } else {
+    neg = MP_ZPOS;
+  }
+
+  /* set the integer to the default of zero */
+  mp_zero (a);
+  
+  /* process each digit of the string */
+  while (*str) {
+    /* if the radix < 36 the conversion is case insensitive
+     * this allows numbers like 1AB and 1ab to represent the same  value
+     * [e.g. in hex]
+     */
+    ch = (char) ((radix < 36) ? toupper (*str) : *str);
+    for (y = 0; y < 64; y++) {
+      if (ch == mp_s_rmap[y]) {
+         break;
+      }
+    }
+
+    /* if the char was found in the map 
+     * and is less than the given radix add it
+     * to the number, otherwise exit the loop. 
+     */
+    if (y < radix) {
+      if ((res = mp_mul_d (a, (mp_digit) radix, a)) != MP_OKAY) {
+         return res;
+      }
+      if ((res = mp_add_d (a, (mp_digit) y, a)) != MP_OKAY) {
+         return res;
+      }
+    } else {
+      break;
+    }
+    ++str;
+  }
+  
+  /* set the sign only if a != 0 */
+  if (mp_iszero(a) != 1) {
+     a->sign = neg;
+  }
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_read_signed_bin.c b/libtommath/bn_mp_read_signed_bin.c
new file mode 100644
index 0000000..814d6c1
--- /dev/null
+++ b/libtommath/bn_mp_read_signed_bin.c
@@ -0,0 +1,38 @@
+#include <tommath.h>
+#ifdef BN_MP_READ_SIGNED_BIN_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* read signed bin, big endian, first byte is 0==positive or 1==negative */
+int
+mp_read_signed_bin (mp_int * a, unsigned char *b, int c)
+{
+  int     res;
+
+  /* read magnitude */
+  if ((res = mp_read_unsigned_bin (a, b + 1, c - 1)) != MP_OKAY) {
+    return res;
+  }
+
+  /* first byte is 0 for positive, non-zero for negative */
+  if (b[0] == 0) {
+     a->sign = MP_ZPOS;
+  } else {
+     a->sign = MP_NEG;
+  }
+
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_read_unsigned_bin.c b/libtommath/bn_mp_read_unsigned_bin.c
new file mode 100644
index 0000000..946457d
--- /dev/null
+++ b/libtommath/bn_mp_read_unsigned_bin.c
@@ -0,0 +1,52 @@
+#include <tommath.h>
+#ifdef BN_MP_READ_UNSIGNED_BIN_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* reads a unsigned char array, assumes the msb is stored first [big endian] */
+int
+mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c)
+{
+  int     res;
+
+  /* make sure there are at least two digits */
+  if (a->alloc < 2) {
+     if ((res = mp_grow(a, 2)) != MP_OKAY) {
+        return res;
+     }
+  }
+
+  /* zero the int */
+  mp_zero (a);
+
+  /* read the bytes in */
+  while (c-- > 0) {
+    if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
+      return res;
+    }
+
+#ifndef MP_8BIT
+      a->dp[0] |= *b++;
+      a->used += 1;
+#else
+      a->dp[0] = (*b & MP_MASK);
+      a->dp[1] |= ((*b++ >> 7U) & 1);
+      a->used += 2;
+#endif
+  }
+  mp_clamp (a);
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_reduce.c b/libtommath/bn_mp_reduce.c
new file mode 100644
index 0000000..d746445
--- /dev/null
+++ b/libtommath/bn_mp_reduce.c
@@ -0,0 +1,96 @@
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* reduces x mod m, assumes 0 < x < m**2, mu is 
+ * precomputed via mp_reduce_setup.
+ * From HAC pp.604 Algorithm 14.42
+ */
+int mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
+{
+  mp_int  q;
+  int     res, um = m->used;
+
+  /* q = x */
+  if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
+    return res;
+  }
+
+  /* q1 = x / b**(k-1)  */
+  mp_rshd (&q, um - 1);         
+
+  /* according to HAC this optimization is ok */
+  if (((unsigned long) um) > (((mp_digit)1) << (DIGIT_BIT - 1))) {
+    if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) {
+      goto CLEANUP;
+    }
+  } else {
+#ifdef BN_S_MP_MUL_HIGH_DIGS_C
+    if ((res = s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
+      goto CLEANUP;
+    }
+#elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
+    if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
+      goto CLEANUP;
+    }
+#else 
+    { 
+      res = MP_VAL;
+      goto CLEANUP;
+    }
+#endif
+  }
+
+  /* q3 = q2 / b**(k+1) */
+  mp_rshd (&q, um + 1);         
+
+  /* x = x mod b**(k+1), quick (no division) */
+  if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
+    goto CLEANUP;
+  }
+
+  /* q = q * m mod b**(k+1), quick (no division) */
+  if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) {
+    goto CLEANUP;
+  }
+
+  /* x = x - q */
+  if ((res = mp_sub (x, &q, x)) != MP_OKAY) {
+    goto CLEANUP;
+  }
+
+  /* If x < 0, add b**(k+1) to it */
+  if (mp_cmp_d (x, 0) == MP_LT) {
+    mp_set (&q, 1);
+    if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
+      goto CLEANUP;
+    if ((res = mp_add (x, &q, x)) != MP_OKAY)
+      goto CLEANUP;
+  }
+
+  /* Back off if it's too big */
+  while (mp_cmp (x, m) != MP_LT) {
+    if ((res = s_mp_sub (x, m, x)) != MP_OKAY) {
+      goto CLEANUP;
+    }
+  }
+  
+CLEANUP:
+  mp_clear (&q);
+
+  return res;
+}
+#endif
diff --git a/libtommath/bn_mp_reduce_2k.c b/libtommath/bn_mp_reduce_2k.c
new file mode 100644
index 0000000..28c3a00
--- /dev/null
+++ b/libtommath/bn_mp_reduce_2k.c
@@ -0,0 +1,57 @@
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_2K_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* reduces a modulo n where n is of the form 2**p - d */
+int mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
+{
+   mp_int q;
+   int    p, res;
+   
+   if ((res = mp_init(&q)) != MP_OKAY) {
+      return res;
+   }
+   
+   p = mp_count_bits(n);    
+top:
+   /* q = a/2**p, a = a mod 2**p */
+   if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
+      goto ERR;
+   }
+   
+   if (d != 1) {
+      /* q = q * d */
+      if ((res = mp_mul_d(&q, d, &q)) != MP_OKAY) { 
+         goto ERR;
+      }
+   }
+   
+   /* a = a + q */
+   if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
+      goto ERR;
+   }
+   
+   if (mp_cmp_mag(a, n) != MP_LT) {
+      s_mp_sub(a, n, a);
+      goto top;
+   }
+   
+ERR:
+   mp_clear(&q);
+   return res;
+}
+
+#endif
diff --git a/libtommath/bn_mp_reduce_2k_l.c b/libtommath/bn_mp_reduce_2k_l.c
new file mode 100644
index 0000000..1d7e1f0
--- /dev/null
+++ b/libtommath/bn_mp_reduce_2k_l.c
@@ -0,0 +1,58 @@
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_2K_L_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* reduces a modulo n where n is of the form 2**p - d 
+   This differs from reduce_2k since "d" can be larger
+   than a single digit.
+*/
+int mp_reduce_2k_l(mp_int *a, mp_int *n, mp_int *d)
+{
+   mp_int q;
+   int    p, res;
+   
+   if ((res = mp_init(&q)) != MP_OKAY) {
+      return res;
+   }
+   
+   p = mp_count_bits(n);    
+top:
+   /* q = a/2**p, a = a mod 2**p */
+   if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
+      goto ERR;
+   }
+   
+   /* q = q * d */
+   if ((res = mp_mul(&q, d, &q)) != MP_OKAY) { 
+      goto ERR;
+   }
+   
+   /* a = a + q */
+   if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
+      goto ERR;
+   }
+   
+   if (mp_cmp_mag(a, n) != MP_LT) {
+      s_mp_sub(a, n, a);
+      goto top;
+   }
+   
+ERR:
+   mp_clear(&q);
+   return res;
+}
+
+#endif
diff --git a/libtommath/bn_mp_reduce_2k_setup.c b/libtommath/bn_mp_reduce_2k_setup.c
new file mode 100644
index 0000000..585e1b7
--- /dev/null
+++ b/libtommath/bn_mp_reduce_2k_setup.c
@@ -0,0 +1,43 @@
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_2K_SETUP_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* determines the setup value */
+int mp_reduce_2k_setup(mp_int *a, mp_digit *d)
+{
+   int res, p;
+   mp_int tmp;
+   
+   if ((res = mp_init(&tmp)) != MP_OKAY) {
+      return res;
+   }
+   
+   p = mp_count_bits(a);
+   if ((res = mp_2expt(&tmp, p)) != MP_OKAY) {
+      mp_clear(&tmp);
+      return res;
+   }
+   
+   if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) {
+      mp_clear(&tmp);
+      return res;
+   }
+   
+   *d = tmp.dp[0];
+   mp_clear(&tmp);
+   return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_reduce_2k_setup_l.c b/libtommath/bn_mp_reduce_2k_setup_l.c
new file mode 100644
index 0000000..810a456
--- /dev/null
+++ b/libtommath/bn_mp_reduce_2k_setup_l.c
@@ -0,0 +1,40 @@
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_2K_SETUP_L_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* determines the setup value */
+int mp_reduce_2k_setup_l(mp_int *a, mp_int *d)
+{
+   int    res;
+   mp_int tmp;
+   
+   if ((res = mp_init(&tmp)) != MP_OKAY) {
+      return res;
+   }
+   
+   if ((res = mp_2expt(&tmp, mp_count_bits(a))) != MP_OKAY) {
+      goto ERR;
+   }
+   
+   if ((res = s_mp_sub(&tmp, a, d)) != MP_OKAY) {
+      goto ERR;
+   }
+   
+ERR:
+   mp_clear(&tmp);
+   return res;
+}
+#endif
diff --git a/libtommath/bn_mp_reduce_is_2k.c b/libtommath/bn_mp_reduce_is_2k.c
new file mode 100644
index 0000000..0fb8384
--- /dev/null
+++ b/libtommath/bn_mp_reduce_is_2k.c
@@ -0,0 +1,48 @@
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_IS_2K_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* determines if mp_reduce_2k can be used */
+int mp_reduce_is_2k(mp_int *a)
+{
+   int ix, iy, iw;
+   mp_digit iz;
+   
+   if (a->used == 0) {
+      return MP_NO;
+   } else if (a->used == 1) {
+      return MP_YES;
+   } else if (a->used > 1) {
+      iy = mp_count_bits(a);
+      iz = 1;
+      iw = 1;
+    
+      /* Test every bit from the second digit up, must be 1 */
+      for (ix = DIGIT_BIT; ix < iy; ix++) {
+          if ((a->dp[iw] & iz) == 0) {
+             return MP_NO;
+          }
+          iz <<= 1;
+          if (iz > (mp_digit)MP_MASK) {
+             ++iw;
+             iz = 1;
+          }
+      }
+   }
+   return MP_YES;
+}
+
+#endif
diff --git a/libtommath/bn_mp_reduce_is_2k_l.c b/libtommath/bn_mp_reduce_is_2k_l.c
new file mode 100644
index 0000000..ceba0ed
--- /dev/null
+++ b/libtommath/bn_mp_reduce_is_2k_l.c
@@ -0,0 +1,40 @@
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_IS_2K_L_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* determines if reduce_2k_l can be used */
+int mp_reduce_is_2k_l(mp_int *a)
+{
+   int ix, iy;
+   
+   if (a->used == 0) {
+      return MP_NO;
+   } else if (a->used == 1) {
+      return MP_YES;
+   } else if (a->used > 1) {
+      /* if more than half of the digits are -1 we're sold */
+      for (iy = ix = 0; ix < a->used; ix++) {
+          if (a->dp[ix] == MP_MASK) {
+              ++iy;
+          }
+      }
+      return (iy >= (a->used/2)) ? MP_YES : MP_NO;
+      
+   }
+   return MP_NO;
+}
+
+#endif
diff --git a/libtommath/bn_mp_reduce_setup.c b/libtommath/bn_mp_reduce_setup.c
new file mode 100644
index 0000000..99f158a
--- /dev/null
+++ b/libtommath/bn_mp_reduce_setup.c
@@ -0,0 +1,30 @@
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_SETUP_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* pre-calculate the value required for Barrett reduction
+ * For a given modulus "b" it calulates the value required in "a"
+ */
+int mp_reduce_setup (mp_int * a, mp_int * b)
+{
+  int     res;
+  
+  if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
+    return res;
+  }
+  return mp_div (a, b, a, NULL);
+}
+#endif
diff --git a/libtommath/bn_mp_rshd.c b/libtommath/bn_mp_rshd.c
new file mode 100644
index 0000000..913dda6
--- /dev/null
+++ b/libtommath/bn_mp_rshd.c
@@ -0,0 +1,68 @@
+#include <tommath.h>
+#ifdef BN_MP_RSHD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* shift right a certain amount of digits */
+void mp_rshd (mp_int * a, int b)
+{
+  int     x;
+
+  /* if b <= 0 then ignore it */
+  if (b <= 0) {
+    return;
+  }
+
+  /* if b > used then simply zero it and return */
+  if (a->used <= b) {
+    mp_zero (a);
+    return;
+  }
+
+  {
+    register mp_digit *bottom, *top;
+
+    /* shift the digits down */
+
+    /* bottom */
+    bottom = a->dp;
+
+    /* top [offset into digits] */
+    top = a->dp + b;
+
+    /* this is implemented as a sliding window where 
+     * the window is b-digits long and digits from 
+     * the top of the window are copied to the bottom
+     *
+     * e.g.
+
+     b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
+                 /\                   |      ---->
+                  \-------------------/      ---->
+     */
+    for (x = 0; x < (a->used - b); x++) {
+      *bottom++ = *top++;
+    }
+
+    /* zero the top digits */
+    for (; x < a->used; x++) {
+      *bottom++ = 0;
+    }
+  }
+  
+  /* remove excess digits */
+  a->used -= b;
+}
+#endif
diff --git a/libtommath/bn_mp_set.c b/libtommath/bn_mp_set.c
new file mode 100644
index 0000000..078fd5f
--- /dev/null
+++ b/libtommath/bn_mp_set.c
@@ -0,0 +1,25 @@
+#include <tommath.h>
+#ifdef BN_MP_SET_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* set to a digit */
+void mp_set (mp_int * a, mp_digit b)
+{
+  mp_zero (a);
+  a->dp[0] = b & MP_MASK;
+  a->used  = (a->dp[0] != 0) ? 1 : 0;
+}
+#endif
diff --git a/libtommath/bn_mp_set_int.c b/libtommath/bn_mp_set_int.c
new file mode 100644
index 0000000..bd47136
--- /dev/null
+++ b/libtommath/bn_mp_set_int.c
@@ -0,0 +1,44 @@
+#include <tommath.h>
+#ifdef BN_MP_SET_INT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* set a 32-bit const */
+int mp_set_int (mp_int * a, unsigned long b)
+{
+  int     x, res;
+
+  mp_zero (a);
+  
+  /* set four bits at a time */
+  for (x = 0; x < 8; x++) {
+    /* shift the number up four bits */
+    if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) {
+      return res;
+    }
+
+    /* OR in the top four bits of the source */
+    a->dp[0] |= (b >> 28) & 15;
+
+    /* shift the source up to the next four bits */
+    b <<= 4;
+
+    /* ensure that digits are not clamped off */
+    a->used += 1;
+  }
+  mp_clamp (a);
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_shrink.c b/libtommath/bn_mp_shrink.c
new file mode 100644
index 0000000..b31f9d2
--- /dev/null
+++ b/libtommath/bn_mp_shrink.c
@@ -0,0 +1,31 @@
+#include <tommath.h>
+#ifdef BN_MP_SHRINK_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* shrink a bignum */
+int mp_shrink (mp_int * a)
+{
+  mp_digit *tmp;
+  if (a->alloc != a->used && a->used > 0) {
+    if ((tmp = OPT_CAST(mp_digit) XREALLOC (a->dp, sizeof (mp_digit) * a->used)) == NULL) {
+      return MP_MEM;
+    }
+    a->dp    = tmp;
+    a->alloc = a->used;
+  }
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_signed_bin_size.c b/libtommath/bn_mp_signed_bin_size.c
new file mode 100644
index 0000000..30048cb
--- /dev/null
+++ b/libtommath/bn_mp_signed_bin_size.c
@@ -0,0 +1,23 @@
+#include <tommath.h>
+#ifdef BN_MP_SIGNED_BIN_SIZE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* get the size for an signed equivalent */
+int mp_signed_bin_size (mp_int * a)
+{
+  return 1 + mp_unsigned_bin_size (a);
+}
+#endif
diff --git a/libtommath/bn_mp_sqr.c b/libtommath/bn_mp_sqr.c
new file mode 100644
index 0000000..b1fdb57
--- /dev/null
+++ b/libtommath/bn_mp_sqr.c
@@ -0,0 +1,54 @@
+#include <tommath.h>
+#ifdef BN_MP_SQR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* computes b = a*a */
+int
+mp_sqr (mp_int * a, mp_int * b)
+{
+  int     res;
+
+#ifdef BN_MP_TOOM_SQR_C
+  /* use Toom-Cook? */
+  if (a->used >= TOOM_SQR_CUTOFF) {
+    res = mp_toom_sqr(a, b);
+  /* Karatsuba? */
+  } else 
+#endif
+#ifdef BN_MP_KARATSUBA_SQR_C
+if (a->used >= KARATSUBA_SQR_CUTOFF) {
+    res = mp_karatsuba_sqr (a, b);
+  } else 
+#endif
+  {
+#ifdef BN_FAST_S_MP_SQR_C
+    /* can we use the fast comba multiplier? */
+    if ((a->used * 2 + 1) < MP_WARRAY && 
+         a->used < 
+         (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
+      res = fast_s_mp_sqr (a, b);
+    } else
+#endif
+#ifdef BN_S_MP_SQR_C
+      res = s_mp_sqr (a, b);
+#else
+      res = MP_VAL;
+#endif
+  }
+  b->sign = MP_ZPOS;
+  return res;
+}
+#endif
diff --git a/libtommath/bn_mp_sqrmod.c b/libtommath/bn_mp_sqrmod.c
new file mode 100644
index 0000000..1923be4
--- /dev/null
+++ b/libtommath/bn_mp_sqrmod.c
@@ -0,0 +1,37 @@
+#include <tommath.h>
+#ifdef BN_MP_SQRMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* c = a * a (mod b) */
+int
+mp_sqrmod (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     res;
+  mp_int  t;
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_sqr (a, &t)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+  res = mp_mod (&t, b, c);
+  mp_clear (&t);
+  return res;
+}
+#endif
diff --git a/libtommath/bn_mp_sqrt.c b/libtommath/bn_mp_sqrt.c
new file mode 100644
index 0000000..76cec87
--- /dev/null
+++ b/libtommath/bn_mp_sqrt.c
@@ -0,0 +1,77 @@
+#include <tommath.h>
+#ifdef BN_MP_SQRT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* this function is less generic than mp_n_root, simpler and faster */
+int mp_sqrt(mp_int *arg, mp_int *ret) 
+{
+  int res;
+  mp_int t1,t2;
+
+  /* must be positive */
+  if (arg->sign == MP_NEG) {
+    return MP_VAL;
+  }
+
+  /* easy out */
+  if (mp_iszero(arg) == MP_YES) {
+    mp_zero(ret);
+    return MP_OKAY;
+  }
+
+  if ((res = mp_init_copy(&t1, arg)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init(&t2)) != MP_OKAY) {
+    goto E2;
+  }
+
+  /* First approx. (not very bad for large arg) */
+  mp_rshd (&t1,t1.used/2);
+
+  /* t1 > 0  */ 
+  if ((res = mp_div(arg,&t1,&t2,NULL)) != MP_OKAY) {
+    goto E1;
+  }
+  if ((res = mp_add(&t1,&t2,&t1)) != MP_OKAY) {
+    goto E1;
+  }
+  if ((res = mp_div_2(&t1,&t1)) != MP_OKAY) {
+    goto E1;
+  }
+  /* And now t1 > sqrt(arg) */
+  do { 
+    if ((res = mp_div(arg,&t1,&t2,NULL)) != MP_OKAY) {
+      goto E1;
+    }
+    if ((res = mp_add(&t1,&t2,&t1)) != MP_OKAY) {
+      goto E1;
+    }
+    if ((res = mp_div_2(&t1,&t1)) != MP_OKAY) {
+      goto E1;
+    }
+    /* t1 >= sqrt(arg) >= t2 at this point */
+  } while (mp_cmp_mag(&t1,&t2) == MP_GT);
+
+  mp_exch(&t1,ret);
+
+E1: mp_clear(&t2);
+E2: mp_clear(&t1);
+  return res;
+}
+
+#endif
diff --git a/libtommath/bn_mp_sub.c b/libtommath/bn_mp_sub.c
new file mode 100644
index 0000000..97495f4
--- /dev/null
+++ b/libtommath/bn_mp_sub.c
@@ -0,0 +1,55 @@
+#include <tommath.h>
+#ifdef BN_MP_SUB_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* high level subtraction (handles signs) */
+int
+mp_sub (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     sa, sb, res;
+
+  sa = a->sign;
+  sb = b->sign;
+
+  if (sa != sb) {
+    /* subtract a negative from a positive, OR */
+    /* subtract a positive from a negative. */
+    /* In either case, ADD their magnitudes, */
+    /* and use the sign of the first number. */
+    c->sign = sa;
+    res = s_mp_add (a, b, c);
+  } else {
+    /* subtract a positive from a positive, OR */
+    /* subtract a negative from a negative. */
+    /* First, take the difference between their */
+    /* magnitudes, then... */
+    if (mp_cmp_mag (a, b) != MP_LT) {
+      /* Copy the sign from the first */
+      c->sign = sa;
+      /* The first has a larger or equal magnitude */
+      res = s_mp_sub (a, b, c);
+    } else {
+      /* The result has the *opposite* sign from */
+      /* the first number. */
+      c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+      /* The second has a larger magnitude */
+      res = s_mp_sub (b, a, c);
+    }
+  }
+  return res;
+}
+
+#endif
diff --git a/libtommath/bn_mp_sub_d.c b/libtommath/bn_mp_sub_d.c
new file mode 100644
index 0000000..4923dde
--- /dev/null
+++ b/libtommath/bn_mp_sub_d.c
@@ -0,0 +1,85 @@
+#include <tommath.h>
+#ifdef BN_MP_SUB_D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* single digit subtraction */
+int
+mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
+{
+  mp_digit *tmpa, *tmpc, mu;
+  int       res, ix, oldused;
+
+  /* grow c as required */
+  if (c->alloc < a->used + 1) {
+     if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
+        return res;
+     }
+  }
+
+  /* if a is negative just do an unsigned
+   * addition [with fudged signs]
+   */
+  if (a->sign == MP_NEG) {
+     a->sign = MP_ZPOS;
+     res     = mp_add_d(a, b, c);
+     a->sign = c->sign = MP_NEG;
+     return res;
+  }
+
+  /* setup regs */
+  oldused = c->used;
+  tmpa    = a->dp;
+  tmpc    = c->dp;
+
+  /* if a <= b simply fix the single digit */
+  if ((a->used == 1 && a->dp[0] <= b) || a->used == 0) {
+     if (a->used == 1) {
+        *tmpc++ = b - *tmpa;
+     } else {
+        *tmpc++ = b;
+     }
+     ix      = 1;
+
+     /* negative/1digit */
+     c->sign = MP_NEG;
+     c->used = 1;
+  } else {
+     /* positive/size */
+     c->sign = MP_ZPOS;
+     c->used = a->used;
+
+     /* subtract first digit */
+     *tmpc    = *tmpa++ - b;
+     mu       = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
+     *tmpc++ &= MP_MASK;
+
+     /* handle rest of the digits */
+     for (ix = 1; ix < a->used; ix++) {
+        *tmpc    = *tmpa++ - mu;
+        mu       = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
+        *tmpc++ &= MP_MASK;
+     }
+  }
+
+  /* zero excess digits */
+  while (ix++ < oldused) {
+     *tmpc++ = 0;
+  }
+  mp_clamp(c);
+  return MP_OKAY;
+}
+
+#endif
diff --git a/libtommath/bn_mp_submod.c b/libtommath/bn_mp_submod.c
new file mode 100644
index 0000000..b999c85
--- /dev/null
+++ b/libtommath/bn_mp_submod.c
@@ -0,0 +1,38 @@
+#include <tommath.h>
+#ifdef BN_MP_SUBMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* d = a - b (mod c) */
+int
+mp_submod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+  int     res;
+  mp_int  t;
+
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_sub (a, b, &t)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+  res = mp_mod (&t, c, d);
+  mp_clear (&t);
+  return res;
+}
+#endif
diff --git a/libtommath/bn_mp_to_signed_bin.c b/libtommath/bn_mp_to_signed_bin.c
new file mode 100644
index 0000000..b0a597e
--- /dev/null
+++ b/libtommath/bn_mp_to_signed_bin.c
@@ -0,0 +1,29 @@
+#include <tommath.h>
+#ifdef BN_MP_TO_SIGNED_BIN_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* store in signed [big endian] format */
+int mp_to_signed_bin (mp_int * a, unsigned char *b)
+{
+  int     res;
+
+  if ((res = mp_to_unsigned_bin (a, b + 1)) != MP_OKAY) {
+    return res;
+  }
+  b[0] = (unsigned char) ((a->sign == MP_ZPOS) ? 0 : 1);
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_to_signed_bin_n.c b/libtommath/bn_mp_to_signed_bin_n.c
new file mode 100644
index 0000000..0f765ee
--- /dev/null
+++ b/libtommath/bn_mp_to_signed_bin_n.c
@@ -0,0 +1,27 @@
+#include <tommath.h>
+#ifdef BN_MP_TO_SIGNED_BIN_N_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* store in signed [big endian] format */
+int mp_to_signed_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen)
+{
+   if (*outlen < (unsigned long)mp_signed_bin_size(a)) {
+      return MP_VAL;
+   }
+   *outlen = mp_signed_bin_size(a);
+   return mp_to_signed_bin(a, b);
+}
+#endif
diff --git a/libtommath/bn_mp_to_unsigned_bin.c b/libtommath/bn_mp_to_unsigned_bin.c
new file mode 100644
index 0000000..000967e
--- /dev/null
+++ b/libtommath/bn_mp_to_unsigned_bin.c
@@ -0,0 +1,44 @@
+#include <tommath.h>
+#ifdef BN_MP_TO_UNSIGNED_BIN_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* store in unsigned [big endian] format */
+int mp_to_unsigned_bin (mp_int * a, unsigned char *b)
+{
+  int     x, res;
+  mp_int  t;
+
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  x = 0;
+  while (mp_iszero (&t) == 0) {
+#ifndef MP_8BIT
+      b[x++] = (unsigned char) (t.dp[0] & 255);
+#else
+      b[x++] = (unsigned char) (t.dp[0] | ((t.dp[1] & 0x01) << 7));
+#endif
+    if ((res = mp_div_2d (&t, 8, &t, NULL)) != MP_OKAY) {
+      mp_clear (&t);
+      return res;
+    }
+  }
+  bn_reverse (b, x);
+  mp_clear (&t);
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_to_unsigned_bin_n.c b/libtommath/bn_mp_to_unsigned_bin_n.c
new file mode 100644
index 0000000..d0256b4
--- /dev/null
+++ b/libtommath/bn_mp_to_unsigned_bin_n.c
@@ -0,0 +1,27 @@
+#include <tommath.h>
+#ifdef BN_MP_TO_UNSIGNED_BIN_N_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* store in unsigned [big endian] format */
+int mp_to_unsigned_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen)
+{
+   if (*outlen < (unsigned long)mp_unsigned_bin_size(a)) {
+      return MP_VAL;
+   }
+   *outlen = mp_unsigned_bin_size(a);
+   return mp_to_unsigned_bin(a, b);
+}
+#endif
diff --git a/libtommath/bn_mp_toom_mul.c b/libtommath/bn_mp_toom_mul.c
new file mode 100644
index 0000000..125331b
--- /dev/null
+++ b/libtommath/bn_mp_toom_mul.c
@@ -0,0 +1,280 @@
+#include <tommath.h>
+#ifdef BN_MP_TOOM_MUL_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* multiplication using the Toom-Cook 3-way algorithm 
+ *
+ * Much more complicated than Karatsuba but has a lower 
+ * asymptotic running time of O(N**1.464).  This algorithm is 
+ * only particularly useful on VERY large inputs 
+ * (we're talking 1000s of digits here...).
+*/
+int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
+{
+    mp_int w0, w1, w2, w3, w4, tmp1, tmp2, a0, a1, a2, b0, b1, b2;
+    int res, B;
+        
+    /* init temps */
+    if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, 
+                             &a0, &a1, &a2, &b0, &b1, 
+                             &b2, &tmp1, &tmp2, NULL)) != MP_OKAY) {
+       return res;
+    }
+    
+    /* B */
+    B = MIN(a->used, b->used) / 3;
+    
+    /* a = a2 * B**2 + a1 * B + a0 */
+    if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    if ((res = mp_copy(a, &a1)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&a1, B);
+    mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
+
+    if ((res = mp_copy(a, &a2)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&a2, B*2);
+    
+    /* b = b2 * B**2 + b1 * B + b0 */
+    if ((res = mp_mod_2d(b, DIGIT_BIT * B, &b0)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    if ((res = mp_copy(b, &b1)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&b1, B);
+    mp_mod_2d(&b1, DIGIT_BIT * B, &b1);
+
+    if ((res = mp_copy(b, &b2)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&b2, B*2);
+    
+    /* w0 = a0*b0 */
+    if ((res = mp_mul(&a0, &b0, &w0)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    /* w4 = a2 * b2 */
+    if ((res = mp_mul(&a2, &b2, &w4)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    /* w1 = (a2 + 2(a1 + 2a0))(b2 + 2(b1 + 2b0)) */
+    if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    if ((res = mp_mul_2(&b0, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp2, &b2, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    if ((res = mp_mul(&tmp1, &tmp2, &w1)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    /* w3 = (a0 + 2(a1 + 2a2))(b0 + 2(b1 + 2b2)) */
+    if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    if ((res = mp_mul_2(&b2, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    if ((res = mp_mul(&tmp1, &tmp2, &w3)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+
+    /* w2 = (a2 + a1 + a0)(b2 + b1 + b0) */
+    if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&b2, &b1, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul(&tmp1, &tmp2, &w2)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    /* now solve the matrix 
+    
+       0  0  0  0  1
+       1  2  4  8  16
+       1  1  1  1  1
+       16 8  4  2  1
+       1  0  0  0  0
+       
+       using 12 subtractions, 4 shifts, 
+              2 small divisions and 1 small multiplication 
+     */
+     
+     /* r1 - r4 */
+     if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r0 */
+     if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1/2 */
+     if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3/2 */
+     if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r2 - r0 - r4 */
+     if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - r2 */
+     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r2 */
+     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - 8r0 */
+     if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - 8r4 */
+     if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* 3r2 - r1 - r3 */
+     if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - r2 */
+     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r2 */
+     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1/3 */
+     if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3/3 */
+     if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) {
+        goto ERR;
+     }
+     
+     /* at this point shift W[n] by B*n */
+     if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) {
+        goto ERR;
+     }     
+     
+     if ((res = mp_add(&w0, &w1, c)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&tmp1, c, c)) != MP_OKAY) {
+        goto ERR;
+     }     
+     
+ERR:
+     mp_clear_multi(&w0, &w1, &w2, &w3, &w4, 
+                    &a0, &a1, &a2, &b0, &b1, 
+                    &b2, &tmp1, &tmp2, NULL);
+     return res;
+}     
+     
+#endif
diff --git a/libtommath/bn_mp_toom_sqr.c b/libtommath/bn_mp_toom_sqr.c
new file mode 100644
index 0000000..8c46fea
--- /dev/null
+++ b/libtommath/bn_mp_toom_sqr.c
@@ -0,0 +1,222 @@
+#include <tommath.h>
+#ifdef BN_MP_TOOM_SQR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* squaring using Toom-Cook 3-way algorithm */
+int
+mp_toom_sqr(mp_int *a, mp_int *b)
+{
+    mp_int w0, w1, w2, w3, w4, tmp1, a0, a1, a2;
+    int res, B;
+
+    /* init temps */
+    if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &tmp1, NULL)) != MP_OKAY) {
+       return res;
+    }
+
+    /* B */
+    B = a->used / 3;
+
+    /* a = a2 * B**2 + a1 * B + a0 */
+    if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    if ((res = mp_copy(a, &a1)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&a1, B);
+    mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
+
+    if ((res = mp_copy(a, &a2)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&a2, B*2);
+
+    /* w0 = a0*a0 */
+    if ((res = mp_sqr(&a0, &w0)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    /* w4 = a2 * a2 */
+    if ((res = mp_sqr(&a2, &w4)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    /* w1 = (a2 + 2(a1 + 2a0))**2 */
+    if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    if ((res = mp_sqr(&tmp1, &w1)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    /* w3 = (a0 + 2(a1 + 2a2))**2 */
+    if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    if ((res = mp_sqr(&tmp1, &w3)) != MP_OKAY) {
+       goto ERR;
+    }
+
+
+    /* w2 = (a2 + a1 + a0)**2 */
+    if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_sqr(&tmp1, &w2)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    /* now solve the matrix
+
+       0  0  0  0  1
+       1  2  4  8  16
+       1  1  1  1  1
+       16 8  4  2  1
+       1  0  0  0  0
+
+       using 12 subtractions, 4 shifts, 2 small divisions and 1 small multiplication.
+     */
+
+     /* r1 - r4 */
+     if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r0 */
+     if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1/2 */
+     if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3/2 */
+     if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r2 - r0 - r4 */
+     if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - r2 */
+     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r2 */
+     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - 8r0 */
+     if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - 8r4 */
+     if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* 3r2 - r1 - r3 */
+     if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - r2 */
+     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r2 */
+     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1/3 */
+     if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3/3 */
+     if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) {
+        goto ERR;
+     }
+
+     /* at this point shift W[n] by B*n */
+     if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) {
+        goto ERR;
+     }
+
+     if ((res = mp_add(&w0, &w1, b)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&tmp1, b, b)) != MP_OKAY) {
+        goto ERR;
+     }
+
+ERR:
+     mp_clear_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &tmp1, NULL);
+     return res;
+}
+
+#endif
diff --git a/libtommath/bn_mp_toradix.c b/libtommath/bn_mp_toradix.c
new file mode 100644
index 0000000..a206d5e
--- /dev/null
+++ b/libtommath/bn_mp_toradix.c
@@ -0,0 +1,71 @@
+#include <tommath.h>
+#ifdef BN_MP_TORADIX_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* stores a bignum as a ASCII string in a given radix (2..64) */
+int mp_toradix (mp_int * a, char *str, int radix)
+{
+  int     res, digs;
+  mp_int  t;
+  mp_digit d;
+  char   *_s = str;
+
+  /* check range of the radix */
+  if (radix < 2 || radix > 64) {
+    return MP_VAL;
+  }
+
+  /* quick out if its zero */
+  if (mp_iszero(a) == 1) {
+     *str++ = '0';
+     *str = '\0';
+     return MP_OKAY;
+  }
+
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* if it is negative output a - */
+  if (t.sign == MP_NEG) {
+    ++_s;
+    *str++ = '-';
+    t.sign = MP_ZPOS;
+  }
+
+  digs = 0;
+  while (mp_iszero (&t) == 0) {
+    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
+      mp_clear (&t);
+      return res;
+    }
+    *str++ = mp_s_rmap[d];
+    ++digs;
+  }
+
+  /* reverse the digits of the string.  In this case _s points
+   * to the first digit [exluding the sign] of the number]
+   */
+  bn_reverse ((unsigned char *)_s, digs);
+
+  /* append a NULL so the string is properly terminated */
+  *str = '\0';
+
+  mp_clear (&t);
+  return MP_OKAY;
+}
+
+#endif
diff --git a/libtommath/bn_mp_toradix_n.c b/libtommath/bn_mp_toradix_n.c
new file mode 100644
index 0000000..7d43558
--- /dev/null
+++ b/libtommath/bn_mp_toradix_n.c
@@ -0,0 +1,85 @@
+#include <tommath.h>
+#ifdef BN_MP_TORADIX_N_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* stores a bignum as a ASCII string in a given radix (2..64) 
+ *
+ * Stores upto maxlen-1 chars and always a NULL byte 
+ */
+int mp_toradix_n(mp_int * a, char *str, int radix, int maxlen)
+{
+  int     res, digs;
+  mp_int  t;
+  mp_digit d;
+  char   *_s = str;
+
+  /* check range of the maxlen, radix */
+  if (maxlen < 3 || radix < 2 || radix > 64) {
+    return MP_VAL;
+  }
+
+  /* quick out if its zero */
+  if (mp_iszero(a) == 1) {
+     *str++ = '0';
+     *str = '\0';
+     return MP_OKAY;
+  }
+
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* if it is negative output a - */
+  if (t.sign == MP_NEG) {
+    /* we have to reverse our digits later... but not the - sign!! */
+    ++_s;
+
+    /* store the flag and mark the number as positive */
+    *str++ = '-';
+    t.sign = MP_ZPOS;
+ 
+    /* subtract a char */
+    --maxlen;
+  }
+
+  digs = 0;
+  while (mp_iszero (&t) == 0) {
+    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
+      mp_clear (&t);
+      return res;
+    }
+    *str++ = mp_s_rmap[d];
+    ++digs;
+
+    if (--maxlen == 1) {
+       /* no more room */
+       break;
+    }
+  }
+
+  /* reverse the digits of the string.  In this case _s points
+   * to the first digit [exluding the sign] of the number]
+   */
+  bn_reverse ((unsigned char *)_s, digs);
+
+  /* append a NULL so the string is properly terminated */
+  *str = '\0';
+
+  mp_clear (&t);
+  return MP_OKAY;
+}
+
+#endif
diff --git a/libtommath/bn_mp_unsigned_bin_size.c b/libtommath/bn_mp_unsigned_bin_size.c
new file mode 100644
index 0000000..091f406
--- /dev/null
+++ b/libtommath/bn_mp_unsigned_bin_size.c
@@ -0,0 +1,24 @@
+#include <tommath.h>
+#ifdef BN_MP_UNSIGNED_BIN_SIZE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* get the size for an unsigned equivalent */
+int mp_unsigned_bin_size (mp_int * a)
+{
+  int     size = mp_count_bits (a);
+  return (size / 8 + ((size & 7) != 0 ? 1 : 0));
+}
+#endif
diff --git a/libtommath/bn_mp_xor.c b/libtommath/bn_mp_xor.c
new file mode 100644
index 0000000..de7e62c
--- /dev/null
+++ b/libtommath/bn_mp_xor.c
@@ -0,0 +1,47 @@
+#include <tommath.h>
+#ifdef BN_MP_XOR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* XOR two ints together */
+int
+mp_xor (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     res, ix, px;
+  mp_int  t, *x;
+
+  if (a->used > b->used) {
+    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+      return res;
+    }
+    px = b->used;
+    x = b;
+  } else {
+    if ((res = mp_init_copy (&t, b)) != MP_OKAY) {
+      return res;
+    }
+    px = a->used;
+    x = a;
+  }
+
+  for (ix = 0; ix < px; ix++) {
+     t.dp[ix] ^= x->dp[ix];
+  }
+  mp_clamp (&t);
+  mp_exch (c, &t);
+  mp_clear (&t);
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_mp_zero.c b/libtommath/bn_mp_zero.c
new file mode 100644
index 0000000..c8d8907
--- /dev/null
+++ b/libtommath/bn_mp_zero.c
@@ -0,0 +1,32 @@
+#include <tommath.h>
+#ifdef BN_MP_ZERO_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* set to zero */
+void mp_zero (mp_int * a)
+{
+  int       n;
+  mp_digit *tmp;
+
+  a->sign = MP_ZPOS;
+  a->used = 0;
+
+  tmp = a->dp;
+  for (n = 0; n < a->alloc; n++) {
+     *tmp++ = 0;
+  }
+}
+#endif
diff --git a/libtommath/bn_prime_tab.c b/libtommath/bn_prime_tab.c
new file mode 100644
index 0000000..14306c2
--- /dev/null
+++ b/libtommath/bn_prime_tab.c
@@ -0,0 +1,57 @@
+#include <tommath.h>
+#ifdef BN_PRIME_TAB_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+const mp_digit ltm_prime_tab[] = {
+  0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
+  0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
+  0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
+  0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F,
+#ifndef MP_8BIT
+  0x0083,
+  0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
+  0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
+  0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
+  0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
+
+  0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
+  0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
+  0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
+  0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
+  0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
+  0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
+  0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
+  0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
+
+  0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
+  0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
+  0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
+  0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
+  0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
+  0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
+  0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
+  0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
+
+  0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
+  0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
+  0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
+  0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
+  0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
+  0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
+  0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
+  0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
+#endif
+};
+#endif
diff --git a/libtommath/bn_reverse.c b/libtommath/bn_reverse.c
new file mode 100644
index 0000000..851a6e8
--- /dev/null
+++ b/libtommath/bn_reverse.c
@@ -0,0 +1,35 @@
+#include <tommath.h>
+#ifdef BN_REVERSE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* reverse an array, used for radix code */
+void
+bn_reverse (unsigned char *s, int len)
+{
+  int     ix, iy;
+  unsigned char t;
+
+  ix = 0;
+  iy = len - 1;
+  while (ix < iy) {
+    t     = s[ix];
+    s[ix] = s[iy];
+    s[iy] = t;
+    ++ix;
+    --iy;
+  }
+}
+#endif
diff --git a/libtommath/bn_s_mp_add.c b/libtommath/bn_s_mp_add.c
new file mode 100644
index 0000000..2b378ae
--- /dev/null
+++ b/libtommath/bn_s_mp_add.c
@@ -0,0 +1,105 @@
+#include <tommath.h>
+#ifdef BN_S_MP_ADD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* low level addition, based on HAC pp.594, Algorithm 14.7 */
+int
+s_mp_add (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int *x;
+  int     olduse, res, min, max;
+
+  /* find sizes, we let |a| <= |b| which means we have to sort
+   * them.  "x" will point to the input with the most digits
+   */
+  if (a->used > b->used) {
+    min = b->used;
+    max = a->used;
+    x = a;
+  } else {
+    min = a->used;
+    max = b->used;
+    x = b;
+  }
+
+  /* init result */
+  if (c->alloc < max + 1) {
+    if ((res = mp_grow (c, max + 1)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* get old used digit count and set new one */
+  olduse = c->used;
+  c->used = max + 1;
+
+  {
+    register mp_digit u, *tmpa, *tmpb, *tmpc;
+    register int i;
+
+    /* alias for digit pointers */
+
+    /* first input */
+    tmpa = a->dp;
+
+    /* second input */
+    tmpb = b->dp;
+
+    /* destination */
+    tmpc = c->dp;
+
+    /* zero the carry */
+    u = 0;
+    for (i = 0; i < min; i++) {
+      /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
+      *tmpc = *tmpa++ + *tmpb++ + u;
+
+      /* U = carry bit of T[i] */
+      u = *tmpc >> ((mp_digit)DIGIT_BIT);
+
+      /* take away carry bit from T[i] */
+      *tmpc++ &= MP_MASK;
+    }
+
+    /* now copy higher words if any, that is in A+B 
+     * if A or B has more digits add those in 
+     */
+    if (min != max) {
+      for (; i < max; i++) {
+        /* T[i] = X[i] + U */
+        *tmpc = x->dp[i] + u;
+
+        /* U = carry bit of T[i] */
+        u = *tmpc >> ((mp_digit)DIGIT_BIT);
+
+        /* take away carry bit from T[i] */
+        *tmpc++ &= MP_MASK;
+      }
+    }
+
+    /* add carry */
+    *tmpc++ = u;
+
+    /* clear digits above oldused */
+    for (i = c->used; i < olduse; i++) {
+      *tmpc++ = 0;
+    }
+  }
+
+  mp_clamp (c);
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_s_mp_exptmod.c b/libtommath/bn_s_mp_exptmod.c
new file mode 100644
index 0000000..597e877
--- /dev/null
+++ b/libtommath/bn_s_mp_exptmod.c
@@ -0,0 +1,249 @@
+#include <tommath.h>
+#ifdef BN_S_MP_EXPTMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+#ifdef MP_LOW_MEM
+   #define TAB_SIZE 32
+#else
+   #define TAB_SIZE 256
+#endif
+
+int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
+{
+  mp_int  M[TAB_SIZE], res, mu;
+  mp_digit buf;
+  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+  int (*redux)(mp_int*,mp_int*,mp_int*);
+
+  /* find window size */
+  x = mp_count_bits (X);
+  if (x <= 7) {
+    winsize = 2;
+  } else if (x <= 36) {
+    winsize = 3;
+  } else if (x <= 140) {
+    winsize = 4;
+  } else if (x <= 450) {
+    winsize = 5;
+  } else if (x <= 1303) {
+    winsize = 6;
+  } else if (x <= 3529) {
+    winsize = 7;
+  } else {
+    winsize = 8;
+  }
+
+#ifdef MP_LOW_MEM
+    if (winsize > 5) {
+       winsize = 5;
+    }
+#endif
+
+  /* init M array */
+  /* init first cell */
+  if ((err = mp_init(&M[1])) != MP_OKAY) {
+     return err; 
+  }
+
+  /* now init the second half of the array */
+  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
+    if ((err = mp_init(&M[x])) != MP_OKAY) {
+      for (y = 1<<(winsize-1); y < x; y++) {
+        mp_clear (&M[y]);
+      }
+      mp_clear(&M[1]);
+      return err;
+    }
+  }
+
+  /* create mu, used for Barrett reduction */
+  if ((err = mp_init (&mu)) != MP_OKAY) {
+    goto LBL_M;
+  }
+  
+  if (redmode == 0) {
+     if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
+        goto LBL_MU;
+     }
+     redux = mp_reduce;
+  } else {
+     if ((err = mp_reduce_2k_setup_l (P, &mu)) != MP_OKAY) {
+        goto LBL_MU;
+     }
+     redux = mp_reduce_2k_l;
+  }    
+
+  /* create M table
+   *
+   * The M table contains powers of the base, 
+   * e.g. M[x] = G**x mod P
+   *
+   * The first half of the table is not 
+   * computed though accept for M[0] and M[1]
+   */
+  if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
+    goto LBL_MU;
+  }
+
+  /* compute the value at M[1<<(winsize-1)] by squaring 
+   * M[1] (winsize-1) times 
+   */
+  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
+    goto LBL_MU;
+  }
+
+  for (x = 0; x < (winsize - 1); x++) {
+    /* square it */
+    if ((err = mp_sqr (&M[1 << (winsize - 1)], 
+                       &M[1 << (winsize - 1)])) != MP_OKAY) {
+      goto LBL_MU;
+    }
+
+    /* reduce modulo P */
+    if ((err = redux (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
+      goto LBL_MU;
+    }
+  }
+
+  /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
+   * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
+   */
+  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
+    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
+      goto LBL_MU;
+    }
+    if ((err = redux (&M[x], P, &mu)) != MP_OKAY) {
+      goto LBL_MU;
+    }
+  }
+
+  /* setup result */
+  if ((err = mp_init (&res)) != MP_OKAY) {
+    goto LBL_MU;
+  }
+  mp_set (&res, 1);
+
+  /* set initial mode and bit cnt */
+  mode   = 0;
+  bitcnt = 1;
+  buf    = 0;
+  digidx = X->used - 1;
+  bitcpy = 0;
+  bitbuf = 0;
+
+  for (;;) {
+    /* grab next digit as required */
+    if (--bitcnt == 0) {
+      /* if digidx == -1 we are out of digits */
+      if (digidx == -1) {
+        break;
+      }
+      /* read next digit and reset the bitcnt */
+      buf    = X->dp[digidx--];
+      bitcnt = (int) DIGIT_BIT;
+    }
+
+    /* grab the next msb from the exponent */
+    y     = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
+    buf <<= (mp_digit)1;
+
+    /* if the bit is zero and mode == 0 then we ignore it
+     * These represent the leading zero bits before the first 1 bit
+     * in the exponent.  Technically this opt is not required but it
+     * does lower the # of trivial squaring/reductions used
+     */
+    if (mode == 0 && y == 0) {
+      continue;
+    }
+
+    /* if the bit is zero and mode == 1 then we square */
+    if (mode == 1 && y == 0) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+      continue;
+    }
+
+    /* else we add it to the window */
+    bitbuf |= (y << (winsize - ++bitcpy));
+    mode    = 2;
+
+    if (bitcpy == winsize) {
+      /* ok window is filled so square as required and multiply  */
+      /* square first */
+      for (x = 0; x < winsize; x++) {
+        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+          goto LBL_RES;
+        }
+        if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+          goto LBL_RES;
+        }
+      }
+
+      /* then multiply */
+      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+
+      /* empty window and reset */
+      bitcpy = 0;
+      bitbuf = 0;
+      mode   = 1;
+    }
+  }
+
+  /* if bits remain then square/multiply */
+  if (mode == 2 && bitcpy > 0) {
+    /* square then multiply if the bit is set */
+    for (x = 0; x < bitcpy; x++) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+
+      bitbuf <<= 1;
+      if ((bitbuf & (1 << winsize)) != 0) {
+        /* then multiply */
+        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
+          goto LBL_RES;
+        }
+        if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+          goto LBL_RES;
+        }
+      }
+    }
+  }
+
+  mp_exch (&res, Y);
+  err = MP_OKAY;
+LBL_RES:mp_clear (&res);
+LBL_MU:mp_clear (&mu);
+LBL_M:
+  mp_clear(&M[1]);
+  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
+    mp_clear (&M[x]);
+  }
+  return err;
+}
+#endif
diff --git a/libtommath/bn_s_mp_mul_digs.c b/libtommath/bn_s_mp_mul_digs.c
new file mode 100644
index 0000000..b40ae2e
--- /dev/null
+++ b/libtommath/bn_s_mp_mul_digs.c
@@ -0,0 +1,86 @@
+#include <tommath.h>
+#ifdef BN_S_MP_MUL_DIGS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* multiplies |a| * |b| and only computes upto digs digits of result
+ * HAC pp. 595, Algorithm 14.12  Modified so you can control how 
+ * many digits of output are created.
+ */
+int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+{
+  mp_int  t;
+  int     res, pa, pb, ix, iy;
+  mp_digit u;
+  mp_word r;
+  mp_digit tmpx, *tmpt, *tmpy;
+
+  /* can we use the fast multiplier? */
+  if (((digs) < MP_WARRAY) &&
+      MIN (a->used, b->used) < 
+          (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+    return fast_s_mp_mul_digs (a, b, c, digs);
+  }
+
+  if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
+    return res;
+  }
+  t.used = digs;
+
+  /* compute the digits of the product directly */
+  pa = a->used;
+  for (ix = 0; ix < pa; ix++) {
+    /* set the carry to zero */
+    u = 0;
+
+    /* limit ourselves to making digs digits of output */
+    pb = MIN (b->used, digs - ix);
+
+    /* setup some aliases */
+    /* copy of the digit from a used within the nested loop */
+    tmpx = a->dp[ix];
+    
+    /* an alias for the destination shifted ix places */
+    tmpt = t.dp + ix;
+    
+    /* an alias for the digits of b */
+    tmpy = b->dp;
+
+    /* compute the columns of the output and propagate the carry */
+    for (iy = 0; iy < pb; iy++) {
+      /* compute the column as a mp_word */
+      r       = ((mp_word)*tmpt) +
+                ((mp_word)tmpx) * ((mp_word)*tmpy++) +
+                ((mp_word) u);
+
+      /* the new column is the lower part of the result */
+      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+
+      /* get the carry word from the result */
+      u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+    }
+    /* set carry if it is placed below digs */
+    if (ix + iy < digs) {
+      *tmpt = u;
+    }
+  }
+
+  mp_clamp (&t);
+  mp_exch (&t, c);
+
+  mp_clear (&t);
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_s_mp_mul_high_digs.c b/libtommath/bn_s_mp_mul_high_digs.c
new file mode 100644
index 0000000..a060248
--- /dev/null
+++ b/libtommath/bn_s_mp_mul_high_digs.c
@@ -0,0 +1,77 @@
+#include <tommath.h>
+#ifdef BN_S_MP_MUL_HIGH_DIGS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* multiplies |a| * |b| and does not compute the lower digs digits
+ * [meant to get the higher part of the product]
+ */
+int
+s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+{
+  mp_int  t;
+  int     res, pa, pb, ix, iy;
+  mp_digit u;
+  mp_word r;
+  mp_digit tmpx, *tmpt, *tmpy;
+
+  /* can we use the fast multiplier? */
+#ifdef BN_FAST_S_MP_MUL_HIGH_DIGS_C
+  if (((a->used + b->used + 1) < MP_WARRAY)
+      && MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+    return fast_s_mp_mul_high_digs (a, b, c, digs);
+  }
+#endif
+
+  if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
+    return res;
+  }
+  t.used = a->used + b->used + 1;
+
+  pa = a->used;
+  pb = b->used;
+  for (ix = 0; ix < pa; ix++) {
+    /* clear the carry */
+    u = 0;
+
+    /* left hand side of A[ix] * B[iy] */
+    tmpx = a->dp[ix];
+
+    /* alias to the address of where the digits will be stored */
+    tmpt = &(t.dp[digs]);
+
+    /* alias for where to read the right hand side from */
+    tmpy = b->dp + (digs - ix);
+
+    for (iy = digs - ix; iy < pb; iy++) {
+      /* calculate the double precision result */
+      r       = ((mp_word)*tmpt) +
+                ((mp_word)tmpx) * ((mp_word)*tmpy++) +
+                ((mp_word) u);
+
+      /* get the lower part */
+      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+
+      /* carry the carry */
+      u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+    }
+    *tmpt = u;
+  }
+  mp_clamp (&t);
+  mp_exch (&t, c);
+  mp_clear (&t);
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_s_mp_sqr.c b/libtommath/bn_s_mp_sqr.c
new file mode 100644
index 0000000..9cdb563
--- /dev/null
+++ b/libtommath/bn_s_mp_sqr.c
@@ -0,0 +1,80 @@
+#include <tommath.h>
+#ifdef BN_S_MP_SQR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
+int s_mp_sqr (mp_int * a, mp_int * b)
+{
+  mp_int  t;
+  int     res, ix, iy, pa;
+  mp_word r;
+  mp_digit u, tmpx, *tmpt;
+
+  pa = a->used;
+  if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) {
+    return res;
+  }
+
+  /* default used is maximum possible size */
+  t.used = 2*pa + 1;
+
+  for (ix = 0; ix < pa; ix++) {
+    /* first calculate the digit at 2*ix */
+    /* calculate double precision result */
+    r = ((mp_word) t.dp[2*ix]) +
+        ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
+
+    /* store lower part in result */
+    t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
+
+    /* get the carry */
+    u           = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+
+    /* left hand side of A[ix] * A[iy] */
+    tmpx        = a->dp[ix];
+
+    /* alias for where to store the results */
+    tmpt        = t.dp + (2*ix + 1);
+    
+    for (iy = ix + 1; iy < pa; iy++) {
+      /* first calculate the product */
+      r       = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
+
+      /* now calculate the double precision result, note we use
+       * addition instead of *2 since it's easier to optimize
+       */
+      r       = ((mp_word) *tmpt) + r + r + ((mp_word) u);
+
+      /* store lower part */
+      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+
+      /* get carry */
+      u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+    }
+    /* propagate upwards */
+    while (u != ((mp_digit) 0)) {
+      r       = ((mp_word) *tmpt) + ((mp_word) u);
+      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+      u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+    }
+  }
+
+  mp_clamp (&t);
+  mp_exch (&t, b);
+  mp_clear (&t);
+  return MP_OKAY;
+}
+#endif
diff --git a/libtommath/bn_s_mp_sub.c b/libtommath/bn_s_mp_sub.c
new file mode 100644
index 0000000..5b7aef9
--- /dev/null
+++ b/libtommath/bn_s_mp_sub.c
@@ -0,0 +1,85 @@
+#include <tommath.h>
+#ifdef BN_S_MP_SUB_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
+int
+s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     olduse, res, min, max;
+
+  /* find sizes */
+  min = b->used;
+  max = a->used;
+
+  /* init result */
+  if (c->alloc < max) {
+    if ((res = mp_grow (c, max)) != MP_OKAY) {
+      return res;
+    }
+  }
+  olduse = c->used;
+  c->used = max;
+
+  {
+    register mp_digit u, *tmpa, *tmpb, *tmpc;
+    register int i;
+
+    /* alias for digit pointers */
+    tmpa = a->dp;
+    tmpb = b->dp;
+    tmpc = c->dp;
+
+    /* set carry to zero */
+    u = 0;
+    for (i = 0; i < min; i++) {
+      /* T[i] = A[i] - B[i] - U */
+      *tmpc = *tmpa++ - *tmpb++ - u;
+
+      /* U = carry bit of T[i]
+       * Note this saves performing an AND operation since
+       * if a carry does occur it will propagate all the way to the
+       * MSB.  As a result a single shift is enough to get the carry
+       */
+      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
+
+      /* Clear carry from T[i] */
+      *tmpc++ &= MP_MASK;
+    }
+
+    /* now copy higher words if any, e.g. if A has more digits than B  */
+    for (; i < max; i++) {
+      /* T[i] = A[i] - U */
+      *tmpc = *tmpa++ - u;
+
+      /* U = carry bit of T[i] */
+      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
+
+      /* Clear carry from T[i] */
+      *tmpc++ &= MP_MASK;
+    }
+
+    /* clear digits above used (since we may not have grown result above) */
+    for (i = c->used; i < olduse; i++) {
+      *tmpc++ = 0;
+    }
+  }
+
+  mp_clamp (c);
+  return MP_OKAY;
+}
+
+#endif
diff --git a/libtommath/bncore.c b/libtommath/bncore.c
new file mode 100644
index 0000000..82e3132
--- /dev/null
+++ b/libtommath/bncore.c
@@ -0,0 +1,32 @@
+#include <tommath.h>
+#ifdef BNCORE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* Known optimal configurations
+
+ CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
+-------------------------------------------------------------
+ Intel P4 Northwood     /GCC v3.4.1   /        88/       128/LTM 0.32 ;-)
+ AMD Athlon64           /GCC v3.4.4   /        74/       124/LTM 0.34
+ 
+*/
+
+int     KARATSUBA_MUL_CUTOFF = 74,      /* Min. number of digits before Karatsuba multiplication is used. */
+        KARATSUBA_SQR_CUTOFF = 124,     /* Min. number of digits before Karatsuba squaring is used. */
+        
+        TOOM_MUL_CUTOFF      = 350,      /* no optimal values of these are known yet so set em high */
+        TOOM_SQR_CUTOFF      = 400; 
+#endif
diff --git a/libtommath/booker.pl b/libtommath/booker.pl
new file mode 100644
index 0000000..5c77e53
--- /dev/null
+++ b/libtommath/booker.pl
@@ -0,0 +1,262 @@
+#!/bin/perl
+#
+#Used to prepare the book "tommath.src" for LaTeX by pre-processing it into a .tex file
+#
+#Essentially you write the "tommath.src" as normal LaTex except where you want code snippets you put
+#
+#EXAM,file
+#
+#This preprocessor will then open "file" and insert it as a verbatim copy.
+#
+#Tom St Denis
+
+#get graphics type
+if (shift =~ /PDF/) {
+   $graph = "";
+} else {
+   $graph = ".ps";
+}   
+
+open(IN,"<tommath.src") or die "Can't open source file";
+open(OUT,">tommath.tex") or die "Can't open destination file";
+
+print "Scanning for sections\n";
+$chapter = $section = $subsection = 0;
+$x = 0;
+while (<IN>) {
+   print ".";
+   if (!(++$x % 80)) { print "\n"; }
+   #update the headings 
+   if (~($_ =~ /\*/)) {
+      if ($_ =~ /\\chapter{.+}/) {
+          ++$chapter;
+          $section = $subsection = 0;
+      } elsif ($_ =~ /\\section{.+}/) {
+          ++$section;
+          $subsection = 0;
+      } elsif ($_ =~ /\\subsection{.+}/) {
+          ++$subsection;
+      }
+   }      
+
+   if ($_ =~ m/MARK/) {
+      @m = split(",",$_);
+      chomp(@m[1]);
+      $index1{@m[1]} = $chapter;
+      $index2{@m[1]} = $section;
+      $index3{@m[1]} = $subsection;
+   }
+}
+close(IN);
+
+open(IN,"<tommath.src") or die "Can't open source file";
+$readline = $wroteline = 0;
+$srcline = 0;
+
+while (<IN>) {
+   ++$readline;
+   ++$srcline;
+   
+   if ($_ =~ m/MARK/) {
+   } elsif ($_ =~ m/EXAM/ || $_ =~ m/LIST/) {
+      if ($_ =~ m/EXAM/) {
+         $skipheader = 1;
+      } else {
+         $skipheader = 0;
+      }
+      
+      # EXAM,file
+      chomp($_);
+      @m = split(",",$_);
+      open(SRC,"<$m[1]") or die "Error:$srcline:Can't open source file $m[1]";
+      
+      print "$srcline:Inserting $m[1]:";
+      
+      $line = 0;
+      $tmp = $m[1];
+      $tmp =~ s/_/"\\_"/ge;
+      print OUT "\\vspace{+3mm}\\begin{small}\n\\hspace{-5.1mm}{\\bf File}: $tmp\n\\vspace{-3mm}\n\\begin{alltt}\n";
+      $wroteline += 5;
+      
+      if ($skipheader == 1) {
+         # scan till next end of comment, e.g. skip license 
+         while (<SRC>) {
+            $text[$line++] = $_;
+            last if ($_ =~ /math\.libtomcrypt\.org/);
+         }
+         <SRC>;   
+      }
+      
+      $inline = 0;
+      while (<SRC>) {
+         $text[$line++] = $_;
+         ++$inline;
+         chomp($_);
+         $_ =~ s/\t/"    "/ge;
+         $_ =~ s/{/"^{"/ge;
+         $_ =~ s/}/"^}"/ge;
+         $_ =~ s/\\/'\symbol{92}'/ge;
+         $_ =~ s/\^/"\\"/ge;
+           
+         printf OUT ("%03d   ", $line);
+         for ($x = 0; $x < length($_); $x++) {
+             print OUT chr(vec($_, $x, 8));
+             if ($x == 75) { 
+                 print OUT "\n      ";
+                 ++$wroteline;
+             }
+         }
+         print OUT "\n";
+         ++$wroteline;
+      }
+      $totlines = $line;
+      print OUT "\\end{alltt}\n\\end{small}\n";
+      close(SRC);
+      print "$inline lines\n";
+      $wroteline += 2;
+   } elsif ($_ =~ m/@\d+,.+@/) {
+     # line contains [number,text]
+     # e.g. @14,for (ix = 0)@
+     $txt = $_;
+     while ($txt =~ m/@\d+,.+@/) {
+        @m = split("@",$txt);      # splits into text, one, two
+        @parms = split(",",$m[1]);  # splits one,two into two elements 
+                
+        # now search from $parms[0] down for $parms[1] 
+        $found1 = 0;
+        $found2 = 0;
+        for ($i = $parms[0]; $i < $totlines && $found1 == 0; $i++) {
+           if ($text[$i] =~ m/\Q$parms[1]\E/) {
+              $foundline1 = $i + 1;
+              $found1 = 1;
+           }
+        }
+        
+        # now search backwards
+        for ($i = $parms[0] - 1; $i >= 0 && $found2 == 0; $i--) {
+           if ($text[$i] =~ m/\Q$parms[1]\E/) {
+              $foundline2 = $i + 1;
+              $found2 = 1;
+           }
+        }
+        
+        # now use the closest match or the first if tied
+        if ($found1 == 1 && $found2 == 0) {
+           $found = 1;
+           $foundline = $foundline1;
+        } elsif ($found1 == 0 && $found2 == 1) {
+           $found = 1;
+           $foundline = $foundline2;
+        } elsif ($found1 == 1 && $found2 == 1) {
+           $found = 1;
+           if (($foundline1 - $parms[0]) <= ($parms[0] - $foundline2)) {
+              $foundline = $foundline1;
+           } else {
+              $foundline = $foundline2;
+           }
+        } else {
+           $found = 0;
+        }
+                      
+        # if found replace 
+        if ($found == 1) {
+           $delta = $parms[0] - $foundline;
+           print "Found replacement tag for \"$parms[1]\" on line $srcline which refers to line $foundline (delta $delta)\n";
+           $_ =~ s/@\Q$m[1]\E@/$foundline/;
+        } else {
+           print "ERROR:  The tag \"$parms[1]\" on line $srcline was not found in the most recently parsed source!\n";
+        }
+        
+        # remake the rest of the line 
+        $cnt = @m;
+        $txt = "";
+        for ($i = 2; $i < $cnt; $i++) {
+            $txt = $txt . $m[$i] . "@";
+        }
+     }
+     print OUT $_;
+     ++$wroteline;
+   } elsif ($_ =~ /~.+~/) {
+      # line contains a ~text~ pair used to refer to indexing :-)
+      $txt = $_;
+      while ($txt =~ /~.+~/) {
+         @m = split("~", $txt);
+         
+         # word is the second position
+         $word = @m[1];
+         $a = $index1{$word};
+         $b = $index2{$word};
+         $c = $index3{$word};
+         
+         # if chapter (a) is zero it wasn't found
+         if ($a == 0) {
+            print "ERROR: the tag \"$word\" on line $srcline was not found previously marked.\n";
+         } else {
+            # format the tag as x, x.y or x.y.z depending on the values
+            $str = $a;
+            $str = $str . ".$b" if ($b != 0);
+            $str = $str . ".$c" if ($c != 0);
+            
+            if ($b == 0 && $c == 0) {
+               # its a chapter
+               if ($a <= 10) {
+                  if ($a == 1) {
+                     $str = "chapter one";
+                  } elsif ($a == 2) {
+                     $str = "chapter two";
+                  } elsif ($a == 3) {
+                     $str = "chapter three";
+                  } elsif ($a == 4) {
+                     $str = "chapter four";
+                  } elsif ($a == 5) {
+                     $str = "chapter five";
+                  } elsif ($a == 6) {
+                     $str = "chapter six";
+                  } elsif ($a == 7) {
+                     $str = "chapter seven";
+                  } elsif ($a == 8) {
+                     $str = "chapter eight";
+                  } elsif ($a == 9) {
+                     $str = "chapter nine";
+                  } elsif ($a == 2) {
+                     $str = "chapter ten";
+                  }
+               } else {
+                  $str = "chapter " . $str;
+               }
+            } else {
+               $str = "section " . $str     if ($b != 0 && $c == 0);            
+               $str = "sub-section " . $str if ($b != 0 && $c != 0);
+            }
+            
+            #substitute
+            $_ =~ s/~\Q$word\E~/$str/;
+            
+            print "Found replacement tag for marker \"$word\" on line $srcline which refers to $str\n";
+         }
+         
+         # remake rest of the line
+         $cnt = @m;
+         $txt = "";
+         for ($i = 2; $i < $cnt; $i++) {
+             $txt = $txt . $m[$i] . "~";
+         }
+      }
+      print OUT $_;
+      ++$wroteline;
+   } elsif ($_ =~ m/FIGU/) {
+      # FIGU,file,caption
+      chomp($_);
+      @m = split(",", $_);
+      print OUT "\\begin{center}\n\\begin{figure}[here]\n\\includegraphics{pics/$m[1]$graph}\n";
+      print OUT "\\caption{$m[2]}\n\\label{pic:$m[1]}\n\\end{figure}\n\\end{center}\n";
+      $wroteline += 4;
+   } else {
+      print OUT $_;
+      ++$wroteline;
+   }
+}
+print "Read $readline lines, wrote $wroteline lines\n";
+
+close (OUT);
+close (IN);
diff --git a/libtommath/callgraph.txt b/libtommath/callgraph.txt
new file mode 100644
index 0000000..2efcf24
--- /dev/null
+++ b/libtommath/callgraph.txt
@@ -0,0 +1,11913 @@
+BN_PRIME_TAB_C
+
+
+BN_MP_SQRT_C
++--->BN_MP_N_ROOT_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_EXPT_D_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_SQR_C
+|   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MUL_C
+|   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_MUL_C
+|   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MUL_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_SUB_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_ZERO_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_RSHD_C
++--->BN_MP_DIV_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_SET_C
+|   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_ABS_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_MUL_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_ADD_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_DIV_2_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CMP_MAG_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_CMP_D_C
+
+
+BN_MP_EXCH_C
+
+
+BN_MP_IS_SQUARE_C
++--->BN_MP_MOD_D_C
+|   +--->BN_MP_DIV_D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_INIT_SET_INT_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_SET_INT_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_GET_INT_C
++--->BN_MP_SQRT_C
+|   +--->BN_MP_N_ROOT_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_EXPT_D_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_SQR_C
+|   |   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_MUL_C
+|   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_ABS_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_SUB_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_2_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_SQR_C
+|   +--->BN_MP_TOOM_SQR_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_KARATSUBA_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_SQR_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_CMP_MAG_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_NEG_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
+
+
+BN_MP_EXPTMOD_C
++--->BN_MP_INIT_C
++--->BN_MP_INVMOD_C
+|   +--->BN_FAST_MP_INVMOD_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_CMP_D_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INVMOD_SLOW_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_CMP_D_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
++--->BN_MP_CLEAR_C
++--->BN_MP_ABS_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_CLEAR_MULTI_C
++--->BN_MP_REDUCE_IS_2K_L_C
++--->BN_S_MP_EXPTMOD_C
+|   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_REDUCE_SETUP_C
+|   |   +--->BN_MP_2EXPT_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_DIV_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_REDUCE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_C
+|   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_D_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_REDUCE_2K_SETUP_L_C
+|   |   +--->BN_MP_2EXPT_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_REDUCE_2K_L_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_MUL_C
+|   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MOD_C
+|   |   +--->BN_MP_DIV_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_SQR_C
+|   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SQR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_MUL_C
+|   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_DR_IS_MODULUS_C
++--->BN_MP_REDUCE_IS_2K_C
+|   +--->BN_MP_REDUCE_2K_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_COUNT_BITS_C
++--->BN_MP_EXPTMOD_FAST_C
+|   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_MONTGOMERY_SETUP_C
+|   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   +--->BN_MP_MONTGOMERY_REDUCE_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   +--->BN_MP_DR_SETUP_C
+|   +--->BN_MP_DR_REDUCE_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   +--->BN_MP_REDUCE_2K_SETUP_C
+|   |   +--->BN_MP_2EXPT_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_REDUCE_2K_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+|   |   +--->BN_MP_2EXPT_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MULMOD_C
+|   |   +--->BN_MP_MUL_C
+|   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MOD_C
+|   |   +--->BN_MP_DIV_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_SQR_C
+|   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SQR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_MUL_C
+|   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_EXCH_C
+
+
+BN_MP_OR_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_ZERO_C
+
+
+BN_MP_GROW_C
+
+
+BN_MP_COUNT_BITS_C
+
+
+BN_MP_PRIME_FERMAT_C
++--->BN_MP_CMP_D_C
++--->BN_MP_INIT_C
++--->BN_MP_EXPTMOD_C
+|   +--->BN_MP_INVMOD_C
+|   |   +--->BN_FAST_MP_INVMOD_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INVMOD_SLOW_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_REDUCE_IS_2K_L_C
+|   +--->BN_S_MP_EXPTMOD_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_REDUCE_SETUP_C
+|   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_REDUCE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_REDUCE_2K_SETUP_L_C
+|   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_REDUCE_2K_L_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_SQR_C
+|   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_MUL_C
+|   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_DR_IS_MODULUS_C
+|   +--->BN_MP_REDUCE_IS_2K_C
+|   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_EXPTMOD_FAST_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_MONTGOMERY_SETUP_C
+|   |   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_MONTGOMERY_REDUCE_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_DR_SETUP_C
+|   |   +--->BN_MP_DR_REDUCE_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_REDUCE_2K_SETUP_C
+|   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+|   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MULMOD_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_SQR_C
+|   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_MUL_C
+|   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
++--->BN_MP_CMP_C
+|   +--->BN_MP_CMP_MAG_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_SUBMOD_C
++--->BN_MP_INIT_C
++--->BN_MP_SUB_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_CLEAR_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+
+
+BN_MP_MOD_2D_C
++--->BN_MP_ZERO_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
+
+
+BN_MP_TORADIX_N_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_DIV_D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_DIV_3_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_CMP_C
++--->BN_MP_CMP_MAG_C
+
+
+BNCORE_C
+
+
+BN_MP_TORADIX_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_DIV_D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_DIV_3_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_ADD_D_C
++--->BN_MP_GROW_C
++--->BN_MP_SUB_D_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLAMP_C
+
+
+BN_MP_DIV_3_C
++--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_INIT_C
++--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_FAST_S_MP_MUL_DIGS_C
++--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
+
+
+BN_MP_SQRMOD_C
++--->BN_MP_INIT_C
++--->BN_MP_SQR_C
+|   +--->BN_MP_TOOM_SQR_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_KARATSUBA_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_SQR_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_CLEAR_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+
+
+BN_MP_INVMOD_C
++--->BN_FAST_MP_INVMOD_C
+|   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_MOD_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_DIV_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_ABS_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_DIV_2_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_CMP_D_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_INVMOD_SLOW_C
+|   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_MOD_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_DIV_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_ABS_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_DIV_2_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_CMP_D_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
+
+
+BN_MP_AND_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_MUL_D_C
++--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
+
+
+BN_FAST_MP_INVMOD_C
++--->BN_MP_INIT_MULTI_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_DIV_2_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_SUB_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_CMP_C
+|   +--->BN_MP_CMP_MAG_C
++--->BN_MP_CMP_D_C
++--->BN_MP_ADD_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_MP_FWRITE_C
++--->BN_MP_RADIX_SIZE_C
+|   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_DIV_D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_TORADIX_C
+|   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_DIV_D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_S_MP_SQR_C
++--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_INIT_C
++--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_N_ROOT_C
++--->BN_MP_INIT_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_EXPT_D_C
+|   +--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_SQR_C
+|   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SQR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_MUL_C
+|   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
++--->BN_MP_MUL_C
+|   +--->BN_MP_TOOM_MUL_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_KARATSUBA_MUL_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_SUB_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_MUL_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_DIV_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_ABS_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_CMP_C
+|   +--->BN_MP_CMP_MAG_C
++--->BN_MP_SUB_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_ADD_D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_PRIME_RABIN_MILLER_TRIALS_C
+
+
+BN_MP_RADIX_SIZE_C
++--->BN_MP_COUNT_BITS_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_DIV_D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_DIV_3_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_READ_SIGNED_BIN_C
++--->BN_MP_READ_UNSIGNED_BIN_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
+
+
+BN_MP_PRIME_RANDOM_EX_C
++--->BN_MP_READ_UNSIGNED_BIN_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_PRIME_IS_PRIME_C
+|   +--->BN_MP_CMP_D_C
+|   +--->BN_MP_PRIME_IS_DIVISIBLE_C
+|   |   +--->BN_MP_MOD_D_C
+|   |   |   +--->BN_MP_DIV_D_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_INIT_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_INIT_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_PRIME_MILLER_RABIN_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_SUB_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CNT_LSB_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXPTMOD_C
+|   |   |   +--->BN_MP_INVMOD_C
+|   |   |   |   +--->BN_FAST_MP_INVMOD_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_INVMOD_SLOW_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_ABS_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_REDUCE_IS_2K_L_C
+|   |   |   +--->BN_S_MP_EXPTMOD_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_REDUCE_SETUP_C
+|   |   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_REDUCE_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_REDUCE_2K_SETUP_L_C
+|   |   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_REDUCE_2K_L_C
+|   |   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_DR_IS_MODULUS_C
+|   |   |   +--->BN_MP_REDUCE_IS_2K_C
+|   |   |   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_EXPTMOD_FAST_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_MONTGOMERY_SETUP_C
+|   |   |   |   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_MONTGOMERY_REDUCE_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_DR_SETUP_C
+|   |   |   |   +--->BN_MP_DR_REDUCE_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_REDUCE_2K_SETUP_C
+|   |   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+|   |   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MULMOD_C
+|   |   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_SQRMOD_C
+|   |   |   +--->BN_MP_SQR_C
+|   |   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_SUB_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_ADD_D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_DIV_2_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_MUL_2_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_ADD_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
+
+
+BN_MP_KARATSUBA_SQR_C
++--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_INIT_C
++--->BN_MP_CLAMP_C
++--->BN_MP_SQR_C
+|   +--->BN_MP_TOOM_SQR_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_SQR_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_S_MP_SQR_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_SUB_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
++--->BN_S_MP_ADD_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_LSHD_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_ZERO_C
++--->BN_MP_ADD_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_INIT_COPY_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
+
+
+BN_MP_CLAMP_C
+
+
+BN_MP_TOOM_SQR_C
++--->BN_MP_INIT_MULTI_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_MOD_2D_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_RSHD_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_SQR_C
+|   +--->BN_MP_KARATSUBA_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_SQR_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_MUL_2_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_ADD_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_SUB_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_DIV_2_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_MUL_2D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_LSHD_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_MUL_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_DIV_3_C
+|   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_LSHD_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_MP_MOD_C
++--->BN_MP_INIT_C
++--->BN_MP_DIV_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_SET_C
+|   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_ABS_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_MUL_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_CLEAR_C
++--->BN_MP_ADD_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
+
+
+BN_MP_INIT_C
+
+
+BN_MP_TOOM_MUL_C
++--->BN_MP_INIT_MULTI_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_MOD_2D_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_RSHD_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_MUL_C
+|   +--->BN_MP_KARATSUBA_MUL_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_MUL_2_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_ADD_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_SUB_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_DIV_2_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_MUL_2D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_LSHD_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_MUL_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_DIV_3_C
+|   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_LSHD_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_MP_PRIME_IS_PRIME_C
++--->BN_MP_CMP_D_C
++--->BN_MP_PRIME_IS_DIVISIBLE_C
+|   +--->BN_MP_MOD_D_C
+|   |   +--->BN_MP_DIV_D_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_INIT_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
++--->BN_MP_INIT_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_PRIME_MILLER_RABIN_C
+|   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_SUB_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CNT_LSB_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_EXPTMOD_C
+|   |   +--->BN_MP_INVMOD_C
+|   |   |   +--->BN_FAST_MP_INVMOD_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_INVMOD_SLOW_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_ABS_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_REDUCE_IS_2K_L_C
+|   |   +--->BN_S_MP_EXPTMOD_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_REDUCE_SETUP_C
+|   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_REDUCE_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_REDUCE_2K_SETUP_L_C
+|   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_REDUCE_2K_L_C
+|   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_SQR_C
+|   |   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_DR_IS_MODULUS_C
+|   |   +--->BN_MP_REDUCE_IS_2K_C
+|   |   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_EXPTMOD_FAST_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_MONTGOMERY_SETUP_C
+|   |   |   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_MONTGOMERY_REDUCE_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_DR_SETUP_C
+|   |   |   +--->BN_MP_DR_REDUCE_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_REDUCE_2K_SETUP_C
+|   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+|   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MULMOD_C
+|   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_SQR_C
+|   |   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_SQRMOD_C
+|   |   +--->BN_MP_SQR_C
+|   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_COPY_C
++--->BN_MP_GROW_C
+
+
+BN_S_MP_SUB_C
++--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
+
+
+BN_MP_READ_UNSIGNED_BIN_C
++--->BN_MP_GROW_C
++--->BN_MP_ZERO_C
++--->BN_MP_MUL_2D_C
+|   +--->BN_MP_COPY_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLAMP_C
+
+
+BN_MP_EXPTMOD_FAST_C
++--->BN_MP_COUNT_BITS_C
++--->BN_MP_INIT_C
++--->BN_MP_CLEAR_C
++--->BN_MP_MONTGOMERY_SETUP_C
++--->BN_FAST_MP_MONTGOMERY_REDUCE_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
++--->BN_MP_MONTGOMERY_REDUCE_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
++--->BN_MP_DR_SETUP_C
++--->BN_MP_DR_REDUCE_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
++--->BN_MP_REDUCE_2K_SETUP_C
+|   +--->BN_MP_2EXPT_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_REDUCE_2K_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_MUL_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+|   +--->BN_MP_2EXPT_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MUL_2_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_MULMOD_C
+|   +--->BN_MP_MUL_C
+|   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_MOD_C
+|   |   +--->BN_MP_DIV_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ABS_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_SQR_C
+|   +--->BN_MP_TOOM_SQR_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_KARATSUBA_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   +--->BN_FAST_S_MP_SQR_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
++--->BN_MP_MUL_C
+|   +--->BN_MP_TOOM_MUL_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_KARATSUBA_MUL_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
++--->BN_MP_EXCH_C
+
+
+BN_MP_TO_UNSIGNED_BIN_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_DIV_2D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_SET_INT_C
++--->BN_MP_ZERO_C
++--->BN_MP_MUL_2D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLAMP_C
+
+
+BN_MP_MOD_D_C
++--->BN_MP_DIV_D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_DIV_3_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_MP_SQR_C
++--->BN_MP_TOOM_SQR_C
+|   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MUL_2_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_2_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MUL_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_3_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_KARATSUBA_SQR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLEAR_C
++--->BN_FAST_S_MP_SQR_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_S_MP_SQR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_MP_MULMOD_C
++--->BN_MP_INIT_C
++--->BN_MP_MUL_C
+|   +--->BN_MP_TOOM_MUL_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_KARATSUBA_MUL_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_CLEAR_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+
+
+BN_MP_DIV_2D_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_ZERO_C
++--->BN_MP_INIT_C
++--->BN_MP_MOD_2D_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLEAR_C
++--->BN_MP_RSHD_C
++--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
+
+
+BN_S_MP_ADD_C
++--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
+
+
+BN_FAST_S_MP_SQR_C
++--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
+
+
+BN_S_MP_MUL_DIGS_C
++--->BN_FAST_S_MP_MUL_DIGS_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_INIT_C
++--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_XOR_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_RADIX_SMAP_C
+
+
+BN_MP_DR_IS_MODULUS_C
+
+
+BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
++--->BN_MP_COUNT_BITS_C
++--->BN_MP_2EXPT_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_MUL_2_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_CMP_MAG_C
++--->BN_S_MP_SUB_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
+
+
+BN_MP_SUB_C
++--->BN_S_MP_ADD_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CMP_MAG_C
++--->BN_S_MP_SUB_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
+
+
+BN_MP_INIT_MULTI_C
++--->BN_MP_INIT_C
++--->BN_MP_CLEAR_C
+
+
+BN_S_MP_MUL_HIGH_DIGS_C
++--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_INIT_C
++--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_PRIME_NEXT_PRIME_C
++--->BN_MP_CMP_D_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_SUB_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_ADD_D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_MOD_D_C
+|   +--->BN_MP_DIV_D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_INIT_C
++--->BN_MP_ADD_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_PRIME_MILLER_RABIN_C
+|   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CNT_LSB_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_EXPTMOD_C
+|   |   +--->BN_MP_INVMOD_C
+|   |   |   +--->BN_FAST_MP_INVMOD_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_INVMOD_SLOW_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_ABS_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_REDUCE_IS_2K_L_C
+|   |   +--->BN_S_MP_EXPTMOD_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_REDUCE_SETUP_C
+|   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_REDUCE_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_REDUCE_2K_SETUP_L_C
+|   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_REDUCE_2K_L_C
+|   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_SQR_C
+|   |   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_DR_IS_MODULUS_C
+|   |   +--->BN_MP_REDUCE_IS_2K_C
+|   |   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_EXPTMOD_FAST_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_MONTGOMERY_SETUP_C
+|   |   |   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_MONTGOMERY_REDUCE_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_DR_SETUP_C
+|   |   |   +--->BN_MP_DR_REDUCE_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_REDUCE_2K_SETUP_C
+|   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+|   |   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MULMOD_C
+|   |   |   |   +--->BN_MP_MUL_C
+|   |   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_SQR_C
+|   |   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_SQRMOD_C
+|   |   +--->BN_MP_SQR_C
+|   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_SIGNED_BIN_SIZE_C
++--->BN_MP_UNSIGNED_BIN_SIZE_C
+|   +--->BN_MP_COUNT_BITS_C
+
+
+BN_MP_INVMOD_SLOW_C
++--->BN_MP_INIT_MULTI_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_DIV_2_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_ADD_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_SUB_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_CMP_C
+|   +--->BN_MP_CMP_MAG_C
++--->BN_MP_CMP_D_C
++--->BN_MP_CMP_MAG_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_MP_LCM_C
++--->BN_MP_INIT_MULTI_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_GCD_C
+|   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CNT_LSB_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_CMP_MAG_C
++--->BN_MP_DIV_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_SET_C
+|   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_ABS_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_MUL_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_MUL_C
+|   +--->BN_MP_TOOM_MUL_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_KARATSUBA_MUL_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_MP_REDUCE_2K_L_C
++--->BN_MP_INIT_C
++--->BN_MP_COUNT_BITS_C
++--->BN_MP_DIV_2D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_MUL_C
+|   +--->BN_MP_TOOM_MUL_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_KARATSUBA_MUL_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_S_MP_ADD_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CMP_MAG_C
++--->BN_S_MP_SUB_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLEAR_C
+
+
+BN_REVERSE_C
+
+
+BN_MP_PRIME_IS_DIVISIBLE_C
++--->BN_MP_MOD_D_C
+|   +--->BN_MP_DIV_D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+
+
+BN_MP_SET_C
++--->BN_MP_ZERO_C
+
+
+BN_MP_GCD_C
++--->BN_MP_ABS_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_ZERO_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_CNT_LSB_C
++--->BN_MP_DIV_2D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_CMP_MAG_C
++--->BN_MP_EXCH_C
++--->BN_S_MP_SUB_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_MUL_2D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_REDUCE_2K_SETUP_L_C
++--->BN_MP_INIT_C
++--->BN_MP_2EXPT_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_COUNT_BITS_C
++--->BN_S_MP_SUB_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_READ_RADIX_C
++--->BN_MP_ZERO_C
++--->BN_MP_MUL_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_ADD_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_SUB_D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
+
+
+BN_FAST_S_MP_MUL_HIGH_DIGS_C
++--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
+
+
+BN_FAST_MP_MONTGOMERY_REDUCE_C
++--->BN_MP_GROW_C
++--->BN_MP_RSHD_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_CLAMP_C
++--->BN_MP_CMP_MAG_C
++--->BN_S_MP_SUB_C
+
+
+BN_MP_DIV_D_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_DIV_2D_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_DIV_3_C
+|   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_INIT_C
++--->BN_MP_CLAMP_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_REDUCE_2K_SETUP_C
++--->BN_MP_INIT_C
++--->BN_MP_COUNT_BITS_C
++--->BN_MP_2EXPT_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_CLEAR_C
++--->BN_S_MP_SUB_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
+
+
+BN_MP_INIT_SET_C
++--->BN_MP_INIT_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
+
+
+BN_MP_REDUCE_2K_C
++--->BN_MP_INIT_C
++--->BN_MP_COUNT_BITS_C
++--->BN_MP_DIV_2D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_MUL_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_S_MP_ADD_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CMP_MAG_C
++--->BN_S_MP_SUB_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLEAR_C
+
+
+BN_ERROR_C
+
+
+BN_MP_EXPT_D_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_SQR_C
+|   +--->BN_MP_TOOM_SQR_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_KARATSUBA_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_SQR_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_CLEAR_C
++--->BN_MP_MUL_C
+|   +--->BN_MP_TOOM_MUL_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_KARATSUBA_MUL_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+
+
+BN_S_MP_EXPTMOD_C
++--->BN_MP_COUNT_BITS_C
++--->BN_MP_INIT_C
++--->BN_MP_CLEAR_C
++--->BN_MP_REDUCE_SETUP_C
+|   +--->BN_MP_2EXPT_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_REDUCE_C
+|   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MUL_C
+|   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_D_C
+|   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_REDUCE_2K_SETUP_L_C
+|   +--->BN_MP_2EXPT_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_REDUCE_2K_L_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_MUL_C
+|   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_SQR_C
+|   +--->BN_MP_TOOM_SQR_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_KARATSUBA_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   +--->BN_FAST_S_MP_SQR_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_SQR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
++--->BN_MP_MUL_C
+|   +--->BN_MP_TOOM_MUL_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_KARATSUBA_MUL_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_EXCH_C
+
+
+BN_MP_ABS_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
+
+
+BN_MP_INIT_SET_INT_C
++--->BN_MP_INIT_C
++--->BN_MP_SET_INT_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
+
+
+BN_MP_SUB_D_C
++--->BN_MP_GROW_C
++--->BN_MP_ADD_D_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLAMP_C
+
+
+BN_MP_TO_SIGNED_BIN_C
++--->BN_MP_TO_UNSIGNED_BIN_C
+|   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_MP_DIV_2_C
++--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
+
+
+BN_MP_REDUCE_IS_2K_C
++--->BN_MP_REDUCE_2K_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_MUL_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_COUNT_BITS_C
+
+
+BN_MP_INIT_SIZE_C
++--->BN_MP_INIT_C
+
+
+BN_MP_DIV_C
++--->BN_MP_CMP_MAG_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_ZERO_C
++--->BN_MP_INIT_MULTI_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_SET_C
++--->BN_MP_COUNT_BITS_C
++--->BN_MP_ABS_C
++--->BN_MP_MUL_2D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CMP_C
++--->BN_MP_SUB_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_ADD_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_DIV_2D_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_INIT_C
++--->BN_MP_INIT_C
++--->BN_MP_INIT_COPY_C
++--->BN_MP_LSHD_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_RSHD_C
++--->BN_MP_RSHD_C
++--->BN_MP_MUL_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLAMP_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_CLEAR_C
+
+
+BN_MP_MONTGOMERY_REDUCE_C
++--->BN_FAST_MP_MONTGOMERY_REDUCE_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
++--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
++--->BN_MP_RSHD_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_CMP_MAG_C
++--->BN_S_MP_SUB_C
+
+
+BN_MP_MUL_2_C
++--->BN_MP_GROW_C
+
+
+BN_MP_UNSIGNED_BIN_SIZE_C
++--->BN_MP_COUNT_BITS_C
+
+
+BN_MP_ADDMOD_C
++--->BN_MP_INIT_C
++--->BN_MP_ADD_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_CLEAR_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+
+
+BN_MP_ADD_C
++--->BN_S_MP_ADD_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CMP_MAG_C
++--->BN_S_MP_SUB_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
+
+
+BN_MP_TO_SIGNED_BIN_N_C
++--->BN_MP_SIGNED_BIN_SIZE_C
+|   +--->BN_MP_UNSIGNED_BIN_SIZE_C
+|   |   +--->BN_MP_COUNT_BITS_C
++--->BN_MP_TO_SIGNED_BIN_C
+|   +--->BN_MP_TO_UNSIGNED_BIN_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+
+
+BN_MP_REDUCE_IS_2K_L_C
+
+
+BN_MP_RAND_C
++--->BN_MP_ZERO_C
++--->BN_MP_ADD_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_SUB_D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_LSHD_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_RSHD_C
+
+
+BN_MP_CNT_LSB_C
+
+
+BN_MP_2EXPT_C
++--->BN_MP_ZERO_C
++--->BN_MP_GROW_C
+
+
+BN_MP_RSHD_C
++--->BN_MP_ZERO_C
+
+
+BN_MP_SHRINK_C
+
+
+BN_MP_TO_UNSIGNED_BIN_N_C
++--->BN_MP_UNSIGNED_BIN_SIZE_C
+|   +--->BN_MP_COUNT_BITS_C
++--->BN_MP_TO_UNSIGNED_BIN_C
+|   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_MP_REDUCE_C
++--->BN_MP_REDUCE_SETUP_C
+|   +--->BN_MP_2EXPT_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_RSHD_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_MUL_C
+|   +--->BN_MP_TOOM_MUL_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_KARATSUBA_MUL_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_S_MP_MUL_HIGH_DIGS_C
+|   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_MOD_2D_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_S_MP_MUL_DIGS_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_SUB_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_CMP_D_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_LSHD_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_ADD_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_CMP_C
+|   +--->BN_MP_CMP_MAG_C
++--->BN_S_MP_SUB_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_MUL_2D_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_GROW_C
++--->BN_MP_LSHD_C
+|   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_ZERO_C
++--->BN_MP_CLAMP_C
+
+
+BN_MP_GET_INT_C
+
+
+BN_MP_JACOBI_C
++--->BN_MP_CMP_D_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_CNT_LSB_C
++--->BN_MP_DIV_2D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_CLEAR_MULTI_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_MUL_C
++--->BN_MP_TOOM_MUL_C
+|   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MUL_2_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_2_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_MUL_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_3_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_KARATSUBA_MUL_C
+|   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   +--->BN_MP_CLEAR_C
++--->BN_FAST_S_MP_MUL_DIGS_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_S_MP_MUL_DIGS_C
+|   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_MP_EXTEUCLID_C
++--->BN_MP_INIT_MULTI_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_SET_C
+|   +--->BN_MP_ZERO_C
++--->BN_MP_COPY_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_DIV_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_ABS_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_MUL_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
++--->BN_MP_MUL_C
+|   +--->BN_MP_TOOM_MUL_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_KARATSUBA_MUL_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_SUB_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
++--->BN_MP_NEG_C
++--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_MP_DR_REDUCE_C
++--->BN_MP_GROW_C
++--->BN_MP_CLAMP_C
++--->BN_MP_CMP_MAG_C
++--->BN_S_MP_SUB_C
+
+
+BN_MP_FREAD_C
++--->BN_MP_ZERO_C
++--->BN_MP_MUL_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_ADD_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_SUB_D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CMP_D_C
+
+
+BN_MP_REDUCE_SETUP_C
++--->BN_MP_2EXPT_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_GROW_C
++--->BN_MP_DIV_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_INIT_MULTI_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_SET_C
+|   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_ABS_C
+|   +--->BN_MP_MUL_2D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CMP_C
+|   +--->BN_MP_SUB_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_DIV_2D_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_EXCH_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_LSHD_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_MUL_D_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
+
+
+BN_MP_MONTGOMERY_SETUP_C
+
+
+BN_MP_KARATSUBA_MUL_C
++--->BN_MP_MUL_C
+|   +--->BN_MP_TOOM_MUL_C
+|   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_2D_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MUL_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_3_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_S_MP_MUL_DIGS_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_C
++--->BN_MP_INIT_SIZE_C
+|   +--->BN_MP_INIT_C
++--->BN_MP_CLAMP_C
++--->BN_MP_SUB_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_ADD_C
+|   +--->BN_S_MP_ADD_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CMP_MAG_C
+|   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_LSHD_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_ZERO_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_LSHD_C
++--->BN_MP_GROW_C
++--->BN_MP_RSHD_C
+|   +--->BN_MP_ZERO_C
+
+
+BN_MP_PRIME_MILLER_RABIN_C
++--->BN_MP_CMP_D_C
++--->BN_MP_INIT_COPY_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
++--->BN_MP_SUB_D_C
+|   +--->BN_MP_GROW_C
+|   +--->BN_MP_ADD_D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLAMP_C
++--->BN_MP_CNT_LSB_C
++--->BN_MP_DIV_2D_C
+|   +--->BN_MP_COPY_C
+|   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_ZERO_C
+|   +--->BN_MP_MOD_2D_C
+|   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_RSHD_C
+|   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
++--->BN_MP_EXPTMOD_C
+|   +--->BN_MP_INVMOD_C
+|   |   +--->BN_FAST_MP_INVMOD_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INVMOD_SLOW_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   +--->BN_MP_CLEAR_MULTI_C
+|   +--->BN_MP_REDUCE_IS_2K_L_C
+|   +--->BN_S_MP_EXPTMOD_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_REDUCE_SETUP_C
+|   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_REDUCE_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_FAST_S_MP_MUL_HIGH_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_REDUCE_2K_SETUP_L_C
+|   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_REDUCE_2K_L_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_SQR_C
+|   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_MUL_C
+|   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_EXCH_C
+|   +--->BN_MP_DR_IS_MODULUS_C
+|   +--->BN_MP_REDUCE_IS_2K_C
+|   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   +--->BN_MP_EXPTMOD_FAST_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_MONTGOMERY_SETUP_C
+|   |   +--->BN_FAST_MP_MONTGOMERY_REDUCE_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_MONTGOMERY_REDUCE_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_DR_SETUP_C
+|   |   +--->BN_MP_DR_REDUCE_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   +--->BN_MP_REDUCE_2K_SETUP_C
+|   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_REDUCE_2K_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+|   |   |   +--->BN_MP_2EXPT_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_MULMOD_C
+|   |   |   +--->BN_MP_MUL_C
+|   |   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_SQR_C
+|   |   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SQR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_MUL_C
+|   |   |   +--->BN_MP_TOOM_MUL_C
+|   |   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_KARATSUBA_MUL_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_FAST_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_MUL_DIGS_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
++--->BN_MP_CMP_C
+|   +--->BN_MP_CMP_MAG_C
++--->BN_MP_SQRMOD_C
+|   +--->BN_MP_SQR_C
+|   |   +--->BN_MP_TOOM_SQR_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_MUL_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_3_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_KARATSUBA_SQR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_FAST_S_MP_SQR_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_S_MP_SQR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_MOD_C
+|   |   +--->BN_MP_DIV_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_INIT_MULTI_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_ABS_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
++--->BN_MP_CLEAR_C
+
+
+BN_MP_DR_SETUP_C
+
+
+BN_MP_CMP_MAG_C
+
+
diff --git a/libtommath/changes.txt b/libtommath/changes.txt
new file mode 100644
index 0000000..99e40c1
--- /dev/null
+++ b/libtommath/changes.txt
@@ -0,0 +1,360 @@
+March 12th, 2005
+v0.35  -- Stupid XOR function missing line again... oops.
+       -- Fixed bug in invmod not handling negative inputs correctly [Wolfgang Ehrhardt]
+       -- Made exteuclid always give positive u3 output...[ Wolfgang Ehrhardt ]
+       -- [Wolfgang Ehrhardt] Suggested a fix for mp_reduce() which avoided underruns.  ;-)
+       -- mp_rand() would emit one too many digits and it was possible to get a 0 out of it ... oops
+       -- Added montgomery to the testing to make sure it handles 1..10 digit moduli correctly
+       -- Fixed bug in comba that would lead to possible erroneous outputs when "pa < digs" 
+       -- Fixed bug in mp_toradix_size for "0" [Kevin Kenny]
+       -- Updated chapters 1-5 of the textbook ;-) It now talks about the new comba code!
+
+February 12th, 2005
+v0.34  -- Fixed two more small errors in mp_prime_random_ex()
+       -- Fixed overflow in mp_mul_d() [Kevin Kenny]
+       -- Added mp_to_(un)signed_bin_n() functions which do bounds checking for ya [and report the size]
+       -- Added "large" diminished radix support.  Speeds up things like DSA where the moduli is of the form 2^k - P for some P < 2^(k/2) or so
+          Actually is faster than Montgomery on my AMD64 (and probably much faster on a P4)
+       -- Updated the manual a bit
+       -- Ok so I haven't done the textbook work yet... My current freelance gig has landed me in France till the 
+          end of Feb/05.  Once I get back I'll have tons of free time and I plan to go to town on the book.
+          As of this release the API will freeze.  At least until the book catches up with all the changes.  I welcome
+          bug reports but new algorithms will have to wait.
+
+December 23rd, 2004
+v0.33  -- Fixed "small" variant for mp_div() which would munge with negative dividends...
+       -- Fixed bug in mp_prime_random_ex() which would set the most significant byte to zero when
+          no special flags were set
+       -- Fixed overflow [minor] bug in fast_s_mp_sqr()
+       -- Made the makefiles easier to configure the group/user that ltm will install as
+       -- Fixed "final carry" bug in comba multipliers. (Volkan Ceylan)
+       -- Matt Johnston pointed out a missing semi-colon in mp_exptmod
+
+October 29th, 2004
+v0.32  -- Added "makefile.shared" for shared object support
+       -- Added more to the build options/configs in the manual
+       -- Started the Depends framework, wrote dep.pl to scan deps and 
+          produce "callgraph.txt" ;-)
+       -- Wrote SC_RSA_1 which will enable close to the minimum required to perform
+          RSA on 32-bit [or 64-bit] platforms with LibTomCrypt
+       -- Merged in the small/slower mp_div replacement.  You can now toggle which
+          you want to use as your mp_div() at build time.  Saves roughly 8KB or so.
+       -- Renamed a few files and changed some comments to make depends system work better.
+          (No changes to function names)
+       -- Merged in new Combas that perform 2 reads per inner loop instead of the older 
+          3reads/2writes per inner loop of the old code.  Really though if you want speed
+          learn to use TomsFastMath ;-)
+
+August 9th, 2004
+v0.31  -- "profiled" builds now :-) new timings for Intel Northwoods
+       -- Added "pretty" build target
+       -- Update mp_init() to actually assign 0's instead of relying on calloc()
+       -- "Wolfgang Ehrhardt" <Wolfgang.Ehrhardt@munich.netsurf.de> found a bug in mp_mul() where if
+          you multiply a negative by zero you get negative zero as the result.  Oops.
+       -- J Harper from PeerSec let me toy with his AMD64 and I got 60-bit digits working properly
+          [this also means that I fixed a bug where if sizeof(int) < sizeof(mp_digit) it would bug]
+
+April 11th, 2004
+v0.30  -- Added "mp_toradix_n" which stores upto "n-1" least significant digits of an mp_int
+       -- Johan Lindh sent a patch so MSVC wouldn't whine about redefining malloc [in weird dll modes]
+       -- Henrik Goldman spotted a missing OPT_CAST in mp_fwrite()
+       -- Tuned tommath.h so that when MP_LOW_MEM is defined MP_PREC shall be reduced.
+          [I also allow MP_PREC to be externally defined now]
+       -- Sped up mp_cnt_lsb() by using a 4x4 table [e.g. 4x speedup]
+       -- Added mp_prime_random_ex() which is a more versatile prime generator accurate to
+          exact bit lengths (unlike the deprecated but still available mp_prime_random() which
+          is only accurate to byte lengths).  See the new LTM_PRIME_* flags ;-)
+       -- Alex Polushin contributed an optimized mp_sqrt() as well as mp_get_int() and mp_is_square().
+          I've cleaned them all up to be a little more consistent [along with one bug fix] for this release.
+       -- Added mp_init_set and mp_init_set_int to initialize and set small constants with one function
+          call.
+       -- Removed /etclib directory [um LibTomPoly deprecates this].
+       -- Fixed mp_mod() so the sign of the result agrees with the sign of the modulus.
+       ++ N.B.  My semester is almost up so expect updates to the textbook to be posted to the libtomcrypt.org 
+          website.  
+
+Jan 25th, 2004
+v0.29  ++ Note: "Henrik" from the v0.28 changelog refers to Henrik Goldman ;-)
+       -- Added fix to mp_shrink to prevent a realloc when used == 0 [e.g. realloc zero bytes???]
+       -- Made the mp_prime_rabin_miller_trials() function internal table smaller and also
+          set the minimum number of tests to two (sounds a bit safer).
+       -- Added a mp_exteuclid() which computes the extended euclidean algorithm.
+       -- Fixed a memory leak in s_mp_exptmod() [called when Barrett reduction is to be used] which would arise
+          if a multiplication or subsequent reduction failed [would not free the temp result].
+       -- Made an API change to mp_radix_size().  It now returns an error code and stores the required size
+          through an "int star" passed to it.
+
+Dec 24th, 2003
+v0.28  -- Henrik Goldman suggested I add casts to the montomgery code [stores into mu...] so compilers wouldn't
+          spew [erroneous] diagnostics... fixed.
+       -- Henrik Goldman also spotted two typos.  One in mp_radix_size() and another in mp_toradix().
+       -- Added fix to mp_shrink() to avoid a memory leak.
+       -- Added mp_prime_random() which requires a callback to make truly random primes of a given nature
+          (idea from chat with Niels Ferguson at Crypto'03)
+       -- Picked up a second wind.  I'm filled with Gooo.  Mission Gooo!
+       -- Removed divisions from mp_reduce_is_2k()
+       -- Sped up mp_div_d() [general case] to use only one division per digit instead of two.
+       -- Added the heap macros from LTC to LTM.  Now you can easily [by editing four lines of tommath.h]
+          change the name of the heap functions used in LTM [also compatible with LTC via MPI mode]
+       -- Added bn_prime_rabin_miller_trials() which gives the number of Rabin-Miller trials to achieve
+          a failure rate of less than 2^-96
+       -- fixed bug in fast_mp_invmod().  The initial testing logic was wrong.  An invalid input is not when
+          "a" and "b" are even it's when "b" is even [the algo is for odd moduli only].
+       -- Started a new manual [finally].  It is incomplete and will be finished as time goes on.  I had to stop
+          adding full demos around half way in chapter three so I could at least get a good portion of the
+          manual done.   If you really need help using the library you can always email me!
+       -- My Textbook is now included as part of the package [all Public Domain]
+
+Sept 19th, 2003
+v0.27  -- Removed changes.txt~ which was made by accident since "kate" decided it was
+          a good time to re-enable backups... [kde is fun!]
+       -- In mp_grow() "a->dp" is not overwritten by realloc call [re: memory leak]
+          Now if mp_grow() fails the mp_int is still valid and can be cleared via
+          mp_clear() to reclaim the memory.
+       -- Henrik Goldman found a buffer overflow bug in mp_add_d().  Fixed.
+       -- Cleaned up mp_mul_d() to be much easier to read and follow.
+
+Aug 29th, 2003
+v0.26  -- Fixed typo that caused warning with GCC 3.2
+       -- Martin Marcel noticed a bug in mp_neg() that allowed negative zeroes.
+          Also, Martin is the fellow who noted the bugs in mp_gcd() of 0.24/0.25.
+       -- Martin Marcel noticed an optimization [and slight bug] in mp_lcm().
+       -- Added fix to mp_read_unsigned_bin to prevent a buffer overflow.
+       -- Beefed up the comments in the baseline multipliers [and montgomery]
+       -- Added "mont" demo to the makefile.msvc in etc/
+       -- Optimized sign compares in mp_cmp from 4 to 2 cases.
+
+Aug 4th, 2003
+v0.25  -- Fix to mp_gcd again... oops (0,-a) == (-a, 0) == a
+       -- Fix to mp_clear which didn't reset the sign  [Greg Rose]
+       -- Added mp_error_to_string() to convert return codes to strings.  [Greg Rose]
+       -- Optimized fast_mp_invmod() to do the test for invalid inputs [both even]
+          first so temps don't have to be initialized if it's going to fail.
+       -- Optimized mp_gcd() by removing mp_div_2d calls for when one of the inputs
+          is odd.
+       -- Tons of new comments, some indentation fixups, etc.
+       -- mp_jacobi() returns MP_VAL if the modulus is less than or equal to zero.
+       -- fixed two typos in the header of each file :-)
+       -- LibTomMath is officially Public Domain [see LICENSE]
+
+July 15th, 2003
+v0.24  -- Optimized mp_add_d and mp_sub_d to not allocate temporary variables
+       -- Fixed mp_gcd() so the gcd of 0,0 is 0.  Allows the gcd operation to be chained
+          e.g. (0,0,a) == a [instead of 1]
+       -- Should be one of the last release for a while.  Working on LibTomMath book now.
+       -- optimized the pprime demo [/etc/pprime.c] to first make a huge table of single
+          digit primes then it reads them randomly instead of randomly choosing/testing single
+          digit primes.
+
+July 12th, 2003
+v0.23  -- Optimized mp_prime_next_prime() to not use mp_mod [via is_divisible()] in each
+          iteration.  Instead now a smaller table is kept of the residues which can be updated
+          without division.
+       -- Fixed a bug in next_prime() where an input of zero would be treated as odd and
+          have two added to it [to move to the next odd].
+       -- fixed a bug in prime_fermat() and prime_miller_rabin() which allowed the base
+          to be negative, zero or one.  Normally the test is only valid if the base is
+          greater than one.
+       -- changed the next_prime() prototype to accept a new parameter "bbs_style" which
+          will find the next prime congruent to 3 mod 4.  The default [bbs_style==0] will
+          make primes which are either congruent to 1 or 3 mod 4.
+       -- fixed mp_read_unsigned_bin() so that it doesn't include both code for
+          the case DIGIT_BIT < 8 and >= 8
+       -- optimized div_d() to easy out on division by 1 [or if a == 0] and use
+          logical shifts if the divisor is a power of two.
+       -- the default DIGIT_BIT type was not int for non-default builds.  Fixed.
+
+July 2nd, 2003
+v0.22  -- Fixed up mp_invmod so the result is properly in range now [was always congruent to the inverse...]
+       -- Fixed up s_mp_exptmod and mp_exptmod_fast so the lower half of the pre-computed table isn't allocated
+          which makes the algorithm use half as much ram.
+       -- Fixed the install script not to make the book :-) [which isn't included anyways]
+       -- added mp_cnt_lsb() which counts how many of the lsbs are zero
+       -- optimized mp_gcd() to use the new mp_cnt_lsb() to replace multiple divisions by two by a single division.
+       -- applied similar optimization to mp_prime_miller_rabin().
+       -- Fixed a bug in both mp_invmod() and fast_mp_invmod() which tested for odd
+          via "mp_iseven() == 0" which is not valid [since zero is not even either].
+
+June 19th, 2003
+v0.21  -- Fixed bug in mp_mul_d which would not handle sign correctly [would not always forward it]
+       -- Removed the #line lines from gen.pl [was in violation of ISO C]
+
+June 8th, 2003
+v0.20  -- Removed the book from the package.  Added the TDCAL license document.
+       -- This release is officially pure-bred TDCAL again [last officially TDCAL based release was v0.16]
+
+June 6th, 2003
+v0.19  -- Fixed a bug in mp_montgomery_reduce() which was introduced when I tweaked mp_rshd() in the previous release.
+          Essentially the digits were not trimmed before the compare which cause a subtraction to occur all the time.
+       -- Fixed up etc/tune.c a bit to stop testing new cutoffs after 16 failures [to find more optimal points].
+          Brute force ho!
+
+
+May 29th, 2003
+v0.18  -- Fixed a bug in s_mp_sqr which would handle carries properly just not very elegantly.
+          (e.g. correct result, just bad looking code)
+       -- Fixed bug in mp_sqr which still had a 512 constant instead of MP_WARRAY
+       -- Added Toom-Cook multipliers [needs tuning!]
+       -- Added efficient divide by 3 algorithm mp_div_3
+       -- Re-wrote mp_div_d to be faster than calling mp_div
+       -- Added in a donated BCC makefile and a single page LTM poster (ahalhabsi@sbcglobal.net)
+       -- Added mp_reduce_2k which reduces an input modulo n = 2**p - k for any single digit k
+       -- Made the exptmod system be aware of the 2k reduction algorithms.
+       -- Rewrote mp_dr_reduce to be smaller, simpler and easier to understand.
+
+May 17th, 2003
+v0.17  -- Benjamin Goldberg submitted optimized mp_add and mp_sub routines.  A new gen.pl as well
+          as several smaller suggestions.  Thanks!
+       -- removed call to mp_cmp in inner loop of mp_div and put mp_cmp_mag in its place :-)
+       -- Fixed bug in mp_exptmod that would cause it to fail for odd moduli when DIGIT_BIT != 28
+       -- mp_exptmod now also returns errors if the modulus is negative and will handle negative exponents
+       -- mp_prime_is_prime will now return true if the input is one of the primes in the prime table
+       -- Damian M Gryski (dgryski@uwaterloo.ca) found a index out of bounds error in the
+          mp_fast_s_mp_mul_high_digs function which didn't come up before.  (fixed)
+       -- Refactored the DR reduction code so there is only one function per file.
+       -- Fixed bug in the mp_mul() which would erroneously avoid the faster multiplier [comba] when it was
+          allowed.  The bug would not cause the incorrect value to be produced just less efficient (fixed)
+       -- Fixed similar bug in the Montgomery reduction code.
+       -- Added tons of (mp_digit) casts so the 7/15/28/31 bit digit code will work flawlessly out of the box.
+          Also added limited support for 64-bit machines with a 60-bit digit.  Both thanks to Tom Wu (tom@arcot.com)
+       -- Added new comments here and there, cleaned up some code [style stuff]
+       -- Fixed a lingering typo in mp_exptmod* that would set bitcnt to zero then one.  Very silly stuff :-)
+       -- Fixed up mp_exptmod_fast so it would set "redux" to the comba Montgomery reduction if allowed.  This
+          saves quite a few calls and if statements.
+       -- Added etc/mont.c a test of the Montgomery reduction [assuming all else works :-| ]
+       -- Fixed up etc/tune.c to use a wider test range [more appropriate] also added a x86 based addition which
+          uses RDTSC for high precision timing.
+       -- Updated demo/demo.c to remove MPI stuff [won't work anyways], made the tests run for 2 seconds each so its
+          not so insanely slow.  Also made the output space delimited [and fixed up various errors]
+       -- Added logs directory, logs/graph.dem which will use gnuplot to make a series of PNG files
+          that go with the pre-made index.html.  You have to build [via make timing] and run ltmtest first in the
+          root of the package.
+       -- Fixed a bug in mp_sub and mp_add where "-a - -a" or "-a + a" would produce -0 as the result [obviously invalid].
+       -- Fixed a bug in mp_rshd.  If the count == a.used it should zero/return [instead of shifting]
+       -- Fixed a "off-by-one" bug in mp_mul2d.  The initial size check on alloc would be off by one if the residue
+          shifting caused a carry.
+       -- Fixed a bug where s_mp_mul_digs() would not call the Comba based routine if allowed.  This made Barrett reduction
+          slower than it had to be.
+
+Mar 29th, 2003
+v0.16  -- Sped up mp_div by making normalization one shift call
+       -- Sped up mp_mul_2d/mp_div_2d by aliasing pointers :-)
+       -- Cleaned up mp_gcd to use the macros for odd/even detection
+       -- Added comments here and there, mostly there but occasionally here too.
+
+Mar 22nd, 2003
+v0.15  -- Added series of prime testing routines to lib
+       -- Fixed up etc/tune.c
+       -- Added DR reduction algorithm
+       -- Beefed up the manual more.
+       -- Fixed up demo/demo.c so it doesn't have so many warnings and it does the full series of
+          tests
+       -- Added "pre-gen" directory which will hold a "gen.pl"'ed copy of the entire lib [done at
+          zipup time so its always the latest]
+       -- Added conditional casts for C++ users [boo!]
+
+Mar 15th, 2003
+v0.14  -- Tons of manual updates
+       -- cleaned up the directory
+       -- added MSVC makefiles
+       -- source changes [that I don't recall]
+       -- Fixed up the lshd/rshd code to use pointer aliasing
+       -- Fixed up the mul_2d and div_2d to not call rshd/lshd unless needed
+       -- Fixed up etc/tune.c a tad
+       -- fixed up demo/demo.c to output comma-delimited results of timing
+          also fixed up timing demo to use a finer granularity for various functions
+       -- fixed up demo/demo.c testing to pause during testing so my Duron won't catch on fire
+          [stays around 31-35C during testing :-)]
+
+Feb 13th, 2003
+v0.13  -- tons of minor speed-ups in low level add, sub, mul_2 and div_2 which propagate
+          to other functions like mp_invmod, mp_div, etc...
+       -- Sped up mp_exptmod_fast by using new code to find R mod m [e.g. B^n mod m]
+       -- minor fixes
+
+Jan 17th, 2003
+v0.12  -- re-wrote the majority of the makefile so its more portable and will
+          install via "make install" on most *nix platforms
+       -- Re-packaged all the source as seperate files.  Means the library a single
+          file packagage any more.  Instead of just adding "bn.c" you have to add
+          libtommath.a
+       -- Renamed "bn.h" to "tommath.h"
+       -- Changes to the manual to reflect all of this
+       -- Used GNU Indent to clean up the source
+
+Jan 15th, 2003
+v0.11  -- More subtle fixes
+       -- Moved to gentoo linux [hurrah!] so made *nix specific fixes to the make process
+       -- Sped up the montgomery reduction code quite a bit
+       -- fixed up demo so when building timing for the x86 it assumes ELF format now
+
+Jan 9th, 2003
+v0.10  -- Pekka Riikonen suggested fixes to the radix conversion code.
+       -- Added baseline montgomery and comba montgomery reductions, sped up exptmods
+          [to a point, see bn.h for MONTGOMERY_EXPT_CUTOFF]
+
+Jan 6th, 2003
+v0.09  -- Updated the manual to reflect recent changes.  :-)
+       -- Added Jacobi function (mp_jacobi) to supplement the number theory side of the lib
+       -- Added a Mersenne prime finder demo in ./etc/mersenne.c
+
+Jan 2nd, 2003
+v0.08  -- Sped up the multipliers by moving the inner loop variables into a smaller scope
+       -- Corrected a bunch of small "warnings"
+       -- Added more comments
+       -- Made "mtest" be able to use /dev/random, /dev/urandom or stdin for RNG data
+       -- Corrected some bugs where error messages were potentially ignored
+       -- add etc/pprime.c program which makes numbers which are provably prime.
+
+Jan 1st, 2003
+v0.07  -- Removed alot of heap operations from core functions to speed them up
+       -- Added a root finding function [and mp_sqrt macro like from MPI]
+       -- Added more to manual
+
+Dec 31st, 2002
+v0.06  -- Sped up the s_mp_add, s_mp_sub which inturn sped up mp_invmod, mp_exptmod, etc...
+       -- Cleaned up the header a bit more
+
+Dec 30th, 2002
+v0.05  -- Builds with MSVC out of the box
+       -- Fixed a bug in mp_invmod w.r.t. even moduli
+       -- Made mp_toradix and mp_read_radix use char instead of unsigned char arrays
+       -- Fixed up exptmod to use fewer multiplications
+       -- Fixed up mp_init_size to use only one heap operation
+          -- Note there is a slight "off-by-one" bug in the library somewhere
+             without the padding (see the source for comment) the library
+             crashes in libtomcrypt.  Anyways a reasonable workaround is to pad the
+             numbers which will always correct it since as the numbers grow the padding
+             will still be beyond the end of the number
+       -- Added more to the manual
+
+Dec 29th, 2002
+v0.04  -- Fixed a memory leak in mp_to_unsigned_bin
+       -- optimized invmod code
+       -- Fixed bug in mp_div
+       -- use exchange instead of copy for results
+       -- added a bit more to the manual
+
+Dec 27th, 2002
+v0.03  -- Sped up s_mp_mul_high_digs by not computing the carries of the lower digits
+       -- Fixed a bug where mp_set_int wouldn't zero the value first and set the used member.
+       -- fixed a bug in s_mp_mul_high_digs where the limit placed on the result digits was not calculated properly
+       -- fixed bugs in add/sub/mul/sqr_mod functions where if the modulus and dest were the same it wouldn't work
+       -- fixed a bug in mp_mod and mp_mod_d concerning negative inputs
+       -- mp_mul_d didn't preserve sign
+       -- Many many many many fixes
+       -- Works in LibTomCrypt now :-)
+       -- Added iterations to the timing demos... more accurate.
+       -- Tom needs a job.
+
+Dec 26th, 2002
+v0.02  -- Fixed a few "slips" in the manual.  This is "LibTomMath" afterall :-)
+       -- Added mp_cmp_mag, mp_neg, mp_abs and mp_radix_size that were missing.
+       -- Sped up the fast [comba] multipliers more [yahoo!]
+
+Dec 25th,2002
+v0.01  -- Initial release.  Gimme a break.
+       -- Todo list,
+           add details to manual [e.g. algorithms]
+           more comments in code
+           example programs
diff --git a/libtommath/demo/demo.c b/libtommath/demo/demo.c
new file mode 100644
index 0000000..0a6115a
--- /dev/null
+++ b/libtommath/demo/demo.c
@@ -0,0 +1,736 @@
+#include <time.h>
+
+#ifdef IOWNANATHLON
+#include <unistd.h>
+#define SLEEP sleep(4)
+#else
+#define SLEEP
+#endif
+
+#include "tommath.h"
+
+void ndraw(mp_int * a, char *name)
+{
+   char buf[16000];
+
+   printf("%s: ", name);
+   mp_toradix(a, buf, 10);
+   printf("%s\n", buf);
+}
+
+static void draw(mp_int * a)
+{
+   ndraw(a, "");
+}
+
+
+unsigned long lfsr = 0xAAAAAAAAUL;
+
+int lbit(void)
+{
+   if (lfsr & 0x80000000UL) {
+      lfsr = ((lfsr << 1) ^ 0x8000001BUL) & 0xFFFFFFFFUL;
+      return 1;
+   } else {
+      lfsr <<= 1;
+      return 0;
+   }
+}
+
+int myrng(unsigned char *dst, int len, void *dat)
+{
+   int x;
+
+   for (x = 0; x < len; x++)
+      dst[x] = rand() & 0xFF;
+   return len;
+}
+
+
+
+char cmd[4096], buf[4096];
+int main(void)
+{
+   mp_int a, b, c, d, e, f;
+   unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n,
+      gcd_n, lcm_n, inv_n, div2_n, mul2_n, add_d_n, sub_d_n, t;
+   unsigned rr;
+   int i, n, err, cnt, ix, old_kara_m, old_kara_s;
+   mp_digit mp;
+
+
+   mp_init(&a);
+   mp_init(&b);
+   mp_init(&c);
+   mp_init(&d);
+   mp_init(&e);
+   mp_init(&f);
+
+   srand(time(NULL));
+
+#if 0
+   // test montgomery 
+   printf("Testing montgomery...\n");
+   for (i = 1; i < 10; i++) {
+      printf("Testing digit size: %d\n", i);
+      for (n = 0; n < 1000; n++) {
+         mp_rand(&a, i);
+         a.dp[0] |= 1;
+
+         // let's see if R is right
+         mp_montgomery_calc_normalization(&b, &a);
+         mp_montgomery_setup(&a, &mp);
+
+         // now test a random reduction 
+         for (ix = 0; ix < 100; ix++) {
+             mp_rand(&c, 1 + abs(rand()) % (2*i));
+             mp_copy(&c, &d);
+             mp_copy(&c, &e);
+
+             mp_mod(&d, &a, &d);
+             mp_montgomery_reduce(&c, &a, mp);
+             mp_mulmod(&c, &b, &a, &c);
+
+             if (mp_cmp(&c, &d) != MP_EQ) { 
+printf("d = e mod a, c = e MOD a\n");
+mp_todecimal(&a, buf); printf("a = %s\n", buf);
+mp_todecimal(&e, buf); printf("e = %s\n", buf);
+mp_todecimal(&d, buf); printf("d = %s\n", buf);
+mp_todecimal(&c, buf); printf("c = %s\n", buf);
+printf("compare no compare!\n"); exit(EXIT_FAILURE); }
+         }
+      }
+   }
+   printf("done\n");
+
+   // test mp_get_int
+   printf("Testing: mp_get_int\n");
+   for (i = 0; i < 1000; ++i) {
+      t = ((unsigned long) rand() * rand() + 1) & 0xFFFFFFFF;
+      mp_set_int(&a, t);
+      if (t != mp_get_int(&a)) {
+	 printf("mp_get_int() bad result!\n");
+	 return 1;
+      }
+   }
+   mp_set_int(&a, 0);
+   if (mp_get_int(&a) != 0) {
+      printf("mp_get_int() bad result!\n");
+      return 1;
+   }
+   mp_set_int(&a, 0xffffffff);
+   if (mp_get_int(&a) != 0xffffffff) {
+      printf("mp_get_int() bad result!\n");
+      return 1;
+   }
+   // test mp_sqrt
+   printf("Testing: mp_sqrt\n");
+   for (i = 0; i < 1000; ++i) {
+      printf("%6d\r", i);
+      fflush(stdout);
+      n = (rand() & 15) + 1;
+      mp_rand(&a, n);
+      if (mp_sqrt(&a, &b) != MP_OKAY) {
+	 printf("mp_sqrt() error!\n");
+	 return 1;
+      }
+      mp_n_root(&a, 2, &a);
+      if (mp_cmp_mag(&b, &a) != MP_EQ) {
+	 printf("mp_sqrt() bad result!\n");
+	 return 1;
+      }
+   }
+
+   printf("\nTesting: mp_is_square\n");
+   for (i = 0; i < 1000; ++i) {
+      printf("%6d\r", i);
+      fflush(stdout);
+
+      /* test mp_is_square false negatives */
+      n = (rand() & 7) + 1;
+      mp_rand(&a, n);
+      mp_sqr(&a, &a);
+      if (mp_is_square(&a, &n) != MP_OKAY) {
+	 printf("fn:mp_is_square() error!\n");
+	 return 1;
+      }
+      if (n == 0) {
+	 printf("fn:mp_is_square() bad result!\n");
+	 return 1;
+      }
+
+      /* test for false positives */
+      mp_add_d(&a, 1, &a);
+      if (mp_is_square(&a, &n) != MP_OKAY) {
+	 printf("fp:mp_is_square() error!\n");
+	 return 1;
+      }
+      if (n == 1) {
+	 printf("fp:mp_is_square() bad result!\n");
+	 return 1;
+      }
+
+   }
+   printf("\n\n");
+
+   /* test for size */
+   for (ix = 10; ix < 128; ix++) {
+      printf("Testing (not safe-prime): %9d bits    \r", ix);
+      fflush(stdout);
+      err =
+	 mp_prime_random_ex(&a, 8, ix,
+			    (rand() & 1) ? LTM_PRIME_2MSB_OFF :
+			    LTM_PRIME_2MSB_ON, myrng, NULL);
+      if (err != MP_OKAY) {
+	 printf("failed with err code %d\n", err);
+	 return EXIT_FAILURE;
+      }
+      if (mp_count_bits(&a) != ix) {
+	 printf("Prime is %d not %d bits!!!\n", mp_count_bits(&a), ix);
+	 return EXIT_FAILURE;
+      }
+   }
+
+   for (ix = 16; ix < 128; ix++) {
+      printf("Testing (   safe-prime): %9d bits    \r", ix);
+      fflush(stdout);
+      err =
+	 mp_prime_random_ex(&a, 8, ix,
+			    ((rand() & 1) ? LTM_PRIME_2MSB_OFF :
+			     LTM_PRIME_2MSB_ON) | LTM_PRIME_SAFE, myrng,
+			    NULL);
+      if (err != MP_OKAY) {
+	 printf("failed with err code %d\n", err);
+	 return EXIT_FAILURE;
+      }
+      if (mp_count_bits(&a) != ix) {
+	 printf("Prime is %d not %d bits!!!\n", mp_count_bits(&a), ix);
+	 return EXIT_FAILURE;
+      }
+      /* let's see if it's really a safe prime */
+      mp_sub_d(&a, 1, &a);
+      mp_div_2(&a, &a);
+      mp_prime_is_prime(&a, 8, &cnt);
+      if (cnt != MP_YES) {
+	 printf("sub is not prime!\n");
+	 return EXIT_FAILURE;
+      }
+   }
+
+   printf("\n\n");
+
+   mp_read_radix(&a, "123456", 10);
+   mp_toradix_n(&a, buf, 10, 3);
+   printf("a == %s\n", buf);
+   mp_toradix_n(&a, buf, 10, 4);
+   printf("a == %s\n", buf);
+   mp_toradix_n(&a, buf, 10, 30);
+   printf("a == %s\n", buf);
+
+
+#if 0
+   for (;;) {
+      fgets(buf, sizeof(buf), stdin);
+      mp_read_radix(&a, buf, 10);
+      mp_prime_next_prime(&a, 5, 1);
+      mp_toradix(&a, buf, 10);
+      printf("%s, %lu\n", buf, a.dp[0] & 3);
+   }
+#endif
+
+   /* test mp_cnt_lsb */
+   printf("testing mp_cnt_lsb...\n");
+   mp_set(&a, 1);
+   for (ix = 0; ix < 1024; ix++) {
+      if (mp_cnt_lsb(&a) != ix) {
+	 printf("Failed at %d, %d\n", ix, mp_cnt_lsb(&a));
+	 return 0;
+      }
+      mp_mul_2(&a, &a);
+   }
+
+/* test mp_reduce_2k */
+   printf("Testing mp_reduce_2k...\n");
+   for (cnt = 3; cnt <= 128; ++cnt) {
+      mp_digit tmp;
+
+      mp_2expt(&a, cnt);
+      mp_sub_d(&a, 2, &a);	/* a = 2**cnt - 2 */
+
+
+      printf("\nTesting %4d bits", cnt);
+      printf("(%d)", mp_reduce_is_2k(&a));
+      mp_reduce_2k_setup(&a, &tmp);
+      printf("(%d)", tmp);
+      for (ix = 0; ix < 1000; ix++) {
+	 if (!(ix & 127)) {
+	    printf(".");
+	    fflush(stdout);
+	 }
+	 mp_rand(&b, (cnt / DIGIT_BIT + 1) * 2);
+	 mp_copy(&c, &b);
+	 mp_mod(&c, &a, &c);
+	 mp_reduce_2k(&b, &a, 2);
+	 if (mp_cmp(&c, &b)) {
+	    printf("FAILED\n");
+	    exit(0);
+	 }
+      }
+   }
+
+/* test mp_div_3  */
+   printf("Testing mp_div_3...\n");
+   mp_set(&d, 3);
+   for (cnt = 0; cnt < 10000;) {
+      mp_digit r1, r2;
+
+      if (!(++cnt & 127))
+	 printf("%9d\r", cnt);
+      mp_rand(&a, abs(rand()) % 128 + 1);
+      mp_div(&a, &d, &b, &e);
+      mp_div_3(&a, &c, &r2);
+
+      if (mp_cmp(&b, &c) || mp_cmp_d(&e, r2)) {
+	 printf("\n\nmp_div_3 => Failure\n");
+      }
+   }
+   printf("\n\nPassed div_3 testing\n");
+
+/* test the DR reduction */
+   printf("testing mp_dr_reduce...\n");
+   for (cnt = 2; cnt < 32; cnt++) {
+      printf("%d digit modulus\n", cnt);
+      mp_grow(&a, cnt);
+      mp_zero(&a);
+      for (ix = 1; ix < cnt; ix++) {
+	 a.dp[ix] = MP_MASK;
+      }
+      a.used = cnt;
+      a.dp[0] = 3;
+
+      mp_rand(&b, cnt - 1);
+      mp_copy(&b, &c);
+
+      rr = 0;
+      do {
+	 if (!(rr & 127)) {
+	    printf("%9lu\r", rr);
+	    fflush(stdout);
+	 }
+	 mp_sqr(&b, &b);
+	 mp_add_d(&b, 1, &b);
+	 mp_copy(&b, &c);
+
+	 mp_mod(&b, &a, &b);
+	 mp_dr_reduce(&c, &a, (((mp_digit) 1) << DIGIT_BIT) - a.dp[0]);
+
+	 if (mp_cmp(&b, &c) != MP_EQ) {
+	    printf("Failed on trial %lu\n", rr);
+	    exit(-1);
+
+	 }
+      } while (++rr < 500);
+      printf("Passed DR test for %d digits\n", cnt);
+   }
+
+#endif
+
+/* test the mp_reduce_2k_l code */
+#if 0
+#if 0
+/* first load P with 2^1024 - 0x2A434 B9FDEC95 D8F9D550 FFFFFFFF FFFFFFFF */
+   mp_2expt(&a, 1024);
+   mp_read_radix(&b, "2A434B9FDEC95D8F9D550FFFFFFFFFFFFFFFF", 16);
+   mp_sub(&a, &b, &a);
+#elif 1
+/*  p = 2^2048 - 0x1 00000000 00000000 00000000 00000000 4945DDBF 8EA2A91D 5776399B B83E188F  */
+   mp_2expt(&a, 2048);
+   mp_read_radix(&b,
+		 "1000000000000000000000000000000004945DDBF8EA2A91D5776399BB83E188F",
+		 16);
+   mp_sub(&a, &b, &a);
+#endif
+
+   mp_todecimal(&a, buf);
+   printf("p==%s\n", buf);
+/* now mp_reduce_is_2k_l() should return */
+   if (mp_reduce_is_2k_l(&a) != 1) {
+      printf("mp_reduce_is_2k_l() return 0, should be 1\n");
+      return EXIT_FAILURE;
+   }
+   mp_reduce_2k_setup_l(&a, &d);
+   /* now do a million square+1 to see if it varies */
+   mp_rand(&b, 64);
+   mp_mod(&b, &a, &b);
+   mp_copy(&b, &c);
+   printf("testing mp_reduce_2k_l...");
+   fflush(stdout);
+   for (cnt = 0; cnt < (1UL << 20); cnt++) {
+      mp_sqr(&b, &b);
+      mp_add_d(&b, 1, &b);
+      mp_reduce_2k_l(&b, &a, &d);
+      mp_sqr(&c, &c);
+      mp_add_d(&c, 1, &c);
+      mp_mod(&c, &a, &c);
+      if (mp_cmp(&b, &c) != MP_EQ) {
+	 printf("mp_reduce_2k_l() failed at step %lu\n", cnt);
+	 mp_tohex(&b, buf);
+	 printf("b == %s\n", buf);
+	 mp_tohex(&c, buf);
+	 printf("c == %s\n", buf);
+	 return EXIT_FAILURE;
+      }
+   }
+   printf("...Passed\n");
+#endif
+
+   div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
+      sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n =
+      sub_d_n = 0;
+
+   /* force KARA and TOOM to enable despite cutoffs */
+   KARATSUBA_SQR_CUTOFF = KARATSUBA_MUL_CUTOFF = 110;
+   TOOM_SQR_CUTOFF = TOOM_MUL_CUTOFF = 150;
+
+   for (;;) {
+      /* randomly clear and re-init one variable, this has the affect of triming the alloc space */
+      switch (abs(rand()) % 7) {
+      case 0:
+	 mp_clear(&a);
+	 mp_init(&a);
+	 break;
+      case 1:
+	 mp_clear(&b);
+	 mp_init(&b);
+	 break;
+      case 2:
+	 mp_clear(&c);
+	 mp_init(&c);
+	 break;
+      case 3:
+	 mp_clear(&d);
+	 mp_init(&d);
+	 break;
+      case 4:
+	 mp_clear(&e);
+	 mp_init(&e);
+	 break;
+      case 5:
+	 mp_clear(&f);
+	 mp_init(&f);
+	 break;
+      case 6:
+	 break;			/* don't clear any */
+      }
+
+
+      printf
+	 ("%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu/%4lu ",
+	  add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n,
+	  expt_n, inv_n, div2_n, mul2_n, add_d_n, sub_d_n);
+      fgets(cmd, 4095, stdin);
+      cmd[strlen(cmd) - 1] = 0;
+      printf("%s  ]\r", cmd);
+      fflush(stdout);
+      if (!strcmp(cmd, "mul2d")) {
+	 ++mul2d_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 sscanf(buf, "%d", &rr);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+
+	 mp_mul_2d(&a, rr, &a);
+	 a.sign = b.sign;
+	 if (mp_cmp(&a, &b) != MP_EQ) {
+	    printf("mul2d failed, rr == %d\n", rr);
+	    draw(&a);
+	    draw(&b);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "div2d")) {
+	 ++div2d_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 sscanf(buf, "%d", &rr);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+
+	 mp_div_2d(&a, rr, &a, &e);
+	 a.sign = b.sign;
+	 if (a.used == b.used && a.used == 0) {
+	    a.sign = b.sign = MP_ZPOS;
+	 }
+	 if (mp_cmp(&a, &b) != MP_EQ) {
+	    printf("div2d failed, rr == %d\n", rr);
+	    draw(&a);
+	    draw(&b);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "add")) {
+	 ++add_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 mp_copy(&a, &d);
+	 mp_add(&d, &b, &d);
+	 if (mp_cmp(&c, &d) != MP_EQ) {
+	    printf("add %lu failure!\n", add_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    return 0;
+	 }
+
+	 /* test the sign/unsigned storage functions */
+
+	 rr = mp_signed_bin_size(&c);
+	 mp_to_signed_bin(&c, (unsigned char *) cmd);
+	 memset(cmd + rr, rand() & 255, sizeof(cmd) - rr);
+	 mp_read_signed_bin(&d, (unsigned char *) cmd, rr);
+	 if (mp_cmp(&c, &d) != MP_EQ) {
+	    printf("mp_signed_bin failure!\n");
+	    draw(&c);
+	    draw(&d);
+	    return 0;
+	 }
+
+
+	 rr = mp_unsigned_bin_size(&c);
+	 mp_to_unsigned_bin(&c, (unsigned char *) cmd);
+	 memset(cmd + rr, rand() & 255, sizeof(cmd) - rr);
+	 mp_read_unsigned_bin(&d, (unsigned char *) cmd, rr);
+	 if (mp_cmp_mag(&c, &d) != MP_EQ) {
+	    printf("mp_unsigned_bin failure!\n");
+	    draw(&c);
+	    draw(&d);
+	    return 0;
+	 }
+
+      } else if (!strcmp(cmd, "sub")) {
+	 ++sub_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 mp_copy(&a, &d);
+	 mp_sub(&d, &b, &d);
+	 if (mp_cmp(&c, &d) != MP_EQ) {
+	    printf("sub %lu failure!\n", sub_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "mul")) {
+	 ++mul_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 mp_copy(&a, &d);
+	 mp_mul(&d, &b, &d);
+	 if (mp_cmp(&c, &d) != MP_EQ) {
+	    printf("mul %lu failure!\n", mul_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "div")) {
+	 ++div_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&d, buf, 64);
+
+	 mp_div(&a, &b, &e, &f);
+	 if (mp_cmp(&c, &e) != MP_EQ || mp_cmp(&d, &f) != MP_EQ) {
+	    printf("div %lu %d, %d, failure!\n", div_n, mp_cmp(&c, &e),
+		   mp_cmp(&d, &f));
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    draw(&e);
+	    draw(&f);
+	    return 0;
+	 }
+
+      } else if (!strcmp(cmd, "sqr")) {
+	 ++sqr_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 mp_copy(&a, &c);
+	 mp_sqr(&c, &c);
+	 if (mp_cmp(&b, &c) != MP_EQ) {
+	    printf("sqr %lu failure!\n", sqr_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "gcd")) {
+	 ++gcd_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 mp_copy(&a, &d);
+	 mp_gcd(&d, &b, &d);
+	 d.sign = c.sign;
+	 if (mp_cmp(&c, &d) != MP_EQ) {
+	    printf("gcd %lu failure!\n", gcd_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "lcm")) {
+	 ++lcm_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 mp_copy(&a, &d);
+	 mp_lcm(&d, &b, &d);
+	 d.sign = c.sign;
+	 if (mp_cmp(&c, &d) != MP_EQ) {
+	    printf("lcm %lu failure!\n", lcm_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "expt")) {
+	 ++expt_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&d, buf, 64);
+	 mp_copy(&a, &e);
+	 mp_exptmod(&e, &b, &c, &e);
+	 if (mp_cmp(&d, &e) != MP_EQ) {
+	    printf("expt %lu failure!\n", expt_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    draw(&e);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "invmod")) {
+	 ++inv_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&c, buf, 64);
+	 mp_invmod(&a, &b, &d);
+	 mp_mulmod(&d, &a, &b, &e);
+	 if (mp_cmp_d(&e, 1) != MP_EQ) {
+	    printf("inv [wrong value from MPI?!] failure\n");
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    draw(&d);
+	    mp_gcd(&a, &b, &e);
+	    draw(&e);
+	    return 0;
+	 }
+
+      } else if (!strcmp(cmd, "div2")) {
+	 ++div2_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 mp_div_2(&a, &c);
+	 if (mp_cmp(&c, &b) != MP_EQ) {
+	    printf("div_2 %lu failure\n", div2_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "mul2")) {
+	 ++mul2_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 mp_mul_2(&a, &c);
+	 if (mp_cmp(&c, &b) != MP_EQ) {
+	    printf("mul_2 %lu failure\n", mul2_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "add_d")) {
+	 ++add_d_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 sscanf(buf, "%d", &ix);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 mp_add_d(&a, ix, &c);
+	 if (mp_cmp(&b, &c) != MP_EQ) {
+	    printf("add_d %lu failure\n", add_d_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    printf("d == %d\n", ix);
+	    return 0;
+	 }
+      } else if (!strcmp(cmd, "sub_d")) {
+	 ++sub_d_n;
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&a, buf, 64);
+	 fgets(buf, 4095, stdin);
+	 sscanf(buf, "%d", &ix);
+	 fgets(buf, 4095, stdin);
+	 mp_read_radix(&b, buf, 64);
+	 mp_sub_d(&a, ix, &c);
+	 if (mp_cmp(&b, &c) != MP_EQ) {
+	    printf("sub_d %lu failure\n", sub_d_n);
+	    draw(&a);
+	    draw(&b);
+	    draw(&c);
+	    printf("d == %d\n", ix);
+	    return 0;
+	 }
+      }
+   }
+   return 0;
+}
diff --git a/libtommath/demo/timing.c b/libtommath/demo/timing.c
new file mode 100644
index 0000000..bb3be52
--- /dev/null
+++ b/libtommath/demo/timing.c
@@ -0,0 +1,315 @@
+#include <tommath.h>
+#include <time.h>
+
+ulong64 _tt;
+
+#ifdef IOWNANATHLON
+#include <unistd.h>
+#define SLEEP sleep(4)
+#else
+#define SLEEP
+#endif
+
+
+void ndraw(mp_int * a, char *name)
+{
+   char buf[4096];
+
+   printf("%s: ", name);
+   mp_toradix(a, buf, 64);
+   printf("%s\n", buf);
+}
+
+static void draw(mp_int * a)
+{
+   ndraw(a, "");
+}
+
+
+unsigned long lfsr = 0xAAAAAAAAUL;
+
+int lbit(void)
+{
+   if (lfsr & 0x80000000UL) {
+      lfsr = ((lfsr << 1) ^ 0x8000001BUL) & 0xFFFFFFFFUL;
+      return 1;
+   } else {
+      lfsr <<= 1;
+      return 0;
+   }
+}
+
+/* RDTSC from Scott Duplichan */
+static ulong64 TIMFUNC(void)
+{
+#if defined __GNUC__
+#if defined(__i386__) || defined(__x86_64__)
+   unsigned long long a;
+   __asm__ __volatile__("rdtsc\nmovl %%eax,%0\nmovl %%edx,4+%0\n"::
+			"m"(a):"%eax", "%edx");
+   return a;
+#else /* gcc-IA64 version */
+   unsigned long result;
+   __asm__ __volatile__("mov %0=ar.itc":"=r"(result)::"memory");
+
+   while (__builtin_expect((int) result == -1, 0))
+      __asm__ __volatile__("mov %0=ar.itc":"=r"(result)::"memory");
+
+   return result;
+#endif
+
+   // Microsoft and Intel Windows compilers
+#elif defined _M_IX86
+   __asm rdtsc
+#elif defined _M_AMD64
+   return __rdtsc();
+#elif defined _M_IA64
+#if defined __INTEL_COMPILER
+#include <ia64intrin.h>
+#endif
+   return __getReg(3116);
+#else
+#error need rdtsc function for this build
+#endif
+}
+
+#define DO(x) x; x;
+//#define DO4(x) DO2(x); DO2(x);
+//#define DO8(x) DO4(x); DO4(x);
+//#define DO(x)  DO8(x); DO8(x);
+
+int main(void)
+{
+   ulong64 tt, gg, CLK_PER_SEC;
+   FILE *log, *logb, *logc, *logd;
+   mp_int a, b, c, d, e, f;
+   int n, cnt, ix, old_kara_m, old_kara_s;
+   unsigned rr;
+
+   mp_init(&a);
+   mp_init(&b);
+   mp_init(&c);
+   mp_init(&d);
+   mp_init(&e);
+   mp_init(&f);
+
+   srand(time(NULL));
+
+
+   /* temp. turn off TOOM */
+   TOOM_MUL_CUTOFF = TOOM_SQR_CUTOFF = 100000;
+
+   CLK_PER_SEC = TIMFUNC();
+   sleep(1);
+   CLK_PER_SEC = TIMFUNC() - CLK_PER_SEC;
+
+   printf("CLK_PER_SEC == %llu\n", CLK_PER_SEC);
+   goto exptmod;
+   log = fopen("logs/add.log", "w");
+   for (cnt = 8; cnt <= 128; cnt += 8) {
+      SLEEP;
+      mp_rand(&a, cnt);
+      mp_rand(&b, cnt);
+      rr = 0;
+      tt = -1;
+      do {
+	 gg = TIMFUNC();
+	 DO(mp_add(&a, &b, &c));
+	 gg = (TIMFUNC() - gg) >> 1;
+	 if (tt > gg)
+	    tt = gg;
+      } while (++rr < 100000);
+      printf("Adding\t\t%4d-bit => %9llu/sec, %9llu cycles\n",
+	     mp_count_bits(&a), CLK_PER_SEC / tt, tt);
+      fprintf(log, "%d %9llu\n", cnt * DIGIT_BIT, tt);
+      fflush(log);
+   }
+   fclose(log);
+
+   log = fopen("logs/sub.log", "w");
+   for (cnt = 8; cnt <= 128; cnt += 8) {
+      SLEEP;
+      mp_rand(&a, cnt);
+      mp_rand(&b, cnt);
+      rr = 0;
+      tt = -1;
+      do {
+	 gg = TIMFUNC();
+	 DO(mp_sub(&a, &b, &c));
+	 gg = (TIMFUNC() - gg) >> 1;
+	 if (tt > gg)
+	    tt = gg;
+      } while (++rr < 100000);
+
+      printf("Subtracting\t\t%4d-bit => %9llu/sec, %9llu cycles\n",
+	     mp_count_bits(&a), CLK_PER_SEC / tt, tt);
+      fprintf(log, "%d %9llu\n", cnt * DIGIT_BIT, tt);
+      fflush(log);
+   }
+   fclose(log);
+
+   /* do mult/square twice, first without karatsuba and second with */
+ multtest:
+   old_kara_m = KARATSUBA_MUL_CUTOFF;
+   old_kara_s = KARATSUBA_SQR_CUTOFF;
+   for (ix = 0; ix < 2; ix++) {
+      printf("With%s Karatsuba\n", (ix == 0) ? "out" : "");
+
+      KARATSUBA_MUL_CUTOFF = (ix == 0) ? 9999 : old_kara_m;
+      KARATSUBA_SQR_CUTOFF = (ix == 0) ? 9999 : old_kara_s;
+
+      log = fopen((ix == 0) ? "logs/mult.log" : "logs/mult_kara.log", "w");
+      for (cnt = 4; cnt <= 10240 / DIGIT_BIT; cnt += 2) {
+	 SLEEP;
+	 mp_rand(&a, cnt);
+	 mp_rand(&b, cnt);
+	 rr = 0;
+	 tt = -1;
+	 do {
+	    gg = TIMFUNC();
+	    DO(mp_mul(&a, &b, &c));
+	    gg = (TIMFUNC() - gg) >> 1;
+	    if (tt > gg)
+	       tt = gg;
+	 } while (++rr < 100);
+	 printf("Multiplying\t%4d-bit => %9llu/sec, %9llu cycles\n",
+		mp_count_bits(&a), CLK_PER_SEC / tt, tt);
+	 fprintf(log, "%d %9llu\n", mp_count_bits(&a), tt);
+	 fflush(log);
+      }
+      fclose(log);
+
+      log = fopen((ix == 0) ? "logs/sqr.log" : "logs/sqr_kara.log", "w");
+      for (cnt = 4; cnt <= 10240 / DIGIT_BIT; cnt += 2) {
+	 SLEEP;
+	 mp_rand(&a, cnt);
+	 rr = 0;
+	 tt = -1;
+	 do {
+	    gg = TIMFUNC();
+	    DO(mp_sqr(&a, &b));
+	    gg = (TIMFUNC() - gg) >> 1;
+	    if (tt > gg)
+	       tt = gg;
+	 } while (++rr < 100);
+	 printf("Squaring\t%4d-bit => %9llu/sec, %9llu cycles\n",
+		mp_count_bits(&a), CLK_PER_SEC / tt, tt);
+	 fprintf(log, "%d %9llu\n", mp_count_bits(&a), tt);
+	 fflush(log);
+      }
+      fclose(log);
+
+   }
+ exptmod:
+
+   {
+      char *primes[] = {
+	 /* 2K large moduli */
+	 "179769313486231590772930519078902473361797697894230657273430081157732675805500963132708477322407536021120113879871393357658789768814416622492847430639474124377767893424865485276302219601246094119453082952085005768838150682342462881473913110540827237163350510684586239334100047359817950870678242457666208137217",
+	 "32317006071311007300714876688669951960444102669715484032130345427524655138867890893197201411522913463688717960921898019494119559150490921095088152386448283120630877367300996091750197750389652106796057638384067568276792218642619756161838094338476170470581645852036305042887575891541065808607552399123930385521914333389668342420684974786564569494856176035326322058077805659331026192708460314150258592864177116725943603718461857357598351152301645904403697613233287231227125684710820209725157101726931323469678542580656697935045997268352998638099733077152121140120031150424541696791951097529546801429027668869927491725169",
+	 "1044388881413152506691752710716624382579964249047383780384233483283953907971557456848826811934997558340890106714439262837987573438185793607263236087851365277945956976543709998340361590134383718314428070011855946226376318839397712745672334684344586617496807908705803704071284048740118609114467977783598029006686938976881787785946905630190260940599579453432823469303026696443059025015972399867714215541693835559885291486318237914434496734087811872639496475100189041349008417061675093668333850551032972088269550769983616369411933015213796825837188091833656751221318492846368125550225998300412344784862595674492194617023806505913245610825731835380087608622102834270197698202313169017678006675195485079921636419370285375124784014907159135459982790513399611551794271106831134090584272884279791554849782954323534517065223269061394905987693002122963395687782878948440616007412945674919823050571642377154816321380631045902916136926708342856440730447899971901781465763473223850267253059899795996090799469201774624817718449867455659250178329070473119433165550807568221846571746373296884912819520317457002440926616910874148385078411929804522981857338977648103126085902995208257421855249796721729039744118165938433694823325696642096892124547425283",
+	 /* 2K moduli mersenne primes */
+	 "6864797660130609714981900799081393217269435300143305409394463459185543183397656052122559640661454554977296311391480858037121987999716643812574028291115057151",
+	 "531137992816767098689588206552468627329593117727031923199444138200403559860852242739162502265229285668889329486246501015346579337652707239409519978766587351943831270835393219031728127",
+	 "10407932194664399081925240327364085538615262247266704805319112350403608059673360298012239441732324184842421613954281007791383566248323464908139906605677320762924129509389220345773183349661583550472959420547689811211693677147548478866962501384438260291732348885311160828538416585028255604666224831890918801847068222203140521026698435488732958028878050869736186900714720710555703168729087",
+	 "1475979915214180235084898622737381736312066145333169775147771216478570297878078949377407337049389289382748507531496480477281264838760259191814463365330269540496961201113430156902396093989090226259326935025281409614983499388222831448598601834318536230923772641390209490231836446899608210795482963763094236630945410832793769905399982457186322944729636418890623372171723742105636440368218459649632948538696905872650486914434637457507280441823676813517852099348660847172579408422316678097670224011990280170474894487426924742108823536808485072502240519452587542875349976558572670229633962575212637477897785501552646522609988869914013540483809865681250419497686697771007",
+	 "259117086013202627776246767922441530941818887553125427303974923161874019266586362086201209516800483406550695241733194177441689509238807017410377709597512042313066624082916353517952311186154862265604547691127595848775610568757931191017711408826252153849035830401185072116424747461823031471398340229288074545677907941037288235820705892351068433882986888616658650280927692080339605869308790500409503709875902119018371991620994002568935113136548829739112656797303241986517250116412703509705427773477972349821676443446668383119322540099648994051790241624056519054483690809616061625743042361721863339415852426431208737266591962061753535748892894599629195183082621860853400937932839420261866586142503251450773096274235376822938649407127700846077124211823080804139298087057504713825264571448379371125032081826126566649084251699453951887789613650248405739378594599444335231188280123660406262468609212150349937584782292237144339628858485938215738821232393687046160677362909315071",
+	 "190797007524439073807468042969529173669356994749940177394741882673528979787005053706368049835514900244303495954950709725762186311224148828811920216904542206960744666169364221195289538436845390250168663932838805192055137154390912666527533007309292687539092257043362517857366624699975402375462954490293259233303137330643531556539739921926201438606439020075174723029056838272505051571967594608350063404495977660656269020823960825567012344189908927956646011998057988548630107637380993519826582389781888135705408653045219655801758081251164080554609057468028203308718724654081055323215860189611391296030471108443146745671967766308925858547271507311563765171008318248647110097614890313562856541784154881743146033909602737947385055355960331855614540900081456378659068370317267696980001187750995491090350108417050917991562167972281070161305972518044872048331306383715094854938415738549894606070722584737978176686422134354526989443028353644037187375385397838259511833166416134323695660367676897722287918773420968982326089026150031515424165462111337527431154890666327374921446276833564519776797633875503548665093914556482031482248883127023777039667707976559857333357013727342079099064400455741830654320379350833236245819348824064783585692924881021978332974949906122664421376034687815350484991",
+
+	 /* DR moduli */
+	 "14059105607947488696282932836518693308967803494693489478439861164411992439598399594747002144074658928593502845729752797260025831423419686528151609940203368612079",
+	 "101745825697019260773923519755878567461315282017759829107608914364075275235254395622580447400994175578963163918967182013639660669771108475957692810857098847138903161308502419410142185759152435680068435915159402496058513611411688900243039",
+	 "736335108039604595805923406147184530889923370574768772191969612422073040099331944991573923112581267542507986451953227192970402893063850485730703075899286013451337291468249027691733891486704001513279827771740183629161065194874727962517148100775228363421083691764065477590823919364012917984605619526140821797602431",
+	 "38564998830736521417281865696453025806593491967131023221754800625044118265468851210705360385717536794615180260494208076605798671660719333199513807806252394423283413430106003596332513246682903994829528690198205120921557533726473585751382193953592127439965050261476810842071573684505878854588706623484573925925903505747545471088867712185004135201289273405614415899438276535626346098904241020877974002916168099951885406379295536200413493190419727789712076165162175783",
+	 "542189391331696172661670440619180536749994166415993334151601745392193484590296600979602378676624808129613777993466242203025054573692562689251250471628358318743978285860720148446448885701001277560572526947619392551574490839286458454994488665744991822837769918095117129546414124448777033941223565831420390846864429504774477949153794689948747680362212954278693335653935890352619041936727463717926744868338358149568368643403037768649616778526013610493696186055899318268339432671541328195724261329606699831016666359440874843103020666106568222401047720269951530296879490444224546654729111504346660859907296364097126834834235287147",
+	 "1487259134814709264092032648525971038895865645148901180585340454985524155135260217788758027400478312256339496385275012465661575576202252063145698732079880294664220579764848767704076761853197216563262660046602703973050798218246170835962005598561669706844469447435461092542265792444947706769615695252256130901271870341005768912974433684521436211263358097522726462083917939091760026658925757076733484173202927141441492573799914240222628795405623953109131594523623353044898339481494120112723445689647986475279242446083151413667587008191682564376412347964146113898565886683139407005941383669325997475076910488086663256335689181157957571445067490187939553165903773554290260531009121879044170766615232300936675369451260747671432073394867530820527479172464106442450727640226503746586340279816318821395210726268291535648506190714616083163403189943334431056876038286530365757187367147446004855912033137386225053275419626102417236133948503",
+	 "1095121115716677802856811290392395128588168592409109494900178008967955253005183831872715423151551999734857184538199864469605657805519106717529655044054833197687459782636297255219742994736751541815269727940751860670268774903340296040006114013971309257028332849679096824800250742691718610670812374272414086863715763724622797509437062518082383056050144624962776302147890521249477060215148275163688301275847155316042279405557632639366066847442861422164832655874655824221577849928863023018366835675399949740429332468186340518172487073360822220449055340582568461568645259954873303616953776393853174845132081121976327462740354930744487429617202585015510744298530101547706821590188733515880733527449780963163909830077616357506845523215289297624086914545378511082534229620116563260168494523906566709418166011112754529766183554579321224940951177394088465596712620076240067370589036924024728375076210477267488679008016579588696191194060127319035195370137160936882402244399699172017835144537488486396906144217720028992863941288217185353914991583400421682751000603596655790990815525126154394344641336397793791497068253936771017031980867706707490224041075826337383538651825493679503771934836094655802776331664261631740148281763487765852746577808019633679",
+
+	 /* generic unrestricted moduli */
+	 "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
+	 "2893527720709661239493896562339544088620375736490408468011883030469939904368086092336458298221245707898933583190713188177399401852627749210994595974791782790253946539043962213027074922559572312141181787434278708783207966459019479487",
+	 "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319",
+	 "47266428956356393164697365098120418976400602706072312735924071745438532218237979333351774907308168340693326687317443721193266215155735814510792148768576498491199122744351399489453533553203833318691678263241941706256996197460424029012419012634671862283532342656309677173602509498417976091509154360039893165037637034737020327399910409885798185771003505320583967737293415979917317338985837385734747478364242020380416892056650841470869294527543597349250299539682430605173321029026555546832473048600327036845781970289288898317888427517364945316709081173840186150794397479045034008257793436817683392375274635794835245695887",
+	 "436463808505957768574894870394349739623346440601945961161254440072143298152040105676491048248110146278752857839930515766167441407021501229924721335644557342265864606569000117714935185566842453630868849121480179691838399545644365571106757731317371758557990781880691336695584799313313687287468894148823761785582982549586183756806449017542622267874275103877481475534991201849912222670102069951687572917937634467778042874315463238062009202992087620963771759666448266532858079402669920025224220613419441069718482837399612644978839925207109870840278194042158748845445131729137117098529028886770063736487420613144045836803985635654192482395882603511950547826439092832800532152534003936926017612446606135655146445620623395788978726744728503058670046885876251527122350275750995227",
+	 "11424167473351836398078306042624362277956429440521137061889702611766348760692206243140413411077394583180726863277012016602279290144126785129569474909173584789822341986742719230331946072730319555984484911716797058875905400999504305877245849119687509023232790273637466821052576859232452982061831009770786031785669030271542286603956118755585683996118896215213488875253101894663403069677745948305893849505434201763745232895780711972432011344857521691017896316861403206449421332243658855453435784006517202894181640562433575390821384210960117518650374602256601091379644034244332285065935413233557998331562749140202965844219336298970011513882564935538704289446968322281451907487362046511461221329799897350993370560697505809686438782036235372137015731304779072430260986460269894522159103008260495503005267165927542949439526272736586626709581721032189532726389643625590680105784844246152702670169304203783072275089194754889511973916207",
+	 "1214855636816562637502584060163403830270705000634713483015101384881871978446801224798536155406895823305035467591632531067547890948695117172076954220727075688048751022421198712032848890056357845974246560748347918630050853933697792254955890439720297560693579400297062396904306270145886830719309296352765295712183040773146419022875165382778007040109957609739589875590885701126197906063620133954893216612678838507540777138437797705602453719559017633986486649523611975865005712371194067612263330335590526176087004421363598470302731349138773205901447704682181517904064735636518462452242791676541725292378925568296858010151852326316777511935037531017413910506921922450666933202278489024521263798482237150056835746454842662048692127173834433089016107854491097456725016327709663199738238442164843147132789153725513257167915555162094970853584447993125488607696008169807374736711297007473812256272245489405898470297178738029484459690836250560495461579533254473316340608217876781986188705928270735695752830825527963838355419762516246028680280988020401914551825487349990306976304093109384451438813251211051597392127491464898797406789175453067960072008590614886532333015881171367104445044718144312416815712216611576221546455968770801413440778423979",
+	 NULL
+      };
+      log = fopen("logs/expt.log", "w");
+      logb = fopen("logs/expt_dr.log", "w");
+      logc = fopen("logs/expt_2k.log", "w");
+      logd = fopen("logs/expt_2kl.log", "w");
+      for (n = 0; primes[n]; n++) {
+	 SLEEP;
+	 mp_read_radix(&a, primes[n], 10);
+	 mp_zero(&b);
+	 for (rr = 0; rr < (unsigned) mp_count_bits(&a); rr++) {
+	    mp_mul_2(&b, &b);
+	    b.dp[0] |= lbit();
+	    b.used += 1;
+	 }
+	 mp_sub_d(&a, 1, &c);
+	 mp_mod(&b, &c, &b);
+	 mp_set(&c, 3);
+	 rr = 0;
+	 tt = -1;
+	 do {
+	    gg = TIMFUNC();
+	    DO(mp_exptmod(&c, &b, &a, &d));
+	    gg = (TIMFUNC() - gg) >> 1;
+	    if (tt > gg)
+	       tt = gg;
+	 } while (++rr < 10);
+	 mp_sub_d(&a, 1, &e);
+	 mp_sub(&e, &b, &b);
+	 mp_exptmod(&c, &b, &a, &e);	/* c^(p-1-b) mod a */
+	 mp_mulmod(&e, &d, &a, &d);	/* c^b * c^(p-1-b) == c^p-1 == 1 */
+	 if (mp_cmp_d(&d, 1)) {
+	    printf("Different (%d)!!!\n", mp_count_bits(&a));
+	    draw(&d);
+	    exit(0);
+	 }
+	 printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu cycles\n",
+		mp_count_bits(&a), CLK_PER_SEC / tt, tt);
+	 fprintf(n < 4 ? logd : (n < 9) ? logc : (n < 16) ? logb : log,
+		 "%d %9llu\n", mp_count_bits(&a), tt);
+      }
+   }
+   fclose(log);
+   fclose(logb);
+   fclose(logc);
+   fclose(logd);
+
+   log = fopen("logs/invmod.log", "w");
+   for (cnt = 4; cnt <= 128; cnt += 4) {
+      SLEEP;
+      mp_rand(&a, cnt);
+      mp_rand(&b, cnt);
+
+      do {
+	 mp_add_d(&b, 1, &b);
+	 mp_gcd(&a, &b, &c);
+      } while (mp_cmp_d(&c, 1) != MP_EQ);
+
+      rr = 0;
+      tt = -1;
+      do {
+	 gg = TIMFUNC();
+	 DO(mp_invmod(&b, &a, &c));
+	 gg = (TIMFUNC() - gg) >> 1;
+	 if (tt > gg)
+	    tt = gg;
+      } while (++rr < 1000);
+      mp_mulmod(&b, &c, &a, &d);
+      if (mp_cmp_d(&d, 1) != MP_EQ) {
+	 printf("Failed to invert\n");
+	 return 0;
+      }
+      printf("Inverting mod\t%4d-bit => %9llu/sec, %9llu cycles\n",
+	     mp_count_bits(&a), CLK_PER_SEC / tt, tt);
+      fprintf(log, "%d %9llu\n", cnt * DIGIT_BIT, tt);
+   }
+   fclose(log);
+
+   return 0;
+}
diff --git a/libtommath/dep.pl b/libtommath/dep.pl
new file mode 100644
index 0000000..c39e27e
--- /dev/null
+++ b/libtommath/dep.pl
@@ -0,0 +1,123 @@
+#!/usr/bin/perl 
+#
+# Walk through source, add labels and make classes
+#
+#use strict;
+
+my %deplist;
+
+#open class file and write preamble 
+open(CLASS, ">tommath_class.h") or die "Couldn't open tommath_class.h for writing\n";
+print CLASS "#if !(defined(LTM1) && defined(LTM2) && defined(LTM3))\n#if defined(LTM2)\n#define LTM3\n#endif\n#if defined(LTM1)\n#define LTM2\n#endif\n#define LTM1\n\n#if defined(LTM_ALL)\n";
+
+foreach my $filename (glob "bn*.c") {
+   my $define = $filename;
+
+print "Processing $filename\n";
+
+   # convert filename to upper case so we can use it as a define 
+   $define =~ tr/[a-z]/[A-Z]/;
+   $define =~ tr/\./_/;
+   print CLASS "#define $define\n";
+
+   # now copy text and apply #ifdef as required 
+   my $apply = 0;
+   open(SRC, "<$filename");
+   open(OUT, ">tmp");
+
+   # first line will be the #ifdef
+   my $line = <SRC>;
+   if ($line =~ /include/) {
+      print OUT $line;
+   } else {
+      print OUT "#include <tommath.h>\n#ifdef $define\n$line";
+      $apply = 1;
+   }
+   while (<SRC>) {
+      if (!($_ =~ /tommath\.h/)) {
+         print OUT $_;
+      }
+   }
+   if ($apply == 1) {
+      print OUT "#endif\n";
+   }
+   close SRC;
+   close OUT;
+
+   unlink($filename);
+   rename("tmp", $filename);
+}
+print CLASS "#endif\n\n";
+
+# now do classes 
+
+foreach my $filename (glob "bn*.c") {
+   open(SRC, "<$filename") or die "Can't open source file!\n"; 
+
+   # convert filename to upper case so we can use it as a define 
+   $filename =~ tr/[a-z]/[A-Z]/;
+   $filename =~ tr/\./_/;
+
+   print CLASS "#if defined($filename)\n";
+   my $list = $filename;
+
+   # scan for mp_* and make classes
+   while (<SRC>) {
+      my $line = $_;
+      while ($line =~ m/(fast_)*(s_)*mp\_[a-z_0-9]*/) {
+          $line = $';
+          # now $& is the match, we want to skip over LTM keywords like
+          # mp_int, mp_word, mp_digit
+          if (!($& eq "mp_digit") && !($& eq "mp_word") && !($& eq "mp_int")) {
+             my $a = $&;
+             $a =~ tr/[a-z]/[A-Z]/;
+             $a = "BN_" . $a . "_C";
+             if (!($list =~ /$a/)) {
+                print CLASS "   #define $a\n";
+             }
+             $list = $list . "," . $a;
+          }
+      }
+   }
+   @deplist{$filename} = $list;
+
+   print CLASS "#endif\n\n";
+   close SRC;
+}
+
+print CLASS "#ifdef LTM3\n#define LTM_LAST\n#endif\n#include <tommath_superclass.h>\n#include <tommath_class.h>\n#else\n#define LTM_LAST\n#endif\n";
+close CLASS;
+
+#now let's make a cool call graph... 
+
+open(OUT,">callgraph.txt");
+$indent = 0;
+foreach (keys %deplist) {
+   $list = "";
+   draw_func(@deplist{$_});
+   print OUT "\n\n";
+}
+close(OUT);
+
+sub draw_func()
+{
+   my @funcs = split(",", $_[0]);
+   if ($list =~ /@funcs[0]/) {
+      return;
+   } else {
+      $list = $list . @funcs[0];
+   }
+   if ($indent == 0) { }
+   elsif ($indent >= 1) { print OUT "|   " x ($indent - 1) . "+--->"; }
+   print OUT @funcs[0] . "\n";   
+   shift @funcs;
+      my $temp = $list;
+   foreach my $i (@funcs) {
+      ++$indent;
+      draw_func(@deplist{$i});
+      --$indent;
+   }
+      $list = $temp;
+}
+
+
diff --git a/libtommath/etc/2kprime.1 b/libtommath/etc/2kprime.1
new file mode 100644
index 0000000..c41ded1
--- /dev/null
+++ b/libtommath/etc/2kprime.1
@@ -0,0 +1,2 @@
+256-bits (k = 36113) = 115792089237316195423570985008687907853269984665640564039457584007913129603823
+512-bits (k = 38117) = 13407807929942597099574024998205846127479365820592393377723561443721764030073546976801874298166903427690031858186486050853753882811946569946433649006045979
diff --git a/libtommath/etc/2kprime.c b/libtommath/etc/2kprime.c
new file mode 100644
index 0000000..d48b83e
--- /dev/null
+++ b/libtommath/etc/2kprime.c
@@ -0,0 +1,80 @@
+/* Makes safe primes of a 2k nature */
+#include <tommath.h>
+#include <time.h>
+
+int sizes[] = {256, 512, 768, 1024, 1536, 2048, 3072, 4096};
+
+int main(void)
+{
+   char buf[2000];
+   int x, y;
+   mp_int q, p;
+   FILE *out;
+   clock_t t1;
+   mp_digit z;
+   
+   mp_init_multi(&q, &p, NULL);
+   
+   out = fopen("2kprime.1", "w");
+   for (x = 0; x < (int)(sizeof(sizes) / sizeof(sizes[0])); x++) {
+   top:
+       mp_2expt(&q, sizes[x]);
+       mp_add_d(&q, 3, &q);
+       z = -3;
+       
+       t1 = clock();
+       for(;;) {
+         mp_sub_d(&q, 4, &q);
+         z += 4;
+
+         if (z > MP_MASK) {
+            printf("No primes of size %d found\n", sizes[x]);
+            break;
+         }
+         
+         if (clock() - t1 > CLOCKS_PER_SEC) { 
+            printf("."); fflush(stdout);
+//            sleep((clock() - t1 + CLOCKS_PER_SEC/2)/CLOCKS_PER_SEC);
+            t1 = clock();
+         }
+         
+         /* quick test on q */
+         mp_prime_is_prime(&q, 1, &y);
+         if (y == 0) {
+            continue;
+         }
+
+         /* find (q-1)/2 */
+         mp_sub_d(&q, 1, &p);
+         mp_div_2(&p, &p);
+         mp_prime_is_prime(&p, 3, &y);
+         if (y == 0) {
+            continue;
+         }
+
+         /* test on q */
+         mp_prime_is_prime(&q, 3, &y);
+         if (y == 0) {
+            continue;
+         }
+
+         break;
+       }
+       
+       if (y == 0) {
+          ++sizes[x];
+          goto top;
+       }
+       
+       mp_toradix(&q, buf, 10);
+       printf("\n\n%d-bits (k = %lu) = %s\n", sizes[x], z, buf);
+       fprintf(out, "%d-bits (k = %lu) = %s\n", sizes[x], z, buf); fflush(out);
+   }
+   
+   return 0;
+}   
+       
+         
+            
+            
+          
diff --git a/libtommath/etc/drprime.c b/libtommath/etc/drprime.c
new file mode 100644
index 0000000..0ab8ea6
--- /dev/null
+++ b/libtommath/etc/drprime.c
@@ -0,0 +1,60 @@
+/* Makes safe primes of a DR nature */
+#include <tommath.h>
+
+int sizes[] = { 1+256/DIGIT_BIT, 1+512/DIGIT_BIT, 1+768/DIGIT_BIT, 1+1024/DIGIT_BIT, 1+2048/DIGIT_BIT, 1+4096/DIGIT_BIT };
+int main(void)
+{
+   int res, x, y;
+   char buf[4096];
+   FILE *out;
+   mp_int a, b;
+   
+   mp_init(&a);
+   mp_init(&b);
+   
+   out = fopen("drprimes.txt", "w");
+   for (x = 0; x < (int)(sizeof(sizes)/sizeof(sizes[0])); x++) {
+   top:
+       printf("Seeking a %d-bit safe prime\n", sizes[x] * DIGIT_BIT);
+       mp_grow(&a, sizes[x]);
+       mp_zero(&a);
+       for (y = 1; y < sizes[x]; y++) {
+           a.dp[y] = MP_MASK;
+       }
+       
+       /* make a DR modulus */
+       a.dp[0] = -1;
+       a.used = sizes[x];
+       
+       /* now loop */
+       res = 0;
+       for (;;) { 
+          a.dp[0] += 4;
+          if (a.dp[0] >= MP_MASK) break;
+          mp_prime_is_prime(&a, 1, &res);
+          if (res == 0) continue;
+          printf("."); fflush(stdout);
+          mp_sub_d(&a, 1, &b);
+          mp_div_2(&b, &b);
+          mp_prime_is_prime(&b, 3, &res);  
+          if (res == 0) continue;
+          mp_prime_is_prime(&a, 3, &res);
+          if (res == 1) break;
+	}
+        
+        if (res != 1) {
+           printf("Error not DR modulus\n"); sizes[x] += 1; goto top;
+        } else {
+           mp_toradix(&a, buf, 10);
+           printf("\n\np == %s\n\n", buf);
+           fprintf(out, "%d-bit prime:\np == %s\n\n", mp_count_bits(&a), buf); fflush(out);
+        }           
+   }
+   fclose(out);
+   
+   mp_clear(&a);
+   mp_clear(&b);
+   
+   return 0;
+}
+
diff --git a/libtommath/etc/drprimes.28 b/libtommath/etc/drprimes.28
new file mode 100644
index 0000000..9d438ad
--- /dev/null
+++ b/libtommath/etc/drprimes.28
@@ -0,0 +1,25 @@
+DR safe primes for 28-bit digits.
+
+224-bit prime:
+p == 26959946667150639794667015087019630673637144422540572481103341844143
+
+532-bit prime:
+p == 14059105607947488696282932836518693308967803494693489478439861164411992439598399594747002144074658928593502845729752797260025831423419686528151609940203368691747
+
+784-bit prime:
+p == 101745825697019260773923519755878567461315282017759829107608914364075275235254395622580447400994175578963163918967182013639660669771108475957692810857098847138903161308502419410142185759152435680068435915159402496058513611411688900243039
+
+1036-bit prime:
+p == 736335108039604595805923406147184530889923370574768772191969612422073040099331944991573923112581267542507986451953227192970402893063850485730703075899286013451337291468249027691733891486704001513279827771740183629161065194874727962517148100775228363421083691764065477590823919364012917984605619526140821798437127
+
+1540-bit prime:
+p == 38564998830736521417281865696453025806593491967131023221754800625044118265468851210705360385717536794615180260494208076605798671660719333199513807806252394423283413430106003596332513246682903994829528690198205120921557533726473585751382193953592127439965050261476810842071573684505878854588706623484573925925903505747545471088867712185004135201289273405614415899438276535626346098904241020877974002916168099951885406379295536200413493190419727789712076165162175783
+
+2072-bit prime:
+p == 542189391331696172661670440619180536749994166415993334151601745392193484590296600979602378676624808129613777993466242203025054573692562689251250471628358318743978285860720148446448885701001277560572526947619392551574490839286458454994488665744991822837769918095117129546414124448777033941223565831420390846864429504774477949153794689948747680362212954278693335653935890352619041936727463717926744868338358149568368643403037768649616778526013610493696186055899318268339432671541328195724261329606699831016666359440874843103020666106568222401047720269951530296879490444224546654729111504346660859907296364097126834834235287147
+
+3080-bit prime:
+p == 1487259134814709264092032648525971038895865645148901180585340454985524155135260217788758027400478312256339496385275012465661575576202252063145698732079880294664220579764848767704076761853197216563262660046602703973050798218246170835962005598561669706844469447435461092542265792444947706769615695252256130901271870341005768912974433684521436211263358097522726462083917939091760026658925757076733484173202927141441492573799914240222628795405623953109131594523623353044898339481494120112723445689647986475279242446083151413667587008191682564376412347964146113898565886683139407005941383669325997475076910488086663256335689181157957571445067490187939553165903773554290260531009121879044170766615232300936675369451260747671432073394867530820527479172464106442450727640226503746586340279816318821395210726268291535648506190714616083163403189943334431056876038286530365757187367147446004855912033137386225053275419626102417236133948503
+
+4116-bit prime:
+p == 1095121115716677802856811290392395128588168592409109494900178008967955253005183831872715423151551999734857184538199864469605657805519106717529655044054833197687459782636297255219742994736751541815269727940751860670268774903340296040006114013971309257028332849679096824800250742691718610670812374272414086863715763724622797509437062518082383056050144624962776302147890521249477060215148275163688301275847155316042279405557632639366066847442861422164832655874655824221577849928863023018366835675399949740429332468186340518172487073360822220449055340582568461568645259954873303616953776393853174845132081121976327462740354930744487429617202585015510744298530101547706821590188733515880733527449780963163909830077616357506845523215289297624086914545378511082534229620116563260168494523906566709418166011112754529766183554579321224940951177394088465596712620076240067370589036924024728375076210477267488679008016579588696191194060127319035195370137160936882402244399699172017835144537488486396906144217720028992863941288217185353914991583400421682751000603596655790990815525126154394344641336397793791497068253936771017031980867706707490224041075826337383538651825493679503771934836094655802776331664261631740148281763487765852746577808019633679
diff --git a/libtommath/etc/drprimes.txt b/libtommath/etc/drprimes.txt
new file mode 100644
index 0000000..2c887ea
--- /dev/null
+++ b/libtommath/etc/drprimes.txt
@@ -0,0 +1,6 @@
+280-bit prime:
+p == 1942668892225729070919461906823518906642406839052139521251812409738904285204940164839
+
+532-bit prime:
+p == 14059105607947488696282932836518693308967803494693489478439861164411992439598399594747002144074658928593502845729752797260025831423419686528151609940203368691747
+
diff --git a/libtommath/etc/makefile b/libtommath/etc/makefile
new file mode 100644
index 0000000..99154d8
--- /dev/null
+++ b/libtommath/etc/makefile
@@ -0,0 +1,50 @@
+CFLAGS += -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops -I../
+
+# default lib name (requires install with root)
+# LIBNAME=-ltommath
+
+# libname when you can't install the lib with install
+LIBNAME=../libtommath.a
+
+#provable primes
+pprime: pprime.o
+	$(CC) pprime.o $(LIBNAME) -o pprime
+
+# portable [well requires clock()] tuning app
+tune: tune.o
+	$(CC) tune.o $(LIBNAME) -o tune
+	
+# same app but using RDTSC for higher precision [requires 80586+], coff based gcc installs [e.g. ming, cygwin, djgpp]
+tune86: tune.c
+	nasm -f coff timer.asm
+	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o  $(LIBNAME) -o tune86
+	
+# for cygwin
+tune86c: tune.c
+	nasm -f gnuwin32 timer.asm
+	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o  $(LIBNAME) -o tune86
+
+#make tune86 for linux or any ELF format
+tune86l: tune.c
+	nasm -f elf -DUSE_ELF timer.asm
+	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o $(LIBNAME) -o tune86l
+        
+# spits out mersenne primes
+mersenne: mersenne.o
+	$(CC) mersenne.o $(LIBNAME) -o mersenne
+
+# fines DR safe primes for the given config
+drprime: drprime.o
+	$(CC) drprime.o $(LIBNAME) -o drprime
+	
+# fines 2k safe primes for the given config
+2kprime: 2kprime.o
+	$(CC) 2kprime.o $(LIBNAME) -o 2kprime
+
+mont: mont.o
+	$(CC) mont.o $(LIBNAME) -o mont
+
+        
+clean:
+	rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont 2kprime pprime.dat \
+         *.da *.dyn *.dpi *~
diff --git a/libtommath/etc/makefile.icc b/libtommath/etc/makefile.icc
new file mode 100644
index 0000000..0a50728
--- /dev/null
+++ b/libtommath/etc/makefile.icc
@@ -0,0 +1,67 @@
+CC = icc
+
+CFLAGS += -I../
+
+# optimize for SPEED
+#
+# -mcpu= can be pentium, pentiumpro (covers PII through PIII) or pentium4
+# -ax?   specifies make code specifically for ? but compatible with IA-32
+# -x?    specifies compile solely for ? [not specifically IA-32 compatible]
+#
+# where ? is 
+#   K - PIII
+#   W - first P4 [Williamette]
+#   N - P4 Northwood
+#   P - P4 Prescott
+#   B - Blend of P4 and PM [mobile]
+#
+# Default to just generic max opts
+CFLAGS += -O3 -xN -ip
+
+# default lib name (requires install with root)
+# LIBNAME=-ltommath
+
+# libname when you can't install the lib with install
+LIBNAME=../libtommath.a
+
+#provable primes
+pprime: pprime.o
+	$(CC) pprime.o $(LIBNAME) -o pprime
+
+# portable [well requires clock()] tuning app
+tune: tune.o
+	$(CC) tune.o $(LIBNAME) -o tune
+	
+# same app but using RDTSC for higher precision [requires 80586+], coff based gcc installs [e.g. ming, cygwin, djgpp]
+tune86: tune.c
+	nasm -f coff timer.asm
+	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o  $(LIBNAME) -o tune86
+	
+# for cygwin
+tune86c: tune.c
+	nasm -f gnuwin32 timer.asm
+	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o  $(LIBNAME) -o tune86
+
+#make tune86 for linux or any ELF format
+tune86l: tune.c
+	nasm -f elf -DUSE_ELF timer.asm
+	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o $(LIBNAME) -o tune86l
+        
+# spits out mersenne primes
+mersenne: mersenne.o
+	$(CC) mersenne.o $(LIBNAME) -o mersenne
+
+# fines DR safe primes for the given config
+drprime: drprime.o
+	$(CC) drprime.o $(LIBNAME) -o drprime
+	
+# fines 2k safe primes for the given config
+2kprime: 2kprime.o
+	$(CC) 2kprime.o $(LIBNAME) -o 2kprime
+
+mont: mont.o
+	$(CC) mont.o $(LIBNAME) -o mont
+
+        
+clean:
+	rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont 2kprime pprime.dat *.il
diff --git a/libtommath/etc/makefile.msvc b/libtommath/etc/makefile.msvc
new file mode 100644
index 0000000..2833372
--- /dev/null
+++ b/libtommath/etc/makefile.msvc
@@ -0,0 +1,23 @@
+#MSVC Makefile
+#
+#Tom St Denis
+
+CFLAGS = /I../ /Ox /DWIN32 /W3
+
+pprime: pprime.obj
+	cl pprime.obj ../tommath.lib 
+
+mersenne: mersenne.obj
+	cl mersenne.obj ../tommath.lib
+	
+tune: tune.obj
+	cl tune.obj ../tommath.lib
+
+mont: mont.obj
+	cl mont.obj ../tommath.lib
+	
+drprime: drprime.obj
+	cl drprime.obj ../tommath.lib
+
+2kprime: 2kprime.obj
+	cl 2kprime.obj ../tommath.lib
diff --git a/libtommath/etc/mersenne.c b/libtommath/etc/mersenne.c
new file mode 100644
index 0000000..1cd5b50
--- /dev/null
+++ b/libtommath/etc/mersenne.c
@@ -0,0 +1,140 @@
+/* Finds Mersenne primes using the Lucas-Lehmer test 
+ *
+ * Tom St Denis, tomstdenis@iahu.ca
+ */
+#include <time.h>
+#include <tommath.h>
+
+int
+is_mersenne (long s, int *pp)
+{
+  mp_int  n, u;
+  int     res, k;
+  
+  *pp = 0;
+
+  if ((res = mp_init (&n)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init (&u)) != MP_OKAY) {
+    goto LBL_N;
+  }
+
+  /* n = 2^s - 1 */
+  if ((res = mp_2expt(&n, s)) != MP_OKAY) {
+     goto LBL_MU;
+  }
+  if ((res = mp_sub_d (&n, 1, &n)) != MP_OKAY) {
+    goto LBL_MU;
+  }
+
+  /* set u=4 */
+  mp_set (&u, 4);
+
+  /* for k=1 to s-2 do */
+  for (k = 1; k <= s - 2; k++) {
+    /* u = u^2 - 2 mod n */
+    if ((res = mp_sqr (&u, &u)) != MP_OKAY) {
+      goto LBL_MU;
+    }
+    if ((res = mp_sub_d (&u, 2, &u)) != MP_OKAY) {
+      goto LBL_MU;
+    }
+
+    /* make sure u is positive */
+    while (u.sign == MP_NEG) {
+      if ((res = mp_add (&u, &n, &u)) != MP_OKAY) {
+         goto LBL_MU;
+      }
+    }
+
+    /* reduce */
+    if ((res = mp_reduce_2k (&u, &n, 1)) != MP_OKAY) {
+      goto LBL_MU;
+    }
+  }
+
+  /* if u == 0 then its prime */
+  if (mp_iszero (&u) == 1) {
+    mp_prime_is_prime(&n, 8, pp);
+  if (*pp != 1) printf("FAILURE\n");
+  }
+
+  res = MP_OKAY;
+LBL_MU:mp_clear (&u);
+LBL_N:mp_clear (&n);
+  return res;
+}
+
+/* square root of a long < 65536 */
+long
+i_sqrt (long x)
+{
+  long    x1, x2;
+
+  x2 = 16;
+  do {
+    x1 = x2;
+    x2 = x1 - ((x1 * x1) - x) / (2 * x1);
+  } while (x1 != x2);
+
+  if (x1 * x1 > x) {
+    --x1;
+  }
+
+  return x1;
+}
+
+/* is the long prime by brute force */
+int
+isprime (long k)
+{
+  long    y, z;
+
+  y = i_sqrt (k);
+  for (z = 2; z <= y; z++) {
+    if ((k % z) == 0)
+      return 0;
+  }
+  return 1;
+}
+
+
+int
+main (void)
+{
+  int     pp;
+  long    k;
+  clock_t tt;
+
+  k = 3;
+
+  for (;;) {
+    /* start time */
+    tt = clock ();
+
+    /* test if 2^k - 1 is prime */
+    if (is_mersenne (k, &pp) != MP_OKAY) {
+      printf ("Whoa error\n");
+      return -1;
+    }
+
+    if (pp == 1) {
+      /* count time */
+      tt = clock () - tt;
+
+      /* display if prime */
+      printf ("2^%-5ld - 1 is prime, test took %ld ticks\n", k, tt);
+    }
+
+    /* goto next odd exponent */
+    k += 2;
+
+    /* but make sure its prime */
+    while (isprime (k) == 0) {
+      k += 2;
+    }
+  }
+  return 0;
+}
diff --git a/libtommath/etc/mont.c b/libtommath/etc/mont.c
new file mode 100644
index 0000000..dbf1735
--- /dev/null
+++ b/libtommath/etc/mont.c
@@ -0,0 +1,46 @@
+/* tests the montgomery routines */
+#include <tommath.h>
+
+int main(void)
+{
+   mp_int modulus, R, p, pp;
+   mp_digit mp;
+   long x, y;
+
+   srand(time(NULL));
+   mp_init_multi(&modulus, &R, &p, &pp, NULL);
+
+   /* loop through various sizes */
+   for (x = 4; x < 256; x++) {
+       printf("DIGITS == %3ld...", x); fflush(stdout);
+       
+       /* make up the odd modulus */
+       mp_rand(&modulus, x);
+       modulus.dp[0] |= 1;
+       
+       /* now find the R value */
+       mp_montgomery_calc_normalization(&R, &modulus);
+       mp_montgomery_setup(&modulus, &mp);
+       
+       /* now run through a bunch tests */
+       for (y = 0; y < 1000; y++) {
+           mp_rand(&p, x/2);        /* p = random */
+           mp_mul(&p, &R, &pp);     /* pp = R * p */
+           mp_montgomery_reduce(&pp, &modulus, mp);
+           
+           /* should be equal to p */
+           if (mp_cmp(&pp, &p) != MP_EQ) {
+              printf("FAILURE!\n");
+              exit(-1);
+           }
+       }
+       printf("PASSED\n");
+    }
+    
+    return 0;
+}
+
+
+
+
+
diff --git a/libtommath/etc/pprime.c b/libtommath/etc/pprime.c
new file mode 100644
index 0000000..26e0d84
--- /dev/null
+++ b/libtommath/etc/pprime.c
@@ -0,0 +1,396 @@
+/* Generates provable primes
+ *
+ * See http://iahu.ca:8080/papers/pp.pdf for more info.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://tom.iahu.ca
+ */
+#include <time.h>
+#include "tommath.h"
+
+int   n_prime;
+FILE *primes;
+
+/* fast square root */
+static  mp_digit
+i_sqrt (mp_word x)
+{
+  mp_word x1, x2;
+
+  x2 = x;
+  do {
+    x1 = x2;
+    x2 = x1 - ((x1 * x1) - x) / (2 * x1);
+  } while (x1 != x2);
+
+  if (x1 * x1 > x) {
+    --x1;
+  }
+
+  return x1;
+}
+
+
+/* generates a prime digit */
+static void gen_prime (void)
+{
+  mp_digit r, x, y, next;
+  FILE *out;
+
+  out = fopen("pprime.dat", "wb");
+
+  /* write first set of primes */
+  r = 3; fwrite(&r, 1, sizeof(mp_digit), out);
+  r = 5; fwrite(&r, 1, sizeof(mp_digit), out);
+  r = 7; fwrite(&r, 1, sizeof(mp_digit), out);
+  r = 11; fwrite(&r, 1, sizeof(mp_digit), out);
+  r = 13; fwrite(&r, 1, sizeof(mp_digit), out);
+  r = 17; fwrite(&r, 1, sizeof(mp_digit), out);
+  r = 19; fwrite(&r, 1, sizeof(mp_digit), out);
+  r = 23; fwrite(&r, 1, sizeof(mp_digit), out);
+  r = 29; fwrite(&r, 1, sizeof(mp_digit), out);
+  r = 31; fwrite(&r, 1, sizeof(mp_digit), out);
+
+  /* get square root, since if 'r' is composite its factors must be < than this */
+  y = i_sqrt (r);
+  next = (y + 1) * (y + 1);
+
+  for (;;) {
+  do {
+    r += 2;			/* next candidate */
+    r &= MP_MASK;
+    if (r < 31) break;
+
+    /* update sqrt ? */
+    if (next <= r) {
+      ++y;
+      next = (y + 1) * (y + 1);
+    }
+
+    /* loop if divisible by 3,5,7,11,13,17,19,23,29  */
+    if ((r % 3) == 0) {
+      x = 0;
+      continue;
+    }
+    if ((r % 5) == 0) {
+      x = 0;
+      continue;
+    }
+    if ((r % 7) == 0) {
+      x = 0;
+      continue;
+    }
+    if ((r % 11) == 0) {
+      x = 0;
+      continue;
+    }
+    if ((r % 13) == 0) {
+      x = 0;
+      continue;
+    }
+    if ((r % 17) == 0) {
+      x = 0;
+      continue;
+    }
+    if ((r % 19) == 0) {
+      x = 0;
+      continue;
+    }
+    if ((r % 23) == 0) {
+      x = 0;
+      continue;
+    }
+    if ((r % 29) == 0) {
+      x = 0;
+      continue;
+    }
+
+    /* now check if r is divisible by x + k={1,7,11,13,17,19,23,29} */
+    for (x = 30; x <= y; x += 30) {
+      if ((r % (x + 1)) == 0) {
+	x = 0;
+	break;
+      }
+      if ((r % (x + 7)) == 0) {
+	x = 0;
+	break;
+      }
+      if ((r % (x + 11)) == 0) {
+	x = 0;
+	break;
+      }
+      if ((r % (x + 13)) == 0) {
+	x = 0;
+	break;
+      }
+      if ((r % (x + 17)) == 0) {
+	x = 0;
+	break;
+      }
+      if ((r % (x + 19)) == 0) {
+	x = 0;
+	break;
+      }
+      if ((r % (x + 23)) == 0) {
+	x = 0;
+	break;
+      }
+      if ((r % (x + 29)) == 0) {
+	x = 0;
+	break;
+      }
+    }
+  } while (x == 0);
+  if (r > 31) { fwrite(&r, 1, sizeof(mp_digit), out); printf("%9d\r", r); fflush(stdout); }
+  if (r < 31) break;
+  }
+
+  fclose(out);
+}
+
+void load_tab(void)
+{
+   primes = fopen("pprime.dat", "rb");
+   if (primes == NULL) {
+      gen_prime();
+      primes = fopen("pprime.dat", "rb");
+   }
+   fseek(primes, 0, SEEK_END);
+   n_prime = ftell(primes) / sizeof(mp_digit);
+}
+
+mp_digit prime_digit(void)
+{
+   int n;
+   mp_digit d;
+
+   n = abs(rand()) % n_prime;
+   fseek(primes, n * sizeof(mp_digit), SEEK_SET);
+   fread(&d, 1, sizeof(mp_digit), primes);
+   return d;
+}
+
+
+/* makes a prime of at least k bits */
+int
+pprime (int k, int li, mp_int * p, mp_int * q)
+{
+  mp_int  a, b, c, n, x, y, z, v;
+  int     res, ii;
+  static const mp_digit bases[] = { 2, 3, 5, 7, 11, 13, 17, 19 };
+
+  /* single digit ? */
+  if (k <= (int) DIGIT_BIT) {
+    mp_set (p, prime_digit ());
+    return MP_OKAY;
+  }
+
+  if ((res = mp_init (&c)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init (&v)) != MP_OKAY) {
+    goto LBL_C;
+  }
+
+  /* product of first 50 primes */
+  if ((res =
+       mp_read_radix (&v,
+		      "19078266889580195013601891820992757757219839668357012055907516904309700014933909014729740190",
+		      10)) != MP_OKAY) {
+    goto LBL_V;
+  }
+
+  if ((res = mp_init (&a)) != MP_OKAY) {
+    goto LBL_V;
+  }
+
+  /* set the prime */
+  mp_set (&a, prime_digit ());
+
+  if ((res = mp_init (&b)) != MP_OKAY) {
+    goto LBL_A;
+  }
+
+  if ((res = mp_init (&n)) != MP_OKAY) {
+    goto LBL_B;
+  }
+
+  if ((res = mp_init (&x)) != MP_OKAY) {
+    goto LBL_N;
+  }
+
+  if ((res = mp_init (&y)) != MP_OKAY) {
+    goto LBL_X;
+  }
+
+  if ((res = mp_init (&z)) != MP_OKAY) {
+    goto LBL_Y;
+  }
+
+  /* now loop making the single digit */
+  while (mp_count_bits (&a) < k) {
+    fprintf (stderr, "prime has %4d bits left\r", k - mp_count_bits (&a));
+    fflush (stderr);
+  top:
+    mp_set (&b, prime_digit ());
+
+    /* now compute z = a * b * 2 */
+    if ((res = mp_mul (&a, &b, &z)) != MP_OKAY) {	/* z = a * b */
+      goto LBL_Z;
+    }
+
+    if ((res = mp_copy (&z, &c)) != MP_OKAY) {	/* c = a * b */
+      goto LBL_Z;
+    }
+
+    if ((res = mp_mul_2 (&z, &z)) != MP_OKAY) {	/* z = 2 * a * b */
+      goto LBL_Z;
+    }
+
+    /* n = z + 1 */
+    if ((res = mp_add_d (&z, 1, &n)) != MP_OKAY) {	/* n = z + 1 */
+      goto LBL_Z;
+    }
+
+    /* check (n, v) == 1 */
+    if ((res = mp_gcd (&n, &v, &y)) != MP_OKAY) {	/* y = (n, v) */
+      goto LBL_Z;
+    }
+
+    if (mp_cmp_d (&y, 1) != MP_EQ)
+      goto top;
+
+    /* now try base x=bases[ii]  */
+    for (ii = 0; ii < li; ii++) {
+      mp_set (&x, bases[ii]);
+
+      /* compute x^a mod n */
+      if ((res = mp_exptmod (&x, &a, &n, &y)) != MP_OKAY) {	/* y = x^a mod n */
+	goto LBL_Z;
+      }
+
+      /* if y == 1 loop */
+      if (mp_cmp_d (&y, 1) == MP_EQ)
+	continue;
+
+      /* now x^2a mod n */
+      if ((res = mp_sqrmod (&y, &n, &y)) != MP_OKAY) {	/* y = x^2a mod n */
+	goto LBL_Z;
+      }
+
+      if (mp_cmp_d (&y, 1) == MP_EQ)
+	continue;
+
+      /* compute x^b mod n */
+      if ((res = mp_exptmod (&x, &b, &n, &y)) != MP_OKAY) {	/* y = x^b mod n */
+	goto LBL_Z;
+      }
+
+      /* if y == 1 loop */
+      if (mp_cmp_d (&y, 1) == MP_EQ)
+	continue;
+
+      /* now x^2b mod n */
+      if ((res = mp_sqrmod (&y, &n, &y)) != MP_OKAY) {	/* y = x^2b mod n */
+	goto LBL_Z;
+      }
+
+      if (mp_cmp_d (&y, 1) == MP_EQ)
+	continue;
+
+      /* compute x^c mod n == x^ab mod n */
+      if ((res = mp_exptmod (&x, &c, &n, &y)) != MP_OKAY) {	/* y = x^ab mod n */
+	goto LBL_Z;
+      }
+
+      /* if y == 1 loop */
+      if (mp_cmp_d (&y, 1) == MP_EQ)
+	continue;
+
+      /* now compute (x^c mod n)^2 */
+      if ((res = mp_sqrmod (&y, &n, &y)) != MP_OKAY) {	/* y = x^2ab mod n */
+	goto LBL_Z;
+      }
+
+      /* y should be 1 */
+      if (mp_cmp_d (&y, 1) != MP_EQ)
+	continue;
+      break;
+    }
+
+    /* no bases worked? */
+    if (ii == li)
+      goto top;
+
+{
+   char buf[4096];
+
+   mp_toradix(&n, buf, 10);
+   printf("Certificate of primality for:\n%s\n\n", buf);
+   mp_toradix(&a, buf, 10);
+   printf("A == \n%s\n\n", buf);
+   mp_toradix(&b, buf, 10);
+   printf("B == \n%s\n\nG == %d\n", buf, bases[ii]);
+   printf("----------------------------------------------------------------\n");
+}
+
+    /* a = n */
+    mp_copy (&n, &a);
+  }
+
+  /* get q to be the order of the large prime subgroup */
+  mp_sub_d (&n, 1, q);
+  mp_div_2 (q, q);
+  mp_div (q, &b, q, NULL);
+
+  mp_exch (&n, p);
+
+  res = MP_OKAY;
+LBL_Z:mp_clear (&z);
+LBL_Y:mp_clear (&y);
+LBL_X:mp_clear (&x);
+LBL_N:mp_clear (&n);
+LBL_B:mp_clear (&b);
+LBL_A:mp_clear (&a);
+LBL_V:mp_clear (&v);
+LBL_C:mp_clear (&c);
+  return res;
+}
+
+
+int
+main (void)
+{
+  mp_int  p, q;
+  char    buf[4096];
+  int     k, li;
+  clock_t t1;
+
+  srand (time (NULL));
+  load_tab();
+
+  printf ("Enter # of bits: \n");
+  fgets (buf, sizeof (buf), stdin);
+  sscanf (buf, "%d", &k);
+
+  printf ("Enter number of bases to try (1 to 8):\n");
+  fgets (buf, sizeof (buf), stdin);
+  sscanf (buf, "%d", &li);
+
+
+  mp_init (&p);
+  mp_init (&q);
+
+  t1 = clock ();
+  pprime (k, li, &p, &q);
+  t1 = clock () - t1;
+
+  printf ("\n\nTook %ld ticks, %d bits\n", t1, mp_count_bits (&p));
+
+  mp_toradix (&p, buf, 10);
+  printf ("P == %s\n", buf);
+  mp_toradix (&q, buf, 10);
+  printf ("Q == %s\n", buf);
+
+  return 0;
+}
diff --git a/libtommath/etc/prime.1024 b/libtommath/etc/prime.1024
new file mode 100644
index 0000000..5636e2d
--- /dev/null
+++ b/libtommath/etc/prime.1024
@@ -0,0 +1,414 @@
+Enter # of bits: 
+Enter number of bases to try (1 to 8):
+Certificate of primality for:
+36360080703173363
+
+A == 
+89963569
+
+B == 
+202082249
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+4851595597739856136987139
+
+A == 
+36360080703173363
+
+B == 
+66715963
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+19550639734462621430325731591027
+
+A == 
+4851595597739856136987139
+
+B == 
+2014867
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+10409036141344317165691858509923818734539
+
+A == 
+19550639734462621430325731591027
+
+B == 
+266207047
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+1049829549988285012736475602118094726647504414203
+
+A == 
+10409036141344317165691858509923818734539
+
+B == 
+50428759
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+77194737385528288387712399596835459931920358844586615003
+
+A == 
+1049829549988285012736475602118094726647504414203
+
+B == 
+36765367
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+35663756695365208574443215955488689578374232732893628896541201763
+
+A == 
+77194737385528288387712399596835459931920358844586615003
+
+B == 
+230998627
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+16711831463502165169495622246023119698415848120292671294127567620396469803
+
+A == 
+35663756695365208574443215955488689578374232732893628896541201763
+
+B == 
+234297127
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+6163534781560285962890718925972249753147470953579266394395432475622345597103528739
+
+A == 
+16711831463502165169495622246023119698415848120292671294127567620396469803
+
+B == 
+184406323
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+814258256205243497704094951432575867360065658372158511036259934640748088306764553488803787
+
+A == 
+6163534781560285962890718925972249753147470953579266394395432475622345597103528739
+
+B == 
+66054487
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+176469695533271657902814176811660357049007467856432383037590673407330246967781451723764079581998187
+
+A == 
+814258256205243497704094951432575867360065658372158511036259934640748088306764553488803787
+
+B == 
+108362239
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+44924492859445516541759485198544012102424796403707253610035148063863073596051272171194806669756971406400419
+
+A == 
+176469695533271657902814176811660357049007467856432383037590673407330246967781451723764079581998187
+
+B == 
+127286707
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+20600996927219343383225424320134474929609459588323857796871086845924186191561749519858600696159932468024710985371059
+
+A == 
+44924492859445516541759485198544012102424796403707253610035148063863073596051272171194806669756971406400419
+
+B == 
+229284691
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+6295696427695493110141186605837397185848992307978456138112526915330347715236378041486547994708748840844217371233735072572979
+
+A == 
+20600996927219343383225424320134474929609459588323857796871086845924186191561749519858600696159932468024710985371059
+
+B == 
+152800771
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+3104984078042317488749073016454213579257792635142218294052134804187631661145261015102617582090263808696699966840735333252107678792123
+
+A == 
+6295696427695493110141186605837397185848992307978456138112526915330347715236378041486547994708748840844217371233735072572979
+
+B == 
+246595759
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+26405175827665701256325699315126705508919255051121452292124404943796947287968603975320562847910946802396632302209435206627913466015741799499
+
+A == 
+3104984078042317488749073016454213579257792635142218294052134804187631661145261015102617582090263808696699966840735333252107678792123
+
+B == 
+4252063
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+11122146237908413610034600609460545703591095894418599759742741406628055069007082998134905595800236452010905900391505454890446585211975124558601770163
+
+A == 
+26405175827665701256325699315126705508919255051121452292124404943796947287968603975320562847910946802396632302209435206627913466015741799499
+
+B == 
+210605419
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+1649861642047798890580354082088712649911849362201343649289384923147797960364736011515757482030049342943790127685185806092659832129486307035500638595572396187
+
+A == 
+11122146237908413610034600609460545703591095894418599759742741406628055069007082998134905595800236452010905900391505454890446585211975124558601770163
+
+B == 
+74170111
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+857983367126266717607389719637086684134462613006415859877666235955788392464081914127715967940968197765042399904117392707518175220864852816390004264107201177394565363
+
+A == 
+1649861642047798890580354082088712649911849362201343649289384923147797960364736011515757482030049342943790127685185806092659832129486307035500638595572396187
+
+B == 
+260016763
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+175995909353623703257072120479340610010337144085688850745292031336724691277374210929188442230237711063783727092685448718515661641054886101716698390145283196296702450566161283
+
+A == 
+857983367126266717607389719637086684134462613006415859877666235955788392464081914127715967940968197765042399904117392707518175220864852816390004264107201177394565363
+
+B == 
+102563707
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+48486002551155667224487059713350447239190772068092630563272168418880661006593537218144160068395218642353495339720640699721703003648144463556291315694787862009052641640656933232794283
+
+A == 
+175995909353623703257072120479340610010337144085688850745292031336724691277374210929188442230237711063783727092685448718515661641054886101716698390145283196296702450566161283
+
+B == 
+137747527
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+13156468011529105025061495011938518171328604045212410096476697450506055664012861932372156505805788068791146986282263016790631108386790291275939575123375304599622623328517354163964228279867403
+
+A == 
+48486002551155667224487059713350447239190772068092630563272168418880661006593537218144160068395218642353495339720640699721703003648144463556291315694787862009052641640656933232794283
+
+B == 
+135672847
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+6355194692790533601105154341731997464407930009404822926832136060319955058388106456084549316415200519472481147942263916585428906582726749131479465958107142228236909665306781538860053107680830113869123
+
+A == 
+13156468011529105025061495011938518171328604045212410096476697450506055664012861932372156505805788068791146986282263016790631108386790291275939575123375304599622623328517354163964228279867403
+
+B == 
+241523587
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+3157116676535430302794438027544146642863331358530722860333745617571010460905857862561870488000265751138954271040017454405707755458702044884023184574412221802502351503929935224995314581932097706874819348858083
+
+A == 
+6355194692790533601105154341731997464407930009404822926832136060319955058388106456084549316415200519472481147942263916585428906582726749131479465958107142228236909665306781538860053107680830113869123
+
+B == 
+248388667
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+390533129219992506725320633489467713907837370444962163378727819939092929448752905310115311180032249230394348337568973177802874166228132778126338883671958897238722734394783244237133367055422297736215754829839364158067
+
+A == 
+3157116676535430302794438027544146642863331358530722860333745617571010460905857862561870488000265751138954271040017454405707755458702044884023184574412221802502351503929935224995314581932097706874819348858083
+
+B == 
+61849651
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+48583654555070224891047847050732516652910250240135992225139515777200432486685999462997073444468380434359929499498804723793106565291183220444221080449740542884172281158126259373095216435009661050109711341419005972852770440739
+
+A == 
+390533129219992506725320633489467713907837370444962163378727819939092929448752905310115311180032249230394348337568973177802874166228132778126338883671958897238722734394783244237133367055422297736215754829839364158067
+
+B == 
+62201707
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+25733035251905120039135866524384525138869748427727001128764704499071378939227862068500633813538831598776578372709963673670934388213622433800015759585470542686333039614931682098922935087822950084908715298627996115185849260703525317419
+
+A == 
+48583654555070224891047847050732516652910250240135992225139515777200432486685999462997073444468380434359929499498804723793106565291183220444221080449740542884172281158126259373095216435009661050109711341419005972852770440739
+
+B == 
+264832231
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+2804594464939948901906623499531073917980499195397462605359913717827014360538186518540781517129548650937632008683280555602633122170458773895504894807182664540529077836857897972175530148107545939211339044386106111633510166695386323426241809387
+
+A == 
+25733035251905120039135866524384525138869748427727001128764704499071378939227862068500633813538831598776578372709963673670934388213622433800015759585470542686333039614931682098922935087822950084908715298627996115185849260703525317419
+
+B == 
+54494047
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+738136612083433720096707308165797114449914259256979340471077690416567237592465306112484843530074782721390528773594351482384711900456440808251196845265132086486672447136822046628407467459921823150600138073268385534588238548865012638209515923513516547
+
+A == 
+2804594464939948901906623499531073917980499195397462605359913717827014360538186518540781517129548650937632008683280555602633122170458773895504894807182664540529077836857897972175530148107545939211339044386106111633510166695386323426241809387
+
+B == 
+131594179
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+392847529056126766528615419937165193421166694172790666626558750047057558168124866940509180171236517681470100877687445134633784815352076138790217228749332398026714192707447855731679485746120589851992221508292976900578299504461333767437280988393026452846013683
+
+A == 
+738136612083433720096707308165797114449914259256979340471077690416567237592465306112484843530074782721390528773594351482384711900456440808251196845265132086486672447136822046628407467459921823150600138073268385534588238548865012638209515923513516547
+
+B == 
+266107603
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+168459393231883505975876919268398655632763956627405508859662408056221544310200546265681845397346956580604208064328814319465940958080244889692368602591598503944015835190587740756859842792554282496742843600573336023639256008687581291233481455395123454655488735304365627
+
+A == 
+392847529056126766528615419937165193421166694172790666626558750047057558168124866940509180171236517681470100877687445134633784815352076138790217228749332398026714192707447855731679485746120589851992221508292976900578299504461333767437280988393026452846013683
+
+B == 
+214408111
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+14865774288636941404884923981945833072113667565310054952177860608355263252462409554658728941191929400198053290113492910272458441655458514080123870132092365833472436407455910185221474386718838138135065780840839893113912689594815485706154461164071775481134379794909690501684643
+
+A == 
+168459393231883505975876919268398655632763956627405508859662408056221544310200546265681845397346956580604208064328814319465940958080244889692368602591598503944015835190587740756859842792554282496742843600573336023639256008687581291233481455395123454655488735304365627
+
+B == 
+44122723
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+1213301773203241614897109856134894783021668292000023984098824423682568173639394290886185366993108292039068940333907505157813934962357206131450244004178619265868614859794316361031904412926604138893775068853175215502104744339658944443630407632290152772487455298652998368296998719996019
+
+A == 
+14865774288636941404884923981945833072113667565310054952177860608355263252462409554658728941191929400198053290113492910272458441655458514080123870132092365833472436407455910185221474386718838138135065780840839893113912689594815485706154461164071775481134379794909690501684643
+
+B == 
+40808563
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+186935245989515158127969129347464851990429060640910951266513740972248428651109062997368144722015290092846666943896556191257222521203647606911446635194198213436423080005867489516421559330500722264446765608763224572386410155413161172707802334865729654109050873820610813855041667633843601286843
+
+A == 
+1213301773203241614897109856134894783021668292000023984098824423682568173639394290886185366993108292039068940333907505157813934962357206131450244004178619265868614859794316361031904412926604138893775068853175215502104744339658944443630407632290152772487455298652998368296998719996019
+
+B == 
+77035759
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+83142661079751490510739960019112406284111408348732592580459037404394946037094409915127399165633756159385609671956087845517678367844901424617866988187132480585966721962585586730693443536100138246516868613250009028187662080828012497191775172228832247706080044971423654632146928165751885302331924491683
+
+A == 
+186935245989515158127969129347464851990429060640910951266513740972248428651109062997368144722015290092846666943896556191257222521203647606911446635194198213436423080005867489516421559330500722264446765608763224572386410155413161172707802334865729654109050873820610813855041667633843601286843
+
+B == 
+222383587
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+3892354773803809855317742245039794448230625839512638747643814927766738642436392673485997449586432241626440927010641564064764336402368634186618250134234189066179771240232458249806850838490410473462391401438160528157981942499581634732706904411807195259620779379274017704050790865030808501633772117217899534443
+
+A == 
+83142661079751490510739960019112406284111408348732592580459037404394946037094409915127399165633756159385609671956087845517678367844901424617866988187132480585966721962585586730693443536100138246516868613250009028187662080828012497191775172228832247706080044971423654632146928165751885302331924491683
+
+B == 
+23407687
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+1663606652988091811284014366560171522582683318514519379924950390627250155440313691226744227787921928894551755219495501365555370027257568506349958010457682898612082048959464465369892842603765280317696116552850664773291371490339084156052244256635115997453399761029567033971998617303988376172539172702246575225837054723
+
+A == 
+3892354773803809855317742245039794448230625839512638747643814927766738642436392673485997449586432241626440927010641564064764336402368634186618250134234189066179771240232458249806850838490410473462391401438160528157981942499581634732706904411807195259620779379274017704050790865030808501633772117217899534443
+
+B == 
+213701827
+
+G == 2
+----------------------------------------------------------------
+
+
+Took 33057 ticks, 1048 bits
+P == 1663606652988091811284014366560171522582683318514519379924950390627250155440313691226744227787921928894551755219495501365555370027257568506349958010457682898612082048959464465369892842603765280317696116552850664773291371490339084156052244256635115997453399761029567033971998617303988376172539172702246575225837054723
+Q == 3892354773803809855317742245039794448230625839512638747643814927766738642436392673485997449586432241626440927010641564064764336402368634186618250134234189066179771240232458249806850838490410473462391401438160528157981942499581634732706904411807195259620779379274017704050790865030808501633772117217899534443
diff --git a/libtommath/etc/prime.512 b/libtommath/etc/prime.512
new file mode 100644
index 0000000..cb6ec30
--- /dev/null
+++ b/libtommath/etc/prime.512
@@ -0,0 +1,205 @@
+Enter # of bits: 
+Enter number of bases to try (1 to 8):
+Certificate of primality for:
+85933926807634727
+
+A == 
+253758023
+
+B == 
+169322581
+
+G == 5
+----------------------------------------------------------------
+Certificate of primality for:
+23930198825086241462113799
+
+A == 
+85933926807634727
+
+B == 
+139236037
+
+G == 11
+----------------------------------------------------------------
+Certificate of primality for:
+6401844647261612602378676572510019
+
+A == 
+23930198825086241462113799
+
+B == 
+133760791
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+269731366027728777712034888684015329354259
+
+A == 
+6401844647261612602378676572510019
+
+B == 
+21066691
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+37942338209025571690075025099189467992329684223707
+
+A == 
+269731366027728777712034888684015329354259
+
+B == 
+70333567
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+15306904714258982484473490774101705363308327436988160248323
+
+A == 
+37942338209025571690075025099189467992329684223707
+
+B == 
+201712723
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+1616744757018513392810355191503853040357155275733333124624513530099
+
+A == 
+15306904714258982484473490774101705363308327436988160248323
+
+B == 
+52810963
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+464222094814208047161771036072622485188658077940154689939306386289983787983
+
+A == 
+1616744757018513392810355191503853040357155275733333124624513530099
+
+B == 
+143566909
+
+G == 5
+----------------------------------------------------------------
+Certificate of primality for:
+187429931674053784626487560729643601208757374994177258429930699354770049369025096447
+
+A == 
+464222094814208047161771036072622485188658077940154689939306386289983787983
+
+B == 
+201875281
+
+G == 5
+----------------------------------------------------------------
+Certificate of primality for:
+100579220846502621074093727119851331775052664444339632682598589456666938521976625305832917563
+
+A == 
+187429931674053784626487560729643601208757374994177258429930699354770049369025096447
+
+B == 
+268311523
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+1173616081309758475197022137833792133815753368965945885089720153370737965497134878651384030219765163
+
+A == 
+100579220846502621074093727119851331775052664444339632682598589456666938521976625305832917563
+
+B == 
+5834287
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+191456913489905913185935197655672585713573070349044195411728114905691721186574907738081340754373032735283623
+
+A == 
+1173616081309758475197022137833792133815753368965945885089720153370737965497134878651384030219765163
+
+B == 
+81567097
+
+G == 5
+----------------------------------------------------------------
+Certificate of primality for:
+57856530489201750164178576399448868489243874083056587683743345599898489554401618943240901541005080049321706789987519
+
+A == 
+191456913489905913185935197655672585713573070349044195411728114905691721186574907738081340754373032735283623
+
+B == 
+151095433
+
+G == 7
+----------------------------------------------------------------
+Certificate of primality for:
+13790529750452576698109671710773784949185621244122040804792403407272729038377767162233653248852099545134831722512085881814803
+
+A == 
+57856530489201750164178576399448868489243874083056587683743345599898489554401618943240901541005080049321706789987519
+
+B == 
+119178679
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+7075985989000817742677547821106534174334812111605018857703825637170140040509067704269696198231266351631132464035671858077052876058979
+
+A == 
+13790529750452576698109671710773784949185621244122040804792403407272729038377767162233653248852099545134831722512085881814803
+
+B == 
+256552363
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+1227273006232588072907488910282307435921226646895131225407452056677899411162892829564455154080310937471747140942360789623819327234258162420463
+
+A == 
+7075985989000817742677547821106534174334812111605018857703825637170140040509067704269696198231266351631132464035671858077052876058979
+
+B == 
+86720989
+
+G == 5
+----------------------------------------------------------------
+Certificate of primality for:
+446764896913554613686067036908702877942872355053329937790398156069936255759889884246832779737114032666318220500106499161852193765380831330106375235763
+
+A == 
+1227273006232588072907488910282307435921226646895131225407452056677899411162892829564455154080310937471747140942360789623819327234258162420463
+
+B == 
+182015287
+
+G == 2
+----------------------------------------------------------------
+Certificate of primality for:
+5290203010849586596974953717018896543907195901082056939587768479377028575911127944611236020459652034082251335583308070846379514569838984811187823420951275243
+
+A == 
+446764896913554613686067036908702877942872355053329937790398156069936255759889884246832779737114032666318220500106499161852193765380831330106375235763
+
+B == 
+5920567
+
+G == 2
+----------------------------------------------------------------
+
+
+Took 3454 ticks, 521 bits
+P == 5290203010849586596974953717018896543907195901082056939587768479377028575911127944611236020459652034082251335583308070846379514569838984811187823420951275243
+Q == 446764896913554613686067036908702877942872355053329937790398156069936255759889884246832779737114032666318220500106499161852193765380831330106375235763
diff --git a/libtommath/etc/timer.asm b/libtommath/etc/timer.asm
new file mode 100644
index 0000000..35890d9
--- /dev/null
+++ b/libtommath/etc/timer.asm
@@ -0,0 +1,37 @@
+; x86 timer in NASM
+;
+; Tom St Denis, tomstdenis@iahu.ca
+[bits 32]
+[section .data]
+time dd 0, 0
+
+[section .text]
+
+%ifdef USE_ELF
+[global t_start]
+t_start:
+%else
+[global _t_start]
+_t_start:
+%endif
+   push edx
+   push eax
+   rdtsc
+   mov [time+0],edx
+   mov [time+4],eax
+   pop eax
+   pop edx
+   ret
+   
+%ifdef USE_ELF
+[global t_read]
+t_read:
+%else
+[global _t_read]
+_t_read:
+%endif
+   rdtsc
+   sub eax,[time+4]
+   sbb edx,[time+0]
+   ret
+   
+\ No newline at end of file
diff --git a/libtommath/etc/tune.c b/libtommath/etc/tune.c
new file mode 100644
index 0000000..d054d10
--- /dev/null
+++ b/libtommath/etc/tune.c
@@ -0,0 +1,138 @@
+/* Tune the Karatsuba parameters
+ *
+ * Tom St Denis, tomstdenis@iahu.ca
+ */
+#include <tommath.h>
+#include <time.h>
+
+/* how many times todo each size mult.  Depends on your computer.  For slow computers
+ * this can be low like 5 or 10.  For fast [re: Athlon] should be 25 - 50 or so 
+ */
+#define TIMES (1UL<<14UL)
+
+/* RDTSC from Scott Duplichan */
+static ulong64 TIMFUNC (void)
+   {
+   #if defined __GNUC__
+      #if defined(__i386__) || defined(__x86_64__)
+         unsigned long long a;
+         __asm__ __volatile__ ("rdtsc\nmovl %%eax,%0\nmovl %%edx,4+%0\n"::"m"(a):"%eax","%edx");
+         return a;
+      #else /* gcc-IA64 version */
+         unsigned long result;
+         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
+         while (__builtin_expect ((int) result == -1, 0))
+         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
+         return result;
+      #endif
+
+   // Microsoft and Intel Windows compilers
+   #elif defined _M_IX86
+     __asm rdtsc
+   #elif defined _M_AMD64
+     return __rdtsc ();
+   #elif defined _M_IA64
+     #if defined __INTEL_COMPILER
+       #include <ia64intrin.h>
+     #endif
+      return __getReg (3116);
+   #else
+     #error need rdtsc function for this build
+   #endif
+   }
+
+
+#ifndef X86_TIMER
+
+/* generic ISO C timer */
+ulong64 LBL_T;
+void t_start(void) { LBL_T = TIMFUNC(); }
+ulong64 t_read(void) { return TIMFUNC() - LBL_T; }
+
+#else
+extern void t_start(void);
+extern ulong64 t_read(void);
+#endif
+
+ulong64 time_mult(int size, int s)
+{
+  unsigned long     x;
+  mp_int  a, b, c;
+  ulong64 t1;
+
+  mp_init (&a);
+  mp_init (&b);
+  mp_init (&c);
+
+  mp_rand (&a, size);
+  mp_rand (&b, size);
+
+  if (s == 1) { 
+      KARATSUBA_MUL_CUTOFF = size;
+  } else {
+      KARATSUBA_MUL_CUTOFF = 100000;
+  }
+
+  t_start();
+  for (x = 0; x < TIMES; x++) {
+      mp_mul(&a,&b,&c);
+  }
+  t1 = t_read();
+  mp_clear (&a);
+  mp_clear (&b);
+  mp_clear (&c);
+  return t1;
+}
+
+ulong64 time_sqr(int size, int s)
+{
+  unsigned long     x;
+  mp_int  a, b;
+  ulong64 t1;
+
+  mp_init (&a);
+  mp_init (&b);
+
+  mp_rand (&a, size);
+
+  if (s == 1) { 
+      KARATSUBA_SQR_CUTOFF = size;
+  } else {
+      KARATSUBA_SQR_CUTOFF = 100000;
+  }
+
+  t_start();
+  for (x = 0; x < TIMES; x++) {
+      mp_sqr(&a,&b);
+  }
+  t1 = t_read();
+  mp_clear (&a);
+  mp_clear (&b);
+  return t1;
+}
+
+int
+main (void)
+{
+  ulong64 t1, t2;
+  int x, y;
+
+  for (x = 8; ; x += 2) { 
+     t1 = time_mult(x, 0);
+     t2 = time_mult(x, 1);
+     printf("%d: %9llu %9llu, %9llu\n", x, t1, t2, t2 - t1);
+     if (t2 < t1) break;
+  }
+  y = x;
+
+  for (x = 8; ; x += 2) { 
+     t1 = time_sqr(x, 0);
+     t2 = time_sqr(x, 1);
+     printf("%d: %9llu %9llu, %9llu\n", x, t1, t2, t2 - t1);
+     if (t2 < t1) break;
+  }
+  printf("KARATSUBA_MUL_CUTOFF = %d\n", y);
+  printf("KARATSUBA_SQR_CUTOFF = %d\n", x);
+
+  return 0;
+}
diff --git a/libtommath/gen.pl b/libtommath/gen.pl
new file mode 100644
index 0000000..7236591
--- /dev/null
+++ b/libtommath/gen.pl
@@ -0,0 +1,17 @@
+#!/usr/bin/perl -w
+#
+# Generates a "single file" you can use to quickly
+# add the whole source without any makefile troubles
+#
+use strict;
+
+open( OUT, ">mpi.c" ) or die "Couldn't open mpi.c for writing: $!";
+foreach my $filename (glob "bn*.c") {
+   open( SRC, "<$filename" ) or die "Couldn't open $filename for reading: $!";
+   print OUT "/* Start: $filename */\n";
+   print OUT while <SRC>;
+   print OUT "\n/* End: $filename */\n\n";
+   close SRC or die "Error closing $filename after reading: $!";
+}
+print OUT "\n/* EOF */\n";
+close OUT or die "Error closing mpi.c after writing: $!";
+\ No newline at end of file
diff --git a/libtommath/logs/README b/libtommath/logs/README
new file mode 100644
index 0000000..ea20c81
--- /dev/null
+++ b/libtommath/logs/README
@@ -0,0 +1,13 @@
+To use the pretty graphs you have to first build/run the ltmtest from the root directory of the package.  
+Todo this type 
+
+make timing ; ltmtest
+
+in the root.  It will run for a while [about ten minutes on most PCs] and produce a series of .log files in logs/.
+
+After doing that run "gnuplot graphs.dem" to make the PNGs.  If you managed todo that all so far just open index.html to view
+them all :-)
+
+Have fun
+
+Tom
+\ No newline at end of file
diff --git a/libtommath/logs/add.log b/libtommath/logs/add.log
new file mode 100644
index 0000000..43503ac
--- /dev/null
+++ b/libtommath/logs/add.log
@@ -0,0 +1,16 @@
+480        87
+960       111
+1440       135
+1920       159
+2400       200
+2880       224
+3360       248
+3840       272
+4320       296
+4800       320
+5280       344
+5760       368
+6240       392
+6720       416
+7200       440
+7680       464
diff --git a/libtommath/logs/addsub.png b/libtommath/logs/addsub.png
new file mode 100644
index 0000000..a5679ac
--- /dev/null
+++ b/libtommath/logs/addsub.png
diff --git a/libtommath/logs/expt.log b/libtommath/logs/expt.log
new file mode 100644
index 0000000..920ba55
--- /dev/null
+++ b/libtommath/logs/expt.log
@@ -0,0 +1,7 @@
+513   1489160
+769   3688476
+1025   8162061
+2049  49260015
+2561  89579052
+3073 148797060
+4097 324449263
diff --git a/libtommath/logs/expt.png b/libtommath/logs/expt.png
new file mode 100644
index 0000000..9ee8bb7
--- /dev/null
+++ b/libtommath/logs/expt.png
diff --git a/libtommath/logs/expt_2k.log b/libtommath/logs/expt_2k.log
new file mode 100644
index 0000000..56b50db
--- /dev/null
+++ b/libtommath/logs/expt_2k.log
@@ -0,0 +1,5 @@
+607   2272809
+1279   9557382
+2203  36250309
+3217  87666486
+4253 174168369
diff --git a/libtommath/logs/expt_2kl.log b/libtommath/logs/expt_2kl.log
new file mode 100644
index 0000000..b2eb8c2
--- /dev/null
+++ b/libtommath/logs/expt_2kl.log
@@ -0,0 +1,4 @@
+1024   6954080
+2048  35993987
+4096 176068521
+521   1683720
diff --git a/libtommath/logs/expt_dr.log b/libtommath/logs/expt_dr.log
new file mode 100644
index 0000000..eb93fc9
--- /dev/null
+++ b/libtommath/logs/expt_dr.log
@@ -0,0 +1,7 @@
+532   1989592
+784   3898697
+1036   6519700
+1540  15676650
+2072  33128187
+3080  82963362
+4116 168358337
diff --git a/libtommath/logs/graphs.dem b/libtommath/logs/graphs.dem
new file mode 100644
index 0000000..dfaf613
--- /dev/null
+++ b/libtommath/logs/graphs.dem
@@ -0,0 +1,17 @@
+set terminal png
+set size 1.75
+set ylabel "Cycles per Operation"
+set xlabel "Operand size (bits)"
+
+set output "addsub.png"
+plot 'add.log' smooth bezier title "Addition", 'sub.log' smooth bezier title "Subtraction"
+
+set output "mult.png"
+plot 'sqr.log' smooth bezier title "Squaring (without Karatsuba)", 'sqr_kara.log' smooth bezier title "Squaring (Karatsuba)", 'mult.log' smooth bezier title "Multiplication (without Karatsuba)", 'mult_kara.log' smooth bezier title "Multiplication (Karatsuba)"
+
+set output "expt.png"
+plot 'expt.log' smooth bezier title "Exptmod (Montgomery)", 'expt_dr.log' smooth bezier title "Exptmod (Dimminished Radix)", 'expt_2k.log' smooth bezier title "Exptmod (2k Reduction)"
+
+set output "invmod.png"
+plot 'invmod.log' smooth bezier title "Modular Inverse"
+
diff --git a/libtommath/logs/index.html b/libtommath/logs/index.html
new file mode 100644
index 0000000..19fe403
--- /dev/null
+++ b/libtommath/logs/index.html
@@ -0,0 +1,24 @@
+<html>
+<head>
+<title>LibTomMath Log Plots</title>
+</head>
+<body>
+
+<h1>Addition and Subtraction</h1>
+<center><img src=addsub.png></center>
+<hr>
+
+<h1>Multipliers</h1>
+<center><img src=mult.png></center>
+<hr>
+
+<h1>Exptmod</h1>
+<center><img src=expt.png></center>
+<hr>
+
+<h1>Modular Inverse</h1>
+<center><img src=invmod.png></center>
+<hr>
+
+</body>
+</html>
+\ No newline at end of file
diff --git a/libtommath/logs/invmod.log b/libtommath/logs/invmod.log
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/libtommath/logs/invmod.log
diff --git a/libtommath/logs/invmod.png b/libtommath/logs/invmod.png
new file mode 100644
index 0000000..0a8a4ad
--- /dev/null
+++ b/libtommath/logs/invmod.png
diff --git a/libtommath/logs/mult.log b/libtommath/logs/mult.log
new file mode 100644
index 0000000..33563fc
--- /dev/null
+++ b/libtommath/logs/mult.log
@@ -0,0 +1,84 @@
+271       555
+390       855
+508      1161
+631      1605
+749      2117
+871      2687
+991      3329
+1108      4084
+1231      4786
+1351      5624
+1470      6392
+1586      7364
+1710      8218
+1830      9255
+1951     10217
+2067     11461
+2191     12463
+2308     13677
+2430     14800
+2551     16232
+2671     17460
+2791     18899
+2902     20247
+3028     21902
+3151     23240
+3267     24927
+3391     26441
+3511     28277
+3631     29838
+3749     31751
+3869     33673
+3989     35431
+4111     37518
+4231     39426
+4349     41504
+4471     43567
+4591     45786
+4711     47876
+4831     50299
+4951     52427
+5071     54785
+5189     57241
+5307     59730
+5431     62194
+5551     64761
+5670     67322
+5789     70073
+5907     72663
+6030     75437
+6151     78242
+6268     81202
+6389     83948
+6509     86985
+6631     89903
+6747     93184
+6869     96044
+6991     99286
+7109    102395
+7229    105917
+7351    108940
+7470    112490
+7589    115702
+7711    119508
+7831    122632
+7951    126410
+8071    129808
+8190    133895
+8311    137146
+8431    141218
+8549    144732
+8667    149131
+8790    152462
+8911    156754
+9030    160479
+9149    165138
+9271    168601
+9391    173185
+9511    176988
+9627    181976
+9751    185539
+9870    190388
+9991    194335
+10110    199605
+10228    203298
diff --git a/libtommath/logs/mult.png b/libtommath/logs/mult.png
new file mode 100644
index 0000000..4f7a4ee
--- /dev/null
+++ b/libtommath/logs/mult.png
diff --git a/libtommath/logs/mult_kara.log b/libtommath/logs/mult_kara.log
new file mode 100644
index 0000000..7136c79
--- /dev/null
+++ b/libtommath/logs/mult_kara.log
@@ -0,0 +1,84 @@
+271       560
+391       870
+511      1159
+631      1605
+750      2111
+871      2737
+991      3361
+1111      4054
+1231      4778
+1351      5600
+1471      6404
+1591      7323
+1710      8255
+1831      9239
+1948     10257
+2070     11397
+2190     12531
+2308     13665
+2429     14870
+2550     16175
+2671     17539
+2787     18879
+2911     20350
+3031     21807
+3150     23415
+3270     24897
+3388     26567
+3511     28205
+3627     30076
+3751     31744
+3869     33657
+3991     35425
+4111     37522
+4229     39363
+4351     41503
+4470     43491
+4590     45827
+4711     47795
+4828     50166
+4951     52318
+5070     54911
+5191     57036
+5308     58237
+5431     60248
+5551     62678
+5671     64786
+5791     67294
+5908     69343
+6031     71607
+6151     74166
+6271     76590
+6391     78734
+6511     81175
+6631     83742
+6750     86403
+6868     88873
+6990     91150
+7110     94211
+7228     96922
+7351     99445
+7469    102216
+7589    104968
+7711    108113
+7827    110758
+7950    113714
+8071    116511
+8186    119643
+8310    122679
+8425    125581
+8551    128715
+8669    131778
+8788    135116
+8910    138138
+9031    141628
+9148    144754
+9268    148367
+9391    151551
+9511    155033
+9631    158652
+9751    162125
+9871    165248
+9988    168627
+10111    172427
+10231    176412
diff --git a/libtommath/logs/sqr.log b/libtommath/logs/sqr.log
new file mode 100644
index 0000000..cd29fc5
--- /dev/null
+++ b/libtommath/logs/sqr.log
@@ -0,0 +1,84 @@
+265       562
+389       882
+509      1207
+631      1572
+750      1990
+859      2433
+991      2894
+1109      3555
+1230      4228
+1350      5018
+1471      5805
+1591      6579
+1709      7415
+1829      8329
+1949      9225
+2071     10139
+2188     11239
+2309     12178
+2431     13212
+2551     14294
+2671     15551
+2791     16512
+2911     17718
+3030     18876
+3150     20259
+3270     21374
+3391     22650
+3511     23948
+3631     25493
+3750     26756
+3870     28225
+3989     29705
+4110     31409
+4230     32834
+4351     34327
+4471     35818
+4591     37636
+4711     39228
+4830     40868
+4949     42393
+5070     44541
+5191     46269
+5310     48162
+5429     49728
+5548     51985
+5671     53948
+5791     55885
+5910     57584
+6031     60082
+6150     62239
+6270     64309
+6390     66014
+6511     68766
+6631     71012
+6750     73172
+6871     74952
+6991     77909
+7111     80371
+7231     82666
+7351     84531
+7469     87698
+7589     90318
+7711    225384
+7830    232428
+7950    240009
+8070    246522
+8190    253662
+8310    260961
+8431    269253
+8549    275743
+8671    283769
+8789    290811
+8911    300034
+9030    306873
+9149    315085
+9270    323944
+9390    332390
+9508    337519
+9631    348986
+9749    356904
+9871    367013
+9989    373831
+10108    381033
+10230    393475
diff --git a/libtommath/logs/sqr.old b/libtommath/logs/sqr.old
new file mode 100644
index 0000000..3c85882
--- /dev/null
+++ b/libtommath/logs/sqr.old
@@ -0,0 +1,17 @@
+896    382617
+1344    207161
+1792    131522
+2240     90775
+2688     66652
+3136     50955
+3584     11678
+4032      9342
+4480      7684
+4928      6382
+5376      5399
+5824      4545
+6272      3994
+6720      3490
+7168      3075
+7616      2733
+8064      2428
diff --git a/libtommath/logs/sqr_kara.log b/libtommath/logs/sqr_kara.log
new file mode 100644
index 0000000..06355a7
--- /dev/null
+++ b/libtommath/logs/sqr_kara.log
@@ -0,0 +1,84 @@
+271       560
+388       878
+511      1179
+629      1625
+751      1988
+871      2423
+989      2896
+1111      3561
+1231      4209
+1350      5015
+1470      5804
+1591      6556
+1709      7420
+1831      8263
+1951      9173
+2070     10153
+2191     11229
+2310     12167
+2431     13211
+2550     14309
+2671     15524
+2788     16525
+2910     17712
+3028     18822
+3148     20220
+3271     21343
+3391     22652
+3511     23944
+3630     25485
+3750     26778
+3868     28201
+3990     29653
+4111     31393
+4225     32841
+4350     34328
+4471     35786
+4590     37652
+4711     39245
+4830     40876
+4951     42433
+5068     44547
+5191     46321
+5311     48140
+5430     49727
+5550     52034
+5671     53954
+5791     55921
+5908     57597
+6031     60084
+6148     62226
+6270     64295
+6390     66045
+6511     68779
+6629     71003
+6751     73169
+6871     74992
+6991     77895
+7110     80376
+7231     82628
+7351     84468
+7470     87664
+7591     90284
+7711     91352
+7828     93995
+7950     96276
+8071     98691
+8190    101256
+8308    103631
+8431    105222
+8550    108343
+8671    110281
+8787    112764
+8911    115397
+9031    117690
+9151    120266
+9271    122715
+9391    124624
+9510    127937
+9630    130313
+9750    132914
+9871    136129
+9991    138517
+10108    141525
+10231    144225
diff --git a/libtommath/logs/sub.log b/libtommath/logs/sub.log
new file mode 100644
index 0000000..9f84fa2
--- /dev/null
+++ b/libtommath/logs/sub.log
@@ -0,0 +1,16 @@
+480        94
+960       116
+1440       140
+1920       164
+2400       205
+2880       229
+3360       253
+3840       277
+4320       299
+4800       321
+5280       345
+5760       371
+6240       395
+6720       419
+7200       441
+7680       465
diff --git a/libtommath/makefile.bcc b/libtommath/makefile.bcc
new file mode 100644
index 0000000..647c69a
--- /dev/null
+++ b/libtommath/makefile.bcc
@@ -0,0 +1,44 @@
+#
+# Borland C++Builder Makefile (makefile.bcc)
+#
+
+
+LIB = tlib
+CC = bcc32
+CFLAGS = -c -O2 -I.
+
+OBJECTS=bncore.obj bn_mp_init.obj bn_mp_clear.obj bn_mp_exch.obj bn_mp_grow.obj bn_mp_shrink.obj \
+bn_mp_clamp.obj bn_mp_zero.obj  bn_mp_set.obj bn_mp_set_int.obj bn_mp_init_size.obj bn_mp_copy.obj \
+bn_mp_init_copy.obj bn_mp_abs.obj bn_mp_neg.obj bn_mp_cmp_mag.obj bn_mp_cmp.obj bn_mp_cmp_d.obj \
+bn_mp_rshd.obj bn_mp_lshd.obj bn_mp_mod_2d.obj bn_mp_div_2d.obj bn_mp_mul_2d.obj bn_mp_div_2.obj \
+bn_mp_mul_2.obj bn_s_mp_add.obj bn_s_mp_sub.obj bn_fast_s_mp_mul_digs.obj bn_s_mp_mul_digs.obj \
+bn_fast_s_mp_mul_high_digs.obj bn_s_mp_mul_high_digs.obj bn_fast_s_mp_sqr.obj bn_s_mp_sqr.obj \
+bn_mp_add.obj bn_mp_sub.obj bn_mp_karatsuba_mul.obj bn_mp_mul.obj bn_mp_karatsuba_sqr.obj \
+bn_mp_sqr.obj bn_mp_div.obj bn_mp_mod.obj bn_mp_add_d.obj bn_mp_sub_d.obj bn_mp_mul_d.obj \
+bn_mp_div_d.obj bn_mp_mod_d.obj bn_mp_expt_d.obj bn_mp_addmod.obj bn_mp_submod.obj \
+bn_mp_mulmod.obj bn_mp_sqrmod.obj bn_mp_gcd.obj bn_mp_lcm.obj bn_fast_mp_invmod.obj bn_mp_invmod.obj \
+bn_mp_reduce.obj bn_mp_montgomery_setup.obj bn_fast_mp_montgomery_reduce.obj bn_mp_montgomery_reduce.obj \
+bn_mp_exptmod_fast.obj bn_mp_exptmod.obj bn_mp_2expt.obj bn_mp_n_root.obj bn_mp_jacobi.obj bn_reverse.obj \
+bn_mp_count_bits.obj bn_mp_read_unsigned_bin.obj bn_mp_read_signed_bin.obj bn_mp_to_unsigned_bin.obj \
+bn_mp_to_signed_bin.obj bn_mp_unsigned_bin_size.obj bn_mp_signed_bin_size.obj  \
+bn_mp_xor.obj bn_mp_and.obj bn_mp_or.obj bn_mp_rand.obj bn_mp_montgomery_calc_normalization.obj \
+bn_mp_prime_is_divisible.obj bn_prime_tab.obj bn_mp_prime_fermat.obj bn_mp_prime_miller_rabin.obj \
+bn_mp_prime_is_prime.obj bn_mp_prime_next_prime.obj bn_mp_dr_reduce.obj \
+bn_mp_dr_is_modulus.obj bn_mp_dr_setup.obj bn_mp_reduce_setup.obj \
+bn_mp_toom_mul.obj bn_mp_toom_sqr.obj bn_mp_div_3.obj bn_s_mp_exptmod.obj \
+bn_mp_reduce_2k.obj bn_mp_reduce_is_2k.obj bn_mp_reduce_2k_setup.obj \
+bn_mp_reduce_2k_l.obj bn_mp_reduce_is_2k_l.obj bn_mp_reduce_2k_setup_l.obj \
+bn_mp_radix_smap.obj bn_mp_read_radix.obj bn_mp_toradix.obj bn_mp_radix_size.obj \
+bn_mp_fread.obj bn_mp_fwrite.obj bn_mp_cnt_lsb.obj bn_error.obj \
+bn_mp_init_multi.obj bn_mp_clear_multi.obj bn_mp_exteuclid.obj bn_mp_toradix_n.obj \
+bn_mp_prime_random_ex.obj bn_mp_get_int.obj bn_mp_sqrt.obj bn_mp_is_square.obj \
+bn_mp_init_set.obj bn_mp_init_set_int.obj bn_mp_invmod_slow.obj bn_mp_prime_rabin_miller_trials.obj \
+bn_mp_to_signed_bin_n.obj bn_mp_to_unsigned_bin_n.obj
+
+TARGET = libtommath.lib
+
+$(TARGET): $(OBJECTS)
+
+.c.objbjbjbj:
+	$(CC) $(CFLAGS) $<
+	$(LIB) $(TARGET) -+$@
diff --git a/libtommath/makefile.cygwin_dll b/libtommath/makefile.cygwin_dll
new file mode 100644
index 0000000..85b10c7
--- /dev/null
+++ b/libtommath/makefile.cygwin_dll
@@ -0,0 +1,51 @@
+#Makefile for Cygwin-GCC
+#
+#This makefile will build a Windows DLL [doesn't require cygwin to run] in the file
+#libtommath.dll.  The import library is in libtommath.dll.a.  Remember to add
+#"-Wl,--enable-auto-import" to your client build to avoid the auto-import warnings
+#
+#Tom St Denis
+CFLAGS  +=  -I./ -Wall -W -Wshadow -O3 -funroll-loops -mno-cygwin
+
+#x86 optimizations [should be valid for any GCC install though]
+CFLAGS  += -fomit-frame-pointer 
+
+default: windll
+
+OBJECTS=bncore.o bn_mp_init.o bn_mp_clear.o bn_mp_exch.o bn_mp_grow.o bn_mp_shrink.o \
+bn_mp_clamp.o bn_mp_zero.o  bn_mp_set.o bn_mp_set_int.o bn_mp_init_size.o bn_mp_copy.o \
+bn_mp_init_copy.o bn_mp_abs.o bn_mp_neg.o bn_mp_cmp_mag.o bn_mp_cmp.o bn_mp_cmp_d.o \
+bn_mp_rshd.o bn_mp_lshd.o bn_mp_mod_2d.o bn_mp_div_2d.o bn_mp_mul_2d.o bn_mp_div_2.o \
+bn_mp_mul_2.o bn_s_mp_add.o bn_s_mp_sub.o bn_fast_s_mp_mul_digs.o bn_s_mp_mul_digs.o \
+bn_fast_s_mp_mul_high_digs.o bn_s_mp_mul_high_digs.o bn_fast_s_mp_sqr.o bn_s_mp_sqr.o \
+bn_mp_add.o bn_mp_sub.o bn_mp_karatsuba_mul.o bn_mp_mul.o bn_mp_karatsuba_sqr.o \
+bn_mp_sqr.o bn_mp_div.o bn_mp_mod.o bn_mp_add_d.o bn_mp_sub_d.o bn_mp_mul_d.o \
+bn_mp_div_d.o bn_mp_mod_d.o bn_mp_expt_d.o bn_mp_addmod.o bn_mp_submod.o \
+bn_mp_mulmod.o bn_mp_sqrmod.o bn_mp_gcd.o bn_mp_lcm.o bn_fast_mp_invmod.o bn_mp_invmod.o \
+bn_mp_reduce.o bn_mp_montgomery_setup.o bn_fast_mp_montgomery_reduce.o bn_mp_montgomery_reduce.o \
+bn_mp_exptmod_fast.o bn_mp_exptmod.o bn_mp_2expt.o bn_mp_n_root.o bn_mp_jacobi.o bn_reverse.o \
+bn_mp_count_bits.o bn_mp_read_unsigned_bin.o bn_mp_read_signed_bin.o bn_mp_to_unsigned_bin.o \
+bn_mp_to_signed_bin.o bn_mp_unsigned_bin_size.o bn_mp_signed_bin_size.o  \
+bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o bn_mp_montgomery_calc_normalization.o \
+bn_mp_prime_is_divisible.o bn_prime_tab.o bn_mp_prime_fermat.o bn_mp_prime_miller_rabin.o \
+bn_mp_prime_is_prime.o bn_mp_prime_next_prime.o bn_mp_dr_reduce.o \
+bn_mp_dr_is_modulus.o bn_mp_dr_setup.o bn_mp_reduce_setup.o \
+bn_mp_toom_mul.o bn_mp_toom_sqr.o bn_mp_div_3.o bn_s_mp_exptmod.o \
+bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o \
+bn_mp_reduce_2k_l.o bn_mp_reduce_is_2k_l.o bn_mp_reduce_2k_setup_l.o \
+bn_mp_radix_smap.o bn_mp_read_radix.o bn_mp_toradix.o bn_mp_radix_size.o \
+bn_mp_fread.o bn_mp_fwrite.o bn_mp_cnt_lsb.o bn_error.o \
+bn_mp_init_multi.o bn_mp_clear_multi.o bn_mp_exteuclid.o bn_mp_toradix_n.o \
+bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o bn_mp_init_set.o \
+bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o \
+bn_mp_to_signed_bin_n.o bn_mp_to_unsigned_bin_n.o
+
+# make a Windows DLL via Cygwin
+windll:  $(OBJECTS)
+	gcc -mno-cygwin -mdll -o libtommath.dll -Wl,--out-implib=libtommath.dll.a -Wl,--export-all-symbols *.o
+	ranlib libtommath.dll.a
+
+# build the test program using the windows DLL
+test: $(OBJECTS) windll
+	gcc $(CFLAGS) demo/demo.c libtommath.dll.a -Wl,--enable-auto-import -o test -s
+	cd mtest ; $(CC) -O3 -fomit-frame-pointer -funroll-loops mtest.c -o mtest -s
diff --git a/libtommath/makefile.icc b/libtommath/makefile.icc
new file mode 100644
index 0000000..e764253
--- /dev/null
+++ b/libtommath/makefile.icc
@@ -0,0 +1,116 @@
+#Makefile for ICC
+#
+#Tom St Denis
+CC=icc
+
+CFLAGS  +=  -I./
+
+# optimize for SPEED
+#
+# -mcpu= can be pentium, pentiumpro (covers PII through PIII) or pentium4
+# -ax?   specifies make code specifically for ? but compatible with IA-32
+# -x?    specifies compile solely for ? [not specifically IA-32 compatible]
+#
+# where ? is 
+#   K - PIII
+#   W - first P4 [Williamette]
+#   N - P4 Northwood
+#   P - P4 Prescott
+#   B - Blend of P4 and PM [mobile]
+#
+# Default to just generic max opts
+CFLAGS += -O3 -xN
+
+#install as this user
+USER=root
+GROUP=root
+
+default: libtommath.a
+
+#default files to install
+LIBNAME=libtommath.a
+HEADERS=tommath.h
+
+#LIBPATH-The directory for libtomcrypt to be installed to.
+#INCPATH-The directory to install the header files for libtommath.
+#DATAPATH-The directory to install the pdf docs.
+DESTDIR=
+LIBPATH=/usr/lib
+INCPATH=/usr/include
+DATAPATH=/usr/share/doc/libtommath/pdf
+
+OBJECTS=bncore.o bn_mp_init.o bn_mp_clear.o bn_mp_exch.o bn_mp_grow.o bn_mp_shrink.o \
+bn_mp_clamp.o bn_mp_zero.o  bn_mp_set.o bn_mp_set_int.o bn_mp_init_size.o bn_mp_copy.o \
+bn_mp_init_copy.o bn_mp_abs.o bn_mp_neg.o bn_mp_cmp_mag.o bn_mp_cmp.o bn_mp_cmp_d.o \
+bn_mp_rshd.o bn_mp_lshd.o bn_mp_mod_2d.o bn_mp_div_2d.o bn_mp_mul_2d.o bn_mp_div_2.o \
+bn_mp_mul_2.o bn_s_mp_add.o bn_s_mp_sub.o bn_fast_s_mp_mul_digs.o bn_s_mp_mul_digs.o \
+bn_fast_s_mp_mul_high_digs.o bn_s_mp_mul_high_digs.o bn_fast_s_mp_sqr.o bn_s_mp_sqr.o \
+bn_mp_add.o bn_mp_sub.o bn_mp_karatsuba_mul.o bn_mp_mul.o bn_mp_karatsuba_sqr.o \
+bn_mp_sqr.o bn_mp_div.o bn_mp_mod.o bn_mp_add_d.o bn_mp_sub_d.o bn_mp_mul_d.o \
+bn_mp_div_d.o bn_mp_mod_d.o bn_mp_expt_d.o bn_mp_addmod.o bn_mp_submod.o \
+bn_mp_mulmod.o bn_mp_sqrmod.o bn_mp_gcd.o bn_mp_lcm.o bn_fast_mp_invmod.o bn_mp_invmod.o \
+bn_mp_reduce.o bn_mp_montgomery_setup.o bn_fast_mp_montgomery_reduce.o bn_mp_montgomery_reduce.o \
+bn_mp_exptmod_fast.o bn_mp_exptmod.o bn_mp_2expt.o bn_mp_n_root.o bn_mp_jacobi.o bn_reverse.o \
+bn_mp_count_bits.o bn_mp_read_unsigned_bin.o bn_mp_read_signed_bin.o bn_mp_to_unsigned_bin.o \
+bn_mp_to_signed_bin.o bn_mp_unsigned_bin_size.o bn_mp_signed_bin_size.o  \
+bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o bn_mp_montgomery_calc_normalization.o \
+bn_mp_prime_is_divisible.o bn_prime_tab.o bn_mp_prime_fermat.o bn_mp_prime_miller_rabin.o \
+bn_mp_prime_is_prime.o bn_mp_prime_next_prime.o bn_mp_dr_reduce.o \
+bn_mp_dr_is_modulus.o bn_mp_dr_setup.o bn_mp_reduce_setup.o \
+bn_mp_toom_mul.o bn_mp_toom_sqr.o bn_mp_div_3.o bn_s_mp_exptmod.o \
+bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o \
+bn_mp_reduce_2k_l.o bn_mp_reduce_is_2k_l.o bn_mp_reduce_2k_setup_l.o \
+bn_mp_radix_smap.o bn_mp_read_radix.o bn_mp_toradix.o bn_mp_radix_size.o \
+bn_mp_fread.o bn_mp_fwrite.o bn_mp_cnt_lsb.o bn_error.o \
+bn_mp_init_multi.o bn_mp_clear_multi.o bn_mp_exteuclid.o bn_mp_toradix_n.o \
+bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o bn_mp_init_set.o \
+bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o \
+bn_mp_to_signed_bin_n.o bn_mp_to_unsigned_bin_n.o
+
+libtommath.a:  $(OBJECTS)
+	$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
+	ranlib libtommath.a
+
+#make a profiled library (takes a while!!!)
+#
+# This will build the library with profile generation
+# then run the test demo and rebuild the library.
+# 
+# So far I've seen improvements in the MP math
+profiled:
+	make -f makefile.icc CFLAGS="$(CFLAGS) -prof_gen -DTESTING" timing
+	./ltmtest
+	rm -f *.a *.o ltmtest
+	make -f makefile.icc CFLAGS="$(CFLAGS) -prof_use"
+
+#make a single object profiled library 
+profiled_single:
+	perl gen.pl
+	$(CC) $(CFLAGS) -prof_gen -DTESTING -c mpi.c -o mpi.o
+	$(CC) $(CFLAGS) -DTESTING -DTIMER demo/demo.c mpi.o -o ltmtest
+	./ltmtest
+	rm -f *.o ltmtest
+	$(CC) $(CFLAGS) -prof_use -ip -DTESTING -c mpi.c -o mpi.o
+	$(AR) $(ARFLAGS) libtommath.a mpi.o
+	ranlib libtommath.a	
+
+install: libtommath.a
+	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(LIBPATH)
+	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
+	install -g $(GROUP) -o $(USER) $(LIBNAME) $(DESTDIR)$(LIBPATH)
+	install -g $(GROUP) -o $(USER) $(HEADERS) $(DESTDIR)$(INCPATH)
+
+test: libtommath.a demo/demo.o
+	$(CC) demo/demo.o libtommath.a -o test
+	
+mtest: test	
+	cd mtest ; $(CC) $(CFLAGS) mtest.c -o mtest
+        
+timing: libtommath.a
+	$(CC) $(CFLAGS) -DTIMER demo/timing.c libtommath.a -o ltmtest
+
+clean:
+	rm -f *.bat *.pdf *.o *.a *.obj *.lib *.exe *.dll etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest mtest/mtest.exe \
+        *.idx *.toc *.log *.aux *.dvi *.lof *.ind *.ilg *.ps *.log *.s mpi.c *.il etc/*.il *.dyn
+	cd etc ; make clean
+	cd pics ; make clean
diff --git a/libtommath/makefile.msvc b/libtommath/makefile.msvc
new file mode 100644
index 0000000..dbbf9f3
--- /dev/null
+++ b/libtommath/makefile.msvc
@@ -0,0 +1,38 @@
+#MSVC Makefile
+#
+#Tom St Denis
+
+CFLAGS = /I. /Ox /DWIN32 /W4
+
+default: library
+
+OBJECTS=bncore.obj bn_mp_init.obj bn_mp_clear.obj bn_mp_exch.obj bn_mp_grow.obj bn_mp_shrink.obj \
+bn_mp_clamp.obj bn_mp_zero.obj  bn_mp_set.obj bn_mp_set_int.obj bn_mp_init_size.obj bn_mp_copy.obj \
+bn_mp_init_copy.obj bn_mp_abs.obj bn_mp_neg.obj bn_mp_cmp_mag.obj bn_mp_cmp.obj bn_mp_cmp_d.obj \
+bn_mp_rshd.obj bn_mp_lshd.obj bn_mp_mod_2d.obj bn_mp_div_2d.obj bn_mp_mul_2d.obj bn_mp_div_2.obj \
+bn_mp_mul_2.obj bn_s_mp_add.obj bn_s_mp_sub.obj bn_fast_s_mp_mul_digs.obj bn_s_mp_mul_digs.obj \
+bn_fast_s_mp_mul_high_digs.obj bn_s_mp_mul_high_digs.obj bn_fast_s_mp_sqr.obj bn_s_mp_sqr.obj \
+bn_mp_add.obj bn_mp_sub.obj bn_mp_karatsuba_mul.obj bn_mp_mul.obj bn_mp_karatsuba_sqr.obj \
+bn_mp_sqr.obj bn_mp_div.obj bn_mp_mod.obj bn_mp_add_d.obj bn_mp_sub_d.obj bn_mp_mul_d.obj \
+bn_mp_div_d.obj bn_mp_mod_d.obj bn_mp_expt_d.obj bn_mp_addmod.obj bn_mp_submod.obj \
+bn_mp_mulmod.obj bn_mp_sqrmod.obj bn_mp_gcd.obj bn_mp_lcm.obj bn_fast_mp_invmod.obj bn_mp_invmod.obj \
+bn_mp_reduce.obj bn_mp_montgomery_setup.obj bn_fast_mp_montgomery_reduce.obj bn_mp_montgomery_reduce.obj \
+bn_mp_exptmod_fast.obj bn_mp_exptmod.obj bn_mp_2expt.obj bn_mp_n_root.obj bn_mp_jacobi.obj bn_reverse.obj \
+bn_mp_count_bits.obj bn_mp_read_unsigned_bin.obj bn_mp_read_signed_bin.obj bn_mp_to_unsigned_bin.obj \
+bn_mp_to_signed_bin.obj bn_mp_unsigned_bin_size.obj bn_mp_signed_bin_size.obj  \
+bn_mp_xor.obj bn_mp_and.obj bn_mp_or.obj bn_mp_rand.obj bn_mp_montgomery_calc_normalization.obj \
+bn_mp_prime_is_divisible.obj bn_prime_tab.obj bn_mp_prime_fermat.obj bn_mp_prime_miller_rabin.obj \
+bn_mp_prime_is_prime.obj bn_mp_prime_next_prime.obj bn_mp_dr_reduce.obj \
+bn_mp_dr_is_modulus.obj bn_mp_dr_setup.obj bn_mp_reduce_setup.obj \
+bn_mp_toom_mul.obj bn_mp_toom_sqr.obj bn_mp_div_3.obj bn_s_mp_exptmod.obj \
+bn_mp_reduce_2k.obj bn_mp_reduce_is_2k.obj bn_mp_reduce_2k_setup.obj \
+bn_mp_reduce_2k_l.obj bn_mp_reduce_is_2k_l.obj bn_mp_reduce_2k_setup_l.obj \
+bn_mp_radix_smap.obj bn_mp_read_radix.obj bn_mp_toradix.obj bn_mp_radix_size.obj \
+bn_mp_fread.obj bn_mp_fwrite.obj bn_mp_cnt_lsb.obj bn_error.obj \
+bn_mp_init_multi.obj bn_mp_clear_multi.obj bn_mp_exteuclid.obj bn_mp_toradix_n.obj \
+bn_mp_prime_random_ex.obj bn_mp_get_int.obj bn_mp_sqrt.obj bn_mp_is_square.obj \
+bn_mp_init_set.obj bn_mp_init_set_int.obj bn_mp_invmod_slow.obj bn_mp_prime_rabin_miller_trials.obj \
+bn_mp_to_signed_bin_n.obj bn_mp_to_unsigned_bin_n.obj
+
+library: $(OBJECTS)
+	lib /out:tommath.lib $(OBJECTS)
diff --git a/libtommath/makefile.shared b/libtommath/makefile.shared
new file mode 100644
index 0000000..7c35881
--- /dev/null
+++ b/libtommath/makefile.shared
@@ -0,0 +1,80 @@
+#Makefile for GCC
+#
+#Tom St Denis
+VERSION=0:35
+
+CC = libtool --mode=compile gcc
+CFLAGS  +=  -I./ -Wall -W -Wshadow -Wsign-compare
+
+#for speed 
+CFLAGS += -O3 -funroll-loops
+
+#for size 
+#CFLAGS += -Os
+
+#x86 optimizations [should be valid for any GCC install though]
+CFLAGS  += -fomit-frame-pointer
+
+#install as this user
+USER=root
+GROUP=root
+
+default: libtommath.la
+
+#default files to install
+LIBNAME=libtommath.la
+HEADERS=tommath.h tommath_class.h tommath_superclass.h
+
+#LIBPATH-The directory for libtommath to be installed to.
+#INCPATH-The directory to install the header files for libtommath.
+#DATAPATH-The directory to install the pdf docs.
+DESTDIR=
+LIBPATH=/usr/lib
+INCPATH=/usr/include
+DATAPATH=/usr/share/doc/libtommath/pdf
+
+OBJECTS=bncore.o bn_mp_init.o bn_mp_clear.o bn_mp_exch.o bn_mp_grow.o bn_mp_shrink.o \
+bn_mp_clamp.o bn_mp_zero.o  bn_mp_set.o bn_mp_set_int.o bn_mp_init_size.o bn_mp_copy.o \
+bn_mp_init_copy.o bn_mp_abs.o bn_mp_neg.o bn_mp_cmp_mag.o bn_mp_cmp.o bn_mp_cmp_d.o \
+bn_mp_rshd.o bn_mp_lshd.o bn_mp_mod_2d.o bn_mp_div_2d.o bn_mp_mul_2d.o bn_mp_div_2.o \
+bn_mp_mul_2.o bn_s_mp_add.o bn_s_mp_sub.o bn_fast_s_mp_mul_digs.o bn_s_mp_mul_digs.o \
+bn_fast_s_mp_mul_high_digs.o bn_s_mp_mul_high_digs.o bn_fast_s_mp_sqr.o bn_s_mp_sqr.o \
+bn_mp_add.o bn_mp_sub.o bn_mp_karatsuba_mul.o bn_mp_mul.o bn_mp_karatsuba_sqr.o \
+bn_mp_sqr.o bn_mp_div.o bn_mp_mod.o bn_mp_add_d.o bn_mp_sub_d.o bn_mp_mul_d.o \
+bn_mp_div_d.o bn_mp_mod_d.o bn_mp_expt_d.o bn_mp_addmod.o bn_mp_submod.o \
+bn_mp_mulmod.o bn_mp_sqrmod.o bn_mp_gcd.o bn_mp_lcm.o bn_fast_mp_invmod.o bn_mp_invmod.o \
+bn_mp_reduce.o bn_mp_montgomery_setup.o bn_fast_mp_montgomery_reduce.o bn_mp_montgomery_reduce.o \
+bn_mp_exptmod_fast.o bn_mp_exptmod.o bn_mp_2expt.o bn_mp_n_root.o bn_mp_jacobi.o bn_reverse.o \
+bn_mp_count_bits.o bn_mp_read_unsigned_bin.o bn_mp_read_signed_bin.o bn_mp_to_unsigned_bin.o \
+bn_mp_to_signed_bin.o bn_mp_unsigned_bin_size.o bn_mp_signed_bin_size.o  \
+bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o bn_mp_montgomery_calc_normalization.o \
+bn_mp_prime_is_divisible.o bn_prime_tab.o bn_mp_prime_fermat.o bn_mp_prime_miller_rabin.o \
+bn_mp_prime_is_prime.o bn_mp_prime_next_prime.o bn_mp_dr_reduce.o \
+bn_mp_dr_is_modulus.o bn_mp_dr_setup.o bn_mp_reduce_setup.o \
+bn_mp_toom_mul.o bn_mp_toom_sqr.o bn_mp_div_3.o bn_s_mp_exptmod.o \
+bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o \
+bn_mp_reduce_2k_l.o bn_mp_reduce_is_2k_l.o bn_mp_reduce_2k_setup_l.o \
+bn_mp_radix_smap.o bn_mp_read_radix.o bn_mp_toradix.o bn_mp_radix_size.o \
+bn_mp_fread.o bn_mp_fwrite.o bn_mp_cnt_lsb.o bn_error.o \
+bn_mp_init_multi.o bn_mp_clear_multi.o bn_mp_exteuclid.o bn_mp_toradix_n.o \
+bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o bn_mp_init_set.o \
+bn_mp_init_set_int.o bn_mp_invmod_slow.o bn_mp_prime_rabin_miller_trials.o \
+bn_mp_to_signed_bin_n.o bn_mp_to_unsigned_bin_n.o
+
+
+libtommath.la:  $(OBJECTS)
+	libtool --mode=link gcc *.lo -o libtommath.la -rpath $(LIBPATH) -version-info $(VERSION)
+	libtool --mode=link gcc *.o -o libtommath.a 
+	libtool --mode=install install -c libtommath.la $(LIBPATH)/libtommath.la
+	install -d -g $(GROUP) -o $(USER) $(DESTDIR)$(INCPATH)
+	install -g $(GROUP) -o $(USER) $(HEADERS) $(DESTDIR)$(INCPATH)
+
+test: libtommath.a demo/demo.o
+	gcc $(CFLAGS) -c demo/demo.c -o demo/demo.o
+	libtool --mode=link gcc -o test demo/demo.o libtommath.la
+	
+mtest: test	
+	cd mtest ; gcc $(CFLAGS) mtest.c -o mtest -s
+        
+timing: libtommath.la
+	gcc $(CFLAGS) -DTIMER demo/timing.c libtommath.a -o ltmtest -s
diff --git a/libtommath/mtest/logtab.h b/libtommath/mtest/logtab.h
new file mode 100644
index 0000000..68462bd
--- /dev/null
+++ b/libtommath/mtest/logtab.h
@@ -0,0 +1,20 @@
+const float s_logv_2[] = {
+   0.000000000, 0.000000000, 1.000000000, 0.630929754, 	/*  0  1  2  3 */
+   0.500000000, 0.430676558, 0.386852807, 0.356207187, 	/*  4  5  6  7 */
+   0.333333333, 0.315464877, 0.301029996, 0.289064826, 	/*  8  9 10 11 */
+   0.278942946, 0.270238154, 0.262649535, 0.255958025, 	/* 12 13 14 15 */
+   0.250000000, 0.244650542, 0.239812467, 0.235408913, 	/* 16 17 18 19 */
+   0.231378213, 0.227670249, 0.224243824, 0.221064729, 	/* 20 21 22 23 */
+   0.218104292, 0.215338279, 0.212746054, 0.210309918, 	/* 24 25 26 27 */
+   0.208014598, 0.205846832, 0.203795047, 0.201849087, 	/* 28 29 30 31 */
+   0.200000000, 0.198239863, 0.196561632, 0.194959022, 	/* 32 33 34 35 */
+   0.193426404, 0.191958720, 0.190551412, 0.189200360, 	/* 36 37 38 39 */
+   0.187901825, 0.186652411, 0.185449023, 0.184288833, 	/* 40 41 42 43 */
+   0.183169251, 0.182087900, 0.181042597, 0.180031327, 	/* 44 45 46 47 */
+   0.179052232, 0.178103594, 0.177183820, 0.176291434, 	/* 48 49 50 51 */
+   0.175425064, 0.174583430, 0.173765343, 0.172969690, 	/* 52 53 54 55 */
+   0.172195434, 0.171441601, 0.170707280, 0.169991616, 	/* 56 57 58 59 */
+   0.169293808, 0.168613099, 0.167948779, 0.167300179, 	/* 60 61 62 63 */
+   0.166666667
+};
+
diff --git a/libtommath/mtest/mpi-config.h b/libtommath/mtest/mpi-config.h
new file mode 100644
index 0000000..1baf2c2
--- /dev/null
+++ b/libtommath/mtest/mpi-config.h
@@ -0,0 +1,86 @@
+/* Default configuration for MPI library */
+/* $Id: mpi-config.h,v 1.8 2000/07/11 04:28:14 sting Exp sting $ */
+
+#ifndef MPI_CONFIG_H_
+#define MPI_CONFIG_H_
+
+/*
+  For boolean options, 
+  0 = no
+  1 = yes
+
+  Other options are documented individually.
+
+ */
+
+#ifndef MP_IOFUNC
+#define MP_IOFUNC     0  /* include mp_print() ?                */
+#endif
+
+#ifndef MP_MODARITH
+#define MP_MODARITH   1  /* include modular arithmetic ?        */
+#endif
+
+#ifndef MP_NUMTH
+#define MP_NUMTH      1  /* include number theoretic functions? */
+#endif
+
+#ifndef MP_LOGTAB
+#define MP_LOGTAB     1  /* use table of logs instead of log()? */
+#endif
+
+#ifndef MP_MEMSET
+#define MP_MEMSET     1  /* use memset() to zero buffers?       */
+#endif
+
+#ifndef MP_MEMCPY
+#define MP_MEMCPY     1  /* use memcpy() to copy buffers?       */
+#endif
+
+#ifndef MP_CRYPTO
+#define MP_CRYPTO     1  /* erase memory on free?               */
+#endif
+
+#ifndef MP_ARGCHK
+/*
+  0 = no parameter checks
+  1 = runtime checks, continue execution and return an error to caller
+  2 = assertions; dump core on parameter errors
+ */
+#define MP_ARGCHK     2  /* how to check input arguments        */
+#endif
+
+#ifndef MP_DEBUG
+#define MP_DEBUG      0  /* print diagnostic output?            */
+#endif
+
+#ifndef MP_DEFPREC
+#define MP_DEFPREC    64 /* default precision, in digits        */
+#endif
+
+#ifndef MP_MACRO
+#define MP_MACRO      1  /* use macros for frequent calls?      */
+#endif
+
+#ifndef MP_SQUARE
+#define MP_SQUARE     1  /* use separate squaring code?         */
+#endif
+
+#ifndef MP_PTAB_SIZE
+/*
+  When building mpprime.c, we build in a table of small prime
+  values to use for primality testing.  The more you include,
+  the more space they take up.  See primes.c for the possible
+  values (currently 16, 32, 64, 128, 256, and 6542)
+ */
+#define MP_PTAB_SIZE  128  /* how many built-in primes?         */
+#endif
+
+#ifndef MP_COMPAT_MACROS
+#define MP_COMPAT_MACROS 1   /* define compatibility macros?    */
+#endif
+
+#endif /* ifndef MPI_CONFIG_H_ */
+
+
+/* crc==3287762869, version==2, Sat Feb 02 06:43:53 2002 */
diff --git a/libtommath/mtest/mpi-types.h b/libtommath/mtest/mpi-types.h
new file mode 100644
index 0000000..e097188
--- /dev/null
+++ b/libtommath/mtest/mpi-types.h
@@ -0,0 +1,16 @@
+/* Type definitions generated by 'types.pl' */
+typedef char               mp_sign;
+typedef unsigned short     mp_digit;  /* 2 byte type */
+typedef unsigned int       mp_word;   /* 4 byte type */
+typedef unsigned int       mp_size;
+typedef int                mp_err;
+
+#define MP_DIGIT_BIT       (CHAR_BIT*sizeof(mp_digit))
+#define MP_DIGIT_MAX       USHRT_MAX
+#define MP_WORD_BIT        (CHAR_BIT*sizeof(mp_word))
+#define MP_WORD_MAX        UINT_MAX
+
+#define MP_DIGIT_SIZE      2
+#define DIGIT_FMT          "%04X"
+#define RADIX              (MP_DIGIT_MAX+1)
+
diff --git a/libtommath/mtest/mpi.c b/libtommath/mtest/mpi.c
new file mode 100644
index 0000000..f7688f3
--- /dev/null
+++ b/libtommath/mtest/mpi.c
@@ -0,0 +1,3981 @@
+/*
+    mpi.c
+
+    by Michael J. Fromberger <sting@linguist.dartmouth.edu>
+    Copyright (C) 1998 Michael J. Fromberger, All Rights Reserved
+
+    Arbitrary precision integer arithmetic library
+
+    $Id: mpi.c,v 1.22 2001/09/14 15:11:20 sting Exp sting $
+ */
+
+#include "mpi.h"
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#if MP_DEBUG
+#include <stdio.h>
+
+#define DIAG(T,V) {fprintf(stderr,T);mp_print(V,stderr);fputc('\n',stderr);}
+#else
+#define DIAG(T,V)
+#endif
+
+/* 
+   If MP_LOGTAB is not defined, use the math library to compute the
+   logarithms on the fly.  Otherwise, use the static table below.
+   Pick which works best for your system.
+ */
+#if MP_LOGTAB
+
+/* {{{ s_logv_2[] - log table for 2 in various bases */
+
+/*
+  A table of the logs of 2 for various bases (the 0 and 1 entries of
+  this table are meaningless and should not be referenced).  
+
+  This table is used to compute output lengths for the mp_toradix()
+  function.  Since a number n in radix r takes up about log_r(n)
+  digits, we estimate the output size by taking the least integer
+  greater than log_r(n), where:
+
+  log_r(n) = log_2(n) * log_r(2)
+
+  This table, therefore, is a table of log_r(2) for 2 <= r <= 36,
+  which are the output bases supported.  
+ */
+
+#include "logtab.h"
+
+/* }}} */
+#define LOG_V_2(R)  s_logv_2[(R)]
+
+#else
+
+#include <math.h>
+#define LOG_V_2(R)  (log(2.0)/log(R))
+
+#endif
+
+/* Default precision for newly created mp_int's      */
+static unsigned int s_mp_defprec = MP_DEFPREC;
+
+/* {{{ Digit arithmetic macros */
+
+/*
+  When adding and multiplying digits, the results can be larger than
+  can be contained in an mp_digit.  Thus, an mp_word is used.  These
+  macros mask off the upper and lower digits of the mp_word (the
+  mp_word may be more than 2 mp_digits wide, but we only concern
+  ourselves with the low-order 2 mp_digits)
+
+  If your mp_word DOES have more than 2 mp_digits, you need to
+  uncomment the first line, and comment out the second.
+ */
+
+/* #define  CARRYOUT(W)  (((W)>>DIGIT_BIT)&MP_DIGIT_MAX) */
+#define  CARRYOUT(W)  ((W)>>DIGIT_BIT)
+#define  ACCUM(W)     ((W)&MP_DIGIT_MAX)
+
+/* }}} */
+
+/* {{{ Comparison constants */
+
+#define  MP_LT       -1
+#define  MP_EQ        0
+#define  MP_GT        1
+
+/* }}} */
+
+/* {{{ Constant strings */
+
+/* Constant strings returned by mp_strerror() */
+static const char *mp_err_string[] = {
+  "unknown result code",     /* say what?            */
+  "boolean true",            /* MP_OKAY, MP_YES      */
+  "boolean false",           /* MP_NO                */
+  "out of memory",           /* MP_MEM               */
+  "argument out of range",   /* MP_RANGE             */
+  "invalid input parameter", /* MP_BADARG            */
+  "result is undefined"      /* MP_UNDEF             */
+};
+
+/* Value to digit maps for radix conversion   */
+
+/* s_dmap_1 - standard digits and letters */
+static const char *s_dmap_1 = 
+  "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
+
+#if 0
+/* s_dmap_2 - base64 ordering for digits  */
+static const char *s_dmap_2 =
+  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+#endif
+
+/* }}} */
+
+/* {{{ Static function declarations */
+
+/* 
+   If MP_MACRO is false, these will be defined as actual functions;
+   otherwise, suitable macro definitions will be used.  This works
+   around the fact that ANSI C89 doesn't support an 'inline' keyword
+   (although I hear C9x will ... about bloody time).  At present, the
+   macro definitions are identical to the function bodies, but they'll
+   expand in place, instead of generating a function call.
+
+   I chose these particular functions to be made into macros because
+   some profiling showed they are called a lot on a typical workload,
+   and yet they are primarily housekeeping.
+ */
+#if MP_MACRO == 0
+ void     s_mp_setz(mp_digit *dp, mp_size count); /* zero digits           */
+ void     s_mp_copy(mp_digit *sp, mp_digit *dp, mp_size count); /* copy    */
+ void    *s_mp_alloc(size_t nb, size_t ni);       /* general allocator     */
+ void     s_mp_free(void *ptr);                   /* general free function */
+#else
+
+ /* Even if these are defined as macros, we need to respect the settings
+    of the MP_MEMSET and MP_MEMCPY configuration options...
+  */
+ #if MP_MEMSET == 0
+  #define  s_mp_setz(dp, count) \
+       {int ix;for(ix=0;ix<(count);ix++)(dp)[ix]=0;}
+ #else
+  #define  s_mp_setz(dp, count) memset(dp, 0, (count) * sizeof(mp_digit))
+ #endif /* MP_MEMSET */
+
+ #if MP_MEMCPY == 0
+  #define  s_mp_copy(sp, dp, count) \
+       {int ix;for(ix=0;ix<(count);ix++)(dp)[ix]=(sp)[ix];}
+ #else
+  #define  s_mp_copy(sp, dp, count) memcpy(dp, sp, (count) * sizeof(mp_digit))
+ #endif /* MP_MEMCPY */
+
+ #define  s_mp_alloc(nb, ni)  calloc(nb, ni)
+ #define  s_mp_free(ptr) {if(ptr) free(ptr);}
+#endif /* MP_MACRO */
+
+mp_err   s_mp_grow(mp_int *mp, mp_size min);   /* increase allocated size */
+mp_err   s_mp_pad(mp_int *mp, mp_size min);    /* left pad with zeroes    */
+
+void     s_mp_clamp(mp_int *mp);               /* clip leading zeroes     */
+
+void     s_mp_exch(mp_int *a, mp_int *b);      /* swap a and b in place   */
+
+mp_err   s_mp_lshd(mp_int *mp, mp_size p);     /* left-shift by p digits  */
+void     s_mp_rshd(mp_int *mp, mp_size p);     /* right-shift by p digits */
+void     s_mp_div_2d(mp_int *mp, mp_digit d);  /* divide by 2^d in place  */
+void     s_mp_mod_2d(mp_int *mp, mp_digit d);  /* modulo 2^d in place     */
+mp_err   s_mp_mul_2d(mp_int *mp, mp_digit d);  /* multiply by 2^d in place*/
+void     s_mp_div_2(mp_int *mp);               /* divide by 2 in place    */
+mp_err   s_mp_mul_2(mp_int *mp);               /* multiply by 2 in place  */
+mp_digit s_mp_norm(mp_int *a, mp_int *b);      /* normalize for division  */
+mp_err   s_mp_add_d(mp_int *mp, mp_digit d);   /* unsigned digit addition */
+mp_err   s_mp_sub_d(mp_int *mp, mp_digit d);   /* unsigned digit subtract */
+mp_err   s_mp_mul_d(mp_int *mp, mp_digit d);   /* unsigned digit multiply */
+mp_err   s_mp_div_d(mp_int *mp, mp_digit d, mp_digit *r);
+		                               /* unsigned digit divide   */
+mp_err   s_mp_reduce(mp_int *x, mp_int *m, mp_int *mu);
+                                               /* Barrett reduction       */
+mp_err   s_mp_add(mp_int *a, mp_int *b);       /* magnitude addition      */
+mp_err   s_mp_sub(mp_int *a, mp_int *b);       /* magnitude subtract      */
+mp_err   s_mp_mul(mp_int *a, mp_int *b);       /* magnitude multiply      */
+#if 0
+void     s_mp_kmul(mp_digit *a, mp_digit *b, mp_digit *out, mp_size len);
+                                               /* multiply buffers in place */
+#endif
+#if MP_SQUARE
+mp_err   s_mp_sqr(mp_int *a);                  /* magnitude square        */
+#else
+#define  s_mp_sqr(a) s_mp_mul(a, a)
+#endif
+mp_err   s_mp_div(mp_int *a, mp_int *b);       /* magnitude divide        */
+mp_err   s_mp_2expt(mp_int *a, mp_digit k);    /* a = 2^k                 */
+int      s_mp_cmp(mp_int *a, mp_int *b);       /* magnitude comparison    */
+int      s_mp_cmp_d(mp_int *a, mp_digit d);    /* magnitude digit compare */
+int      s_mp_ispow2(mp_int *v);               /* is v a power of 2?      */
+int      s_mp_ispow2d(mp_digit d);             /* is d a power of 2?      */
+
+int      s_mp_tovalue(char ch, int r);          /* convert ch to value    */
+char     s_mp_todigit(int val, int r, int low); /* convert val to digit   */
+int      s_mp_outlen(int bits, int r);          /* output length in bytes */
+
+/* }}} */
+
+/* {{{ Default precision manipulation */
+
+unsigned int mp_get_prec(void)
+{
+  return s_mp_defprec;
+
+} /* end mp_get_prec() */
+
+void         mp_set_prec(unsigned int prec)
+{
+  if(prec == 0)
+    s_mp_defprec = MP_DEFPREC;
+  else
+    s_mp_defprec = prec;
+
+} /* end mp_set_prec() */
+
+/* }}} */
+
+/*------------------------------------------------------------------------*/
+/* {{{ mp_init(mp) */
+
+/*
+  mp_init(mp)
+
+  Initialize a new zero-valued mp_int.  Returns MP_OKAY if successful,
+  MP_MEM if memory could not be allocated for the structure.
+ */
+
+mp_err mp_init(mp_int *mp)
+{
+  return mp_init_size(mp, s_mp_defprec);
+
+} /* end mp_init() */
+
+/* }}} */
+
+/* {{{ mp_init_array(mp[], count) */
+
+mp_err mp_init_array(mp_int mp[], int count)
+{
+  mp_err  res;
+  int     pos;
+
+  ARGCHK(mp !=NULL && count > 0, MP_BADARG);
+
+  for(pos = 0; pos < count; ++pos) {
+    if((res = mp_init(&mp[pos])) != MP_OKAY)
+      goto CLEANUP;
+  }
+
+  return MP_OKAY;
+
+ CLEANUP:
+  while(--pos >= 0) 
+    mp_clear(&mp[pos]);
+
+  return res;
+
+} /* end mp_init_array() */
+
+/* }}} */
+
+/* {{{ mp_init_size(mp, prec) */
+
+/*
+  mp_init_size(mp, prec)
+
+  Initialize a new zero-valued mp_int with at least the given
+  precision; returns MP_OKAY if successful, or MP_MEM if memory could
+  not be allocated for the structure.
+ */
+
+mp_err mp_init_size(mp_int *mp, mp_size prec)
+{
+  ARGCHK(mp != NULL && prec > 0, MP_BADARG);
+
+  if((DIGITS(mp) = s_mp_alloc(prec, sizeof(mp_digit))) == NULL)
+    return MP_MEM;
+
+  SIGN(mp) = MP_ZPOS;
+  USED(mp) = 1;
+  ALLOC(mp) = prec;
+
+  return MP_OKAY;
+
+} /* end mp_init_size() */
+
+/* }}} */
+
+/* {{{ mp_init_copy(mp, from) */
+
+/*
+  mp_init_copy(mp, from)
+
+  Initialize mp as an exact copy of from.  Returns MP_OKAY if
+  successful, MP_MEM if memory could not be allocated for the new
+  structure.
+ */
+
+mp_err mp_init_copy(mp_int *mp, mp_int *from)
+{
+  ARGCHK(mp != NULL && from != NULL, MP_BADARG);
+
+  if(mp == from)
+    return MP_OKAY;
+
+  if((DIGITS(mp) = s_mp_alloc(USED(from), sizeof(mp_digit))) == NULL)
+    return MP_MEM;
+
+  s_mp_copy(DIGITS(from), DIGITS(mp), USED(from));
+  USED(mp) = USED(from);
+  ALLOC(mp) = USED(from);
+  SIGN(mp) = SIGN(from);
+
+  return MP_OKAY;
+
+} /* end mp_init_copy() */
+
+/* }}} */
+
+/* {{{ mp_copy(from, to) */
+
+/*
+  mp_copy(from, to)
+
+  Copies the mp_int 'from' to the mp_int 'to'.  It is presumed that
+  'to' has already been initialized (if not, use mp_init_copy()
+  instead). If 'from' and 'to' are identical, nothing happens.
+ */
+
+mp_err mp_copy(mp_int *from, mp_int *to)
+{
+  ARGCHK(from != NULL && to != NULL, MP_BADARG);
+
+  if(from == to)
+    return MP_OKAY;
+
+  { /* copy */
+    mp_digit   *tmp;
+
+    /*
+      If the allocated buffer in 'to' already has enough space to hold
+      all the used digits of 'from', we'll re-use it to avoid hitting
+      the memory allocater more than necessary; otherwise, we'd have
+      to grow anyway, so we just allocate a hunk and make the copy as
+      usual
+     */
+    if(ALLOC(to) >= USED(from)) {
+      s_mp_setz(DIGITS(to) + USED(from), ALLOC(to) - USED(from));
+      s_mp_copy(DIGITS(from), DIGITS(to), USED(from));
+      
+    } else {
+      if((tmp = s_mp_alloc(USED(from), sizeof(mp_digit))) == NULL)
+	return MP_MEM;
+
+      s_mp_copy(DIGITS(from), tmp, USED(from));
+
+      if(DIGITS(to) != NULL) {
+#if MP_CRYPTO
+	s_mp_setz(DIGITS(to), ALLOC(to));
+#endif
+	s_mp_free(DIGITS(to));
+      }
+
+      DIGITS(to) = tmp;
+      ALLOC(to) = USED(from);
+    }
+
+    /* Copy the precision and sign from the original */
+    USED(to) = USED(from);
+    SIGN(to) = SIGN(from);
+  } /* end copy */
+
+  return MP_OKAY;
+
+} /* end mp_copy() */
+
+/* }}} */
+
+/* {{{ mp_exch(mp1, mp2) */
+
+/*
+  mp_exch(mp1, mp2)
+
+  Exchange mp1 and mp2 without allocating any intermediate memory
+  (well, unless you count the stack space needed for this call and the
+  locals it creates...).  This cannot fail.
+ */
+
+void mp_exch(mp_int *mp1, mp_int *mp2)
+{
+#if MP_ARGCHK == 2
+  assert(mp1 != NULL && mp2 != NULL);
+#else
+  if(mp1 == NULL || mp2 == NULL)
+    return;
+#endif
+
+  s_mp_exch(mp1, mp2);
+
+} /* end mp_exch() */
+
+/* }}} */
+
+/* {{{ mp_clear(mp) */
+
+/*
+  mp_clear(mp)
+
+  Release the storage used by an mp_int, and void its fields so that
+  if someone calls mp_clear() again for the same int later, we won't
+  get tollchocked.
+ */
+
+void   mp_clear(mp_int *mp)
+{
+  if(mp == NULL)
+    return;
+
+  if(DIGITS(mp) != NULL) {
+#if MP_CRYPTO
+    s_mp_setz(DIGITS(mp), ALLOC(mp));
+#endif
+    s_mp_free(DIGITS(mp));
+    DIGITS(mp) = NULL;
+  }
+
+  USED(mp) = 0;
+  ALLOC(mp) = 0;
+
+} /* end mp_clear() */
+
+/* }}} */
+
+/* {{{ mp_clear_array(mp[], count) */
+
+void   mp_clear_array(mp_int mp[], int count)
+{
+  ARGCHK(mp != NULL && count > 0, MP_BADARG);
+
+  while(--count >= 0) 
+    mp_clear(&mp[count]);
+
+} /* end mp_clear_array() */
+
+/* }}} */
+
+/* {{{ mp_zero(mp) */
+
+/*
+  mp_zero(mp) 
+
+  Set mp to zero.  Does not change the allocated size of the structure,
+  and therefore cannot fail (except on a bad argument, which we ignore)
+ */
+void   mp_zero(mp_int *mp)
+{
+  if(mp == NULL)
+    return;
+
+  s_mp_setz(DIGITS(mp), ALLOC(mp));
+  USED(mp) = 1;
+  SIGN(mp) = MP_ZPOS;
+
+} /* end mp_zero() */
+
+/* }}} */
+
+/* {{{ mp_set(mp, d) */
+
+void   mp_set(mp_int *mp, mp_digit d)
+{
+  if(mp == NULL)
+    return;
+
+  mp_zero(mp);
+  DIGIT(mp, 0) = d;
+
+} /* end mp_set() */
+
+/* }}} */
+
+/* {{{ mp_set_int(mp, z) */
+
+mp_err mp_set_int(mp_int *mp, long z)
+{
+  int            ix;
+  unsigned long  v = abs(z);
+  mp_err         res;
+
+  ARGCHK(mp != NULL, MP_BADARG);
+
+  mp_zero(mp);
+  if(z == 0)
+    return MP_OKAY;  /* shortcut for zero */
+
+  for(ix = sizeof(long) - 1; ix >= 0; ix--) {
+
+    if((res = s_mp_mul_2d(mp, CHAR_BIT)) != MP_OKAY)
+      return res;
+
+    res = s_mp_add_d(mp, 
+		     (mp_digit)((v >> (ix * CHAR_BIT)) & UCHAR_MAX));
+    if(res != MP_OKAY)
+      return res;
+
+  }
+
+  if(z < 0)
+    SIGN(mp) = MP_NEG;
+
+  return MP_OKAY;
+
+} /* end mp_set_int() */
+
+/* }}} */
+
+/*------------------------------------------------------------------------*/
+/* {{{ Digit arithmetic */
+
+/* {{{ mp_add_d(a, d, b) */
+
+/*
+  mp_add_d(a, d, b)
+
+  Compute the sum b = a + d, for a single digit d.  Respects the sign of
+  its primary addend (single digits are unsigned anyway).
+ */
+
+mp_err mp_add_d(mp_int *a, mp_digit d, mp_int *b)
+{
+  mp_err   res = MP_OKAY;
+
+  ARGCHK(a != NULL && b != NULL, MP_BADARG);
+
+  if((res = mp_copy(a, b)) != MP_OKAY)
+    return res;
+
+  if(SIGN(b) == MP_ZPOS) {
+    res = s_mp_add_d(b, d);
+  } else if(s_mp_cmp_d(b, d) >= 0) {
+    res = s_mp_sub_d(b, d);
+  } else {
+    SIGN(b) = MP_ZPOS;
+
+    DIGIT(b, 0) = d - DIGIT(b, 0);
+  }
+
+  return res;
+
+} /* end mp_add_d() */
+
+/* }}} */
+
+/* {{{ mp_sub_d(a, d, b) */
+
+/*
+  mp_sub_d(a, d, b)
+
+  Compute the difference b = a - d, for a single digit d.  Respects the
+  sign of its subtrahend (single digits are unsigned anyway).
+ */
+
+mp_err mp_sub_d(mp_int *a, mp_digit d, mp_int *b)
+{
+  mp_err   res;
+
+  ARGCHK(a != NULL && b != NULL, MP_BADARG);
+
+  if((res = mp_copy(a, b)) != MP_OKAY)
+    return res;
+
+  if(SIGN(b) == MP_NEG) {
+    if((res = s_mp_add_d(b, d)) != MP_OKAY)
+      return res;
+
+  } else if(s_mp_cmp_d(b, d) >= 0) {
+    if((res = s_mp_sub_d(b, d)) != MP_OKAY)
+      return res;
+
+  } else {
+    mp_neg(b, b);
+
+    DIGIT(b, 0) = d - DIGIT(b, 0);
+    SIGN(b) = MP_NEG;
+  }
+
+  if(s_mp_cmp_d(b, 0) == 0)
+    SIGN(b) = MP_ZPOS;
+
+  return MP_OKAY;
+
+} /* end mp_sub_d() */
+
+/* }}} */
+
+/* {{{ mp_mul_d(a, d, b) */
+
+/*
+  mp_mul_d(a, d, b)
+
+  Compute the product b = a * d, for a single digit d.  Respects the sign
+  of its multiplicand (single digits are unsigned anyway)
+ */
+
+mp_err mp_mul_d(mp_int *a, mp_digit d, mp_int *b)
+{
+  mp_err  res;
+
+  ARGCHK(a != NULL && b != NULL, MP_BADARG);
+
+  if(d == 0) {
+    mp_zero(b);
+    return MP_OKAY;
+  }
+
+  if((res = mp_copy(a, b)) != MP_OKAY)
+    return res;
+
+  res = s_mp_mul_d(b, d);
+
+  return res;
+
+} /* end mp_mul_d() */
+
+/* }}} */
+
+/* {{{ mp_mul_2(a, c) */
+
+mp_err mp_mul_2(mp_int *a, mp_int *c)
+{
+  mp_err  res;
+
+  ARGCHK(a != NULL && c != NULL, MP_BADARG);
+
+  if((res = mp_copy(a, c)) != MP_OKAY)
+    return res;
+
+  return s_mp_mul_2(c);
+
+} /* end mp_mul_2() */
+
+/* }}} */
+
+/* {{{ mp_div_d(a, d, q, r) */
+
+/*
+  mp_div_d(a, d, q, r)
+
+  Compute the quotient q = a / d and remainder r = a mod d, for a
+  single digit d.  Respects the sign of its divisor (single digits are
+  unsigned anyway).
+ */
+
+mp_err mp_div_d(mp_int *a, mp_digit d, mp_int *q, mp_digit *r)
+{
+  mp_err   res;
+  mp_digit rem;
+  int      pow;
+
+  ARGCHK(a != NULL, MP_BADARG);
+
+  if(d == 0)
+    return MP_RANGE;
+
+  /* Shortcut for powers of two ... */
+  if((pow = s_mp_ispow2d(d)) >= 0) {
+    mp_digit  mask;
+
+    mask = (1 << pow) - 1;
+    rem = DIGIT(a, 0) & mask;
+
+    if(q) {
+      mp_copy(a, q);
+      s_mp_div_2d(q, pow);
+    }
+
+    if(r)
+      *r = rem;
+
+    return MP_OKAY;
+  }
+
+  /*
+    If the quotient is actually going to be returned, we'll try to
+    avoid hitting the memory allocator by copying the dividend into it
+    and doing the division there.  This can't be any _worse_ than
+    always copying, and will sometimes be better (since it won't make
+    another copy)
+
+    If it's not going to be returned, we need to allocate a temporary
+    to hold the quotient, which will just be discarded.
+   */
+  if(q) {
+    if((res = mp_copy(a, q)) != MP_OKAY)
+      return res;
+
+    res = s_mp_div_d(q, d, &rem);
+    if(s_mp_cmp_d(q, 0) == MP_EQ)
+      SIGN(q) = MP_ZPOS;
+
+  } else {
+    mp_int  qp;
+
+    if((res = mp_init_copy(&qp, a)) != MP_OKAY)
+      return res;
+
+    res = s_mp_div_d(&qp, d, &rem);
+    if(s_mp_cmp_d(&qp, 0) == 0)
+      SIGN(&qp) = MP_ZPOS;
+
+    mp_clear(&qp);
+  }
+
+  if(r)
+    *r = rem;
+
+  return res;
+
+} /* end mp_div_d() */
+
+/* }}} */
+
+/* {{{ mp_div_2(a, c) */
+
+/*
+  mp_div_2(a, c)
+
+  Compute c = a / 2, disregarding the remainder.
+ */
+
+mp_err mp_div_2(mp_int *a, mp_int *c)
+{
+  mp_err  res;
+
+  ARGCHK(a != NULL && c != NULL, MP_BADARG);
+
+  if((res = mp_copy(a, c)) != MP_OKAY)
+    return res;
+
+  s_mp_div_2(c);
+
+  return MP_OKAY;
+
+} /* end mp_div_2() */
+
+/* }}} */
+
+/* {{{ mp_expt_d(a, d, b) */
+
+mp_err mp_expt_d(mp_int *a, mp_digit d, mp_int *c)
+{
+  mp_int   s, x;
+  mp_err   res;
+
+  ARGCHK(a != NULL && c != NULL, MP_BADARG);
+
+  if((res = mp_init(&s)) != MP_OKAY)
+    return res;
+  if((res = mp_init_copy(&x, a)) != MP_OKAY)
+    goto X;
+
+  DIGIT(&s, 0) = 1;
+
+  while(d != 0) {
+    if(d & 1) {
+      if((res = s_mp_mul(&s, &x)) != MP_OKAY)
+	goto CLEANUP;
+    }
+
+    d >>= 1;
+
+    if((res = s_mp_sqr(&x)) != MP_OKAY)
+      goto CLEANUP;
+  }
+
+  s_mp_exch(&s, c);
+
+CLEANUP:
+  mp_clear(&x);
+X:
+  mp_clear(&s);
+
+  return res;
+
+} /* end mp_expt_d() */
+
+/* }}} */
+
+/* }}} */
+
+/*------------------------------------------------------------------------*/
+/* {{{ Full arithmetic */
+
+/* {{{ mp_abs(a, b) */
+
+/*
+  mp_abs(a, b)
+
+  Compute b = |a|.  'a' and 'b' may be identical.
+ */
+
+mp_err mp_abs(mp_int *a, mp_int *b)
+{
+  mp_err   res;
+
+  ARGCHK(a != NULL && b != NULL, MP_BADARG);
+
+  if((res = mp_copy(a, b)) != MP_OKAY)
+    return res;
+
+  SIGN(b) = MP_ZPOS;
+
+  return MP_OKAY;
+
+} /* end mp_abs() */
+
+/* }}} */
+
+/* {{{ mp_neg(a, b) */
+
+/*
+  mp_neg(a, b)
+
+  Compute b = -a.  'a' and 'b' may be identical.
+ */
+
+mp_err mp_neg(mp_int *a, mp_int *b)
+{
+  mp_err   res;
+
+  ARGCHK(a != NULL && b != NULL, MP_BADARG);
+
+  if((res = mp_copy(a, b)) != MP_OKAY)
+    return res;
+
+  if(s_mp_cmp_d(b, 0) == MP_EQ) 
+    SIGN(b) = MP_ZPOS;
+  else 
+    SIGN(b) = (SIGN(b) == MP_NEG) ? MP_ZPOS : MP_NEG;
+
+  return MP_OKAY;
+
+} /* end mp_neg() */
+
+/* }}} */
+
+/* {{{ mp_add(a, b, c) */
+
+/*
+  mp_add(a, b, c)
+
+  Compute c = a + b.  All parameters may be identical.
+ */
+
+mp_err mp_add(mp_int *a, mp_int *b, mp_int *c)
+{
+  mp_err  res;
+  int     cmp;
+
+  ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG);
+
+  if(SIGN(a) == SIGN(b)) { /* same sign:  add values, keep sign */
+
+    /* Commutativity of addition lets us do this in either order,
+       so we avoid having to use a temporary even if the result 
+       is supposed to replace the output
+     */
+    if(c == b) {
+      if((res = s_mp_add(c, a)) != MP_OKAY)
+	return res;
+    } else {
+      if(c != a && (res = mp_copy(a, c)) != MP_OKAY)
+	return res;
+
+      if((res = s_mp_add(c, b)) != MP_OKAY) 
+	return res;
+    }
+
+  } else if((cmp = s_mp_cmp(a, b)) > 0) {  /* different sign: a > b   */
+
+    /* If the output is going to be clobbered, we will use a temporary
+       variable; otherwise, we'll do it without touching the memory 
+       allocator at all, if possible
+     */
+    if(c == b) {
+      mp_int  tmp;
+
+      if((res = mp_init_copy(&tmp, a)) != MP_OKAY)
+	return res;
+      if((res = s_mp_sub(&tmp, b)) != MP_OKAY) {
+	mp_clear(&tmp);
+	return res;
+      }
+
+      s_mp_exch(&tmp, c);
+      mp_clear(&tmp);
+
+    } else {
+
+      if(c != a && (res = mp_copy(a, c)) != MP_OKAY)
+	return res;
+      if((res = s_mp_sub(c, b)) != MP_OKAY)
+	return res;
+
+    }
+
+  } else if(cmp == 0) {             /* different sign, a == b   */
+
+    mp_zero(c);
+    return MP_OKAY;
+
+  } else {                          /* different sign: a < b    */
+
+    /* See above... */
+    if(c == a) {
+      mp_int  tmp;
+
+      if((res = mp_init_copy(&tmp, b)) != MP_OKAY)
+	return res;
+      if((res = s_mp_sub(&tmp, a)) != MP_OKAY) {
+	mp_clear(&tmp);
+	return res;
+      }
+
+      s_mp_exch(&tmp, c);
+      mp_clear(&tmp);
+
+    } else {
+
+      if(c != b && (res = mp_copy(b, c)) != MP_OKAY)
+	return res;
+      if((res = s_mp_sub(c, a)) != MP_OKAY)
+	return res;
+
+    }
+  }
+
+  if(USED(c) == 1 && DIGIT(c, 0) == 0)
+    SIGN(c) = MP_ZPOS;
+
+  return MP_OKAY;
+
+} /* end mp_add() */
+
+/* }}} */
+
+/* {{{ mp_sub(a, b, c) */
+
+/*
+  mp_sub(a, b, c)
+
+  Compute c = a - b.  All parameters may be identical.
+ */
+
+mp_err mp_sub(mp_int *a, mp_int *b, mp_int *c)
+{
+  mp_err  res;
+  int     cmp;
+
+  ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG);
+
+  if(SIGN(a) != SIGN(b)) {
+    if(c == a) {
+      if((res = s_mp_add(c, b)) != MP_OKAY)
+	return res;
+    } else {
+      if(c != b && ((res = mp_copy(b, c)) != MP_OKAY))
+	return res;
+      if((res = s_mp_add(c, a)) != MP_OKAY)
+	return res;
+      SIGN(c) = SIGN(a);
+    }
+
+  } else if((cmp = s_mp_cmp(a, b)) > 0) { /* Same sign, a > b */
+    if(c == b) {
+      mp_int  tmp;
+
+      if((res = mp_init_copy(&tmp, a)) != MP_OKAY)
+	return res;
+      if((res = s_mp_sub(&tmp, b)) != MP_OKAY) {
+	mp_clear(&tmp);
+	return res;
+      }
+      s_mp_exch(&tmp, c);
+      mp_clear(&tmp);
+
+    } else {
+      if(c != a && ((res = mp_copy(a, c)) != MP_OKAY))
+	return res;
+
+      if((res = s_mp_sub(c, b)) != MP_OKAY)
+	return res;
+    }
+
+  } else if(cmp == 0) {  /* Same sign, equal magnitude */
+    mp_zero(c);
+    return MP_OKAY;
+
+  } else {               /* Same sign, b > a */
+    if(c == a) {
+      mp_int  tmp;
+
+      if((res = mp_init_copy(&tmp, b)) != MP_OKAY)
+	return res;
+
+      if((res = s_mp_sub(&tmp, a)) != MP_OKAY) {
+	mp_clear(&tmp);
+	return res;
+      }
+      s_mp_exch(&tmp, c);
+      mp_clear(&tmp);
+
+    } else {
+      if(c != b && ((res = mp_copy(b, c)) != MP_OKAY)) 
+	return res;
+
+      if((res = s_mp_sub(c, a)) != MP_OKAY)
+	return res;
+    }
+
+    SIGN(c) = !SIGN(b);
+  }
+
+  if(USED(c) == 1 && DIGIT(c, 0) == 0)
+    SIGN(c) = MP_ZPOS;
+
+  return MP_OKAY;
+
+} /* end mp_sub() */
+
+/* }}} */
+
+/* {{{ mp_mul(a, b, c) */
+
+/*
+  mp_mul(a, b, c)
+
+  Compute c = a * b.  All parameters may be identical.
+ */
+
+mp_err mp_mul(mp_int *a, mp_int *b, mp_int *c)
+{
+  mp_err   res;
+  mp_sign  sgn;
+
+  ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG);
+
+  sgn = (SIGN(a) == SIGN(b)) ? MP_ZPOS : MP_NEG;
+
+  if(c == b) {
+    if((res = s_mp_mul(c, a)) != MP_OKAY)
+      return res;
+
+  } else {
+    if((res = mp_copy(a, c)) != MP_OKAY)
+      return res;
+
+    if((res = s_mp_mul(c, b)) != MP_OKAY)
+      return res;
+  }
+  
+  if(sgn == MP_ZPOS || s_mp_cmp_d(c, 0) == MP_EQ)
+    SIGN(c) = MP_ZPOS;
+  else
+    SIGN(c) = sgn;
+  
+  return MP_OKAY;
+
+} /* end mp_mul() */
+
+/* }}} */
+
+/* {{{ mp_mul_2d(a, d, c) */
+
+/*
+  mp_mul_2d(a, d, c)
+
+  Compute c = a * 2^d.  a may be the same as c.
+ */
+
+mp_err mp_mul_2d(mp_int *a, mp_digit d, mp_int *c)
+{
+  mp_err   res;
+
+  ARGCHK(a != NULL && c != NULL, MP_BADARG);
+
+  if((res = mp_copy(a, c)) != MP_OKAY)
+    return res;
+
+  if(d == 0)
+    return MP_OKAY;
+
+  return s_mp_mul_2d(c, d);
+
+} /* end mp_mul() */
+
+/* }}} */
+
+/* {{{ mp_sqr(a, b) */
+
+#if MP_SQUARE
+mp_err mp_sqr(mp_int *a, mp_int *b)
+{
+  mp_err   res;
+
+  ARGCHK(a != NULL && b != NULL, MP_BADARG);
+
+  if((res = mp_copy(a, b)) != MP_OKAY)
+    return res;
+
+  if((res = s_mp_sqr(b)) != MP_OKAY)
+    return res;
+
+  SIGN(b) = MP_ZPOS;
+
+  return MP_OKAY;
+
+} /* end mp_sqr() */
+#endif
+
+/* }}} */
+
+/* {{{ mp_div(a, b, q, r) */
+
+/*
+  mp_div(a, b, q, r)
+
+  Compute q = a / b and r = a mod b.  Input parameters may be re-used
+  as output parameters.  If q or r is NULL, that portion of the
+  computation will be discarded (although it will still be computed)
+
+  Pay no attention to the hacker behind the curtain.
+ */
+
+mp_err mp_div(mp_int *a, mp_int *b, mp_int *q, mp_int *r)
+{
+  mp_err   res;
+  mp_int   qtmp, rtmp;
+  int      cmp;
+
+  ARGCHK(a != NULL && b != NULL, MP_BADARG);
+
+  if(mp_cmp_z(b) == MP_EQ)
+    return MP_RANGE;
+
+  /* If a <= b, we can compute the solution without division, and
+     avoid any memory allocation
+   */
+  if((cmp = s_mp_cmp(a, b)) < 0) {
+    if(r) {
+      if((res = mp_copy(a, r)) != MP_OKAY)
+	return res;
+    }
+
+    if(q) 
+      mp_zero(q);
+
+    return MP_OKAY;
+
+  } else if(cmp == 0) {
+
+    /* Set quotient to 1, with appropriate sign */
+    if(q) {
+      int qneg = (SIGN(a) != SIGN(b));
+
+      mp_set(q, 1);
+      if(qneg)
+	SIGN(q) = MP_NEG;
+    }
+
+    if(r)
+      mp_zero(r);
+
+    return MP_OKAY;
+  }
+
+  /* If we get here, it means we actually have to do some division */
+
+  /* Set up some temporaries... */
+  if((res = mp_init_copy(&qtmp, a)) != MP_OKAY)
+    return res;
+  if((res = mp_init_copy(&rtmp, b)) != MP_OKAY)
+    goto CLEANUP;
+
+  if((res = s_mp_div(&qtmp, &rtmp)) != MP_OKAY)
+    goto CLEANUP;
+
+  /* Compute the signs for the output  */
+  SIGN(&rtmp) = SIGN(a); /* Sr = Sa              */
+  if(SIGN(a) == SIGN(b))
+    SIGN(&qtmp) = MP_ZPOS;  /* Sq = MP_ZPOS if Sa = Sb */
+  else
+    SIGN(&qtmp) = MP_NEG;   /* Sq = MP_NEG if Sa != Sb */
+
+  if(s_mp_cmp_d(&qtmp, 0) == MP_EQ)
+    SIGN(&qtmp) = MP_ZPOS;
+  if(s_mp_cmp_d(&rtmp, 0) == MP_EQ)
+    SIGN(&rtmp) = MP_ZPOS;
+
+  /* Copy output, if it is needed      */
+  if(q) 
+    s_mp_exch(&qtmp, q);
+
+  if(r) 
+    s_mp_exch(&rtmp, r);
+
+CLEANUP:
+  mp_clear(&rtmp);
+  mp_clear(&qtmp);
+
+  return res;
+
+} /* end mp_div() */
+
+/* }}} */
+
+/* {{{ mp_div_2d(a, d, q, r) */
+
+mp_err mp_div_2d(mp_int *a, mp_digit d, mp_int *q, mp_int *r)
+{
+  mp_err  res;
+
+  ARGCHK(a != NULL, MP_BADARG);
+
+  if(q) {
+    if((res = mp_copy(a, q)) != MP_OKAY)
+      return res;
+
+    s_mp_div_2d(q, d);
+  }
+
+  if(r) {
+    if((res = mp_copy(a, r)) != MP_OKAY)
+      return res;
+
+    s_mp_mod_2d(r, d);
+  }
+
+  return MP_OKAY;
+
+} /* end mp_div_2d() */
+
+/* }}} */
+
+/* {{{ mp_expt(a, b, c) */
+
+/*
+  mp_expt(a, b, c)
+
+  Compute c = a ** b, that is, raise a to the b power.  Uses a
+  standard iterative square-and-multiply technique.
+ */
+
+mp_err mp_expt(mp_int *a, mp_int *b, mp_int *c)
+{
+  mp_int   s, x;
+  mp_err   res;
+  mp_digit d;
+  int      dig, bit;
+
+  ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG);
+
+  if(mp_cmp_z(b) < 0)
+    return MP_RANGE;
+
+  if((res = mp_init(&s)) != MP_OKAY)
+    return res;
+
+  mp_set(&s, 1);
+
+  if((res = mp_init_copy(&x, a)) != MP_OKAY)
+    goto X;
+
+  /* Loop over low-order digits in ascending order */
+  for(dig = 0; dig < (USED(b) - 1); dig++) {
+    d = DIGIT(b, dig);
+
+    /* Loop over bits of each non-maximal digit */
+    for(bit = 0; bit < DIGIT_BIT; bit++) {
+      if(d & 1) {
+	if((res = s_mp_mul(&s, &x)) != MP_OKAY) 
+	  goto CLEANUP;
+      }
+
+      d >>= 1;
+      
+      if((res = s_mp_sqr(&x)) != MP_OKAY)
+	goto CLEANUP;
+    }
+  }
+
+  /* Consider now the last digit... */
+  d = DIGIT(b, dig);
+
+  while(d) {
+    if(d & 1) {
+      if((res = s_mp_mul(&s, &x)) != MP_OKAY)
+	goto CLEANUP;
+    }
+
+    d >>= 1;
+
+    if((res = s_mp_sqr(&x)) != MP_OKAY)
+      goto CLEANUP;
+  }
+  
+  if(mp_iseven(b))
+    SIGN(&s) = SIGN(a);
+
+  res = mp_copy(&s, c);
+
+CLEANUP:
+  mp_clear(&x);
+X:
+  mp_clear(&s);
+
+  return res;
+
+} /* end mp_expt() */
+
+/* }}} */
+
+/* {{{ mp_2expt(a, k) */
+
+/* Compute a = 2^k */
+
+mp_err mp_2expt(mp_int *a, mp_digit k)
+{
+  ARGCHK(a != NULL, MP_BADARG);
+
+  return s_mp_2expt(a, k);
+
+} /* end mp_2expt() */
+
+/* }}} */
+
+/* {{{ mp_mod(a, m, c) */
+
+/*
+  mp_mod(a, m, c)
+
+  Compute c = a (mod m).  Result will always be 0 <= c < m.
+ */
+
+mp_err mp_mod(mp_int *a, mp_int *m, mp_int *c)
+{
+  mp_err  res;
+  int     mag;
+
+  ARGCHK(a != NULL && m != NULL && c != NULL, MP_BADARG);
+
+  if(SIGN(m) == MP_NEG)
+    return MP_RANGE;
+
+  /*
+     If |a| > m, we need to divide to get the remainder and take the
+     absolute value.  
+
+     If |a| < m, we don't need to do any division, just copy and adjust
+     the sign (if a is negative).
+
+     If |a| == m, we can simply set the result to zero.
+
+     This order is intended to minimize the average path length of the
+     comparison chain on common workloads -- the most frequent cases are
+     that |a| != m, so we do those first.
+   */
+  if((mag = s_mp_cmp(a, m)) > 0) {
+    if((res = mp_div(a, m, NULL, c)) != MP_OKAY)
+      return res;
+    
+    if(SIGN(c) == MP_NEG) {
+      if((res = mp_add(c, m, c)) != MP_OKAY)
+	return res;
+    }
+
+  } else if(mag < 0) {
+    if((res = mp_copy(a, c)) != MP_OKAY)
+      return res;
+
+    if(mp_cmp_z(a) < 0) {
+      if((res = mp_add(c, m, c)) != MP_OKAY)
+	return res;
+
+    }
+    
+  } else {
+    mp_zero(c);
+
+  }
+
+  return MP_OKAY;
+
+} /* end mp_mod() */
+
+/* }}} */
+
+/* {{{ mp_mod_d(a, d, c) */
+
+/*
+  mp_mod_d(a, d, c)
+
+  Compute c = a (mod d).  Result will always be 0 <= c < d
+ */
+mp_err mp_mod_d(mp_int *a, mp_digit d, mp_digit *c)
+{
+  mp_err   res;
+  mp_digit rem;
+
+  ARGCHK(a != NULL && c != NULL, MP_BADARG);
+
+  if(s_mp_cmp_d(a, d) > 0) {
+    if((res = mp_div_d(a, d, NULL, &rem)) != MP_OKAY)
+      return res;
+
+  } else {
+    if(SIGN(a) == MP_NEG)
+      rem = d - DIGIT(a, 0);
+    else
+      rem = DIGIT(a, 0);
+  }
+
+  if(c)
+    *c = rem;
+
+  return MP_OKAY;
+
+} /* end mp_mod_d() */
+
+/* }}} */
+
+/* {{{ mp_sqrt(a, b) */
+
+/*
+  mp_sqrt(a, b)
+
+  Compute the integer square root of a, and store the result in b.
+  Uses an integer-arithmetic version of Newton's iterative linear
+  approximation technique to determine this value; the result has the
+  following two properties:
+
+     b^2 <= a
+     (b+1)^2 >= a
+
+  It is a range error to pass a negative value.
+ */
+mp_err mp_sqrt(mp_int *a, mp_int *b)
+{
+  mp_int   x, t;
+  mp_err   res;
+
+  ARGCHK(a != NULL && b != NULL, MP_BADARG);
+
+  /* Cannot take square root of a negative value */
+  if(SIGN(a) == MP_NEG)
+    return MP_RANGE;
+
+  /* Special cases for zero and one, trivial     */
+  if(mp_cmp_d(a, 0) == MP_EQ || mp_cmp_d(a, 1) == MP_EQ) 
+    return mp_copy(a, b);
+    
+  /* Initialize the temporaries we'll use below  */
+  if((res = mp_init_size(&t, USED(a))) != MP_OKAY)
+    return res;
+
+  /* Compute an initial guess for the iteration as a itself */
+  if((res = mp_init_copy(&x, a)) != MP_OKAY)
+    goto X;
+
+s_mp_rshd(&x, (USED(&x)/2)+1);
+mp_add_d(&x, 1, &x);
+
+  for(;;) {
+    /* t = (x * x) - a */
+    mp_copy(&x, &t);      /* can't fail, t is big enough for original x */
+    if((res = mp_sqr(&t, &t)) != MP_OKAY ||
+       (res = mp_sub(&t, a, &t)) != MP_OKAY)
+      goto CLEANUP;
+
+    /* t = t / 2x       */
+    s_mp_mul_2(&x);
+    if((res = mp_div(&t, &x, &t, NULL)) != MP_OKAY)
+      goto CLEANUP;
+    s_mp_div_2(&x);
+
+    /* Terminate the loop, if the quotient is zero */
+    if(mp_cmp_z(&t) == MP_EQ)
+      break;
+
+    /* x = x - t       */
+    if((res = mp_sub(&x, &t, &x)) != MP_OKAY)
+      goto CLEANUP;
+
+  }
+
+  /* Copy result to output parameter */
+  mp_sub_d(&x, 1, &x);
+  s_mp_exch(&x, b);
+
+ CLEANUP:
+  mp_clear(&x);
+ X:
+  mp_clear(&t); 
+
+  return res;
+
+} /* end mp_sqrt() */
+
+/* }}} */
+
+/* }}} */
+
+/*------------------------------------------------------------------------*/
+/* {{{ Modular arithmetic */
+
+#if MP_MODARITH
+/* {{{ mp_addmod(a, b, m, c) */
+
+/*
+  mp_addmod(a, b, m, c)
+
+  Compute c = (a + b) mod m
+ */
+
+mp_err mp_addmod(mp_int *a, mp_int *b, mp_int *m, mp_int *c)
+{
+  mp_err  res;
+
+  ARGCHK(a != NULL && b != NULL && m != NULL && c != NULL, MP_BADARG);
+
+  if((res = mp_add(a, b, c)) != MP_OKAY)
+    return res;
+  if((res = mp_mod(c, m, c)) != MP_OKAY)
+    return res;
+
+  return MP_OKAY;
+
+}
+
+/* }}} */
+
+/* {{{ mp_submod(a, b, m, c) */
+
+/*
+  mp_submod(a, b, m, c)
+
+  Compute c = (a - b) mod m
+ */
+
+mp_err mp_submod(mp_int *a, mp_int *b, mp_int *m, mp_int *c)
+{
+  mp_err  res;
+
+  ARGCHK(a != NULL && b != NULL && m != NULL && c != NULL, MP_BADARG);
+
+  if((res = mp_sub(a, b, c)) != MP_OKAY)
+    return res;
+  if((res = mp_mod(c, m, c)) != MP_OKAY)
+    return res;
+
+  return MP_OKAY;
+
+}
+
+/* }}} */
+
+/* {{{ mp_mulmod(a, b, m, c) */
+
+/*
+  mp_mulmod(a, b, m, c)
+
+  Compute c = (a * b) mod m
+ */
+
+mp_err mp_mulmod(mp_int *a, mp_int *b, mp_int *m, mp_int *c)
+{
+  mp_err  res;
+
+  ARGCHK(a != NULL && b != NULL && m != NULL && c != NULL, MP_BADARG);
+
+  if((res = mp_mul(a, b, c)) != MP_OKAY)
+    return res;
+  if((res = mp_mod(c, m, c)) != MP_OKAY)
+    return res;
+
+  return MP_OKAY;
+
+}
+
+/* }}} */
+
+/* {{{ mp_sqrmod(a, m, c) */
+
+#if MP_SQUARE
+mp_err mp_sqrmod(mp_int *a, mp_int *m, mp_int *c)
+{
+  mp_err  res;
+
+  ARGCHK(a != NULL && m != NULL && c != NULL, MP_BADARG);
+
+  if((res = mp_sqr(a, c)) != MP_OKAY)
+    return res;
+  if((res = mp_mod(c, m, c)) != MP_OKAY)
+    return res;
+
+  return MP_OKAY;
+
+} /* end mp_sqrmod() */
+#endif
+
+/* }}} */
+
+/* {{{ mp_exptmod(a, b, m, c) */
+
+/*
+  mp_exptmod(a, b, m, c)
+
+  Compute c = (a ** b) mod m.  Uses a standard square-and-multiply
+  method with modular reductions at each step. (This is basically the
+  same code as mp_expt(), except for the addition of the reductions)
+  
+  The modular reductions are done using Barrett's algorithm (see
+  s_mp_reduce() below for details)
+ */
+
+mp_err mp_exptmod(mp_int *a, mp_int *b, mp_int *m, mp_int *c)
+{
+  mp_int   s, x, mu;
+  mp_err   res;
+  mp_digit d, *db = DIGITS(b);
+  mp_size  ub = USED(b);
+  int      dig, bit;
+
+  ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG);
+
+  if(mp_cmp_z(b) < 0 || mp_cmp_z(m) <= 0)
+    return MP_RANGE;
+
+  if((res = mp_init(&s)) != MP_OKAY)
+    return res;
+  if((res = mp_init_copy(&x, a)) != MP_OKAY)
+    goto X;
+  if((res = mp_mod(&x, m, &x)) != MP_OKAY ||
+     (res = mp_init(&mu)) != MP_OKAY)
+    goto MU;
+
+  mp_set(&s, 1);
+
+  /* mu = b^2k / m */
+  s_mp_add_d(&mu, 1); 
+  s_mp_lshd(&mu, 2 * USED(m));
+  if((res = mp_div(&mu, m, &mu, NULL)) != MP_OKAY)
+    goto CLEANUP;
+
+  /* Loop over digits of b in ascending order, except highest order */
+  for(dig = 0; dig < (ub - 1); dig++) {
+    d = *db++;
+
+    /* Loop over the bits of the lower-order digits */
+    for(bit = 0; bit < DIGIT_BIT; bit++) {
+      if(d & 1) {
+	if((res = s_mp_mul(&s, &x)) != MP_OKAY)
+	  goto CLEANUP;
+	if((res = s_mp_reduce(&s, m, &mu)) != MP_OKAY)
+	  goto CLEANUP;
+      }
+
+      d >>= 1;
+
+      if((res = s_mp_sqr(&x)) != MP_OKAY)
+	goto CLEANUP;
+      if((res = s_mp_reduce(&x, m, &mu)) != MP_OKAY)
+	goto CLEANUP;
+    }
+  }
+
+  /* Now do the last digit... */
+  d = *db;
+
+  while(d) {
+    if(d & 1) {
+      if((res = s_mp_mul(&s, &x)) != MP_OKAY)
+	goto CLEANUP;
+      if((res = s_mp_reduce(&s, m, &mu)) != MP_OKAY)
+	goto CLEANUP;
+    }
+
+    d >>= 1;
+
+    if((res = s_mp_sqr(&x)) != MP_OKAY)
+      goto CLEANUP;
+    if((res = s_mp_reduce(&x, m, &mu)) != MP_OKAY)
+      goto CLEANUP;
+  }
+
+  s_mp_exch(&s, c);
+
+ CLEANUP:
+  mp_clear(&mu);
+ MU:
+  mp_clear(&x);
+ X:
+  mp_clear(&s);
+
+  return res;
+
+} /* end mp_exptmod() */
+
+/* }}} */
+
+/* {{{ mp_exptmod_d(a, d, m, c) */
+
+mp_err mp_exptmod_d(mp_int *a, mp_digit d, mp_int *m, mp_int *c)
+{
+  mp_int   s, x;
+  mp_err   res;
+
+  ARGCHK(a != NULL && c != NULL, MP_BADARG);
+
+  if((res = mp_init(&s)) != MP_OKAY)
+    return res;
+  if((res = mp_init_copy(&x, a)) != MP_OKAY)
+    goto X;
+
+  mp_set(&s, 1);
+
+  while(d != 0) {
+    if(d & 1) {
+      if((res = s_mp_mul(&s, &x)) != MP_OKAY ||
+	 (res = mp_mod(&s, m, &s)) != MP_OKAY)
+	goto CLEANUP;
+    }
+
+    d /= 2;
+
+    if((res = s_mp_sqr(&x)) != MP_OKAY ||
+       (res = mp_mod(&x, m, &x)) != MP_OKAY)
+      goto CLEANUP;
+  }
+
+  s_mp_exch(&s, c);
+
+CLEANUP:
+  mp_clear(&x);
+X:
+  mp_clear(&s);
+
+  return res;
+
+} /* end mp_exptmod_d() */
+
+/* }}} */
+#endif /* if MP_MODARITH */
+
+/* }}} */
+
+/*------------------------------------------------------------------------*/
+/* {{{ Comparison functions */
+
+/* {{{ mp_cmp_z(a) */
+
+/*
+  mp_cmp_z(a)
+
+  Compare a <=> 0.  Returns <0 if a<0, 0 if a=0, >0 if a>0.
+ */
+
+int    mp_cmp_z(mp_int *a)
+{
+  if(SIGN(a) == MP_NEG)
+    return MP_LT;
+  else if(USED(a) == 1 && DIGIT(a, 0) == 0)
+    return MP_EQ;
+  else
+    return MP_GT;
+
+} /* end mp_cmp_z() */
+
+/* }}} */
+
+/* {{{ mp_cmp_d(a, d) */
+
+/*
+  mp_cmp_d(a, d)
+
+  Compare a <=> d.  Returns <0 if a<d, 0 if a=d, >0 if a>d
+ */
+
+int    mp_cmp_d(mp_int *a, mp_digit d)
+{
+  ARGCHK(a != NULL, MP_EQ);
+
+  if(SIGN(a) == MP_NEG)
+    return MP_LT;
+
+  return s_mp_cmp_d(a, d);
+
+} /* end mp_cmp_d() */
+
+/* }}} */
+
+/* {{{ mp_cmp(a, b) */
+
+int    mp_cmp(mp_int *a, mp_int *b)
+{
+  ARGCHK(a != NULL && b != NULL, MP_EQ);
+
+  if(SIGN(a) == SIGN(b)) {
+    int  mag;
+
+    if((mag = s_mp_cmp(a, b)) == MP_EQ)
+      return MP_EQ;
+
+    if(SIGN(a) == MP_ZPOS)
+      return mag;
+    else
+      return -mag;
+
+  } else if(SIGN(a) == MP_ZPOS) {
+    return MP_GT;
+  } else {
+    return MP_LT;
+  }
+
+} /* end mp_cmp() */
+
+/* }}} */
+
+/* {{{ mp_cmp_mag(a, b) */
+
+/*
+  mp_cmp_mag(a, b)
+
+  Compares |a| <=> |b|, and returns an appropriate comparison result
+ */
+
+int    mp_cmp_mag(mp_int *a, mp_int *b)
+{
+  ARGCHK(a != NULL && b != NULL, MP_EQ);
+
+  return s_mp_cmp(a, b);
+
+} /* end mp_cmp_mag() */
+
+/* }}} */
+
+/* {{{ mp_cmp_int(a, z) */
+
+/*
+  This just converts z to an mp_int, and uses the existing comparison
+  routines.  This is sort of inefficient, but it's not clear to me how
+  frequently this wil get used anyway.  For small positive constants,
+  you can always use mp_cmp_d(), and for zero, there is mp_cmp_z().
+ */
+int    mp_cmp_int(mp_int *a, long z)
+{
+  mp_int  tmp;
+  int     out;
+
+  ARGCHK(a != NULL, MP_EQ);
+  
+  mp_init(&tmp); mp_set_int(&tmp, z);
+  out = mp_cmp(a, &tmp);
+  mp_clear(&tmp);
+
+  return out;
+
+} /* end mp_cmp_int() */
+
+/* }}} */
+
+/* {{{ mp_isodd(a) */
+
+/*
+  mp_isodd(a)
+
+  Returns a true (non-zero) value if a is odd, false (zero) otherwise.
+ */
+int    mp_isodd(mp_int *a)
+{
+  ARGCHK(a != NULL, 0);
+
+  return (DIGIT(a, 0) & 1);
+
+} /* end mp_isodd() */
+
+/* }}} */
+
+/* {{{ mp_iseven(a) */
+
+int    mp_iseven(mp_int *a)
+{
+  return !mp_isodd(a);
+
+} /* end mp_iseven() */
+
+/* }}} */
+
+/* }}} */
+
+/*------------------------------------------------------------------------*/
+/* {{{ Number theoretic functions */
+
+#if MP_NUMTH
+/* {{{ mp_gcd(a, b, c) */
+
+/*
+  Like the old mp_gcd() function, except computes the GCD using the
+  binary algorithm due to Josef Stein in 1961 (via Knuth).
+ */
+mp_err mp_gcd(mp_int *a, mp_int *b, mp_int *c)
+{
+  mp_err   res;
+  mp_int   u, v, t;
+  mp_size  k = 0;
+
+  ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG);
+
+  if(mp_cmp_z(a) == MP_EQ && mp_cmp_z(b) == MP_EQ)
+      return MP_RANGE;
+  if(mp_cmp_z(a) == MP_EQ) {
+    return mp_copy(b, c);
+  } else if(mp_cmp_z(b) == MP_EQ) {
+    return mp_copy(a, c);
+  }
+
+  if((res = mp_init(&t)) != MP_OKAY)
+    return res;
+  if((res = mp_init_copy(&u, a)) != MP_OKAY)
+    goto U;
+  if((res = mp_init_copy(&v, b)) != MP_OKAY)
+    goto V;
+
+  SIGN(&u) = MP_ZPOS;
+  SIGN(&v) = MP_ZPOS;
+
+  /* Divide out common factors of 2 until at least 1 of a, b is even */
+  while(mp_iseven(&u) && mp_iseven(&v)) {
+    s_mp_div_2(&u);
+    s_mp_div_2(&v);
+    ++k;
+  }
+
+  /* Initialize t */
+  if(mp_isodd(&u)) {
+    if((res = mp_copy(&v, &t)) != MP_OKAY)
+      goto CLEANUP;
+    
+    /* t = -v */
+    if(SIGN(&v) == MP_ZPOS)
+      SIGN(&t) = MP_NEG;
+    else
+      SIGN(&t) = MP_ZPOS;
+    
+  } else {
+    if((res = mp_copy(&u, &t)) != MP_OKAY)
+      goto CLEANUP;
+
+  }
+
+  for(;;) {
+    while(mp_iseven(&t)) {
+      s_mp_div_2(&t);
+    }
+
+    if(mp_cmp_z(&t) == MP_GT) {
+      if((res = mp_copy(&t, &u)) != MP_OKAY)
+	goto CLEANUP;
+
+    } else {
+      if((res = mp_copy(&t, &v)) != MP_OKAY)
+	goto CLEANUP;
+
+      /* v = -t */
+      if(SIGN(&t) == MP_ZPOS)
+	SIGN(&v) = MP_NEG;
+      else
+	SIGN(&v) = MP_ZPOS;
+    }
+
+    if((res = mp_sub(&u, &v, &t)) != MP_OKAY)
+      goto CLEANUP;
+
+    if(s_mp_cmp_d(&t, 0) == MP_EQ)
+      break;
+  }
+
+  s_mp_2expt(&v, k);       /* v = 2^k   */
+  res = mp_mul(&u, &v, c); /* c = u * v */
+
+ CLEANUP:
+  mp_clear(&v);
+ V:
+  mp_clear(&u);
+ U:
+  mp_clear(&t);
+
+  return res;
+
+} /* end mp_bgcd() */
+
+/* }}} */
+
+/* {{{ mp_lcm(a, b, c) */
+
+/* We compute the least common multiple using the rule:
+
+   ab = [a, b](a, b)
+
+   ... by computing the product, and dividing out the gcd.
+ */
+
+mp_err mp_lcm(mp_int *a, mp_int *b, mp_int *c)
+{
+  mp_int  gcd, prod;
+  mp_err  res;
+
+  ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG);
+
+  /* Set up temporaries */
+  if((res = mp_init(&gcd)) != MP_OKAY)
+    return res;
+  if((res = mp_init(&prod)) != MP_OKAY)
+    goto GCD;
+
+  if((res = mp_mul(a, b, &prod)) != MP_OKAY)
+    goto CLEANUP;
+  if((res = mp_gcd(a, b, &gcd)) != MP_OKAY)
+    goto CLEANUP;
+
+  res = mp_div(&prod, &gcd, c, NULL);
+
+ CLEANUP:
+  mp_clear(&prod);
+ GCD:
+  mp_clear(&gcd);
+
+  return res;
+
+} /* end mp_lcm() */
+
+/* }}} */
+
+/* {{{ mp_xgcd(a, b, g, x, y) */
+
+/*
+  mp_xgcd(a, b, g, x, y)
+
+  Compute g = (a, b) and values x and y satisfying Bezout's identity
+  (that is, ax + by = g).  This uses the extended binary GCD algorithm
+  based on the Stein algorithm used for mp_gcd()
+ */
+
+mp_err mp_xgcd(mp_int *a, mp_int *b, mp_int *g, mp_int *x, mp_int *y)
+{
+  mp_int   gx, xc, yc, u, v, A, B, C, D;
+  mp_int  *clean[9];
+  mp_err   res;
+  int      last = -1;
+
+  if(mp_cmp_z(b) == 0)
+    return MP_RANGE;
+
+  /* Initialize all these variables we need */
+  if((res = mp_init(&u)) != MP_OKAY) goto CLEANUP;
+  clean[++last] = &u;
+  if((res = mp_init(&v)) != MP_OKAY) goto CLEANUP;
+  clean[++last] = &v;
+  if((res = mp_init(&gx)) != MP_OKAY) goto CLEANUP;
+  clean[++last] = &gx;
+  if((res = mp_init(&A)) != MP_OKAY) goto CLEANUP;
+  clean[++last] = &A;
+  if((res = mp_init(&B)) != MP_OKAY) goto CLEANUP;
+  clean[++last] = &B;
+  if((res = mp_init(&C)) != MP_OKAY) goto CLEANUP;
+  clean[++last] = &C;
+  if((res = mp_init(&D)) != MP_OKAY) goto CLEANUP;
+  clean[++last] = &D;
+  if((res = mp_init_copy(&xc, a)) != MP_OKAY) goto CLEANUP;
+  clean[++last] = &xc;
+  mp_abs(&xc, &xc);
+  if((res = mp_init_copy(&yc, b)) != MP_OKAY) goto CLEANUP;
+  clean[++last] = &yc;
+  mp_abs(&yc, &yc);
+
+  mp_set(&gx, 1);
+
+  /* Divide by two until at least one of them is even */
+  while(mp_iseven(&xc) && mp_iseven(&yc)) {
+    s_mp_div_2(&xc);
+    s_mp_div_2(&yc);
+    if((res = s_mp_mul_2(&gx)) != MP_OKAY)
+      goto CLEANUP;
+  }
+
+  mp_copy(&xc, &u);
+  mp_copy(&yc, &v);
+  mp_set(&A, 1); mp_set(&D, 1);
+
+  /* Loop through binary GCD algorithm */
+  for(;;) {
+    while(mp_iseven(&u)) {
+      s_mp_div_2(&u);
+
+      if(mp_iseven(&A) && mp_iseven(&B)) {
+	s_mp_div_2(&A); s_mp_div_2(&B);
+      } else {
+	if((res = mp_add(&A, &yc, &A)) != MP_OKAY) goto CLEANUP;
+	s_mp_div_2(&A);
+	if((res = mp_sub(&B, &xc, &B)) != MP_OKAY) goto CLEANUP;
+	s_mp_div_2(&B);
+      }
+    }
+
+    while(mp_iseven(&v)) {
+      s_mp_div_2(&v);
+
+      if(mp_iseven(&C) && mp_iseven(&D)) {
+	s_mp_div_2(&C); s_mp_div_2(&D);
+      } else {
+	if((res = mp_add(&C, &yc, &C)) != MP_OKAY) goto CLEANUP;
+	s_mp_div_2(&C);
+	if((res = mp_sub(&D, &xc, &D)) != MP_OKAY) goto CLEANUP;
+	s_mp_div_2(&D);
+      }
+    }
+
+    if(mp_cmp(&u, &v) >= 0) {
+      if((res = mp_sub(&u, &v, &u)) != MP_OKAY) goto CLEANUP;
+      if((res = mp_sub(&A, &C, &A)) != MP_OKAY) goto CLEANUP;
+      if((res = mp_sub(&B, &D, &B)) != MP_OKAY) goto CLEANUP;
+
+    } else {
+      if((res = mp_sub(&v, &u, &v)) != MP_OKAY) goto CLEANUP;
+      if((res = mp_sub(&C, &A, &C)) != MP_OKAY) goto CLEANUP;
+      if((res = mp_sub(&D, &B, &D)) != MP_OKAY) goto CLEANUP;
+
+    }
+
+    /* If we're done, copy results to output */
+    if(mp_cmp_z(&u) == 0) {
+      if(x)
+	if((res = mp_copy(&C, x)) != MP_OKAY) goto CLEANUP;
+
+      if(y)
+	if((res = mp_copy(&D, y)) != MP_OKAY) goto CLEANUP;
+      
+      if(g)
+	if((res = mp_mul(&gx, &v, g)) != MP_OKAY) goto CLEANUP;
+
+      break;
+    }
+  }
+
+ CLEANUP:
+  while(last >= 0)
+    mp_clear(clean[last--]);
+
+  return res;
+
+} /* end mp_xgcd() */
+
+/* }}} */
+
+/* {{{ mp_invmod(a, m, c) */
+
+/*
+  mp_invmod(a, m, c)
+
+  Compute c = a^-1 (mod m), if there is an inverse for a (mod m).
+  This is equivalent to the question of whether (a, m) = 1.  If not,
+  MP_UNDEF is returned, and there is no inverse.
+ */
+
+mp_err mp_invmod(mp_int *a, mp_int *m, mp_int *c)
+{
+  mp_int  g, x;
+  mp_err  res;
+
+  ARGCHK(a && m && c, MP_BADARG);
+
+  if(mp_cmp_z(a) == 0 || mp_cmp_z(m) == 0)
+    return MP_RANGE;
+
+  if((res = mp_init(&g)) != MP_OKAY)
+    return res;
+  if((res = mp_init(&x)) != MP_OKAY)
+    goto X;
+
+  if((res = mp_xgcd(a, m, &g, &x, NULL)) != MP_OKAY)
+    goto CLEANUP;
+
+  if(mp_cmp_d(&g, 1) != MP_EQ) {
+    res = MP_UNDEF;
+    goto CLEANUP;
+  }
+
+  res = mp_mod(&x, m, c);
+  SIGN(c) = SIGN(a);
+
+CLEANUP:
+  mp_clear(&x);
+X:
+  mp_clear(&g);
+
+  return res;
+
+} /* end mp_invmod() */
+
+/* }}} */
+#endif /* if MP_NUMTH */
+
+/* }}} */
+
+/*------------------------------------------------------------------------*/
+/* {{{ mp_print(mp, ofp) */
+
+#if MP_IOFUNC
+/*
+  mp_print(mp, ofp)
+
+  Print a textual representation of the given mp_int on the output
+  stream 'ofp'.  Output is generated using the internal radix.
+ */
+
+void   mp_print(mp_int *mp, FILE *ofp)
+{
+  int   ix;
+
+  if(mp == NULL || ofp == NULL)
+    return;
+
+  fputc((SIGN(mp) == MP_NEG) ? '-' : '+', ofp);
+
+  for(ix = USED(mp) - 1; ix >= 0; ix--) {
+    fprintf(ofp, DIGIT_FMT, DIGIT(mp, ix));
+  }
+
+} /* end mp_print() */
+
+#endif /* if MP_IOFUNC */
+
+/* }}} */
+
+/*------------------------------------------------------------------------*/
+/* {{{ More I/O Functions */
+
+/* {{{ mp_read_signed_bin(mp, str, len) */
+
+/* 
+   mp_read_signed_bin(mp, str, len)
+
+   Read in a raw value (base 256) into the given mp_int
+ */
+
+mp_err  mp_read_signed_bin(mp_int *mp, unsigned char *str, int len)
+{
+  mp_err         res;
+
+  ARGCHK(mp != NULL && str != NULL && len > 0, MP_BADARG);
+
+  if((res = mp_read_unsigned_bin(mp, str + 1, len - 1)) == MP_OKAY) {
+    /* Get sign from first byte */
+    if(str[0])
+      SIGN(mp) = MP_NEG;
+    else
+      SIGN(mp) = MP_ZPOS;
+  }
+
+  return res;
+
+} /* end mp_read_signed_bin() */
+
+/* }}} */
+
+/* {{{ mp_signed_bin_size(mp) */
+
+int    mp_signed_bin_size(mp_int *mp)
+{
+  ARGCHK(mp != NULL, 0);
+
+  return mp_unsigned_bin_size(mp) + 1;
+
+} /* end mp_signed_bin_size() */
+
+/* }}} */
+
+/* {{{ mp_to_signed_bin(mp, str) */
+
+mp_err mp_to_signed_bin(mp_int *mp, unsigned char *str)
+{
+  ARGCHK(mp != NULL && str != NULL, MP_BADARG);
+
+  /* Caller responsible for allocating enough memory (use mp_raw_size(mp)) */
+  str[0] = (char)SIGN(mp);
+
+  return mp_to_unsigned_bin(mp, str + 1);
+
+} /* end mp_to_signed_bin() */
+
+/* }}} */
+
+/* {{{ mp_read_unsigned_bin(mp, str, len) */
+
+/*
+  mp_read_unsigned_bin(mp, str, len)
+
+  Read in an unsigned value (base 256) into the given mp_int
+ */
+
+mp_err  mp_read_unsigned_bin(mp_int *mp, unsigned char *str, int len)
+{
+  int     ix;
+  mp_err  res;
+
+  ARGCHK(mp != NULL && str != NULL && len > 0, MP_BADARG);
+
+  mp_zero(mp);
+
+  for(ix = 0; ix < len; ix++) {
+    if((res = s_mp_mul_2d(mp, CHAR_BIT)) != MP_OKAY)
+      return res;
+
+    if((res = mp_add_d(mp, str[ix], mp)) != MP_OKAY)
+      return res;
+  }
+  
+  return MP_OKAY;
+  
+} /* end mp_read_unsigned_bin() */
+
+/* }}} */
+
+/* {{{ mp_unsigned_bin_size(mp) */
+
+int     mp_unsigned_bin_size(mp_int *mp) 
+{
+  mp_digit   topdig;
+  int        count;
+
+  ARGCHK(mp != NULL, 0);
+
+  /* Special case for the value zero */
+  if(USED(mp) == 1 && DIGIT(mp, 0) == 0)
+    return 1;
+
+  count = (USED(mp) - 1) * sizeof(mp_digit);
+  topdig = DIGIT(mp, USED(mp) - 1);
+
+  while(topdig != 0) {
+    ++count;
+    topdig >>= CHAR_BIT;
+  }
+
+  return count;
+
+} /* end mp_unsigned_bin_size() */
+
+/* }}} */
+
+/* {{{ mp_to_unsigned_bin(mp, str) */
+
+mp_err mp_to_unsigned_bin(mp_int *mp, unsigned char *str)
+{
+  mp_digit      *dp, *end, d;
+  unsigned char *spos;
+
+  ARGCHK(mp != NULL && str != NULL, MP_BADARG);
+
+  dp = DIGITS(mp);
+  end = dp + USED(mp) - 1;
+  spos = str;
+
+  /* Special case for zero, quick test */
+  if(dp == end && *dp == 0) {
+    *str = '\0';
+    return MP_OKAY;
+  }
+
+  /* Generate digits in reverse order */
+  while(dp < end) {
+    int      ix;
+
+    d = *dp;
+    for(ix = 0; ix < sizeof(mp_digit); ++ix) {
+      *spos = d & UCHAR_MAX;
+      d >>= CHAR_BIT;
+      ++spos;
+    }
+
+    ++dp;
+  }
+
+  /* Now handle last digit specially, high order zeroes are not written */
+  d = *end;
+  while(d != 0) {
+    *spos = d & UCHAR_MAX;
+    d >>= CHAR_BIT;
+    ++spos;
+  }
+
+  /* Reverse everything to get digits in the correct order */
+  while(--spos > str) {
+    unsigned char t = *str;
+    *str = *spos;
+    *spos = t;
+
+    ++str;
+  }
+
+  return MP_OKAY;
+
+} /* end mp_to_unsigned_bin() */
+
+/* }}} */
+
+/* {{{ mp_count_bits(mp) */
+
+int    mp_count_bits(mp_int *mp)
+{
+  int      len;
+  mp_digit d;
+
+  ARGCHK(mp != NULL, MP_BADARG);
+
+  len = DIGIT_BIT * (USED(mp) - 1);
+  d = DIGIT(mp, USED(mp) - 1);
+
+  while(d != 0) {
+    ++len;
+    d >>= 1;
+  }
+
+  return len;
+  
+} /* end mp_count_bits() */
+
+/* }}} */
+
+/* {{{ mp_read_radix(mp, str, radix) */
+
+/*
+  mp_read_radix(mp, str, radix)
+
+  Read an integer from the given string, and set mp to the resulting
+  value.  The input is presumed to be in base 10.  Leading non-digit
+  characters are ignored, and the function reads until a non-digit
+  character or the end of the string.
+ */
+
+mp_err  mp_read_radix(mp_int *mp, unsigned char *str, int radix)
+{
+  int     ix = 0, val = 0;
+  mp_err  res;
+  mp_sign sig = MP_ZPOS;
+
+  ARGCHK(mp != NULL && str != NULL && radix >= 2 && radix <= MAX_RADIX, 
+	 MP_BADARG);
+
+  mp_zero(mp);
+
+  /* Skip leading non-digit characters until a digit or '-' or '+' */
+  while(str[ix] && 
+	(s_mp_tovalue(str[ix], radix) < 0) && 
+	str[ix] != '-' &&
+	str[ix] != '+') {
+    ++ix;
+  }
+
+  if(str[ix] == '-') {
+    sig = MP_NEG;
+    ++ix;
+  } else if(str[ix] == '+') {
+    sig = MP_ZPOS; /* this is the default anyway... */
+    ++ix;
+  }
+
+  while((val = s_mp_tovalue(str[ix], radix)) >= 0) {
+    if((res = s_mp_mul_d(mp, radix)) != MP_OKAY)
+      return res;
+    if((res = s_mp_add_d(mp, val)) != MP_OKAY)
+      return res;
+    ++ix;
+  }
+
+  if(s_mp_cmp_d(mp, 0) == MP_EQ)
+    SIGN(mp) = MP_ZPOS;
+  else
+    SIGN(mp) = sig;
+
+  return MP_OKAY;
+
+} /* end mp_read_radix() */
+
+/* }}} */
+
+/* {{{ mp_radix_size(mp, radix) */
+
+int    mp_radix_size(mp_int *mp, int radix)
+{
+  int  len;
+  ARGCHK(mp != NULL, 0);
+
+  len = s_mp_outlen(mp_count_bits(mp), radix) + 1; /* for NUL terminator */
+
+  if(mp_cmp_z(mp) < 0)
+    ++len; /* for sign */
+
+  return len;
+
+} /* end mp_radix_size() */
+
+/* }}} */
+
+/* {{{ mp_value_radix_size(num, qty, radix) */
+
+/* num = number of digits
+   qty = number of bits per digit
+   radix = target base
+   
+   Return the number of digits in the specified radix that would be
+   needed to express 'num' digits of 'qty' bits each.
+ */
+int    mp_value_radix_size(int num, int qty, int radix)
+{
+  ARGCHK(num >= 0 && qty > 0 && radix >= 2 && radix <= MAX_RADIX, 0);
+
+  return s_mp_outlen(num * qty, radix);
+
+} /* end mp_value_radix_size() */
+
+/* }}} */
+
+/* {{{ mp_toradix(mp, str, radix) */
+
+mp_err mp_toradix(mp_int *mp, unsigned char *str, int radix)
+{
+  int  ix, pos = 0;
+
+  ARGCHK(mp != NULL && str != NULL, MP_BADARG);
+  ARGCHK(radix > 1 && radix <= MAX_RADIX, MP_RANGE);
+
+  if(mp_cmp_z(mp) == MP_EQ) {
+    str[0] = '0';
+    str[1] = '\0';
+  } else {
+    mp_err   res;
+    mp_int   tmp;
+    mp_sign  sgn;
+    mp_digit rem, rdx = (mp_digit)radix;
+    char     ch;
+
+    if((res = mp_init_copy(&tmp, mp)) != MP_OKAY)
+      return res;
+
+    /* Save sign for later, and take absolute value */
+    sgn = SIGN(&tmp); SIGN(&tmp) = MP_ZPOS;
+
+    /* Generate output digits in reverse order      */
+    while(mp_cmp_z(&tmp) != 0) {
+      if((res = s_mp_div_d(&tmp, rdx, &rem)) != MP_OKAY) {
+	mp_clear(&tmp);
+	return res;
+      }
+
+      /* Generate digits, use capital letters */
+      ch = s_mp_todigit(rem, radix, 0);
+
+      str[pos++] = ch;
+    }
+
+    /* Add - sign if original value was negative */
+    if(sgn == MP_NEG)
+      str[pos++] = '-';
+
+    /* Add trailing NUL to end the string        */
+    str[pos--] = '\0';
+
+    /* Reverse the digits and sign indicator     */
+    ix = 0;
+    while(ix < pos) {
+      char tmp = str[ix];
+
+      str[ix] = str[pos];
+      str[pos] = tmp;
+      ++ix;
+      --pos;
+    }
+    
+    mp_clear(&tmp);
+  }
+
+  return MP_OKAY;
+
+} /* end mp_toradix() */
+
+/* }}} */
+
+/* {{{ mp_char2value(ch, r) */
+
+int    mp_char2value(char ch, int r)
+{
+  return s_mp_tovalue(ch, r);
+
+} /* end mp_tovalue() */
+
+/* }}} */
+
+/* }}} */
+
+/* {{{ mp_strerror(ec) */
+
+/*
+  mp_strerror(ec)
+
+  Return a string describing the meaning of error code 'ec'.  The
+  string returned is allocated in static memory, so the caller should
+  not attempt to modify or free the memory associated with this
+  string.
+ */
+const char  *mp_strerror(mp_err ec)
+{
+  int   aec = (ec < 0) ? -ec : ec;
+
+  /* Code values are negative, so the senses of these comparisons
+     are accurate */
+  if(ec < MP_LAST_CODE || ec > MP_OKAY) {
+    return mp_err_string[0];  /* unknown error code */
+  } else {
+    return mp_err_string[aec + 1];
+  }
+
+} /* end mp_strerror() */
+
+/* }}} */
+
+/*========================================================================*/
+/*------------------------------------------------------------------------*/
+/* Static function definitions (internal use only)                        */
+
+/* {{{ Memory management */
+
+/* {{{ s_mp_grow(mp, min) */
+
+/* Make sure there are at least 'min' digits allocated to mp              */
+mp_err   s_mp_grow(mp_int *mp, mp_size min)
+{
+  if(min > ALLOC(mp)) {
+    mp_digit   *tmp;
+
+    /* Set min to next nearest default precision block size */
+    min = ((min + (s_mp_defprec - 1)) / s_mp_defprec) * s_mp_defprec;
+
+    if((tmp = s_mp_alloc(min, sizeof(mp_digit))) == NULL)
+      return MP_MEM;
+
+    s_mp_copy(DIGITS(mp), tmp, USED(mp));
+
+#if MP_CRYPTO
+    s_mp_setz(DIGITS(mp), ALLOC(mp));
+#endif
+    s_mp_free(DIGITS(mp));
+    DIGITS(mp) = tmp;
+    ALLOC(mp) = min;
+  }
+
+  return MP_OKAY;
+
+} /* end s_mp_grow() */
+
+/* }}} */
+
+/* {{{ s_mp_pad(mp, min) */
+
+/* Make sure the used size of mp is at least 'min', growing if needed     */
+mp_err   s_mp_pad(mp_int *mp, mp_size min)
+{
+  if(min > USED(mp)) {
+    mp_err  res;
+
+    /* Make sure there is room to increase precision  */
+    if(min > ALLOC(mp) && (res = s_mp_grow(mp, min)) != MP_OKAY)
+      return res;
+
+    /* Increase precision; should already be 0-filled */
+    USED(mp) = min;
+  }
+
+  return MP_OKAY;
+
+} /* end s_mp_pad() */
+
+/* }}} */
+
+/* {{{ s_mp_setz(dp, count) */
+
+#if MP_MACRO == 0
+/* Set 'count' digits pointed to by dp to be zeroes                       */
+void s_mp_setz(mp_digit *dp, mp_size count)
+{
+#if MP_MEMSET == 0
+  int  ix;
+
+  for(ix = 0; ix < count; ix++)
+    dp[ix] = 0;
+#else
+  memset(dp, 0, count * sizeof(mp_digit));
+#endif
+
+} /* end s_mp_setz() */
+#endif
+
+/* }}} */
+
+/* {{{ s_mp_copy(sp, dp, count) */
+
+#if MP_MACRO == 0
+/* Copy 'count' digits from sp to dp                                      */
+void s_mp_copy(mp_digit *sp, mp_digit *dp, mp_size count)
+{
+#if MP_MEMCPY == 0
+  int  ix;
+
+  for(ix = 0; ix < count; ix++)
+    dp[ix] = sp[ix];
+#else
+  memcpy(dp, sp, count * sizeof(mp_digit));
+#endif
+
+} /* end s_mp_copy() */
+#endif
+
+/* }}} */
+
+/* {{{ s_mp_alloc(nb, ni) */
+
+#if MP_MACRO == 0
+/* Allocate ni records of nb bytes each, and return a pointer to that     */
+void    *s_mp_alloc(size_t nb, size_t ni)
+{
+  return calloc(nb, ni);
+
+} /* end s_mp_alloc() */
+#endif
+
+/* }}} */
+
+/* {{{ s_mp_free(ptr) */
+
+#if MP_MACRO == 0
+/* Free the memory pointed to by ptr                                      */
+void     s_mp_free(void *ptr)
+{
+  if(ptr)
+    free(ptr);
+
+} /* end s_mp_free() */
+#endif
+
+/* }}} */
+
+/* {{{ s_mp_clamp(mp) */
+
+/* Remove leading zeroes from the given value                             */
+void     s_mp_clamp(mp_int *mp)
+{
+  mp_size   du = USED(mp);
+  mp_digit *zp = DIGITS(mp) + du - 1;
+
+  while(du > 1 && !*zp--)
+    --du;
+
+  USED(mp) = du;
+
+} /* end s_mp_clamp() */
+
+
+/* }}} */
+
+/* {{{ s_mp_exch(a, b) */
+
+/* Exchange the data for a and b; (b, a) = (a, b)                         */
+void     s_mp_exch(mp_int *a, mp_int *b)
+{
+  mp_int   tmp;
+
+  tmp = *a;
+  *a = *b;
+  *b = tmp;
+
+} /* end s_mp_exch() */
+
+/* }}} */
+
+/* }}} */
+
+/* {{{ Arithmetic helpers */
+
+/* {{{ s_mp_lshd(mp, p) */
+
+/* 
+   Shift mp leftward by p digits, growing if needed, and zero-filling
+   the in-shifted digits at the right end.  This is a convenient
+   alternative to multiplication by powers of the radix
+ */   
+
+mp_err   s_mp_lshd(mp_int *mp, mp_size p)
+{
+  mp_err   res;
+  mp_size  pos;
+  mp_digit *dp;
+  int     ix;
+
+  if(p == 0)
+    return MP_OKAY;
+
+  if((res = s_mp_pad(mp, USED(mp) + p)) != MP_OKAY)
+    return res;
+
+  pos = USED(mp) - 1;
+  dp = DIGITS(mp);
+
+  /* Shift all the significant figures over as needed */
+  for(ix = pos - p; ix >= 0; ix--) 
+    dp[ix + p] = dp[ix];
+
+  /* Fill the bottom digits with zeroes */
+  for(ix = 0; ix < p; ix++)
+    dp[ix] = 0;
+
+  return MP_OKAY;
+
+} /* end s_mp_lshd() */
+
+/* }}} */
+
+/* {{{ s_mp_rshd(mp, p) */
+
+/* 
+   Shift mp rightward by p digits.  Maintains the invariant that
+   digits above the precision are all zero.  Digits shifted off the
+   end are lost.  Cannot fail.
+ */
+
+void     s_mp_rshd(mp_int *mp, mp_size p)
+{
+  mp_size  ix;
+  mp_digit *dp;
+
+  if(p == 0)
+    return;
+
+  /* Shortcut when all digits are to be shifted off */
+  if(p >= USED(mp)) {
+    s_mp_setz(DIGITS(mp), ALLOC(mp));
+    USED(mp) = 1;
+    SIGN(mp) = MP_ZPOS;
+    return;
+  }
+
+  /* Shift all the significant figures over as needed */
+  dp = DIGITS(mp);
+  for(ix = p; ix < USED(mp); ix++)
+    dp[ix - p] = dp[ix];
+
+  /* Fill the top digits with zeroes */
+  ix -= p;
+  while(ix < USED(mp))
+    dp[ix++] = 0;
+
+  /* Strip off any leading zeroes    */
+  s_mp_clamp(mp);
+
+} /* end s_mp_rshd() */
+
+/* }}} */
+
+/* {{{ s_mp_div_2(mp) */
+
+/* Divide by two -- take advantage of radix properties to do it fast      */
+void     s_mp_div_2(mp_int *mp)
+{
+  s_mp_div_2d(mp, 1);
+
+} /* end s_mp_div_2() */
+
+/* }}} */
+
+/* {{{ s_mp_mul_2(mp) */
+
+mp_err s_mp_mul_2(mp_int *mp)
+{
+  int      ix;
+  mp_digit kin = 0, kout, *dp = DIGITS(mp);
+  mp_err   res;
+
+  /* Shift digits leftward by 1 bit */
+  for(ix = 0; ix < USED(mp); ix++) {
+    kout = (dp[ix] >> (DIGIT_BIT - 1)) & 1;
+    dp[ix] = (dp[ix] << 1) | kin;
+
+    kin = kout;
+  }
+
+  /* Deal with rollover from last digit */
+  if(kin) {
+    if(ix >= ALLOC(mp)) {
+      if((res = s_mp_grow(mp, ALLOC(mp) + 1)) != MP_OKAY)
+	return res;
+      dp = DIGITS(mp);
+    }
+
+    dp[ix] = kin;
+    USED(mp) += 1;
+  }
+
+  return MP_OKAY;
+
+} /* end s_mp_mul_2() */
+
+/* }}} */
+
+/* {{{ s_mp_mod_2d(mp, d) */
+
+/*
+  Remainder the integer by 2^d, where d is a number of bits.  This
+  amounts to a bitwise AND of the value, and does not require the full
+  division code
+ */
+void     s_mp_mod_2d(mp_int *mp, mp_digit d)
+{
+  unsigned int  ndig = (d / DIGIT_BIT), nbit = (d % DIGIT_BIT);
+  unsigned int  ix;
+  mp_digit      dmask, *dp = DIGITS(mp);
+
+  if(ndig >= USED(mp))
+    return;
+
+  /* Flush all the bits above 2^d in its digit */
+  dmask = (1 << nbit) - 1;
+  dp[ndig] &= dmask;
+
+  /* Flush all digits above the one with 2^d in it */
+  for(ix = ndig + 1; ix < USED(mp); ix++)
+    dp[ix] = 0;
+
+  s_mp_clamp(mp);
+
+} /* end s_mp_mod_2d() */
+
+/* }}} */
+
+/* {{{ s_mp_mul_2d(mp, d) */
+
+/*
+  Multiply by the integer 2^d, where d is a number of bits.  This
+  amounts to a bitwise shift of the value, and does not require the
+  full multiplication code.
+ */
+mp_err    s_mp_mul_2d(mp_int *mp, mp_digit d)
+{
+  mp_err   res;
+  mp_digit save, next, mask, *dp;
+  mp_size  used;
+  int      ix;
+
+  if((res = s_mp_lshd(mp, d / DIGIT_BIT)) != MP_OKAY)
+    return res;
+
+  dp = DIGITS(mp); used = USED(mp);
+  d %= DIGIT_BIT;
+
+  mask = (1 << d) - 1;
+
+  /* If the shift requires another digit, make sure we've got one to
+     work with */
+  if((dp[used - 1] >> (DIGIT_BIT - d)) & mask) {
+    if((res = s_mp_grow(mp, used + 1)) != MP_OKAY)
+      return res;
+    dp = DIGITS(mp);
+  }
+
+  /* Do the shifting... */
+  save = 0;
+  for(ix = 0; ix < used; ix++) {
+    next = (dp[ix] >> (DIGIT_BIT - d)) & mask;
+    dp[ix] = (dp[ix] << d) | save;
+    save = next;
+  }
+
+  /* If, at this point, we have a nonzero carryout into the next
+     digit, we'll increase the size by one digit, and store it...
+   */
+  if(save) {
+    dp[used] = save;
+    USED(mp) += 1;
+  }
+
+  s_mp_clamp(mp);
+  return MP_OKAY;
+
+} /* end s_mp_mul_2d() */
+
+/* }}} */
+
+/* {{{ s_mp_div_2d(mp, d) */
+
+/*
+  Divide the integer by 2^d, where d is a number of bits.  This
+  amounts to a bitwise shift of the value, and does not require the
+  full division code (used in Barrett reduction, see below)
+ */
+void     s_mp_div_2d(mp_int *mp, mp_digit d)
+{
+  int       ix;
+  mp_digit  save, next, mask, *dp = DIGITS(mp);
+
+  s_mp_rshd(mp, d / DIGIT_BIT);
+  d %= DIGIT_BIT;
+
+  mask = (1 << d) - 1;
+
+  save = 0;
+  for(ix = USED(mp) - 1; ix >= 0; ix--) {
+    next = dp[ix] & mask;
+    dp[ix] = (dp[ix] >> d) | (save << (DIGIT_BIT - d));
+    save = next;
+  }
+
+  s_mp_clamp(mp);
+
+} /* end s_mp_div_2d() */
+
+/* }}} */
+
+/* {{{ s_mp_norm(a, b) */
+
+/*
+  s_mp_norm(a, b)
+
+  Normalize a and b for division, where b is the divisor.  In order
+  that we might make good guesses for quotient digits, we want the
+  leading digit of b to be at least half the radix, which we
+  accomplish by multiplying a and b by a constant.  This constant is
+  returned (so that it can be divided back out of the remainder at the
+  end of the division process).
+
+  We multiply by the smallest power of 2 that gives us a leading digit
+  at least half the radix.  By choosing a power of 2, we simplify the 
+  multiplication and division steps to simple shifts.
+ */
+mp_digit s_mp_norm(mp_int *a, mp_int *b)
+{
+  mp_digit  t, d = 0;
+
+  t = DIGIT(b, USED(b) - 1);
+  while(t < (RADIX / 2)) {
+    t <<= 1;
+    ++d;
+  }
+    
+  if(d != 0) {
+    s_mp_mul_2d(a, d);
+    s_mp_mul_2d(b, d);
+  }
+
+  return d;
+
+} /* end s_mp_norm() */
+
+/* }}} */
+
+/* }}} */
+
+/* {{{ Primitive digit arithmetic */
+
+/* {{{ s_mp_add_d(mp, d) */
+
+/* Add d to |mp| in place                                                 */
+mp_err   s_mp_add_d(mp_int *mp, mp_digit d)    /* unsigned digit addition */
+{
+  mp_word   w, k = 0;
+  mp_size   ix = 1, used = USED(mp);
+  mp_digit *dp = DIGITS(mp);
+
+  w = dp[0] + d;
+  dp[0] = ACCUM(w);
+  k = CARRYOUT(w);
+
+  while(ix < used && k) {
+    w = dp[ix] + k;
+    dp[ix] = ACCUM(w);
+    k = CARRYOUT(w);
+    ++ix;
+  }
+
+  if(k != 0) {
+    mp_err  res;
+
+    if((res = s_mp_pad(mp, USED(mp) + 1)) != MP_OKAY)
+      return res;
+
+    DIGIT(mp, ix) = k;
+  }
+
+  return MP_OKAY;
+
+} /* end s_mp_add_d() */
+
+/* }}} */
+
+/* {{{ s_mp_sub_d(mp, d) */
+
+/* Subtract d from |mp| in place, assumes |mp| > d                        */
+mp_err   s_mp_sub_d(mp_int *mp, mp_digit d)    /* unsigned digit subtract */
+{
+  mp_word   w, b = 0;
+  mp_size   ix = 1, used = USED(mp);
+  mp_digit *dp = DIGITS(mp);
+
+  /* Compute initial subtraction    */
+  w = (RADIX + dp[0]) - d;
+  b = CARRYOUT(w) ? 0 : 1;
+  dp[0] = ACCUM(w);
+
+  /* Propagate borrows leftward     */
+  while(b && ix < used) {
+    w = (RADIX + dp[ix]) - b;
+    b = CARRYOUT(w) ? 0 : 1;
+    dp[ix] = ACCUM(w);
+    ++ix;
+  }
+
+  /* Remove leading zeroes          */
+  s_mp_clamp(mp);
+
+  /* If we have a borrow out, it's a violation of the input invariant */
+  if(b)
+    return MP_RANGE;
+  else
+    return MP_OKAY;
+
+} /* end s_mp_sub_d() */
+
+/* }}} */
+
+/* {{{ s_mp_mul_d(a, d) */
+
+/* Compute a = a * d, single digit multiplication                         */
+mp_err   s_mp_mul_d(mp_int *a, mp_digit d)
+{
+  mp_word w, k = 0;
+  mp_size ix, max;
+  mp_err  res;
+  mp_digit *dp = DIGITS(a);
+
+  /*
+    Single-digit multiplication will increase the precision of the
+    output by at most one digit.  However, we can detect when this
+    will happen -- if the high-order digit of a, times d, gives a
+    two-digit result, then the precision of the result will increase;
+    otherwise it won't.  We use this fact to avoid calling s_mp_pad()
+    unless absolutely necessary.
+   */
+  max = USED(a);
+  w = dp[max - 1] * d;
+  if(CARRYOUT(w) != 0) {
+    if((res = s_mp_pad(a, max + 1)) != MP_OKAY)
+      return res;
+    dp = DIGITS(a);
+  }
+
+  for(ix = 0; ix < max; ix++) {
+    w = (dp[ix] * d) + k;
+    dp[ix] = ACCUM(w);
+    k = CARRYOUT(w);
+  }
+
+  /* If there is a precision increase, take care of it here; the above
+     test guarantees we have enough storage to do this safely.
+   */
+  if(k) {
+    dp[max] = k; 
+    USED(a) = max + 1;
+  }
+
+  s_mp_clamp(a);
+
+  return MP_OKAY;
+  
+} /* end s_mp_mul_d() */
+
+/* }}} */
+
+/* {{{ s_mp_div_d(mp, d, r) */
+
+/*
+  s_mp_div_d(mp, d, r)
+
+  Compute the quotient mp = mp / d and remainder r = mp mod d, for a
+  single digit d.  If r is null, the remainder will be discarded.
+ */
+
+mp_err   s_mp_div_d(mp_int *mp, mp_digit d, mp_digit *r)
+{
+  mp_word   w = 0, t;
+  mp_int    quot;
+  mp_err    res;
+  mp_digit *dp = DIGITS(mp), *qp;
+  int       ix;
+
+  if(d == 0)
+    return MP_RANGE;
+
+  /* Make room for the quotient */
+  if((res = mp_init_size(&quot, USED(mp))) != MP_OKAY)
+    return res;
+
+  USED(&quot) = USED(mp); /* so clamping will work below */
+  qp = DIGITS(&quot);
+
+  /* Divide without subtraction */
+  for(ix = USED(mp) - 1; ix >= 0; ix--) {
+    w = (w << DIGIT_BIT) | dp[ix];
+
+    if(w >= d) {
+      t = w / d;
+      w = w % d;
+    } else {
+      t = 0;
+    }
+
+    qp[ix] = t;
+  }
+
+  /* Deliver the remainder, if desired */
+  if(r)
+    *r = w;
+
+  s_mp_clamp(&quot);
+  mp_exch(&quot, mp);
+  mp_clear(&quot);
+
+  return MP_OKAY;
+
+} /* end s_mp_div_d() */
+
+/* }}} */
+
+/* }}} */
+
+/* {{{ Primitive full arithmetic */
+
+/* {{{ s_mp_add(a, b) */
+
+/* Compute a = |a| + |b|                                                  */
+mp_err   s_mp_add(mp_int *a, mp_int *b)        /* magnitude addition      */
+{
+  mp_word   w = 0;
+  mp_digit *pa, *pb;
+  mp_size   ix, used = USED(b);
+  mp_err    res;
+
+  /* Make sure a has enough precision for the output value */
+  if((used > USED(a)) && (res = s_mp_pad(a, used)) != MP_OKAY)
+    return res;
+
+  /*
+    Add up all digits up to the precision of b.  If b had initially
+    the same precision as a, or greater, we took care of it by the
+    padding step above, so there is no problem.  If b had initially
+    less precision, we'll have to make sure the carry out is duly
+    propagated upward among the higher-order digits of the sum.
+   */
+  pa = DIGITS(a);
+  pb = DIGITS(b);
+  for(ix = 0; ix < used; ++ix) {
+    w += *pa + *pb++;
+    *pa++ = ACCUM(w);
+    w = CARRYOUT(w);
+  }
+
+  /* If we run out of 'b' digits before we're actually done, make
+     sure the carries get propagated upward...  
+   */
+  used = USED(a);
+  while(w && ix < used) {
+    w += *pa;
+    *pa++ = ACCUM(w);
+    w = CARRYOUT(w);
+    ++ix;
+  }
+
+  /* If there's an overall carry out, increase precision and include
+     it.  We could have done this initially, but why touch the memory
+     allocator unless we're sure we have to?
+   */
+  if(w) {
+    if((res = s_mp_pad(a, used + 1)) != MP_OKAY)
+      return res;
+
+    DIGIT(a, ix) = w;  /* pa may not be valid after s_mp_pad() call */
+  }
+
+  return MP_OKAY;
+
+} /* end s_mp_add() */
+
+/* }}} */
+
+/* {{{ s_mp_sub(a, b) */
+
+/* Compute a = |a| - |b|, assumes |a| >= |b|                              */
+mp_err   s_mp_sub(mp_int *a, mp_int *b)        /* magnitude subtract      */
+{
+  mp_word   w = 0;
+  mp_digit *pa, *pb;
+  mp_size   ix, used = USED(b);
+
+  /*
+    Subtract and propagate borrow.  Up to the precision of b, this
+    accounts for the digits of b; after that, we just make sure the
+    carries get to the right place.  This saves having to pad b out to
+    the precision of a just to make the loops work right...
+   */
+  pa = DIGITS(a);
+  pb = DIGITS(b);
+
+  for(ix = 0; ix < used; ++ix) {
+    w = (RADIX + *pa) - w - *pb++;
+    *pa++ = ACCUM(w);
+    w = CARRYOUT(w) ? 0 : 1;
+  }
+
+  used = USED(a);
+  while(ix < used) {
+    w = RADIX + *pa - w;
+    *pa++ = ACCUM(w);
+    w = CARRYOUT(w) ? 0 : 1;
+    ++ix;
+  }
+
+  /* Clobber any leading zeroes we created    */
+  s_mp_clamp(a);
+
+  /* 
+     If there was a borrow out, then |b| > |a| in violation
+     of our input invariant.  We've already done the work,
+     but we'll at least complain about it...
+   */
+  if(w)
+    return MP_RANGE;
+  else
+    return MP_OKAY;
+
+} /* end s_mp_sub() */
+
+/* }}} */
+
+mp_err   s_mp_reduce(mp_int *x, mp_int *m, mp_int *mu)
+{
+  mp_int   q;
+  mp_err   res;
+  mp_size  um = USED(m);
+
+  if((res = mp_init_copy(&q, x)) != MP_OKAY)
+    return res;
+
+  s_mp_rshd(&q, um - 1);       /* q1 = x / b^(k-1)  */
+  s_mp_mul(&q, mu);            /* q2 = q1 * mu      */
+  s_mp_rshd(&q, um + 1);       /* q3 = q2 / b^(k+1) */
+
+  /* x = x mod b^(k+1), quick (no division) */
+  s_mp_mod_2d(x, (mp_digit)(DIGIT_BIT * (um + 1)));
+
+  /* q = q * m mod b^(k+1), quick (no division), uses the short multiplier */
+#ifndef SHRT_MUL
+  s_mp_mul(&q, m);
+  s_mp_mod_2d(&q, (mp_digit)(DIGIT_BIT * (um + 1)));
+#else
+  s_mp_mul_dig(&q, m, um + 1);
+#endif  
+
+  /* x = x - q */
+  if((res = mp_sub(x, &q, x)) != MP_OKAY)
+    goto CLEANUP;
+
+  /* If x < 0, add b^(k+1) to it */
+  if(mp_cmp_z(x) < 0) {
+    mp_set(&q, 1);
+    if((res = s_mp_lshd(&q, um + 1)) != MP_OKAY)
+      goto CLEANUP;
+    if((res = mp_add(x, &q, x)) != MP_OKAY)
+      goto CLEANUP;
+  }
+
+  /* Back off if it's too big */
+  while(mp_cmp(x, m) >= 0) {
+    if((res = s_mp_sub(x, m)) != MP_OKAY)
+      break;
+  }
+
+ CLEANUP:
+  mp_clear(&q);
+
+  return res;
+
+} /* end s_mp_reduce() */
+
+
+
+/* {{{ s_mp_mul(a, b) */
+
+/* Compute a = |a| * |b|                                                  */
+mp_err   s_mp_mul(mp_int *a, mp_int *b)
+{
+  mp_word   w, k = 0;
+  mp_int    tmp;
+  mp_err    res;
+  mp_size   ix, jx, ua = USED(a), ub = USED(b);
+  mp_digit *pa, *pb, *pt, *pbt;
+
+  if((res = mp_init_size(&tmp, ua + ub)) != MP_OKAY)
+    return res;
+
+  /* This has the effect of left-padding with zeroes... */
+  USED(&tmp) = ua + ub;
+
+  /* We're going to need the base value each iteration */
+  pbt = DIGITS(&tmp);
+
+  /* Outer loop:  Digits of b */
+
+  pb = DIGITS(b);
+  for(ix = 0; ix < ub; ++ix, ++pb) {
+    if(*pb == 0) 
+      continue;
+
+    /* Inner product:  Digits of a */
+    pa = DIGITS(a);
+    for(jx = 0; jx < ua; ++jx, ++pa) {
+      pt = pbt + ix + jx;
+      w = *pb * *pa + k + *pt;
+      *pt = ACCUM(w);
+      k = CARRYOUT(w);
+    }
+
+    pbt[ix + jx] = k;
+    k = 0;
+  }
+
+  s_mp_clamp(&tmp);
+  s_mp_exch(&tmp, a);
+
+  mp_clear(&tmp);
+
+  return MP_OKAY;
+
+} /* end s_mp_mul() */
+
+/* }}} */
+
+/* {{{ s_mp_kmul(a, b, out, len) */
+
+#if 0
+void   s_mp_kmul(mp_digit *a, mp_digit *b, mp_digit *out, mp_size len)
+{
+  mp_word   w, k = 0;
+  mp_size   ix, jx;
+  mp_digit *pa, *pt;
+
+  for(ix = 0; ix < len; ++ix, ++b) {
+    if(*b == 0)
+      continue;
+    
+    pa = a;
+    for(jx = 0; jx < len; ++jx, ++pa) {
+      pt = out + ix + jx;
+      w = *b * *pa + k + *pt;
+      *pt = ACCUM(w);
+      k = CARRYOUT(w);
+    }
+
+    out[ix + jx] = k;
+    k = 0;
+  }
+
+} /* end s_mp_kmul() */
+#endif
+
+/* }}} */
+
+/* {{{ s_mp_sqr(a) */
+
+/*
+  Computes the square of a, in place.  This can be done more
+  efficiently than a general multiplication, because many of the
+  computation steps are redundant when squaring.  The inner product
+  step is a bit more complicated, but we save a fair number of
+  iterations of the multiplication loop.
+ */
+#if MP_SQUARE
+mp_err   s_mp_sqr(mp_int *a)
+{
+  mp_word  w, k = 0;
+  mp_int   tmp;
+  mp_err   res;
+  mp_size  ix, jx, kx, used = USED(a);
+  mp_digit *pa1, *pa2, *pt, *pbt;
+
+  if((res = mp_init_size(&tmp, 2 * used)) != MP_OKAY)
+    return res;
+
+  /* Left-pad with zeroes */
+  USED(&tmp) = 2 * used;
+
+  /* We need the base value each time through the loop */
+  pbt = DIGITS(&tmp);
+
+  pa1 = DIGITS(a);
+  for(ix = 0; ix < used; ++ix, ++pa1) {
+    if(*pa1 == 0)
+      continue;
+
+    w = DIGIT(&tmp, ix + ix) + (*pa1 * *pa1);
+
+    pbt[ix + ix] = ACCUM(w);
+    k = CARRYOUT(w);
+
+    /*
+      The inner product is computed as:
+
+         (C, S) = t[i,j] + 2 a[i] a[j] + C
+
+      This can overflow what can be represented in an mp_word, and
+      since C arithmetic does not provide any way to check for
+      overflow, we have to check explicitly for overflow conditions
+      before they happen.
+     */
+    for(jx = ix + 1, pa2 = DIGITS(a) + jx; jx < used; ++jx, ++pa2) {
+      mp_word  u = 0, v;
+      
+      /* Store this in a temporary to avoid indirections later */
+      pt = pbt + ix + jx;
+
+      /* Compute the multiplicative step */
+      w = *pa1 * *pa2;
+
+      /* If w is more than half MP_WORD_MAX, the doubling will
+	 overflow, and we need to record a carry out into the next
+	 word */
+      u = (w >> (MP_WORD_BIT - 1)) & 1;
+
+      /* Double what we've got, overflow will be ignored as defined
+	 for C arithmetic (we've already noted if it is to occur)
+       */
+      w *= 2;
+
+      /* Compute the additive step */
+      v = *pt + k;
+
+      /* If we do not already have an overflow carry, check to see
+	 if the addition will cause one, and set the carry out if so 
+       */
+      u |= ((MP_WORD_MAX - v) < w);
+
+      /* Add in the rest, again ignoring overflow */
+      w += v;
+
+      /* Set the i,j digit of the output */
+      *pt = ACCUM(w);
+
+      /* Save carry information for the next iteration of the loop.
+	 This is why k must be an mp_word, instead of an mp_digit */
+      k = CARRYOUT(w) | (u << DIGIT_BIT);
+
+    } /* for(jx ...) */
+
+    /* Set the last digit in the cycle and reset the carry */
+    k = DIGIT(&tmp, ix + jx) + k;
+    pbt[ix + jx] = ACCUM(k);
+    k = CARRYOUT(k);
+
+    /* If we are carrying out, propagate the carry to the next digit
+       in the output.  This may cascade, so we have to be somewhat
+       circumspect -- but we will have enough precision in the output
+       that we won't overflow 
+     */
+    kx = 1;
+    while(k) {
+      k = pbt[ix + jx + kx] + 1;
+      pbt[ix + jx + kx] = ACCUM(k);
+      k = CARRYOUT(k);
+      ++kx;
+    }
+  } /* for(ix ...) */
+
+  s_mp_clamp(&tmp);
+  s_mp_exch(&tmp, a);
+
+  mp_clear(&tmp);
+
+  return MP_OKAY;
+
+} /* end s_mp_sqr() */
+#endif
+
+/* }}} */
+
+/* {{{ s_mp_div(a, b) */
+
+/*
+  s_mp_div(a, b)
+
+  Compute a = a / b and b = a mod b.  Assumes b > a.
+ */
+
+mp_err   s_mp_div(mp_int *a, mp_int *b)
+{
+  mp_int   quot, rem, t;
+  mp_word  q;
+  mp_err   res;
+  mp_digit d;
+  int      ix;
+
+  if(mp_cmp_z(b) == 0)
+    return MP_RANGE;
+
+  /* Shortcut if b is power of two */
+  if((ix = s_mp_ispow2(b)) >= 0) {
+    mp_copy(a, b);  /* need this for remainder */
+    s_mp_div_2d(a, (mp_digit)ix);
+    s_mp_mod_2d(b, (mp_digit)ix);
+
+    return MP_OKAY;
+  }
+
+  /* Allocate space to store the quotient */
+  if((res = mp_init_size(&quot, USED(a))) != MP_OKAY)
+    return res;
+
+  /* A working temporary for division     */
+  if((res = mp_init_size(&t, USED(a))) != MP_OKAY)
+    goto T;
+
+  /* Allocate space for the remainder     */
+  if((res = mp_init_size(&rem, USED(a))) != MP_OKAY)
+    goto REM;
+
+  /* Normalize to optimize guessing       */
+  d = s_mp_norm(a, b);
+
+  /* Perform the division itself...woo!   */
+  ix = USED(a) - 1;
+
+  while(ix >= 0) {
+    /* Find a partial substring of a which is at least b */
+    while(s_mp_cmp(&rem, b) < 0 && ix >= 0) {
+      if((res = s_mp_lshd(&rem, 1)) != MP_OKAY) 
+	goto CLEANUP;
+
+      if((res = s_mp_lshd(&quot, 1)) != MP_OKAY)
+	goto CLEANUP;
+
+      DIGIT(&rem, 0) = DIGIT(a, ix);
+      s_mp_clamp(&rem);
+      --ix;
+    }
+
+    /* If we didn't find one, we're finished dividing    */
+    if(s_mp_cmp(&rem, b) < 0) 
+      break;    
+
+    /* Compute a guess for the next quotient digit       */
+    q = DIGIT(&rem, USED(&rem) - 1);
+    if(q <= DIGIT(b, USED(b) - 1) && USED(&rem) > 1)
+      q = (q << DIGIT_BIT) | DIGIT(&rem, USED(&rem) - 2);
+
+    q /= DIGIT(b, USED(b) - 1);
+
+    /* The guess can be as much as RADIX + 1 */
+    if(q >= RADIX)
+      q = RADIX - 1;
+
+    /* See what that multiplies out to                   */
+    mp_copy(b, &t);
+    if((res = s_mp_mul_d(&t, q)) != MP_OKAY)
+      goto CLEANUP;
+
+    /* 
+       If it's too big, back it off.  We should not have to do this
+       more than once, or, in rare cases, twice.  Knuth describes a
+       method by which this could be reduced to a maximum of once, but
+       I didn't implement that here.
+     */
+    while(s_mp_cmp(&t, &rem) > 0) {
+      --q;
+      s_mp_sub(&t, b);
+    }
+
+    /* At this point, q should be the right next digit   */
+    if((res = s_mp_sub(&rem, &t)) != MP_OKAY)
+      goto CLEANUP;
+
+    /*
+      Include the digit in the quotient.  We allocated enough memory
+      for any quotient we could ever possibly get, so we should not
+      have to check for failures here
+     */
+    DIGIT(&quot, 0) = q;
+  }
+
+  /* Denormalize remainder                */
+  if(d != 0) 
+    s_mp_div_2d(&rem, d);
+
+  s_mp_clamp(&quot);
+  s_mp_clamp(&rem);
+
+  /* Copy quotient back to output         */
+  s_mp_exch(&quot, a);
+  
+  /* Copy remainder back to output        */
+  s_mp_exch(&rem, b);
+
+CLEANUP:
+  mp_clear(&rem);
+REM:
+  mp_clear(&t);
+T:
+  mp_clear(&quot);
+
+  return res;
+
+} /* end s_mp_div() */
+
+/* }}} */
+
+/* {{{ s_mp_2expt(a, k) */
+
+mp_err   s_mp_2expt(mp_int *a, mp_digit k)
+{
+  mp_err    res;
+  mp_size   dig, bit;
+
+  dig = k / DIGIT_BIT;
+  bit = k % DIGIT_BIT;
+
+  mp_zero(a);
+  if((res = s_mp_pad(a, dig + 1)) != MP_OKAY)
+    return res;
+  
+  DIGIT(a, dig) |= (1 << bit);
+
+  return MP_OKAY;
+
+} /* end s_mp_2expt() */
+
+/* }}} */
+
+
+/* }}} */
+
+/* }}} */
+
+/* {{{ Primitive comparisons */
+
+/* {{{ s_mp_cmp(a, b) */
+
+/* Compare |a| <=> |b|, return 0 if equal, <0 if a<b, >0 if a>b           */
+int      s_mp_cmp(mp_int *a, mp_int *b)
+{
+  mp_size   ua = USED(a), ub = USED(b);
+
+  if(ua > ub)
+    return MP_GT;
+  else if(ua < ub)
+    return MP_LT;
+  else {
+    int      ix = ua - 1;
+    mp_digit *ap = DIGITS(a) + ix, *bp = DIGITS(b) + ix;
+
+    while(ix >= 0) {
+      if(*ap > *bp)
+	return MP_GT;
+      else if(*ap < *bp)
+	return MP_LT;
+
+      --ap; --bp; --ix;
+    }
+
+    return MP_EQ;
+  }
+
+} /* end s_mp_cmp() */
+
+/* }}} */
+
+/* {{{ s_mp_cmp_d(a, d) */
+
+/* Compare |a| <=> d, return 0 if equal, <0 if a<d, >0 if a>d             */
+int      s_mp_cmp_d(mp_int *a, mp_digit d)
+{
+  mp_size  ua = USED(a);
+  mp_digit *ap = DIGITS(a);
+
+  if(ua > 1)
+    return MP_GT;
+
+  if(*ap < d) 
+    return MP_LT;
+  else if(*ap > d)
+    return MP_GT;
+  else
+    return MP_EQ;
+
+} /* end s_mp_cmp_d() */
+
+/* }}} */
+
+/* {{{ s_mp_ispow2(v) */
+
+/*
+  Returns -1 if the value is not a power of two; otherwise, it returns
+  k such that v = 2^k, i.e. lg(v).
+ */
+int      s_mp_ispow2(mp_int *v)
+{
+  mp_digit d, *dp;
+  mp_size  uv = USED(v);
+  int      extra = 0, ix;
+
+  d = DIGIT(v, uv - 1); /* most significant digit of v */
+
+  while(d && ((d & 1) == 0)) {
+    d >>= 1;
+    ++extra;
+  }
+
+  if(d == 1) {
+    ix = uv - 2;
+    dp = DIGITS(v) + ix;
+
+    while(ix >= 0) {
+      if(*dp)
+	return -1; /* not a power of two */
+
+      --dp; --ix;
+    }
+
+    return ((uv - 1) * DIGIT_BIT) + extra;
+  } 
+
+  return -1;
+
+} /* end s_mp_ispow2() */
+
+/* }}} */
+
+/* {{{ s_mp_ispow2d(d) */
+
+int      s_mp_ispow2d(mp_digit d)
+{
+  int   pow = 0;
+
+  while((d & 1) == 0) {
+    ++pow; d >>= 1;
+  }
+
+  if(d == 1)
+    return pow;
+
+  return -1;
+
+} /* end s_mp_ispow2d() */
+
+/* }}} */
+
+/* }}} */
+
+/* {{{ Primitive I/O helpers */
+
+/* {{{ s_mp_tovalue(ch, r) */
+
+/*
+  Convert the given character to its digit value, in the given radix.
+  If the given character is not understood in the given radix, -1 is
+  returned.  Otherwise the digit's numeric value is returned.
+
+  The results will be odd if you use a radix < 2 or > 62, you are
+  expected to know what you're up to.
+ */
+int      s_mp_tovalue(char ch, int r)
+{
+  int    val, xch;
+  
+  if(r > 36)
+    xch = ch;
+  else
+    xch = toupper(ch);
+
+  if(isdigit(xch))
+    val = xch - '0';
+  else if(isupper(xch))
+    val = xch - 'A' + 10;
+  else if(islower(xch))
+    val = xch - 'a' + 36;
+  else if(xch == '+')
+    val = 62;
+  else if(xch == '/')
+    val = 63;
+  else 
+    return -1;
+
+  if(val < 0 || val >= r)
+    return -1;
+
+  return val;
+
+} /* end s_mp_tovalue() */
+
+/* }}} */
+
+/* {{{ s_mp_todigit(val, r, low) */
+
+/*
+  Convert val to a radix-r digit, if possible.  If val is out of range
+  for r, returns zero.  Otherwise, returns an ASCII character denoting
+  the value in the given radix.
+
+  The results may be odd if you use a radix < 2 or > 64, you are
+  expected to know what you're doing.
+ */
+  
+char     s_mp_todigit(int val, int r, int low)
+{
+  char   ch;
+
+  if(val < 0 || val >= r)
+    return 0;
+
+  ch = s_dmap_1[val];
+
+  if(r <= 36 && low)
+    ch = tolower(ch);
+
+  return ch;
+
+} /* end s_mp_todigit() */
+
+/* }}} */
+
+/* {{{ s_mp_outlen(bits, radix) */
+
+/* 
+   Return an estimate for how long a string is needed to hold a radix
+   r representation of a number with 'bits' significant bits.
+
+   Does not include space for a sign or a NUL terminator.
+ */
+int      s_mp_outlen(int bits, int r)
+{
+  return (int)((double)bits * LOG_V_2(r));
+
+} /* end s_mp_outlen() */
+
+/* }}} */
+
+/* }}} */
+
+/*------------------------------------------------------------------------*/
+/* HERE THERE BE DRAGONS                                                  */
+/* crc==4242132123, version==2, Sat Feb 02 06:43:52 2002 */
diff --git a/libtommath/mtest/mpi.h b/libtommath/mtest/mpi.h
new file mode 100644
index 0000000..f7a3d14
--- /dev/null
+++ b/libtommath/mtest/mpi.h
@@ -0,0 +1,227 @@
+/*
+    mpi.h
+
+    by Michael J. Fromberger <sting@linguist.dartmouth.edu>
+    Copyright (C) 1998 Michael J. Fromberger, All Rights Reserved
+
+    Arbitrary precision integer arithmetic library
+
+    $Id: mpi.h,v 1.15 2001/09/17 14:16:22 sting Exp $
+ */
+
+#ifndef _H_MPI_
+#define _H_MPI_
+
+#include "mpi-config.h"
+
+#define  MP_LT       -1
+#define  MP_EQ        0
+#define  MP_GT        1
+
+#if MP_DEBUG
+#undef MP_IOFUNC
+#define MP_IOFUNC 1
+#endif
+
+#if MP_IOFUNC
+#include <stdio.h>
+#include <ctype.h>
+#endif
+
+#include <limits.h>
+
+#define  MP_NEG  1
+#define  MP_ZPOS 0
+
+/* Included for compatibility... */
+#define  NEG     MP_NEG
+#define  ZPOS    MP_ZPOS
+
+#define  MP_OKAY          0 /* no error, all is well */
+#define  MP_YES           0 /* yes (boolean result)  */
+#define  MP_NO           -1 /* no (boolean result)   */
+#define  MP_MEM          -2 /* out of memory         */
+#define  MP_RANGE        -3 /* argument out of range */
+#define  MP_BADARG       -4 /* invalid parameter     */
+#define  MP_UNDEF        -5 /* answer is undefined   */
+#define  MP_LAST_CODE    MP_UNDEF
+
+#include "mpi-types.h"
+
+/* Included for compatibility... */
+#define DIGIT_BIT         MP_DIGIT_BIT
+#define DIGIT_MAX         MP_DIGIT_MAX
+
+/* Macros for accessing the mp_int internals           */
+#define  SIGN(MP)     ((MP)->sign)
+#define  USED(MP)     ((MP)->used)
+#define  ALLOC(MP)    ((MP)->alloc)
+#define  DIGITS(MP)   ((MP)->dp)
+#define  DIGIT(MP,N)  (MP)->dp[(N)]
+
+#if MP_ARGCHK == 1
+#define  ARGCHK(X,Y)  {if(!(X)){return (Y);}}
+#elif MP_ARGCHK == 2
+#include <assert.h>
+#define  ARGCHK(X,Y)  assert(X)
+#else
+#define  ARGCHK(X,Y)  /*  */
+#endif
+
+/* This defines the maximum I/O base (minimum is 2)   */
+#define MAX_RADIX         64
+
+typedef struct {
+  mp_sign       sign;    /* sign of this quantity      */
+  mp_size       alloc;   /* how many digits allocated  */
+  mp_size       used;    /* how many digits used       */
+  mp_digit     *dp;      /* the digits themselves      */
+} mp_int;
+
+/*------------------------------------------------------------------------*/
+/* Default precision                                                      */
+
+unsigned int mp_get_prec(void);
+void         mp_set_prec(unsigned int prec);
+
+/*------------------------------------------------------------------------*/
+/* Memory management                                                      */
+
+mp_err mp_init(mp_int *mp);
+mp_err mp_init_array(mp_int mp[], int count);
+mp_err mp_init_size(mp_int *mp, mp_size prec);
+mp_err mp_init_copy(mp_int *mp, mp_int *from);
+mp_err mp_copy(mp_int *from, mp_int *to);
+void   mp_exch(mp_int *mp1, mp_int *mp2);
+void   mp_clear(mp_int *mp);
+void   mp_clear_array(mp_int mp[], int count);
+void   mp_zero(mp_int *mp);
+void   mp_set(mp_int *mp, mp_digit d);
+mp_err mp_set_int(mp_int *mp, long z);
+mp_err mp_shrink(mp_int *a);
+
+
+/*------------------------------------------------------------------------*/
+/* Single digit arithmetic                                                */
+
+mp_err mp_add_d(mp_int *a, mp_digit d, mp_int *b);
+mp_err mp_sub_d(mp_int *a, mp_digit d, mp_int *b);
+mp_err mp_mul_d(mp_int *a, mp_digit d, mp_int *b);
+mp_err mp_mul_2(mp_int *a, mp_int *c);
+mp_err mp_div_d(mp_int *a, mp_digit d, mp_int *q, mp_digit *r);
+mp_err mp_div_2(mp_int *a, mp_int *c);
+mp_err mp_expt_d(mp_int *a, mp_digit d, mp_int *c);
+
+/*------------------------------------------------------------------------*/
+/* Sign manipulations                                                     */
+
+mp_err mp_abs(mp_int *a, mp_int *b);
+mp_err mp_neg(mp_int *a, mp_int *b);
+
+/*------------------------------------------------------------------------*/
+/* Full arithmetic                                                        */
+
+mp_err mp_add(mp_int *a, mp_int *b, mp_int *c);
+mp_err mp_sub(mp_int *a, mp_int *b, mp_int *c);
+mp_err mp_mul(mp_int *a, mp_int *b, mp_int *c);
+mp_err mp_mul_2d(mp_int *a, mp_digit d, mp_int *c);
+#if MP_SQUARE
+mp_err mp_sqr(mp_int *a, mp_int *b);
+#else
+#define mp_sqr(a, b) mp_mul(a, a, b)
+#endif
+mp_err mp_div(mp_int *a, mp_int *b, mp_int *q, mp_int *r);
+mp_err mp_div_2d(mp_int *a, mp_digit d, mp_int *q, mp_int *r);
+mp_err mp_expt(mp_int *a, mp_int *b, mp_int *c);
+mp_err mp_2expt(mp_int *a, mp_digit k);
+mp_err mp_sqrt(mp_int *a, mp_int *b);
+
+/*------------------------------------------------------------------------*/
+/* Modular arithmetic                                                     */
+
+#if MP_MODARITH
+mp_err mp_mod(mp_int *a, mp_int *m, mp_int *c);
+mp_err mp_mod_d(mp_int *a, mp_digit d, mp_digit *c);
+mp_err mp_addmod(mp_int *a, mp_int *b, mp_int *m, mp_int *c);
+mp_err mp_submod(mp_int *a, mp_int *b, mp_int *m, mp_int *c);
+mp_err mp_mulmod(mp_int *a, mp_int *b, mp_int *m, mp_int *c);
+#if MP_SQUARE
+mp_err mp_sqrmod(mp_int *a, mp_int *m, mp_int *c);
+#else
+#define mp_sqrmod(a, m, c) mp_mulmod(a, a, m, c)
+#endif
+mp_err mp_exptmod(mp_int *a, mp_int *b, mp_int *m, mp_int *c);
+mp_err mp_exptmod_d(mp_int *a, mp_digit d, mp_int *m, mp_int *c);
+#endif /* MP_MODARITH */
+
+/*------------------------------------------------------------------------*/
+/* Comparisons                                                            */
+
+int    mp_cmp_z(mp_int *a);
+int    mp_cmp_d(mp_int *a, mp_digit d);
+int    mp_cmp(mp_int *a, mp_int *b);
+int    mp_cmp_mag(mp_int *a, mp_int *b);
+int    mp_cmp_int(mp_int *a, long z);
+int    mp_isodd(mp_int *a);
+int    mp_iseven(mp_int *a);
+
+/*------------------------------------------------------------------------*/
+/* Number theoretic                                                       */
+
+#if MP_NUMTH
+mp_err mp_gcd(mp_int *a, mp_int *b, mp_int *c);
+mp_err mp_lcm(mp_int *a, mp_int *b, mp_int *c);
+mp_err mp_xgcd(mp_int *a, mp_int *b, mp_int *g, mp_int *x, mp_int *y);
+mp_err mp_invmod(mp_int *a, mp_int *m, mp_int *c);
+#endif /* end MP_NUMTH */
+
+/*------------------------------------------------------------------------*/
+/* Input and output                                                       */
+
+#if MP_IOFUNC
+void   mp_print(mp_int *mp, FILE *ofp);
+#endif /* end MP_IOFUNC */
+
+/*------------------------------------------------------------------------*/
+/* Base conversion                                                        */
+
+#define BITS     1
+#define BYTES    CHAR_BIT
+
+mp_err mp_read_signed_bin(mp_int *mp, unsigned char *str, int len);
+int    mp_signed_bin_size(mp_int *mp);
+mp_err mp_to_signed_bin(mp_int *mp, unsigned char *str);
+
+mp_err mp_read_unsigned_bin(mp_int *mp, unsigned char *str, int len);
+int    mp_unsigned_bin_size(mp_int *mp);
+mp_err mp_to_unsigned_bin(mp_int *mp, unsigned char *str);
+
+int    mp_count_bits(mp_int *mp);
+
+#if MP_COMPAT_MACROS
+#define mp_read_raw(mp, str, len) mp_read_signed_bin((mp), (str), (len))
+#define mp_raw_size(mp)           mp_signed_bin_size(mp)
+#define mp_toraw(mp, str)         mp_to_signed_bin((mp), (str))
+#define mp_read_mag(mp, str, len) mp_read_unsigned_bin((mp), (str), (len))
+#define mp_mag_size(mp)           mp_unsigned_bin_size(mp)
+#define mp_tomag(mp, str)         mp_to_unsigned_bin((mp), (str))
+#endif
+
+mp_err mp_read_radix(mp_int *mp, unsigned char *str, int radix);
+int    mp_radix_size(mp_int *mp, int radix);
+int    mp_value_radix_size(int num, int qty, int radix);
+mp_err mp_toradix(mp_int *mp, unsigned char *str, int radix);
+
+int    mp_char2value(char ch, int r);
+
+#define mp_tobinary(M, S)  mp_toradix((M), (S), 2)
+#define mp_tooctal(M, S)   mp_toradix((M), (S), 8)
+#define mp_todecimal(M, S) mp_toradix((M), (S), 10)
+#define mp_tohex(M, S)     mp_toradix((M), (S), 16)
+
+/*------------------------------------------------------------------------*/
+/* Error strings                                                          */
+
+const  char  *mp_strerror(mp_err ec);
+
+#endif /* end _H_MPI_ */
diff --git a/libtommath/mtest/mtest.c b/libtommath/mtest/mtest.c
new file mode 100644
index 0000000..d46f456
--- /dev/null
+++ b/libtommath/mtest/mtest.c
@@ -0,0 +1,304 @@
+/* makes a bignum test harness with NUM tests per operation
+ *
+ * the output is made in the following format [one parameter per line]
+
+operation
+operand1
+operand2
+[... operandN]
+result1
+result2
+[... resultN]
+
+So for example "a * b mod n" would be
+
+mulmod
+a
+b
+n
+a*b mod n
+
+e.g. if a=3, b=4 n=11 then
+
+mulmod
+3
+4
+11
+1
+
+ */
+
+#ifdef MP_8BIT
+#define THE_MASK 127
+#else
+#define THE_MASK 32767
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include "mpi.c"
+
+FILE *rng;
+
+void rand_num(mp_int *a)
+{
+   int n, size;
+   unsigned char buf[2048];
+
+   size = 1 + ((fgetc(rng)<<8) + fgetc(rng)) % 101;
+   buf[0] = (fgetc(rng)&1)?1:0;
+   fread(buf+1, 1, size, rng);
+   while (buf[1] == 0) buf[1] = fgetc(rng);
+   mp_read_raw(a, buf, 1+size);
+}
+
+void rand_num2(mp_int *a)
+{
+   int n, size;
+   unsigned char buf[2048];
+
+   size = 10 + ((fgetc(rng)<<8) + fgetc(rng)) % 101;
+   buf[0] = (fgetc(rng)&1)?1:0;
+   fread(buf+1, 1, size, rng);
+   while (buf[1] == 0) buf[1] = fgetc(rng);
+   mp_read_raw(a, buf, 1+size);
+}
+
+#define mp_to64(a, b) mp_toradix(a, b, 64)
+
+int main(void)
+{
+   int n, tmp;
+   mp_int a, b, c, d, e;
+   clock_t t1;
+   char buf[4096];
+
+   mp_init(&a);
+   mp_init(&b);
+   mp_init(&c);
+   mp_init(&d);
+   mp_init(&e);
+
+
+   /* initial (2^n - 1)^2 testing, makes sure the comba multiplier works [it has the new carry code] */
+/*
+   mp_set(&a, 1);
+   for (n = 1; n < 8192; n++) {
+       mp_mul(&a, &a, &c);
+       printf("mul\n");
+       mp_to64(&a, buf);
+       printf("%s\n%s\n", buf, buf);
+       mp_to64(&c, buf);
+       printf("%s\n", buf);
+
+       mp_add_d(&a, 1, &a);
+       mp_mul_2(&a, &a);
+       mp_sub_d(&a, 1, &a);
+   }
+*/
+
+   rng = fopen("/dev/urandom", "rb");
+   if (rng == NULL) {
+      rng = fopen("/dev/random", "rb");
+      if (rng == NULL) {
+         fprintf(stderr, "\nWarning:  stdin used as random source\n\n");
+         rng = stdin;
+      }
+   }
+
+   t1 = clock();
+   for (;;) {
+#if 0
+      if (clock() - t1 > CLOCKS_PER_SEC) {
+         sleep(2);
+         t1 = clock();
+      }
+#endif
+       n = fgetc(rng) % 15;
+
+   if (n == 0) {
+       /* add tests */
+       rand_num(&a);
+       rand_num(&b);
+       mp_add(&a, &b, &c);
+       printf("add\n");
+       mp_to64(&a, buf);
+       printf("%s\n", buf);
+       mp_to64(&b, buf);
+       printf("%s\n", buf);
+       mp_to64(&c, buf);
+       printf("%s\n", buf);
+   } else if (n == 1) {
+      /* sub tests */
+       rand_num(&a);
+       rand_num(&b);
+       mp_sub(&a, &b, &c);
+       printf("sub\n");
+       mp_to64(&a, buf);
+       printf("%s\n", buf);
+       mp_to64(&b, buf);
+       printf("%s\n", buf);
+       mp_to64(&c, buf);
+       printf("%s\n", buf);
+   } else if (n == 2) {
+       /* mul tests */
+       rand_num(&a);
+       rand_num(&b);
+       mp_mul(&a, &b, &c);
+       printf("mul\n");
+       mp_to64(&a, buf);
+       printf("%s\n", buf);
+       mp_to64(&b, buf);
+       printf("%s\n", buf);
+       mp_to64(&c, buf);
+       printf("%s\n", buf);
+   } else if (n == 3) {
+      /* div tests */
+       rand_num(&a);
+       rand_num(&b);
+       mp_div(&a, &b, &c, &d);
+       printf("div\n");
+       mp_to64(&a, buf);
+       printf("%s\n", buf);
+       mp_to64(&b, buf);
+       printf("%s\n", buf);
+       mp_to64(&c, buf);
+       printf("%s\n", buf);
+       mp_to64(&d, buf);
+       printf("%s\n", buf);
+   } else if (n == 4) {
+      /* sqr tests */
+       rand_num(&a);
+       mp_sqr(&a, &b);
+       printf("sqr\n");
+       mp_to64(&a, buf);
+       printf("%s\n", buf);
+       mp_to64(&b, buf);
+       printf("%s\n", buf);
+   } else if (n == 5) {
+      /* mul_2d test */
+      rand_num(&a);
+      mp_copy(&a, &b);
+      n = fgetc(rng) & 63;
+      mp_mul_2d(&b, n, &b);
+      mp_to64(&a, buf);
+      printf("mul2d\n");
+      printf("%s\n", buf);
+      printf("%d\n", n);
+      mp_to64(&b, buf);
+      printf("%s\n", buf);
+   } else if (n == 6) {
+      /* div_2d test */
+      rand_num(&a);
+      mp_copy(&a, &b);
+      n = fgetc(rng) & 63;
+      mp_div_2d(&b, n, &b, NULL);
+      mp_to64(&a, buf);
+      printf("div2d\n");
+      printf("%s\n", buf);
+      printf("%d\n", n);
+      mp_to64(&b, buf);
+      printf("%s\n", buf);
+   } else if (n == 7) {
+      /* gcd test */
+      rand_num(&a);
+      rand_num(&b);
+      a.sign = MP_ZPOS;
+      b.sign = MP_ZPOS;
+      mp_gcd(&a, &b, &c);
+      printf("gcd\n");
+      mp_to64(&a, buf);
+      printf("%s\n", buf);
+      mp_to64(&b, buf);
+      printf("%s\n", buf);
+      mp_to64(&c, buf);
+      printf("%s\n", buf);
+   } else if (n == 8) {
+      /* lcm test */
+      rand_num(&a);
+      rand_num(&b);
+      a.sign = MP_ZPOS;
+      b.sign = MP_ZPOS;
+      mp_lcm(&a, &b, &c);
+      printf("lcm\n");
+      mp_to64(&a, buf);
+      printf("%s\n", buf);
+      mp_to64(&b, buf);
+      printf("%s\n", buf);
+      mp_to64(&c, buf);
+      printf("%s\n", buf);
+   } else if (n == 9) {
+      /* exptmod test */
+      rand_num2(&a);
+      rand_num2(&b);
+      rand_num2(&c);
+//      if (c.dp[0]&1) mp_add_d(&c, 1, &c);
+      a.sign = b.sign = c.sign = 0;
+      mp_exptmod(&a, &b, &c, &d);
+      printf("expt\n");
+      mp_to64(&a, buf);
+      printf("%s\n", buf);
+      mp_to64(&b, buf);
+      printf("%s\n", buf);
+      mp_to64(&c, buf);
+      printf("%s\n", buf);
+      mp_to64(&d, buf);
+      printf("%s\n", buf);
+   } else if (n == 10) {
+      /* invmod test */
+      rand_num2(&a);
+      rand_num2(&b);
+      b.sign = MP_ZPOS;
+      a.sign = MP_ZPOS;
+      mp_gcd(&a, &b, &c);
+      if (mp_cmp_d(&c, 1) != 0) continue;
+      if (mp_cmp_d(&b, 1) == 0) continue;
+      mp_invmod(&a, &b, &c);
+      printf("invmod\n");
+      mp_to64(&a, buf);
+      printf("%s\n", buf);
+      mp_to64(&b, buf);
+      printf("%s\n", buf);
+      mp_to64(&c, buf);
+      printf("%s\n", buf);
+   } else if (n == 11) {
+      rand_num(&a);
+      mp_mul_2(&a, &a);
+      mp_div_2(&a, &b);
+      printf("div2\n");
+      mp_to64(&a, buf);
+      printf("%s\n", buf);
+      mp_to64(&b, buf);
+      printf("%s\n", buf);
+   } else if (n == 12) {
+      rand_num2(&a);
+      mp_mul_2(&a, &b);
+      printf("mul2\n");
+      mp_to64(&a, buf);
+      printf("%s\n", buf);
+      mp_to64(&b, buf);
+      printf("%s\n", buf);
+   } else if (n == 13) {
+      rand_num2(&a);
+      tmp = abs(rand()) & THE_MASK;
+      mp_add_d(&a, tmp, &b);
+      printf("add_d\n");
+      mp_to64(&a, buf);
+      printf("%s\n%d\n", buf, tmp);
+      mp_to64(&b, buf);
+      printf("%s\n", buf);
+   } else if (n == 14) {
+      rand_num2(&a);
+      tmp = abs(rand()) & THE_MASK;
+      mp_sub_d(&a, tmp, &b);
+      printf("sub_d\n");
+      mp_to64(&a, buf);
+      printf("%s\n%d\n", buf, tmp);
+      mp_to64(&b, buf);
+      printf("%s\n", buf);
+   }
+   }
+   fclose(rng);
+   return 0;
+}
diff --git a/libtommath/pics/design_process.sxd b/libtommath/pics/design_process.sxd
new file mode 100644
index 0000000..7414dbb
--- /dev/null
+++ b/libtommath/pics/design_process.sxd
diff --git a/libtommath/pics/design_process.tif b/libtommath/pics/design_process.tif
new file mode 100644
index 0000000..4a0c012
--- /dev/null
+++ b/libtommath/pics/design_process.tif
diff --git a/libtommath/pics/expt_state.sxd b/libtommath/pics/expt_state.sxd
new file mode 100644
index 0000000..6518404
--- /dev/null
+++ b/libtommath/pics/expt_state.sxd
diff --git a/libtommath/pics/expt_state.tif b/libtommath/pics/expt_state.tif
new file mode 100644
index 0000000..cb06e8e
--- /dev/null
+++ b/libtommath/pics/expt_state.tif
diff --git a/libtommath/pics/makefile b/libtommath/pics/makefile
new file mode 100644
index 0000000..3ecb02f
--- /dev/null
+++ b/libtommath/pics/makefile
@@ -0,0 +1,35 @@
+# makes the images... yeah
+
+default:  pses
+
+design_process.ps: design_process.tif
+	tiff2ps -s -e design_process.tif > design_process.ps
+
+sliding_window.ps: sliding_window.tif
+	tiff2ps -s -e sliding_window.tif > sliding_window.ps
+	
+expt_state.ps: expt_state.tif
+	tiff2ps -s -e expt_state.tif > expt_state.ps
+
+primality.ps: primality.tif
+	tiff2ps -s -e primality.tif > primality.ps
+
+design_process.pdf: design_process.ps
+	epstopdf design_process.ps
+
+sliding_window.pdf: sliding_window.ps
+	epstopdf sliding_window.ps
+	
+expt_state.pdf: expt_state.ps
+	epstopdf expt_state.ps
+
+primality.pdf: primality.ps
+	epstopdf primality.ps
+
+
+pses: sliding_window.ps expt_state.ps primality.ps design_process.ps
+pdfes: sliding_window.pdf expt_state.pdf primality.pdf design_process.pdf
+
+clean:
+	rm -rf *.ps *.pdf .xvpics
+   
+\ No newline at end of file
diff --git a/libtommath/pics/primality.tif b/libtommath/pics/primality.tif
new file mode 100644
index 0000000..76d6be3
--- /dev/null
+++ b/libtommath/pics/primality.tif
diff --git a/libtommath/pics/radix.sxd b/libtommath/pics/radix.sxd
new file mode 100644
index 0000000..b9eb9a0
--- /dev/null
+++ b/libtommath/pics/radix.sxd
diff --git a/libtommath/pics/sliding_window.sxd b/libtommath/pics/sliding_window.sxd
new file mode 100644
index 0000000..91e7c0d
--- /dev/null
+++ b/libtommath/pics/sliding_window.sxd
diff --git a/libtommath/pics/sliding_window.tif b/libtommath/pics/sliding_window.tif
new file mode 100644
index 0000000..bb4cb96
--- /dev/null
+++ b/libtommath/pics/sliding_window.tif
diff --git a/libtommath/poster.out b/libtommath/poster.out
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/libtommath/poster.out
diff --git a/libtommath/poster.tex b/libtommath/poster.tex
new file mode 100644
index 0000000..e7388f4
--- /dev/null
+++ b/libtommath/poster.tex
@@ -0,0 +1,35 @@
+\documentclass[landscape,11pt]{article}
+\usepackage{amsmath, amssymb}
+\usepackage{hyperref}
+\begin{document}
+\hspace*{-3in}
+\begin{tabular}{llllll}
+$c = a + b$  & {\tt mp\_add(\&a, \&b, \&c)} & $b = 2a$  & {\tt mp\_mul\_2(\&a, \&b)} & \\
+$c = a - b$  & {\tt mp\_sub(\&a, \&b, \&c)} & $b = a/2$ & {\tt mp\_div\_2(\&a, \&b)} & \\
+$c = ab $   & {\tt mp\_mul(\&a, \&b, \&c)}  & $c = 2^ba$  & {\tt mp\_mul\_2d(\&a, b, \&c)}  \\
+$b = a^2 $  & {\tt mp\_sqr(\&a, \&b)}       & $c = a/2^b, d = a \mod 2^b$ & {\tt mp\_div\_2d(\&a, b, \&c, \&d)} \\
+$c = \lfloor a/b \rfloor, d = a \mod b$ & {\tt mp\_div(\&a, \&b, \&c, \&d)} & $c = a \mod 2^b $  & {\tt mp\_mod\_2d(\&a, b, \&c)}  \\
+ && \\
+$a = b $  & {\tt mp\_set\_int(\&a, b)}  & $c = a \vee b$  & {\tt mp\_or(\&a, \&b, \&c)}  \\
+$b = a $  & {\tt mp\_copy(\&a, \&b)} & $c = a \wedge b$  & {\tt mp\_and(\&a, \&b, \&c)}  \\
+ && $c = a \oplus b$  & {\tt mp\_xor(\&a, \&b, \&c)}  \\
+ & \\
+$b = -a $  & {\tt mp\_neg(\&a, \&b)}  & $d = a + b \mod c$  & {\tt mp\_addmod(\&a, \&b, \&c, \&d)}  \\
+$b = |a| $  & {\tt mp\_abs(\&a, \&b)} & $d = a - b \mod c$  & {\tt mp\_submod(\&a, \&b, \&c, \&d)}  \\
+ && $d = ab \mod c$  & {\tt mp\_mulmod(\&a, \&b, \&c, \&d)}  \\
+Compare $a$ and $b$ & {\tt mp\_cmp(\&a, \&b)} & $c = a^2 \mod b$  & {\tt mp\_sqrmod(\&a, \&b, \&c)}  \\
+Is Zero? & {\tt mp\_iszero(\&a)} & $c = a^{-1} \mod b$  & {\tt mp\_invmod(\&a, \&b, \&c)} \\
+Is Even? & {\tt mp\_iseven(\&a)} & $d = a^b \mod c$ & {\tt mp\_exptmod(\&a, \&b, \&c, \&d)} \\
+Is Odd ? & {\tt mp\_isodd(\&a)} \\
+&\\
+$\vert \vert a \vert \vert$ & {\tt mp\_unsigned\_bin\_size(\&a)} & $res$ = 1 if $a$ prime to $t$ rounds? & {\tt mp\_prime\_is\_prime(\&a, t, \&res)} \\
+$buf \leftarrow a$          & {\tt mp\_to\_unsigned\_bin(\&a, buf)} & Next prime after $a$ to $t$ rounds. & {\tt mp\_prime\_next\_prime(\&a, t, bbs\_style)} \\
+$a \leftarrow buf[0..len-1]$          & {\tt mp\_read\_unsigned\_bin(\&a, buf, len)} \\
+&\\
+$b = \sqrt{a}$ & {\tt mp\_sqrt(\&a, \&b)}  & $c = \mbox{gcd}(a, b)$ & {\tt mp\_gcd(\&a, \&b, \&c)} \\
+$c = a^{1/b}$ & {\tt mp\_n\_root(\&a, b, \&c)} & $c = \mbox{lcm}(a, b)$ & {\tt mp\_lcm(\&a, \&b, \&c)} \\
+&\\
+Greater Than & MP\_GT & Equal To & MP\_EQ \\
+Less Than & MP\_LT & Bits per digit & DIGIT\_BIT \\
+\end{tabular}
+\end{document}
diff --git a/libtommath/pre_gen/mpi.c b/libtommath/pre_gen/mpi.c
new file mode 100644
index 0000000..8ec8a10
--- /dev/null
+++ b/libtommath/pre_gen/mpi.c
@@ -0,0 +1,9048 @@
+/* Start: bn_error.c */
+#include <tommath.h>
+#ifdef BN_ERROR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+static const struct {
+     int code;
+     char *msg;
+} msgs[] = {
+     { MP_OKAY, "Successful" },
+     { MP_MEM,  "Out of heap" },
+     { MP_VAL,  "Value out of range" }
+};
+
+/* return a char * string for a given code */
+char *mp_error_to_string(int code)
+{
+   int x;
+
+   /* scan the lookup table for the given message */
+   for (x = 0; x < (int)(sizeof(msgs) / sizeof(msgs[0])); x++) {
+       if (msgs[x].code == code) {
+          return msgs[x].msg;
+       }
+   }
+
+   /* generic reply for invalid code */
+   return "Invalid error code";
+}
+
+#endif
+
+/* End: bn_error.c */
+
+/* Start: bn_fast_mp_invmod.c */
+#include <tommath.h>
+#ifdef BN_FAST_MP_INVMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* computes the modular inverse via binary extended euclidean algorithm, 
+ * that is c = 1/a mod b 
+ *
+ * Based on slow invmod except this is optimized for the case where b is 
+ * odd as per HAC Note 14.64 on pp. 610
+ */
+int fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  x, y, u, v, B, D;
+  int     res, neg;
+
+  /* 2. [modified] b must be odd   */
+  if (mp_iseven (b) == 1) {
+    return MP_VAL;
+  }
+
+  /* init all our temps */
+  if ((res = mp_init_multi(&x, &y, &u, &v, &B, &D, NULL)) != MP_OKAY) {
+     return res;
+  }
+
+  /* x == modulus, y == value to invert */
+  if ((res = mp_copy (b, &x)) != MP_OKAY) {
+    goto LBL_ERR;
+  }
+
+  /* we need y = |a| */
+  if ((res = mp_mod (a, b, &y)) != MP_OKAY) {
+    goto LBL_ERR;
+  }
+
+  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
+  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
+    goto LBL_ERR;
+  }
+  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
+    goto LBL_ERR;
+  }
+  mp_set (&D, 1);
+
+top:
+  /* 4.  while u is even do */
+  while (mp_iseven (&u) == 1) {
+    /* 4.1 u = u/2 */
+    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+    /* 4.2 if B is odd then */
+    if (mp_isodd (&B) == 1) {
+      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
+        goto LBL_ERR;
+      }
+    }
+    /* B = B/2 */
+    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+  }
+
+  /* 5.  while v is even do */
+  while (mp_iseven (&v) == 1) {
+    /* 5.1 v = v/2 */
+    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+    /* 5.2 if D is odd then */
+    if (mp_isodd (&D) == 1) {
+      /* D = (D-x)/2 */
+      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
+        goto LBL_ERR;
+      }
+    }
+    /* D = D/2 */
+    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+  }
+
+  /* 6.  if u >= v then */
+  if (mp_cmp (&u, &v) != MP_LT) {
+    /* u = u - v, B = B - D */
+    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+
+    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+  } else {
+    /* v - v - u, D = D - B */
+    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+
+    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+  }
+
+  /* if not zero goto step 4 */
+  if (mp_iszero (&u) == 0) {
+    goto top;
+  }
+
+  /* now a = C, b = D, gcd == g*v */
+
+  /* if v != 1 then there is no inverse */
+  if (mp_cmp_d (&v, 1) != MP_EQ) {
+    res = MP_VAL;
+    goto LBL_ERR;
+  }
+
+  /* b is now the inverse */
+  neg = a->sign;
+  while (D.sign == MP_NEG) {
+    if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+  }
+  mp_exch (&D, c);
+  c->sign = neg;
+  res = MP_OKAY;
+
+LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
+  return res;
+}
+#endif
+
+/* End: bn_fast_mp_invmod.c */
+
+/* Start: bn_fast_mp_montgomery_reduce.c */
+#include <tommath.h>
+#ifdef BN_FAST_MP_MONTGOMERY_REDUCE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* computes xR**-1 == x (mod N) via Montgomery Reduction
+ *
+ * This is an optimized implementation of montgomery_reduce
+ * which uses the comba method to quickly calculate the columns of the
+ * reduction.
+ *
+ * Based on Algorithm 14.32 on pp.601 of HAC.
+*/
+int fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
+{
+  int     ix, res, olduse;
+  mp_word W[MP_WARRAY];
+
+  /* get old used count */
+  olduse = x->used;
+
+  /* grow a as required */
+  if (x->alloc < n->used + 1) {
+    if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* first we have to get the digits of the input into
+   * an array of double precision words W[...]
+   */
+  {
+    register mp_word *_W;
+    register mp_digit *tmpx;
+
+    /* alias for the W[] array */
+    _W   = W;
+
+    /* alias for the digits of  x*/
+    tmpx = x->dp;
+
+    /* copy the digits of a into W[0..a->used-1] */
+    for (ix = 0; ix < x->used; ix++) {
+      *_W++ = *tmpx++;
+    }
+
+    /* zero the high words of W[a->used..m->used*2] */
+    for (; ix < n->used * 2 + 1; ix++) {
+      *_W++ = 0;
+    }
+  }
+
+  /* now we proceed to zero successive digits
+   * from the least significant upwards
+   */
+  for (ix = 0; ix < n->used; ix++) {
+    /* mu = ai * m' mod b
+     *
+     * We avoid a double precision multiplication (which isn't required)
+     * by casting the value down to a mp_digit.  Note this requires
+     * that W[ix-1] have  the carry cleared (see after the inner loop)
+     */
+    register mp_digit mu;
+    mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
+
+    /* a = a + mu * m * b**i
+     *
+     * This is computed in place and on the fly.  The multiplication
+     * by b**i is handled by offseting which columns the results
+     * are added to.
+     *
+     * Note the comba method normally doesn't handle carries in the
+     * inner loop In this case we fix the carry from the previous
+     * column since the Montgomery reduction requires digits of the
+     * result (so far) [see above] to work.  This is
+     * handled by fixing up one carry after the inner loop.  The
+     * carry fixups are done in order so after these loops the
+     * first m->used words of W[] have the carries fixed
+     */
+    {
+      register int iy;
+      register mp_digit *tmpn;
+      register mp_word *_W;
+
+      /* alias for the digits of the modulus */
+      tmpn = n->dp;
+
+      /* Alias for the columns set by an offset of ix */
+      _W = W + ix;
+
+      /* inner loop */
+      for (iy = 0; iy < n->used; iy++) {
+          *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
+      }
+    }
+
+    /* now fix carry for next digit, W[ix+1] */
+    W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
+  }
+
+  /* now we have to propagate the carries and
+   * shift the words downward [all those least
+   * significant digits we zeroed].
+   */
+  {
+    register mp_digit *tmpx;
+    register mp_word *_W, *_W1;
+
+    /* nox fix rest of carries */
+
+    /* alias for current word */
+    _W1 = W + ix;
+
+    /* alias for next word, where the carry goes */
+    _W = W + ++ix;
+
+    for (; ix <= n->used * 2 + 1; ix++) {
+      *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
+    }
+
+    /* copy out, A = A/b**n
+     *
+     * The result is A/b**n but instead of converting from an
+     * array of mp_word to mp_digit than calling mp_rshd
+     * we just copy them in the right order
+     */
+
+    /* alias for destination word */
+    tmpx = x->dp;
+
+    /* alias for shifted double precision result */
+    _W = W + n->used;
+
+    for (ix = 0; ix < n->used + 1; ix++) {
+      *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
+    }
+
+    /* zero oldused digits, if the input a was larger than
+     * m->used+1 we'll have to clear the digits
+     */
+    for (; ix < olduse; ix++) {
+      *tmpx++ = 0;
+    }
+  }
+
+  /* set the max used and clamp */
+  x->used = n->used + 1;
+  mp_clamp (x);
+
+  /* if A >= m then A = A - m */
+  if (mp_cmp_mag (x, n) != MP_LT) {
+    return s_mp_sub (x, n, x);
+  }
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_fast_mp_montgomery_reduce.c */
+
+/* Start: bn_fast_s_mp_mul_digs.c */
+#include <tommath.h>
+#ifdef BN_FAST_S_MP_MUL_DIGS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* Fast (comba) multiplier
+ *
+ * This is the fast column-array [comba] multiplier.  It is 
+ * designed to compute the columns of the product first 
+ * then handle the carries afterwards.  This has the effect 
+ * of making the nested loops that compute the columns very
+ * simple and schedulable on super-scalar processors.
+ *
+ * This has been modified to produce a variable number of 
+ * digits of output so if say only a half-product is required 
+ * you don't have to compute the upper half (a feature 
+ * required for fast Barrett reduction).
+ *
+ * Based on Algorithm 14.12 on pp.595 of HAC.
+ *
+ */
+int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+{
+  int     olduse, res, pa, ix, iz;
+  mp_digit W[MP_WARRAY];
+  register mp_word  _W;
+
+  /* grow the destination as required */
+  if (c->alloc < digs) {
+    if ((res = mp_grow (c, digs)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* number of output digits to produce */
+  pa = MIN(digs, a->used + b->used);
+
+  /* clear the carry */
+  _W = 0;
+  for (ix = 0; ix < pa; ix++) { 
+      int      tx, ty;
+      int      iy;
+      mp_digit *tmpx, *tmpy;
+
+      /* get offsets into the two bignums */
+      ty = MIN(b->used-1, ix);
+      tx = ix - ty;
+
+      /* setup temp aliases */
+      tmpx = a->dp + tx;
+      tmpy = b->dp + ty;
+
+      /* this is the number of times the loop will iterrate, essentially 
+         while (tx++ < a->used && ty-- >= 0) { ... }
+       */
+      iy = MIN(a->used-tx, ty+1);
+
+      /* execute loop */
+      for (iz = 0; iz < iy; ++iz) {
+         _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
+      }
+
+      /* store term */
+      W[ix] = ((mp_digit)_W) & MP_MASK;
+
+      /* make next carry */
+      _W = _W >> ((mp_word)DIGIT_BIT);
+  }
+
+  /* store final carry */
+  W[ix] = (mp_digit)(_W & MP_MASK);
+
+  /* setup dest */
+  olduse  = c->used;
+  c->used = pa;
+
+  {
+    register mp_digit *tmpc;
+    tmpc = c->dp;
+    for (ix = 0; ix < pa+1; ix++) {
+      /* now extract the previous digit [below the carry] */
+      *tmpc++ = W[ix];
+    }
+
+    /* clear unused digits [that existed in the old copy of c] */
+    for (; ix < olduse; ix++) {
+      *tmpc++ = 0;
+    }
+  }
+  mp_clamp (c);
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_fast_s_mp_mul_digs.c */
+
+/* Start: bn_fast_s_mp_mul_high_digs.c */
+#include <tommath.h>
+#ifdef BN_FAST_S_MP_MUL_HIGH_DIGS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* this is a modified version of fast_s_mul_digs that only produces
+ * output digits *above* digs.  See the comments for fast_s_mul_digs
+ * to see how it works.
+ *
+ * This is used in the Barrett reduction since for one of the multiplications
+ * only the higher digits were needed.  This essentially halves the work.
+ *
+ * Based on Algorithm 14.12 on pp.595 of HAC.
+ */
+int fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+{
+  int     olduse, res, pa, ix, iz;
+  mp_digit W[MP_WARRAY];
+  mp_word  _W;
+
+  /* grow the destination as required */
+  pa = a->used + b->used;
+  if (c->alloc < pa) {
+    if ((res = mp_grow (c, pa)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* number of output digits to produce */
+  pa = a->used + b->used;
+  _W = 0;
+  for (ix = digs; ix < pa; ix++) { 
+      int      tx, ty, iy;
+      mp_digit *tmpx, *tmpy;
+
+      /* get offsets into the two bignums */
+      ty = MIN(b->used-1, ix);
+      tx = ix - ty;
+
+      /* setup temp aliases */
+      tmpx = a->dp + tx;
+      tmpy = b->dp + ty;
+
+      /* this is the number of times the loop will iterrate, essentially its 
+         while (tx++ < a->used && ty-- >= 0) { ... }
+       */
+      iy = MIN(a->used-tx, ty+1);
+
+      /* execute loop */
+      for (iz = 0; iz < iy; iz++) {
+         _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
+      }
+
+      /* store term */
+      W[ix] = ((mp_digit)_W) & MP_MASK;
+
+      /* make next carry */
+      _W = _W >> ((mp_word)DIGIT_BIT);
+  }
+  
+  /* store final carry */
+  W[ix] = (mp_digit)(_W & MP_MASK);
+
+  /* setup dest */
+  olduse  = c->used;
+  c->used = pa;
+
+  {
+    register mp_digit *tmpc;
+
+    tmpc = c->dp + digs;
+    for (ix = digs; ix <= pa; ix++) {
+      /* now extract the previous digit [below the carry] */
+      *tmpc++ = W[ix];
+    }
+
+    /* clear unused digits [that existed in the old copy of c] */
+    for (; ix < olduse; ix++) {
+      *tmpc++ = 0;
+    }
+  }
+  mp_clamp (c);
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_fast_s_mp_mul_high_digs.c */
+
+/* Start: bn_fast_s_mp_sqr.c */
+#include <tommath.h>
+#ifdef BN_FAST_S_MP_SQR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* the jist of squaring...
+ * you do like mult except the offset of the tmpx [one that 
+ * starts closer to zero] can't equal the offset of tmpy.  
+ * So basically you set up iy like before then you min it with
+ * (ty-tx) so that it never happens.  You double all those 
+ * you add in the inner loop
+
+After that loop you do the squares and add them in.
+*/
+
+int fast_s_mp_sqr (mp_int * a, mp_int * b)
+{
+  int       olduse, res, pa, ix, iz;
+  mp_digit   W[MP_WARRAY], *tmpx;
+  mp_word   W1;
+
+  /* grow the destination as required */
+  pa = a->used + a->used;
+  if (b->alloc < pa) {
+    if ((res = mp_grow (b, pa)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* number of output digits to produce */
+  W1 = 0;
+  for (ix = 0; ix < pa; ix++) { 
+      int      tx, ty, iy;
+      mp_word  _W;
+      mp_digit *tmpy;
+
+      /* clear counter */
+      _W = 0;
+
+      /* get offsets into the two bignums */
+      ty = MIN(a->used-1, ix);
+      tx = ix - ty;
+
+      /* setup temp aliases */
+      tmpx = a->dp + tx;
+      tmpy = a->dp + ty;
+
+      /* this is the number of times the loop will iterrate, essentially
+         while (tx++ < a->used && ty-- >= 0) { ... }
+       */
+      iy = MIN(a->used-tx, ty+1);
+
+      /* now for squaring tx can never equal ty 
+       * we halve the distance since they approach at a rate of 2x
+       * and we have to round because odd cases need to be executed
+       */
+      iy = MIN(iy, (ty-tx+1)>>1);
+
+      /* execute loop */
+      for (iz = 0; iz < iy; iz++) {
+         _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
+      }
+
+      /* double the inner product and add carry */
+      _W = _W + _W + W1;
+
+      /* even columns have the square term in them */
+      if ((ix&1) == 0) {
+         _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
+      }
+
+      /* store it */
+      W[ix] = (mp_digit)(_W & MP_MASK);
+
+      /* make next carry */
+      W1 = _W >> ((mp_word)DIGIT_BIT);
+  }
+
+  /* setup dest */
+  olduse  = b->used;
+  b->used = a->used+a->used;
+
+  {
+    mp_digit *tmpb;
+    tmpb = b->dp;
+    for (ix = 0; ix < pa; ix++) {
+      *tmpb++ = W[ix] & MP_MASK;
+    }
+
+    /* clear unused digits [that existed in the old copy of c] */
+    for (; ix < olduse; ix++) {
+      *tmpb++ = 0;
+    }
+  }
+  mp_clamp (b);
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_fast_s_mp_sqr.c */
+
+/* Start: bn_mp_2expt.c */
+#include <tommath.h>
+#ifdef BN_MP_2EXPT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* computes a = 2**b 
+ *
+ * Simple algorithm which zeroes the int, grows it then just sets one bit
+ * as required.
+ */
+int
+mp_2expt (mp_int * a, int b)
+{
+  int     res;
+
+  /* zero a as per default */
+  mp_zero (a);
+
+  /* grow a to accomodate the single bit */
+  if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) {
+    return res;
+  }
+
+  /* set the used count of where the bit will go */
+  a->used = b / DIGIT_BIT + 1;
+
+  /* put the single bit in its place */
+  a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
+
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_2expt.c */
+
+/* Start: bn_mp_abs.c */
+#include <tommath.h>
+#ifdef BN_MP_ABS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* b = |a| 
+ *
+ * Simple function copies the input and fixes the sign to positive
+ */
+int
+mp_abs (mp_int * a, mp_int * b)
+{
+  int     res;
+
+  /* copy a to b */
+  if (a != b) {
+     if ((res = mp_copy (a, b)) != MP_OKAY) {
+       return res;
+     }
+  }
+
+  /* force the sign of b to positive */
+  b->sign = MP_ZPOS;
+
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_abs.c */
+
+/* Start: bn_mp_add.c */
+#include <tommath.h>
+#ifdef BN_MP_ADD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* high level addition (handles signs) */
+int mp_add (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     sa, sb, res;
+
+  /* get sign of both inputs */
+  sa = a->sign;
+  sb = b->sign;
+
+  /* handle two cases, not four */
+  if (sa == sb) {
+    /* both positive or both negative */
+    /* add their magnitudes, copy the sign */
+    c->sign = sa;
+    res = s_mp_add (a, b, c);
+  } else {
+    /* one positive, the other negative */
+    /* subtract the one with the greater magnitude from */
+    /* the one of the lesser magnitude.  The result gets */
+    /* the sign of the one with the greater magnitude. */
+    if (mp_cmp_mag (a, b) == MP_LT) {
+      c->sign = sb;
+      res = s_mp_sub (b, a, c);
+    } else {
+      c->sign = sa;
+      res = s_mp_sub (a, b, c);
+    }
+  }
+  return res;
+}
+
+#endif
+
+/* End: bn_mp_add.c */
+
+/* Start: bn_mp_add_d.c */
+#include <tommath.h>
+#ifdef BN_MP_ADD_D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* single digit addition */
+int
+mp_add_d (mp_int * a, mp_digit b, mp_int * c)
+{
+  int     res, ix, oldused;
+  mp_digit *tmpa, *tmpc, mu;
+
+  /* grow c as required */
+  if (c->alloc < a->used + 1) {
+     if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
+        return res;
+     }
+  }
+
+  /* if a is negative and |a| >= b, call c = |a| - b */
+  if (a->sign == MP_NEG && (a->used > 1 || a->dp[0] >= b)) {
+     /* temporarily fix sign of a */
+     a->sign = MP_ZPOS;
+
+     /* c = |a| - b */
+     res = mp_sub_d(a, b, c);
+
+     /* fix sign  */
+     a->sign = c->sign = MP_NEG;
+
+     return res;
+  }
+
+  /* old number of used digits in c */
+  oldused = c->used;
+
+  /* sign always positive */
+  c->sign = MP_ZPOS;
+
+  /* source alias */
+  tmpa    = a->dp;
+
+  /* destination alias */
+  tmpc    = c->dp;
+
+  /* if a is positive */
+  if (a->sign == MP_ZPOS) {
+     /* add digit, after this we're propagating
+      * the carry.
+      */
+     *tmpc   = *tmpa++ + b;
+     mu      = *tmpc >> DIGIT_BIT;
+     *tmpc++ &= MP_MASK;
+
+     /* now handle rest of the digits */
+     for (ix = 1; ix < a->used; ix++) {
+        *tmpc   = *tmpa++ + mu;
+        mu      = *tmpc >> DIGIT_BIT;
+        *tmpc++ &= MP_MASK;
+     }
+     /* set final carry */
+     ix++;
+     *tmpc++  = mu;
+
+     /* setup size */
+     c->used = a->used + 1;
+  } else {
+     /* a was negative and |a| < b */
+     c->used  = 1;
+
+     /* the result is a single digit */
+     if (a->used == 1) {
+        *tmpc++  =  b - a->dp[0];
+     } else {
+        *tmpc++  =  b;
+     }
+
+     /* setup count so the clearing of oldused
+      * can fall through correctly
+      */
+     ix       = 1;
+  }
+
+  /* now zero to oldused */
+  while (ix++ < oldused) {
+     *tmpc++ = 0;
+  }
+  mp_clamp(c);
+
+  return MP_OKAY;
+}
+
+#endif
+
+/* End: bn_mp_add_d.c */
+
+/* Start: bn_mp_addmod.c */
+#include <tommath.h>
+#ifdef BN_MP_ADDMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* d = a + b (mod c) */
+int
+mp_addmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+  int     res;
+  mp_int  t;
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_add (a, b, &t)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+  res = mp_mod (&t, c, d);
+  mp_clear (&t);
+  return res;
+}
+#endif
+
+/* End: bn_mp_addmod.c */
+
+/* Start: bn_mp_and.c */
+#include <tommath.h>
+#ifdef BN_MP_AND_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* AND two ints together */
+int
+mp_and (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     res, ix, px;
+  mp_int  t, *x;
+
+  if (a->used > b->used) {
+    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+      return res;
+    }
+    px = b->used;
+    x = b;
+  } else {
+    if ((res = mp_init_copy (&t, b)) != MP_OKAY) {
+      return res;
+    }
+    px = a->used;
+    x = a;
+  }
+
+  for (ix = 0; ix < px; ix++) {
+    t.dp[ix] &= x->dp[ix];
+  }
+
+  /* zero digits above the last from the smallest mp_int */
+  for (; ix < t.used; ix++) {
+    t.dp[ix] = 0;
+  }
+
+  mp_clamp (&t);
+  mp_exch (c, &t);
+  mp_clear (&t);
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_and.c */
+
+/* Start: bn_mp_clamp.c */
+#include <tommath.h>
+#ifdef BN_MP_CLAMP_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* trim unused digits 
+ *
+ * This is used to ensure that leading zero digits are
+ * trimed and the leading "used" digit will be non-zero
+ * Typically very fast.  Also fixes the sign if there
+ * are no more leading digits
+ */
+void
+mp_clamp (mp_int * a)
+{
+  /* decrease used while the most significant digit is
+   * zero.
+   */
+  while (a->used > 0 && a->dp[a->used - 1] == 0) {
+    --(a->used);
+  }
+
+  /* reset the sign flag if used == 0 */
+  if (a->used == 0) {
+    a->sign = MP_ZPOS;
+  }
+}
+#endif
+
+/* End: bn_mp_clamp.c */
+
+/* Start: bn_mp_clear.c */
+#include <tommath.h>
+#ifdef BN_MP_CLEAR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* clear one (frees)  */
+void
+mp_clear (mp_int * a)
+{
+  int i;
+
+  /* only do anything if a hasn't been freed previously */
+  if (a->dp != NULL) {
+    /* first zero the digits */
+    for (i = 0; i < a->used; i++) {
+        a->dp[i] = 0;
+    }
+
+    /* free ram */
+    XFREE(a->dp);
+
+    /* reset members to make debugging easier */
+    a->dp    = NULL;
+    a->alloc = a->used = 0;
+    a->sign  = MP_ZPOS;
+  }
+}
+#endif
+
+/* End: bn_mp_clear.c */
+
+/* Start: bn_mp_clear_multi.c */
+#include <tommath.h>
+#ifdef BN_MP_CLEAR_MULTI_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <stdarg.h>
+
+void mp_clear_multi(mp_int *mp, ...) 
+{
+    mp_int* next_mp = mp;
+    va_list args;
+    va_start(args, mp);
+    while (next_mp != NULL) {
+        mp_clear(next_mp);
+        next_mp = va_arg(args, mp_int*);
+    }
+    va_end(args);
+}
+#endif
+
+/* End: bn_mp_clear_multi.c */
+
+/* Start: bn_mp_cmp.c */
+#include <tommath.h>
+#ifdef BN_MP_CMP_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* compare two ints (signed)*/
+int
+mp_cmp (mp_int * a, mp_int * b)
+{
+  /* compare based on sign */
+  if (a->sign != b->sign) {
+     if (a->sign == MP_NEG) {
+        return MP_LT;
+     } else {
+        return MP_GT;
+     }
+  }
+  
+  /* compare digits */
+  if (a->sign == MP_NEG) {
+     /* if negative compare opposite direction */
+     return mp_cmp_mag(b, a);
+  } else {
+     return mp_cmp_mag(a, b);
+  }
+}
+#endif
+
+/* End: bn_mp_cmp.c */
+
+/* Start: bn_mp_cmp_d.c */
+#include <tommath.h>
+#ifdef BN_MP_CMP_D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* compare a digit */
+int mp_cmp_d(mp_int * a, mp_digit b)
+{
+  /* compare based on sign */
+  if (a->sign == MP_NEG) {
+    return MP_LT;
+  }
+
+  /* compare based on magnitude */
+  if (a->used > 1) {
+    return MP_GT;
+  }
+
+  /* compare the only digit of a to b */
+  if (a->dp[0] > b) {
+    return MP_GT;
+  } else if (a->dp[0] < b) {
+    return MP_LT;
+  } else {
+    return MP_EQ;
+  }
+}
+#endif
+
+/* End: bn_mp_cmp_d.c */
+
+/* Start: bn_mp_cmp_mag.c */
+#include <tommath.h>
+#ifdef BN_MP_CMP_MAG_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* compare maginitude of two ints (unsigned) */
+int mp_cmp_mag (mp_int * a, mp_int * b)
+{
+  int     n;
+  mp_digit *tmpa, *tmpb;
+
+  /* compare based on # of non-zero digits */
+  if (a->used > b->used) {
+    return MP_GT;
+  }
+  
+  if (a->used < b->used) {
+    return MP_LT;
+  }
+
+  /* alias for a */
+  tmpa = a->dp + (a->used - 1);
+
+  /* alias for b */
+  tmpb = b->dp + (a->used - 1);
+
+  /* compare based on digits  */
+  for (n = 0; n < a->used; ++n, --tmpa, --tmpb) {
+    if (*tmpa > *tmpb) {
+      return MP_GT;
+    }
+
+    if (*tmpa < *tmpb) {
+      return MP_LT;
+    }
+  }
+  return MP_EQ;
+}
+#endif
+
+/* End: bn_mp_cmp_mag.c */
+
+/* Start: bn_mp_cnt_lsb.c */
+#include <tommath.h>
+#ifdef BN_MP_CNT_LSB_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+static const int lnz[16] = { 
+   4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
+};
+
+/* Counts the number of lsbs which are zero before the first zero bit */
+int mp_cnt_lsb(mp_int *a)
+{
+   int x;
+   mp_digit q, qq;
+
+   /* easy out */
+   if (mp_iszero(a) == 1) {
+      return 0;
+   }
+
+   /* scan lower digits until non-zero */
+   for (x = 0; x < a->used && a->dp[x] == 0; x++);
+   q = a->dp[x];
+   x *= DIGIT_BIT;
+
+   /* now scan this digit until a 1 is found */
+   if ((q & 1) == 0) {
+      do {
+         qq  = q & 15;
+         x  += lnz[qq];
+         q >>= 4;
+      } while (qq == 0);
+   }
+   return x;
+}
+
+#endif
+
+/* End: bn_mp_cnt_lsb.c */
+
+/* Start: bn_mp_copy.c */
+#include <tommath.h>
+#ifdef BN_MP_COPY_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* copy, b = a */
+int
+mp_copy (mp_int * a, mp_int * b)
+{
+  int     res, n;
+
+  /* if dst == src do nothing */
+  if (a == b) {
+    return MP_OKAY;
+  }
+
+  /* grow dest */
+  if (b->alloc < a->used) {
+     if ((res = mp_grow (b, a->used)) != MP_OKAY) {
+        return res;
+     }
+  }
+
+  /* zero b and copy the parameters over */
+  {
+    register mp_digit *tmpa, *tmpb;
+
+    /* pointer aliases */
+
+    /* source */
+    tmpa = a->dp;
+
+    /* destination */
+    tmpb = b->dp;
+
+    /* copy all the digits */
+    for (n = 0; n < a->used; n++) {
+      *tmpb++ = *tmpa++;
+    }
+
+    /* clear high digits */
+    for (; n < b->used; n++) {
+      *tmpb++ = 0;
+    }
+  }
+
+  /* copy used count and sign */
+  b->used = a->used;
+  b->sign = a->sign;
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_copy.c */
+
+/* Start: bn_mp_count_bits.c */
+#include <tommath.h>
+#ifdef BN_MP_COUNT_BITS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* returns the number of bits in an int */
+int
+mp_count_bits (mp_int * a)
+{
+  int     r;
+  mp_digit q;
+
+  /* shortcut */
+  if (a->used == 0) {
+    return 0;
+  }
+
+  /* get number of digits and add that */
+  r = (a->used - 1) * DIGIT_BIT;
+  
+  /* take the last digit and count the bits in it */
+  q = a->dp[a->used - 1];
+  while (q > ((mp_digit) 0)) {
+    ++r;
+    q >>= ((mp_digit) 1);
+  }
+  return r;
+}
+#endif
+
+/* End: bn_mp_count_bits.c */
+
+/* Start: bn_mp_div.c */
+#include <tommath.h>
+#ifdef BN_MP_DIV_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+#ifdef BN_MP_DIV_SMALL
+
+/* slower bit-bang division... also smaller */
+int mp_div(mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+   mp_int ta, tb, tq, q;
+   int    res, n, n2;
+
+  /* is divisor zero ? */
+  if (mp_iszero (b) == 1) {
+    return MP_VAL;
+  }
+
+  /* if a < b then q=0, r = a */
+  if (mp_cmp_mag (a, b) == MP_LT) {
+    if (d != NULL) {
+      res = mp_copy (a, d);
+    } else {
+      res = MP_OKAY;
+    }
+    if (c != NULL) {
+      mp_zero (c);
+    }
+    return res;
+  }
+	
+  /* init our temps */
+  if ((res = mp_init_multi(&ta, &tb, &tq, &q, NULL) != MP_OKAY)) {
+     return res;
+  }
+
+
+  mp_set(&tq, 1);
+  n = mp_count_bits(a) - mp_count_bits(b);
+  if (((res = mp_abs(a, &ta)) != MP_OKAY) ||
+      ((res = mp_abs(b, &tb)) != MP_OKAY) || 
+      ((res = mp_mul_2d(&tb, n, &tb)) != MP_OKAY) ||
+      ((res = mp_mul_2d(&tq, n, &tq)) != MP_OKAY)) {
+      goto LBL_ERR;
+  }
+
+  while (n-- >= 0) {
+     if (mp_cmp(&tb, &ta) != MP_GT) {
+        if (((res = mp_sub(&ta, &tb, &ta)) != MP_OKAY) ||
+            ((res = mp_add(&q, &tq, &q)) != MP_OKAY)) {
+           goto LBL_ERR;
+        }
+     }
+     if (((res = mp_div_2d(&tb, 1, &tb, NULL)) != MP_OKAY) ||
+         ((res = mp_div_2d(&tq, 1, &tq, NULL)) != MP_OKAY)) {
+           goto LBL_ERR;
+     }
+  }
+
+  /* now q == quotient and ta == remainder */
+  n  = a->sign;
+  n2 = (a->sign == b->sign ? MP_ZPOS : MP_NEG);
+  if (c != NULL) {
+     mp_exch(c, &q);
+     c->sign  = (mp_iszero(c) == MP_YES) ? MP_ZPOS : n2;
+  }
+  if (d != NULL) {
+     mp_exch(d, &ta);
+     d->sign = (mp_iszero(d) == MP_YES) ? MP_ZPOS : n;
+  }
+LBL_ERR:
+   mp_clear_multi(&ta, &tb, &tq, &q, NULL);
+   return res;
+}
+
+#else
+
+/* integer signed division. 
+ * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
+ * HAC pp.598 Algorithm 14.20
+ *
+ * Note that the description in HAC is horribly 
+ * incomplete.  For example, it doesn't consider 
+ * the case where digits are removed from 'x' in 
+ * the inner loop.  It also doesn't consider the 
+ * case that y has fewer than three digits, etc..
+ *
+ * The overall algorithm is as described as 
+ * 14.20 from HAC but fixed to treat these cases.
+*/
+int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+  mp_int  q, x, y, t1, t2;
+  int     res, n, t, i, norm, neg;
+
+  /* is divisor zero ? */
+  if (mp_iszero (b) == 1) {
+    return MP_VAL;
+  }
+
+  /* if a < b then q=0, r = a */
+  if (mp_cmp_mag (a, b) == MP_LT) {
+    if (d != NULL) {
+      res = mp_copy (a, d);
+    } else {
+      res = MP_OKAY;
+    }
+    if (c != NULL) {
+      mp_zero (c);
+    }
+    return res;
+  }
+
+  if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
+    return res;
+  }
+  q.used = a->used + 2;
+
+  if ((res = mp_init (&t1)) != MP_OKAY) {
+    goto LBL_Q;
+  }
+
+  if ((res = mp_init (&t2)) != MP_OKAY) {
+    goto LBL_T1;
+  }
+
+  if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
+    goto LBL_T2;
+  }
+
+  if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
+    goto LBL_X;
+  }
+
+  /* fix the sign */
+  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
+  x.sign = y.sign = MP_ZPOS;
+
+  /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
+  norm = mp_count_bits(&y) % DIGIT_BIT;
+  if (norm < (int)(DIGIT_BIT-1)) {
+     norm = (DIGIT_BIT-1) - norm;
+     if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
+       goto LBL_Y;
+     }
+     if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
+       goto LBL_Y;
+     }
+  } else {
+     norm = 0;
+  }
+
+  /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
+  n = x.used - 1;
+  t = y.used - 1;
+
+  /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
+  if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
+    goto LBL_Y;
+  }
+
+  while (mp_cmp (&x, &y) != MP_LT) {
+    ++(q.dp[n - t]);
+    if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
+      goto LBL_Y;
+    }
+  }
+
+  /* reset y by shifting it back down */
+  mp_rshd (&y, n - t);
+
+  /* step 3. for i from n down to (t + 1) */
+  for (i = n; i >= (t + 1); i--) {
+    if (i > x.used) {
+      continue;
+    }
+
+    /* step 3.1 if xi == yt then set q{i-t-1} to b-1, 
+     * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
+    if (x.dp[i] == y.dp[t]) {
+      q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
+    } else {
+      mp_word tmp;
+      tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
+      tmp |= ((mp_word) x.dp[i - 1]);
+      tmp /= ((mp_word) y.dp[t]);
+      if (tmp > (mp_word) MP_MASK)
+        tmp = MP_MASK;
+      q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
+    }
+
+    /* while (q{i-t-1} * (yt * b + y{t-1})) > 
+             xi * b**2 + xi-1 * b + xi-2 
+     
+       do q{i-t-1} -= 1; 
+    */
+    q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
+    do {
+      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
+
+      /* find left hand */
+      mp_zero (&t1);
+      t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
+      t1.dp[1] = y.dp[t];
+      t1.used = 2;
+      if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
+        goto LBL_Y;
+      }
+
+      /* find right hand */
+      t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
+      t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
+      t2.dp[2] = x.dp[i];
+      t2.used = 3;
+    } while (mp_cmp_mag(&t1, &t2) == MP_GT);
+
+    /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
+    if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
+      goto LBL_Y;
+    }
+
+    if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
+      goto LBL_Y;
+    }
+
+    if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
+      goto LBL_Y;
+    }
+
+    /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
+    if (x.sign == MP_NEG) {
+      if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
+        goto LBL_Y;
+      }
+      if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
+        goto LBL_Y;
+      }
+      if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
+        goto LBL_Y;
+      }
+
+      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
+    }
+  }
+
+  /* now q is the quotient and x is the remainder 
+   * [which we have to normalize] 
+   */
+  
+  /* get sign before writing to c */
+  x.sign = x.used == 0 ? MP_ZPOS : a->sign;
+
+  if (c != NULL) {
+    mp_clamp (&q);
+    mp_exch (&q, c);
+    c->sign = neg;
+  }
+
+  if (d != NULL) {
+    mp_div_2d (&x, norm, &x, NULL);
+    mp_exch (&x, d);
+  }
+
+  res = MP_OKAY;
+
+LBL_Y:mp_clear (&y);
+LBL_X:mp_clear (&x);
+LBL_T2:mp_clear (&t2);
+LBL_T1:mp_clear (&t1);
+LBL_Q:mp_clear (&q);
+  return res;
+}
+
+#endif
+
+#endif
+
+/* End: bn_mp_div.c */
+
+/* Start: bn_mp_div_2.c */
+#include <tommath.h>
+#ifdef BN_MP_DIV_2_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* b = a/2 */
+int mp_div_2(mp_int * a, mp_int * b)
+{
+  int     x, res, oldused;
+
+  /* copy */
+  if (b->alloc < a->used) {
+    if ((res = mp_grow (b, a->used)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  oldused = b->used;
+  b->used = a->used;
+  {
+    register mp_digit r, rr, *tmpa, *tmpb;
+
+    /* source alias */
+    tmpa = a->dp + b->used - 1;
+
+    /* dest alias */
+    tmpb = b->dp + b->used - 1;
+
+    /* carry */
+    r = 0;
+    for (x = b->used - 1; x >= 0; x--) {
+      /* get the carry for the next iteration */
+      rr = *tmpa & 1;
+
+      /* shift the current digit, add in carry and store */
+      *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
+
+      /* forward carry to next iteration */
+      r = rr;
+    }
+
+    /* zero excess digits */
+    tmpb = b->dp + b->used;
+    for (x = b->used; x < oldused; x++) {
+      *tmpb++ = 0;
+    }
+  }
+  b->sign = a->sign;
+  mp_clamp (b);
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_div_2.c */
+
+/* Start: bn_mp_div_2d.c */
+#include <tommath.h>
+#ifdef BN_MP_DIV_2D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* shift right by a certain bit count (store quotient in c, optional remainder in d) */
+int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
+{
+  mp_digit D, r, rr;
+  int     x, res;
+  mp_int  t;
+
+
+  /* if the shift count is <= 0 then we do no work */
+  if (b <= 0) {
+    res = mp_copy (a, c);
+    if (d != NULL) {
+      mp_zero (d);
+    }
+    return res;
+  }
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  /* get the remainder */
+  if (d != NULL) {
+    if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) {
+      mp_clear (&t);
+      return res;
+    }
+  }
+
+  /* copy */
+  if ((res = mp_copy (a, c)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+
+  /* shift by as many digits in the bit count */
+  if (b >= (int)DIGIT_BIT) {
+    mp_rshd (c, b / DIGIT_BIT);
+  }
+
+  /* shift any bit count < DIGIT_BIT */
+  D = (mp_digit) (b % DIGIT_BIT);
+  if (D != 0) {
+    register mp_digit *tmpc, mask, shift;
+
+    /* mask */
+    mask = (((mp_digit)1) << D) - 1;
+
+    /* shift for lsb */
+    shift = DIGIT_BIT - D;
+
+    /* alias */
+    tmpc = c->dp + (c->used - 1);
+
+    /* carry */
+    r = 0;
+    for (x = c->used - 1; x >= 0; x--) {
+      /* get the lower  bits of this word in a temp */
+      rr = *tmpc & mask;
+
+      /* shift the current word and mix in the carry bits from the previous word */
+      *tmpc = (*tmpc >> D) | (r << shift);
+      --tmpc;
+
+      /* set the carry to the carry bits of the current word found above */
+      r = rr;
+    }
+  }
+  mp_clamp (c);
+  if (d != NULL) {
+    mp_exch (&t, d);
+  }
+  mp_clear (&t);
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_div_2d.c */
+
+/* Start: bn_mp_div_3.c */
+#include <tommath.h>
+#ifdef BN_MP_DIV_3_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* divide by three (based on routine from MPI and the GMP manual) */
+int
+mp_div_3 (mp_int * a, mp_int *c, mp_digit * d)
+{
+  mp_int   q;
+  mp_word  w, t;
+  mp_digit b;
+  int      res, ix;
+  
+  /* b = 2**DIGIT_BIT / 3 */
+  b = (((mp_word)1) << ((mp_word)DIGIT_BIT)) / ((mp_word)3);
+
+  if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
+     return res;
+  }
+  
+  q.used = a->used;
+  q.sign = a->sign;
+  w = 0;
+  for (ix = a->used - 1; ix >= 0; ix--) {
+     w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
+
+     if (w >= 3) {
+        /* multiply w by [1/3] */
+        t = (w * ((mp_word)b)) >> ((mp_word)DIGIT_BIT);
+
+        /* now subtract 3 * [w/3] from w, to get the remainder */
+        w -= t+t+t;
+
+        /* fixup the remainder as required since
+         * the optimization is not exact.
+         */
+        while (w >= 3) {
+           t += 1;
+           w -= 3;
+        }
+      } else {
+        t = 0;
+      }
+      q.dp[ix] = (mp_digit)t;
+  }
+
+  /* [optional] store the remainder */
+  if (d != NULL) {
+     *d = (mp_digit)w;
+  }
+
+  /* [optional] store the quotient */
+  if (c != NULL) {
+     mp_clamp(&q);
+     mp_exch(&q, c);
+  }
+  mp_clear(&q);
+  
+  return res;
+}
+
+#endif
+
+/* End: bn_mp_div_3.c */
+
+/* Start: bn_mp_div_d.c */
+#include <tommath.h>
+#ifdef BN_MP_DIV_D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+static int s_is_power_of_two(mp_digit b, int *p)
+{
+   int x;
+
+   for (x = 1; x < DIGIT_BIT; x++) {
+      if (b == (((mp_digit)1)<<x)) {
+         *p = x;
+         return 1;
+      }
+   }
+   return 0;
+}
+
+/* single digit division (based on routine from MPI) */
+int mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
+{
+  mp_int  q;
+  mp_word w;
+  mp_digit t;
+  int     res, ix;
+
+  /* cannot divide by zero */
+  if (b == 0) {
+     return MP_VAL;
+  }
+
+  /* quick outs */
+  if (b == 1 || mp_iszero(a) == 1) {
+     if (d != NULL) {
+        *d = 0;
+     }
+     if (c != NULL) {
+        return mp_copy(a, c);
+     }
+     return MP_OKAY;
+  }
+
+  /* power of two ? */
+  if (s_is_power_of_two(b, &ix) == 1) {
+     if (d != NULL) {
+        *d = a->dp[0] & ((((mp_digit)1)<<ix) - 1);
+     }
+     if (c != NULL) {
+        return mp_div_2d(a, ix, c, NULL);
+     }
+     return MP_OKAY;
+  }
+
+#ifdef BN_MP_DIV_3_C
+  /* three? */
+  if (b == 3) {
+     return mp_div_3(a, c, d);
+  }
+#endif
+
+  /* no easy answer [c'est la vie].  Just division */
+  if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
+     return res;
+  }
+  
+  q.used = a->used;
+  q.sign = a->sign;
+  w = 0;
+  for (ix = a->used - 1; ix >= 0; ix--) {
+     w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
+     
+     if (w >= b) {
+        t = (mp_digit)(w / b);
+        w -= ((mp_word)t) * ((mp_word)b);
+      } else {
+        t = 0;
+      }
+      q.dp[ix] = (mp_digit)t;
+  }
+  
+  if (d != NULL) {
+     *d = (mp_digit)w;
+  }
+  
+  if (c != NULL) {
+     mp_clamp(&q);
+     mp_exch(&q, c);
+  }
+  mp_clear(&q);
+  
+  return res;
+}
+
+#endif
+
+/* End: bn_mp_div_d.c */
+
+/* Start: bn_mp_dr_is_modulus.c */
+#include <tommath.h>
+#ifdef BN_MP_DR_IS_MODULUS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* determines if a number is a valid DR modulus */
+int mp_dr_is_modulus(mp_int *a)
+{
+   int ix;
+
+   /* must be at least two digits */
+   if (a->used < 2) {
+      return 0;
+   }
+
+   /* must be of the form b**k - a [a <= b] so all
+    * but the first digit must be equal to -1 (mod b).
+    */
+   for (ix = 1; ix < a->used; ix++) {
+       if (a->dp[ix] != MP_MASK) {
+          return 0;
+       }
+   }
+   return 1;
+}
+
+#endif
+
+/* End: bn_mp_dr_is_modulus.c */
+
+/* Start: bn_mp_dr_reduce.c */
+#include <tommath.h>
+#ifdef BN_MP_DR_REDUCE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
+ *
+ * Based on algorithm from the paper
+ *
+ * "Generating Efficient Primes for Discrete Log Cryptosystems"
+ *                 Chae Hoon Lim, Pil Joong Lee,
+ *          POSTECH Information Research Laboratories
+ *
+ * The modulus must be of a special format [see manual]
+ *
+ * Has been modified to use algorithm 7.10 from the LTM book instead
+ *
+ * Input x must be in the range 0 <= x <= (n-1)**2
+ */
+int
+mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
+{
+  int      err, i, m;
+  mp_word  r;
+  mp_digit mu, *tmpx1, *tmpx2;
+
+  /* m = digits in modulus */
+  m = n->used;
+
+  /* ensure that "x" has at least 2m digits */
+  if (x->alloc < m + m) {
+    if ((err = mp_grow (x, m + m)) != MP_OKAY) {
+      return err;
+    }
+  }
+
+/* top of loop, this is where the code resumes if
+ * another reduction pass is required.
+ */
+top:
+  /* aliases for digits */
+  /* alias for lower half of x */
+  tmpx1 = x->dp;
+
+  /* alias for upper half of x, or x/B**m */
+  tmpx2 = x->dp + m;
+
+  /* set carry to zero */
+  mu = 0;
+
+  /* compute (x mod B**m) + k * [x/B**m] inline and inplace */
+  for (i = 0; i < m; i++) {
+      r         = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu;
+      *tmpx1++  = (mp_digit)(r & MP_MASK);
+      mu        = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
+  }
+
+  /* set final carry */
+  *tmpx1++ = mu;
+
+  /* zero words above m */
+  for (i = m + 1; i < x->used; i++) {
+      *tmpx1++ = 0;
+  }
+
+  /* clamp, sub and return */
+  mp_clamp (x);
+
+  /* if x >= n then subtract and reduce again
+   * Each successive "recursion" makes the input smaller and smaller.
+   */
+  if (mp_cmp_mag (x, n) != MP_LT) {
+    s_mp_sub(x, n, x);
+    goto top;
+  }
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_dr_reduce.c */
+
+/* Start: bn_mp_dr_setup.c */
+#include <tommath.h>
+#ifdef BN_MP_DR_SETUP_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* determines the setup value */
+void mp_dr_setup(mp_int *a, mp_digit *d)
+{
+   /* the casts are required if DIGIT_BIT is one less than
+    * the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
+    */
+   *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) - 
+        ((mp_word)a->dp[0]));
+}
+
+#endif
+
+/* End: bn_mp_dr_setup.c */
+
+/* Start: bn_mp_exch.c */
+#include <tommath.h>
+#ifdef BN_MP_EXCH_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* swap the elements of two integers, for cases where you can't simply swap the 
+ * mp_int pointers around
+ */
+void
+mp_exch (mp_int * a, mp_int * b)
+{
+  mp_int  t;
+
+  t  = *a;
+  *a = *b;
+  *b = t;
+}
+#endif
+
+/* End: bn_mp_exch.c */
+
+/* Start: bn_mp_expt_d.c */
+#include <tommath.h>
+#ifdef BN_MP_EXPT_D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* calculate c = a**b  using a square-multiply algorithm */
+int mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
+{
+  int     res, x;
+  mp_int  g;
+
+  if ((res = mp_init_copy (&g, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* set initial result */
+  mp_set (c, 1);
+
+  for (x = 0; x < (int) DIGIT_BIT; x++) {
+    /* square */
+    if ((res = mp_sqr (c, c)) != MP_OKAY) {
+      mp_clear (&g);
+      return res;
+    }
+
+    /* if the bit is set multiply */
+    if ((b & (mp_digit) (((mp_digit)1) << (DIGIT_BIT - 1))) != 0) {
+      if ((res = mp_mul (c, &g, c)) != MP_OKAY) {
+         mp_clear (&g);
+         return res;
+      }
+    }
+
+    /* shift to next bit */
+    b <<= 1;
+  }
+
+  mp_clear (&g);
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_expt_d.c */
+
+/* Start: bn_mp_exptmod.c */
+#include <tommath.h>
+#ifdef BN_MP_EXPTMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+
+/* this is a shell function that calls either the normal or Montgomery
+ * exptmod functions.  Originally the call to the montgomery code was
+ * embedded in the normal function but that wasted alot of stack space
+ * for nothing (since 99% of the time the Montgomery code would be called)
+ */
+int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+{
+  int dr;
+
+  /* modulus P must be positive */
+  if (P->sign == MP_NEG) {
+     return MP_VAL;
+  }
+
+  /* if exponent X is negative we have to recurse */
+  if (X->sign == MP_NEG) {
+#ifdef BN_MP_INVMOD_C
+     mp_int tmpG, tmpX;
+     int err;
+
+     /* first compute 1/G mod P */
+     if ((err = mp_init(&tmpG)) != MP_OKAY) {
+        return err;
+     }
+     if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) {
+        mp_clear(&tmpG);
+        return err;
+     }
+
+     /* now get |X| */
+     if ((err = mp_init(&tmpX)) != MP_OKAY) {
+        mp_clear(&tmpG);
+        return err;
+     }
+     if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
+        mp_clear_multi(&tmpG, &tmpX, NULL);
+        return err;
+     }
+
+     /* and now compute (1/G)**|X| instead of G**X [X < 0] */
+     err = mp_exptmod(&tmpG, &tmpX, P, Y);
+     mp_clear_multi(&tmpG, &tmpX, NULL);
+     return err;
+#else 
+     /* no invmod */
+     return MP_VAL;
+#endif
+  }
+
+/* modified diminished radix reduction */
+#if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C)
+  if (mp_reduce_is_2k_l(P) == MP_YES) {
+     return s_mp_exptmod(G, X, P, Y, 1);
+  }
+#endif
+
+#ifdef BN_MP_DR_IS_MODULUS_C
+  /* is it a DR modulus? */
+  dr = mp_dr_is_modulus(P);
+#else
+  /* default to no */
+  dr = 0;
+#endif
+
+#ifdef BN_MP_REDUCE_IS_2K_C
+  /* if not, is it a unrestricted DR modulus? */
+  if (dr == 0) {
+     dr = mp_reduce_is_2k(P) << 1;
+  }
+#endif
+    
+  /* if the modulus is odd or dr != 0 use the montgomery method */
+#ifdef BN_MP_EXPTMOD_FAST_C
+  if (mp_isodd (P) == 1 || dr !=  0) {
+    return mp_exptmod_fast (G, X, P, Y, dr);
+  } else {
+#endif
+#ifdef BN_S_MP_EXPTMOD_C
+    /* otherwise use the generic Barrett reduction technique */
+    return s_mp_exptmod (G, X, P, Y, 0);
+#else
+    /* no exptmod for evens */
+    return MP_VAL;
+#endif
+#ifdef BN_MP_EXPTMOD_FAST_C
+  }
+#endif
+}
+
+#endif
+
+/* End: bn_mp_exptmod.c */
+
+/* Start: bn_mp_exptmod_fast.c */
+#include <tommath.h>
+#ifdef BN_MP_EXPTMOD_FAST_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
+ *
+ * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
+ * The value of k changes based on the size of the exponent.
+ *
+ * Uses Montgomery or Diminished Radix reduction [whichever appropriate]
+ */
+
+#ifdef MP_LOW_MEM
+   #define TAB_SIZE 32
+#else
+   #define TAB_SIZE 256
+#endif
+
+int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
+{
+  mp_int  M[TAB_SIZE], res;
+  mp_digit buf, mp;
+  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+
+  /* use a pointer to the reduction algorithm.  This allows us to use
+   * one of many reduction algorithms without modding the guts of
+   * the code with if statements everywhere.
+   */
+  int     (*redux)(mp_int*,mp_int*,mp_digit);
+
+  /* find window size */
+  x = mp_count_bits (X);
+  if (x <= 7) {
+    winsize = 2;
+  } else if (x <= 36) {
+    winsize = 3;
+  } else if (x <= 140) {
+    winsize = 4;
+  } else if (x <= 450) {
+    winsize = 5;
+  } else if (x <= 1303) {
+    winsize = 6;
+  } else if (x <= 3529) {
+    winsize = 7;
+  } else {
+    winsize = 8;
+  }
+
+#ifdef MP_LOW_MEM
+  if (winsize > 5) {
+     winsize = 5;
+  }
+#endif
+
+  /* init M array */
+  /* init first cell */
+  if ((err = mp_init(&M[1])) != MP_OKAY) {
+     return err;
+  }
+
+  /* now init the second half of the array */
+  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
+    if ((err = mp_init(&M[x])) != MP_OKAY) {
+      for (y = 1<<(winsize-1); y < x; y++) {
+        mp_clear (&M[y]);
+      }
+      mp_clear(&M[1]);
+      return err;
+    }
+  }
+
+  /* determine and setup reduction code */
+  if (redmode == 0) {
+#ifdef BN_MP_MONTGOMERY_SETUP_C     
+     /* now setup montgomery  */
+     if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
+        goto LBL_M;
+     }
+#else
+     err = MP_VAL;
+     goto LBL_M;
+#endif
+
+     /* automatically pick the comba one if available (saves quite a few calls/ifs) */
+#ifdef BN_FAST_MP_MONTGOMERY_REDUCE_C
+     if (((P->used * 2 + 1) < MP_WARRAY) &&
+          P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+        redux = fast_mp_montgomery_reduce;
+     } else 
+#endif
+     {
+#ifdef BN_MP_MONTGOMERY_REDUCE_C
+        /* use slower baseline Montgomery method */
+        redux = mp_montgomery_reduce;
+#else
+        err = MP_VAL;
+        goto LBL_M;
+#endif
+     }
+  } else if (redmode == 1) {
+#if defined(BN_MP_DR_SETUP_C) && defined(BN_MP_DR_REDUCE_C)
+     /* setup DR reduction for moduli of the form B**k - b */
+     mp_dr_setup(P, &mp);
+     redux = mp_dr_reduce;
+#else
+     err = MP_VAL;
+     goto LBL_M;
+#endif
+  } else {
+#if defined(BN_MP_REDUCE_2K_SETUP_C) && defined(BN_MP_REDUCE_2K_C)
+     /* setup DR reduction for moduli of the form 2**k - b */
+     if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
+        goto LBL_M;
+     }
+     redux = mp_reduce_2k;
+#else
+     err = MP_VAL;
+     goto LBL_M;
+#endif
+  }
+
+  /* setup result */
+  if ((err = mp_init (&res)) != MP_OKAY) {
+    goto LBL_M;
+  }
+
+  /* create M table
+   *
+
+   *
+   * The first half of the table is not computed though accept for M[0] and M[1]
+   */
+
+  if (redmode == 0) {
+#ifdef BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+     /* now we need R mod m */
+     if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
+       goto LBL_RES;
+     }
+#else 
+     err = MP_VAL;
+     goto LBL_RES;
+#endif
+
+     /* now set M[1] to G * R mod m */
+     if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
+       goto LBL_RES;
+     }
+  } else {
+     mp_set(&res, 1);
+     if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
+        goto LBL_RES;
+     }
+  }
+
+  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
+  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
+    goto LBL_RES;
+  }
+
+  for (x = 0; x < (winsize - 1); x++) {
+    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
+      goto LBL_RES;
+    }
+    if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
+      goto LBL_RES;
+    }
+  }
+
+  /* create upper table */
+  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
+    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
+      goto LBL_RES;
+    }
+    if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
+      goto LBL_RES;
+    }
+  }
+
+  /* set initial mode and bit cnt */
+  mode   = 0;
+  bitcnt = 1;
+  buf    = 0;
+  digidx = X->used - 1;
+  bitcpy = 0;
+  bitbuf = 0;
+
+  for (;;) {
+    /* grab next digit as required */
+    if (--bitcnt == 0) {
+      /* if digidx == -1 we are out of digits so break */
+      if (digidx == -1) {
+        break;
+      }
+      /* read next digit and reset bitcnt */
+      buf    = X->dp[digidx--];
+      bitcnt = (int)DIGIT_BIT;
+    }
+
+    /* grab the next msb from the exponent */
+    y     = (mp_digit)(buf >> (DIGIT_BIT - 1)) & 1;
+    buf <<= (mp_digit)1;
+
+    /* if the bit is zero and mode == 0 then we ignore it
+     * These represent the leading zero bits before the first 1 bit
+     * in the exponent.  Technically this opt is not required but it
+     * does lower the # of trivial squaring/reductions used
+     */
+    if (mode == 0 && y == 0) {
+      continue;
+    }
+
+    /* if the bit is zero and mode == 1 then we square */
+    if (mode == 1 && y == 0) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+      if ((err = redux (&res, P, mp)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+      continue;
+    }
+
+    /* else we add it to the window */
+    bitbuf |= (y << (winsize - ++bitcpy));
+    mode    = 2;
+
+    if (bitcpy == winsize) {
+      /* ok window is filled so square as required and multiply  */
+      /* square first */
+      for (x = 0; x < winsize; x++) {
+        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+          goto LBL_RES;
+        }
+        if ((err = redux (&res, P, mp)) != MP_OKAY) {
+          goto LBL_RES;
+        }
+      }
+
+      /* then multiply */
+      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+      if ((err = redux (&res, P, mp)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+
+      /* empty window and reset */
+      bitcpy = 0;
+      bitbuf = 0;
+      mode   = 1;
+    }
+  }
+
+  /* if bits remain then square/multiply */
+  if (mode == 2 && bitcpy > 0) {
+    /* square then multiply if the bit is set */
+    for (x = 0; x < bitcpy; x++) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+      if ((err = redux (&res, P, mp)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+
+      /* get next bit of the window */
+      bitbuf <<= 1;
+      if ((bitbuf & (1 << winsize)) != 0) {
+        /* then multiply */
+        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
+          goto LBL_RES;
+        }
+        if ((err = redux (&res, P, mp)) != MP_OKAY) {
+          goto LBL_RES;
+        }
+      }
+    }
+  }
+
+  if (redmode == 0) {
+     /* fixup result if Montgomery reduction is used
+      * recall that any value in a Montgomery system is
+      * actually multiplied by R mod n.  So we have
+      * to reduce one more time to cancel out the factor
+      * of R.
+      */
+     if ((err = redux(&res, P, mp)) != MP_OKAY) {
+       goto LBL_RES;
+     }
+  }
+
+  /* swap res with Y */
+  mp_exch (&res, Y);
+  err = MP_OKAY;
+LBL_RES:mp_clear (&res);
+LBL_M:
+  mp_clear(&M[1]);
+  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
+    mp_clear (&M[x]);
+  }
+  return err;
+}
+#endif
+
+
+/* End: bn_mp_exptmod_fast.c */
+
+/* Start: bn_mp_exteuclid.c */
+#include <tommath.h>
+#ifdef BN_MP_EXTEUCLID_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* Extended euclidean algorithm of (a, b) produces 
+   a*u1 + b*u2 = u3
+ */
+int mp_exteuclid(mp_int *a, mp_int *b, mp_int *U1, mp_int *U2, mp_int *U3)
+{
+   mp_int u1,u2,u3,v1,v2,v3,t1,t2,t3,q,tmp;
+   int err;
+
+   if ((err = mp_init_multi(&u1, &u2, &u3, &v1, &v2, &v3, &t1, &t2, &t3, &q, &tmp, NULL)) != MP_OKAY) {
+      return err;
+   }
+
+   /* initialize, (u1,u2,u3) = (1,0,a) */
+   mp_set(&u1, 1);
+   if ((err = mp_copy(a, &u3)) != MP_OKAY)                                        { goto _ERR; }
+
+   /* initialize, (v1,v2,v3) = (0,1,b) */
+   mp_set(&v2, 1);
+   if ((err = mp_copy(b, &v3)) != MP_OKAY)                                        { goto _ERR; }
+
+   /* loop while v3 != 0 */
+   while (mp_iszero(&v3) == MP_NO) {
+       /* q = u3/v3 */
+       if ((err = mp_div(&u3, &v3, &q, NULL)) != MP_OKAY)                         { goto _ERR; }
+
+       /* (t1,t2,t3) = (u1,u2,u3) - (v1,v2,v3)q */
+       if ((err = mp_mul(&v1, &q, &tmp)) != MP_OKAY)                              { goto _ERR; }
+       if ((err = mp_sub(&u1, &tmp, &t1)) != MP_OKAY)                             { goto _ERR; }
+       if ((err = mp_mul(&v2, &q, &tmp)) != MP_OKAY)                              { goto _ERR; }
+       if ((err = mp_sub(&u2, &tmp, &t2)) != MP_OKAY)                             { goto _ERR; }
+       if ((err = mp_mul(&v3, &q, &tmp)) != MP_OKAY)                              { goto _ERR; }
+       if ((err = mp_sub(&u3, &tmp, &t3)) != MP_OKAY)                             { goto _ERR; }
+
+       /* (u1,u2,u3) = (v1,v2,v3) */
+       if ((err = mp_copy(&v1, &u1)) != MP_OKAY)                                  { goto _ERR; }
+       if ((err = mp_copy(&v2, &u2)) != MP_OKAY)                                  { goto _ERR; }
+       if ((err = mp_copy(&v3, &u3)) != MP_OKAY)                                  { goto _ERR; }
+
+       /* (v1,v2,v3) = (t1,t2,t3) */
+       if ((err = mp_copy(&t1, &v1)) != MP_OKAY)                                  { goto _ERR; }
+       if ((err = mp_copy(&t2, &v2)) != MP_OKAY)                                  { goto _ERR; }
+       if ((err = mp_copy(&t3, &v3)) != MP_OKAY)                                  { goto _ERR; }
+   }
+
+   /* make sure U3 >= 0 */
+   if (u3.sign == MP_NEG) {
+      mp_neg(&u1, &u1);
+      mp_neg(&u2, &u2);
+      mp_neg(&u3, &u3);
+   }
+
+   /* copy result out */
+   if (U1 != NULL) { mp_exch(U1, &u1); }
+   if (U2 != NULL) { mp_exch(U2, &u2); }
+   if (U3 != NULL) { mp_exch(U3, &u3); }
+
+   err = MP_OKAY;
+_ERR: mp_clear_multi(&u1, &u2, &u3, &v1, &v2, &v3, &t1, &t2, &t3, &q, &tmp, NULL);
+   return err;
+}
+#endif
+
+/* End: bn_mp_exteuclid.c */
+
+/* Start: bn_mp_fread.c */
+#include <tommath.h>
+#ifdef BN_MP_FREAD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* read a bigint from a file stream in ASCII */
+int mp_fread(mp_int *a, int radix, FILE *stream)
+{
+   int err, ch, neg, y;
+   
+   /* clear a */
+   mp_zero(a);
+   
+   /* if first digit is - then set negative */
+   ch = fgetc(stream);
+   if (ch == '-') {
+      neg = MP_NEG;
+      ch = fgetc(stream);
+   } else {
+      neg = MP_ZPOS;
+   }
+   
+   for (;;) {
+      /* find y in the radix map */
+      for (y = 0; y < radix; y++) {
+          if (mp_s_rmap[y] == ch) {
+             break;
+          }
+      }
+      if (y == radix) {
+         break;
+      }
+      
+      /* shift up and add */
+      if ((err = mp_mul_d(a, radix, a)) != MP_OKAY) {
+         return err;
+      }
+      if ((err = mp_add_d(a, y, a)) != MP_OKAY) {
+         return err;
+      }
+      
+      ch = fgetc(stream);
+   }
+   if (mp_cmp_d(a, 0) != MP_EQ) {
+      a->sign = neg;
+   }
+   
+   return MP_OKAY;
+}
+
+#endif
+
+/* End: bn_mp_fread.c */
+
+/* Start: bn_mp_fwrite.c */
+#include <tommath.h>
+#ifdef BN_MP_FWRITE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+int mp_fwrite(mp_int *a, int radix, FILE *stream)
+{
+   char *buf;
+   int err, len, x;
+   
+   if ((err = mp_radix_size(a, radix, &len)) != MP_OKAY) {
+      return err;
+   }
+
+   buf = OPT_CAST(char) XMALLOC (len);
+   if (buf == NULL) {
+      return MP_MEM;
+   }
+   
+   if ((err = mp_toradix(a, buf, radix)) != MP_OKAY) {
+      XFREE (buf);
+      return err;
+   }
+   
+   for (x = 0; x < len; x++) {
+       if (fputc(buf[x], stream) == EOF) {
+          XFREE (buf);
+          return MP_VAL;
+       }
+   }
+   
+   XFREE (buf);
+   return MP_OKAY;
+}
+
+#endif
+
+/* End: bn_mp_fwrite.c */
+
+/* Start: bn_mp_gcd.c */
+#include <tommath.h>
+#ifdef BN_MP_GCD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* Greatest Common Divisor using the binary method */
+int mp_gcd (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  u, v;
+  int     k, u_lsb, v_lsb, res;
+
+  /* either zero than gcd is the largest */
+  if (mp_iszero (a) == 1 && mp_iszero (b) == 0) {
+    return mp_abs (b, c);
+  }
+  if (mp_iszero (a) == 0 && mp_iszero (b) == 1) {
+    return mp_abs (a, c);
+  }
+
+  /* optimized.  At this point if a == 0 then
+   * b must equal zero too
+   */
+  if (mp_iszero (a) == 1) {
+    mp_zero(c);
+    return MP_OKAY;
+  }
+
+  /* get copies of a and b we can modify */
+  if ((res = mp_init_copy (&u, a)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init_copy (&v, b)) != MP_OKAY) {
+    goto LBL_U;
+  }
+
+  /* must be positive for the remainder of the algorithm */
+  u.sign = v.sign = MP_ZPOS;
+
+  /* B1.  Find the common power of two for u and v */
+  u_lsb = mp_cnt_lsb(&u);
+  v_lsb = mp_cnt_lsb(&v);
+  k     = MIN(u_lsb, v_lsb);
+
+  if (k > 0) {
+     /* divide the power of two out */
+     if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) {
+        goto LBL_V;
+     }
+
+     if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) {
+        goto LBL_V;
+     }
+  }
+
+  /* divide any remaining factors of two out */
+  if (u_lsb != k) {
+     if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) {
+        goto LBL_V;
+     }
+  }
+
+  if (v_lsb != k) {
+     if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) {
+        goto LBL_V;
+     }
+  }
+
+  while (mp_iszero(&v) == 0) {
+     /* make sure v is the largest */
+     if (mp_cmp_mag(&u, &v) == MP_GT) {
+        /* swap u and v to make sure v is >= u */
+        mp_exch(&u, &v);
+     }
+     
+     /* subtract smallest from largest */
+     if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) {
+        goto LBL_V;
+     }
+     
+     /* Divide out all factors of two */
+     if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) {
+        goto LBL_V;
+     } 
+  } 
+
+  /* multiply by 2**k which we divided out at the beginning */
+  if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) {
+     goto LBL_V;
+  }
+  c->sign = MP_ZPOS;
+  res = MP_OKAY;
+LBL_V:mp_clear (&u);
+LBL_U:mp_clear (&v);
+  return res;
+}
+#endif
+
+/* End: bn_mp_gcd.c */
+
+/* Start: bn_mp_get_int.c */
+#include <tommath.h>
+#ifdef BN_MP_GET_INT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* get the lower 32-bits of an mp_int */
+unsigned long mp_get_int(mp_int * a) 
+{
+  int i;
+  unsigned long res;
+
+  if (a->used == 0) {
+     return 0;
+  }
+
+  /* get number of digits of the lsb we have to read */
+  i = MIN(a->used,(int)((sizeof(unsigned long)*CHAR_BIT+DIGIT_BIT-1)/DIGIT_BIT))-1;
+
+  /* get most significant digit of result */
+  res = DIGIT(a,i);
+   
+  while (--i >= 0) {
+    res = (res << DIGIT_BIT) | DIGIT(a,i);
+  }
+
+  /* force result to 32-bits always so it is consistent on non 32-bit platforms */
+  return res & 0xFFFFFFFFUL;
+}
+#endif
+
+/* End: bn_mp_get_int.c */
+
+/* Start: bn_mp_grow.c */
+#include <tommath.h>
+#ifdef BN_MP_GROW_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* grow as required */
+int mp_grow (mp_int * a, int size)
+{
+  int     i;
+  mp_digit *tmp;
+
+  /* if the alloc size is smaller alloc more ram */
+  if (a->alloc < size) {
+    /* ensure there are always at least MP_PREC digits extra on top */
+    size += (MP_PREC * 2) - (size % MP_PREC);
+
+    /* reallocate the array a->dp
+     *
+     * We store the return in a temporary variable
+     * in case the operation failed we don't want
+     * to overwrite the dp member of a.
+     */
+    tmp = OPT_CAST(mp_digit) XREALLOC (a->dp, sizeof (mp_digit) * size);
+    if (tmp == NULL) {
+      /* reallocation failed but "a" is still valid [can be freed] */
+      return MP_MEM;
+    }
+
+    /* reallocation succeeded so set a->dp */
+    a->dp = tmp;
+
+    /* zero excess digits */
+    i        = a->alloc;
+    a->alloc = size;
+    for (; i < a->alloc; i++) {
+      a->dp[i] = 0;
+    }
+  }
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_grow.c */
+
+/* Start: bn_mp_init.c */
+#include <tommath.h>
+#ifdef BN_MP_INIT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* init a new mp_int */
+int mp_init (mp_int * a)
+{
+  int i;
+
+  /* allocate memory required and clear it */
+  a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * MP_PREC);
+  if (a->dp == NULL) {
+    return MP_MEM;
+  }
+
+  /* set the digits to zero */
+  for (i = 0; i < MP_PREC; i++) {
+      a->dp[i] = 0;
+  }
+
+  /* set the used to zero, allocated digits to the default precision
+   * and sign to positive */
+  a->used  = 0;
+  a->alloc = MP_PREC;
+  a->sign  = MP_ZPOS;
+
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_init.c */
+
+/* Start: bn_mp_init_copy.c */
+#include <tommath.h>
+#ifdef BN_MP_INIT_COPY_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* creates "a" then copies b into it */
+int mp_init_copy (mp_int * a, mp_int * b)
+{
+  int     res;
+
+  if ((res = mp_init (a)) != MP_OKAY) {
+    return res;
+  }
+  return mp_copy (b, a);
+}
+#endif
+
+/* End: bn_mp_init_copy.c */
+
+/* Start: bn_mp_init_multi.c */
+#include <tommath.h>
+#ifdef BN_MP_INIT_MULTI_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <stdarg.h>
+
+int mp_init_multi(mp_int *mp, ...) 
+{
+    mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
+    int n = 0;                 /* Number of ok inits */
+    mp_int* cur_arg = mp;
+    va_list args;
+
+    va_start(args, mp);        /* init args to next argument from caller */
+    while (cur_arg != NULL) {
+        if (mp_init(cur_arg) != MP_OKAY) {
+            /* Oops - error! Back-track and mp_clear what we already
+               succeeded in init-ing, then return error.
+            */
+            va_list clean_args;
+            
+            /* end the current list */
+            va_end(args);
+            
+            /* now start cleaning up */            
+            cur_arg = mp;
+            va_start(clean_args, mp);
+            while (n--) {
+                mp_clear(cur_arg);
+                cur_arg = va_arg(clean_args, mp_int*);
+            }
+            va_end(clean_args);
+            res = MP_MEM;
+            break;
+        }
+        n++;
+        cur_arg = va_arg(args, mp_int*);
+    }
+    va_end(args);
+    return res;                /* Assumed ok, if error flagged above. */
+}
+
+#endif
+
+/* End: bn_mp_init_multi.c */
+
+/* Start: bn_mp_init_set.c */
+#include <tommath.h>
+#ifdef BN_MP_INIT_SET_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* initialize and set a digit */
+int mp_init_set (mp_int * a, mp_digit b)
+{
+  int err;
+  if ((err = mp_init(a)) != MP_OKAY) {
+     return err;
+  }
+  mp_set(a, b);
+  return err;
+}
+#endif
+
+/* End: bn_mp_init_set.c */
+
+/* Start: bn_mp_init_set_int.c */
+#include <tommath.h>
+#ifdef BN_MP_INIT_SET_INT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* initialize and set a digit */
+int mp_init_set_int (mp_int * a, unsigned long b)
+{
+  int err;
+  if ((err = mp_init(a)) != MP_OKAY) {
+     return err;
+  }
+  return mp_set_int(a, b);
+}
+#endif
+
+/* End: bn_mp_init_set_int.c */
+
+/* Start: bn_mp_init_size.c */
+#include <tommath.h>
+#ifdef BN_MP_INIT_SIZE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* init an mp_init for a given size */
+int mp_init_size (mp_int * a, int size)
+{
+  int x;
+
+  /* pad size so there are always extra digits */
+  size += (MP_PREC * 2) - (size % MP_PREC);	
+  
+  /* alloc mem */
+  a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * size);
+  if (a->dp == NULL) {
+    return MP_MEM;
+  }
+
+  /* set the members */
+  a->used  = 0;
+  a->alloc = size;
+  a->sign  = MP_ZPOS;
+
+  /* zero the digits */
+  for (x = 0; x < size; x++) {
+      a->dp[x] = 0;
+  }
+
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_init_size.c */
+
+/* Start: bn_mp_invmod.c */
+#include <tommath.h>
+#ifdef BN_MP_INVMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* hac 14.61, pp608 */
+int mp_invmod (mp_int * a, mp_int * b, mp_int * c)
+{
+  /* b cannot be negative */
+  if (b->sign == MP_NEG || mp_iszero(b) == 1) {
+    return MP_VAL;
+  }
+
+#ifdef BN_FAST_MP_INVMOD_C
+  /* if the modulus is odd we can use a faster routine instead */
+  if (mp_isodd (b) == 1) {
+    return fast_mp_invmod (a, b, c);
+  }
+#endif
+
+#ifdef BN_MP_INVMOD_SLOW_C
+  return mp_invmod_slow(a, b, c);
+#endif
+
+  return MP_VAL;
+}
+#endif
+
+/* End: bn_mp_invmod.c */
+
+/* Start: bn_mp_invmod_slow.c */
+#include <tommath.h>
+#ifdef BN_MP_INVMOD_SLOW_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* hac 14.61, pp608 */
+int mp_invmod_slow (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  x, y, u, v, A, B, C, D;
+  int     res;
+
+  /* b cannot be negative */
+  if (b->sign == MP_NEG || mp_iszero(b) == 1) {
+    return MP_VAL;
+  }
+
+  /* init temps */
+  if ((res = mp_init_multi(&x, &y, &u, &v, 
+                           &A, &B, &C, &D, NULL)) != MP_OKAY) {
+     return res;
+  }
+
+  /* x = a, y = b */
+  if ((res = mp_mod(a, b, &x)) != MP_OKAY) {
+      goto LBL_ERR;
+  }
+  if ((res = mp_copy (b, &y)) != MP_OKAY) {
+    goto LBL_ERR;
+  }
+
+  /* 2. [modified] if x,y are both even then return an error! */
+  if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
+    res = MP_VAL;
+    goto LBL_ERR;
+  }
+
+  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
+  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
+    goto LBL_ERR;
+  }
+  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
+    goto LBL_ERR;
+  }
+  mp_set (&A, 1);
+  mp_set (&D, 1);
+
+top:
+  /* 4.  while u is even do */
+  while (mp_iseven (&u) == 1) {
+    /* 4.1 u = u/2 */
+    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+    /* 4.2 if A or B is odd then */
+    if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) {
+      /* A = (A+y)/2, B = (B-x)/2 */
+      if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
+         goto LBL_ERR;
+      }
+      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
+         goto LBL_ERR;
+      }
+    }
+    /* A = A/2, B = B/2 */
+    if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+  }
+
+  /* 5.  while v is even do */
+  while (mp_iseven (&v) == 1) {
+    /* 5.1 v = v/2 */
+    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+    /* 5.2 if C or D is odd then */
+    if (mp_isodd (&C) == 1 || mp_isodd (&D) == 1) {
+      /* C = (C+y)/2, D = (D-x)/2 */
+      if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
+         goto LBL_ERR;
+      }
+      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
+         goto LBL_ERR;
+      }
+    }
+    /* C = C/2, D = D/2 */
+    if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+  }
+
+  /* 6.  if u >= v then */
+  if (mp_cmp (&u, &v) != MP_LT) {
+    /* u = u - v, A = A - C, B = B - D */
+    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+
+    if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+
+    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+  } else {
+    /* v - v - u, C = C - A, D = D - B */
+    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+
+    if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+
+    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
+      goto LBL_ERR;
+    }
+  }
+
+  /* if not zero goto step 4 */
+  if (mp_iszero (&u) == 0)
+    goto top;
+
+  /* now a = C, b = D, gcd == g*v */
+
+  /* if v != 1 then there is no inverse */
+  if (mp_cmp_d (&v, 1) != MP_EQ) {
+    res = MP_VAL;
+    goto LBL_ERR;
+  }
+
+  /* if its too low */
+  while (mp_cmp_d(&C, 0) == MP_LT) {
+      if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
+         goto LBL_ERR;
+      }
+  }
+  
+  /* too big */
+  while (mp_cmp_mag(&C, b) != MP_LT) {
+      if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
+         goto LBL_ERR;
+      }
+  }
+  
+  /* C is now the inverse */
+  mp_exch (&C, c);
+  res = MP_OKAY;
+LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
+  return res;
+}
+#endif
+
+/* End: bn_mp_invmod_slow.c */
+
+/* Start: bn_mp_is_square.c */
+#include <tommath.h>
+#ifdef BN_MP_IS_SQUARE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* Check if remainders are possible squares - fast exclude non-squares */
+static const char rem_128[128] = {
+ 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1
+};
+
+static const char rem_105[105] = {
+ 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
+ 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
+ 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
+ 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
+ 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1
+};
+
+/* Store non-zero to ret if arg is square, and zero if not */
+int mp_is_square(mp_int *arg,int *ret) 
+{
+  int           res;
+  mp_digit      c;
+  mp_int        t;
+  unsigned long r;
+
+  /* Default to Non-square :) */
+  *ret = MP_NO; 
+
+  if (arg->sign == MP_NEG) {
+    return MP_VAL;
+  }
+
+  /* digits used?  (TSD) */
+  if (arg->used == 0) {
+     return MP_OKAY;
+  }
+
+  /* First check mod 128 (suppose that DIGIT_BIT is at least 7) */
+  if (rem_128[127 & DIGIT(arg,0)] == 1) {
+     return MP_OKAY;
+  }
+
+  /* Next check mod 105 (3*5*7) */
+  if ((res = mp_mod_d(arg,105,&c)) != MP_OKAY) {
+     return res;
+  }
+  if (rem_105[c] == 1) {
+     return MP_OKAY;
+  }
+
+
+  if ((res = mp_init_set_int(&t,11L*13L*17L*19L*23L*29L*31L)) != MP_OKAY) {
+     return res;
+  }
+  if ((res = mp_mod(arg,&t,&t)) != MP_OKAY) {
+     goto ERR;
+  }
+  r = mp_get_int(&t);
+  /* Check for other prime modules, note it's not an ERROR but we must
+   * free "t" so the easiest way is to goto ERR.  We know that res
+   * is already equal to MP_OKAY from the mp_mod call 
+   */ 
+  if ( (1L<<(r%11)) & 0x5C4L )             goto ERR;
+  if ( (1L<<(r%13)) & 0x9E4L )             goto ERR;
+  if ( (1L<<(r%17)) & 0x5CE8L )            goto ERR;
+  if ( (1L<<(r%19)) & 0x4F50CL )           goto ERR;
+  if ( (1L<<(r%23)) & 0x7ACCA0L )          goto ERR;
+  if ( (1L<<(r%29)) & 0xC2EDD0CL )         goto ERR;
+  if ( (1L<<(r%31)) & 0x6DE2B848L )        goto ERR;
+
+  /* Final check - is sqr(sqrt(arg)) == arg ? */
+  if ((res = mp_sqrt(arg,&t)) != MP_OKAY) {
+     goto ERR;
+  }
+  if ((res = mp_sqr(&t,&t)) != MP_OKAY) {
+     goto ERR;
+  }
+
+  *ret = (mp_cmp_mag(&t,arg) == MP_EQ) ? MP_YES : MP_NO;
+ERR:mp_clear(&t);
+  return res;
+}
+#endif
+
+/* End: bn_mp_is_square.c */
+
+/* Start: bn_mp_jacobi.c */
+#include <tommath.h>
+#ifdef BN_MP_JACOBI_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* computes the jacobi c = (a | n) (or Legendre if n is prime)
+ * HAC pp. 73 Algorithm 2.149
+ */
+int mp_jacobi (mp_int * a, mp_int * p, int *c)
+{
+  mp_int  a1, p1;
+  int     k, s, r, res;
+  mp_digit residue;
+
+  /* if p <= 0 return MP_VAL */
+  if (mp_cmp_d(p, 0) != MP_GT) {
+     return MP_VAL;
+  }
+
+  /* step 1.  if a == 0, return 0 */
+  if (mp_iszero (a) == 1) {
+    *c = 0;
+    return MP_OKAY;
+  }
+
+  /* step 2.  if a == 1, return 1 */
+  if (mp_cmp_d (a, 1) == MP_EQ) {
+    *c = 1;
+    return MP_OKAY;
+  }
+
+  /* default */
+  s = 0;
+
+  /* step 3.  write a = a1 * 2**k  */
+  if ((res = mp_init_copy (&a1, a)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init (&p1)) != MP_OKAY) {
+    goto LBL_A1;
+  }
+
+  /* divide out larger power of two */
+  k = mp_cnt_lsb(&a1);
+  if ((res = mp_div_2d(&a1, k, &a1, NULL)) != MP_OKAY) {
+     goto LBL_P1;
+  }
+
+  /* step 4.  if e is even set s=1 */
+  if ((k & 1) == 0) {
+    s = 1;
+  } else {
+    /* else set s=1 if p = 1/7 (mod 8) or s=-1 if p = 3/5 (mod 8) */
+    residue = p->dp[0] & 7;
+
+    if (residue == 1 || residue == 7) {
+      s = 1;
+    } else if (residue == 3 || residue == 5) {
+      s = -1;
+    }
+  }
+
+  /* step 5.  if p == 3 (mod 4) *and* a1 == 3 (mod 4) then s = -s */
+  if ( ((p->dp[0] & 3) == 3) && ((a1.dp[0] & 3) == 3)) {
+    s = -s;
+  }
+
+  /* if a1 == 1 we're done */
+  if (mp_cmp_d (&a1, 1) == MP_EQ) {
+    *c = s;
+  } else {
+    /* n1 = n mod a1 */
+    if ((res = mp_mod (p, &a1, &p1)) != MP_OKAY) {
+      goto LBL_P1;
+    }
+    if ((res = mp_jacobi (&p1, &a1, &r)) != MP_OKAY) {
+      goto LBL_P1;
+    }
+    *c = s * r;
+  }
+
+  /* done */
+  res = MP_OKAY;
+LBL_P1:mp_clear (&p1);
+LBL_A1:mp_clear (&a1);
+  return res;
+}
+#endif
+
+/* End: bn_mp_jacobi.c */
+
+/* Start: bn_mp_karatsuba_mul.c */
+#include <tommath.h>
+#ifdef BN_MP_KARATSUBA_MUL_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* c = |a| * |b| using Karatsuba Multiplication using 
+ * three half size multiplications
+ *
+ * Let B represent the radix [e.g. 2**DIGIT_BIT] and 
+ * let n represent half of the number of digits in 
+ * the min(a,b)
+ *
+ * a = a1 * B**n + a0
+ * b = b1 * B**n + b0
+ *
+ * Then, a * b => 
+   a1b1 * B**2n + ((a1 - a0)(b1 - b0) + a0b0 + a1b1) * B + a0b0
+ *
+ * Note that a1b1 and a0b0 are used twice and only need to be 
+ * computed once.  So in total three half size (half # of 
+ * digit) multiplications are performed, a0b0, a1b1 and 
+ * (a1-b1)(a0-b0)
+ *
+ * Note that a multiplication of half the digits requires
+ * 1/4th the number of single precision multiplications so in 
+ * total after one call 25% of the single precision multiplications 
+ * are saved.  Note also that the call to mp_mul can end up back 
+ * in this function if the a0, a1, b0, or b1 are above the threshold.  
+ * This is known as divide-and-conquer and leads to the famous 
+ * O(N**lg(3)) or O(N**1.584) work which is asymptopically lower than 
+ * the standard O(N**2) that the baseline/comba methods use.  
+ * Generally though the overhead of this method doesn't pay off 
+ * until a certain size (N ~ 80) is reached.
+ */
+int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  x0, x1, y0, y1, t1, x0y0, x1y1;
+  int     B, err;
+
+  /* default the return code to an error */
+  err = MP_MEM;
+
+  /* min # of digits */
+  B = MIN (a->used, b->used);
+
+  /* now divide in two */
+  B = B >> 1;
+
+  /* init copy all the temps */
+  if (mp_init_size (&x0, B) != MP_OKAY)
+    goto ERR;
+  if (mp_init_size (&x1, a->used - B) != MP_OKAY)
+    goto X0;
+  if (mp_init_size (&y0, B) != MP_OKAY)
+    goto X1;
+  if (mp_init_size (&y1, b->used - B) != MP_OKAY)
+    goto Y0;
+
+  /* init temps */
+  if (mp_init_size (&t1, B * 2) != MP_OKAY)
+    goto Y1;
+  if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
+    goto T1;
+  if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
+    goto X0Y0;
+
+  /* now shift the digits */
+  x0.used = y0.used = B;
+  x1.used = a->used - B;
+  y1.used = b->used - B;
+
+  {
+    register int x;
+    register mp_digit *tmpa, *tmpb, *tmpx, *tmpy;
+
+    /* we copy the digits directly instead of using higher level functions
+     * since we also need to shift the digits
+     */
+    tmpa = a->dp;
+    tmpb = b->dp;
+
+    tmpx = x0.dp;
+    tmpy = y0.dp;
+    for (x = 0; x < B; x++) {
+      *tmpx++ = *tmpa++;
+      *tmpy++ = *tmpb++;
+    }
+
+    tmpx = x1.dp;
+    for (x = B; x < a->used; x++) {
+      *tmpx++ = *tmpa++;
+    }
+
+    tmpy = y1.dp;
+    for (x = B; x < b->used; x++) {
+      *tmpy++ = *tmpb++;
+    }
+  }
+
+  /* only need to clamp the lower words since by definition the 
+   * upper words x1/y1 must have a known number of digits
+   */
+  mp_clamp (&x0);
+  mp_clamp (&y0);
+
+  /* now calc the products x0y0 and x1y1 */
+  /* after this x0 is no longer required, free temp [x0==t2]! */
+  if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)  
+    goto X1Y1;          /* x0y0 = x0*y0 */
+  if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
+    goto X1Y1;          /* x1y1 = x1*y1 */
+
+  /* now calc x1-x0 and y1-y0 */
+  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = x1 - x0 */
+  if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
+    goto X1Y1;          /* t2 = y1 - y0 */
+  if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = (x1 - x0) * (y1 - y0) */
+
+  /* add x0y0 */
+  if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
+    goto X1Y1;          /* t2 = x0y0 + x1y1 */
+  if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
+
+  /* shift by B */
+  if (mp_lshd (&t1, B) != MP_OKAY)
+    goto X1Y1;          /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
+  if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
+    goto X1Y1;          /* x1y1 = x1y1 << 2*B */
+
+  if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = x0y0 + t1 */
+  if (mp_add (&t1, &x1y1, c) != MP_OKAY)
+    goto X1Y1;          /* t1 = x0y0 + t1 + x1y1 */
+
+  /* Algorithm succeeded set the return code to MP_OKAY */
+  err = MP_OKAY;
+
+X1Y1:mp_clear (&x1y1);
+X0Y0:mp_clear (&x0y0);
+T1:mp_clear (&t1);
+Y1:mp_clear (&y1);
+Y0:mp_clear (&y0);
+X1:mp_clear (&x1);
+X0:mp_clear (&x0);
+ERR:
+  return err;
+}
+#endif
+
+/* End: bn_mp_karatsuba_mul.c */
+
+/* Start: bn_mp_karatsuba_sqr.c */
+#include <tommath.h>
+#ifdef BN_MP_KARATSUBA_SQR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* Karatsuba squaring, computes b = a*a using three 
+ * half size squarings
+ *
+ * See comments of karatsuba_mul for details.  It 
+ * is essentially the same algorithm but merely 
+ * tuned to perform recursive squarings.
+ */
+int mp_karatsuba_sqr (mp_int * a, mp_int * b)
+{
+  mp_int  x0, x1, t1, t2, x0x0, x1x1;
+  int     B, err;
+
+  err = MP_MEM;
+
+  /* min # of digits */
+  B = a->used;
+
+  /* now divide in two */
+  B = B >> 1;
+
+  /* init copy all the temps */
+  if (mp_init_size (&x0, B) != MP_OKAY)
+    goto ERR;
+  if (mp_init_size (&x1, a->used - B) != MP_OKAY)
+    goto X0;
+
+  /* init temps */
+  if (mp_init_size (&t1, a->used * 2) != MP_OKAY)
+    goto X1;
+  if (mp_init_size (&t2, a->used * 2) != MP_OKAY)
+    goto T1;
+  if (mp_init_size (&x0x0, B * 2) != MP_OKAY)
+    goto T2;
+  if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY)
+    goto X0X0;
+
+  {
+    register int x;
+    register mp_digit *dst, *src;
+
+    src = a->dp;
+
+    /* now shift the digits */
+    dst = x0.dp;
+    for (x = 0; x < B; x++) {
+      *dst++ = *src++;
+    }
+
+    dst = x1.dp;
+    for (x = B; x < a->used; x++) {
+      *dst++ = *src++;
+    }
+  }
+
+  x0.used = B;
+  x1.used = a->used - B;
+
+  mp_clamp (&x0);
+
+  /* now calc the products x0*x0 and x1*x1 */
+  if (mp_sqr (&x0, &x0x0) != MP_OKAY)
+    goto X1X1;           /* x0x0 = x0*x0 */
+  if (mp_sqr (&x1, &x1x1) != MP_OKAY)
+    goto X1X1;           /* x1x1 = x1*x1 */
+
+  /* now calc (x1-x0)**2 */
+  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+    goto X1X1;           /* t1 = x1 - x0 */
+  if (mp_sqr (&t1, &t1) != MP_OKAY)
+    goto X1X1;           /* t1 = (x1 - x0) * (x1 - x0) */
+
+  /* add x0y0 */
+  if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
+    goto X1X1;           /* t2 = x0x0 + x1x1 */
+  if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
+    goto X1X1;           /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */
+
+  /* shift by B */
+  if (mp_lshd (&t1, B) != MP_OKAY)
+    goto X1X1;           /* t1 = (x0x0 + x1x1 - (x1-x0)*(x1-x0))<<B */
+  if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
+    goto X1X1;           /* x1x1 = x1x1 << 2*B */
+
+  if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
+    goto X1X1;           /* t1 = x0x0 + t1 */
+  if (mp_add (&t1, &x1x1, b) != MP_OKAY)
+    goto X1X1;           /* t1 = x0x0 + t1 + x1x1 */
+
+  err = MP_OKAY;
+
+X1X1:mp_clear (&x1x1);
+X0X0:mp_clear (&x0x0);
+T2:mp_clear (&t2);
+T1:mp_clear (&t1);
+X1:mp_clear (&x1);
+X0:mp_clear (&x0);
+ERR:
+  return err;
+}
+#endif
+
+/* End: bn_mp_karatsuba_sqr.c */
+
+/* Start: bn_mp_lcm.c */
+#include <tommath.h>
+#ifdef BN_MP_LCM_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* computes least common multiple as |a*b|/(a, b) */
+int mp_lcm (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     res;
+  mp_int  t1, t2;
+
+
+  if ((res = mp_init_multi (&t1, &t2, NULL)) != MP_OKAY) {
+    return res;
+  }
+
+  /* t1 = get the GCD of the two inputs */
+  if ((res = mp_gcd (a, b, &t1)) != MP_OKAY) {
+    goto LBL_T;
+  }
+
+  /* divide the smallest by the GCD */
+  if (mp_cmp_mag(a, b) == MP_LT) {
+     /* store quotient in t2 such that t2 * b is the LCM */
+     if ((res = mp_div(a, &t1, &t2, NULL)) != MP_OKAY) {
+        goto LBL_T;
+     }
+     res = mp_mul(b, &t2, c);
+  } else {
+     /* store quotient in t2 such that t2 * a is the LCM */
+     if ((res = mp_div(b, &t1, &t2, NULL)) != MP_OKAY) {
+        goto LBL_T;
+     }
+     res = mp_mul(a, &t2, c);
+  }
+
+  /* fix the sign to positive */
+  c->sign = MP_ZPOS;
+
+LBL_T:
+  mp_clear_multi (&t1, &t2, NULL);
+  return res;
+}
+#endif
+
+/* End: bn_mp_lcm.c */
+
+/* Start: bn_mp_lshd.c */
+#include <tommath.h>
+#ifdef BN_MP_LSHD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* shift left a certain amount of digits */
+int mp_lshd (mp_int * a, int b)
+{
+  int     x, res;
+
+  /* if its less than zero return */
+  if (b <= 0) {
+    return MP_OKAY;
+  }
+
+  /* grow to fit the new digits */
+  if (a->alloc < a->used + b) {
+     if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
+       return res;
+     }
+  }
+
+  {
+    register mp_digit *top, *bottom;
+
+    /* increment the used by the shift amount then copy upwards */
+    a->used += b;
+
+    /* top */
+    top = a->dp + a->used - 1;
+
+    /* base */
+    bottom = a->dp + a->used - 1 - b;
+
+    /* much like mp_rshd this is implemented using a sliding window
+     * except the window goes the otherway around.  Copying from
+     * the bottom to the top.  see bn_mp_rshd.c for more info.
+     */
+    for (x = a->used - 1; x >= b; x--) {
+      *top-- = *bottom--;
+    }
+
+    /* zero the lower digits */
+    top = a->dp;
+    for (x = 0; x < b; x++) {
+      *top++ = 0;
+    }
+  }
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_lshd.c */
+
+/* Start: bn_mp_mod.c */
+#include <tommath.h>
+#ifdef BN_MP_MOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* c = a mod b, 0 <= c < b */
+int
+mp_mod (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  t;
+  int     res;
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_div (a, b, NULL, &t)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+
+  if (t.sign != b->sign) {
+    res = mp_add (b, &t, c);
+  } else {
+    res = MP_OKAY;
+    mp_exch (&t, c);
+  }
+
+  mp_clear (&t);
+  return res;
+}
+#endif
+
+/* End: bn_mp_mod.c */
+
+/* Start: bn_mp_mod_2d.c */
+#include <tommath.h>
+#ifdef BN_MP_MOD_2D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* calc a value mod 2**b */
+int
+mp_mod_2d (mp_int * a, int b, mp_int * c)
+{
+  int     x, res;
+
+  /* if b is <= 0 then zero the int */
+  if (b <= 0) {
+    mp_zero (c);
+    return MP_OKAY;
+  }
+
+  /* if the modulus is larger than the value than return */
+  if (b >= (int) (a->used * DIGIT_BIT)) {
+    res = mp_copy (a, c);
+    return res;
+  }
+
+  /* copy */
+  if ((res = mp_copy (a, c)) != MP_OKAY) {
+    return res;
+  }
+
+  /* zero digits above the last digit of the modulus */
+  for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
+    c->dp[x] = 0;
+  }
+  /* clear the digit that is not completely outside/inside the modulus */
+  c->dp[b / DIGIT_BIT] &=
+    (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digit) 1));
+  mp_clamp (c);
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_mod_2d.c */
+
+/* Start: bn_mp_mod_d.c */
+#include <tommath.h>
+#ifdef BN_MP_MOD_D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+int
+mp_mod_d (mp_int * a, mp_digit b, mp_digit * c)
+{
+  return mp_div_d(a, b, NULL, c);
+}
+#endif
+
+/* End: bn_mp_mod_d.c */
+
+/* Start: bn_mp_montgomery_calc_normalization.c */
+#include <tommath.h>
+#ifdef BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/*
+ * shifts with subtractions when the result is greater than b.
+ *
+ * The method is slightly modified to shift B unconditionally upto just under
+ * the leading bit of b.  This saves alot of multiple precision shifting.
+ */
+int mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
+{
+  int     x, bits, res;
+
+  /* how many bits of last digit does b use */
+  bits = mp_count_bits (b) % DIGIT_BIT;
+
+  if (b->used > 1) {
+     if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
+        return res;
+     }
+  } else {
+     mp_set(a, 1);
+     bits = 1;
+  }
+
+
+  /* now compute C = A * B mod b */
+  for (x = bits - 1; x < (int)DIGIT_BIT; x++) {
+    if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
+      return res;
+    }
+    if (mp_cmp_mag (a, b) != MP_LT) {
+      if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
+        return res;
+      }
+    }
+  }
+
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_montgomery_calc_normalization.c */
+
+/* Start: bn_mp_montgomery_reduce.c */
+#include <tommath.h>
+#ifdef BN_MP_MONTGOMERY_REDUCE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* computes xR**-1 == x (mod N) via Montgomery Reduction */
+int
+mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
+{
+  int     ix, res, digs;
+  mp_digit mu;
+
+  /* can the fast reduction [comba] method be used?
+   *
+   * Note that unlike in mul you're safely allowed *less*
+   * than the available columns [255 per default] since carries
+   * are fixed up in the inner loop.
+   */
+  digs = n->used * 2 + 1;
+  if ((digs < MP_WARRAY) &&
+      n->used <
+      (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+    return fast_mp_montgomery_reduce (x, n, rho);
+  }
+
+  /* grow the input as required */
+  if (x->alloc < digs) {
+    if ((res = mp_grow (x, digs)) != MP_OKAY) {
+      return res;
+    }
+  }
+  x->used = digs;
+
+  for (ix = 0; ix < n->used; ix++) {
+    /* mu = ai * rho mod b
+     *
+     * The value of rho must be precalculated via
+     * montgomery_setup() such that
+     * it equals -1/n0 mod b this allows the
+     * following inner loop to reduce the
+     * input one digit at a time
+     */
+    mu = (mp_digit) (((mp_word)x->dp[ix]) * ((mp_word)rho) & MP_MASK);
+
+    /* a = a + mu * m * b**i */
+    {
+      register int iy;
+      register mp_digit *tmpn, *tmpx, u;
+      register mp_word r;
+
+      /* alias for digits of the modulus */
+      tmpn = n->dp;
+
+      /* alias for the digits of x [the input] */
+      tmpx = x->dp + ix;
+
+      /* set the carry to zero */
+      u = 0;
+
+      /* Multiply and add in place */
+      for (iy = 0; iy < n->used; iy++) {
+        /* compute product and sum */
+        r       = ((mp_word)mu) * ((mp_word)*tmpn++) +
+                  ((mp_word) u) + ((mp_word) * tmpx);
+
+        /* get carry */
+        u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+
+        /* fix digit */
+        *tmpx++ = (mp_digit)(r & ((mp_word) MP_MASK));
+      }
+      /* At this point the ix'th digit of x should be zero */
+
+
+      /* propagate carries upwards as required*/
+      while (u) {
+        *tmpx   += u;
+        u        = *tmpx >> DIGIT_BIT;
+        *tmpx++ &= MP_MASK;
+      }
+    }
+  }
+
+  /* at this point the n.used'th least
+   * significant digits of x are all zero
+   * which means we can shift x to the
+   * right by n.used digits and the
+   * residue is unchanged.
+   */
+
+  /* x = x/b**n.used */
+  mp_clamp(x);
+  mp_rshd (x, n->used);
+
+  /* if x >= n then x = x - n */
+  if (mp_cmp_mag (x, n) != MP_LT) {
+    return s_mp_sub (x, n, x);
+  }
+
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_montgomery_reduce.c */
+
+/* Start: bn_mp_montgomery_setup.c */
+#include <tommath.h>
+#ifdef BN_MP_MONTGOMERY_SETUP_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* setups the montgomery reduction stuff */
+int
+mp_montgomery_setup (mp_int * n, mp_digit * rho)
+{
+  mp_digit x, b;
+
+/* fast inversion mod 2**k
+ *
+ * Based on the fact that
+ *
+ * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
+ *                    =>  2*X*A - X*X*A*A = 1
+ *                    =>  2*(1) - (1)     = 1
+ */
+  b = n->dp[0];
+
+  if ((b & 1) == 0) {
+    return MP_VAL;
+  }
+
+  x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
+  x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
+#if !defined(MP_8BIT)
+  x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
+#endif
+#if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT))
+  x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
+#endif
+#ifdef MP_64BIT
+  x *= 2 - b * x;               /* here x*a==1 mod 2**64 */
+#endif
+
+  /* rho = -1/m mod b */
+  *rho = (((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
+
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_montgomery_setup.c */
+
+/* Start: bn_mp_mul.c */
+#include <tommath.h>
+#ifdef BN_MP_MUL_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* high level multiplication (handles sign) */
+int mp_mul (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     res, neg;
+  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
+
+  /* use Toom-Cook? */
+#ifdef BN_MP_TOOM_MUL_C
+  if (MIN (a->used, b->used) >= TOOM_MUL_CUTOFF) {
+    res = mp_toom_mul(a, b, c);
+  } else 
+#endif
+#ifdef BN_MP_KARATSUBA_MUL_C
+  /* use Karatsuba? */
+  if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) {
+    res = mp_karatsuba_mul (a, b, c);
+  } else 
+#endif
+  {
+    /* can we use the fast multiplier?
+     *
+     * The fast multiplier can be used if the output will 
+     * have less than MP_WARRAY digits and the number of 
+     * digits won't affect carry propagation
+     */
+    int     digs = a->used + b->used + 1;
+
+#ifdef BN_FAST_S_MP_MUL_DIGS_C
+    if ((digs < MP_WARRAY) &&
+        MIN(a->used, b->used) <= 
+        (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+      res = fast_s_mp_mul_digs (a, b, c, digs);
+    } else 
+#endif
+#ifdef BN_S_MP_MUL_DIGS_C
+      res = s_mp_mul (a, b, c); /* uses s_mp_mul_digs */
+#else
+      res = MP_VAL;
+#endif
+
+  }
+  c->sign = (c->used > 0) ? neg : MP_ZPOS;
+  return res;
+}
+#endif
+
+/* End: bn_mp_mul.c */
+
+/* Start: bn_mp_mul_2.c */
+#include <tommath.h>
+#ifdef BN_MP_MUL_2_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* b = a*2 */
+int mp_mul_2(mp_int * a, mp_int * b)
+{
+  int     x, res, oldused;
+
+  /* grow to accomodate result */
+  if (b->alloc < a->used + 1) {
+    if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  oldused = b->used;
+  b->used = a->used;
+
+  {
+    register mp_digit r, rr, *tmpa, *tmpb;
+
+    /* alias for source */
+    tmpa = a->dp;
+    
+    /* alias for dest */
+    tmpb = b->dp;
+
+    /* carry */
+    r = 0;
+    for (x = 0; x < a->used; x++) {
+    
+      /* get what will be the *next* carry bit from the 
+       * MSB of the current digit 
+       */
+      rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
+      
+      /* now shift up this digit, add in the carry [from the previous] */
+      *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
+      
+      /* copy the carry that would be from the source 
+       * digit into the next iteration 
+       */
+      r = rr;
+    }
+
+    /* new leading digit? */
+    if (r != 0) {
+      /* add a MSB which is always 1 at this point */
+      *tmpb = 1;
+      ++(b->used);
+    }
+
+    /* now zero any excess digits on the destination 
+     * that we didn't write to 
+     */
+    tmpb = b->dp + b->used;
+    for (x = b->used; x < oldused; x++) {
+      *tmpb++ = 0;
+    }
+  }
+  b->sign = a->sign;
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_mul_2.c */
+
+/* Start: bn_mp_mul_2d.c */
+#include <tommath.h>
+#ifdef BN_MP_MUL_2D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* shift left by a certain bit count */
+int mp_mul_2d (mp_int * a, int b, mp_int * c)
+{
+  mp_digit d;
+  int      res;
+
+  /* copy */
+  if (a != c) {
+     if ((res = mp_copy (a, c)) != MP_OKAY) {
+       return res;
+     }
+  }
+
+  if (c->alloc < (int)(c->used + b/DIGIT_BIT + 1)) {
+     if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
+       return res;
+     }
+  }
+
+  /* shift by as many digits in the bit count */
+  if (b >= (int)DIGIT_BIT) {
+    if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* shift any bit count < DIGIT_BIT */
+  d = (mp_digit) (b % DIGIT_BIT);
+  if (d != 0) {
+    register mp_digit *tmpc, shift, mask, r, rr;
+    register int x;
+
+    /* bitmask for carries */
+    mask = (((mp_digit)1) << d) - 1;
+
+    /* shift for msbs */
+    shift = DIGIT_BIT - d;
+
+    /* alias */
+    tmpc = c->dp;
+
+    /* carry */
+    r    = 0;
+    for (x = 0; x < c->used; x++) {
+      /* get the higher bits of the current word */
+      rr = (*tmpc >> shift) & mask;
+
+      /* shift the current word and OR in the carry */
+      *tmpc = ((*tmpc << d) | r) & MP_MASK;
+      ++tmpc;
+
+      /* set the carry to the carry bits of the current word */
+      r = rr;
+    }
+    
+    /* set final carry */
+    if (r != 0) {
+       c->dp[(c->used)++] = r;
+    }
+  }
+  mp_clamp (c);
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_mul_2d.c */
+
+/* Start: bn_mp_mul_d.c */
+#include <tommath.h>
+#ifdef BN_MP_MUL_D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* multiply by a digit */
+int
+mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
+{
+  mp_digit u, *tmpa, *tmpc;
+  mp_word  r;
+  int      ix, res, olduse;
+
+  /* make sure c is big enough to hold a*b */
+  if (c->alloc < a->used + 1) {
+    if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* get the original destinations used count */
+  olduse = c->used;
+
+  /* set the sign */
+  c->sign = a->sign;
+
+  /* alias for a->dp [source] */
+  tmpa = a->dp;
+
+  /* alias for c->dp [dest] */
+  tmpc = c->dp;
+
+  /* zero carry */
+  u = 0;
+
+  /* compute columns */
+  for (ix = 0; ix < a->used; ix++) {
+    /* compute product and carry sum for this term */
+    r       = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
+
+    /* mask off higher bits to get a single digit */
+    *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
+
+    /* send carry into next iteration */
+    u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+  }
+
+  /* store final carry [if any] and increment ix offset  */
+  *tmpc++ = u;
+  ++ix;
+
+  /* now zero digits above the top */
+  while (ix++ < olduse) {
+     *tmpc++ = 0;
+  }
+
+  /* set used count */
+  c->used = a->used + 1;
+  mp_clamp(c);
+
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_mul_d.c */
+
+/* Start: bn_mp_mulmod.c */
+#include <tommath.h>
+#ifdef BN_MP_MULMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* d = a * b (mod c) */
+int
+mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+  int     res;
+  mp_int  t;
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+  res = mp_mod (&t, c, d);
+  mp_clear (&t);
+  return res;
+}
+#endif
+
+/* End: bn_mp_mulmod.c */
+
+/* Start: bn_mp_n_root.c */
+#include <tommath.h>
+#ifdef BN_MP_N_ROOT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* find the n'th root of an integer 
+ *
+ * Result found such that (c)**b <= a and (c+1)**b > a 
+ *
+ * This algorithm uses Newton's approximation 
+ * x[i+1] = x[i] - f(x[i])/f'(x[i]) 
+ * which will find the root in log(N) time where 
+ * each step involves a fair bit.  This is not meant to 
+ * find huge roots [square and cube, etc].
+ */
+int mp_n_root (mp_int * a, mp_digit b, mp_int * c)
+{
+  mp_int  t1, t2, t3;
+  int     res, neg;
+
+  /* input must be positive if b is even */
+  if ((b & 1) == 0 && a->sign == MP_NEG) {
+    return MP_VAL;
+  }
+
+  if ((res = mp_init (&t1)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init (&t2)) != MP_OKAY) {
+    goto LBL_T1;
+  }
+
+  if ((res = mp_init (&t3)) != MP_OKAY) {
+    goto LBL_T2;
+  }
+
+  /* if a is negative fudge the sign but keep track */
+  neg     = a->sign;
+  a->sign = MP_ZPOS;
+
+  /* t2 = 2 */
+  mp_set (&t2, 2);
+
+  do {
+    /* t1 = t2 */
+    if ((res = mp_copy (&t2, &t1)) != MP_OKAY) {
+      goto LBL_T3;
+    }
+
+    /* t2 = t1 - ((t1**b - a) / (b * t1**(b-1))) */
+    
+    /* t3 = t1**(b-1) */
+    if ((res = mp_expt_d (&t1, b - 1, &t3)) != MP_OKAY) {   
+      goto LBL_T3;
+    }
+
+    /* numerator */
+    /* t2 = t1**b */
+    if ((res = mp_mul (&t3, &t1, &t2)) != MP_OKAY) {    
+      goto LBL_T3;
+    }
+
+    /* t2 = t1**b - a */
+    if ((res = mp_sub (&t2, a, &t2)) != MP_OKAY) {  
+      goto LBL_T3;
+    }
+
+    /* denominator */
+    /* t3 = t1**(b-1) * b  */
+    if ((res = mp_mul_d (&t3, b, &t3)) != MP_OKAY) {    
+      goto LBL_T3;
+    }
+
+    /* t3 = (t1**b - a)/(b * t1**(b-1)) */
+    if ((res = mp_div (&t2, &t3, &t3, NULL)) != MP_OKAY) {  
+      goto LBL_T3;
+    }
+
+    if ((res = mp_sub (&t1, &t3, &t2)) != MP_OKAY) {
+      goto LBL_T3;
+    }
+  }  while (mp_cmp (&t1, &t2) != MP_EQ);
+
+  /* result can be off by a few so check */
+  for (;;) {
+    if ((res = mp_expt_d (&t1, b, &t2)) != MP_OKAY) {
+      goto LBL_T3;
+    }
+
+    if (mp_cmp (&t2, a) == MP_GT) {
+      if ((res = mp_sub_d (&t1, 1, &t1)) != MP_OKAY) {
+         goto LBL_T3;
+      }
+    } else {
+      break;
+    }
+  }
+
+  /* reset the sign of a first */
+  a->sign = neg;
+
+  /* set the result */
+  mp_exch (&t1, c);
+
+  /* set the sign of the result */
+  c->sign = neg;
+
+  res = MP_OKAY;
+
+LBL_T3:mp_clear (&t3);
+LBL_T2:mp_clear (&t2);
+LBL_T1:mp_clear (&t1);
+  return res;
+}
+#endif
+
+/* End: bn_mp_n_root.c */
+
+/* Start: bn_mp_neg.c */
+#include <tommath.h>
+#ifdef BN_MP_NEG_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* b = -a */
+int mp_neg (mp_int * a, mp_int * b)
+{
+  int     res;
+  if (a != b) {
+     if ((res = mp_copy (a, b)) != MP_OKAY) {
+        return res;
+     }
+  }
+
+  if (mp_iszero(b) != MP_YES) {
+     b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+  } else {
+     b->sign = MP_ZPOS;
+  }
+
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_neg.c */
+
+/* Start: bn_mp_or.c */
+#include <tommath.h>
+#ifdef BN_MP_OR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* OR two ints together */
+int mp_or (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     res, ix, px;
+  mp_int  t, *x;
+
+  if (a->used > b->used) {
+    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+      return res;
+    }
+    px = b->used;
+    x = b;
+  } else {
+    if ((res = mp_init_copy (&t, b)) != MP_OKAY) {
+      return res;
+    }
+    px = a->used;
+    x = a;
+  }
+
+  for (ix = 0; ix < px; ix++) {
+    t.dp[ix] |= x->dp[ix];
+  }
+  mp_clamp (&t);
+  mp_exch (c, &t);
+  mp_clear (&t);
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_or.c */
+
+/* Start: bn_mp_prime_fermat.c */
+#include <tommath.h>
+#ifdef BN_MP_PRIME_FERMAT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* performs one Fermat test.
+ * 
+ * If "a" were prime then b**a == b (mod a) since the order of
+ * the multiplicative sub-group would be phi(a) = a-1.  That means
+ * it would be the same as b**(a mod (a-1)) == b**1 == b (mod a).
+ *
+ * Sets result to 1 if the congruence holds, or zero otherwise.
+ */
+int mp_prime_fermat (mp_int * a, mp_int * b, int *result)
+{
+  mp_int  t;
+  int     err;
+
+  /* default to composite  */
+  *result = MP_NO;
+
+  /* ensure b > 1 */
+  if (mp_cmp_d(b, 1) != MP_GT) {
+     return MP_VAL;
+  }
+
+  /* init t */
+  if ((err = mp_init (&t)) != MP_OKAY) {
+    return err;
+  }
+
+  /* compute t = b**a mod a */
+  if ((err = mp_exptmod (b, a, a, &t)) != MP_OKAY) {
+    goto LBL_T;
+  }
+
+  /* is it equal to b? */
+  if (mp_cmp (&t, b) == MP_EQ) {
+    *result = MP_YES;
+  }
+
+  err = MP_OKAY;
+LBL_T:mp_clear (&t);
+  return err;
+}
+#endif
+
+/* End: bn_mp_prime_fermat.c */
+
+/* Start: bn_mp_prime_is_divisible.c */
+#include <tommath.h>
+#ifdef BN_MP_PRIME_IS_DIVISIBLE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* determines if an integers is divisible by one 
+ * of the first PRIME_SIZE primes or not
+ *
+ * sets result to 0 if not, 1 if yes
+ */
+int mp_prime_is_divisible (mp_int * a, int *result)
+{
+  int     err, ix;
+  mp_digit res;
+
+  /* default to not */
+  *result = MP_NO;
+
+  for (ix = 0; ix < PRIME_SIZE; ix++) {
+    /* what is a mod LBL_prime_tab[ix] */
+    if ((err = mp_mod_d (a, ltm_prime_tab[ix], &res)) != MP_OKAY) {
+      return err;
+    }
+
+    /* is the residue zero? */
+    if (res == 0) {
+      *result = MP_YES;
+      return MP_OKAY;
+    }
+  }
+
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_prime_is_divisible.c */
+
+/* Start: bn_mp_prime_is_prime.c */
+#include <tommath.h>
+#ifdef BN_MP_PRIME_IS_PRIME_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* performs a variable number of rounds of Miller-Rabin
+ *
+ * Probability of error after t rounds is no more than
+
+ *
+ * Sets result to 1 if probably prime, 0 otherwise
+ */
+int mp_prime_is_prime (mp_int * a, int t, int *result)
+{
+  mp_int  b;
+  int     ix, err, res;
+
+  /* default to no */
+  *result = MP_NO;
+
+  /* valid value of t? */
+  if (t <= 0 || t > PRIME_SIZE) {
+    return MP_VAL;
+  }
+
+  /* is the input equal to one of the primes in the table? */
+  for (ix = 0; ix < PRIME_SIZE; ix++) {
+      if (mp_cmp_d(a, ltm_prime_tab[ix]) == MP_EQ) {
+         *result = 1;
+         return MP_OKAY;
+      }
+  }
+
+  /* first perform trial division */
+  if ((err = mp_prime_is_divisible (a, &res)) != MP_OKAY) {
+    return err;
+  }
+
+  /* return if it was trivially divisible */
+  if (res == MP_YES) {
+    return MP_OKAY;
+  }
+
+  /* now perform the miller-rabin rounds */
+  if ((err = mp_init (&b)) != MP_OKAY) {
+    return err;
+  }
+
+  for (ix = 0; ix < t; ix++) {
+    /* set the prime */
+    mp_set (&b, ltm_prime_tab[ix]);
+
+    if ((err = mp_prime_miller_rabin (a, &b, &res)) != MP_OKAY) {
+      goto LBL_B;
+    }
+
+    if (res == MP_NO) {
+      goto LBL_B;
+    }
+  }
+
+  /* passed the test */
+  *result = MP_YES;
+LBL_B:mp_clear (&b);
+  return err;
+}
+#endif
+
+/* End: bn_mp_prime_is_prime.c */
+
+/* Start: bn_mp_prime_miller_rabin.c */
+#include <tommath.h>
+#ifdef BN_MP_PRIME_MILLER_RABIN_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* Miller-Rabin test of "a" to the base of "b" as described in 
+ * HAC pp. 139 Algorithm 4.24
+ *
+ * Sets result to 0 if definitely composite or 1 if probably prime.
+ * Randomly the chance of error is no more than 1/4 and often 
+ * very much lower.
+ */
+int mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
+{
+  mp_int  n1, y, r;
+  int     s, j, err;
+
+  /* default */
+  *result = MP_NO;
+
+  /* ensure b > 1 */
+  if (mp_cmp_d(b, 1) != MP_GT) {
+     return MP_VAL;
+  }     
+
+  /* get n1 = a - 1 */
+  if ((err = mp_init_copy (&n1, a)) != MP_OKAY) {
+    return err;
+  }
+  if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) {
+    goto LBL_N1;
+  }
+
+  /* set 2**s * r = n1 */
+  if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
+    goto LBL_N1;
+  }
+
+  /* count the number of least significant bits
+   * which are zero
+   */
+  s = mp_cnt_lsb(&r);
+
+  /* now divide n - 1 by 2**s */
+  if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) {
+    goto LBL_R;
+  }
+
+  /* compute y = b**r mod a */
+  if ((err = mp_init (&y)) != MP_OKAY) {
+    goto LBL_R;
+  }
+  if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) {
+    goto LBL_Y;
+  }
+
+  /* if y != 1 and y != n1 do */
+  if (mp_cmp_d (&y, 1) != MP_EQ && mp_cmp (&y, &n1) != MP_EQ) {
+    j = 1;
+    /* while j <= s-1 and y != n1 */
+    while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) {
+      if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) {
+         goto LBL_Y;
+      }
+
+      /* if y == 1 then composite */
+      if (mp_cmp_d (&y, 1) == MP_EQ) {
+         goto LBL_Y;
+      }
+
+      ++j;
+    }
+
+    /* if y != n1 then composite */
+    if (mp_cmp (&y, &n1) != MP_EQ) {
+      goto LBL_Y;
+    }
+  }
+
+  /* probably prime now */
+  *result = MP_YES;
+LBL_Y:mp_clear (&y);
+LBL_R:mp_clear (&r);
+LBL_N1:mp_clear (&n1);
+  return err;
+}
+#endif
+
+/* End: bn_mp_prime_miller_rabin.c */
+
+/* Start: bn_mp_prime_next_prime.c */
+#include <tommath.h>
+#ifdef BN_MP_PRIME_NEXT_PRIME_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* finds the next prime after the number "a" using "t" trials
+ * of Miller-Rabin.
+ *
+ * bbs_style = 1 means the prime must be congruent to 3 mod 4
+ */
+int mp_prime_next_prime(mp_int *a, int t, int bbs_style)
+{
+   int      err, res, x, y;
+   mp_digit res_tab[PRIME_SIZE], step, kstep;
+   mp_int   b;
+
+   /* ensure t is valid */
+   if (t <= 0 || t > PRIME_SIZE) {
+      return MP_VAL;
+   }
+
+   /* force positive */
+   a->sign = MP_ZPOS;
+
+   /* simple algo if a is less than the largest prime in the table */
+   if (mp_cmp_d(a, ltm_prime_tab[PRIME_SIZE-1]) == MP_LT) {
+      /* find which prime it is bigger than */
+      for (x = PRIME_SIZE - 2; x >= 0; x--) {
+          if (mp_cmp_d(a, ltm_prime_tab[x]) != MP_LT) {
+             if (bbs_style == 1) {
+                /* ok we found a prime smaller or
+                 * equal [so the next is larger]
+                 *
+                 * however, the prime must be
+                 * congruent to 3 mod 4
+                 */
+                if ((ltm_prime_tab[x + 1] & 3) != 3) {
+                   /* scan upwards for a prime congruent to 3 mod 4 */
+                   for (y = x + 1; y < PRIME_SIZE; y++) {
+                       if ((ltm_prime_tab[y] & 3) == 3) {
+                          mp_set(a, ltm_prime_tab[y]);
+                          return MP_OKAY;
+                       }
+                   }
+                }
+             } else {
+                mp_set(a, ltm_prime_tab[x + 1]);
+                return MP_OKAY;
+             }
+          }
+      }
+      /* at this point a maybe 1 */
+      if (mp_cmp_d(a, 1) == MP_EQ) {
+         mp_set(a, 2);
+         return MP_OKAY;
+      }
+      /* fall through to the sieve */
+   }
+
+   /* generate a prime congruent to 3 mod 4 or 1/3 mod 4? */
+   if (bbs_style == 1) {
+      kstep   = 4;
+   } else {
+      kstep   = 2;
+   }
+
+   /* at this point we will use a combination of a sieve and Miller-Rabin */
+
+   if (bbs_style == 1) {
+      /* if a mod 4 != 3 subtract the correct value to make it so */
+      if ((a->dp[0] & 3) != 3) {
+         if ((err = mp_sub_d(a, (a->dp[0] & 3) + 1, a)) != MP_OKAY) { return err; };
+      }
+   } else {
+      if (mp_iseven(a) == 1) {
+         /* force odd */
+         if ((err = mp_sub_d(a, 1, a)) != MP_OKAY) {
+            return err;
+         }
+      }
+   }
+
+   /* generate the restable */
+   for (x = 1; x < PRIME_SIZE; x++) {
+      if ((err = mp_mod_d(a, ltm_prime_tab[x], res_tab + x)) != MP_OKAY) {
+         return err;
+      }
+   }
+
+   /* init temp used for Miller-Rabin Testing */
+   if ((err = mp_init(&b)) != MP_OKAY) {
+      return err;
+   }
+
+   for (;;) {
+      /* skip to the next non-trivially divisible candidate */
+      step = 0;
+      do {
+         /* y == 1 if any residue was zero [e.g. cannot be prime] */
+         y     =  0;
+
+         /* increase step to next candidate */
+         step += kstep;
+
+         /* compute the new residue without using division */
+         for (x = 1; x < PRIME_SIZE; x++) {
+             /* add the step to each residue */
+             res_tab[x] += kstep;
+
+             /* subtract the modulus [instead of using division] */
+             if (res_tab[x] >= ltm_prime_tab[x]) {
+                res_tab[x]  -= ltm_prime_tab[x];
+             }
+
+             /* set flag if zero */
+             if (res_tab[x] == 0) {
+                y = 1;
+             }
+         }
+      } while (y == 1 && step < ((((mp_digit)1)<<DIGIT_BIT) - kstep));
+
+      /* add the step */
+      if ((err = mp_add_d(a, step, a)) != MP_OKAY) {
+         goto LBL_ERR;
+      }
+
+      /* if didn't pass sieve and step == MAX then skip test */
+      if (y == 1 && step >= ((((mp_digit)1)<<DIGIT_BIT) - kstep)) {
+         continue;
+      }
+
+      /* is this prime? */
+      for (x = 0; x < t; x++) {
+          mp_set(&b, ltm_prime_tab[t]);
+          if ((err = mp_prime_miller_rabin(a, &b, &res)) != MP_OKAY) {
+             goto LBL_ERR;
+          }
+          if (res == MP_NO) {
+             break;
+          }
+      }
+
+      if (res == MP_YES) {
+         break;
+      }
+   }
+
+   err = MP_OKAY;
+LBL_ERR:
+   mp_clear(&b);
+   return err;
+}
+
+#endif
+
+/* End: bn_mp_prime_next_prime.c */
+
+/* Start: bn_mp_prime_rabin_miller_trials.c */
+#include <tommath.h>
+#ifdef BN_MP_PRIME_RABIN_MILLER_TRIALS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+
+static const struct {
+   int k, t;
+} sizes[] = {
+{   128,    28 },
+{   256,    16 },
+{   384,    10 },
+{   512,     7 },
+{   640,     6 },
+{   768,     5 },
+{   896,     4 },
+{  1024,     4 }
+};
+
+/* returns # of RM trials required for a given bit size */
+int mp_prime_rabin_miller_trials(int size)
+{
+   int x;
+
+   for (x = 0; x < (int)(sizeof(sizes)/(sizeof(sizes[0]))); x++) {
+       if (sizes[x].k == size) {
+          return sizes[x].t;
+       } else if (sizes[x].k > size) {
+          return (x == 0) ? sizes[0].t : sizes[x - 1].t;
+       }
+   }
+   return sizes[x-1].t + 1;
+}
+
+
+#endif
+
+/* End: bn_mp_prime_rabin_miller_trials.c */
+
+/* Start: bn_mp_prime_random_ex.c */
+#include <tommath.h>
+#ifdef BN_MP_PRIME_RANDOM_EX_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* makes a truly random prime of a given size (bits),
+ *
+ * Flags are as follows:
+ * 
+ *   LTM_PRIME_BBS      - make prime congruent to 3 mod 4
+ *   LTM_PRIME_SAFE     - make sure (p-1)/2 is prime as well (implies LTM_PRIME_BBS)
+ *   LTM_PRIME_2MSB_OFF - make the 2nd highest bit zero
+ *   LTM_PRIME_2MSB_ON  - make the 2nd highest bit one
+ *
+ * You have to supply a callback which fills in a buffer with random bytes.  "dat" is a parameter you can
+ * have passed to the callback (e.g. a state or something).  This function doesn't use "dat" itself
+ * so it can be NULL
+ *
+ */
+
+/* This is possibly the mother of all prime generation functions, muahahahahaha! */
+int mp_prime_random_ex(mp_int *a, int t, int size, int flags, ltm_prime_callback cb, void *dat)
+{
+   unsigned char *tmp, maskAND, maskOR_msb, maskOR_lsb;
+   int res, err, bsize, maskOR_msb_offset;
+
+   /* sanity check the input */
+   if (size <= 1 || t <= 0) {
+      return MP_VAL;
+   }
+
+   /* LTM_PRIME_SAFE implies LTM_PRIME_BBS */
+   if (flags & LTM_PRIME_SAFE) {
+      flags |= LTM_PRIME_BBS;
+   }
+
+   /* calc the byte size */
+   bsize = (size>>3) + ((size&7)?1:0);
+
+   /* we need a buffer of bsize bytes */
+   tmp = OPT_CAST(unsigned char) XMALLOC(bsize);
+   if (tmp == NULL) {
+      return MP_MEM;
+   }
+
+   /* calc the maskAND value for the MSbyte*/
+   maskAND = ((size&7) == 0) ? 0xFF : (0xFF >> (8 - (size & 7)));
+
+   /* calc the maskOR_msb */
+   maskOR_msb        = 0;
+   maskOR_msb_offset = ((size & 7) == 1) ? 1 : 0;
+   if (flags & LTM_PRIME_2MSB_ON) {
+      maskOR_msb     |= 1 << ((size - 2) & 7);
+   } else if (flags & LTM_PRIME_2MSB_OFF) {
+      maskAND        &= ~(1 << ((size - 2) & 7));
+   } 
+
+   /* get the maskOR_lsb */
+   maskOR_lsb         = 1;
+   if (flags & LTM_PRIME_BBS) {
+      maskOR_lsb     |= 3;
+   }
+
+   do {
+      /* read the bytes */
+      if (cb(tmp, bsize, dat) != bsize) {
+         err = MP_VAL;
+         goto error;
+      }
+ 
+      /* work over the MSbyte */
+      tmp[0]    &= maskAND;
+      tmp[0]    |= 1 << ((size - 1) & 7);
+
+      /* mix in the maskORs */
+      tmp[maskOR_msb_offset]   |= maskOR_msb;
+      tmp[bsize-1]             |= maskOR_lsb;
+
+      /* read it in */
+      if ((err = mp_read_unsigned_bin(a, tmp, bsize)) != MP_OKAY)     { goto error; }
+
+      /* is it prime? */
+      if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY)           { goto error; }
+      if (res == MP_NO) {  
+         continue;
+      }
+
+      if (flags & LTM_PRIME_SAFE) {
+         /* see if (a-1)/2 is prime */
+         if ((err = mp_sub_d(a, 1, a)) != MP_OKAY)                    { goto error; }
+         if ((err = mp_div_2(a, a)) != MP_OKAY)                       { goto error; }
+ 
+         /* is it prime? */
+         if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY)        { goto error; }
+      }
+   } while (res == MP_NO);
+
+   if (flags & LTM_PRIME_SAFE) {
+      /* restore a to the original value */
+      if ((err = mp_mul_2(a, a)) != MP_OKAY)                          { goto error; }
+      if ((err = mp_add_d(a, 1, a)) != MP_OKAY)                       { goto error; }
+   }
+
+   err = MP_OKAY;
+error:
+   XFREE(tmp);
+   return err;
+}
+
+
+#endif
+
+/* End: bn_mp_prime_random_ex.c */
+
+/* Start: bn_mp_radix_size.c */
+#include <tommath.h>
+#ifdef BN_MP_RADIX_SIZE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* returns size of ASCII reprensentation */
+int mp_radix_size (mp_int * a, int radix, int *size)
+{
+  int     res, digs;
+  mp_int  t;
+  mp_digit d;
+
+  *size = 0;
+
+  /* special case for binary */
+  if (radix == 2) {
+    *size = mp_count_bits (a) + (a->sign == MP_NEG ? 1 : 0) + 1;
+    return MP_OKAY;
+  }
+
+  /* make sure the radix is in range */
+  if (radix < 2 || radix > 64) {
+    return MP_VAL;
+  }
+
+  if (mp_iszero(a) == MP_YES) {
+     *size = 2;
+    return MP_OKAY;
+  }
+
+  /* digs is the digit count */
+  digs = 0;
+
+  /* if it's negative add one for the sign */
+  if (a->sign == MP_NEG) {
+    ++digs;
+  }
+
+  /* init a copy of the input */
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* force temp to positive */
+  t.sign = MP_ZPOS; 
+
+  /* fetch out all of the digits */
+  while (mp_iszero (&t) == MP_NO) {
+    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
+      mp_clear (&t);
+      return res;
+    }
+    ++digs;
+  }
+  mp_clear (&t);
+
+  /* return digs + 1, the 1 is for the NULL byte that would be required. */
+  *size = digs + 1;
+  return MP_OKAY;
+}
+
+#endif
+
+/* End: bn_mp_radix_size.c */
+
+/* Start: bn_mp_radix_smap.c */
+#include <tommath.h>
+#ifdef BN_MP_RADIX_SMAP_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* chars used in radix conversions */
+const char *mp_s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
+#endif
+
+/* End: bn_mp_radix_smap.c */
+
+/* Start: bn_mp_rand.c */
+#include <tommath.h>
+#ifdef BN_MP_RAND_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* makes a pseudo-random int of a given size */
+int
+mp_rand (mp_int * a, int digits)
+{
+  int     res;
+  mp_digit d;
+
+  mp_zero (a);
+  if (digits <= 0) {
+    return MP_OKAY;
+  }
+
+  /* first place a random non-zero digit */
+  do {
+    d = ((mp_digit) abs (rand ())) & MP_MASK;
+  } while (d == 0);
+
+  if ((res = mp_add_d (a, d, a)) != MP_OKAY) {
+    return res;
+  }
+
+  while (--digits > 0) {
+    if ((res = mp_lshd (a, 1)) != MP_OKAY) {
+      return res;
+    }
+
+    if ((res = mp_add_d (a, ((mp_digit) abs (rand ())), a)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_rand.c */
+
+/* Start: bn_mp_read_radix.c */
+#include <tommath.h>
+#ifdef BN_MP_READ_RADIX_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* read a string [ASCII] in a given radix */
+int mp_read_radix (mp_int * a, const char *str, int radix)
+{
+  int     y, res, neg;
+  char    ch;
+
+  /* make sure the radix is ok */
+  if (radix < 2 || radix > 64) {
+    return MP_VAL;
+  }
+
+  /* if the leading digit is a 
+   * minus set the sign to negative. 
+   */
+  if (*str == '-') {
+    ++str;
+    neg = MP_NEG;
+  } else {
+    neg = MP_ZPOS;
+  }
+
+  /* set the integer to the default of zero */
+  mp_zero (a);
+  
+  /* process each digit of the string */
+  while (*str) {
+    /* if the radix < 36 the conversion is case insensitive
+     * this allows numbers like 1AB and 1ab to represent the same  value
+     * [e.g. in hex]
+     */
+    ch = (char) ((radix < 36) ? toupper (*str) : *str);
+    for (y = 0; y < 64; y++) {
+      if (ch == mp_s_rmap[y]) {
+         break;
+      }
+    }
+
+    /* if the char was found in the map 
+     * and is less than the given radix add it
+     * to the number, otherwise exit the loop. 
+     */
+    if (y < radix) {
+      if ((res = mp_mul_d (a, (mp_digit) radix, a)) != MP_OKAY) {
+         return res;
+      }
+      if ((res = mp_add_d (a, (mp_digit) y, a)) != MP_OKAY) {
+         return res;
+      }
+    } else {
+      break;
+    }
+    ++str;
+  }
+  
+  /* set the sign only if a != 0 */
+  if (mp_iszero(a) != 1) {
+     a->sign = neg;
+  }
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_read_radix.c */
+
+/* Start: bn_mp_read_signed_bin.c */
+#include <tommath.h>
+#ifdef BN_MP_READ_SIGNED_BIN_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* read signed bin, big endian, first byte is 0==positive or 1==negative */
+int
+mp_read_signed_bin (mp_int * a, unsigned char *b, int c)
+{
+  int     res;
+
+  /* read magnitude */
+  if ((res = mp_read_unsigned_bin (a, b + 1, c - 1)) != MP_OKAY) {
+    return res;
+  }
+
+  /* first byte is 0 for positive, non-zero for negative */
+  if (b[0] == 0) {
+     a->sign = MP_ZPOS;
+  } else {
+     a->sign = MP_NEG;
+  }
+
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_read_signed_bin.c */
+
+/* Start: bn_mp_read_unsigned_bin.c */
+#include <tommath.h>
+#ifdef BN_MP_READ_UNSIGNED_BIN_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* reads a unsigned char array, assumes the msb is stored first [big endian] */
+int
+mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c)
+{
+  int     res;
+
+  /* make sure there are at least two digits */
+  if (a->alloc < 2) {
+     if ((res = mp_grow(a, 2)) != MP_OKAY) {
+        return res;
+     }
+  }
+
+  /* zero the int */
+  mp_zero (a);
+
+  /* read the bytes in */
+  while (c-- > 0) {
+    if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
+      return res;
+    }
+
+#ifndef MP_8BIT
+      a->dp[0] |= *b++;
+      a->used += 1;
+#else
+      a->dp[0] = (*b & MP_MASK);
+      a->dp[1] |= ((*b++ >> 7U) & 1);
+      a->used += 2;
+#endif
+  }
+  mp_clamp (a);
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_read_unsigned_bin.c */
+
+/* Start: bn_mp_reduce.c */
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* reduces x mod m, assumes 0 < x < m**2, mu is 
+ * precomputed via mp_reduce_setup.
+ * From HAC pp.604 Algorithm 14.42
+ */
+int mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
+{
+  mp_int  q;
+  int     res, um = m->used;
+
+  /* q = x */
+  if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
+    return res;
+  }
+
+  /* q1 = x / b**(k-1)  */
+  mp_rshd (&q, um - 1);         
+
+  /* according to HAC this optimization is ok */
+  if (((unsigned long) um) > (((mp_digit)1) << (DIGIT_BIT - 1))) {
+    if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) {
+      goto CLEANUP;
+    }
+  } else {
+#ifdef BN_S_MP_MUL_HIGH_DIGS_C
+    if ((res = s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
+      goto CLEANUP;
+    }
+#elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
+    if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
+      goto CLEANUP;
+    }
+#else 
+    { 
+      res = MP_VAL;
+      goto CLEANUP;
+    }
+#endif
+  }
+
+  /* q3 = q2 / b**(k+1) */
+  mp_rshd (&q, um + 1);         
+
+  /* x = x mod b**(k+1), quick (no division) */
+  if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
+    goto CLEANUP;
+  }
+
+  /* q = q * m mod b**(k+1), quick (no division) */
+  if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) {
+    goto CLEANUP;
+  }
+
+  /* x = x - q */
+  if ((res = mp_sub (x, &q, x)) != MP_OKAY) {
+    goto CLEANUP;
+  }
+
+  /* If x < 0, add b**(k+1) to it */
+  if (mp_cmp_d (x, 0) == MP_LT) {
+    mp_set (&q, 1);
+    if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
+      goto CLEANUP;
+    if ((res = mp_add (x, &q, x)) != MP_OKAY)
+      goto CLEANUP;
+  }
+
+  /* Back off if it's too big */
+  while (mp_cmp (x, m) != MP_LT) {
+    if ((res = s_mp_sub (x, m, x)) != MP_OKAY) {
+      goto CLEANUP;
+    }
+  }
+  
+CLEANUP:
+  mp_clear (&q);
+
+  return res;
+}
+#endif
+
+/* End: bn_mp_reduce.c */
+
+/* Start: bn_mp_reduce_2k.c */
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_2K_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* reduces a modulo n where n is of the form 2**p - d */
+int mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
+{
+   mp_int q;
+   int    p, res;
+   
+   if ((res = mp_init(&q)) != MP_OKAY) {
+      return res;
+   }
+   
+   p = mp_count_bits(n);    
+top:
+   /* q = a/2**p, a = a mod 2**p */
+   if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
+      goto ERR;
+   }
+   
+   if (d != 1) {
+      /* q = q * d */
+      if ((res = mp_mul_d(&q, d, &q)) != MP_OKAY) { 
+         goto ERR;
+      }
+   }
+   
+   /* a = a + q */
+   if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
+      goto ERR;
+   }
+   
+   if (mp_cmp_mag(a, n) != MP_LT) {
+      s_mp_sub(a, n, a);
+      goto top;
+   }
+   
+ERR:
+   mp_clear(&q);
+   return res;
+}
+
+#endif
+
+/* End: bn_mp_reduce_2k.c */
+
+/* Start: bn_mp_reduce_2k_l.c */
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_2K_L_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* reduces a modulo n where n is of the form 2**p - d 
+   This differs from reduce_2k since "d" can be larger
+   than a single digit.
+*/
+int mp_reduce_2k_l(mp_int *a, mp_int *n, mp_int *d)
+{
+   mp_int q;
+   int    p, res;
+   
+   if ((res = mp_init(&q)) != MP_OKAY) {
+      return res;
+   }
+   
+   p = mp_count_bits(n);    
+top:
+   /* q = a/2**p, a = a mod 2**p */
+   if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
+      goto ERR;
+   }
+   
+   /* q = q * d */
+   if ((res = mp_mul(&q, d, &q)) != MP_OKAY) { 
+      goto ERR;
+   }
+   
+   /* a = a + q */
+   if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
+      goto ERR;
+   }
+   
+   if (mp_cmp_mag(a, n) != MP_LT) {
+      s_mp_sub(a, n, a);
+      goto top;
+   }
+   
+ERR:
+   mp_clear(&q);
+   return res;
+}
+
+#endif
+
+/* End: bn_mp_reduce_2k_l.c */
+
+/* Start: bn_mp_reduce_2k_setup.c */
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_2K_SETUP_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* determines the setup value */
+int mp_reduce_2k_setup(mp_int *a, mp_digit *d)
+{
+   int res, p;
+   mp_int tmp;
+   
+   if ((res = mp_init(&tmp)) != MP_OKAY) {
+      return res;
+   }
+   
+   p = mp_count_bits(a);
+   if ((res = mp_2expt(&tmp, p)) != MP_OKAY) {
+      mp_clear(&tmp);
+      return res;
+   }
+   
+   if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) {
+      mp_clear(&tmp);
+      return res;
+   }
+   
+   *d = tmp.dp[0];
+   mp_clear(&tmp);
+   return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_reduce_2k_setup.c */
+
+/* Start: bn_mp_reduce_2k_setup_l.c */
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_2K_SETUP_L_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* determines the setup value */
+int mp_reduce_2k_setup_l(mp_int *a, mp_int *d)
+{
+   int    res;
+   mp_int tmp;
+   
+   if ((res = mp_init(&tmp)) != MP_OKAY) {
+      return res;
+   }
+   
+   if ((res = mp_2expt(&tmp, mp_count_bits(a))) != MP_OKAY) {
+      goto ERR;
+   }
+   
+   if ((res = s_mp_sub(&tmp, a, d)) != MP_OKAY) {
+      goto ERR;
+   }
+   
+ERR:
+   mp_clear(&tmp);
+   return res;
+}
+#endif
+
+/* End: bn_mp_reduce_2k_setup_l.c */
+
+/* Start: bn_mp_reduce_is_2k.c */
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_IS_2K_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* determines if mp_reduce_2k can be used */
+int mp_reduce_is_2k(mp_int *a)
+{
+   int ix, iy, iw;
+   mp_digit iz;
+   
+   if (a->used == 0) {
+      return MP_NO;
+   } else if (a->used == 1) {
+      return MP_YES;
+   } else if (a->used > 1) {
+      iy = mp_count_bits(a);
+      iz = 1;
+      iw = 1;
+    
+      /* Test every bit from the second digit up, must be 1 */
+      for (ix = DIGIT_BIT; ix < iy; ix++) {
+          if ((a->dp[iw] & iz) == 0) {
+             return MP_NO;
+          }
+          iz <<= 1;
+          if (iz > (mp_digit)MP_MASK) {
+             ++iw;
+             iz = 1;
+          }
+      }
+   }
+   return MP_YES;
+}
+
+#endif
+
+/* End: bn_mp_reduce_is_2k.c */
+
+/* Start: bn_mp_reduce_is_2k_l.c */
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_IS_2K_L_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* determines if reduce_2k_l can be used */
+int mp_reduce_is_2k_l(mp_int *a)
+{
+   int ix, iy;
+   
+   if (a->used == 0) {
+      return MP_NO;
+   } else if (a->used == 1) {
+      return MP_YES;
+   } else if (a->used > 1) {
+      /* if more than half of the digits are -1 we're sold */
+      for (iy = ix = 0; ix < a->used; ix++) {
+          if (a->dp[ix] == MP_MASK) {
+              ++iy;
+          }
+      }
+      return (iy >= (a->used/2)) ? MP_YES : MP_NO;
+      
+   }
+   return MP_NO;
+}
+
+#endif
+
+/* End: bn_mp_reduce_is_2k_l.c */
+
+/* Start: bn_mp_reduce_setup.c */
+#include <tommath.h>
+#ifdef BN_MP_REDUCE_SETUP_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* pre-calculate the value required for Barrett reduction
+ * For a given modulus "b" it calulates the value required in "a"
+ */
+int mp_reduce_setup (mp_int * a, mp_int * b)
+{
+  int     res;
+  
+  if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
+    return res;
+  }
+  return mp_div (a, b, a, NULL);
+}
+#endif
+
+/* End: bn_mp_reduce_setup.c */
+
+/* Start: bn_mp_rshd.c */
+#include <tommath.h>
+#ifdef BN_MP_RSHD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* shift right a certain amount of digits */
+void mp_rshd (mp_int * a, int b)
+{
+  int     x;
+
+  /* if b <= 0 then ignore it */
+  if (b <= 0) {
+    return;
+  }
+
+  /* if b > used then simply zero it and return */
+  if (a->used <= b) {
+    mp_zero (a);
+    return;
+  }
+
+  {
+    register mp_digit *bottom, *top;
+
+    /* shift the digits down */
+
+    /* bottom */
+    bottom = a->dp;
+
+    /* top [offset into digits] */
+    top = a->dp + b;
+
+    /* this is implemented as a sliding window where 
+     * the window is b-digits long and digits from 
+     * the top of the window are copied to the bottom
+     *
+     * e.g.
+
+     b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
+                 /\                   |      ---->
+                  \-------------------/      ---->
+     */
+    for (x = 0; x < (a->used - b); x++) {
+      *bottom++ = *top++;
+    }
+
+    /* zero the top digits */
+    for (; x < a->used; x++) {
+      *bottom++ = 0;
+    }
+  }
+  
+  /* remove excess digits */
+  a->used -= b;
+}
+#endif
+
+/* End: bn_mp_rshd.c */
+
+/* Start: bn_mp_set.c */
+#include <tommath.h>
+#ifdef BN_MP_SET_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* set to a digit */
+void mp_set (mp_int * a, mp_digit b)
+{
+  mp_zero (a);
+  a->dp[0] = b & MP_MASK;
+  a->used  = (a->dp[0] != 0) ? 1 : 0;
+}
+#endif
+
+/* End: bn_mp_set.c */
+
+/* Start: bn_mp_set_int.c */
+#include <tommath.h>
+#ifdef BN_MP_SET_INT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* set a 32-bit const */
+int mp_set_int (mp_int * a, unsigned long b)
+{
+  int     x, res;
+
+  mp_zero (a);
+  
+  /* set four bits at a time */
+  for (x = 0; x < 8; x++) {
+    /* shift the number up four bits */
+    if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) {
+      return res;
+    }
+
+    /* OR in the top four bits of the source */
+    a->dp[0] |= (b >> 28) & 15;
+
+    /* shift the source up to the next four bits */
+    b <<= 4;
+
+    /* ensure that digits are not clamped off */
+    a->used += 1;
+  }
+  mp_clamp (a);
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_set_int.c */
+
+/* Start: bn_mp_shrink.c */
+#include <tommath.h>
+#ifdef BN_MP_SHRINK_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* shrink a bignum */
+int mp_shrink (mp_int * a)
+{
+  mp_digit *tmp;
+  if (a->alloc != a->used && a->used > 0) {
+    if ((tmp = OPT_CAST(mp_digit) XREALLOC (a->dp, sizeof (mp_digit) * a->used)) == NULL) {
+      return MP_MEM;
+    }
+    a->dp    = tmp;
+    a->alloc = a->used;
+  }
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_shrink.c */
+
+/* Start: bn_mp_signed_bin_size.c */
+#include <tommath.h>
+#ifdef BN_MP_SIGNED_BIN_SIZE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* get the size for an signed equivalent */
+int mp_signed_bin_size (mp_int * a)
+{
+  return 1 + mp_unsigned_bin_size (a);
+}
+#endif
+
+/* End: bn_mp_signed_bin_size.c */
+
+/* Start: bn_mp_sqr.c */
+#include <tommath.h>
+#ifdef BN_MP_SQR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* computes b = a*a */
+int
+mp_sqr (mp_int * a, mp_int * b)
+{
+  int     res;
+
+#ifdef BN_MP_TOOM_SQR_C
+  /* use Toom-Cook? */
+  if (a->used >= TOOM_SQR_CUTOFF) {
+    res = mp_toom_sqr(a, b);
+  /* Karatsuba? */
+  } else 
+#endif
+#ifdef BN_MP_KARATSUBA_SQR_C
+if (a->used >= KARATSUBA_SQR_CUTOFF) {
+    res = mp_karatsuba_sqr (a, b);
+  } else 
+#endif
+  {
+#ifdef BN_FAST_S_MP_SQR_C
+    /* can we use the fast comba multiplier? */
+    if ((a->used * 2 + 1) < MP_WARRAY && 
+         a->used < 
+         (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
+      res = fast_s_mp_sqr (a, b);
+    } else
+#endif
+#ifdef BN_S_MP_SQR_C
+      res = s_mp_sqr (a, b);
+#else
+      res = MP_VAL;
+#endif
+  }
+  b->sign = MP_ZPOS;
+  return res;
+}
+#endif
+
+/* End: bn_mp_sqr.c */
+
+/* Start: bn_mp_sqrmod.c */
+#include <tommath.h>
+#ifdef BN_MP_SQRMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* c = a * a (mod b) */
+int
+mp_sqrmod (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     res;
+  mp_int  t;
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_sqr (a, &t)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+  res = mp_mod (&t, b, c);
+  mp_clear (&t);
+  return res;
+}
+#endif
+
+/* End: bn_mp_sqrmod.c */
+
+/* Start: bn_mp_sqrt.c */
+#include <tommath.h>
+#ifdef BN_MP_SQRT_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* this function is less generic than mp_n_root, simpler and faster */
+int mp_sqrt(mp_int *arg, mp_int *ret) 
+{
+  int res;
+  mp_int t1,t2;
+
+  /* must be positive */
+  if (arg->sign == MP_NEG) {
+    return MP_VAL;
+  }
+
+  /* easy out */
+  if (mp_iszero(arg) == MP_YES) {
+    mp_zero(ret);
+    return MP_OKAY;
+  }
+
+  if ((res = mp_init_copy(&t1, arg)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init(&t2)) != MP_OKAY) {
+    goto E2;
+  }
+
+  /* First approx. (not very bad for large arg) */
+  mp_rshd (&t1,t1.used/2);
+
+  /* t1 > 0  */ 
+  if ((res = mp_div(arg,&t1,&t2,NULL)) != MP_OKAY) {
+    goto E1;
+  }
+  if ((res = mp_add(&t1,&t2,&t1)) != MP_OKAY) {
+    goto E1;
+  }
+  if ((res = mp_div_2(&t1,&t1)) != MP_OKAY) {
+    goto E1;
+  }
+  /* And now t1 > sqrt(arg) */
+  do { 
+    if ((res = mp_div(arg,&t1,&t2,NULL)) != MP_OKAY) {
+      goto E1;
+    }
+    if ((res = mp_add(&t1,&t2,&t1)) != MP_OKAY) {
+      goto E1;
+    }
+    if ((res = mp_div_2(&t1,&t1)) != MP_OKAY) {
+      goto E1;
+    }
+    /* t1 >= sqrt(arg) >= t2 at this point */
+  } while (mp_cmp_mag(&t1,&t2) == MP_GT);
+
+  mp_exch(&t1,ret);
+
+E1: mp_clear(&t2);
+E2: mp_clear(&t1);
+  return res;
+}
+
+#endif
+
+/* End: bn_mp_sqrt.c */
+
+/* Start: bn_mp_sub.c */
+#include <tommath.h>
+#ifdef BN_MP_SUB_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* high level subtraction (handles signs) */
+int
+mp_sub (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     sa, sb, res;
+
+  sa = a->sign;
+  sb = b->sign;
+
+  if (sa != sb) {
+    /* subtract a negative from a positive, OR */
+    /* subtract a positive from a negative. */
+    /* In either case, ADD their magnitudes, */
+    /* and use the sign of the first number. */
+    c->sign = sa;
+    res = s_mp_add (a, b, c);
+  } else {
+    /* subtract a positive from a positive, OR */
+    /* subtract a negative from a negative. */
+    /* First, take the difference between their */
+    /* magnitudes, then... */
+    if (mp_cmp_mag (a, b) != MP_LT) {
+      /* Copy the sign from the first */
+      c->sign = sa;
+      /* The first has a larger or equal magnitude */
+      res = s_mp_sub (a, b, c);
+    } else {
+      /* The result has the *opposite* sign from */
+      /* the first number. */
+      c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+      /* The second has a larger magnitude */
+      res = s_mp_sub (b, a, c);
+    }
+  }
+  return res;
+}
+
+#endif
+
+/* End: bn_mp_sub.c */
+
+/* Start: bn_mp_sub_d.c */
+#include <tommath.h>
+#ifdef BN_MP_SUB_D_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* single digit subtraction */
+int
+mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
+{
+  mp_digit *tmpa, *tmpc, mu;
+  int       res, ix, oldused;
+
+  /* grow c as required */
+  if (c->alloc < a->used + 1) {
+     if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
+        return res;
+     }
+  }
+
+  /* if a is negative just do an unsigned
+   * addition [with fudged signs]
+   */
+  if (a->sign == MP_NEG) {
+     a->sign = MP_ZPOS;
+     res     = mp_add_d(a, b, c);
+     a->sign = c->sign = MP_NEG;
+     return res;
+  }
+
+  /* setup regs */
+  oldused = c->used;
+  tmpa    = a->dp;
+  tmpc    = c->dp;
+
+  /* if a <= b simply fix the single digit */
+  if ((a->used == 1 && a->dp[0] <= b) || a->used == 0) {
+     if (a->used == 1) {
+        *tmpc++ = b - *tmpa;
+     } else {
+        *tmpc++ = b;
+     }
+     ix      = 1;
+
+     /* negative/1digit */
+     c->sign = MP_NEG;
+     c->used = 1;
+  } else {
+     /* positive/size */
+     c->sign = MP_ZPOS;
+     c->used = a->used;
+
+     /* subtract first digit */
+     *tmpc    = *tmpa++ - b;
+     mu       = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
+     *tmpc++ &= MP_MASK;
+
+     /* handle rest of the digits */
+     for (ix = 1; ix < a->used; ix++) {
+        *tmpc    = *tmpa++ - mu;
+        mu       = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
+        *tmpc++ &= MP_MASK;
+     }
+  }
+
+  /* zero excess digits */
+  while (ix++ < oldused) {
+     *tmpc++ = 0;
+  }
+  mp_clamp(c);
+  return MP_OKAY;
+}
+
+#endif
+
+/* End: bn_mp_sub_d.c */
+
+/* Start: bn_mp_submod.c */
+#include <tommath.h>
+#ifdef BN_MP_SUBMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* d = a - b (mod c) */
+int
+mp_submod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+  int     res;
+  mp_int  t;
+
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_sub (a, b, &t)) != MP_OKAY) {
+    mp_clear (&t);
+    return res;
+  }
+  res = mp_mod (&t, c, d);
+  mp_clear (&t);
+  return res;
+}
+#endif
+
+/* End: bn_mp_submod.c */
+
+/* Start: bn_mp_to_signed_bin.c */
+#include <tommath.h>
+#ifdef BN_MP_TO_SIGNED_BIN_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* store in signed [big endian] format */
+int mp_to_signed_bin (mp_int * a, unsigned char *b)
+{
+  int     res;
+
+  if ((res = mp_to_unsigned_bin (a, b + 1)) != MP_OKAY) {
+    return res;
+  }
+  b[0] = (unsigned char) ((a->sign == MP_ZPOS) ? 0 : 1);
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_to_signed_bin.c */
+
+/* Start: bn_mp_to_signed_bin_n.c */
+#include <tommath.h>
+#ifdef BN_MP_TO_SIGNED_BIN_N_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* store in signed [big endian] format */
+int mp_to_signed_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen)
+{
+   if (*outlen < (unsigned long)mp_signed_bin_size(a)) {
+      return MP_VAL;
+   }
+   *outlen = mp_signed_bin_size(a);
+   return mp_to_signed_bin(a, b);
+}
+#endif
+
+/* End: bn_mp_to_signed_bin_n.c */
+
+/* Start: bn_mp_to_unsigned_bin.c */
+#include <tommath.h>
+#ifdef BN_MP_TO_UNSIGNED_BIN_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* store in unsigned [big endian] format */
+int mp_to_unsigned_bin (mp_int * a, unsigned char *b)
+{
+  int     x, res;
+  mp_int  t;
+
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  x = 0;
+  while (mp_iszero (&t) == 0) {
+#ifndef MP_8BIT
+      b[x++] = (unsigned char) (t.dp[0] & 255);
+#else
+      b[x++] = (unsigned char) (t.dp[0] | ((t.dp[1] & 0x01) << 7));
+#endif
+    if ((res = mp_div_2d (&t, 8, &t, NULL)) != MP_OKAY) {
+      mp_clear (&t);
+      return res;
+    }
+  }
+  bn_reverse (b, x);
+  mp_clear (&t);
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_to_unsigned_bin.c */
+
+/* Start: bn_mp_to_unsigned_bin_n.c */
+#include <tommath.h>
+#ifdef BN_MP_TO_UNSIGNED_BIN_N_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* store in unsigned [big endian] format */
+int mp_to_unsigned_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen)
+{
+   if (*outlen < (unsigned long)mp_unsigned_bin_size(a)) {
+      return MP_VAL;
+   }
+   *outlen = mp_unsigned_bin_size(a);
+   return mp_to_unsigned_bin(a, b);
+}
+#endif
+
+/* End: bn_mp_to_unsigned_bin_n.c */
+
+/* Start: bn_mp_toom_mul.c */
+#include <tommath.h>
+#ifdef BN_MP_TOOM_MUL_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* multiplication using the Toom-Cook 3-way algorithm 
+ *
+ * Much more complicated than Karatsuba but has a lower 
+ * asymptotic running time of O(N**1.464).  This algorithm is 
+ * only particularly useful on VERY large inputs 
+ * (we're talking 1000s of digits here...).
+*/
+int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
+{
+    mp_int w0, w1, w2, w3, w4, tmp1, tmp2, a0, a1, a2, b0, b1, b2;
+    int res, B;
+        
+    /* init temps */
+    if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, 
+                             &a0, &a1, &a2, &b0, &b1, 
+                             &b2, &tmp1, &tmp2, NULL)) != MP_OKAY) {
+       return res;
+    }
+    
+    /* B */
+    B = MIN(a->used, b->used) / 3;
+    
+    /* a = a2 * B**2 + a1 * B + a0 */
+    if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    if ((res = mp_copy(a, &a1)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&a1, B);
+    mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
+
+    if ((res = mp_copy(a, &a2)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&a2, B*2);
+    
+    /* b = b2 * B**2 + b1 * B + b0 */
+    if ((res = mp_mod_2d(b, DIGIT_BIT * B, &b0)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    if ((res = mp_copy(b, &b1)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&b1, B);
+    mp_mod_2d(&b1, DIGIT_BIT * B, &b1);
+
+    if ((res = mp_copy(b, &b2)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&b2, B*2);
+    
+    /* w0 = a0*b0 */
+    if ((res = mp_mul(&a0, &b0, &w0)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    /* w4 = a2 * b2 */
+    if ((res = mp_mul(&a2, &b2, &w4)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    /* w1 = (a2 + 2(a1 + 2a0))(b2 + 2(b1 + 2b0)) */
+    if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    if ((res = mp_mul_2(&b0, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp2, &b2, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    if ((res = mp_mul(&tmp1, &tmp2, &w1)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    /* w3 = (a0 + 2(a1 + 2a2))(b0 + 2(b1 + 2b2)) */
+    if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    if ((res = mp_mul_2(&b2, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    if ((res = mp_mul(&tmp1, &tmp2, &w3)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+
+    /* w2 = (a2 + a1 + a0)(b2 + b1 + b0) */
+    if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&b2, &b1, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul(&tmp1, &tmp2, &w2)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    /* now solve the matrix 
+    
+       0  0  0  0  1
+       1  2  4  8  16
+       1  1  1  1  1
+       16 8  4  2  1
+       1  0  0  0  0
+       
+       using 12 subtractions, 4 shifts, 
+              2 small divisions and 1 small multiplication 
+     */
+     
+     /* r1 - r4 */
+     if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r0 */
+     if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1/2 */
+     if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3/2 */
+     if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r2 - r0 - r4 */
+     if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - r2 */
+     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r2 */
+     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - 8r0 */
+     if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - 8r4 */
+     if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* 3r2 - r1 - r3 */
+     if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - r2 */
+     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r2 */
+     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1/3 */
+     if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3/3 */
+     if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) {
+        goto ERR;
+     }
+     
+     /* at this point shift W[n] by B*n */
+     if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) {
+        goto ERR;
+     }     
+     
+     if ((res = mp_add(&w0, &w1, c)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&tmp1, c, c)) != MP_OKAY) {
+        goto ERR;
+     }     
+     
+ERR:
+     mp_clear_multi(&w0, &w1, &w2, &w3, &w4, 
+                    &a0, &a1, &a2, &b0, &b1, 
+                    &b2, &tmp1, &tmp2, NULL);
+     return res;
+}     
+     
+#endif
+
+/* End: bn_mp_toom_mul.c */
+
+/* Start: bn_mp_toom_sqr.c */
+#include <tommath.h>
+#ifdef BN_MP_TOOM_SQR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* squaring using Toom-Cook 3-way algorithm */
+int
+mp_toom_sqr(mp_int *a, mp_int *b)
+{
+    mp_int w0, w1, w2, w3, w4, tmp1, a0, a1, a2;
+    int res, B;
+
+    /* init temps */
+    if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &tmp1, NULL)) != MP_OKAY) {
+       return res;
+    }
+
+    /* B */
+    B = a->used / 3;
+
+    /* a = a2 * B**2 + a1 * B + a0 */
+    if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    if ((res = mp_copy(a, &a1)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&a1, B);
+    mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
+
+    if ((res = mp_copy(a, &a2)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&a2, B*2);
+
+    /* w0 = a0*a0 */
+    if ((res = mp_sqr(&a0, &w0)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    /* w4 = a2 * a2 */
+    if ((res = mp_sqr(&a2, &w4)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    /* w1 = (a2 + 2(a1 + 2a0))**2 */
+    if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    if ((res = mp_sqr(&tmp1, &w1)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    /* w3 = (a0 + 2(a1 + 2a2))**2 */
+    if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    if ((res = mp_sqr(&tmp1, &w3)) != MP_OKAY) {
+       goto ERR;
+    }
+
+
+    /* w2 = (a2 + a1 + a0)**2 */
+    if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_sqr(&tmp1, &w2)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    /* now solve the matrix
+
+       0  0  0  0  1
+       1  2  4  8  16
+       1  1  1  1  1
+       16 8  4  2  1
+       1  0  0  0  0
+
+       using 12 subtractions, 4 shifts, 2 small divisions and 1 small multiplication.
+     */
+
+     /* r1 - r4 */
+     if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r0 */
+     if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1/2 */
+     if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3/2 */
+     if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r2 - r0 - r4 */
+     if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - r2 */
+     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r2 */
+     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - 8r0 */
+     if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - 8r4 */
+     if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* 3r2 - r1 - r3 */
+     if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - r2 */
+     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r2 */
+     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1/3 */
+     if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3/3 */
+     if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) {
+        goto ERR;
+     }
+
+     /* at this point shift W[n] by B*n */
+     if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) {
+        goto ERR;
+     }
+
+     if ((res = mp_add(&w0, &w1, b)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&tmp1, b, b)) != MP_OKAY) {
+        goto ERR;
+     }
+
+ERR:
+     mp_clear_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &tmp1, NULL);
+     return res;
+}
+
+#endif
+
+/* End: bn_mp_toom_sqr.c */
+
+/* Start: bn_mp_toradix.c */
+#include <tommath.h>
+#ifdef BN_MP_TORADIX_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* stores a bignum as a ASCII string in a given radix (2..64) */
+int mp_toradix (mp_int * a, char *str, int radix)
+{
+  int     res, digs;
+  mp_int  t;
+  mp_digit d;
+  char   *_s = str;
+
+  /* check range of the radix */
+  if (radix < 2 || radix > 64) {
+    return MP_VAL;
+  }
+
+  /* quick out if its zero */
+  if (mp_iszero(a) == 1) {
+     *str++ = '0';
+     *str = '\0';
+     return MP_OKAY;
+  }
+
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* if it is negative output a - */
+  if (t.sign == MP_NEG) {
+    ++_s;
+    *str++ = '-';
+    t.sign = MP_ZPOS;
+  }
+
+  digs = 0;
+  while (mp_iszero (&t) == 0) {
+    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
+      mp_clear (&t);
+      return res;
+    }
+    *str++ = mp_s_rmap[d];
+    ++digs;
+  }
+
+  /* reverse the digits of the string.  In this case _s points
+   * to the first digit [exluding the sign] of the number]
+   */
+  bn_reverse ((unsigned char *)_s, digs);
+
+  /* append a NULL so the string is properly terminated */
+  *str = '\0';
+
+  mp_clear (&t);
+  return MP_OKAY;
+}
+
+#endif
+
+/* End: bn_mp_toradix.c */
+
+/* Start: bn_mp_toradix_n.c */
+#include <tommath.h>
+#ifdef BN_MP_TORADIX_N_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* stores a bignum as a ASCII string in a given radix (2..64) 
+ *
+ * Stores upto maxlen-1 chars and always a NULL byte 
+ */
+int mp_toradix_n(mp_int * a, char *str, int radix, int maxlen)
+{
+  int     res, digs;
+  mp_int  t;
+  mp_digit d;
+  char   *_s = str;
+
+  /* check range of the maxlen, radix */
+  if (maxlen < 3 || radix < 2 || radix > 64) {
+    return MP_VAL;
+  }
+
+  /* quick out if its zero */
+  if (mp_iszero(a) == 1) {
+     *str++ = '0';
+     *str = '\0';
+     return MP_OKAY;
+  }
+
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* if it is negative output a - */
+  if (t.sign == MP_NEG) {
+    /* we have to reverse our digits later... but not the - sign!! */
+    ++_s;
+
+    /* store the flag and mark the number as positive */
+    *str++ = '-';
+    t.sign = MP_ZPOS;
+ 
+    /* subtract a char */
+    --maxlen;
+  }
+
+  digs = 0;
+  while (mp_iszero (&t) == 0) {
+    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
+      mp_clear (&t);
+      return res;
+    }
+    *str++ = mp_s_rmap[d];
+    ++digs;
+
+    if (--maxlen == 1) {
+       /* no more room */
+       break;
+    }
+  }
+
+  /* reverse the digits of the string.  In this case _s points
+   * to the first digit [exluding the sign] of the number]
+   */
+  bn_reverse ((unsigned char *)_s, digs);
+
+  /* append a NULL so the string is properly terminated */
+  *str = '\0';
+
+  mp_clear (&t);
+  return MP_OKAY;
+}
+
+#endif
+
+/* End: bn_mp_toradix_n.c */
+
+/* Start: bn_mp_unsigned_bin_size.c */
+#include <tommath.h>
+#ifdef BN_MP_UNSIGNED_BIN_SIZE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* get the size for an unsigned equivalent */
+int mp_unsigned_bin_size (mp_int * a)
+{
+  int     size = mp_count_bits (a);
+  return (size / 8 + ((size & 7) != 0 ? 1 : 0));
+}
+#endif
+
+/* End: bn_mp_unsigned_bin_size.c */
+
+/* Start: bn_mp_xor.c */
+#include <tommath.h>
+#ifdef BN_MP_XOR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* XOR two ints together */
+int
+mp_xor (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     res, ix, px;
+  mp_int  t, *x;
+
+  if (a->used > b->used) {
+    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+      return res;
+    }
+    px = b->used;
+    x = b;
+  } else {
+    if ((res = mp_init_copy (&t, b)) != MP_OKAY) {
+      return res;
+    }
+    px = a->used;
+    x = a;
+  }
+
+  for (ix = 0; ix < px; ix++) {
+     t.dp[ix] ^= x->dp[ix];
+  }
+  mp_clamp (&t);
+  mp_exch (c, &t);
+  mp_clear (&t);
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_mp_xor.c */
+
+/* Start: bn_mp_zero.c */
+#include <tommath.h>
+#ifdef BN_MP_ZERO_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* set to zero */
+void mp_zero (mp_int * a)
+{
+  int       n;
+  mp_digit *tmp;
+
+  a->sign = MP_ZPOS;
+  a->used = 0;
+
+  tmp = a->dp;
+  for (n = 0; n < a->alloc; n++) {
+     *tmp++ = 0;
+  }
+}
+#endif
+
+/* End: bn_mp_zero.c */
+
+/* Start: bn_prime_tab.c */
+#include <tommath.h>
+#ifdef BN_PRIME_TAB_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+const mp_digit ltm_prime_tab[] = {
+  0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
+  0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
+  0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
+  0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F,
+#ifndef MP_8BIT
+  0x0083,
+  0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
+  0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
+  0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
+  0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
+
+  0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
+  0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
+  0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
+  0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
+  0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
+  0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
+  0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
+  0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
+
+  0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
+  0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
+  0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
+  0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
+  0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
+  0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
+  0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
+  0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
+
+  0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
+  0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
+  0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
+  0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
+  0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
+  0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
+  0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
+  0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
+#endif
+};
+#endif
+
+/* End: bn_prime_tab.c */
+
+/* Start: bn_reverse.c */
+#include <tommath.h>
+#ifdef BN_REVERSE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* reverse an array, used for radix code */
+void
+bn_reverse (unsigned char *s, int len)
+{
+  int     ix, iy;
+  unsigned char t;
+
+  ix = 0;
+  iy = len - 1;
+  while (ix < iy) {
+    t     = s[ix];
+    s[ix] = s[iy];
+    s[iy] = t;
+    ++ix;
+    --iy;
+  }
+}
+#endif
+
+/* End: bn_reverse.c */
+
+/* Start: bn_s_mp_add.c */
+#include <tommath.h>
+#ifdef BN_S_MP_ADD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* low level addition, based on HAC pp.594, Algorithm 14.7 */
+int
+s_mp_add (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int *x;
+  int     olduse, res, min, max;
+
+  /* find sizes, we let |a| <= |b| which means we have to sort
+   * them.  "x" will point to the input with the most digits
+   */
+  if (a->used > b->used) {
+    min = b->used;
+    max = a->used;
+    x = a;
+  } else {
+    min = a->used;
+    max = b->used;
+    x = b;
+  }
+
+  /* init result */
+  if (c->alloc < max + 1) {
+    if ((res = mp_grow (c, max + 1)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* get old used digit count and set new one */
+  olduse = c->used;
+  c->used = max + 1;
+
+  {
+    register mp_digit u, *tmpa, *tmpb, *tmpc;
+    register int i;
+
+    /* alias for digit pointers */
+
+    /* first input */
+    tmpa = a->dp;
+
+    /* second input */
+    tmpb = b->dp;
+
+    /* destination */
+    tmpc = c->dp;
+
+    /* zero the carry */
+    u = 0;
+    for (i = 0; i < min; i++) {
+      /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
+      *tmpc = *tmpa++ + *tmpb++ + u;
+
+      /* U = carry bit of T[i] */
+      u = *tmpc >> ((mp_digit)DIGIT_BIT);
+
+      /* take away carry bit from T[i] */
+      *tmpc++ &= MP_MASK;
+    }
+
+    /* now copy higher words if any, that is in A+B 
+     * if A or B has more digits add those in 
+     */
+    if (min != max) {
+      for (; i < max; i++) {
+        /* T[i] = X[i] + U */
+        *tmpc = x->dp[i] + u;
+
+        /* U = carry bit of T[i] */
+        u = *tmpc >> ((mp_digit)DIGIT_BIT);
+
+        /* take away carry bit from T[i] */
+        *tmpc++ &= MP_MASK;
+      }
+    }
+
+    /* add carry */
+    *tmpc++ = u;
+
+    /* clear digits above oldused */
+    for (i = c->used; i < olduse; i++) {
+      *tmpc++ = 0;
+    }
+  }
+
+  mp_clamp (c);
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_s_mp_add.c */
+
+/* Start: bn_s_mp_exptmod.c */
+#include <tommath.h>
+#ifdef BN_S_MP_EXPTMOD_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+#ifdef MP_LOW_MEM
+   #define TAB_SIZE 32
+#else
+   #define TAB_SIZE 256
+#endif
+
+int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
+{
+  mp_int  M[TAB_SIZE], res, mu;
+  mp_digit buf;
+  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+  int (*redux)(mp_int*,mp_int*,mp_int*);
+
+  /* find window size */
+  x = mp_count_bits (X);
+  if (x <= 7) {
+    winsize = 2;
+  } else if (x <= 36) {
+    winsize = 3;
+  } else if (x <= 140) {
+    winsize = 4;
+  } else if (x <= 450) {
+    winsize = 5;
+  } else if (x <= 1303) {
+    winsize = 6;
+  } else if (x <= 3529) {
+    winsize = 7;
+  } else {
+    winsize = 8;
+  }
+
+#ifdef MP_LOW_MEM
+    if (winsize > 5) {
+       winsize = 5;
+    }
+#endif
+
+  /* init M array */
+  /* init first cell */
+  if ((err = mp_init(&M[1])) != MP_OKAY) {
+     return err; 
+  }
+
+  /* now init the second half of the array */
+  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
+    if ((err = mp_init(&M[x])) != MP_OKAY) {
+      for (y = 1<<(winsize-1); y < x; y++) {
+        mp_clear (&M[y]);
+      }
+      mp_clear(&M[1]);
+      return err;
+    }
+  }
+
+  /* create mu, used for Barrett reduction */
+  if ((err = mp_init (&mu)) != MP_OKAY) {
+    goto LBL_M;
+  }
+  
+  if (redmode == 0) {
+     if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
+        goto LBL_MU;
+     }
+     redux = mp_reduce;
+  } else {
+     if ((err = mp_reduce_2k_setup_l (P, &mu)) != MP_OKAY) {
+        goto LBL_MU;
+     }
+     redux = mp_reduce_2k_l;
+  }    
+
+  /* create M table
+   *
+   * The M table contains powers of the base, 
+   * e.g. M[x] = G**x mod P
+   *
+   * The first half of the table is not 
+   * computed though accept for M[0] and M[1]
+   */
+  if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
+    goto LBL_MU;
+  }
+
+  /* compute the value at M[1<<(winsize-1)] by squaring 
+   * M[1] (winsize-1) times 
+   */
+  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
+    goto LBL_MU;
+  }
+
+  for (x = 0; x < (winsize - 1); x++) {
+    /* square it */
+    if ((err = mp_sqr (&M[1 << (winsize - 1)], 
+                       &M[1 << (winsize - 1)])) != MP_OKAY) {
+      goto LBL_MU;
+    }
+
+    /* reduce modulo P */
+    if ((err = redux (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
+      goto LBL_MU;
+    }
+  }
+
+  /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
+   * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
+   */
+  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
+    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
+      goto LBL_MU;
+    }
+    if ((err = redux (&M[x], P, &mu)) != MP_OKAY) {
+      goto LBL_MU;
+    }
+  }
+
+  /* setup result */
+  if ((err = mp_init (&res)) != MP_OKAY) {
+    goto LBL_MU;
+  }
+  mp_set (&res, 1);
+
+  /* set initial mode and bit cnt */
+  mode   = 0;
+  bitcnt = 1;
+  buf    = 0;
+  digidx = X->used - 1;
+  bitcpy = 0;
+  bitbuf = 0;
+
+  for (;;) {
+    /* grab next digit as required */
+    if (--bitcnt == 0) {
+      /* if digidx == -1 we are out of digits */
+      if (digidx == -1) {
+        break;
+      }
+      /* read next digit and reset the bitcnt */
+      buf    = X->dp[digidx--];
+      bitcnt = (int) DIGIT_BIT;
+    }
+
+    /* grab the next msb from the exponent */
+    y     = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
+    buf <<= (mp_digit)1;
+
+    /* if the bit is zero and mode == 0 then we ignore it
+     * These represent the leading zero bits before the first 1 bit
+     * in the exponent.  Technically this opt is not required but it
+     * does lower the # of trivial squaring/reductions used
+     */
+    if (mode == 0 && y == 0) {
+      continue;
+    }
+
+    /* if the bit is zero and mode == 1 then we square */
+    if (mode == 1 && y == 0) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+      continue;
+    }
+
+    /* else we add it to the window */
+    bitbuf |= (y << (winsize - ++bitcpy));
+    mode    = 2;
+
+    if (bitcpy == winsize) {
+      /* ok window is filled so square as required and multiply  */
+      /* square first */
+      for (x = 0; x < winsize; x++) {
+        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+          goto LBL_RES;
+        }
+        if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+          goto LBL_RES;
+        }
+      }
+
+      /* then multiply */
+      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+
+      /* empty window and reset */
+      bitcpy = 0;
+      bitbuf = 0;
+      mode   = 1;
+    }
+  }
+
+  /* if bits remain then square/multiply */
+  if (mode == 2 && bitcpy > 0) {
+    /* square then multiply if the bit is set */
+    for (x = 0; x < bitcpy; x++) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+      if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+        goto LBL_RES;
+      }
+
+      bitbuf <<= 1;
+      if ((bitbuf & (1 << winsize)) != 0) {
+        /* then multiply */
+        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
+          goto LBL_RES;
+        }
+        if ((err = redux (&res, P, &mu)) != MP_OKAY) {
+          goto LBL_RES;
+        }
+      }
+    }
+  }
+
+  mp_exch (&res, Y);
+  err = MP_OKAY;
+LBL_RES:mp_clear (&res);
+LBL_MU:mp_clear (&mu);
+LBL_M:
+  mp_clear(&M[1]);
+  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
+    mp_clear (&M[x]);
+  }
+  return err;
+}
+#endif
+
+/* End: bn_s_mp_exptmod.c */
+
+/* Start: bn_s_mp_mul_digs.c */
+#include <tommath.h>
+#ifdef BN_S_MP_MUL_DIGS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* multiplies |a| * |b| and only computes upto digs digits of result
+ * HAC pp. 595, Algorithm 14.12  Modified so you can control how 
+ * many digits of output are created.
+ */
+int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+{
+  mp_int  t;
+  int     res, pa, pb, ix, iy;
+  mp_digit u;
+  mp_word r;
+  mp_digit tmpx, *tmpt, *tmpy;
+
+  /* can we use the fast multiplier? */
+  if (((digs) < MP_WARRAY) &&
+      MIN (a->used, b->used) < 
+          (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+    return fast_s_mp_mul_digs (a, b, c, digs);
+  }
+
+  if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
+    return res;
+  }
+  t.used = digs;
+
+  /* compute the digits of the product directly */
+  pa = a->used;
+  for (ix = 0; ix < pa; ix++) {
+    /* set the carry to zero */
+    u = 0;
+
+    /* limit ourselves to making digs digits of output */
+    pb = MIN (b->used, digs - ix);
+
+    /* setup some aliases */
+    /* copy of the digit from a used within the nested loop */
+    tmpx = a->dp[ix];
+    
+    /* an alias for the destination shifted ix places */
+    tmpt = t.dp + ix;
+    
+    /* an alias for the digits of b */
+    tmpy = b->dp;
+
+    /* compute the columns of the output and propagate the carry */
+    for (iy = 0; iy < pb; iy++) {
+      /* compute the column as a mp_word */
+      r       = ((mp_word)*tmpt) +
+                ((mp_word)tmpx) * ((mp_word)*tmpy++) +
+                ((mp_word) u);
+
+      /* the new column is the lower part of the result */
+      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+
+      /* get the carry word from the result */
+      u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+    }
+    /* set carry if it is placed below digs */
+    if (ix + iy < digs) {
+      *tmpt = u;
+    }
+  }
+
+  mp_clamp (&t);
+  mp_exch (&t, c);
+
+  mp_clear (&t);
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_s_mp_mul_digs.c */
+
+/* Start: bn_s_mp_mul_high_digs.c */
+#include <tommath.h>
+#ifdef BN_S_MP_MUL_HIGH_DIGS_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* multiplies |a| * |b| and does not compute the lower digs digits
+ * [meant to get the higher part of the product]
+ */
+int
+s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+{
+  mp_int  t;
+  int     res, pa, pb, ix, iy;
+  mp_digit u;
+  mp_word r;
+  mp_digit tmpx, *tmpt, *tmpy;
+
+  /* can we use the fast multiplier? */
+#ifdef BN_FAST_S_MP_MUL_HIGH_DIGS_C
+  if (((a->used + b->used + 1) < MP_WARRAY)
+      && MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+    return fast_s_mp_mul_high_digs (a, b, c, digs);
+  }
+#endif
+
+  if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
+    return res;
+  }
+  t.used = a->used + b->used + 1;
+
+  pa = a->used;
+  pb = b->used;
+  for (ix = 0; ix < pa; ix++) {
+    /* clear the carry */
+    u = 0;
+
+    /* left hand side of A[ix] * B[iy] */
+    tmpx = a->dp[ix];
+
+    /* alias to the address of where the digits will be stored */
+    tmpt = &(t.dp[digs]);
+
+    /* alias for where to read the right hand side from */
+    tmpy = b->dp + (digs - ix);
+
+    for (iy = digs - ix; iy < pb; iy++) {
+      /* calculate the double precision result */
+      r       = ((mp_word)*tmpt) +
+                ((mp_word)tmpx) * ((mp_word)*tmpy++) +
+                ((mp_word) u);
+
+      /* get the lower part */
+      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+
+      /* carry the carry */
+      u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+    }
+    *tmpt = u;
+  }
+  mp_clamp (&t);
+  mp_exch (&t, c);
+  mp_clear (&t);
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_s_mp_mul_high_digs.c */
+
+/* Start: bn_s_mp_sqr.c */
+#include <tommath.h>
+#ifdef BN_S_MP_SQR_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
+int s_mp_sqr (mp_int * a, mp_int * b)
+{
+  mp_int  t;
+  int     res, ix, iy, pa;
+  mp_word r;
+  mp_digit u, tmpx, *tmpt;
+
+  pa = a->used;
+  if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) {
+    return res;
+  }
+
+  /* default used is maximum possible size */
+  t.used = 2*pa + 1;
+
+  for (ix = 0; ix < pa; ix++) {
+    /* first calculate the digit at 2*ix */
+    /* calculate double precision result */
+    r = ((mp_word) t.dp[2*ix]) +
+        ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
+
+    /* store lower part in result */
+    t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
+
+    /* get the carry */
+    u           = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+
+    /* left hand side of A[ix] * A[iy] */
+    tmpx        = a->dp[ix];
+
+    /* alias for where to store the results */
+    tmpt        = t.dp + (2*ix + 1);
+    
+    for (iy = ix + 1; iy < pa; iy++) {
+      /* first calculate the product */
+      r       = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
+
+      /* now calculate the double precision result, note we use
+       * addition instead of *2 since it's easier to optimize
+       */
+      r       = ((mp_word) *tmpt) + r + r + ((mp_word) u);
+
+      /* store lower part */
+      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+
+      /* get carry */
+      u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+    }
+    /* propagate upwards */
+    while (u != ((mp_digit) 0)) {
+      r       = ((mp_word) *tmpt) + ((mp_word) u);
+      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+      u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+    }
+  }
+
+  mp_clamp (&t);
+  mp_exch (&t, b);
+  mp_clear (&t);
+  return MP_OKAY;
+}
+#endif
+
+/* End: bn_s_mp_sqr.c */
+
+/* Start: bn_s_mp_sub.c */
+#include <tommath.h>
+#ifdef BN_S_MP_SUB_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
+int
+s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     olduse, res, min, max;
+
+  /* find sizes */
+  min = b->used;
+  max = a->used;
+
+  /* init result */
+  if (c->alloc < max) {
+    if ((res = mp_grow (c, max)) != MP_OKAY) {
+      return res;
+    }
+  }
+  olduse = c->used;
+  c->used = max;
+
+  {
+    register mp_digit u, *tmpa, *tmpb, *tmpc;
+    register int i;
+
+    /* alias for digit pointers */
+    tmpa = a->dp;
+    tmpb = b->dp;
+    tmpc = c->dp;
+
+    /* set carry to zero */
+    u = 0;
+    for (i = 0; i < min; i++) {
+      /* T[i] = A[i] - B[i] - U */
+      *tmpc = *tmpa++ - *tmpb++ - u;
+
+      /* U = carry bit of T[i]
+       * Note this saves performing an AND operation since
+       * if a carry does occur it will propagate all the way to the
+       * MSB.  As a result a single shift is enough to get the carry
+       */
+      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
+
+      /* Clear carry from T[i] */
+      *tmpc++ &= MP_MASK;
+    }
+
+    /* now copy higher words if any, e.g. if A has more digits than B  */
+    for (; i < max; i++) {
+      /* T[i] = A[i] - U */
+      *tmpc = *tmpa++ - u;
+
+      /* U = carry bit of T[i] */
+      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
+
+      /* Clear carry from T[i] */
+      *tmpc++ &= MP_MASK;
+    }
+
+    /* clear digits above used (since we may not have grown result above) */
+    for (i = c->used; i < olduse; i++) {
+      *tmpc++ = 0;
+    }
+  }
+
+  mp_clamp (c);
+  return MP_OKAY;
+}
+
+#endif
+
+/* End: bn_s_mp_sub.c */
+
+/* Start: bncore.c */
+#include <tommath.h>
+#ifdef BNCORE_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+
+/* Known optimal configurations
+
+ CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
+-------------------------------------------------------------
+ Intel P4 Northwood     /GCC v3.4.1   /        88/       128/LTM 0.32 ;-)
+ AMD Athlon64           /GCC v3.4.4   /        74/       124/LTM 0.34
+ 
+*/
+
+int     KARATSUBA_MUL_CUTOFF = 74,      /* Min. number of digits before Karatsuba multiplication is used. */
+        KARATSUBA_SQR_CUTOFF = 124,     /* Min. number of digits before Karatsuba squaring is used. */
+        
+        TOOM_MUL_CUTOFF      = 350,      /* no optimal values of these are known yet so set em high */
+        TOOM_SQR_CUTOFF      = 400; 
+#endif
+
+/* End: bncore.c */
+
+
+/* EOF */
diff --git a/libtommath/pretty.build b/libtommath/pretty.build
new file mode 100644
index 0000000..a708b8a
--- /dev/null
+++ b/libtommath/pretty.build
@@ -0,0 +1,66 @@
+#!/bin/perl -w
+#
+# Cute little builder for perl 
+# Total waste of development time...
+#
+# This will build all the object files and then the archive .a file
+# requires GCC, GNU make and a sense of humour.
+#
+# Tom St Denis
+use strict;
+
+my $count = 0;
+my $starttime = time;
+my $rate  = 0;
+print "Scanning for source files...\n";
+foreach my $filename (glob "*.c") {
+       ++$count;
+}
+print "Source files to build: $count\nBuilding...\n";
+my $i = 0;
+my $lines = 0;
+my $filesbuilt = 0;
+foreach my $filename (glob "*.c") {
+       printf("Building %3.2f%%, ", (++$i/$count)*100.0);
+       if ($i % 4 == 0) { print "/, "; }
+       if ($i % 4 == 1) { print "-, "; }
+       if ($i % 4 == 2) { print "\\, "; }
+       if ($i % 4 == 3) { print "|, "; }
+       if ($rate > 0) {
+           my $tleft = ($count - $i) / $rate;
+           my $tsec  = $tleft%60;
+           my $tmin  = ($tleft/60)%60;
+           my $thour = ($tleft/3600)%60;
+           printf("%2d:%02d:%02d left, ", $thour, $tmin, $tsec);
+       }
+       my $cnt = ($i/$count)*30.0;
+       my $x   = 0;
+       print "[";
+       for (; $x < $cnt; $x++) { print "#"; }
+       for (; $x < 30; $x++)   { print " "; }
+       print "]\r";
+       my $tmp = $filename;
+       $tmp =~ s/\.c/".o"/ge;
+       if (open(SRC, "<$tmp")) {
+          close SRC;
+       } else {
+          !system("make $tmp > /dev/null 2>/dev/null") or die "\nERROR: Failed to make $tmp!!!\n";
+          open( SRC, "<$filename" ) or die "Couldn't open $filename for reading: $!";
+          ++$lines while (<SRC>);
+          close SRC or die "Error closing $filename after reading: $!";
+          ++$filesbuilt;
+       }      
+
+       # update timer 
+       if (time != $starttime) {
+          my $delay = time - $starttime;
+          $rate = $i/$delay;
+       }
+}
+
+# finish building the library 
+printf("\nFinished building source (%d seconds, %3.2f files per second).\n", time - $starttime, $rate);
+print "Compiled approximately $filesbuilt files and $lines lines of code.\n";
+print "Doing final make (building archive...)\n";
+!system("make > /dev/null 2>/dev/null") or die "\nERROR: Failed to perform last make command!!!\n";
+print "done.\n";
+\ No newline at end of file
diff --git a/libtommath/tombc/grammar.txt b/libtommath/tombc/grammar.txt
new file mode 100644
index 0000000..a780e75
--- /dev/null
+++ b/libtommath/tombc/grammar.txt
@@ -0,0 +1,35 @@
+program       := program statement | statement | empty
+statement     := { statement }                                                                              | 
+                 identifier = numexpression;                                                                | 
+                 identifier[numexpression] = numexpression;                                                 |
+                 function(expressionlist);                                                                  | 
+                 for (identifer = numexpression; numexpression; identifier = numexpression) { statement }   |
+                 while (numexpression) { statement }                                                        | 
+                 if (numexpresion) { statement } elif                                                       | 
+                 break;                                                                                     | 
+                 continue;                                                                                  
+                 
+elif          := else statement | empty
+function      := abs | countbits | exptmod | jacobi | print | isprime | nextprime | issquare | readinteger | exit
+expressionlist := expressionlist, expression | expression
+
+// LR(1) !!!?
+expression    := string | numexpression
+numexpression := cmpexpr && cmpexpr | cmpexpr \|\| cmpexpr | cmpexpr
+cmpexpr       := boolexpr  < boolexpr | boolexpr  > boolexpr | boolexpr == boolexpr | 
+                 boolexpr <= boolexpr | boolexpr >= boolexpr | boolexpr
+boolexpr      := shiftexpr & shiftexpr | shiftexpr ^ shiftexpr | shiftexpr \| shiftexpr | shiftexpr
+shiftexpr     := addsubexpr << addsubexpr | addsubexpr >> addsubexpr | addsubexpr
+addsubexpr    := mulexpr + mulexpr | mulexpr - mulexpr | mulexpr
+mulexpr       := expr * expr       | expr / expr | expr % expr | expr
+expr          := -nexpr | nexpr 
+nexpr         := integer | identifier | ( numexpression ) | identifier[numexpression] 
+
+identifier    := identifer digits | identifier alpha | alpha
+alpha         := a ... z | A ... Z
+integer       := hexnumber | digits 
+hexnumber     := 0xhexdigits
+hexdigits     := hexdigits hexdigit | hexdigit
+hexdigit      := 0 ... 9 | a ... f | A ... F
+digits        := digits digit | digit 
+digit         := 0 ... 9
diff --git a/libtommath/tommath.h b/libtommath/tommath.h
new file mode 100644
index 0000000..bcb9d86
--- /dev/null
+++ b/libtommath/tommath.h
@@ -0,0 +1,578 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#ifndef BN_H_
+#define BN_H_
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <limits.h>
+
+#include <tommath_class.h>
+
+#undef MIN
+#define MIN(x,y) ((x)<(y)?(x):(y))
+#undef MAX
+#define MAX(x,y) ((x)>(y)?(x):(y))
+
+#ifdef __cplusplus
+extern "C" {
+
+/* C++ compilers don't like assigning void * to mp_digit * */
+#define  OPT_CAST(x)  (x *)
+
+#else
+
+/* C on the other hand doesn't care */
+#define  OPT_CAST(x)
+
+#endif
+
+
+/* detect 64-bit mode if possible */
+#if defined(__x86_64__) 
+   #if !(defined(MP_64BIT) && defined(MP_16BIT) && defined(MP_8BIT))
+      #define MP_64BIT
+   #endif
+#endif
+
+/* some default configurations.
+ *
+ * A "mp_digit" must be able to hold DIGIT_BIT + 1 bits
+ * A "mp_word" must be able to hold 2*DIGIT_BIT + 1 bits
+ *
+ * At the very least a mp_digit must be able to hold 7 bits
+ * [any size beyond that is ok provided it doesn't overflow the data type]
+ */
+#ifdef MP_8BIT
+   typedef unsigned char      mp_digit;
+   typedef unsigned short     mp_word;
+#elif defined(MP_16BIT)
+   typedef unsigned short     mp_digit;
+   typedef unsigned long      mp_word;
+#elif defined(MP_64BIT)
+   /* for GCC only on supported platforms */
+#ifndef CRYPT
+   typedef unsigned long long ulong64;
+   typedef signed long long   long64;
+#endif
+
+   typedef unsigned long      mp_digit;
+   typedef unsigned long      mp_word __attribute__ ((mode(TI)));
+
+   #define DIGIT_BIT          60
+#else
+   /* this is the default case, 28-bit digits */
+   
+   /* this is to make porting into LibTomCrypt easier :-) */
+#ifndef CRYPT
+   #if defined(_MSC_VER) || defined(__BORLANDC__) 
+      typedef unsigned __int64   ulong64;
+      typedef signed __int64     long64;
+   #else
+      typedef unsigned long long ulong64;
+      typedef signed long long   long64;
+   #endif
+#endif
+
+   typedef unsigned long      mp_digit;
+   typedef ulong64            mp_word;
+
+#ifdef MP_31BIT   
+   /* this is an extension that uses 31-bit digits */
+   #define DIGIT_BIT          31
+#else
+   /* default case is 28-bit digits, defines MP_28BIT as a handy macro to test */
+   #define DIGIT_BIT          28
+   #define MP_28BIT
+#endif   
+#endif
+
+/* define heap macros */
+#ifndef CRYPT
+   /* default to libc stuff */
+   #ifndef XMALLOC 
+       #define XMALLOC  malloc
+       #define XFREE    free
+       #define XREALLOC realloc
+       #define XCALLOC  calloc
+   #else
+      /* prototypes for our heap functions */
+      extern void *XMALLOC(size_t n);
+      extern void *REALLOC(void *p, size_t n);
+      extern void *XCALLOC(size_t n, size_t s);
+      extern void XFREE(void *p);
+   #endif
+#endif
+
+
+/* otherwise the bits per digit is calculated automatically from the size of a mp_digit */
+#ifndef DIGIT_BIT
+   #define DIGIT_BIT     ((int)((CHAR_BIT * sizeof(mp_digit) - 1)))  /* bits per digit */
+#endif
+
+#define MP_DIGIT_BIT     DIGIT_BIT
+#define MP_MASK          ((((mp_digit)1)<<((mp_digit)DIGIT_BIT))-((mp_digit)1))
+#define MP_DIGIT_MAX     MP_MASK
+
+/* equalities */
+#define MP_LT        -1   /* less than */
+#define MP_EQ         0   /* equal to */
+#define MP_GT         1   /* greater than */
+
+#define MP_ZPOS       0   /* positive integer */
+#define MP_NEG        1   /* negative */
+
+#define MP_OKAY       0   /* ok result */
+#define MP_MEM        -2  /* out of mem */
+#define MP_VAL        -3  /* invalid input */
+#define MP_RANGE      MP_VAL
+
+#define MP_YES        1   /* yes response */
+#define MP_NO         0   /* no response */
+
+/* Primality generation flags */
+#define LTM_PRIME_BBS      0x0001 /* BBS style prime */
+#define LTM_PRIME_SAFE     0x0002 /* Safe prime (p-1)/2 == prime */
+#define LTM_PRIME_2MSB_OFF 0x0004 /* force 2nd MSB to 0 */
+#define LTM_PRIME_2MSB_ON  0x0008 /* force 2nd MSB to 1 */
+
+typedef int           mp_err;
+
+/* you'll have to tune these... */
+extern int KARATSUBA_MUL_CUTOFF,
+           KARATSUBA_SQR_CUTOFF,
+           TOOM_MUL_CUTOFF,
+           TOOM_SQR_CUTOFF;
+
+/* define this to use lower memory usage routines (exptmods mostly) */
+/* #define MP_LOW_MEM */
+
+/* default precision */
+#ifndef MP_PREC
+   #ifndef MP_LOW_MEM
+      #define MP_PREC                 64     /* default digits of precision */
+   #else
+      #define MP_PREC                 8      /* default digits of precision */
+   #endif   
+#endif
+
+/* size of comba arrays, should be at least 2 * 2**(BITS_PER_WORD - BITS_PER_DIGIT*2) */
+#define MP_WARRAY               (1 << (sizeof(mp_word) * CHAR_BIT - 2 * DIGIT_BIT + 1))
+
+/* the infamous mp_int structure */
+typedef struct  {
+    int used, alloc, sign;
+    mp_digit *dp;
+} mp_int;
+
+/* callback for mp_prime_random, should fill dst with random bytes and return how many read [upto len] */
+typedef int ltm_prime_callback(unsigned char *dst, int len, void *dat);
+
+
+#define USED(m)    ((m)->used)
+#define DIGIT(m,k) ((m)->dp[(k)])
+#define SIGN(m)    ((m)->sign)
+
+/* error code to char* string */
+char *mp_error_to_string(int code);
+
+/* ---> init and deinit bignum functions <--- */
+/* init a bignum */
+int mp_init(mp_int *a);
+
+/* free a bignum */
+void mp_clear(mp_int *a);
+
+/* init a null terminated series of arguments */
+int mp_init_multi(mp_int *mp, ...);
+
+/* clear a null terminated series of arguments */
+void mp_clear_multi(mp_int *mp, ...);
+
+/* exchange two ints */
+void mp_exch(mp_int *a, mp_int *b);
+
+/* shrink ram required for a bignum */
+int mp_shrink(mp_int *a);
+
+/* grow an int to a given size */
+int mp_grow(mp_int *a, int size);
+
+/* init to a given number of digits */
+int mp_init_size(mp_int *a, int size);
+
+/* ---> Basic Manipulations <--- */
+#define mp_iszero(a) (((a)->used == 0) ? MP_YES : MP_NO)
+#define mp_iseven(a) (((a)->used > 0 && (((a)->dp[0] & 1) == 0)) ? MP_YES : MP_NO)
+#define mp_isodd(a)  (((a)->used > 0 && (((a)->dp[0] & 1) == 1)) ? MP_YES : MP_NO)
+
+/* set to zero */
+void mp_zero(mp_int *a);
+
+/* set to a digit */
+void mp_set(mp_int *a, mp_digit b);
+
+/* set a 32-bit const */
+int mp_set_int(mp_int *a, unsigned long b);
+
+/* get a 32-bit value */
+unsigned long mp_get_int(mp_int * a);
+
+/* initialize and set a digit */
+int mp_init_set (mp_int * a, mp_digit b);
+
+/* initialize and set 32-bit value */
+int mp_init_set_int (mp_int * a, unsigned long b);
+
+/* copy, b = a */
+int mp_copy(mp_int *a, mp_int *b);
+
+/* inits and copies, a = b */
+int mp_init_copy(mp_int *a, mp_int *b);
+
+/* trim unused digits */
+void mp_clamp(mp_int *a);
+
+/* ---> digit manipulation <--- */
+
+/* right shift by "b" digits */
+void mp_rshd(mp_int *a, int b);
+
+/* left shift by "b" digits */
+int mp_lshd(mp_int *a, int b);
+
+/* c = a / 2**b */
+int mp_div_2d(mp_int *a, int b, mp_int *c, mp_int *d);
+
+/* b = a/2 */
+int mp_div_2(mp_int *a, mp_int *b);
+
+/* c = a * 2**b */
+int mp_mul_2d(mp_int *a, int b, mp_int *c);
+
+/* b = a*2 */
+int mp_mul_2(mp_int *a, mp_int *b);
+
+/* c = a mod 2**d */
+int mp_mod_2d(mp_int *a, int b, mp_int *c);
+
+/* computes a = 2**b */
+int mp_2expt(mp_int *a, int b);
+
+/* Counts the number of lsbs which are zero before the first zero bit */
+int mp_cnt_lsb(mp_int *a);
+
+/* I Love Earth! */
+
+/* makes a pseudo-random int of a given size */
+int mp_rand(mp_int *a, int digits);
+
+/* ---> binary operations <--- */
+/* c = a XOR b  */
+int mp_xor(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = a OR b */
+int mp_or(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = a AND b */
+int mp_and(mp_int *a, mp_int *b, mp_int *c);
+
+/* ---> Basic arithmetic <--- */
+
+/* b = -a */
+int mp_neg(mp_int *a, mp_int *b);
+
+/* b = |a| */
+int mp_abs(mp_int *a, mp_int *b);
+
+/* compare a to b */
+int mp_cmp(mp_int *a, mp_int *b);
+
+/* compare |a| to |b| */
+int mp_cmp_mag(mp_int *a, mp_int *b);
+
+/* c = a + b */
+int mp_add(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = a - b */
+int mp_sub(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = a * b */
+int mp_mul(mp_int *a, mp_int *b, mp_int *c);
+
+/* b = a*a  */
+int mp_sqr(mp_int *a, mp_int *b);
+
+/* a/b => cb + d == a */
+int mp_div(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+
+/* c = a mod b, 0 <= c < b  */
+int mp_mod(mp_int *a, mp_int *b, mp_int *c);
+
+/* ---> single digit functions <--- */
+
+/* compare against a single digit */
+int mp_cmp_d(mp_int *a, mp_digit b);
+
+/* c = a + b */
+int mp_add_d(mp_int *a, mp_digit b, mp_int *c);
+
+/* c = a - b */
+int mp_sub_d(mp_int *a, mp_digit b, mp_int *c);
+
+/* c = a * b */
+int mp_mul_d(mp_int *a, mp_digit b, mp_int *c);
+
+/* a/b => cb + d == a */
+int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d);
+
+/* a/3 => 3c + d == a */
+int mp_div_3(mp_int *a, mp_int *c, mp_digit *d);
+
+/* c = a**b */
+int mp_expt_d(mp_int *a, mp_digit b, mp_int *c);
+
+/* c = a mod b, 0 <= c < b  */
+int mp_mod_d(mp_int *a, mp_digit b, mp_digit *c);
+
+/* ---> number theory <--- */
+
+/* d = a + b (mod c) */
+int mp_addmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+
+/* d = a - b (mod c) */
+int mp_submod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+
+/* d = a * b (mod c) */
+int mp_mulmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+
+/* c = a * a (mod b) */
+int mp_sqrmod(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = 1/a (mod b) */
+int mp_invmod(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = (a, b) */
+int mp_gcd(mp_int *a, mp_int *b, mp_int *c);
+
+/* produces value such that U1*a + U2*b = U3 */
+int mp_exteuclid(mp_int *a, mp_int *b, mp_int *U1, mp_int *U2, mp_int *U3);
+
+/* c = [a, b] or (a*b)/(a, b) */
+int mp_lcm(mp_int *a, mp_int *b, mp_int *c);
+
+/* finds one of the b'th root of a, such that |c|**b <= |a|
+ *
+ * returns error if a < 0 and b is even
+ */
+int mp_n_root(mp_int *a, mp_digit b, mp_int *c);
+
+/* special sqrt algo */
+int mp_sqrt(mp_int *arg, mp_int *ret);
+
+/* is number a square? */
+int mp_is_square(mp_int *arg, int *ret);
+
+/* computes the jacobi c = (a | n) (or Legendre if b is prime)  */
+int mp_jacobi(mp_int *a, mp_int *n, int *c);
+
+/* used to setup the Barrett reduction for a given modulus b */
+int mp_reduce_setup(mp_int *a, mp_int *b);
+
+/* Barrett Reduction, computes a (mod b) with a precomputed value c
+ *
+ * Assumes that 0 < a <= b*b, note if 0 > a > -(b*b) then you can merely
+ * compute the reduction as -1 * mp_reduce(mp_abs(a)) [pseudo code].
+ */
+int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
+
+/* setups the montgomery reduction */
+int mp_montgomery_setup(mp_int *a, mp_digit *mp);
+
+/* computes a = B**n mod b without division or multiplication useful for
+ * normalizing numbers in a Montgomery system.
+ */
+int mp_montgomery_calc_normalization(mp_int *a, mp_int *b);
+
+/* computes x/R == x (mod N) via Montgomery Reduction */
+int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
+
+/* returns 1 if a is a valid DR modulus */
+int mp_dr_is_modulus(mp_int *a);
+
+/* sets the value of "d" required for mp_dr_reduce */
+void mp_dr_setup(mp_int *a, mp_digit *d);
+
+/* reduces a modulo b using the Diminished Radix method */
+int mp_dr_reduce(mp_int *a, mp_int *b, mp_digit mp);
+
+/* returns true if a can be reduced with mp_reduce_2k */
+int mp_reduce_is_2k(mp_int *a);
+
+/* determines k value for 2k reduction */
+int mp_reduce_2k_setup(mp_int *a, mp_digit *d);
+
+/* reduces a modulo b where b is of the form 2**p - k [0 <= a] */
+int mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d);
+
+/* returns true if a can be reduced with mp_reduce_2k_l */
+int mp_reduce_is_2k_l(mp_int *a);
+
+/* determines k value for 2k reduction */
+int mp_reduce_2k_setup_l(mp_int *a, mp_int *d);
+
+/* reduces a modulo b where b is of the form 2**p - k [0 <= a] */
+int mp_reduce_2k_l(mp_int *a, mp_int *n, mp_int *d);
+
+/* d = a**b (mod c) */
+int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+
+/* ---> Primes <--- */
+
+/* number of primes */
+#ifdef MP_8BIT
+   #define PRIME_SIZE      31
+#else
+   #define PRIME_SIZE      256
+#endif
+
+/* table of first PRIME_SIZE primes */
+extern const mp_digit ltm_prime_tab[];
+
+/* result=1 if a is divisible by one of the first PRIME_SIZE primes */
+int mp_prime_is_divisible(mp_int *a, int *result);
+
+/* performs one Fermat test of "a" using base "b".
+ * Sets result to 0 if composite or 1 if probable prime
+ */
+int mp_prime_fermat(mp_int *a, mp_int *b, int *result);
+
+/* performs one Miller-Rabin test of "a" using base "b".
+ * Sets result to 0 if composite or 1 if probable prime
+ */
+int mp_prime_miller_rabin(mp_int *a, mp_int *b, int *result);
+
+/* This gives [for a given bit size] the number of trials required
+ * such that Miller-Rabin gives a prob of failure lower than 2^-96 
+ */
+int mp_prime_rabin_miller_trials(int size);
+
+/* performs t rounds of Miller-Rabin on "a" using the first
+ * t prime bases.  Also performs an initial sieve of trial
+ * division.  Determines if "a" is prime with probability
+ * of error no more than (1/4)**t.
+ *
+ * Sets result to 1 if probably prime, 0 otherwise
+ */
+int mp_prime_is_prime(mp_int *a, int t, int *result);
+
+/* finds the next prime after the number "a" using "t" trials
+ * of Miller-Rabin.
+ *
+ * bbs_style = 1 means the prime must be congruent to 3 mod 4
+ */
+int mp_prime_next_prime(mp_int *a, int t, int bbs_style);
+
+/* makes a truly random prime of a given size (bytes),
+ * call with bbs = 1 if you want it to be congruent to 3 mod 4 
+ *
+ * You have to supply a callback which fills in a buffer with random bytes.  "dat" is a parameter you can
+ * have passed to the callback (e.g. a state or something).  This function doesn't use "dat" itself
+ * so it can be NULL
+ *
+ * The prime generated will be larger than 2^(8*size).
+ */
+#define mp_prime_random(a, t, size, bbs, cb, dat) mp_prime_random_ex(a, t, ((size) * 8) + 1, (bbs==1)?LTM_PRIME_BBS:0, cb, dat)
+
+/* makes a truly random prime of a given size (bits),
+ *
+ * Flags are as follows:
+ * 
+ *   LTM_PRIME_BBS      - make prime congruent to 3 mod 4
+ *   LTM_PRIME_SAFE     - make sure (p-1)/2 is prime as well (implies LTM_PRIME_BBS)
+ *   LTM_PRIME_2MSB_OFF - make the 2nd highest bit zero
+ *   LTM_PRIME_2MSB_ON  - make the 2nd highest bit one
+ *
+ * You have to supply a callback which fills in a buffer with random bytes.  "dat" is a parameter you can
+ * have passed to the callback (e.g. a state or something).  This function doesn't use "dat" itself
+ * so it can be NULL
+ *
+ */
+int mp_prime_random_ex(mp_int *a, int t, int size, int flags, ltm_prime_callback cb, void *dat);
+
+/* ---> radix conversion <--- */
+int mp_count_bits(mp_int *a);
+
+int mp_unsigned_bin_size(mp_int *a);
+int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c);
+int mp_to_unsigned_bin(mp_int *a, unsigned char *b);
+int mp_to_unsigned_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen);
+
+int mp_signed_bin_size(mp_int *a);
+int mp_read_signed_bin(mp_int *a, unsigned char *b, int c);
+int mp_to_signed_bin(mp_int *a, unsigned char *b);
+int mp_to_signed_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen);
+
+int mp_read_radix(mp_int *a, const char *str, int radix);
+int mp_toradix(mp_int *a, char *str, int radix);
+int mp_toradix_n(mp_int * a, char *str, int radix, int maxlen);
+int mp_radix_size(mp_int *a, int radix, int *size);
+
+int mp_fread(mp_int *a, int radix, FILE *stream);
+int mp_fwrite(mp_int *a, int radix, FILE *stream);
+
+#define mp_read_raw(mp, str, len) mp_read_signed_bin((mp), (str), (len))
+#define mp_raw_size(mp)           mp_signed_bin_size(mp)
+#define mp_toraw(mp, str)         mp_to_signed_bin((mp), (str))
+#define mp_read_mag(mp, str, len) mp_read_unsigned_bin((mp), (str), (len))
+#define mp_mag_size(mp)           mp_unsigned_bin_size(mp)
+#define mp_tomag(mp, str)         mp_to_unsigned_bin((mp), (str))
+
+#define mp_tobinary(M, S)  mp_toradix((M), (S), 2)
+#define mp_tooctal(M, S)   mp_toradix((M), (S), 8)
+#define mp_todecimal(M, S) mp_toradix((M), (S), 10)
+#define mp_tohex(M, S)     mp_toradix((M), (S), 16)
+
+/* lowlevel functions, do not call! */
+int s_mp_add(mp_int *a, mp_int *b, mp_int *c);
+int s_mp_sub(mp_int *a, mp_int *b, mp_int *c);
+#define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1)
+int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs);
+int s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs);
+int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs);
+int s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs);
+int fast_s_mp_sqr(mp_int *a, mp_int *b);
+int s_mp_sqr(mp_int *a, mp_int *b);
+int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c);
+int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c);
+int mp_karatsuba_sqr(mp_int *a, mp_int *b);
+int mp_toom_sqr(mp_int *a, mp_int *b);
+int fast_mp_invmod(mp_int *a, mp_int *b, mp_int *c);
+int mp_invmod_slow (mp_int * a, mp_int * b, mp_int * c);
+int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
+int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y, int mode);
+int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int mode);
+void bn_reverse(unsigned char *s, int len);
+
+extern const char *mp_s_rmap;
+
+#ifdef __cplusplus
+   }
+#endif
+
+#endif
+
diff --git a/libtommath/tommath.out b/libtommath/tommath.out
new file mode 100644
index 0000000..9f62617
--- /dev/null
+++ b/libtommath/tommath.out
@@ -0,0 +1,139 @@
+\BOOKMARK [0][-]{chapter.1}{Introduction}{}
+\BOOKMARK [1][-]{section.1.1}{Multiple Precision Arithmetic}{chapter.1}
+\BOOKMARK [2][-]{subsection.1.1.1}{What is Multiple Precision Arithmetic?}{section.1.1}
+\BOOKMARK [2][-]{subsection.1.1.2}{The Need for Multiple Precision Arithmetic}{section.1.1}
+\BOOKMARK [2][-]{subsection.1.1.3}{Benefits of Multiple Precision Arithmetic}{section.1.1}
+\BOOKMARK [1][-]{section.1.2}{Purpose of This Text}{chapter.1}
+\BOOKMARK [1][-]{section.1.3}{Discussion and Notation}{chapter.1}
+\BOOKMARK [2][-]{subsection.1.3.1}{Notation}{section.1.3}
+\BOOKMARK [2][-]{subsection.1.3.2}{Precision Notation}{section.1.3}
+\BOOKMARK [2][-]{subsection.1.3.3}{Algorithm Inputs and Outputs}{section.1.3}
+\BOOKMARK [2][-]{subsection.1.3.4}{Mathematical Expressions}{section.1.3}
+\BOOKMARK [2][-]{subsection.1.3.5}{Work Effort}{section.1.3}
+\BOOKMARK [1][-]{section.1.4}{Exercises}{chapter.1}
+\BOOKMARK [1][-]{section.1.5}{Introduction to LibTomMath}{chapter.1}
+\BOOKMARK [2][-]{subsection.1.5.1}{What is LibTomMath?}{section.1.5}
+\BOOKMARK [2][-]{subsection.1.5.2}{Goals of LibTomMath}{section.1.5}
+\BOOKMARK [1][-]{section.1.6}{Choice of LibTomMath}{chapter.1}
+\BOOKMARK [2][-]{subsection.1.6.1}{Code Base}{section.1.6}
+\BOOKMARK [2][-]{subsection.1.6.2}{API Simplicity}{section.1.6}
+\BOOKMARK [2][-]{subsection.1.6.3}{Optimizations}{section.1.6}
+\BOOKMARK [2][-]{subsection.1.6.4}{Portability and Stability}{section.1.6}
+\BOOKMARK [2][-]{subsection.1.6.5}{Choice}{section.1.6}
+\BOOKMARK [0][-]{chapter.2}{Getting Started}{}
+\BOOKMARK [1][-]{section.2.1}{Library Basics}{chapter.2}
+\BOOKMARK [1][-]{section.2.2}{What is a Multiple Precision Integer?}{chapter.2}
+\BOOKMARK [2][-]{subsection.2.2.1}{The mp\137int Structure}{section.2.2}
+\BOOKMARK [1][-]{section.2.3}{Argument Passing}{chapter.2}
+\BOOKMARK [1][-]{section.2.4}{Return Values}{chapter.2}
+\BOOKMARK [1][-]{section.2.5}{Initialization and Clearing}{chapter.2}
+\BOOKMARK [2][-]{subsection.2.5.1}{Initializing an mp\137int}{section.2.5}
+\BOOKMARK [2][-]{subsection.2.5.2}{Clearing an mp\137int}{section.2.5}
+\BOOKMARK [1][-]{section.2.6}{Maintenance Algorithms}{chapter.2}
+\BOOKMARK [2][-]{subsection.2.6.1}{Augmenting an mp\137int's Precision}{section.2.6}
+\BOOKMARK [2][-]{subsection.2.6.2}{Initializing Variable Precision mp\137ints}{section.2.6}
+\BOOKMARK [2][-]{subsection.2.6.3}{Multiple Integer Initializations and Clearings}{section.2.6}
+\BOOKMARK [2][-]{subsection.2.6.4}{Clamping Excess Digits}{section.2.6}
+\BOOKMARK [0][-]{chapter.3}{Basic Operations}{}
+\BOOKMARK [1][-]{section.3.1}{Introduction}{chapter.3}
+\BOOKMARK [1][-]{section.3.2}{Assigning Values to mp\137int Structures}{chapter.3}
+\BOOKMARK [2][-]{subsection.3.2.1}{Copying an mp\137int}{section.3.2}
+\BOOKMARK [2][-]{subsection.3.2.2}{Creating a Clone}{section.3.2}
+\BOOKMARK [1][-]{section.3.3}{Zeroing an Integer}{chapter.3}
+\BOOKMARK [1][-]{section.3.4}{Sign Manipulation}{chapter.3}
+\BOOKMARK [2][-]{subsection.3.4.1}{Absolute Value}{section.3.4}
+\BOOKMARK [2][-]{subsection.3.4.2}{Integer Negation}{section.3.4}
+\BOOKMARK [1][-]{section.3.5}{Small Constants}{chapter.3}
+\BOOKMARK [2][-]{subsection.3.5.1}{Setting Small Constants}{section.3.5}
+\BOOKMARK [2][-]{subsection.3.5.2}{Setting Large Constants}{section.3.5}
+\BOOKMARK [1][-]{section.3.6}{Comparisons}{chapter.3}
+\BOOKMARK [2][-]{subsection.3.6.1}{Unsigned Comparisions}{section.3.6}
+\BOOKMARK [2][-]{subsection.3.6.2}{Signed Comparisons}{section.3.6}
+\BOOKMARK [0][-]{chapter.4}{Basic Arithmetic}{}
+\BOOKMARK [1][-]{section.4.1}{Introduction}{chapter.4}
+\BOOKMARK [1][-]{section.4.2}{Addition and Subtraction}{chapter.4}
+\BOOKMARK [2][-]{subsection.4.2.1}{Low Level Addition}{section.4.2}
+\BOOKMARK [2][-]{subsection.4.2.2}{Low Level Subtraction}{section.4.2}
+\BOOKMARK [2][-]{subsection.4.2.3}{High Level Addition}{section.4.2}
+\BOOKMARK [2][-]{subsection.4.2.4}{High Level Subtraction}{section.4.2}
+\BOOKMARK [1][-]{section.4.3}{Bit and Digit Shifting}{chapter.4}
+\BOOKMARK [2][-]{subsection.4.3.1}{Multiplication by Two}{section.4.3}
+\BOOKMARK [2][-]{subsection.4.3.2}{Division by Two}{section.4.3}
+\BOOKMARK [1][-]{section.4.4}{Polynomial Basis Operations}{chapter.4}
+\BOOKMARK [2][-]{subsection.4.4.1}{Multiplication by x}{section.4.4}
+\BOOKMARK [2][-]{subsection.4.4.2}{Division by x}{section.4.4}
+\BOOKMARK [1][-]{section.4.5}{Powers of Two}{chapter.4}
+\BOOKMARK [2][-]{subsection.4.5.1}{Multiplication by Power of Two}{section.4.5}
+\BOOKMARK [2][-]{subsection.4.5.2}{Division by Power of Two}{section.4.5}
+\BOOKMARK [2][-]{subsection.4.5.3}{Remainder of Division by Power of Two}{section.4.5}
+\BOOKMARK [0][-]{chapter.5}{Multiplication and Squaring}{}
+\BOOKMARK [1][-]{section.5.1}{The Multipliers}{chapter.5}
+\BOOKMARK [1][-]{section.5.2}{Multiplication}{chapter.5}
+\BOOKMARK [2][-]{subsection.5.2.1}{The Baseline Multiplication}{section.5.2}
+\BOOKMARK [2][-]{subsection.5.2.2}{Faster Multiplication by the ``Comba'' Method}{section.5.2}
+\BOOKMARK [2][-]{subsection.5.2.3}{Polynomial Basis Multiplication}{section.5.2}
+\BOOKMARK [2][-]{subsection.5.2.4}{Karatsuba Multiplication}{section.5.2}
+\BOOKMARK [2][-]{subsection.5.2.5}{Toom-Cook 3-Way Multiplication}{section.5.2}
+\BOOKMARK [2][-]{subsection.5.2.6}{Signed Multiplication}{section.5.2}
+\BOOKMARK [1][-]{section.5.3}{Squaring}{chapter.5}
+\BOOKMARK [2][-]{subsection.5.3.1}{The Baseline Squaring Algorithm}{section.5.3}
+\BOOKMARK [2][-]{subsection.5.3.2}{Faster Squaring by the ``Comba'' Method}{section.5.3}
+\BOOKMARK [2][-]{subsection.5.3.3}{Polynomial Basis Squaring}{section.5.3}
+\BOOKMARK [2][-]{subsection.5.3.4}{Karatsuba Squaring}{section.5.3}
+\BOOKMARK [2][-]{subsection.5.3.5}{Toom-Cook Squaring}{section.5.3}
+\BOOKMARK [2][-]{subsection.5.3.6}{High Level Squaring}{section.5.3}
+\BOOKMARK [0][-]{chapter.6}{Modular Reduction}{}
+\BOOKMARK [1][-]{section.6.1}{Basics of Modular Reduction}{chapter.6}
+\BOOKMARK [1][-]{section.6.2}{The Barrett Reduction}{chapter.6}
+\BOOKMARK [2][-]{subsection.6.2.1}{Fixed Point Arithmetic}{section.6.2}
+\BOOKMARK [2][-]{subsection.6.2.2}{Choosing a Radix Point}{section.6.2}
+\BOOKMARK [2][-]{subsection.6.2.3}{Trimming the Quotient}{section.6.2}
+\BOOKMARK [2][-]{subsection.6.2.4}{Trimming the Residue}{section.6.2}
+\BOOKMARK [2][-]{subsection.6.2.5}{The Barrett Algorithm}{section.6.2}
+\BOOKMARK [2][-]{subsection.6.2.6}{The Barrett Setup Algorithm}{section.6.2}
+\BOOKMARK [1][-]{section.6.3}{The Montgomery Reduction}{chapter.6}
+\BOOKMARK [2][-]{subsection.6.3.1}{Digit Based Montgomery Reduction}{section.6.3}
+\BOOKMARK [2][-]{subsection.6.3.2}{Baseline Montgomery Reduction}{section.6.3}
+\BOOKMARK [2][-]{subsection.6.3.3}{Faster ``Comba'' Montgomery Reduction}{section.6.3}
+\BOOKMARK [2][-]{subsection.6.3.4}{Montgomery Setup}{section.6.3}
+\BOOKMARK [1][-]{section.6.4}{The Diminished Radix Algorithm}{chapter.6}
+\BOOKMARK [2][-]{subsection.6.4.1}{Choice of Moduli}{section.6.4}
+\BOOKMARK [2][-]{subsection.6.4.2}{Choice of k}{section.6.4}
+\BOOKMARK [2][-]{subsection.6.4.3}{Restricted Diminished Radix Reduction}{section.6.4}
+\BOOKMARK [2][-]{subsection.6.4.4}{Unrestricted Diminished Radix Reduction}{section.6.4}
+\BOOKMARK [1][-]{section.6.5}{Algorithm Comparison}{chapter.6}
+\BOOKMARK [0][-]{chapter.7}{Exponentiation}{}
+\BOOKMARK [1][-]{section.7.1}{Exponentiation Basics}{chapter.7}
+\BOOKMARK [2][-]{subsection.7.1.1}{Single Digit Exponentiation}{section.7.1}
+\BOOKMARK [1][-]{section.7.2}{k-ary Exponentiation}{chapter.7}
+\BOOKMARK [2][-]{subsection.7.2.1}{Optimal Values of k}{section.7.2}
+\BOOKMARK [2][-]{subsection.7.2.2}{Sliding-Window Exponentiation}{section.7.2}
+\BOOKMARK [1][-]{section.7.3}{Modular Exponentiation}{chapter.7}
+\BOOKMARK [2][-]{subsection.7.3.1}{Barrett Modular Exponentiation}{section.7.3}
+\BOOKMARK [1][-]{section.7.4}{Quick Power of Two}{chapter.7}
+\BOOKMARK [0][-]{chapter.8}{Higher Level Algorithms}{}
+\BOOKMARK [1][-]{section.8.1}{Integer Division with Remainder}{chapter.8}
+\BOOKMARK [2][-]{subsection.8.1.1}{Quotient Estimation}{section.8.1}
+\BOOKMARK [2][-]{subsection.8.1.2}{Normalized Integers}{section.8.1}
+\BOOKMARK [2][-]{subsection.8.1.3}{Radix- Division with Remainder}{section.8.1}
+\BOOKMARK [1][-]{section.8.2}{Single Digit Helpers}{chapter.8}
+\BOOKMARK [2][-]{subsection.8.2.1}{Single Digit Addition and Subtraction}{section.8.2}
+\BOOKMARK [2][-]{subsection.8.2.2}{Single Digit Multiplication}{section.8.2}
+\BOOKMARK [2][-]{subsection.8.2.3}{Single Digit Division}{section.8.2}
+\BOOKMARK [2][-]{subsection.8.2.4}{Single Digit Root Extraction}{section.8.2}
+\BOOKMARK [1][-]{section.8.3}{Random Number Generation}{chapter.8}
+\BOOKMARK [1][-]{section.8.4}{Formatted Representations}{chapter.8}
+\BOOKMARK [2][-]{subsection.8.4.1}{Reading Radix-n Input}{section.8.4}
+\BOOKMARK [2][-]{subsection.8.4.2}{Generating Radix-n Output}{section.8.4}
+\BOOKMARK [0][-]{chapter.9}{Number Theoretic Algorithms}{}
+\BOOKMARK [1][-]{section.9.1}{Greatest Common Divisor}{chapter.9}
+\BOOKMARK [2][-]{subsection.9.1.1}{Complete Greatest Common Divisor}{section.9.1}
+\BOOKMARK [1][-]{section.9.2}{Least Common Multiple}{chapter.9}
+\BOOKMARK [1][-]{section.9.3}{Jacobi Symbol Computation}{chapter.9}
+\BOOKMARK [2][-]{subsection.9.3.1}{Jacobi Symbol}{section.9.3}
+\BOOKMARK [1][-]{section.9.4}{Modular Inverse}{chapter.9}
+\BOOKMARK [2][-]{subsection.9.4.1}{General Case}{section.9.4}
+\BOOKMARK [1][-]{section.9.5}{Primality Tests}{chapter.9}
+\BOOKMARK [2][-]{subsection.9.5.1}{Trial Division}{section.9.5}
+\BOOKMARK [2][-]{subsection.9.5.2}{The Fermat Test}{section.9.5}
+\BOOKMARK [2][-]{subsection.9.5.3}{The Miller-Rabin Test}{section.9.5}
diff --git a/libtommath/tommath.src b/libtommath/tommath.src
new file mode 100644
index 0000000..7a53860
--- /dev/null
+++ b/libtommath/tommath.src
@@ -0,0 +1,6353 @@
+\documentclass[b5paper]{book}
+\usepackage{hyperref}
+\usepackage{makeidx}
+\usepackage{amssymb}
+\usepackage{color}
+\usepackage{alltt}
+\usepackage{graphicx}
+\usepackage{layout}
+\def\union{\cup}
+\def\intersect{\cap}
+\def\getsrandom{\stackrel{\rm R}{\gets}}
+\def\cross{\times}
+\def\cat{\hspace{0.5em} \| \hspace{0.5em}}
+\def\catn{$\|$}
+\def\divides{\hspace{0.3em} | \hspace{0.3em}}
+\def\nequiv{\not\equiv}
+\def\approx{\raisebox{0.2ex}{\mbox{\small $\sim$}}}
+\def\lcm{{\rm lcm}}
+\def\gcd{{\rm gcd}}
+\def\log{{\rm log}}
+\def\ord{{\rm ord}}
+\def\abs{{\mathit abs}}
+\def\rep{{\mathit rep}}
+\def\mod{{\mathit\ mod\ }}
+\renewcommand{\pmod}[1]{\ ({\rm mod\ }{#1})}
+\newcommand{\floor}[1]{\left\lfloor{#1}\right\rfloor}
+\newcommand{\ceil}[1]{\left\lceil{#1}\right\rceil}
+\def\Or{{\rm\ or\ }}
+\def\And{{\rm\ and\ }}
+\def\iff{\hspace{1em}\Longleftrightarrow\hspace{1em}}
+\def\implies{\Rightarrow}
+\def\undefined{{\rm ``undefined"}}
+\def\Proof{\vspace{1ex}\noindent {\bf Proof:}\hspace{1em}}
+\let\oldphi\phi
+\def\phi{\varphi}
+\def\Pr{{\rm Pr}}
+\newcommand{\str}[1]{{\mathbf{#1}}}
+\def\F{{\mathbb F}}
+\def\N{{\mathbb N}}
+\def\Z{{\mathbb Z}}
+\def\R{{\mathbb R}}
+\def\C{{\mathbb C}}
+\def\Q{{\mathbb Q}}
+\definecolor{DGray}{gray}{0.5}
+\newcommand{\emailaddr}[1]{\mbox{$<${#1}$>$}}
+\def\twiddle{\raisebox{0.3ex}{\mbox{\tiny $\sim$}}}
+\def\gap{\vspace{0.5ex}}
+\makeindex
+\begin{document}
+\frontmatter
+\pagestyle{empty}
+\title{Multi--Precision Math}
+\author{\mbox{
+%\begin{small}
+\begin{tabular}{c}
+Tom St Denis \\
+Algonquin College \\
+\\
+Mads Rasmussen \\
+Open Communications Security \\
+\\
+Greg Rose \\
+QUALCOMM Australia \\
+\end{tabular}
+%\end{small}
+}
+}
+\maketitle
+This text has been placed in the public domain.  This text corresponds to the v0.35 release of the 
+LibTomMath project.
+
+\begin{alltt}
+Tom St Denis
+111 Banning Rd
+Ottawa, Ontario
+K2L 1C3
+Canada
+
+Phone: 1-613-836-3160
+Email: tomstdenis@iahu.ca
+\end{alltt}
+
+This text is formatted to the international B5 paper size of 176mm wide by 250mm tall using the \LaTeX{} 
+{\em book} macro package and the Perl {\em booker} package.
+
+\tableofcontents
+\listoffigures
+\chapter*{Prefaces}
+When I tell people about my LibTom projects and that I release them as public domain they are often puzzled.  
+They ask why I did it and especially why I continue to work on them for free.  The best I can explain it is ``Because I can.''  
+Which seems odd and perhaps too terse for adult conversation. I often qualify it with ``I am able, I am willing.'' which 
+perhaps explains it better.  I am the first to admit there is not anything that special with what I have done.  Perhaps
+others can see that too and then we would have a society to be proud of.  My LibTom projects are what I am doing to give 
+back to society in the form of tools and knowledge that can help others in their endeavours.
+
+I started writing this book because it was the most logical task to further my goal of open academia.  The LibTomMath source
+code itself was written to be easy to follow and learn from.  There are times, however, where pure C source code does not
+explain the algorithms properly.  Hence this book.  The book literally starts with the foundation of the library and works
+itself outwards to the more complicated algorithms.  The use of both pseudo--code and verbatim source code provides a duality
+of ``theory'' and ``practice'' that the computer science students of the world shall appreciate.  I never deviate too far
+from relatively straightforward algebra and I hope that this book can be a valuable learning asset.
+
+This book and indeed much of the LibTom projects would not exist in their current form if it was not for a plethora
+of kind people donating their time, resources and kind words to help support my work.  Writing a text of significant
+length (along with the source code) is a tiresome and lengthy process.  Currently the LibTom project is four years old,
+comprises of literally thousands of users and over 100,000 lines of source code, TeX and other material.  People like Mads and Greg 
+were there at the beginning to encourage me to work well.  It is amazing how timely validation from others can boost morale to 
+continue the project. Definitely my parents were there for me by providing room and board during the many months of work in 2003.  
+
+To my many friends whom I have met through the years I thank you for the good times and the words of encouragement.  I hope I
+honour your kind gestures with this project.
+
+Open Source.  Open Academia.  Open Minds.
+
+\begin{flushright} Tom St Denis \end{flushright}
+
+\newpage
+I found the opportunity to work with Tom appealing for several reasons, not only could I broaden my own horizons, but also 
+contribute to educate others facing the problem of having to handle big number mathematical calculations.
+
+This book is Tom's child and he has been caring and fostering the project ever since the beginning with a clear mind of 
+how he wanted the project to turn out. I have helped by proofreading the text and we have had several discussions about 
+the layout and language used.
+
+I hold a masters degree in cryptography from the University of Southern Denmark and have always been interested in the 
+practical aspects of cryptography. 
+
+Having worked in the security consultancy business for several years in S\~{a}o Paulo, Brazil, I have been in touch with a 
+great deal of work in which multiple precision mathematics was needed. Understanding the possibilities for speeding up 
+multiple precision calculations is often very important since we deal with outdated machine architecture where modular 
+reductions, for example, become painfully slow.
+
+This text is for people who stop and wonder when first examining algorithms such as RSA for the first time and asks 
+themselves, ``You tell me this is only secure for large numbers, fine; but how do you implement these numbers?''
+
+\begin{flushright}
+Mads Rasmussen
+
+S\~{a}o Paulo - SP
+
+Brazil
+\end{flushright}
+
+\newpage
+It's all because I broke my leg. That just happened to be at about the same time that Tom asked for someone to review the section of the book about 
+Karatsuba multiplication. I was laid up, alone and immobile, and thought ``Why not?'' I vaguely knew what Karatsuba multiplication was, but not 
+really, so I thought I could help, learn, and stop myself from watching daytime cable TV, all at once.
+
+At the time of writing this, I've still not met Tom or Mads in meatspace. I've been following Tom's progress since his first splash on the 
+sci.crypt Usenet news group. I watched him go from a clueless newbie, to the cryptographic equivalent of a reformed smoker, to a real
+contributor to the field, over a period of about two years. I've been impressed with his obvious intelligence, and astounded by his productivity. 
+Of course, he's young enough to be my own child, so he doesn't have my problems with staying awake.
+
+When I reviewed that single section of the book, in its very earliest form, I was very pleasantly surprised. So I decided to collaborate more fully, 
+and at least review all of it, and perhaps write some bits too. There's still a long way to go with it, and I have watched a number of close 
+friends go through the mill of publication, so I think that the way to go is longer than Tom thinks it is. Nevertheless, it's a good effort, 
+and I'm pleased to be involved with it.
+
+\begin{flushright}
+Greg Rose, Sydney, Australia, June 2003. 
+\end{flushright}
+
+\mainmatter
+\pagestyle{headings}
+\chapter{Introduction}
+\section{Multiple Precision Arithmetic}
+
+\subsection{What is Multiple Precision Arithmetic?}
+When we think of long-hand arithmetic such as addition or multiplication we rarely consider the fact that we instinctively
+raise or lower the precision of the numbers we are dealing with.  For example, in decimal we almost immediate can 
+reason that $7$ times $6$ is $42$.  However, $42$ has two digits of precision as opposed to one digit we started with.  
+Further multiplications of say $3$ result in a larger precision result $126$.  In these few examples we have multiple 
+precisions for the numbers we are working with.  Despite the various levels of precision a single subset\footnote{With the occasional optimization.}
+ of algorithms can be designed to accomodate them.  
+
+By way of comparison a fixed or single precision operation would lose precision on various operations.  For example, in
+the decimal system with fixed precision $6 \cdot 7 = 2$.
+
+Essentially at the heart of computer based multiple precision arithmetic are the same long-hand algorithms taught in
+schools to manually add, subtract, multiply and divide.  
+
+\subsection{The Need for Multiple Precision Arithmetic}
+The most prevalent need for multiple precision arithmetic, often referred to as ``bignum'' math, is within the implementation
+of public-key cryptography algorithms.   Algorithms such as RSA \cite{RSAREF} and Diffie-Hellman \cite{DHREF} require 
+integers of significant magnitude to resist known cryptanalytic attacks.  For example, at the time of this writing a 
+typical RSA modulus would be at least greater than $10^{309}$.  However, modern programming languages such as ISO C \cite{ISOC} and 
+Java \cite{JAVA} only provide instrinsic support for integers which are relatively small and single precision.
+
+\begin{figure}[!here]
+\begin{center}
+\begin{tabular}{|r|c|}
+\hline \textbf{Data Type} & \textbf{Range} \\
+\hline char  & $-128 \ldots 127$ \\
+\hline short & $-32768 \ldots 32767$ \\
+\hline long  & $-2147483648 \ldots 2147483647$ \\
+\hline long long & $-9223372036854775808 \ldots 9223372036854775807$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Typical Data Types for the C Programming Language}
+\label{fig:ISOC}
+\end{figure}
+
+The largest data type guaranteed to be provided by the ISO C programming 
+language\footnote{As per the ISO C standard.  However, each compiler vendor is allowed to augment the precision as they 
+see fit.}  can only represent values up to $10^{19}$ as shown in figure \ref{fig:ISOC}. On its own the C language is 
+insufficient to accomodate the magnitude required for the problem at hand.  An RSA modulus of magnitude $10^{19}$ could be 
+trivially factored\footnote{A Pollard-Rho factoring would take only $2^{16}$ time.} on the average desktop computer, 
+rendering any protocol based on the algorithm insecure.  Multiple precision algorithms solve this very problem by 
+extending the range of representable integers while using single precision data types.
+
+Most advancements in fast multiple precision arithmetic stem from the need for faster and more efficient cryptographic 
+primitives.  Faster modular reduction and exponentiation algorithms such as Barrett's algorithm, which have appeared in 
+various cryptographic journals, can render algorithms such as RSA and Diffie-Hellman more efficient.  In fact, several 
+major companies such as RSA Security, Certicom and Entrust have built entire product lines on the implementation and 
+deployment of efficient algorithms.
+
+However, cryptography is not the only field of study that can benefit from fast multiple precision integer routines.  
+Another auxiliary use of multiple precision integers is high precision floating point data types.  
+The basic IEEE \cite{IEEE} standard floating point type is made up of an integer mantissa $q$, an exponent $e$ and a sign bit $s$.  
+Numbers are given in the form $n = q \cdot b^e \cdot -1^s$ where $b = 2$ is the most common base for IEEE.  Since IEEE 
+floating point is meant to be implemented in hardware the precision of the mantissa is often fairly small 
+(\textit{23, 48 and 64 bits}).  The mantissa is merely an integer and a multiple precision integer could be used to create
+a mantissa of much larger precision than hardware alone can efficiently support.  This approach could be useful where 
+scientific applications must minimize the total output error over long calculations.
+
+Yet another use for large integers is within arithmetic on polynomials of large characteristic (i.e. $GF(p)[x]$ for large $p$).
+In fact the library discussed within this text has already been used to form a polynomial basis library\footnote{See \url{http://poly.libtomcrypt.org} for more details.}.
+
+\subsection{Benefits of Multiple Precision Arithmetic}
+\index{precision}
+The benefit of multiple precision representations over single or fixed precision representations is that 
+no precision is lost while representing the result of an operation which requires excess precision.  For example, 
+the product of two $n$-bit integers requires at least $2n$ bits of precision to be represented faithfully.  A multiple 
+precision algorithm would augment the precision of the destination to accomodate the result while a single precision system 
+would truncate excess bits to maintain a fixed level of precision.
+
+It is possible to implement algorithms which require large integers with fixed precision algorithms.  For example, elliptic
+curve cryptography (\textit{ECC}) is often implemented on smartcards by fixing the precision of the integers to the maximum 
+size the system will ever need.  Such an approach can lead to vastly simpler algorithms which can accomodate the 
+integers required even if the host platform cannot natively accomodate them\footnote{For example, the average smartcard 
+processor has an 8 bit accumulator.}.  However, as efficient as such an approach may be, the resulting source code is not
+normally very flexible.  It cannot, at runtime, accomodate inputs of higher magnitude than the designer anticipated.
+
+Multiple precision algorithms have the most overhead of any style of arithmetic.  For the the most part the 
+overhead can be kept to a minimum with careful planning, but overall, it is not well suited for most memory starved
+platforms.  However, multiple precision algorithms do offer the most flexibility in terms of the magnitude of the 
+inputs.  That is, the same algorithms based on multiple precision integers can accomodate any reasonable size input 
+without the designer's explicit forethought.  This leads to lower cost of ownership for the code as it only has to 
+be written and tested once.
+
+\section{Purpose of This Text}
+The purpose of this text is to instruct the reader regarding how to implement efficient multiple precision algorithms.  
+That is to not only explain a limited subset of the core theory behind the algorithms but also the various ``house keeping'' 
+elements that are neglected by authors of other texts on the subject.  Several well reknowned texts \cite{TAOCPV2,HAC} 
+give considerably detailed explanations of the theoretical aspects of algorithms and often very little information 
+regarding the practical implementation aspects.  
+
+In most cases how an algorithm is explained and how it is actually implemented are two very different concepts.  For 
+example, the Handbook of Applied Cryptography (\textit{HAC}), algorithm 14.7 on page 594, gives a relatively simple 
+algorithm for performing multiple precision integer addition.  However, the description lacks any discussion concerning 
+the fact that the two integer inputs may be of differing magnitudes.  As a result the implementation is not as simple
+as the text would lead people to believe.  Similarly the division routine (\textit{algorithm 14.20, pp. 598}) does not 
+discuss how to handle sign or handle the dividend's decreasing magnitude in the main loop (\textit{step \#3}).
+
+Both texts also do not discuss several key optimal algorithms required such as ``Comba'' and Karatsuba multipliers 
+and fast modular inversion, which we consider practical oversights.  These optimal algorithms are vital to achieve 
+any form of useful performance in non-trivial applications.  
+
+To solve this problem the focus of this text is on the practical aspects of implementing a multiple precision integer
+package.  As a case study the ``LibTomMath''\footnote{Available at \url{http://math.libtomcrypt.org}} package is used 
+to demonstrate algorithms with real implementations\footnote{In the ISO C programming language.} that have been field 
+tested and work very well.  The LibTomMath library is freely available on the Internet for all uses and this text 
+discusses a very large portion of the inner workings of the library.
+
+The algorithms that are presented will always include at least one ``pseudo-code'' description followed 
+by the actual C source code that implements the algorithm.  The pseudo-code can be used to implement the same 
+algorithm in other programming languages as the reader sees fit.  
+
+This text shall also serve as a walkthrough of the creation of multiple precision algorithms from scratch.  Showing
+the reader how the algorithms fit together as well as where to start on various taskings.  
+
+\section{Discussion and Notation}
+\subsection{Notation}
+A multiple precision integer of $n$-digits shall be denoted as $x = (x_{n-1}, \ldots, x_1, x_0)_{ \beta }$ and represent
+the integer $x \equiv \sum_{i=0}^{n-1} x_i\beta^i$.  The elements of the array $x$ are said to be the radix $\beta$ digits 
+of the integer.  For example, $x = (1,2,3)_{10}$ would represent the integer 
+$1\cdot 10^2 + 2\cdot10^1 + 3\cdot10^0 = 123$.  
+
+\index{mp\_int}
+The term ``mp\_int'' shall refer to a composite structure which contains the digits of the integer it represents, as well 
+as auxilary data required to manipulate the data.  These additional members are discussed further in section 
+\ref{sec:MPINT}.  For the purposes of this text a ``multiple precision integer'' and an ``mp\_int'' are assumed to be 
+synonymous.  When an algorithm is specified to accept an mp\_int variable it is assumed the various auxliary data members 
+are present as well.  An expression of the type \textit{variablename.item} implies that it should evaluate to the 
+member named ``item'' of the variable.  For example, a string of characters may have a member ``length'' which would 
+evaluate to the number of characters in the string.  If the string $a$ equals ``hello'' then it follows that 
+$a.length = 5$.  
+
+For certain discussions more generic algorithms are presented to help the reader understand the final algorithm used
+to solve a given problem.  When an algorithm is described as accepting an integer input it is assumed the input is 
+a plain integer with no additional multiple-precision members.  That is, algorithms that use integers as opposed to 
+mp\_ints as inputs do not concern themselves with the housekeeping operations required such as memory management.  These 
+algorithms will be used to establish the relevant theory which will subsequently be used to describe a multiple
+precision algorithm to solve the same problem.  
+
+\subsection{Precision Notation}
+The variable $\beta$ represents the radix of a single digit of a multiple precision integer and 
+must be of the form $q^p$ for $q, p \in \Z^+$.  A single precision variable must be able to represent integers in 
+the range $0 \le x < q \beta$ while a double precision variable must be able to represent integers in the range 
+$0 \le x < q \beta^2$.  The extra radix-$q$ factor allows additions and subtractions to proceed without truncation of the 
+carry.  Since all modern computers are binary, it is assumed that $q$ is two.
+
+\index{mp\_digit} \index{mp\_word}
+Within the source code that will be presented for each algorithm, the data type \textbf{mp\_digit} will represent 
+a single precision integer type, while, the data type \textbf{mp\_word} will represent a double precision integer type.  In 
+several algorithms (notably the Comba routines) temporary results will be stored in arrays of double precision mp\_words.  
+For the purposes of this text $x_j$ will refer to the $j$'th digit of a single precision array and $\hat x_j$ will refer to 
+the $j$'th digit of a double precision array.  Whenever an expression is to be assigned to a double precision
+variable it is assumed that all single precision variables are promoted to double precision during the evaluation.  
+Expressions that are assigned to a single precision variable are truncated to fit within the precision of a single
+precision data type.
+
+For example, if $\beta = 10^2$ a single precision data type may represent a value in the 
+range $0 \le x < 10^3$, while a double precision data type may represent a value in the range $0 \le x < 10^5$.  Let
+$a = 23$ and $b = 49$ represent two single precision variables.  The single precision product shall be written
+as $c \leftarrow a \cdot b$ while the double precision product shall be written as $\hat c \leftarrow a \cdot b$.
+In this particular case, $\hat c = 1127$ and $c = 127$.  The most significant digit of the product would not fit 
+in a single precision data type and as a result $c \ne \hat c$.  
+
+\subsection{Algorithm Inputs and Outputs}
+Within the algorithm descriptions all variables are assumed to be scalars of either single or double precision
+as indicated.  The only exception to this rule is when variables have been indicated to be of type mp\_int.  This 
+distinction is important as scalars are often used as array indicies and various other counters.  
+
+\subsection{Mathematical Expressions}
+The $\lfloor \mbox{ } \rfloor$ brackets imply an expression truncated to an integer not greater than the expression 
+itself.  For example, $\lfloor 5.7 \rfloor = 5$.  Similarly the $\lceil \mbox{ } \rceil$ brackets imply an expression
+rounded to an integer not less than the expression itself.  For example, $\lceil 5.1 \rceil = 6$.  Typically when 
+the $/$ division symbol is used the intention is to perform an integer division with truncation.  For example, 
+$5/2 = 2$ which will often be written as $\lfloor 5/2 \rfloor = 2$ for clarity.  When an expression is written as a 
+fraction a real value division is implied, for example ${5 \over 2} = 2.5$.  
+
+The norm of a multiple precision integer, for example $\vert \vert x \vert \vert$, will be used to represent the number of digits in the representation
+of the integer.  For example, $\vert \vert 123 \vert \vert = 3$ and $\vert \vert 79452 \vert \vert = 5$.  
+
+\subsection{Work Effort}
+\index{big-Oh}
+To measure the efficiency of the specified algorithms, a modified big-Oh notation is used.  In this system all 
+single precision operations are considered to have the same cost\footnote{Except where explicitly noted.}.  
+That is a single precision addition, multiplication and division are assumed to take the same time to 
+complete.  While this is generally not true in practice, it will simplify the discussions considerably.
+
+Some algorithms have slight advantages over others which is why some constants will not be removed in 
+the notation.  For example, a normal baseline multiplication (section \ref{sec:basemult}) requires $O(n^2)$ work while a 
+baseline squaring (section \ref{sec:basesquare}) requires $O({{n^2 + n}\over 2})$ work.  In standard big-Oh notation these 
+would both be said to be equivalent to $O(n^2)$.  However, 
+in the context of the this text this is not the case as the magnitude of the inputs will typically be rather small.  As a 
+result small constant factors in the work effort will make an observable difference in algorithm efficiency.
+
+All of the algorithms presented in this text have a polynomial time work level.  That is, of the form 
+$O(n^k)$ for $n, k \in \Z^{+}$.  This will help make useful comparisons in terms of the speed of the algorithms and how 
+various optimizations will help pay off in the long run.
+
+\section{Exercises}
+Within the more advanced chapters a section will be set aside to give the reader some challenging exercises related to
+the discussion at hand.  These exercises are not designed to be prize winning problems, but instead to be thought 
+provoking.  Wherever possible the problems are forward minded, stating problems that will be answered in subsequent 
+chapters.  The reader is encouraged to finish the exercises as they appear to get a better understanding of the 
+subject material.  
+
+That being said, the problems are designed to affirm knowledge of a particular subject matter.  Students in particular
+are encouraged to verify they can answer the problems correctly before moving on.
+
+Similar to the exercises of \cite[pp. ix]{TAOCPV2} these exercises are given a scoring system based on the difficulty of
+the problem.  However, unlike \cite{TAOCPV2} the problems do not get nearly as hard.  The scoring of these 
+exercises ranges from one (the easiest) to five (the hardest).  The following table sumarizes the 
+scoring system used.
+
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+\begin{tabular}{|c|l|}
+\hline $\left [ 1 \right ]$ & An easy problem that should only take the reader a manner of \\
+                            & minutes to solve.  Usually does not involve much computer time \\
+                            & to solve. \\
+\hline $\left [ 2 \right ]$ & An easy problem that involves a marginal amount of computer \\
+                     & time usage.  Usually requires a program to be written to \\
+                     & solve the problem. \\
+\hline $\left [ 3 \right ]$ & A moderately hard problem that requires a non-trivial amount \\
+                     & of work.  Usually involves trivial research and development of \\
+                     & new theory from the perspective of a student. \\
+\hline $\left [ 4 \right ]$ & A moderately hard problem that involves a non-trivial amount \\
+                     & of work and research, the solution to which will demonstrate \\
+                     & a higher mastery of the subject matter. \\
+\hline $\left [ 5 \right ]$ & A hard problem that involves concepts that are difficult for a \\
+                     & novice to solve.  Solutions to these problems will demonstrate a \\
+                     & complete mastery of the given subject. \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Exercise Scoring System}
+\end{figure}
+
+Problems at the first level are meant to be simple questions that the reader can answer quickly without programming a solution or
+devising new theory.  These problems are quick tests to see if the material is understood.  Problems at the second level 
+are also designed to be easy but will require a program or algorithm to be implemented to arrive at the answer.  These
+two levels are essentially entry level questions.  
+
+Problems at the third level are meant to be a bit more difficult than the first two levels.  The answer is often 
+fairly obvious but arriving at an exacting solution requires some thought and skill.  These problems will almost always 
+involve devising a new algorithm or implementing a variation of another algorithm previously presented.  Readers who can
+answer these questions will feel comfortable with the concepts behind the topic at hand.
+
+Problems at the fourth level are meant to be similar to those of the level three questions except they will require 
+additional research to be completed.  The reader will most likely not know the answer right away, nor will the text provide 
+the exact details of the answer until a subsequent chapter.  
+
+Problems at the fifth level are meant to be the hardest 
+problems relative to all the other problems in the chapter.  People who can correctly answer fifth level problems have a 
+mastery of the subject matter at hand.
+
+Often problems will be tied together.  The purpose of this is to start a chain of thought that will be discussed in future chapters.  The reader
+is encouraged to answer the follow-up problems and try to draw the relevance of problems.
+
+\section{Introduction to LibTomMath}
+
+\subsection{What is LibTomMath?}
+LibTomMath is a free and open source multiple precision integer library written entirely in portable ISO C.  By portable it 
+is meant that the library does not contain any code that is computer platform dependent or otherwise problematic to use on 
+any given platform.  
+
+The library has been successfully tested under numerous operating systems including Unix\footnote{All of these
+trademarks belong to their respective rightful owners.}, MacOS, Windows, Linux, PalmOS and on standalone hardware such 
+as the Gameboy Advance.  The library is designed to contain enough functionality to be able to develop applications such 
+as public key cryptosystems and still maintain a relatively small footprint.
+
+\subsection{Goals of LibTomMath}
+
+Libraries which obtain the most efficiency are rarely written in a high level programming language such as C.  However, 
+even though this library is written entirely in ISO C, considerable care has been taken to optimize the algorithm implementations within the 
+library.  Specifically the code has been written to work well with the GNU C Compiler (\textit{GCC}) on both x86 and ARM 
+processors.  Wherever possible, highly efficient algorithms, such as Karatsuba multiplication, sliding window 
+exponentiation and Montgomery reduction have been provided to make the library more efficient.  
+
+Even with the nearly optimal and specialized algorithms that have been included the Application Programing Interface 
+(\textit{API}) has been kept as simple as possible.  Often generic place holder routines will make use of specialized 
+algorithms automatically without the developer's specific attention.  One such example is the generic multiplication 
+algorithm \textbf{mp\_mul()} which will automatically use Toom--Cook, Karatsuba, Comba or baseline multiplication 
+based on the magnitude of the inputs and the configuration of the library.  
+
+Making LibTomMath as efficient as possible is not the only goal of the LibTomMath project.  Ideally the library should 
+be source compatible with another popular library which makes it more attractive for developers to use.  In this case the
+MPI library was used as a API template for all the basic functions.  MPI was chosen because it is another library that fits 
+in the same niche as LibTomMath.  Even though LibTomMath uses MPI as the template for the function names and argument 
+passing conventions, it has been written from scratch by Tom St Denis.
+
+The project is also meant to act as a learning tool for students, the logic being that no easy-to-follow ``bignum'' 
+library exists which can be used to teach computer science students how to perform fast and reliable multiple precision 
+integer arithmetic.  To this end the source code has been given quite a few comments and algorithm discussion points.  
+
+\section{Choice of LibTomMath}
+LibTomMath was chosen as the case study of this text not only because the author of both projects is one and the same but
+for more worthy reasons.  Other libraries such as GMP \cite{GMP}, MPI \cite{MPI}, LIP \cite{LIP} and OpenSSL 
+\cite{OPENSSL} have multiple precision integer arithmetic routines but would not be ideal for this text for 
+reasons that will be explained in the following sub-sections.
+
+\subsection{Code Base}
+The LibTomMath code base is all portable ISO C source code.  This means that there are no platform dependent conditional
+segments of code littered throughout the source.  This clean and uncluttered approach to the library means that a
+developer can more readily discern the true intent of a given section of source code without trying to keep track of
+what conditional code will be used.
+
+The code base of LibTomMath is well organized.  Each function is in its own separate source code file 
+which allows the reader to find a given function very quickly.  On average there are $76$ lines of code per source
+file which makes the source very easily to follow.  By comparison MPI and LIP are single file projects making code tracing
+very hard.  GMP has many conditional code segments which also hinder tracing.  
+
+When compiled with GCC for the x86 processor and optimized for speed the entire library is approximately $100$KiB\footnote{The notation ``KiB'' means $2^{10}$ octets, similarly ``MiB'' means $2^{20}$ octets.}
+ which is fairly small compared to GMP (over $250$KiB).  LibTomMath is slightly larger than MPI (which compiles to about 
+$50$KiB) but LibTomMath is also much faster and more complete than MPI.
+
+\subsection{API Simplicity}
+LibTomMath is designed after the MPI library and shares the API design.  Quite often programs that use MPI will build 
+with LibTomMath without change. The function names correlate directly to the action they perform.  Almost all of the 
+functions share the same parameter passing convention.  The learning curve is fairly shallow with the API provided 
+which is an extremely valuable benefit for the student and developer alike.  
+
+The LIP library is an example of a library with an API that is awkward to work with.  LIP uses function names that are often ``compressed'' to 
+illegible short hand.  LibTomMath does not share this characteristic.  
+
+The GMP library also does not return error codes.  Instead it uses a POSIX.1 \cite{POSIX1} signal system where errors
+are signaled to the host application.  This happens to be the fastest approach but definitely not the most versatile.  In
+effect a math error (i.e. invalid input, heap error, etc) can cause a program to stop functioning which is definitely 
+undersireable in many situations.
+
+\subsection{Optimizations}
+While LibTomMath is certainly not the fastest library (GMP often beats LibTomMath by a factor of two) it does
+feature a set of optimal algorithms for tasks such as modular reduction, exponentiation, multiplication and squaring.  GMP 
+and LIP also feature such optimizations while MPI only uses baseline algorithms with no optimizations.  GMP lacks a few
+of the additional modular reduction optimizations that LibTomMath features\footnote{At the time of this writing GMP
+only had Barrett and Montgomery modular reduction algorithms.}.  
+
+LibTomMath is almost always an order of magnitude faster than the MPI library at computationally expensive tasks such as modular
+exponentiation.  In the grand scheme of ``bignum'' libraries LibTomMath is faster than the average library and usually  
+slower than the best libraries such as GMP and OpenSSL by only a small factor.
+
+\subsection{Portability and Stability}
+LibTomMath will build ``out of the box'' on any platform equipped with a modern version of the GNU C Compiler 
+(\textit{GCC}).  This means that without changes the library will build without configuration or setting up any 
+variables.  LIP and MPI will build ``out of the box'' as well but have numerous known bugs.  Most notably the author of 
+MPI has recently stopped working on his library and LIP has long since been discontinued.  
+
+GMP requires a configuration script to run and will not build out of the box.   GMP and LibTomMath are still in active
+development and are very stable across a variety of platforms.
+
+\subsection{Choice}
+LibTomMath is a relatively compact, well documented, highly optimized and portable library which seems only natural for
+the case study of this text.  Various source files from the LibTomMath project will be included within the text.  However, 
+the reader is encouraged to download their own copy of the library to actually be able to work with the library.  
+
+\chapter{Getting Started}
+\section{Library Basics}
+The trick to writing any useful library of source code is to build a solid foundation and work outwards from it.  First, 
+a problem along with allowable solution parameters should be identified and analyzed.  In this particular case the 
+inability to accomodate multiple precision integers is the problem.  Futhermore, the solution must be written
+as portable source code that is reasonably efficient across several different computer platforms.
+
+After a foundation is formed the remainder of the library can be designed and implemented in a hierarchical fashion.  
+That is, to implement the lowest level dependencies first and work towards the most abstract functions last.  For example, 
+before implementing a modular exponentiation algorithm one would implement a modular reduction algorithm.
+By building outwards from a base foundation instead of using a parallel design methodology the resulting project is 
+highly modular.  Being highly modular is a desirable property of any project as it often means the resulting product
+has a small footprint and updates are easy to perform.  
+
+Usually when I start a project I will begin with the header files.  I define the data types I think I will need and 
+prototype the initial functions that are not dependent on other functions (within the library).  After I 
+implement these base functions I prototype more dependent functions and implement them.   The process repeats until
+I implement all of the functions I require.  For example, in the case of LibTomMath I implemented functions such as 
+mp\_init() well before I implemented mp\_mul() and even further before I implemented mp\_exptmod().  As an example as to 
+why this design works note that the Karatsuba and Toom-Cook multipliers were written \textit{after} the 
+dependent function mp\_exptmod() was written.  Adding the new multiplication algorithms did not require changes to the 
+mp\_exptmod() function itself and lowered the total cost of ownership (\textit{so to speak}) and of development 
+for new algorithms.  This methodology allows new algorithms to be tested in a complete framework with relative ease.
+
+FIGU,design_process,Design Flow of the First Few Original LibTomMath Functions.
+
+Only after the majority of the functions were in place did I pursue a less hierarchical approach to auditing and optimizing
+the source code.  For example, one day I may audit the multipliers and the next day the polynomial basis functions.  
+
+It only makes sense to begin the text with the preliminary data types and support algorithms required as well.  
+This chapter discusses the core algorithms of the library which are the dependents for every other algorithm.
+
+\section{What is a Multiple Precision Integer?}
+Recall that most programming languages, in particular ISO C \cite{ISOC}, only have fixed precision data types that on their own cannot 
+be used to represent values larger than their precision will allow. The purpose of multiple precision algorithms is 
+to use fixed precision data types to create and manipulate multiple precision integers which may represent values 
+that are very large.  
+
+As a well known analogy, school children are taught how to form numbers larger than nine by prepending more radix ten digits.  In the decimal system
+the largest single digit value is $9$.  However, by concatenating digits together larger numbers may be represented.  Newly prepended digits 
+(\textit{to the left}) are said to be in a different power of ten column.  That is, the number $123$ can be described as having a $1$ in the hundreds 
+column, $2$ in the tens column and $3$ in the ones column.  Or more formally $123 = 1 \cdot 10^2 + 2 \cdot 10^1 + 3 \cdot 10^0$.  Computer based 
+multiple precision arithmetic is essentially the same concept.  Larger integers are represented by adjoining fixed 
+precision computer words with the exception that a different radix is used.
+
+What most people probably do not think about explicitly are the various other attributes that describe a multiple precision 
+integer.  For example, the integer $154_{10}$ has two immediately obvious properties.  First, the integer is positive, 
+that is the sign of this particular integer is positive as opposed to negative.  Second, the integer has three digits in 
+its representation.  There is an additional property that the integer posesses that does not concern pencil-and-paper 
+arithmetic.  The third property is how many digits placeholders are available to hold the integer.  
+
+The human analogy of this third property is ensuring there is enough space on the paper to write the integer.  For example,
+if one starts writing a large number too far to the right on a piece of paper they will have to erase it and move left.  
+Similarly, computer algorithms must maintain strict control over memory usage to ensure that the digits of an integer
+will not exceed the allowed boundaries.  These three properties make up what is known as a multiple precision 
+integer or mp\_int for short.  
+
+\subsection{The mp\_int Structure}
+\label{sec:MPINT}
+The mp\_int structure is the ISO C based manifestation of what represents a multiple precision integer.  The ISO C standard does not provide for 
+any such data type but it does provide for making composite data types known as structures.  The following is the structure definition 
+used within LibTomMath.
+
+\index{mp\_int}
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+%\begin{verbatim}
+\begin{tabular}{|l|}
+\hline
+typedef struct \{ \\
+\hspace{3mm}int used, alloc, sign;\\
+\hspace{3mm}mp\_digit *dp;\\
+\} \textbf{mp\_int}; \\
+\hline
+\end{tabular}
+%\end{verbatim}
+\end{small}
+\caption{The mp\_int Structure}
+\label{fig:mpint}
+\end{center}
+\end{figure}
+
+The mp\_int structure (fig. \ref{fig:mpint}) can be broken down as follows.
+
+\begin{enumerate}
+\item The \textbf{used} parameter denotes how many digits of the array \textbf{dp} contain the digits used to represent
+a given integer.  The \textbf{used} count must be positive (or zero) and may not exceed the \textbf{alloc} count.  
+
+\item The \textbf{alloc} parameter denotes how 
+many digits are available in the array to use by functions before it has to increase in size.  When the \textbf{used} count 
+of a result would exceed the \textbf{alloc} count all of the algorithms will automatically increase the size of the 
+array to accommodate the precision of the result.  
+
+\item The pointer \textbf{dp} points to a dynamically allocated array of digits that represent the given multiple 
+precision integer.  It is padded with $(\textbf{alloc} - \textbf{used})$ zero digits.  The array is maintained in a least 
+significant digit order.  As a pencil and paper analogy the array is organized such that the right most digits are stored
+first starting at the location indexed by zero\footnote{In C all arrays begin at zero.} in the array.  For example, 
+if \textbf{dp} contains $\lbrace a, b, c, \ldots \rbrace$ where \textbf{dp}$_0 = a$, \textbf{dp}$_1 = b$, \textbf{dp}$_2 = c$, $\ldots$ then 
+it would represent the integer $a + b\beta + c\beta^2 + \ldots$  
+
+\index{MP\_ZPOS} \index{MP\_NEG}
+\item The \textbf{sign} parameter denotes the sign as either zero/positive (\textbf{MP\_ZPOS}) or negative (\textbf{MP\_NEG}).  
+\end{enumerate}
+
+\subsubsection{Valid mp\_int Structures}
+Several rules are placed on the state of an mp\_int structure and are assumed to be followed for reasons of efficiency.  
+The only exceptions are when the structure is passed to initialization functions such as mp\_init() and mp\_init\_copy().
+
+\begin{enumerate}
+\item The value of \textbf{alloc} may not be less than one.  That is \textbf{dp} always points to a previously allocated
+array of digits.
+\item The value of \textbf{used} may not exceed \textbf{alloc} and must be greater than or equal to zero.
+\item The value of \textbf{used} implies the digit at index $(used - 1)$ of the \textbf{dp} array is non-zero.  That is, 
+leading zero digits in the most significant positions must be trimmed.
+   \begin{enumerate}
+   \item Digits in the \textbf{dp} array at and above the \textbf{used} location must be zero.
+   \end{enumerate}
+\item The value of \textbf{sign} must be \textbf{MP\_ZPOS} if \textbf{used} is zero; 
+this represents the mp\_int value of zero.
+\end{enumerate}
+
+\section{Argument Passing}
+A convention of argument passing must be adopted early on in the development of any library.  Making the function 
+prototypes consistent will help eliminate many headaches in the future as the library grows to significant complexity.  
+In LibTomMath the multiple precision integer functions accept parameters from left to right as pointers to mp\_int 
+structures.  That means that the source (input) operands are placed on the left and the destination (output) on the right.   
+Consider the following examples.
+
+\begin{verbatim}
+   mp_mul(&a, &b, &c);   /* c = a * b */
+   mp_add(&a, &b, &a);   /* a = a + b */
+   mp_sqr(&a, &b);       /* b = a * a */
+\end{verbatim}
+
+The left to right order is a fairly natural way to implement the functions since it lets the developer read aloud the
+functions and make sense of them.  For example, the first function would read ``multiply a and b and store in c''.
+
+Certain libraries (\textit{LIP by Lenstra for instance}) accept parameters the other way around, to mimic the order
+of assignment expressions.  That is, the destination (output) is on the left and arguments (inputs) are on the right.  In 
+truth, it is entirely a matter of preference.  In the case of LibTomMath the convention from the MPI library has been 
+adopted.  
+
+Another very useful design consideration, provided for in LibTomMath, is whether to allow argument sources to also be a 
+destination.  For example, the second example (\textit{mp\_add}) adds $a$ to $b$ and stores in $a$.  This is an important 
+feature to implement since it allows the calling functions to cut down on the number of variables it must maintain.  
+However, to implement this feature specific care has to be given to ensure the destination is not modified before the 
+source is fully read.
+
+\section{Return Values}
+A well implemented application, no matter what its purpose, should trap as many runtime errors as possible and return them 
+to the caller.  By catching runtime errors a library can be guaranteed to prevent undefined behaviour.  However, the end 
+developer can still manage to cause a library to crash.  For example, by passing an invalid pointer an application may
+fault by dereferencing memory not owned by the application.
+
+In the case of LibTomMath the only errors that are checked for are related to inappropriate inputs (division by zero for 
+instance) and memory allocation errors.  It will not check that the mp\_int passed to any function is valid nor 
+will it check pointers for validity.  Any function that can cause a runtime error will return an error code as an 
+\textbf{int} data type with one of the following values (fig \ref{fig:errcodes}).
+
+\index{MP\_OKAY} \index{MP\_VAL} \index{MP\_MEM}
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{|l|l|}
+\hline \textbf{Value} & \textbf{Meaning} \\
+\hline \textbf{MP\_OKAY} & The function was successful \\
+\hline \textbf{MP\_VAL}  & One of the input value(s) was invalid \\
+\hline \textbf{MP\_MEM}  & The function ran out of heap memory \\
+\hline
+\end{tabular}
+\end{center}
+\caption{LibTomMath Error Codes}
+\label{fig:errcodes}
+\end{figure}
+
+When an error is detected within a function it should free any memory it allocated, often during the initialization of
+temporary mp\_ints, and return as soon as possible.  The goal is to leave the system in the same state it was when the 
+function was called.  Error checking with this style of API is fairly simple.
+
+\begin{verbatim}
+   int err;
+   if ((err = mp_add(&a, &b, &c)) != MP_OKAY) {
+      printf("Error: %s\n", mp_error_to_string(err));
+      exit(EXIT_FAILURE);
+   }
+\end{verbatim}
+
+The GMP \cite{GMP} library uses C style \textit{signals} to flag errors which is of questionable use.  Not all errors are fatal 
+and it was not deemed ideal by the author of LibTomMath to force developers to have signal handlers for such cases.
+
+\section{Initialization and Clearing}
+The logical starting point when actually writing multiple precision integer functions is the initialization and 
+clearing of the mp\_int structures.  These two algorithms will be used by the majority of the higher level algorithms.
+
+Given the basic mp\_int structure an initialization routine must first allocate memory to hold the digits of
+the integer.  Often it is optimal to allocate a sufficiently large pre-set number of digits even though
+the initial integer will represent zero.  If only a single digit were allocated quite a few subsequent re-allocations
+would occur when operations are performed on the integers.  There is a tradeoff between how many default digits to allocate
+and how many re-allocations are tolerable.  Obviously allocating an excessive amount of digits initially will waste 
+memory and become unmanageable.  
+
+If the memory for the digits has been successfully allocated then the rest of the members of the structure must
+be initialized.  Since the initial state of an mp\_int is to represent the zero integer, the allocated digits must be set
+to zero.  The \textbf{used} count set to zero and \textbf{sign} set to \textbf{MP\_ZPOS}.
+
+\subsection{Initializing an mp\_int}
+An mp\_int is said to be initialized if it is set to a valid, preferably default, state such that all of the members of the
+structure are set to valid values.  The mp\_init algorithm will perform such an action.
+
+\index{mp\_init}
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Allocate memory and initialize $a$ to a known valid mp\_int state.  \\
+\hline \\
+1.  Allocate memory for \textbf{MP\_PREC} digits. \\
+2.  If the allocation failed return(\textit{MP\_MEM}) \\
+3.  for $n$ from $0$ to $MP\_PREC - 1$ do  \\
+\hspace{3mm}3.1  $a_n \leftarrow 0$\\
+4.  $a.sign \leftarrow MP\_ZPOS$\\
+5.  $a.used \leftarrow 0$\\
+6.  $a.alloc \leftarrow MP\_PREC$\\
+7.  Return(\textit{MP\_OKAY})\\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_init}
+\end{figure}
+
+\textbf{Algorithm mp\_init.}
+The purpose of this function is to initialize an mp\_int structure so that the rest of the library can properly
+manipulte it.  It is assumed that the input may not have had any of its members previously initialized which is certainly
+a valid assumption if the input resides on the stack.  
+
+Before any of the members such as \textbf{sign}, \textbf{used} or \textbf{alloc} are initialized the memory for
+the digits is allocated.  If this fails the function returns before setting any of the other members.  The \textbf{MP\_PREC} 
+name represents a constant\footnote{Defined in the ``tommath.h'' header file within LibTomMath.} 
+used to dictate the minimum precision of newly initialized mp\_int integers.  Ideally, it is at least equal to the smallest
+precision number you'll be working with.
+
+Allocating a block of digits at first instead of a single digit has the benefit of lowering the number of usually slow
+heap operations later functions will have to perform in the future.  If \textbf{MP\_PREC} is set correctly the slack 
+memory and the number of heap operations will be trivial.
+
+Once the allocation has been made the digits have to be set to zero as well as the \textbf{used}, \textbf{sign} and
+\textbf{alloc} members initialized.  This ensures that the mp\_int will always represent the default state of zero regardless
+of the original condition of the input.
+
+\textbf{Remark.}
+This function introduces the idiosyncrasy that all iterative loops, commonly initiated with the ``for'' keyword, iterate incrementally
+when the ``to'' keyword is placed between two expressions.  For example, ``for $a$ from $b$ to $c$ do'' means that
+a subsequent expression (or body of expressions) are to be evaluated upto $c - b$ times so long as $b \le c$.  In each
+iteration the variable $a$ is substituted for a new integer that lies inclusively between $b$ and $c$.  If $b > c$ occured
+the loop would not iterate.  By contrast if the ``downto'' keyword were used in place of ``to'' the loop would iterate 
+decrementally.
+
+EXAM,bn_mp_init.c
+
+One immediate observation of this initializtion function is that it does not return a pointer to a mp\_int structure.  It 
+is assumed that the caller has already allocated memory for the mp\_int structure, typically on the application stack.  The 
+call to mp\_init() is used only to initialize the members of the structure to a known default state.  
+
+Here we see (line @23,XMALLOC@) the memory allocation is performed first.  This allows us to exit cleanly and quickly
+if there is an error.  If the allocation fails the routine will return \textbf{MP\_MEM} to the caller to indicate there
+was a memory error.  The function XMALLOC is what actually allocates the memory.  Technically XMALLOC is not a function
+but a macro defined in ``tommath.h``.  By default, XMALLOC will evaluate to malloc() which is the C library's built--in
+memory allocation routine.
+
+In order to assure the mp\_int is in a known state the digits must be set to zero.  On most platforms this could have been
+accomplished by using calloc() instead of malloc().  However,  to correctly initialize a integer type to a given value in a 
+portable fashion you have to actually assign the value.  The for loop (line @28,for@) performs this required
+operation.
+
+After the memory has been successfully initialized the remainder of the members are initialized 
+(lines @29,used@ through @31,sign@) to their respective default states.  At this point the algorithm has succeeded and
+a success code is returned to the calling function.  If this function returns \textbf{MP\_OKAY} it is safe to assume the 
+mp\_int structure has been properly initialized and is safe to use with other functions within the library.  
+
+\subsection{Clearing an mp\_int}
+When an mp\_int is no longer required by the application, the memory that has been allocated for its digits must be 
+returned to the application's memory pool with the mp\_clear algorithm.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_clear}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  The memory for $a$ shall be deallocated.  \\
+\hline \\
+1.  If $a$ has been previously freed then return(\textit{MP\_OKAY}). \\
+2.  for $n$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}2.1  $a_n \leftarrow 0$ \\
+3.  Free the memory allocated for the digits of $a$. \\
+4.  $a.used \leftarrow 0$ \\
+5.  $a.alloc \leftarrow 0$ \\
+6.  $a.sign \leftarrow MP\_ZPOS$ \\
+7.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_clear}
+\end{figure}
+
+\textbf{Algorithm mp\_clear.}
+This algorithm accomplishes two goals.  First, it clears the digits and the other mp\_int members.  This ensures that 
+if a developer accidentally re-uses a cleared structure it is less likely to cause problems.  The second goal
+is to free the allocated memory.
+
+The logic behind the algorithm is extended by marking cleared mp\_int structures so that subsequent calls to this
+algorithm will not try to free the memory multiple times.  Cleared mp\_ints are detectable by having a pre-defined invalid 
+digit pointer \textbf{dp} setting.
+
+Once an mp\_int has been cleared the mp\_int structure is no longer in a valid state for any other algorithm
+with the exception of algorithms mp\_init, mp\_init\_copy, mp\_init\_size and mp\_clear.
+
+EXAM,bn_mp_clear.c
+
+The algorithm only operates on the mp\_int if it hasn't been previously cleared.  The if statement (line @23,a->dp != NULL@)
+checks to see if the \textbf{dp} member is not \textbf{NULL}.  If the mp\_int is a valid mp\_int then \textbf{dp} cannot be
+\textbf{NULL} in which case the if statement will evaluate to true.
+
+The digits of the mp\_int are cleared by the for loop (line @25,for@) which assigns a zero to every digit.  Similar to mp\_init()
+the digits are assigned zero instead of using block memory operations (such as memset()) since this is more portable.  
+
+The digits are deallocated off the heap via the XFREE macro.  Similar to XMALLOC the XFREE macro actually evaluates to
+a standard C library function.  In this case the free() function.  Since free() only deallocates the memory the pointer
+still has to be reset to \textbf{NULL} manually (line @33,NULL@).  
+
+Now that the digits have been cleared and deallocated the other members are set to their final values (lines @34,= 0@ and @35,ZPOS@).
+
+\section{Maintenance Algorithms}
+
+The previous sections describes how to initialize and clear an mp\_int structure.  To further support operations
+that are to be performed on mp\_int structures (such as addition and multiplication) the dependent algorithms must be
+able to augment the precision of an mp\_int and 
+initialize mp\_ints with differing initial conditions.  
+
+These algorithms complete the set of low level algorithms required to work with mp\_int structures in the higher level
+algorithms such as addition, multiplication and modular exponentiation.
+
+\subsection{Augmenting an mp\_int's Precision}
+When storing a value in an mp\_int structure, a sufficient number of digits must be available to accomodate the entire 
+result of an operation without loss of precision.  Quite often the size of the array given by the \textbf{alloc} member 
+is large enough to simply increase the \textbf{used} digit count.  However, when the size of the array is too small it 
+must be re-sized appropriately to accomodate the result.  The mp\_grow algorithm will provide this functionality.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_grow}. \\
+\textbf{Input}.   An mp\_int $a$ and an integer $b$. \\
+\textbf{Output}.  $a$ is expanded to accomodate $b$ digits. \\
+\hline \\
+1.  if $a.alloc \ge b$ then return(\textit{MP\_OKAY}) \\
+2.  $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\
+3.  $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
+4.  Re-allocate the array of digits $a$ to size $v$ \\
+5.  If the allocation failed then return(\textit{MP\_MEM}). \\
+6.  for n from a.alloc to $v - 1$ do  \\
+\hspace{+3mm}6.1  $a_n \leftarrow 0$ \\
+7.  $a.alloc \leftarrow v$ \\
+8.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_grow}
+\end{figure}
+
+\textbf{Algorithm mp\_grow.}
+It is ideal to prevent re-allocations from being performed if they are not required (step one).  This is useful to 
+prevent mp\_ints from growing excessively in code that erroneously calls mp\_grow.  
+
+The requested digit count is padded up to next multiple of \textbf{MP\_PREC} plus an additional \textbf{MP\_PREC} (steps two and three).  
+This helps prevent many trivial reallocations that would grow an mp\_int by trivially small values.  
+
+It is assumed that the reallocation (step four) leaves the lower $a.alloc$ digits of the mp\_int intact.  This is much 
+akin to how the \textit{realloc} function from the standard C library works.  Since the newly allocated digits are 
+assumed to contain undefined values they are initially set to zero.
+
+EXAM,bn_mp_grow.c
+
+A quick optimization is to first determine if a memory re-allocation is required at all.  The if statement (line @24,alloc@) checks
+if the \textbf{alloc} member of the mp\_int is smaller than the requested digit count.  If the count is not larger than \textbf{alloc}
+the function skips the re-allocation part thus saving time.
+
+When a re-allocation is performed it is turned into an optimal request to save time in the future.  The requested digit count is
+padded upwards to 2nd multiple of \textbf{MP\_PREC} larger than \textbf{alloc} (line @25, size@).  The XREALLOC function is used
+to re-allocate the memory.  As per the other functions XREALLOC is actually a macro which evaluates to realloc by default.  The realloc
+function leaves the base of the allocation intact which means the first \textbf{alloc} digits of the mp\_int are the same as before
+the re-allocation.  All	that is left is to clear the newly allocated digits and return.
+
+Note that the re-allocation result is actually stored in a temporary pointer $tmp$.  This is to allow this function to return
+an error with a valid pointer.  Earlier releases of the library stored the result of XREALLOC into the mp\_int $a$.  That would
+result in a memory leak if XREALLOC ever failed.  
+
+\subsection{Initializing Variable Precision mp\_ints}
+Occasionally the number of digits required will be known in advance of an initialization, based on, for example, the size 
+of input mp\_ints to a given algorithm.  The purpose of algorithm mp\_init\_size is similar to mp\_init except that it 
+will allocate \textit{at least} a specified number of digits.  
+
+\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init\_size}. \\
+\textbf{Input}.   An mp\_int $a$ and the requested number of digits $b$. \\
+\textbf{Output}.  $a$ is initialized to hold at least $b$ digits. \\
+\hline \\
+1.  $u \leftarrow b \mbox{ (mod }MP\_PREC\mbox{)}$ \\
+2.  $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
+3.  Allocate $v$ digits. \\
+4.  for $n$ from $0$ to $v - 1$ do \\
+\hspace{3mm}4.1  $a_n \leftarrow 0$ \\
+5.  $a.sign \leftarrow MP\_ZPOS$\\
+6.  $a.used \leftarrow 0$\\
+7.  $a.alloc \leftarrow v$\\
+8.  Return(\textit{MP\_OKAY})\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_init\_size}
+\end{figure}
+
+\textbf{Algorithm mp\_init\_size.}
+This algorithm will initialize an mp\_int structure $a$ like algorithm mp\_init with the exception that the number of 
+digits allocated can be controlled by the second input argument $b$.  The input size is padded upwards so it is a 
+multiple of \textbf{MP\_PREC} plus an additional \textbf{MP\_PREC} digits.  This padding is used to prevent trivial 
+allocations from becoming a bottleneck in the rest of the algorithms.
+
+Like algorithm mp\_init, the mp\_int structure is initialized to a default state representing the integer zero.  This 
+particular algorithm is useful if it is known ahead of time the approximate size of the input.  If the approximation is
+correct no further memory re-allocations are required to work with the mp\_int.
+
+EXAM,bn_mp_init_size.c
+
+The number of digits $b$ requested is padded (line @22,MP_PREC@) by first augmenting it to the next multiple of 
+\textbf{MP\_PREC} and then adding \textbf{MP\_PREC} to the result.  If the memory can be successfully allocated the 
+mp\_int is placed in a default state representing the integer zero.  Otherwise, the error code \textbf{MP\_MEM} will be 
+returned (line @27,return@).  
+
+The digits are allocated and set to zero at the same time with the calloc() function (line @25,XCALLOC@).  The 
+\textbf{used} count is set to zero, the \textbf{alloc} count set to the padded digit count and the \textbf{sign} flag set 
+to \textbf{MP\_ZPOS} to achieve a default valid mp\_int state (lines @29,used@, @30,alloc@ and @31,sign@).  If the function 
+returns succesfully then it is correct to assume that the mp\_int structure is in a valid state for the remainder of the 
+functions to work with.
+
+\subsection{Multiple Integer Initializations and Clearings}
+Occasionally a function will require a series of mp\_int data types to be made available simultaneously.  
+The purpose of algorithm mp\_init\_multi is to initialize a variable length array of mp\_int structures in a single
+statement.  It is essentially a shortcut to multiple initializations.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init\_multi}. \\
+\textbf{Input}.   Variable length array $V_k$ of mp\_int variables of length $k$. \\
+\textbf{Output}.  The array is initialized such that each mp\_int of $V_k$ is ready to use. \\
+\hline \\
+1.  for $n$ from 0 to $k - 1$ do \\
+\hspace{+3mm}1.1.  Initialize the mp\_int $V_n$ (\textit{mp\_init}) \\
+\hspace{+3mm}1.2.  If initialization failed then do \\
+\hspace{+6mm}1.2.1.  for $j$ from $0$ to $n$ do \\
+\hspace{+9mm}1.2.1.1.  Free the mp\_int $V_j$ (\textit{mp\_clear}) \\
+\hspace{+6mm}1.2.2.   Return(\textit{MP\_MEM}) \\
+2.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_init\_multi}
+\end{figure}
+
+\textbf{Algorithm mp\_init\_multi.}
+The algorithm will initialize the array of mp\_int variables one at a time.  If a runtime error has been detected 
+(\textit{step 1.2}) all of the previously initialized variables are cleared.  The goal is an ``all or nothing'' 
+initialization which allows for quick recovery from runtime errors.
+
+EXAM,bn_mp_init_multi.c
+
+This function intializes a variable length list of mp\_int structure pointers.  However, instead of having the mp\_int
+structures in an actual C array they are simply passed as arguments to the function.  This function makes use of the 
+``...'' argument syntax of the C programming language.  The list is terminated with a final \textbf{NULL} argument 
+appended on the right.  
+
+The function uses the ``stdarg.h'' \textit{va} functions to step portably through the arguments to the function.  A count
+$n$ of succesfully initialized mp\_int structures is maintained (line @47,n++@) such that if a failure does occur,
+the algorithm can backtrack and free the previously initialized structures (lines @27,if@ to @46,}@).  
+
+
+\subsection{Clamping Excess Digits}
+When a function anticipates a result will be $n$ digits it is simpler to assume this is true within the body of 
+the function instead of checking during the computation.  For example, a multiplication of a $i$ digit number by a 
+$j$ digit produces a result of at most $i + j$ digits.  It is entirely possible that the result is $i + j - 1$ 
+though, with no final carry into the last position.  However, suppose the destination had to be first expanded 
+(\textit{via mp\_grow}) to accomodate $i + j - 1$ digits than further expanded to accomodate the final carry.  
+That would be a considerable waste of time since heap operations are relatively slow.
+
+The ideal solution is to always assume the result is $i + j$ and fix up the \textbf{used} count after the function
+terminates.  This way a single heap operation (\textit{at most}) is required.  However, if the result was not checked
+there would be an excess high order zero digit.  
+
+For example, suppose the product of two integers was $x_n = (0x_{n-1}x_{n-2}...x_0)_{\beta}$.  The leading zero digit 
+will not contribute to the precision of the result.  In fact, through subsequent operations more leading zero digits would
+accumulate to the point the size of the integer would be prohibitive.  As a result even though the precision is very 
+low the representation is excessively large.  
+
+The mp\_clamp algorithm is designed to solve this very problem.  It will trim high-order zeros by decrementing the 
+\textbf{used} count until a non-zero most significant digit is found.  Also in this system, zero is considered to be a 
+positive number which means that if the \textbf{used} count is decremented to zero, the sign must be set to 
+\textbf{MP\_ZPOS}.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_clamp}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Any excess leading zero digits of $a$ are removed \\
+\hline \\
+1.  while $a.used > 0$ and $a_{a.used - 1} = 0$ do \\
+\hspace{+3mm}1.1  $a.used \leftarrow a.used - 1$ \\
+2.  if $a.used = 0$ then do \\
+\hspace{+3mm}2.1  $a.sign \leftarrow MP\_ZPOS$ \\
+\hline \\
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_clamp}
+\end{figure}
+
+\textbf{Algorithm mp\_clamp.}
+As can be expected this algorithm is very simple.  The loop on step one is expected to iterate only once or twice at
+the most.  For example, this will happen in cases where there is not a carry to fill the last position.  Step two fixes the sign for 
+when all of the digits are zero to ensure that the mp\_int is valid at all times.
+
+EXAM,bn_mp_clamp.c
+
+Note on line @27,while@ how to test for the \textbf{used} count is made on the left of the \&\& operator.  In the C programming
+language the terms to \&\& are evaluated left to right with a boolean short-circuit if any condition fails.  This is 
+important since if the \textbf{used} is zero the test on the right would fetch below the array.  That is obviously 
+undesirable.  The parenthesis on line @28,a->used@ is used to make sure the \textbf{used} count is decremented and not
+the pointer ``a''.  
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 1 \right ]$ & Discuss the relevance of the \textbf{used} member of the mp\_int structure. \\
+                     & \\
+$\left [ 1 \right ]$ & Discuss the consequences of not using padding when performing allocations.  \\
+                     & \\
+$\left [ 2 \right ]$ & Estimate an ideal value for \textbf{MP\_PREC} when performing 1024-bit RSA \\
+                     & encryption when $\beta = 2^{28}$.  \\
+                     & \\
+$\left [ 1 \right ]$ & Discuss the relevance of the algorithm mp\_clamp.  What does it prevent? \\
+                     & \\
+$\left [ 1 \right ]$ & Give an example of when the algorithm  mp\_init\_copy might be useful. \\
+                     & \\
+\end{tabular}
+
+
+%%%
+% CHAPTER FOUR
+%%%
+
+\chapter{Basic Operations}
+
+\section{Introduction}
+In the previous chapter a series of low level algorithms were established that dealt with initializing and maintaining
+mp\_int structures.  This chapter will discuss another set of seemingly non-algebraic algorithms which will form the low 
+level basis of the entire library.  While these algorithm are relatively trivial it is important to understand how they
+work before proceeding since these algorithms will be used almost intrinsically in the following chapters.
+
+The algorithms in this chapter deal primarily with more ``programmer'' related tasks such as creating copies of
+mp\_int structures, assigning small values to mp\_int structures and comparisons of the values mp\_int structures
+represent.   
+
+\section{Assigning Values to mp\_int Structures}
+\subsection{Copying an mp\_int}
+Assigning the value that a given mp\_int structure represents to another mp\_int structure shall be known as making
+a copy for the purposes of this text.  The copy of the mp\_int will be a separate entity that represents the same
+value as the mp\_int it was copied from.  The mp\_copy algorithm provides this functionality. 
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_copy}. \\
+\textbf{Input}.  An mp\_int $a$ and $b$. \\
+\textbf{Output}.  Store a copy of $a$ in $b$. \\
+\hline \\
+1.  If $b.alloc < a.used$ then grow $b$ to $a.used$ digits.  (\textit{mp\_grow}) \\
+2.  for $n$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}2.1  $b_{n} \leftarrow a_{n}$ \\
+3.  for $n$ from $a.used$ to $b.used - 1$ do \\
+\hspace{3mm}3.1  $b_{n} \leftarrow 0$ \\
+4.  $b.used \leftarrow a.used$ \\
+5.  $b.sign \leftarrow a.sign$ \\
+6.  return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_copy}
+\end{figure}
+
+\textbf{Algorithm mp\_copy.}
+This algorithm copies the mp\_int $a$ such that upon succesful termination of the algorithm the mp\_int $b$ will
+represent the same integer as the mp\_int $a$.  The mp\_int $b$ shall be a complete and distinct copy of the 
+mp\_int $a$ meaing that the mp\_int $a$ can be modified and it shall not affect the value of the mp\_int $b$.
+
+If $b$ does not have enough room for the digits of $a$ it must first have its precision augmented via the mp\_grow 
+algorithm.  The digits of $a$ are copied over the digits of $b$ and any excess digits of $b$ are set to zero (step two
+and three).  The \textbf{used} and \textbf{sign} members of $a$ are finally copied over the respective members of
+$b$.
+
+\textbf{Remark.}  This algorithm also introduces a new idiosyncrasy that will be used throughout the rest of the
+text.  The error return codes of other algorithms are not explicitly checked in the pseudo-code presented.  For example, in 
+step one of the mp\_copy algorithm the return of mp\_grow is not explicitly checked to ensure it succeeded.  Text space is 
+limited so it is assumed that if a algorithm fails it will clear all temporarily allocated mp\_ints and return
+the error code itself.  However, the C code presented will demonstrate all of the error handling logic required to 
+implement the pseudo-code.
+
+EXAM,bn_mp_copy.c
+
+Occasionally a dependent algorithm may copy an mp\_int effectively into itself such as when the input and output
+mp\_int structures passed to a function are one and the same.  For this case it is optimal to return immediately without 
+copying digits (line @24,a == b@).  
+
+The mp\_int $b$ must have enough digits to accomodate the used digits of the mp\_int $a$.  If $b.alloc$ is less than
+$a.used$ the algorithm mp\_grow is used to augment the precision of $b$ (lines @29,alloc@ to @33,}@).  In order to
+simplify the inner loop that copies the digits from $a$ to $b$, two aliases $tmpa$ and $tmpb$ point directly at the digits
+of the mp\_ints $a$ and $b$ respectively.  These aliases (lines @42,tmpa@ and @45,tmpb@) allow the compiler to access the digits without first dereferencing the
+mp\_int pointers and then subsequently the pointer to the digits.  
+
+After the aliases are established the digits from $a$ are copied into $b$ (lines @48,for@ to @50,}@) and then the excess 
+digits of $b$ are set to zero (lines @53,for@ to @55,}@).  Both ``for'' loops make use of the pointer aliases and in 
+fact the alias for $b$ is carried through into the second ``for'' loop to clear the excess digits.  This optimization 
+allows the alias to stay in a machine register fairly easy between the two loops.
+
+\textbf{Remarks.}  The use of pointer aliases is an implementation methodology first introduced in this function that will
+be used considerably in other functions.  Technically, a pointer alias is simply a short hand alias used to lower the 
+number of pointer dereferencing operations required to access data.  For example, a for loop may resemble
+
+\begin{alltt}
+for (x = 0; x < 100; x++) \{
+    a->num[4]->dp[x] = 0;
+\}
+\end{alltt}
+
+This could be re-written using aliases as 
+
+\begin{alltt}
+mp_digit *tmpa;
+a = a->num[4]->dp;
+for (x = 0; x < 100; x++) \{
+    *a++ = 0;
+\}
+\end{alltt}
+
+In this case an alias is used to access the 
+array of digits within an mp\_int structure directly.  It may seem that a pointer alias is strictly not required 
+as a compiler may optimize out the redundant pointer operations.  However, there are two dominant reasons to use aliases.
+
+The first reason is that most compilers will not effectively optimize pointer arithmetic.  For example, some optimizations 
+may work for the Microsoft Visual C++ compiler (MSVC) and not for the GNU C Compiler (GCC).  Also some optimizations may 
+work for GCC and not MSVC.  As such it is ideal to find a common ground for as many compilers as possible.  Pointer 
+aliases optimize the code considerably before the compiler even reads the source code which means the end compiled code 
+stands a better chance of being faster.
+
+The second reason is that pointer aliases often can make an algorithm simpler to read.  Consider the first ``for'' 
+loop of the function mp\_copy() re-written to not use pointer aliases.
+
+\begin{alltt}
+    /* copy all the digits */
+    for (n = 0; n < a->used; n++) \{
+      b->dp[n] = a->dp[n];
+    \}
+\end{alltt}
+
+Whether this code is harder to read depends strongly on the individual.  However, it is quantifiably slightly more 
+complicated as there are four variables within the statement instead of just two.
+
+\subsubsection{Nested Statements}
+Another commonly used technique in the source routines is that certain sections of code are nested.  This is used in
+particular with the pointer aliases to highlight code phases.  For example, a Comba multiplier (discussed in chapter six)
+will typically have three different phases.  First the temporaries are initialized, then the columns calculated and 
+finally the carries are propagated.  In this example the middle column production phase will typically be nested as it
+uses temporary variables and aliases the most.
+
+The nesting also simplies the source code as variables that are nested are only valid for their scope.  As a result
+the various temporary variables required do not propagate into other sections of code.
+
+
+\subsection{Creating a Clone}
+Another common operation is to make a local temporary copy of an mp\_int argument.  To initialize an mp\_int 
+and then copy another existing mp\_int into the newly intialized mp\_int will be known as creating a clone.  This is 
+useful within functions that need to modify an argument but do not wish to actually modify the original copy.  The 
+mp\_init\_copy algorithm has been designed to help perform this task.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init\_copy}. \\
+\textbf{Input}.   An mp\_int $a$ and $b$\\
+\textbf{Output}.  $a$ is initialized to be a copy of $b$. \\
+\hline \\
+1.  Init $a$.  (\textit{mp\_init}) \\
+2.  Copy $b$ to $a$.  (\textit{mp\_copy}) \\
+3.  Return the status of the copy operation. \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_init\_copy}
+\end{figure}
+
+\textbf{Algorithm mp\_init\_copy.}
+This algorithm will initialize an mp\_int variable and copy another previously initialized mp\_int variable into it.  As 
+such this algorithm will perform two operations in one step.  
+
+EXAM,bn_mp_init_copy.c
+
+This will initialize \textbf{a} and make it a verbatim copy of the contents of \textbf{b}.  Note that 
+\textbf{a} will have its own memory allocated which means that \textbf{b} may be cleared after the call
+and \textbf{a} will be left intact.  
+
+\section{Zeroing an Integer}
+Reseting an mp\_int to the default state is a common step in many algorithms.  The mp\_zero algorithm will be the algorithm used to
+perform this task.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_zero}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Zero the contents of $a$ \\
+\hline \\
+1.  $a.used \leftarrow 0$ \\
+2.  $a.sign \leftarrow$ MP\_ZPOS \\
+3.  for $n$ from 0 to $a.alloc - 1$ do \\
+\hspace{3mm}3.1  $a_n \leftarrow 0$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_zero}
+\end{figure}
+
+\textbf{Algorithm mp\_zero.}
+This algorithm simply resets a mp\_int to the default state.  
+
+EXAM,bn_mp_zero.c
+
+After the function is completed, all of the digits are zeroed, the \textbf{used} count is zeroed and the 
+\textbf{sign} variable is set to \textbf{MP\_ZPOS}.
+
+\section{Sign Manipulation}
+\subsection{Absolute Value}
+With the mp\_int representation of an integer, calculating the absolute value is trivial.  The mp\_abs algorithm will compute
+the absolute value of an mp\_int.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_abs}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Computes $b = \vert a \vert$ \\
+\hline \\
+1.  Copy $a$ to $b$.  (\textit{mp\_copy}) \\
+2.  If the copy failed return(\textit{MP\_MEM}). \\
+3.  $b.sign \leftarrow MP\_ZPOS$ \\
+4.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_abs}
+\end{figure}
+
+\textbf{Algorithm mp\_abs.}
+This algorithm computes the absolute of an mp\_int input.  First it copies $a$ over $b$.  This is an example of an
+algorithm where the check in mp\_copy that determines if the source and destination are equal proves useful.  This allows,
+for instance, the developer to pass the same mp\_int as the source and destination to this function without addition 
+logic to handle it.
+
+EXAM,bn_mp_abs.c
+
+This fairly trivial algorithm first eliminates non--required duplications (line @27,a != b@) and then sets the
+\textbf{sign} flag to \textbf{MP\_ZPOS}.
+
+\subsection{Integer Negation}
+With the mp\_int representation of an integer, calculating the negation is also trivial.  The mp\_neg algorithm will compute
+the negative of an mp\_int input.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_neg}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Computes $b = -a$ \\
+\hline \\
+1.  Copy $a$ to $b$.  (\textit{mp\_copy}) \\
+2.  If the copy failed return(\textit{MP\_MEM}). \\
+3.  If $a.used = 0$ then return(\textit{MP\_OKAY}). \\
+4.  If $a.sign = MP\_ZPOS$ then do \\
+\hspace{3mm}4.1  $b.sign = MP\_NEG$. \\
+5.  else do \\
+\hspace{3mm}5.1  $b.sign = MP\_ZPOS$. \\
+6.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_neg}
+\end{figure}
+
+\textbf{Algorithm mp\_neg.}
+This algorithm computes the negation of an input.  First it copies $a$ over $b$.  If $a$ has no used digits then
+the algorithm returns immediately.  Otherwise it flips the sign flag and stores the result in $b$.  Note that if 
+$a$ had no digits then it must be positive by definition.  Had step three been omitted then the algorithm would return
+zero as negative.
+
+EXAM,bn_mp_neg.c
+
+Like mp\_abs() this function avoids non--required duplications (line @21,a != b@) and then sets the sign.  We
+have to make sure that only non--zero values get a \textbf{sign} of \textbf{MP\_NEG}.  If the mp\_int is zero
+than the \textbf{sign} is hard--coded to \textbf{MP\_ZPOS}.
+
+\section{Small Constants}
+\subsection{Setting Small Constants}
+Often a mp\_int must be set to a relatively small value such as $1$ or $2$.  For these cases the mp\_set algorithm is useful.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_set}. \\
+\textbf{Input}.   An mp\_int $a$ and a digit $b$ \\
+\textbf{Output}.  Make $a$ equivalent to $b$ \\
+\hline \\
+1.  Zero $a$ (\textit{mp\_zero}). \\
+2.  $a_0 \leftarrow b \mbox{ (mod }\beta\mbox{)}$ \\
+3.  $a.used \leftarrow  \left \lbrace \begin{array}{ll}
+                              1 &  \mbox{if }a_0 > 0 \\
+                              0 &  \mbox{if }a_0 = 0 
+                              \end{array} \right .$ \\
+\hline                              
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_set}
+\end{figure}
+
+\textbf{Algorithm mp\_set.}
+This algorithm sets a mp\_int to a small single digit value.  Step number 1 ensures that the integer is reset to the default state.  The
+single digit is set (\textit{modulo $\beta$}) and the \textbf{used} count is adjusted accordingly.
+
+EXAM,bn_mp_set.c
+
+First we zero (line @21,mp_zero@) the mp\_int to make sure that the other members are initialized for a 
+small positive constant.  mp\_zero() ensures that the \textbf{sign} is positive and the \textbf{used} count
+is zero.  Next we set the digit and reduce it modulo $\beta$ (line @22,MP_MASK@).  After this step we have to 
+check if the resulting digit is zero or not.  If it is not then we set the \textbf{used} count to one, otherwise
+to zero.
+
+We can quickly reduce modulo $\beta$ since it is of the form $2^k$ and a quick binary AND operation with 
+$2^k - 1$ will perform the same operation.
+
+One important limitation of this function is that it will only set one digit.  The size of a digit is not fixed, meaning source that uses 
+this function should take that into account.  Only trivially small constants can be set using this function.
+
+\subsection{Setting Large Constants}
+To overcome the limitations of the mp\_set algorithm the mp\_set\_int algorithm is ideal.  It accepts a ``long''
+data type as input and will always treat it as a 32-bit integer.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_set\_int}. \\
+\textbf{Input}.   An mp\_int $a$ and a ``long'' integer $b$ \\
+\textbf{Output}.  Make $a$ equivalent to $b$ \\
+\hline \\
+1.  Zero $a$ (\textit{mp\_zero}) \\
+2.  for $n$ from 0 to 7 do \\
+\hspace{3mm}2.1  $a \leftarrow a \cdot 16$ (\textit{mp\_mul2d}) \\
+\hspace{3mm}2.2  $u \leftarrow \lfloor b / 2^{4(7 - n)} \rfloor \mbox{ (mod }16\mbox{)}$\\
+\hspace{3mm}2.3  $a_0 \leftarrow a_0 + u$ \\
+\hspace{3mm}2.4  $a.used \leftarrow a.used + 1$ \\
+3.  Clamp excess used digits (\textit{mp\_clamp}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_set\_int}
+\end{figure}
+
+\textbf{Algorithm mp\_set\_int.}
+The algorithm performs eight iterations of a simple loop where in each iteration four bits from the source are added to the 
+mp\_int.  Step 2.1 will multiply the current result by sixteen making room for four more bits in the less significant positions.  In step 2.2 the
+next four bits from the source are extracted and are added to the mp\_int. The \textbf{used} digit count is 
+incremented to reflect the addition.  The \textbf{used} digit counter is incremented since if any of the leading digits were zero the mp\_int would have
+zero digits used and the newly added four bits would be ignored.
+
+Excess zero digits are trimmed in steps 2.1 and 3 by using higher level algorithms mp\_mul2d and mp\_clamp.
+
+EXAM,bn_mp_set_int.c
+
+This function sets four bits of the number at a time to handle all practical \textbf{DIGIT\_BIT} sizes.  The weird
+addition on line @38,a->used@ ensures that the newly added in bits are added to the number of digits.  While it may not 
+seem obvious as to why the digit counter does not grow exceedingly large it is because of the shift on line @27,mp_mul_2d@ 
+as well as the  call to mp\_clamp() on line @40,mp_clamp@.  Both functions will clamp excess leading digits which keeps 
+the number of used digits low.
+
+\section{Comparisons}
+\subsection{Unsigned Comparisions}
+Comparing a multiple precision integer is performed with the exact same algorithm used to compare two decimal numbers.  For example,
+to compare $1,234$ to $1,264$ the digits are extracted by their positions.  That is we compare $1 \cdot 10^3 + 2 \cdot 10^2 + 3 \cdot 10^1 + 4 \cdot 10^0$
+to $1 \cdot 10^3 + 2 \cdot 10^2 + 6 \cdot 10^1 + 4 \cdot 10^0$ by comparing single digits at a time starting with the highest magnitude 
+positions.  If any leading digit of one integer is greater than a digit in the same position of another integer then obviously it must be greater.  
+
+The first comparision routine that will be developed is the unsigned magnitude compare which will perform a comparison based on the digits of two
+mp\_int variables alone.  It will ignore the sign of the two inputs.  Such a function is useful when an absolute comparison is required or if the 
+signs are known to agree in advance.
+
+To facilitate working with the results of the comparison functions three constants are required.  
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{|r|l|}
+\hline \textbf{Constant} & \textbf{Meaning} \\
+\hline \textbf{MP\_GT} & Greater Than \\
+\hline \textbf{MP\_EQ} & Equal To \\
+\hline \textbf{MP\_LT} & Less Than \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Comparison Return Codes}
+\end{figure}
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_cmp\_mag}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$.  \\
+\textbf{Output}.  Unsigned comparison results ($a$ to the left of $b$). \\
+\hline \\
+1.  If $a.used > b.used$ then return(\textit{MP\_GT}) \\
+2.  If $a.used < b.used$ then return(\textit{MP\_LT}) \\
+3.  for n from $a.used - 1$ to 0 do \\
+\hspace{+3mm}3.1  if $a_n > b_n$ then return(\textit{MP\_GT}) \\
+\hspace{+3mm}3.2  if $a_n < b_n$ then return(\textit{MP\_LT}) \\
+4.  Return(\textit{MP\_EQ}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_cmp\_mag}
+\end{figure}
+
+\textbf{Algorithm mp\_cmp\_mag.}
+By saying ``$a$ to the left of $b$'' it is meant that the comparison is with respect to $a$, that is if $a$ is greater than $b$ it will return
+\textbf{MP\_GT} and similar with respect to when $a = b$ and $a < b$.  The first two steps compare the number of digits used in both $a$ and $b$.  
+Obviously if the digit counts differ there would be an imaginary zero digit in the smaller number where the leading digit of the larger number is.  
+If both have the same number of digits than the actual digits themselves must be compared starting at the leading digit.  
+
+By step three both inputs must have the same number of digits so its safe to start from either $a.used - 1$ or $b.used - 1$ and count down to
+the zero'th digit.  If after all of the digits have been compared, no difference is found, the algorithm returns \textbf{MP\_EQ}.
+
+EXAM,bn_mp_cmp_mag.c
+
+The two if statements (lines @24,if@ and @28,if@) compare the number of digits in the two inputs.  These two are 
+performed before all of the digits are compared since it is a very cheap test to perform and can potentially save 
+considerable time.  The implementation given is also not valid without those two statements.  $b.alloc$ may be 
+smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the array of digits.
+
+
+
+\subsection{Signed Comparisons}
+Comparing with sign considerations is also fairly critical in several routines (\textit{division for example}).  Based on an unsigned magnitude 
+comparison a trivial signed comparison algorithm can be written.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_cmp}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$ \\
+\textbf{Output}.  Signed Comparison Results ($a$ to the left of $b$) \\
+\hline \\
+1.  if $a.sign = MP\_NEG$ and $b.sign = MP\_ZPOS$ then return(\textit{MP\_LT}) \\
+2.  if $a.sign = MP\_ZPOS$ and $b.sign = MP\_NEG$ then return(\textit{MP\_GT}) \\
+3.  if $a.sign = MP\_NEG$ then \\
+\hspace{+3mm}3.1  Return the unsigned comparison of $b$ and $a$ (\textit{mp\_cmp\_mag}) \\
+4   Otherwise \\
+\hspace{+3mm}4.1  Return the unsigned comparison of $a$ and $b$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_cmp}
+\end{figure}
+
+\textbf{Algorithm mp\_cmp.}
+The first two steps compare the signs of the two inputs.  If the signs do not agree then it can return right away with the appropriate 
+comparison code.  When the signs are equal the digits of the inputs must be compared to determine the correct result.  In step 
+three the unsigned comparision flips the order of the arguments since they are both negative.  For instance, if $-a > -b$ then 
+$\vert a \vert < \vert b \vert$.  Step number four will compare the two when they are both positive.
+
+EXAM,bn_mp_cmp.c
+
+The two if statements (lines @22,if@ and @26,if@) perform the initial sign comparison.  If the signs are not the equal then which ever
+has the positive sign is larger.   The inputs are compared (line @30,if@) based on magnitudes.  If the signs were both 
+negative then the unsigned comparison is performed in the opposite direction (line @31,mp_cmp_mag@).  Otherwise, the signs are assumed to 
+be both positive and a forward direction unsigned comparison is performed.
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 2 \right ]$ & Modify algorithm mp\_set\_int to accept as input a variable length array of bits. \\
+                     & \\
+$\left [ 3 \right ]$ & Give the probability that algorithm mp\_cmp\_mag will have to compare $k$ digits  \\
+                     & of two random digits (of equal magnitude) before a difference is found. \\
+                     & \\
+$\left [ 1 \right ]$ & Suggest a simple method to speed up the implementation of mp\_cmp\_mag based  \\
+                     & on the observations made in the previous problem. \\
+                     &
+\end{tabular}
+
+\chapter{Basic Arithmetic}
+\section{Introduction}
+At this point algorithms for initialization, clearing, zeroing, copying, comparing and setting small constants have been 
+established.  The next logical set of algorithms to develop are addition, subtraction and digit shifting algorithms.  These 
+algorithms make use of the lower level algorithms and are the cruicial building block for the multiplication algorithms.  It is very important 
+that these algorithms are highly optimized.  On their own they are simple $O(n)$ algorithms but they can be called from higher level algorithms 
+which easily places them at $O(n^2)$ or even $O(n^3)$ work levels.  
+
+MARK,SHIFTS
+All of the algorithms within this chapter make use of the logical bit shift operations denoted by $<<$ and $>>$ for left and right 
+logical shifts respectively.  A logical shift is analogous to sliding the decimal point of radix-10 representations.  For example, the real 
+number $0.9345$ is equivalent to $93.45\%$ which is found by sliding the the decimal two places to the right (\textit{multiplying by $\beta^2 = 10^2$}).  
+Algebraically a binary logical shift is equivalent to a division or multiplication by a power of two.  
+For example, $a << k = a \cdot 2^k$ while $a >> k = \lfloor a/2^k \rfloor$.
+
+One significant difference between a logical shift and the way decimals are shifted is that digits below the zero'th position are removed
+from the number.  For example, consider $1101_2 >> 1$ using decimal notation this would produce $110.1_2$.  However, with a logical shift the 
+result is $110_2$.  
+
+\section{Addition and Subtraction}
+In common twos complement fixed precision arithmetic negative numbers are easily represented by subtraction from the modulus.  For example, with 32-bit integers
+$a - b\mbox{ (mod }2^{32}\mbox{)}$ is the same as $a + (2^{32} - b) \mbox{ (mod }2^{32}\mbox{)}$  since $2^{32} \equiv 0 \mbox{ (mod }2^{32}\mbox{)}$.  
+As a result subtraction can be performed with a trivial series of logical operations and an addition.
+
+However, in multiple precision arithmetic negative numbers are not represented in the same way.  Instead a sign flag is used to keep track of the
+sign of the integer.  As a result signed addition and subtraction are actually implemented as conditional usage of lower level addition or 
+subtraction algorithms with the sign fixed up appropriately.
+
+The lower level algorithms will add or subtract integers without regard to the sign flag.  That is they will add or subtract the magnitude of
+the integers respectively.
+
+\subsection{Low Level Addition}
+An unsigned addition of multiple precision integers is performed with the same long-hand algorithm used to add decimal numbers.  That is to add the 
+trailing digits first and propagate the resulting carry upwards.  Since this is a lower level algorithm the name will have a ``s\_'' prefix.  
+Historically that convention stems from the MPI library where ``s\_'' stood for static functions that were hidden from the developer entirely.
+
+\newpage
+\begin{figure}[!here]
+\begin{center}
+\begin{small}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_add}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$ \\
+\textbf{Output}.  The unsigned addition $c = \vert a \vert + \vert b \vert$. \\
+\hline \\
+1.  if $a.used > b.used$ then \\
+\hspace{+3mm}1.1  $min \leftarrow b.used$ \\
+\hspace{+3mm}1.2  $max \leftarrow a.used$ \\
+\hspace{+3mm}1.3  $x   \leftarrow a$ \\
+2.  else  \\
+\hspace{+3mm}2.1  $min \leftarrow a.used$ \\
+\hspace{+3mm}2.2  $max \leftarrow b.used$ \\
+\hspace{+3mm}2.3  $x   \leftarrow b$ \\
+3.  If $c.alloc < max + 1$ then grow $c$ to hold at least $max + 1$ digits (\textit{mp\_grow}) \\
+4.  $oldused \leftarrow c.used$ \\
+5.  $c.used \leftarrow max + 1$ \\
+6.  $u \leftarrow 0$ \\
+7.  for $n$ from $0$ to $min - 1$ do \\
+\hspace{+3mm}7.1  $c_n \leftarrow a_n + b_n + u$ \\
+\hspace{+3mm}7.2  $u \leftarrow c_n >> lg(\beta)$ \\
+\hspace{+3mm}7.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+8.  if $min \ne max$ then do \\
+\hspace{+3mm}8.1  for $n$ from $min$ to $max - 1$ do \\
+\hspace{+6mm}8.1.1  $c_n \leftarrow x_n + u$ \\
+\hspace{+6mm}8.1.2  $u \leftarrow c_n >> lg(\beta)$ \\
+\hspace{+6mm}8.1.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+9.  $c_{max} \leftarrow u$ \\
+10.  if $olduse > max$ then \\
+\hspace{+3mm}10.1  for $n$ from $max + 1$ to $oldused - 1$ do \\
+\hspace{+6mm}10.1.1  $c_n \leftarrow 0$ \\
+11.  Clamp excess digits in $c$.  (\textit{mp\_clamp}) \\
+12.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Algorithm s\_mp\_add}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_add.}
+This algorithm is loosely based on algorithm 14.7 of HAC \cite[pp. 594]{HAC} but has been extended to allow the inputs to have different magnitudes.  
+Coincidentally the description of algorithm A in Knuth \cite[pp. 266]{TAOCPV2} shares the same deficiency as the algorithm from \cite{HAC}.  Even the 
+MIX pseudo  machine code presented by Knuth \cite[pp. 266-267]{TAOCPV2} is incapable of handling inputs which are of different magnitudes.
+
+The first thing that has to be accomplished is to sort out which of the two inputs is the largest.  The addition logic
+will simply add all of the smallest input to the largest input and store that first part of the result in the
+destination.  Then it will apply a simpler addition loop to excess digits of the larger input.
+
+The first two steps will handle sorting the inputs such that $min$ and $max$ hold the digit counts of the two 
+inputs.  The variable $x$ will be an mp\_int alias for the largest input or the second input $b$ if they have the
+same number of digits.  After the inputs are sorted the destination $c$ is grown as required to accomodate the sum 
+of the two inputs.  The original \textbf{used} count of $c$ is copied and set to the new used count.  
+
+At this point the first addition loop will go through as many digit positions that both inputs have.  The carry
+variable $\mu$ is set to zero outside the loop.  Inside the loop an ``addition'' step requires three statements to produce
+one digit of the summand.  First
+two digits from $a$ and $b$ are added together along with the carry $\mu$.  The carry of this step is extracted and stored
+in $\mu$ and finally the digit of the result $c_n$ is truncated within the range $0 \le c_n < \beta$.
+
+Now all of the digit positions that both inputs have in common have been exhausted.  If $min \ne max$ then $x$ is an alias
+for one of the inputs that has more digits.  A simplified addition loop is then used to essentially copy the remaining digits
+and the carry to the destination.
+
+The final carry is stored in $c_{max}$ and digits above $max$ upto $oldused$ are zeroed which completes the addition.
+
+
+EXAM,bn_s_mp_add.c
+
+We first sort (lines @27,if@ to @35,}@) the inputs based on magnitude and determine the $min$ and $max$ variables.
+Note that $x$ is a pointer to an mp\_int assigned to the largest input, in effect it is a local alias.  Next we
+grow the destination (@37,init@ to @42,}@) ensure that it can accomodate the result of the addition. 
+
+Similar to the implementation of mp\_copy this function uses the braced code and local aliases coding style.  The three aliases that are on 
+lines @56,tmpa@, @59,tmpb@ and @62,tmpc@ represent the two inputs and destination variables respectively.  These aliases are used to ensure the
+compiler does not have to dereference $a$, $b$ or $c$ (respectively) to access the digits of the respective mp\_int.
+
+The initial carry $u$ will be cleared (line @65,u = 0@), note that $u$ is of type mp\_digit which ensures type 
+compatibility within the implementation.  The initial addition (line @66,for@ to @75,}@) adds digits from
+both inputs until the smallest input runs out of digits.  Similarly the conditional addition loop
+(line @81,for@ to @90,}@) adds the remaining digits from the larger of the two inputs.  The addition is finished 
+with the final carry being stored in $tmpc$ (line @94,tmpc++@).  Note the ``++'' operator within the same expression.
+After line @94,tmpc++@, $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$.  This is useful
+for the next loop (line @97,for@ to @99,}@) which set any old upper digits to zero.
+
+\subsection{Low Level Subtraction}
+The low level unsigned subtraction algorithm is very similar to the low level unsigned addition algorithm.  The principle difference is that the
+unsigned subtraction algorithm requires the result to be positive.  That is when computing $a - b$ the condition $\vert a \vert \ge \vert b\vert$ must 
+be met for this algorithm to function properly.  Keep in mind this low level algorithm is not meant to be used in higher level algorithms directly.  
+This algorithm as will be shown can be used to create functional signed addition and subtraction algorithms.
+
+MARK,GAMMA
+
+For this algorithm a new variable is required to make the description simpler.  Recall from section 1.3.1 that a mp\_digit must be able to represent
+the range $0 \le x < 2\beta$ for the algorithms to work correctly.  However, it is allowable that a mp\_digit represent a larger range of values.  For 
+this algorithm we will assume that the variable $\gamma$ represents the number of bits available in a 
+mp\_digit (\textit{this implies $2^{\gamma} > \beta$}).  
+
+For example, the default for LibTomMath is to use a ``unsigned long'' for the mp\_digit ``type'' while $\beta = 2^{28}$.  In ISO C an ``unsigned long''
+data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma \ge 32$.
+
+\newpage\begin{figure}[!here]
+\begin{center}
+\begin{small}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_sub}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$ ($\vert a \vert \ge \vert b \vert$) \\
+\textbf{Output}.  The unsigned subtraction $c = \vert a \vert - \vert b \vert$. \\
+\hline \\
+1.  $min \leftarrow b.used$ \\
+2.  $max \leftarrow a.used$ \\
+3.  If $c.alloc < max$ then grow $c$ to hold at least $max$ digits.  (\textit{mp\_grow}) \\
+4.  $oldused \leftarrow c.used$ \\ 
+5.  $c.used \leftarrow max$ \\
+6.  $u \leftarrow 0$ \\
+7.  for $n$ from $0$ to $min - 1$ do \\
+\hspace{3mm}7.1  $c_n \leftarrow a_n - b_n - u$ \\
+\hspace{3mm}7.2  $u   \leftarrow c_n >> (\gamma - 1)$ \\
+\hspace{3mm}7.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+8.  if $min < max$ then do \\
+\hspace{3mm}8.1  for $n$ from $min$ to $max - 1$ do \\
+\hspace{6mm}8.1.1  $c_n \leftarrow a_n - u$ \\
+\hspace{6mm}8.1.2  $u   \leftarrow c_n >> (\gamma - 1)$ \\
+\hspace{6mm}8.1.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+9. if $oldused > max$ then do \\
+\hspace{3mm}9.1  for $n$ from $max$ to $oldused - 1$ do \\
+\hspace{6mm}9.1.1  $c_n \leftarrow 0$ \\
+10. Clamp excess digits of $c$.  (\textit{mp\_clamp}). \\
+11. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Algorithm s\_mp\_sub}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_sub.}
+This algorithm performs the unsigned subtraction of two mp\_int variables under the restriction that the result must be positive.  That is when
+passing variables $a$ and $b$ the condition that $\vert a \vert \ge \vert b \vert$ must be met for the algorithm to function correctly.  This
+algorithm is loosely based on algorithm 14.9 \cite[pp. 595]{HAC} and is similar to algorithm S in \cite[pp. 267]{TAOCPV2} as well.  As was the case
+of the algorithm s\_mp\_add both other references lack discussion concerning various practical details such as when the inputs differ in magnitude.
+
+The initial sorting of the inputs is trivial in this algorithm since $a$ is guaranteed to have at least the same magnitude of $b$.  Steps 1 and 2 
+set the $min$ and $max$ variables.  Unlike the addition routine there is guaranteed to be no carry which means that the final result can be at 
+most $max$ digits in length as opposed to $max + 1$.  Similar to the addition algorithm the \textbf{used} count of $c$ is copied locally and 
+set to the maximal count for the operation.
+
+The subtraction loop that begins on step seven is essentially the same as the addition loop of algorithm s\_mp\_add except single precision 
+subtraction is used instead.  Note the use of the $\gamma$ variable to extract the carry (\textit{also known as the borrow}) within the subtraction 
+loops.  Under the assumption that two's complement single precision arithmetic is used this will successfully extract the desired carry.  
+
+For example, consider subtracting $0101_2$ from $0100_2$ where $\gamma = 4$ and $\beta = 2$.  The least significant bit will force a carry upwards to 
+the third bit which will be set to zero after the borrow.  After the very first bit has been subtracted $4 - 1 \equiv 0011_2$ will remain,  When the 
+third bit of $0101_2$ is subtracted from the result it will cause another carry.  In this case though the carry will be forced to propagate all the 
+way to the most significant bit.  
+
+Recall that $\beta < 2^{\gamma}$.  This means that if a carry does occur just before the $lg(\beta)$'th bit it will propagate all the way to the most 
+significant bit.  Thus, the high order bits of the mp\_digit that are not part of the actual digit will either be all zero, or all one. All that
+is needed is a single zero or one bit for the carry.  Therefore a single logical shift right by $\gamma - 1$ positions is sufficient to extract the 
+carry.  This method of carry extraction may seem awkward but the reason for it becomes apparent when the implementation is discussed.  
+
+If $b$ has a smaller magnitude than $a$ then step 9 will force the carry and copy operation to propagate through the larger input $a$ into $c$.  Step
+10 will ensure that any leading digits of $c$ above the $max$'th position are zeroed.
+
+EXAM,bn_s_mp_sub.c
+
+Like low level addition we ``sort'' the inputs.  Except in this case the sorting is hardcoded 
+(lines @24,min@ and @25,max@).  In reality the $min$ and $max$ variables are only aliases and are only 
+used to make the source code easier to read.  Again the pointer alias optimization is used 
+within this algorithm.  The aliases $tmpa$, $tmpb$ and $tmpc$ are initialized
+(lines @42,tmpa@, @43,tmpb@ and @44,tmpc@) for $a$, $b$ and $c$ respectively.
+
+The first subtraction loop (lines @47,u = 0@ through @61,}@) subtract digits from both inputs until the smaller of
+the two inputs has been exhausted.  As remarked earlier there is an implementation reason for using the ``awkward'' 
+method of extracting the carry (line @57, >>@).  The traditional method for extracting the carry would be to shift 
+by $lg(\beta)$ positions and logically AND the least significant bit.  The AND operation is required because all of 
+the bits above the $\lg(\beta)$'th bit will be set to one after a carry occurs from subtraction.  This carry 
+extraction requires two relatively cheap operations to extract the carry.  The other method is to simply shift the 
+most significant bit to the least significant bit thus extracting the carry with a single cheap operation.  This 
+optimization only works on twos compliment machines which is a safe assumption to make.
+
+If $a$ has a larger magnitude than $b$ an additional loop (lines @64,for@ through @73,}@) is required to propagate 
+the carry through $a$ and copy the result to $c$.  
+
+\subsection{High Level Addition}
+Now that both lower level addition and subtraction algorithms have been established an effective high level signed addition algorithm can be
+established.  This high level addition algorithm will be what other algorithms and developers will use to perform addition of mp\_int data 
+types.  
+
+Recall from section 5.2 that an mp\_int represents an integer with an unsigned mantissa (\textit{the array of digits}) and a \textbf{sign} 
+flag.  A high level addition is actually performed as a series of eight separate cases which can be optimized down to three unique cases.
+
+\begin{figure}[!here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_add}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$  \\
+\textbf{Output}.  The signed addition $c = a + b$. \\
+\hline \\
+1.  if $a.sign = b.sign$ then do \\
+\hspace{3mm}1.1  $c.sign \leftarrow a.sign$  \\
+\hspace{3mm}1.2  $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add})\\
+2.  else do \\
+\hspace{3mm}2.1  if $\vert a \vert < \vert b \vert$ then do (\textit{mp\_cmp\_mag})  \\
+\hspace{6mm}2.1.1  $c.sign \leftarrow b.sign$ \\
+\hspace{6mm}2.1.2  $c \leftarrow \vert b \vert - \vert a \vert$ (\textit{s\_mp\_sub}) \\
+\hspace{3mm}2.2  else do \\
+\hspace{6mm}2.2.1  $c.sign \leftarrow a.sign$ \\
+\hspace{6mm}2.2.2  $c \leftarrow \vert a \vert - \vert b \vert$ \\
+3.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_add}
+\end{figure}
+
+\textbf{Algorithm mp\_add.}
+This algorithm performs the signed addition of two mp\_int variables.  There is no reference algorithm to draw upon from 
+either \cite{TAOCPV2} or \cite{HAC} since they both only provide unsigned operations.  The algorithm is fairly 
+straightforward but restricted since subtraction can only produce positive results.
+
+\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|}
+\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert > \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\
+\hline $+$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $+$ & $+$ & No  & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $-$ & No  & $c = a + b$ & $a.sign$ \\
+\hline &&&&\\
+
+\hline $+$ & $-$ & No  & $c = b - a$ & $b.sign$ \\
+\hline $-$ & $+$ & No  & $c = b - a$ & $b.sign$ \\
+
+\hline &&&&\\
+
+\hline $+$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\
+\hline $-$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\
+
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Addition Guide Chart}
+\label{fig:AddChart}
+\end{figure}
+
+Figure~\ref{fig:AddChart} lists all of the eight possible input combinations and is sorted to show that only three 
+specific cases need to be handled.  The return code of the unsigned operations at step 1.2, 2.1.2 and 2.2.2 are 
+forwarded to step three to check for errors.  This simplifies the description of the algorithm considerably and best 
+follows how the implementation actually was achieved.
+
+Also note how the \textbf{sign} is set before the unsigned addition or subtraction is performed.  Recall from the descriptions of algorithms
+s\_mp\_add and s\_mp\_sub that the mp\_clamp function is used at the end to trim excess digits.  The mp\_clamp algorithm will set the \textbf{sign}
+to \textbf{MP\_ZPOS} when the \textbf{used} digit count reaches zero.
+
+For example, consider performing $-a + a$ with algorithm mp\_add.  By the description of the algorithm the sign is set to \textbf{MP\_NEG} which would
+produce a result of $-0$.  However, since the sign is set first then the unsigned addition is performed the subsequent usage of algorithm mp\_clamp 
+within algorithm s\_mp\_add will force $-0$ to become $0$.  
+
+EXAM,bn_mp_add.c
+
+The source code follows the algorithm fairly closely.  The most notable new source code addition is the usage of the $res$ integer variable which
+is used to pass result of the unsigned operations forward.  Unlike in the algorithm, the variable $res$ is merely returned as is without
+explicitly checking it and returning the constant \textbf{MP\_OKAY}.  The observation is this algorithm will succeed or fail only if the lower
+level functions do so.  Returning their return code is sufficient.
+
+\subsection{High Level Subtraction}
+The high level signed subtraction algorithm is essentially the same as the high level signed addition algorithm.  
+
+\newpage\begin{figure}[!here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_sub}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$  \\
+\textbf{Output}.  The signed subtraction $c = a - b$. \\
+\hline \\
+1.  if $a.sign \ne b.sign$ then do \\
+\hspace{3mm}1.1  $c.sign \leftarrow a.sign$ \\
+\hspace{3mm}1.2  $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add}) \\
+2.  else do \\
+\hspace{3mm}2.1  if $\vert a \vert \ge \vert b \vert$ then do (\textit{mp\_cmp\_mag}) \\
+\hspace{6mm}2.1.1  $c.sign \leftarrow a.sign$ \\
+\hspace{6mm}2.1.2  $c \leftarrow \vert a \vert  - \vert b \vert$ (\textit{s\_mp\_sub}) \\
+\hspace{3mm}2.2  else do \\
+\hspace{6mm}2.2.1  $c.sign \leftarrow  \left \lbrace \begin{array}{ll}
+                              MP\_ZPOS &  \mbox{if }a.sign = MP\_NEG \\
+                              MP\_NEG  &  \mbox{otherwise} \\
+                              \end{array} \right .$ \\
+\hspace{6mm}2.2.2  $c \leftarrow \vert b \vert  - \vert a \vert$ \\
+3.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_sub}
+\end{figure}
+
+\textbf{Algorithm mp\_sub.}
+This algorithm performs the signed subtraction of two inputs.  Similar to algorithm mp\_add there is no reference in either \cite{TAOCPV2} or 
+\cite{HAC}.  Also this algorithm is restricted by algorithm s\_mp\_sub.  Chart \ref{fig:SubChart} lists the eight possible inputs and
+the operations required.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|}
+\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert \ge \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\
+\hline $+$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $+$ & $-$ & No  & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $+$ & No  & $c = a + b$ & $a.sign$ \\
+\hline &&&& \\
+\hline $+$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\
+\hline $-$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\
+\hline &&&& \\
+\hline $+$ & $+$ & No  & $c = b - a$ & $\mbox{opposite of }a.sign$ \\
+\hline $-$ & $-$ & No  & $c = b - a$ & $\mbox{opposite of }a.sign$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Subtraction Guide Chart}
+\label{fig:SubChart}
+\end{figure}
+
+Similar to the case of algorithm mp\_add the \textbf{sign} is set first before the unsigned addition or subtraction.  That is to prevent the 
+algorithm from producing $-a - -a = -0$ as a result.  
+
+EXAM,bn_mp_sub.c
+
+Much like the implementation of algorithm mp\_add the variable $res$ is used to catch the return code of the unsigned addition or subtraction operations
+and forward it to the end of the function.  On line @38, != MP_LT@ the ``not equal to'' \textbf{MP\_LT} expression is used to emulate a 
+``greater than or equal to'' comparison.  
+
+\section{Bit and Digit Shifting}
+MARK,POLY
+It is quite common to think of a multiple precision integer as a polynomial in $x$, that is $y = f(\beta)$ where $f(x) = \sum_{i=0}^{n-1} a_i x^i$.  
+This notation arises within discussion of Montgomery and Diminished Radix Reduction as well as Karatsuba multiplication and squaring.  
+
+In order to facilitate operations on polynomials in $x$ as above a series of simple ``digit'' algorithms have to be established.  That is to shift
+the digits left or right as well to shift individual bits of the digits left and right.  It is important to note that not all ``shift'' operations
+are on radix-$\beta$ digits.  
+
+\subsection{Multiplication by Two}
+
+In a binary system where the radix is a power of two multiplication by two not only arises often in other algorithms it is a fairly efficient 
+operation to perform.  A single precision logical shift left is sufficient to multiply a single digit by two.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul\_2}. \\
+\textbf{Input}.   One mp\_int $a$ \\
+\textbf{Output}.  $b = 2a$. \\
+\hline \\
+1.  If $b.alloc < a.used + 1$ then grow $b$ to hold $a.used + 1$ digits.  (\textit{mp\_grow}) \\
+2.  $oldused \leftarrow b.used$ \\
+3.  $b.used \leftarrow a.used$ \\
+4.  $r \leftarrow 0$ \\
+5.  for $n$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}5.1  $rr \leftarrow a_n >> (lg(\beta) - 1)$ \\
+\hspace{3mm}5.2  $b_n \leftarrow (a_n << 1) + r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}5.3  $r \leftarrow rr$ \\
+6.  If $r \ne 0$ then do \\
+\hspace{3mm}6.1  $b_{n + 1} \leftarrow r$ \\
+\hspace{3mm}6.2  $b.used \leftarrow b.used + 1$ \\
+7.  If $b.used < oldused - 1$ then do \\
+\hspace{3mm}7.1  for $n$ from $b.used$ to $oldused - 1$ do \\
+\hspace{6mm}7.1.1  $b_n \leftarrow 0$ \\
+8.  $b.sign \leftarrow a.sign$ \\
+9.  Return(\textit{MP\_OKAY}).\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul\_2}
+\end{figure}
+
+\textbf{Algorithm mp\_mul\_2.}
+This algorithm will quickly multiply a mp\_int by two provided $\beta$ is a power of two.  Neither \cite{TAOCPV2} nor \cite{HAC} describe such 
+an algorithm despite the fact it arises often in other algorithms.  The algorithm is setup much like the lower level algorithm s\_mp\_add since 
+it is for all intents and purposes equivalent to the operation $b = \vert a \vert + \vert a \vert$.  
+
+Step 1 and 2 grow the input as required to accomodate the maximum number of \textbf{used} digits in the result.  The initial \textbf{used} count
+is set to $a.used$ at step 4.  Only if there is a final carry will the \textbf{used} count require adjustment.
+
+Step 6 is an optimization implementation of the addition loop for this specific case.  That is since the two values being added together 
+are the same there is no need to perform two reads from the digits of $a$.  Step 6.1 performs a single precision shift on the current digit $a_n$ to
+obtain what will be the carry for the next iteration.  Step 6.2 calculates the $n$'th digit of the result as single precision shift of $a_n$ plus
+the previous carry.  Recall from ~SHIFTS~ that $a_n << 1$ is equivalent to $a_n \cdot 2$.  An iteration of the addition loop is finished with 
+forwarding the carry to the next iteration.
+
+Step 7 takes care of any final carry by setting the $a.used$'th digit of the result to the carry and augmenting the \textbf{used} count of $b$.  
+Step 8 clears any leading digits of $b$ in case it originally had a larger magnitude than $a$.
+
+EXAM,bn_mp_mul_2.c
+
+This implementation is essentially an optimized implementation of s\_mp\_add for the case of doubling an input.  The only noteworthy difference
+is the use of the logical shift operator on line @52,<<@ to perform a single precision doubling.  
+
+\subsection{Division by Two}
+A division by two can just as easily be accomplished with a logical shift right as multiplication by two can be with a logical shift left.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div\_2}. \\
+\textbf{Input}.   One mp\_int $a$ \\
+\textbf{Output}.  $b = a/2$. \\
+\hline \\
+1.  If $b.alloc < a.used$ then grow $b$ to hold $a.used$ digits.  (\textit{mp\_grow}) \\
+2.  If the reallocation failed return(\textit{MP\_MEM}). \\
+3.  $oldused \leftarrow b.used$ \\
+4.  $b.used \leftarrow a.used$ \\
+5.  $r \leftarrow 0$ \\
+6.  for $n$ from $b.used - 1$ to $0$ do \\
+\hspace{3mm}6.1  $rr \leftarrow a_n \mbox{ (mod }2\mbox{)}$\\
+\hspace{3mm}6.2  $b_n \leftarrow (a_n >> 1) + (r << (lg(\beta) - 1)) \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}6.3  $r \leftarrow rr$ \\
+7.  If $b.used < oldused - 1$ then do \\
+\hspace{3mm}7.1  for $n$ from $b.used$ to $oldused - 1$ do \\
+\hspace{6mm}7.1.1  $b_n \leftarrow 0$ \\
+8.  $b.sign \leftarrow a.sign$ \\
+9.  Clamp excess digits of $b$.  (\textit{mp\_clamp}) \\
+10.  Return(\textit{MP\_OKAY}).\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div\_2}
+\end{figure}
+
+\textbf{Algorithm mp\_div\_2.}
+This algorithm will divide an mp\_int by two using logical shifts to the right.  Like mp\_mul\_2 it uses a modified low level addition
+core as the basis of the algorithm.  Unlike mp\_mul\_2 the shift operations work from the leading digit to the trailing digit.  The algorithm
+could be written to work from the trailing digit to the leading digit however, it would have to stop one short of $a.used - 1$ digits to prevent
+reading past the end of the array of digits.
+
+Essentially the loop at step 6 is similar to that of mp\_mul\_2 except the logical shifts go in the opposite direction and the carry is at the 
+least significant bit not the most significant bit.  
+
+EXAM,bn_mp_div_2.c
+
+\section{Polynomial Basis Operations}
+Recall from ~POLY~ that any integer can be represented as a polynomial in $x$ as $y = f(\beta)$.  Such a representation is also known as
+the polynomial basis \cite[pp. 48]{ROSE}. Given such a notation a multiplication or division by $x$ amounts to shifting whole digits a single 
+place.  The need for such operations arises in several other higher level algorithms such as Barrett and Montgomery reduction, integer
+division and Karatsuba multiplication.  
+
+Converting from an array of digits to polynomial basis is very simple.  Consider the integer $y \equiv (a_2, a_1, a_0)_{\beta}$ and recall that
+$y = \sum_{i=0}^{2} a_i \beta^i$.  Simply replace $\beta$ with $x$ and the expression is in polynomial basis.  For example, $f(x) = 8x + 9$ is the
+polynomial basis representation for $89$ using radix ten.  That is, $f(10) = 8(10) + 9 = 89$.  
+
+\subsection{Multiplication by $x$}
+
+Given a polynomial in $x$ such as $f(x) = a_n x^n + a_{n-1} x^{n-1} + ... + a_0$ multiplying by $x$ amounts to shifting the coefficients up one 
+degree.  In this case $f(x) \cdot x = a_n x^{n+1} + a_{n-1} x^n + ... + a_0 x$.  From a scalar basis point of view multiplying by $x$ is equivalent to
+multiplying by the integer $\beta$.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_lshd}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $a \leftarrow a \cdot \beta^b$ (equivalent to multiplication by $x^b$). \\
+\hline \\
+1.  If $b \le 0$ then return(\textit{MP\_OKAY}). \\
+2.  If $a.alloc < a.used + b$ then grow $a$ to at least $a.used + b$ digits.  (\textit{mp\_grow}). \\
+3.  If the reallocation failed return(\textit{MP\_MEM}). \\
+4.  $a.used \leftarrow a.used + b$ \\
+5.  $i \leftarrow a.used - 1$ \\
+6.  $j \leftarrow a.used - 1 - b$ \\
+7.  for $n$ from $a.used - 1$ to $b$ do \\
+\hspace{3mm}7.1  $a_{i} \leftarrow a_{j}$ \\
+\hspace{3mm}7.2  $i \leftarrow i - 1$ \\
+\hspace{3mm}7.3  $j \leftarrow j - 1$ \\
+8.  for $n$ from 0 to $b - 1$ do \\
+\hspace{3mm}8.1  $a_n \leftarrow 0$ \\
+9.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_lshd}
+\end{figure}
+
+\textbf{Algorithm mp\_lshd.}
+This algorithm multiplies an mp\_int by the $b$'th power of $x$.  This is equivalent to multiplying by $\beta^b$.  The algorithm differs 
+from the other algorithms presented so far as it performs the operation in place instead storing the result in a separate location.  The
+motivation behind this change is due to the way this function is typically used.  Algorithms such as mp\_add store the result in an optionally
+different third mp\_int because the original inputs are often still required.  Algorithm mp\_lshd (\textit{and similarly algorithm mp\_rshd}) is
+typically used on values where the original value is no longer required.  The algorithm will return success immediately if 
+$b \le 0$ since the rest of algorithm is only valid when $b > 0$.  
+
+First the destination $a$ is grown as required to accomodate the result.  The counters $i$ and $j$ are used to form a \textit{sliding window} over
+the digits of $a$ of length $b$.  The head of the sliding window is at $i$ (\textit{the leading digit}) and the tail at $j$ (\textit{the trailing digit}).  
+The loop on step 7 copies the digit from the tail to the head.  In each iteration the window is moved down one digit.   The last loop on 
+step 8 sets the lower $b$ digits to zero.
+
+\newpage
+FIGU,sliding_window,Sliding Window Movement
+
+EXAM,bn_mp_lshd.c
+
+The if statement (line @24,if@) ensures that the $b$ variable is greater than zero since we do not interpret negative
+shift counts properly.  The \textbf{used} count is incremented by $b$ before the copy loop begins.  This elminates 
+the need for an additional variable in the for loop.  The variable $top$ (line @42,top@) is an alias
+for the leading digit while $bottom$ (line @45,bottom@) is an alias for the trailing edge.  The aliases form a 
+window of exactly $b$ digits over the input.  
+
+\subsection{Division by $x$}
+
+Division by powers of $x$ is easily achieved by shifting the digits right and removing any that will end up to the right of the zero'th digit.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_rshd}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $a \leftarrow a / \beta^b$ (Divide by $x^b$). \\
+\hline \\
+1.  If $b \le 0$ then return. \\
+2.  If $a.used \le b$ then do \\
+\hspace{3mm}2.1  Zero $a$.  (\textit{mp\_zero}). \\
+\hspace{3mm}2.2  Return. \\
+3.  $i \leftarrow 0$ \\
+4.  $j \leftarrow b$ \\
+5.  for $n$ from 0 to $a.used - b - 1$ do \\
+\hspace{3mm}5.1  $a_i \leftarrow a_j$ \\
+\hspace{3mm}5.2  $i \leftarrow i + 1$ \\
+\hspace{3mm}5.3  $j \leftarrow j + 1$ \\
+6.  for $n$ from $a.used - b$ to $a.used - 1$ do \\
+\hspace{3mm}6.1  $a_n \leftarrow 0$ \\
+7.  $a.used \leftarrow a.used - b$ \\
+8.  Return. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_rshd}
+\end{figure}
+
+\textbf{Algorithm mp\_rshd.}
+This algorithm divides the input in place by the $b$'th power of $x$.  It is analogous to dividing by a $\beta^b$ but much quicker since
+it does not require single precision division.  This algorithm does not actually return an error code as it cannot fail.  
+
+If the input $b$ is less than one the algorithm quickly returns without performing any work.  If the \textbf{used} count is less than or equal
+to the shift count $b$ then it will simply zero the input and return.
+
+After the trivial cases of inputs have been handled the sliding window is setup.  Much like the case of algorithm mp\_lshd a sliding window that
+is $b$ digits wide is used to copy the digits.  Unlike mp\_lshd the window slides in the opposite direction from the trailing to the leading digit.  
+Also the digits are copied from the leading to the trailing edge.
+
+Once the window copy is complete the upper digits must be zeroed and the \textbf{used} count decremented.
+
+EXAM,bn_mp_rshd.c
+
+The only noteworthy element of this routine is the lack of a return type since it cannot fail.  Like mp\_lshd() we
+form a sliding window except we copy in the other direction.  After the window (line @59,for (;@) we then zero
+the upper digits of the input to make sure the result is correct.
+
+\section{Powers of Two}
+
+Now that algorithms for moving single bits as well as whole digits exist algorithms for moving the ``in between'' distances are required.  For 
+example, to quickly multiply by $2^k$ for any $k$ without using a full multiplier algorithm would prove useful.  Instead of performing single
+shifts $k$ times to achieve a multiplication by $2^{\pm k}$ a mixture of whole digit shifting and partial digit shifting is employed.  
+
+\subsection{Multiplication by Power of Two}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul\_2d}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $c \leftarrow a \cdot 2^b$. \\
+\hline \\
+1.  $c \leftarrow a$.  (\textit{mp\_copy}) \\
+2.  If $c.alloc < c.used + \lfloor b / lg(\beta) \rfloor + 2$ then grow $c$ accordingly. \\
+3.  If the reallocation failed return(\textit{MP\_MEM}). \\
+4.  If $b \ge lg(\beta)$ then \\
+\hspace{3mm}4.1  $c \leftarrow c \cdot \beta^{\lfloor b / lg(\beta) \rfloor}$ (\textit{mp\_lshd}). \\
+\hspace{3mm}4.2  If step 4.1 failed return(\textit{MP\_MEM}). \\
+5.  $d \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
+6.  If $d \ne 0$ then do \\
+\hspace{3mm}6.1  $mask \leftarrow 2^d$ \\
+\hspace{3mm}6.2  $r \leftarrow 0$ \\
+\hspace{3mm}6.3  for $n$ from $0$ to $c.used - 1$ do \\
+\hspace{6mm}6.3.1  $rr \leftarrow c_n >> (lg(\beta) - d) \mbox{ (mod }mask\mbox{)}$ \\
+\hspace{6mm}6.3.2  $c_n \leftarrow (c_n << d) + r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}6.3.3  $r \leftarrow rr$ \\
+\hspace{3mm}6.4  If $r > 0$ then do \\
+\hspace{6mm}6.4.1  $c_{c.used} \leftarrow r$ \\
+\hspace{6mm}6.4.2  $c.used \leftarrow c.used + 1$ \\
+7.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul\_2d}
+\end{figure}
+
+\textbf{Algorithm mp\_mul\_2d.}
+This algorithm multiplies $a$ by $2^b$ and stores the result in $c$.  The algorithm uses algorithm mp\_lshd and a derivative of algorithm mp\_mul\_2 to
+quickly compute the product.
+
+First the algorithm will multiply $a$ by $x^{\lfloor b / lg(\beta) \rfloor}$ which will ensure that the remainder multiplicand is less than 
+$\beta$.  For example, if $b = 37$ and $\beta = 2^{28}$ then this step will multiply by $x$ leaving a multiplication by $2^{37 - 28} = 2^{9}$ 
+left.
+
+After the digits have been shifted appropriately at most $lg(\beta) - 1$ shifts are left to perform.  Step 5 calculates the number of remaining shifts 
+required.  If it is non-zero a modified shift loop is used to calculate the remaining product.  
+Essentially the loop is a generic version of algorith mp\_mul2 designed to handle any shift count in the range $1 \le x < lg(\beta)$.  The $mask$
+variable is used to extract the upper $d$ bits to form the carry for the next iteration.  
+
+This algorithm is loosely measured as a $O(2n)$ algorithm which means that if the input is $n$-digits that it takes $2n$ ``time'' to 
+complete.  It is possible to optimize this algorithm down to a $O(n)$ algorithm at a cost of making the algorithm slightly harder to follow.
+
+EXAM,bn_mp_mul_2d.c
+
+The shifting is performed in--place which means the first step (line @24,a != c@) is to copy the input to the 
+destination.  We avoid calling mp\_copy() by making sure the mp\_ints are different.  The destination then
+has to be grown (line @31,grow@) to accomodate the result.
+
+If the shift count $b$ is larger than $lg(\beta)$ then a call to mp\_lshd() is used to handle all of the multiples 
+of $lg(\beta)$.  Leaving only a remaining shift of $lg(\beta) - 1$ or fewer bits left.  Inside the actual shift 
+loop (lines @45,if@ to @76,}@) we make use of pre--computed values $shift$ and $mask$.   These are used to
+extract the carry bit(s) to pass into the next iteration of the loop.  The $r$ and $rr$ variables form a 
+chain between consecutive iterations to propagate the carry.  
+
+\subsection{Division by Power of Two}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div\_2d}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $c \leftarrow \lfloor a / 2^b \rfloor, d \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\
+\hline \\
+1.  If $b \le 0$ then do \\
+\hspace{3mm}1.1  $c \leftarrow a$ (\textit{mp\_copy}) \\
+\hspace{3mm}1.2  $d \leftarrow 0$ (\textit{mp\_zero}) \\
+\hspace{3mm}1.3  Return(\textit{MP\_OKAY}). \\
+2.  $c \leftarrow a$ \\
+3.  $d \leftarrow a \mbox{ (mod }2^b\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+4.  If $b \ge lg(\beta)$ then do \\
+\hspace{3mm}4.1  $c \leftarrow \lfloor c/\beta^{\lfloor b/lg(\beta) \rfloor} \rfloor$ (\textit{mp\_rshd}). \\
+5.  $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
+6.  If $k \ne 0$ then do \\
+\hspace{3mm}6.1  $mask \leftarrow 2^k$ \\
+\hspace{3mm}6.2  $r \leftarrow 0$ \\
+\hspace{3mm}6.3  for $n$ from $c.used - 1$ to $0$ do \\
+\hspace{6mm}6.3.1  $rr \leftarrow c_n \mbox{ (mod }mask\mbox{)}$ \\
+\hspace{6mm}6.3.2  $c_n \leftarrow (c_n >> k) + (r << (lg(\beta) - k))$ \\
+\hspace{6mm}6.3.3  $r \leftarrow rr$ \\
+7.  Clamp excess digits of $c$.  (\textit{mp\_clamp}) \\
+8.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div\_2d}
+\end{figure}
+
+\textbf{Algorithm mp\_div\_2d.}
+This algorithm will divide an input $a$ by $2^b$ and produce the quotient and remainder.  The algorithm is designed much like algorithm 
+mp\_mul\_2d by first using whole digit shifts then single precision shifts.  This algorithm will also produce the remainder of the division
+by using algorithm mp\_mod\_2d.
+
+EXAM,bn_mp_div_2d.c
+
+The implementation of algorithm mp\_div\_2d is slightly different than the algorithm specifies.  The remainder $d$ may be optionally 
+ignored by passing \textbf{NULL} as the pointer to the mp\_int variable.    The temporary mp\_int variable $t$ is used to hold the 
+result of the remainder operation until the end.  This allows $d$ and $a$ to represent the same mp\_int without modifying $a$ before
+the quotient is obtained.
+
+The remainder of the source code is essentially the same as the source code for mp\_mul\_2d.  The only significant difference is
+the direction of the shifts.
+
+\subsection{Remainder of Division by Power of Two}
+
+The last algorithm in the series of polynomial basis power of two algorithms is calculating the remainder of division by $2^b$.  This
+algorithm benefits from the fact that in twos complement arithmetic $a \mbox{ (mod }2^b\mbox{)}$ is the same as $a$ AND $2^b - 1$.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mod\_2d}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $c \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\
+\hline \\
+1.  If $b \le 0$ then do \\
+\hspace{3mm}1.1  $c \leftarrow 0$ (\textit{mp\_zero}) \\
+\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
+2.  If $b > a.used \cdot lg(\beta)$ then do \\
+\hspace{3mm}2.1  $c \leftarrow a$ (\textit{mp\_copy}) \\
+\hspace{3mm}2.2  Return the result of step 2.1. \\
+3.  $c \leftarrow a$ \\
+4.  If step 3 failed return(\textit{MP\_MEM}). \\
+5.  for $n$ from $\lceil b / lg(\beta) \rceil$ to $c.used$ do \\
+\hspace{3mm}5.1  $c_n \leftarrow 0$ \\
+6.  $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
+7.  $c_{\lfloor b / lg(\beta) \rfloor} \leftarrow c_{\lfloor b / lg(\beta) \rfloor} \mbox{ (mod }2^{k}\mbox{)}$. \\
+8.  Clamp excess digits of $c$.  (\textit{mp\_clamp}) \\
+9.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mod\_2d}
+\end{figure}
+
+\textbf{Algorithm mp\_mod\_2d.}
+This algorithm will quickly calculate the value of $a \mbox{ (mod }2^b\mbox{)}$.  First if $b$ is less than or equal to zero the 
+result is set to zero.  If $b$ is greater than the number of bits in $a$ then it simply copies $a$ to $c$ and returns.  Otherwise, $a$ 
+is copied to $b$, leading digits are removed and the remaining leading digit is trimed to the exact bit count.
+
+EXAM,bn_mp_mod_2d.c
+
+We first avoid cases of $b \le 0$ by simply mp\_zero()'ing the destination in such cases.  Next if $2^b$ is larger
+than the input we just mp\_copy() the input and return right away.  After this point we know we must actually
+perform some work to produce the remainder.
+
+Recalling that reducing modulo $2^k$ and a binary ``and'' with $2^k - 1$ are numerically equivalent we can quickly reduce 
+the number.  First we zero any digits above the last digit in $2^b$ (line @41,for@).  Next we reduce the 
+leading digit of both (line @45,&=@) and then mp\_clamp().
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 3 \right ] $ & Devise an algorithm that performs $a \cdot 2^b$ for generic values of $b$ \\
+                      & in $O(n)$ time. \\
+                      &\\
+$\left [ 3 \right ] $ & Devise an efficient algorithm to multiply by small low hamming  \\
+                      & weight values such as $3$, $5$ and $9$.  Extend it to handle all values \\
+                      & upto $64$ with a hamming weight less than three. \\
+                      &\\
+$\left [ 2 \right ] $ & Modify the preceding algorithm to handle values of the form \\
+                      & $2^k - 1$ as well. \\
+                      &\\
+$\left [ 3 \right ] $ & Using only algorithms mp\_mul\_2, mp\_div\_2 and mp\_add create an \\
+                      & algorithm to multiply two integers in roughly $O(2n^2)$ time for \\
+                      & any $n$-bit input.  Note that the time of addition is ignored in the \\
+                      & calculation.  \\
+                      & \\
+$\left [ 5 \right ] $ & Improve the previous algorithm to have a working time of at most \\
+                      & $O \left (2^{(k-1)}n + \left ({2n^2 \over k} \right ) \right )$ for an appropriate choice of $k$.  Again ignore \\
+                      & the cost of addition. \\
+                      & \\
+$\left [ 2 \right ] $ & Devise a chart to find optimal values of $k$ for the previous problem \\
+                      & for $n = 64 \ldots 1024$ in steps of $64$. \\
+                      & \\
+$\left [ 2 \right ] $ & Using only algorithms mp\_abs and mp\_sub devise another method for \\
+                      & calculating the result of a signed comparison. \\
+                      &
+\end{tabular}
+
+\chapter{Multiplication and Squaring}
+\section{The Multipliers}
+For most number theoretic problems including certain public key cryptographic algorithms, the ``multipliers'' form the most important subset of 
+algorithms of any multiple precision integer package.  The set of multiplier algorithms include integer multiplication, squaring and modular reduction 
+where in each of the algorithms single precision multiplication is the dominant operation performed.  This chapter will discuss integer multiplication 
+and squaring, leaving modular reductions for the subsequent chapter.  
+
+The importance of the multiplier algorithms is for the most part driven by the fact that certain popular public key algorithms are based on modular 
+exponentiation, that is computing $d \equiv a^b \mbox{ (mod }c\mbox{)}$ for some arbitrary choice of $a$, $b$, $c$ and $d$.  During a modular
+exponentiation the majority\footnote{Roughly speaking a modular exponentiation will spend about 40\% of the time performing modular reductions, 
+35\% of the time performing squaring and 25\% of the time performing multiplications.} of the processor time is spent performing single precision 
+multiplications.
+
+For centuries general purpose multiplication has required a lengthly $O(n^2)$ process, whereby each digit of one multiplicand has to be multiplied 
+against every digit of the other multiplicand.  Traditional long-hand multiplication is based on this process;  while the techniques can differ the 
+overall algorithm used is essentially the same.  Only ``recently'' have faster algorithms been studied.  First Karatsuba multiplication was discovered in 
+1962.  This algorithm can multiply two numbers with considerably fewer single precision multiplications when compared to the long-hand approach.  
+This technique led to the discovery of polynomial basis algorithms (\textit{good reference?}) and subquently Fourier Transform based solutions.  
+
+\section{Multiplication}
+\subsection{The Baseline Multiplication}
+\label{sec:basemult}
+\index{baseline multiplication}
+Computing the product of two integers in software can be achieved using a trivial adaptation of the standard $O(n^2)$ long-hand multiplication
+algorithm that school children are taught.  The algorithm is considered an $O(n^2)$ algorithm since for two $n$-digit inputs $n^2$ single precision 
+multiplications are required.  More specifically for a $m$ and $n$ digit input $m \cdot n$ single precision multiplications are required.  To 
+simplify most discussions, it will be assumed that the inputs have comparable number of digits.  
+
+The ``baseline multiplication'' algorithm is designed to act as the ``catch-all'' algorithm, only to be used when the faster algorithms cannot be 
+used.  This algorithm does not use any particularly interesting optimizations and should ideally be avoided if possible.    One important 
+facet of this algorithm, is that it has been modified to only produce a certain amount of output digits as resolution.  The importance of this 
+modification will become evident during the discussion of Barrett modular reduction.  Recall that for a $n$ and $m$ digit input the product 
+will be at most $n + m$ digits.  Therefore, this algorithm can be reduced to a full multiplier by having it produce $n + m$ digits of the product.  
+
+Recall from ~GAMMA~ the definition of $\gamma$ as the number of bits in the type \textbf{mp\_digit}.  We shall now extend the variable set to 
+include $\alpha$ which shall represent the number of bits in the type \textbf{mp\_word}.  This implies that $2^{\alpha} > 2 \cdot \beta^2$.  The 
+constant $\delta = 2^{\alpha - 2lg(\beta)}$ will represent the maximal weight of any column in a product (\textit{see ~COMBA~ for more information}).
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_mul\_digs}. \\
+\textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
+\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
+\hline \\
+1.  If min$(a.used, b.used) < \delta$ then do \\
+\hspace{3mm}1.1  Calculate $c = \vert a \vert \cdot \vert b \vert$ by the Comba method (\textit{see algorithm~\ref{fig:COMBAMULT}}).  \\
+\hspace{3mm}1.2  Return the result of step 1.1 \\
+\\
+Allocate and initialize a temporary mp\_int. \\
+2.  Init $t$ to be of size $digs$ \\
+3.  If step 2 failed return(\textit{MP\_MEM}). \\
+4.  $t.used \leftarrow digs$ \\
+\\
+Compute the product. \\
+5.  for $ix$ from $0$ to $a.used - 1$ do \\
+\hspace{3mm}5.1  $u \leftarrow 0$ \\
+\hspace{3mm}5.2  $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
+\hspace{3mm}5.3  If $pb < 1$ then goto step 6. \\
+\hspace{3mm}5.4  for $iy$ from $0$ to $pb - 1$ do \\
+\hspace{6mm}5.4.1  $\hat r \leftarrow t_{iy + ix} + a_{ix} \cdot b_{iy} + u$ \\
+\hspace{6mm}5.4.2  $t_{iy + ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}5.4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+\hspace{3mm}5.5  if $ix + pb < digs$ then do \\
+\hspace{6mm}5.5.1  $t_{ix + pb} \leftarrow u$ \\
+6.  Clamp excess digits of $t$. \\
+7.  Swap $c$ with $t$ \\
+8.  Clear $t$ \\
+9.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm s\_mp\_mul\_digs}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_mul\_digs.}
+This algorithm computes the unsigned product of two inputs $a$ and $b$, limited to an output precision of $digs$ digits.  While it may seem
+a bit awkward to modify the function from its simple $O(n^2)$ description, the usefulness of partial multipliers will arise in a subsequent 
+algorithm.  The algorithm is loosely based on algorithm 14.12 from \cite[pp. 595]{HAC} and is similar to Algorithm M of Knuth \cite[pp. 268]{TAOCPV2}.  
+Algorithm s\_mp\_mul\_digs differs from these cited references since it can produce a variable output precision regardless of the precision of the 
+inputs.
+
+The first thing this algorithm checks for is whether a Comba multiplier can be used instead.   If the minimum digit count of either
+input is less than $\delta$, then the Comba method may be used instead.    After the Comba method is ruled out, the baseline algorithm begins.  A 
+temporary mp\_int variable $t$ is used to hold the intermediate result of the product.  This allows the algorithm to be used to 
+compute products when either $a = c$ or $b = c$ without overwriting the inputs.  
+
+All of step 5 is the infamous $O(n^2)$ multiplication loop slightly modified to only produce upto $digs$ digits of output.  The $pb$ variable
+is given the count of digits to read from $b$ inside the nested loop.  If $pb \le 1$ then no more output digits can be produced and the algorithm
+will exit the loop.  The best way to think of the loops are as a series of $pb \times 1$ multiplications.    That is, in each pass of the 
+innermost loop $a_{ix}$ is multiplied against $b$ and the result is added (\textit{with an appropriate shift}) to $t$.  
+
+For example, consider multiplying $576$ by $241$.  That is equivalent to computing $10^0(1)(576) + 10^1(4)(576) + 10^2(2)(576)$ which is best
+visualized in the following table.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|c|l|}
+\hline   &&          & 5 & 7 & 6 & \\
+\hline   $\times$&&  & 2 & 4 & 1 & \\
+\hline &&&&&&\\
+  &&          & 5 & 7 & 6 & $10^0(1)(576)$ \\
+  &2 &   3    & 6 & 1 & 6 & $10^1(4)(576) + 10^0(1)(576)$ \\
+  1 & 3 & 8 & 8 & 1 & 6 &   $10^2(2)(576) + 10^1(4)(576) + 10^0(1)(576)$ \\
+\hline  
+\end{tabular}
+\end{center}
+\caption{Long-Hand Multiplication Diagram}
+\end{figure}
+
+Each row of the product is added to the result after being shifted to the left (\textit{multiplied by a power of the radix}) by the appropriate 
+count.  That is in pass $ix$ of the inner loop the product is added starting at the $ix$'th digit of the reult.
+
+Step 5.4.1 introduces the hat symbol (\textit{e.g. $\hat r$}) which represents a double precision variable.  The multiplication on that step
+is assumed to be a double wide output single precision multiplication.  That is, two single precision variables are multiplied to produce a
+double precision result.  The step is somewhat optimized from a long-hand multiplication algorithm because the carry from the addition in step
+5.4.1 is propagated through the nested loop.  If the carry was not propagated immediately it would overflow the single precision digit 
+$t_{ix+iy}$ and the result would be lost.  
+
+At step 5.5 the nested loop is finished and any carry that was left over should be forwarded.  The carry does not have to be added to the $ix+pb$'th
+digit since that digit is assumed to be zero at this point.  However, if $ix + pb \ge digs$ the carry is not set as it would make the result
+exceed the precision requested.
+
+EXAM,bn_s_mp_mul_digs.c
+
+First we determine (line @30,if@) if the Comba method can be used first since it's faster.  The conditions for 
+sing the Comba routine are that min$(a.used, b.used) < \delta$ and the number of digits of output is less than 
+\textbf{MP\_WARRAY}.  This new constant is used to control the stack usage in the Comba routines.  By default it is 
+set to $\delta$ but can be reduced when memory is at a premium.
+
+If we cannot use the Comba method we proceed to setup the baseline routine.  We allocate the the destination mp\_int
+$t$ (line @36,init@) to the exact size of the output to avoid further re--allocations.  At this point we now 
+begin the $O(n^2)$ loop.
+
+This implementation of multiplication has the caveat that it can be trimmed to only produce a variable number of
+digits as output.  In each iteration of the outer loop the $pb$ variable is set (line @48,MIN@) to the maximum 
+number of inner loop iterations.  
+
+Inside the inner loop we calculate $\hat r$ as the mp\_word product of the two mp\_digits and the addition of the
+carry from the previous iteration.  A particularly important observation is that most modern optimizing 
+C compilers (GCC for instance) can recognize that a $N \times N \rightarrow 2N$ multiplication is all that 
+is required for the product.  In x86 terms for example, this means using the MUL instruction.
+
+Each digit of the product is stored in turn (line @68,tmpt@) and the carry propagated (line @71,>>@) to the 
+next iteration.
+
+\subsection{Faster Multiplication by the ``Comba'' Method}
+MARK,COMBA
+
+One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be 
+computed and propagated upwards.  This makes the nested loop very sequential and hard to unroll and implement 
+in parallel.  The ``Comba'' \cite{COMBA} method is named after little known (\textit{in cryptographic venues}) Paul G. 
+Comba who described a method of implementing fast multipliers that do not require nested carry fixup operations.  As an 
+interesting aside it seems that Paul Barrett describes a similar technique in his 1986 paper \cite{BARRETT} written 
+five years before.
+
+At the heart of the Comba technique is once again the long-hand algorithm.  Except in this case a slight 
+twist is placed on how the columns of the result are produced.  In the standard long-hand algorithm rows of products 
+are produced then added together to form the final result.  In the baseline algorithm the columns are added together 
+after each iteration to get the result instantaneously.  
+
+In the Comba algorithm the columns of the result are produced entirely independently of each other.  That is at 
+the $O(n^2)$ level a simple multiplication and addition step is performed.  The carries of the columns are propagated 
+after the nested loop to reduce the amount of work requiored. Succintly the first step of the algorithm is to compute 
+the product vector $\vec x$ as follows. 
+
+\begin{equation}
+\vec x_n = \sum_{i+j = n} a_ib_j, \forall n \in \lbrace 0, 1, 2, \ldots, i + j \rbrace
+\end{equation}
+
+Where $\vec x_n$ is the $n'th$ column of the output vector.  Consider the following example which computes the vector $\vec x$ for the multiplication
+of $576$ and $241$.  
+
+\newpage\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|c|}
+  \hline &          & 5 & 7 & 6 & First Input\\
+  \hline $\times$ & & 2 & 4 & 1 & Second Input\\
+\hline            &                        & $1 \cdot 5 = 5$   & $1 \cdot 7 = 7$   & $1 \cdot 6 = 6$ & First pass \\
+                  &  $4 \cdot 5 = 20$      & $4 \cdot 7+5=33$  & $4 \cdot 6+7=31$  & 6               & Second pass \\
+   $2 \cdot 5 = 10$ &  $2 \cdot 7 + 20 = 34$ & $2 \cdot 6+33=45$ & 31                & 6             & Third pass \\
+\hline 10 & 34 & 45 & 31 & 6 & Final Result \\   
+\hline   
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Comba Multiplication Diagram}
+\end{figure}
+
+At this point the vector $x = \left < 10, 34, 45, 31, 6 \right >$ is the result of the first step of the Comba multipler.  
+Now the columns must be fixed by propagating the carry upwards.  The resultant vector will have one extra dimension over the input vector which is
+congruent to adding a leading zero digit.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Comba Fixup}. \\
+\textbf{Input}.   Vector $\vec x$ of dimension $k$ \\
+\textbf{Output}.  Vector $\vec x$ such that the carries have been propagated. \\
+\hline \\
+1.  for $n$ from $0$ to $k - 1$ do \\
+\hspace{3mm}1.1 $\vec x_{n+1} \leftarrow \vec x_{n+1} + \lfloor \vec x_{n}/\beta \rfloor$ \\
+\hspace{3mm}1.2 $\vec x_{n} \leftarrow \vec x_{n} \mbox{ (mod }\beta\mbox{)}$ \\
+2.  Return($\vec x$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Comba Fixup}
+\end{figure}
+
+With that algorithm and $k = 5$ and $\beta = 10$ the following vector is produced $\vec x= \left < 1, 3, 8, 8, 1, 6 \right >$.  In this case 
+$241 \cdot 576$ is in fact $138816$ and the procedure succeeded.  If the algorithm is correct and as will be demonstrated shortly more
+efficient than the baseline algorithm why not simply always use this algorithm?
+
+\subsubsection{Column Weight.}
+At the nested $O(n^2)$ level the Comba method adds the product of two single precision variables to each column of the output 
+independently.  A serious obstacle is if the carry is lost, due to lack of precision before the algorithm has a chance to fix
+the carries.  For example, in the multiplication of two three-digit numbers the third column of output will be the sum of
+three single precision multiplications.  If the precision of the accumulator for the output digits is less then $3 \cdot (\beta - 1)^2$ then
+an overflow can occur and the carry information will be lost.  For any $m$ and $n$ digit inputs the maximum weight of any column is 
+min$(m, n)$ which is fairly obvious.
+
+The maximum number of terms in any column of a product is known as the ``column weight'' and strictly governs when the algorithm can be used.  Recall
+from earlier that a double precision type has $\alpha$ bits of resolution and a single precision digit has $lg(\beta)$ bits of precision.  Given these
+two quantities we must not violate the following
+
+\begin{equation}
+k \cdot \left (\beta - 1 \right )^2 < 2^{\alpha}
+\end{equation}
+
+Which reduces to 
+
+\begin{equation}
+k \cdot \left ( \beta^2 - 2\beta + 1 \right ) < 2^{\alpha}
+\end{equation}
+
+Let $\rho = lg(\beta)$ represent the number of bits in a single precision digit.  By further re-arrangement of the equation the final solution is
+found.
+
+\begin{equation}
+k  < {{2^{\alpha}} \over {\left (2^{2\rho} - 2^{\rho + 1} + 1 \right )}}
+\end{equation}
+
+The defaults for LibTomMath are $\beta = 2^{28}$ and $\alpha = 2^{64}$ which means that $k$ is bounded by $k < 257$.  In this configuration 
+the smaller input may not have more than $256$ digits if the Comba method is to be used.  This is quite satisfactory for most applications since 
+$256$ digits would allow for numbers in the range of $0 \le x < 2^{7168}$ which, is much larger than most public key cryptographic algorithms require.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{fast\_s\_mp\_mul\_digs}. \\
+\textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
+\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
+\hline \\
+Place an array of \textbf{MP\_WARRAY} single precision digits named $W$ on the stack. \\
+1.  If $c.alloc < digs$ then grow $c$ to $digs$ digits. (\textit{mp\_grow}) \\
+2.  If step 1 failed return(\textit{MP\_MEM}).\\
+\\
+3.  $pa \leftarrow \mbox{MIN}(digs, a.used + b.used)$ \\
+\\
+4.  $\_ \hat W \leftarrow 0$ \\
+5.  for $ix$ from 0 to $pa - 1$ do \\
+\hspace{3mm}5.1  $ty \leftarrow \mbox{MIN}(b.used - 1, ix)$ \\
+\hspace{3mm}5.2  $tx \leftarrow ix - ty$ \\
+\hspace{3mm}5.3  $iy \leftarrow \mbox{MIN}(a.used - tx, ty + 1)$ \\
+\hspace{3mm}5.4  for $iz$ from 0 to $iy - 1$ do \\
+\hspace{6mm}5.4.1  $\_ \hat W \leftarrow \_ \hat W + a_{tx+iy}b_{ty-iy}$ \\
+\hspace{3mm}5.5  $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$\\
+\hspace{3mm}5.6  $\_ \hat W \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\
+6.  $W_{pa} \leftarrow \_ \hat W (\mbox{mod }\beta)$ \\
+\\
+7.  $oldused \leftarrow c.used$ \\
+8.  $c.used \leftarrow digs$ \\
+9.  for $ix$ from $0$ to $pa$ do \\
+\hspace{3mm}9.1  $c_{ix} \leftarrow W_{ix}$ \\
+10.  for $ix$ from $pa + 1$ to $oldused - 1$ do \\
+\hspace{3mm}10.1 $c_{ix} \leftarrow 0$ \\
+\\
+11.  Clamp $c$. \\
+12.  Return MP\_OKAY. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm fast\_s\_mp\_mul\_digs}
+\label{fig:COMBAMULT}
+\end{figure}
+
+\textbf{Algorithm fast\_s\_mp\_mul\_digs.}
+This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision.
+
+The outer loop of this algorithm is more complicated than that of the baseline multiplier.  This is because on the inside of the 
+loop we want to produce one column per pass.  This allows the accumulator $\_ \hat W$ to be placed in CPU registers and
+reduce the memory bandwidth to two \textbf{mp\_digit} reads per iteration.
+
+The $ty$ variable is set to the minimum count of $ix$ or the number of digits in $b$.  That way if $a$ has more digits than
+$b$ this will be limited to $b.used - 1$.  The $tx$ variable is set to the to the distance past $b.used$ the variable
+$ix$ is.  This is used for the immediately subsequent statement where we find $iy$.  
+
+The variable $iy$ is the minimum digits we can read from either $a$ or $b$ before running out.  Computing one column at a time
+means we have to scan one integer upwards and the other downwards.  $a$ starts at $tx$ and $b$ starts at $ty$.  In each
+pass we are producing the $ix$'th output column and we note that $tx + ty = ix$.  As we move $tx$ upwards we have to 
+move $ty$ downards so the equality remains valid.  The $iy$ variable is the number of iterations until 
+$tx \ge a.used$ or $ty < 0$ occurs.
+
+After every inner pass we store the lower half of the accumulator into $W_{ix}$ and then propagate the carry of the accumulator
+into the next round by dividing $\_ \hat W$ by $\beta$.
+
+To measure the benefits of the Comba method over the baseline method consider the number of operations that are required.  If the 
+cost in terms of time of a multiply and addition is $p$ and the cost of a carry propagation is $q$ then a baseline multiplication would require 
+$O \left ((p + q)n^2 \right )$ time to multiply two $n$-digit numbers.  The Comba method requires only $O(pn^2 + qn)$ time, however in practice, 
+the speed increase is actually much more.  With $O(n)$ space the algorithm can be reduced to $O(pn + qn)$ time by implementing the $n$ multiply
+and addition operations in the nested loop in parallel.  
+
+EXAM,bn_fast_s_mp_mul_digs.c
+
+As per the pseudo--code we first calculate $pa$ (line @47,MIN@) as the number of digits to output.  Next we begin the outer loop
+to produce the individual columns of the product.  We use the two aliases $tmpx$ and $tmpy$ (lines @61,tmpx@, @62,tmpy@) to point
+inside the two multiplicands quickly.  
+
+The inner loop (lines @70,for@ to @72,}@) of this implementation is where the tradeoff come into play.  Originally this comba 
+implementation was ``row--major'' which means it adds to each of the columns in each pass.  After the outer loop it would then fix 
+the carries.  This was very fast except it had an annoying drawback.  You had to read a mp\_word and two mp\_digits and write 
+one mp\_word per iteration.  On processors such as the Athlon XP and P4 this did not matter much since the cache bandwidth 
+is very high and it can keep the ALU fed with data.  It did, however, matter on older and embedded cpus where cache is often 
+slower and also often doesn't exist.  This new algorithm only performs two reads per iteration under the assumption that the 
+compiler has aliased $\_ \hat W$ to a CPU register.
+
+After the inner loop we store the current accumulator in $W$ and shift $\_ \hat W$ (lines @75,W[ix]@, @78,>>@) to forward it as 
+a carry for the next pass.  After the outer loop we use the final carry (line @82,W[ix]@) as the last digit of the product.  
+
+\subsection{Polynomial Basis Multiplication}
+To break the $O(n^2)$ barrier in multiplication requires a completely different look at integer multiplication.  In the following algorithms
+the use of polynomial basis representation for two integers $a$ and $b$ as $f(x) = \sum_{i=0}^{n} a_i x^i$ and  
+$g(x) = \sum_{i=0}^{n} b_i x^i$ respectively, is required.  In this system both $f(x)$ and $g(x)$ have $n + 1$ terms and are of the $n$'th degree.
+ 
+The product $a \cdot b \equiv f(x)g(x)$ is the polynomial $W(x) = \sum_{i=0}^{2n} w_i x^i$.  The coefficients $w_i$ will
+directly yield the desired product when $\beta$ is substituted for $x$.  The direct solution to solve for the $2n + 1$ coefficients
+requires $O(n^2)$ time and would in practice be slower than the Comba technique.
+
+However, numerical analysis theory indicates that only $2n + 1$ distinct points in $W(x)$ are required to determine the values of the $2n + 1$ unknown 
+coefficients.   This means by finding $\zeta_y = W(y)$ for $2n + 1$ small values of $y$ the coefficients of $W(x)$ can be found with 
+Gaussian elimination.  This technique is also occasionally refered to as the \textit{interpolation technique} (\textit{references please...}) since in 
+effect an interpolation based on $2n + 1$ points will yield a polynomial equivalent to $W(x)$.  
+
+The coefficients of the polynomial $W(x)$ are unknown which makes finding $W(y)$ for any value of $y$ impossible.  However, since 
+$W(x) = f(x)g(x)$ the equivalent $\zeta_y = f(y) g(y)$ can be used in its place.  The benefit of this technique stems from the 
+fact that $f(y)$ and $g(y)$ are much smaller than either $a$ or $b$ respectively.  As a result finding the $2n + 1$ relations required 
+by multiplying $f(y)g(y)$ involves multiplying integers that are much smaller than either of the inputs.
+
+When picking points to gather relations there are always three obvious points to choose, $y = 0, 1$ and $ \infty$.  The $\zeta_0$ term
+is simply the product $W(0) = w_0 = a_0 \cdot b_0$.  The $\zeta_1$ term is the product 
+$W(1) = \left (\sum_{i = 0}^{n} a_i \right ) \left (\sum_{i = 0}^{n} b_i \right )$.  The third point $\zeta_{\infty}$ is less obvious but rather
+simple to explain.  The $2n + 1$'th coefficient of $W(x)$ is numerically equivalent to the most significant column in an integer multiplication.  
+The point at $\infty$ is used symbolically to represent the most significant column, that is $W(\infty) = w_{2n} = a_nb_n$.  Note that the 
+points at $y = 0$ and $\infty$ yield the coefficients $w_0$ and $w_{2n}$ directly.
+
+If more points are required they should be of small values and powers of two such as $2^q$ and the related \textit{mirror points} 
+$\left (2^q \right )^{2n}  \cdot \zeta_{2^{-q}}$ for small values of $q$.  The term ``mirror point'' stems from the fact that 
+$\left (2^q \right )^{2n}  \cdot \zeta_{2^{-q}}$ can be calculated in the exact opposite fashion as $\zeta_{2^q}$.  For
+example, when $n = 2$ and $q = 1$ then following two equations are equivalent to the point $\zeta_{2}$ and its mirror.
+
+\begin{eqnarray}
+\zeta_{2}                  = f(2)g(2) = (4a_2 + 2a_1 + a_0)(4b_2 + 2b_1 + b_0) \nonumber \\
+16 \cdot \zeta_{1 \over 2} = 4f({1\over 2}) \cdot 4g({1 \over 2}) = (a_2 + 2a_1 + 4a_0)(b_2 + 2b_1 + 4b_0)
+\end{eqnarray}
+
+Using such points will allow the values of $f(y)$ and $g(y)$ to be independently calculated using only left shifts.  For example, when $n = 2$ the
+polynomial $f(2^q)$ is equal to $2^q((2^qa_2) + a_1) + a_0$.  This technique of polynomial representation is known as Horner's method.  
+
+As a general rule of the algorithm when the inputs are split into $n$ parts each there are $2n - 1$ multiplications.  Each multiplication is of 
+multiplicands that have $n$ times fewer digits than the inputs.  The asymptotic running time of this algorithm is 
+$O \left ( k^{lg_n(2n - 1)} \right )$ for $k$ digit inputs (\textit{assuming they have the same number of digits}).  Figure~\ref{fig:exponent}
+summarizes the exponents for various values of $n$.
+
+\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|c|}
+\hline \textbf{Split into $n$ Parts} & \textbf{Exponent}  & \textbf{Notes}\\
+\hline $2$ & $1.584962501$ & This is Karatsuba Multiplication. \\
+\hline $3$ & $1.464973520$ & This is Toom-Cook Multiplication. \\
+\hline $4$ & $1.403677461$ &\\
+\hline $5$ & $1.365212389$ &\\
+\hline $10$ & $1.278753601$ &\\
+\hline $100$ & $1.149426538$ &\\
+\hline $1000$ & $1.100270931$ &\\
+\hline $10000$ & $1.075252070$ &\\
+\hline
+\end{tabular}
+\end{center}
+\caption{Asymptotic Running Time of Polynomial Basis Multiplication}
+\label{fig:exponent}
+\end{figure}
+
+At first it may seem like a good idea to choose $n = 1000$ since the exponent is approximately $1.1$.  However, the overhead
+of solving for the 2001 terms of $W(x)$ will certainly consume any savings the algorithm could offer for all but exceedingly large
+numbers.  
+
+\subsubsection{Cutoff Point}
+The polynomial basis multiplication algorithms all require fewer single precision multiplications than a straight Comba approach.  However, 
+the algorithms incur an overhead (\textit{at the $O(n)$ work level}) since they require a system of equations to be solved.  This makes the
+polynomial basis approach more costly to use with small inputs.
+
+Let $m$ represent the number of digits in the multiplicands (\textit{assume both multiplicands have the same number of digits}).  There exists a 
+point $y$ such that when $m < y$ the polynomial basis algorithms are more costly than Comba, when $m = y$ they are roughly the same cost and 
+when $m > y$ the Comba methods are slower than the polynomial basis algorithms.  
+
+The exact location of $y$ depends on several key architectural elements of the computer platform in question.
+
+\begin{enumerate}
+\item  The ratio of clock cycles for single precision multiplication versus other simpler operations such as addition, shifting, etc.  For example
+on the AMD Athlon the ratio is roughly $17 : 1$ while on the Intel P4 it is $29 : 1$.  The higher the ratio in favour of multiplication the lower
+the cutoff point $y$ will be.  
+
+\item  The complexity of the linear system of equations (\textit{for the coefficients of $W(x)$}) is.  Generally speaking as the number of splits
+grows the complexity grows substantially.  Ideally solving the system will only involve addition, subtraction and shifting of integers.  This
+directly reflects on the ratio previous mentioned.
+
+\item  To a lesser extent memory bandwidth and function call overheads.  Provided the values are in the processor cache this is less of an
+influence over the cutoff point.
+
+\end{enumerate}
+
+A clean cutoff point separation occurs when a point $y$ is found such that all of the cutoff point conditions are met.  For example, if the point
+is too low then there will be values of $m$ such that $m > y$ and the Comba method is still faster.  Finding the cutoff points is fairly simple when
+a high resolution timer is available.  
+
+\subsection{Karatsuba Multiplication}
+Karatsuba \cite{KARA} multiplication when originally proposed in 1962 was among the first set of algorithms to break the $O(n^2)$ barrier for
+general purpose multiplication.  Given two polynomial basis representations $f(x) = ax + b$ and $g(x) = cx + d$, Karatsuba proved with 
+light algebra \cite{KARAP} that the following polynomial is equivalent to multiplication of the two integers the polynomials represent.
+
+\begin{equation}
+f(x) \cdot g(x) = acx^2 + ((a - b)(c - d) - (ac + bd))x + bd
+\end{equation}
+
+Using the observation that $ac$ and $bd$ could be re-used only three half sized multiplications would be required to produce the product.  Applying
+this algorithm recursively, the work factor becomes $O(n^{lg(3)})$ which is substantially better than the work factor $O(n^2)$ of the Comba technique.  It turns 
+out what Karatsuba did not know or at least did not publish was that this is simply polynomial basis multiplication with the points 
+$\zeta_0$, $\zeta_{\infty}$ and $-\zeta_{-1}$.  Consider the resultant system of equations.
+
+\begin{center}
+\begin{tabular}{rcrcrcrc}
+$\zeta_{0}$ &      $=$ &  &  &  & & $w_0$ \\
+$-\zeta_{-1}$ &    $=$ & $-w_2$ & $+$ & $w_1$ & $-$ & $w_0$ \\
+$\zeta_{\infty}$ & $=$ & $w_2$ &  & &  & \\
+\end{tabular}
+\end{center}
+
+By adding the first and last equation to the equation in the middle the term $w_1$ can be isolated and all three coefficients solved for.  The simplicity
+of this system of equations has made Karatsuba fairly popular.  In fact the cutoff point is often fairly low\footnote{With LibTomMath 0.18 it is 70 and 109 digits for the Intel P4 and AMD Athlon respectively.}
+making it an ideal algorithm to speed up certain public key cryptosystems such as RSA and Diffie-Hellman.  It is worth noting that the point 
+$\zeta_1$ could be substituted for $-\zeta_{-1}$.  In this case the first and third row are subtracted instead of added to the second row.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_karatsuba\_mul}. \\
+\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
+\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert$ \\
+\hline \\
+1.  Init the following mp\_int variables: $x0$, $x1$, $y0$, $y1$, $t1$, $x0y0$, $x1y1$.\\
+2.  If step 2 failed then return(\textit{MP\_MEM}). \\
+\\
+Split the input.  e.g. $a = x1 \cdot \beta^B + x0$ \\
+3.  $B \leftarrow \mbox{min}(a.used, b.used)/2$ \\
+4.  $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+5.  $y0 \leftarrow b \mbox{ (mod }\beta^B\mbox{)}$ \\
+6.  $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_rshd}) \\
+7.  $y1 \leftarrow \lfloor b / \beta^B \rfloor$ \\
+\\
+Calculate the three products. \\
+8.  $x0y0 \leftarrow x0 \cdot y0$ (\textit{mp\_mul}) \\
+9.  $x1y1 \leftarrow x1 \cdot y1$ \\
+10.  $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
+11.  $x0 \leftarrow y1 - y0$ \\
+12.  $t1 \leftarrow t1 \cdot x0$ \\
+\\
+Calculate the middle term. \\
+13.  $x0 \leftarrow x0y0 + x1y1$ \\
+14.  $t1 \leftarrow x0 - t1$ \\
+\\
+Calculate the final product. \\
+15.  $t1 \leftarrow t1 \cdot \beta^B$ (\textit{mp\_lshd}) \\
+16.  $x1y1 \leftarrow x1y1 \cdot \beta^{2B}$ \\
+17.  $t1 \leftarrow x0y0 + t1$ \\
+18.  $c \leftarrow t1 + x1y1$ \\
+19.  Clear all of the temporary variables. \\
+20.  Return(\textit{MP\_OKAY}).\\
+\hline 
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_karatsuba\_mul}
+\end{figure}
+
+\textbf{Algorithm mp\_karatsuba\_mul.}
+This algorithm computes the unsigned product of two inputs using the Karatsuba multiplication algorithm.  It is loosely based on the description
+from Knuth \cite[pp. 294-295]{TAOCPV2}.  
+
+\index{radix point}
+In order to split the two inputs into their respective halves, a suitable \textit{radix point} must be chosen.  The radix point chosen must
+be used for both of the inputs meaning that it must be smaller than the smallest input.  Step 3 chooses the radix point $B$ as half of the 
+smallest input \textbf{used} count.  After the radix point is chosen the inputs are split into lower and upper halves.  Step 4 and 5 
+compute the lower halves.  Step 6 and 7 computer the upper halves.  
+
+After the halves have been computed the three intermediate half-size products must be computed.  Step 8 and 9 compute the trivial products
+$x0 \cdot y0$ and $x1 \cdot y1$.  The mp\_int $x0$ is used as a temporary variable after $x1 - x0$ has been computed.  By using $x0$ instead
+of an additional temporary variable, the algorithm can avoid an addition memory allocation operation.
+
+The remaining steps 13 through 18 compute the Karatsuba polynomial through a variety of digit shifting and addition operations.
+
+EXAM,bn_mp_karatsuba_mul.c
+
+The new coding element in this routine, not  seen in previous routines, is the usage of goto statements.  The conventional
+wisdom is that goto statements should be avoided.  This is generally true, however when every single function call can fail, it makes sense
+to handle error recovery with a single piece of code.  Lines @61,if@ to @75,if@ handle initializing all of the temporary variables 
+required.  Note how each of the if statements goes to a different label in case of failure.  This allows the routine to correctly free only
+the temporaries that have been successfully allocated so far.
+
+The temporary variables are all initialized using the mp\_init\_size routine since they are expected to be large.  This saves the 
+additional reallocation that would have been necessary.  Also $x0$, $x1$, $y0$ and $y1$ have to be able to hold at least their respective
+number of digits for the next section of code.
+
+The first algebraic portion of the algorithm is to split the two inputs into their halves.  However, instead of using mp\_mod\_2d and mp\_rshd
+to extract the halves, the respective code has been placed inline within the body of the function.  To initialize the halves, the \textbf{used} and 
+\textbf{sign} members are copied first.  The first for loop on line @98,for@ copies the lower halves.  Since they are both the same magnitude it 
+is simpler to calculate both lower halves in a single loop.  The for loop on lines @104,for@ and @109,for@ calculate the upper halves $x1$ and 
+$y1$ respectively.
+
+By inlining the calculation of the halves, the Karatsuba multiplier has a slightly lower overhead and can be used for smaller magnitude inputs.
+
+When line @152,err@ is reached, the algorithm has completed succesfully.  The ``error status'' variable $err$ is set to \textbf{MP\_OKAY} so that
+the same code that handles errors can be used to clear the temporary variables and return.  
+
+\subsection{Toom-Cook $3$-Way Multiplication}
+Toom-Cook $3$-Way \cite{TOOM} multiplication is essentially the polynomial basis algorithm for $n = 2$ except that the points  are 
+chosen such that $\zeta$ is easy to compute and the resulting system of equations easy to reduce.  Here, the points $\zeta_{0}$, 
+$16 \cdot \zeta_{1 \over 2}$, $\zeta_1$, $\zeta_2$ and $\zeta_{\infty}$ make up the five required points to solve for the coefficients 
+of the $W(x)$.
+
+With the five relations that Toom-Cook specifies, the following system of equations is formed.
+
+\begin{center}
+\begin{tabular}{rcrcrcrcrcr}
+$\zeta_0$                    & $=$ & $0w_4$ & $+$ & $0w_3$ & $+$ & $0w_2$ & $+$ & $0w_1$ & $+$ & $1w_0$  \\
+$16 \cdot \zeta_{1 \over 2}$ & $=$ & $1w_4$ & $+$ & $2w_3$ & $+$ & $4w_2$ & $+$ & $8w_1$ & $+$ & $16w_0$  \\
+$\zeta_1$                    & $=$ & $1w_4$ & $+$ & $1w_3$ & $+$ & $1w_2$ & $+$ & $1w_1$ & $+$ & $1w_0$  \\
+$\zeta_2$                    & $=$ & $16w_4$ & $+$ & $8w_3$ & $+$ & $4w_2$ & $+$ & $2w_1$ & $+$ & $1w_0$  \\
+$\zeta_{\infty}$             & $=$ & $1w_4$ & $+$ & $0w_3$ & $+$ & $0w_2$ & $+$ & $0w_1$ & $+$ & $0w_0$  \\
+\end{tabular}
+\end{center}
+
+A trivial solution to this matrix requires $12$ subtractions, two multiplications by a small power of two, two divisions by a small power
+of two, two divisions by three and one multiplication by three.  All of these $19$ sub-operations require less than quadratic time, meaning that
+the algorithm can be faster than a baseline multiplication.  However, the greater complexity of this algorithm places the cutoff point
+(\textbf{TOOM\_MUL\_CUTOFF}) where Toom-Cook becomes more efficient much higher than the Karatsuba cutoff point.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_toom\_mul}. \\
+\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
+\textbf{Output}.  $c \leftarrow  a  \cdot  b $ \\
+\hline \\
+Split $a$ and $b$ into three pieces.  E.g. $a = a_2 \beta^{2k} + a_1 \beta^{k} + a_0$ \\
+1.  $k \leftarrow \lfloor \mbox{min}(a.used, b.used) / 3 \rfloor$ \\
+2.  $a_0 \leftarrow a \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+3.  $a_1 \leftarrow \lfloor a / \beta^k \rfloor$, $a_1 \leftarrow a_1 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+4.  $a_2 \leftarrow \lfloor a / \beta^{2k} \rfloor$, $a_2 \leftarrow a_2 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+5.  $b_0 \leftarrow a \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+6.  $b_1 \leftarrow \lfloor a / \beta^k \rfloor$, $b_1 \leftarrow b_1 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+7.  $b_2 \leftarrow \lfloor a / \beta^{2k} \rfloor$, $b_2 \leftarrow b_2 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+\\
+Find the five equations for $w_0, w_1, ..., w_4$. \\
+8.  $w_0 \leftarrow a_0 \cdot b_0$ \\
+9.  $w_4 \leftarrow a_2 \cdot b_2$ \\
+10. $tmp_1 \leftarrow 2 \cdot a_0$, $tmp_1 \leftarrow a_1 + tmp_1$, $tmp_1 \leftarrow 2 \cdot tmp_1$, $tmp_1 \leftarrow tmp_1 + a_2$ \\
+11. $tmp_2 \leftarrow 2 \cdot b_0$, $tmp_2 \leftarrow b_1 + tmp_2$, $tmp_2 \leftarrow 2 \cdot tmp_2$, $tmp_2 \leftarrow tmp_2 + b_2$ \\
+12. $w_1 \leftarrow tmp_1 \cdot tmp_2$ \\
+13. $tmp_1 \leftarrow 2 \cdot a_2$, $tmp_1 \leftarrow a_1 + tmp_1$, $tmp_1 \leftarrow 2 \cdot tmp_1$, $tmp_1 \leftarrow tmp_1 + a_0$ \\
+14. $tmp_2 \leftarrow 2 \cdot b_2$, $tmp_2 \leftarrow b_1 + tmp_2$, $tmp_2 \leftarrow 2 \cdot tmp_2$, $tmp_2 \leftarrow tmp_2 + b_0$ \\
+15. $w_3 \leftarrow tmp_1 \cdot tmp_2$ \\
+16. $tmp_1 \leftarrow a_0 + a_1$, $tmp_1 \leftarrow tmp_1 + a_2$, $tmp_2 \leftarrow b_0 + b_1$, $tmp_2 \leftarrow tmp_2 + b_2$ \\
+17. $w_2 \leftarrow tmp_1 \cdot tmp_2$ \\
+\\
+Continued on the next page.\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_toom\_mul}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_toom\_mul} (continued). \\
+\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
+\textbf{Output}.  $c \leftarrow a \cdot  b $ \\
+\hline \\
+Now solve the system of equations. \\
+18. $w_1 \leftarrow w_4 - w_1$, $w_3 \leftarrow w_3 - w_0$ \\
+19. $w_1 \leftarrow \lfloor w_1 / 2 \rfloor$, $w_3 \leftarrow \lfloor w_3 / 2 \rfloor$ \\
+20. $w_2 \leftarrow w_2 - w_0$, $w_2 \leftarrow w_2 - w_4$ \\
+21. $w_1 \leftarrow w_1 - w_2$, $w_3 \leftarrow w_3 - w_2$ \\
+22. $tmp_1 \leftarrow 8 \cdot w_0$, $w_1 \leftarrow w_1 - tmp_1$, $tmp_1 \leftarrow 8 \cdot w_4$, $w_3 \leftarrow w_3 - tmp_1$ \\
+23. $w_2 \leftarrow 3 \cdot w_2$, $w_2 \leftarrow w_2 - w_1$, $w_2 \leftarrow w_2 - w_3$ \\
+24. $w_1 \leftarrow w_1 - w_2$, $w_3 \leftarrow w_3 - w_2$ \\
+25. $w_1 \leftarrow \lfloor w_1 / 3 \rfloor, w_3 \leftarrow \lfloor w_3 / 3 \rfloor$ \\
+\\
+Now substitute $\beta^k$ for $x$ by shifting $w_0, w_1, ..., w_4$. \\
+26. for $n$ from $1$ to $4$ do \\
+\hspace{3mm}26.1  $w_n \leftarrow w_n \cdot \beta^{nk}$ \\
+27. $c \leftarrow w_0 + w_1$, $c \leftarrow c + w_2$, $c \leftarrow c + w_3$, $c \leftarrow c + w_4$ \\
+28. Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_toom\_mul (continued)}
+\end{figure}
+
+\textbf{Algorithm mp\_toom\_mul.}
+This algorithm computes the product of two mp\_int variables $a$ and $b$ using the Toom-Cook approach.  Compared to the Karatsuba multiplication, this 
+algorithm has a lower asymptotic running time of approximately $O(n^{1.464})$ but at an obvious cost in overhead.  In this
+description, several statements have been compounded to save space.  The intention is that the statements are executed from left to right across
+any given step.
+
+The two inputs $a$ and $b$ are first split into three $k$-digit integers $a_0, a_1, a_2$ and $b_0, b_1, b_2$ respectively.  From these smaller
+integers the coefficients of the polynomial basis representations $f(x)$ and $g(x)$ are known and can be used to find the relations required.
+
+The first two relations $w_0$ and $w_4$ are the points $\zeta_{0}$ and $\zeta_{\infty}$ respectively.  The relation $w_1, w_2$ and $w_3$ correspond
+to the points $16 \cdot \zeta_{1 \over 2}, \zeta_{2}$ and $\zeta_{1}$ respectively.  These are found using logical shifts to independently find
+$f(y)$ and $g(y)$ which significantly speeds up the algorithm.
+
+After the five relations $w_0, w_1, \ldots, w_4$ have been computed, the system they represent must be solved in order for the unknown coefficients 
+$w_1, w_2$ and $w_3$ to be isolated.  The steps 18 through 25 perform the system reduction required as previously described.  Each step of
+the reduction represents the comparable matrix operation that would be performed had this been performed by pencil.  For example, step 18 indicates
+that row $1$ must be subtracted from row $4$ and simultaneously row $0$ subtracted from row $3$.  
+
+Once the coeffients have been isolated, the polynomial $W(x) = \sum_{i=0}^{2n} w_i x^i$ is known.  By substituting $\beta^{k}$ for $x$, the integer 
+result $a \cdot b$ is produced.
+
+EXAM,bn_mp_toom_mul.c
+
+The first obvious thing to note is that this algorithm is complicated.  The complexity is worth it if you are multiplying very 
+large numbers.  For example, a 10,000 digit multiplication takes approximaly 99,282,205 fewer single precision multiplications with
+Toom--Cook than a Comba or baseline approach (this is a savings of more than 99$\%$).  For most ``crypto'' sized numbers this
+algorithm is not practical as Karatsuba has a much lower cutoff point.
+
+First we split $a$ and $b$ into three roughly equal portions.  This has been accomplished (lines @40,mod@ to @69,rshd@) with 
+combinations of mp\_rshd() and mp\_mod\_2d() function calls.  At this point $a = a2 \cdot \beta^2 + a1 \cdot \beta + a0$ and similiarly
+for $b$.  
+
+Next we compute the five points $w0, w1, w2, w3$ and $w4$.  Recall that $w0$ and $w4$ can be computed directly from the portions so
+we get those out of the way first (lines @72,mul@ and @77,mul@).  Next we compute $w1, w2$ and $w3$ using Horners method.
+
+After this point we solve for the actual values of $w1, w2$ and $w3$ by reducing the $5 \times 5$ system which is relatively
+straight forward.  
+
+\subsection{Signed Multiplication}
+Now that algorithms to handle multiplications of every useful dimensions have been developed, a rather simple finishing touch is required.  So far all
+of the multiplication algorithms have been unsigned multiplications which leaves only a signed multiplication algorithm to be established.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul}. \\
+\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
+\textbf{Output}.  $c \leftarrow a \cdot b$ \\
+\hline \\
+1.  If $a.sign = b.sign$ then \\
+\hspace{3mm}1.1  $sign = MP\_ZPOS$ \\
+2.  else \\
+\hspace{3mm}2.1  $sign = MP\_ZNEG$ \\
+3.  If min$(a.used, b.used) \ge TOOM\_MUL\_CUTOFF$ then  \\
+\hspace{3mm}3.1  $c \leftarrow a \cdot b$ using algorithm mp\_toom\_mul \\
+4.  else if min$(a.used, b.used) \ge KARATSUBA\_MUL\_CUTOFF$ then \\
+\hspace{3mm}4.1  $c \leftarrow a \cdot b$ using algorithm mp\_karatsuba\_mul \\
+5.  else \\
+\hspace{3mm}5.1  $digs \leftarrow a.used + b.used + 1$ \\
+\hspace{3mm}5.2  If $digs < MP\_ARRAY$ and min$(a.used, b.used) \le \delta$ then \\
+\hspace{6mm}5.2.1  $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm fast\_s\_mp\_mul\_digs.  \\
+\hspace{3mm}5.3  else \\
+\hspace{6mm}5.3.1  $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm s\_mp\_mul\_digs.  \\
+6.  $c.sign \leftarrow sign$ \\
+7.  Return the result of the unsigned multiplication performed. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul}
+\end{figure}
+
+\textbf{Algorithm mp\_mul.}
+This algorithm performs the signed multiplication of two inputs.  It will make use of any of the three unsigned multiplication algorithms 
+available when the input is of appropriate size.  The \textbf{sign} of the result is not set until the end of the algorithm since algorithm
+s\_mp\_mul\_digs will clear it.  
+
+EXAM,bn_mp_mul.c
+
+The implementation is rather simplistic and is not particularly noteworthy.  Line @22,?@ computes the sign of the result using the ``?'' 
+operator from the C programming language.  Line @37,<<@ computes $\delta$ using the fact that $1 << k$ is equal to $2^k$.  
+
+\section{Squaring}
+\label{sec:basesquare}
+
+Squaring is a special case of multiplication where both multiplicands are equal.  At first it may seem like there is no significant optimization
+available but in fact there is.  Consider the multiplication of $576$ against $241$.  In total there will be nine single precision multiplications
+performed which are $1\cdot 6$, $1 \cdot 7$, $1 \cdot 5$, $4 \cdot 6$, $4 \cdot 7$, $4 \cdot 5$, $2 \cdot  6$, $2 \cdot 7$ and $2 \cdot 5$.  Now consider 
+the multiplication of $123$ against $123$.  The nine products are $3 \cdot 3$, $3 \cdot 2$, $3 \cdot 1$, $2 \cdot 3$, $2 \cdot 2$, $2 \cdot 1$, 
+$1 \cdot 3$, $1 \cdot 2$ and $1 \cdot 1$.  On closer inspection some of the products are equivalent.  For example, $3 \cdot 2 = 2 \cdot 3$ 
+and $3 \cdot 1 = 1 \cdot 3$. 
+
+For any $n$-digit input, there are ${{\left (n^2 + n \right)}\over 2}$ possible unique single precision multiplications required compared to the $n^2$
+required for multiplication.  The following diagram gives an example of the operations required.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{ccccc|c}
+&&1&2&3&\\
+$\times$ &&1&2&3&\\
+\hline && $3 \cdot 1$ & $3 \cdot 2$ & $3 \cdot 3$ & Row 0\\
+       & $2 \cdot 1$  & $2 \cdot 2$ & $2 \cdot 3$ && Row 1 \\
+         $1 \cdot 1$  & $1 \cdot 2$ & $1 \cdot 3$ &&& Row 2 \\
+\end{tabular}
+\end{center}
+\caption{Squaring Optimization Diagram}
+\end{figure}
+
+MARK,SQUARE
+Starting from zero and numbering the columns from right to left a very simple pattern becomes obvious.  For the purposes of this discussion let $x$
+represent the number being squared.  The first observation is that in row $k$ the $2k$'th column of the product has a $\left (x_k \right)^2$ term in it.  
+
+The second observation is that every column $j$ in row $k$ where $j \ne 2k$ is part of a double product.  Every non-square term of a column will
+appear twice hence the name ``double product''.  Every odd column is made up entirely of double products.  In fact every column is made up of double 
+products and at most one square (\textit{see the exercise section}).  
+
+The third and final observation is that for row $k$ the first unique non-square term, that is, one that hasn't already appeared in an earlier row, 
+occurs at column $2k + 1$.  For example, on row $1$ of the previous squaring, column one is part of the double product with column one from row zero. 
+Column two of row one is a square and column three is the first unique column.
+
+\subsection{The Baseline Squaring Algorithm}
+The baseline squaring algorithm is meant to be a catch-all squaring algorithm.  It will handle any of the input sizes that the faster routines
+will not handle.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_sqr}. \\
+\textbf{Input}.   mp\_int $a$ \\
+\textbf{Output}.  $b \leftarrow a^2$ \\
+\hline \\
+1.  Init a temporary mp\_int of at least $2 \cdot a.used +1$ digits.  (\textit{mp\_init\_size}) \\
+2.  If step 1 failed return(\textit{MP\_MEM}) \\
+3.  $t.used \leftarrow 2 \cdot a.used + 1$ \\
+4.  For $ix$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}Calculate the square. \\
+\hspace{3mm}4.1  $\hat r \leftarrow t_{2ix} + \left (a_{ix} \right )^2$ \\
+\hspace{3mm}4.2  $t_{2ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}Calculate the double products after the square. \\
+\hspace{3mm}4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+\hspace{3mm}4.4  For $iy$ from $ix + 1$ to $a.used - 1$ do \\
+\hspace{6mm}4.4.1  $\hat r \leftarrow 2 \cdot a_{ix}a_{iy} + t_{ix + iy} + u$ \\
+\hspace{6mm}4.4.2  $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}4.4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+\hspace{3mm}Set the last carry. \\
+\hspace{3mm}4.5  While $u > 0$ do \\
+\hspace{6mm}4.5.1  $iy \leftarrow iy + 1$ \\
+\hspace{6mm}4.5.2  $\hat r \leftarrow t_{ix + iy} + u$ \\
+\hspace{6mm}4.5.3  $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}4.5.4  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+5.  Clamp excess digits of $t$.  (\textit{mp\_clamp}) \\
+6.  Exchange $b$ and $t$. \\
+7.  Clear $t$ (\textit{mp\_clear}) \\
+8.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm s\_mp\_sqr}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_sqr.}
+This algorithm computes the square of an input using the three observations on squaring.  It is based fairly faithfully on  algorithm 14.16 of HAC
+\cite[pp.596-597]{HAC}.  Similar to algorithm s\_mp\_mul\_digs, a temporary mp\_int is allocated to hold the result of the squaring.  This allows the 
+destination mp\_int to be the same as the source mp\_int.
+
+The outer loop of this algorithm begins on step 4. It is best to think of the outer loop as walking down the rows of the partial results, while
+the inner loop computes the columns of the partial result.  Step 4.1 and 4.2 compute the square term for each row, and step 4.3 and 4.4 propagate
+the carry and compute the double products.  
+
+The requirement that a mp\_word be able to represent the range $0 \le x < 2 \beta^2$ arises from this
+very algorithm.  The product $a_{ix}a_{iy}$ will lie in the range $0 \le x \le \beta^2 - 2\beta + 1$ which is obviously less than $\beta^2$ meaning that
+when it is multiplied by two, it can be properly represented by a mp\_word.
+
+Similar to algorithm s\_mp\_mul\_digs, after every pass of the inner loop, the destination is correctly set to the sum of all of the partial 
+results calculated so far.  This involves expensive carry propagation which will be eliminated in the next algorithm.  
+
+EXAM,bn_s_mp_sqr.c
+
+Inside the outer loop (line @32,for@) the square term is calculated on line @35,r =@.  The carry (line @42,>>@) has been
+extracted from the mp\_word accumulator using a right shift.  Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized 
+(lines @45,tmpx@ and @48,tmpt@) to simplify the inner loop.  The doubling is performed using two
+additions (line @57,r + r@) since it is usually faster than shifting, if not at least as fast.  
+
+The important observation is that the inner loop does not begin at $iy = 0$ like for multiplication.  As such the inner loops
+get progressively shorter as the algorithm proceeds.  This is what leads to the savings compared to using a multiplication to
+square a number. 
+
+\subsection{Faster Squaring by the ``Comba'' Method}
+A major drawback to the baseline method is the requirement for single precision shifting inside the $O(n^2)$ nested loop.  Squaring has an additional
+drawback that it must double the product inside the inner loop as well.  As for multiplication, the Comba technique can be used to eliminate these
+performance hazards.
+
+The first obvious solution is to make an array of mp\_words which will hold all of the columns.  This will indeed eliminate all of the carry
+propagation operations from the inner loop.  However, the inner product must still be doubled $O(n^2)$ times.  The solution stems from the simple fact
+that $2a + 2b + 2c = 2(a + b + c)$.  That is the sum of all of the double products is equal to double the sum of all the products.  For example,
+$ab + ba + ac + ca = 2ab + 2ac = 2(ab + ac)$.  
+
+However, we cannot simply double all of the columns, since the squares appear only once per row.  The most practical solution is to have two 
+mp\_word arrays.  One array will hold the squares and the other array will hold the double products.  With both arrays the doubling and 
+carry propagation can be moved to a $O(n)$ work level outside the $O(n^2)$ level.  In this case, we have an even simpler solution in mind.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{fast\_s\_mp\_sqr}. \\
+\textbf{Input}.   mp\_int $a$ \\
+\textbf{Output}.  $b \leftarrow a^2$ \\
+\hline \\
+Place an array of \textbf{MP\_WARRAY} mp\_digits named $W$ on the stack. \\
+1.  If $b.alloc < 2a.used + 1$ then grow $b$ to $2a.used + 1$ digits.  (\textit{mp\_grow}). \\
+2.  If step 1 failed return(\textit{MP\_MEM}). \\
+\\
+3.  $pa \leftarrow 2 \cdot a.used$ \\
+4.  $\hat W1 \leftarrow 0$ \\
+5.  for $ix$ from $0$ to $pa - 1$ do \\
+\hspace{3mm}5.1  $\_ \hat W \leftarrow 0$ \\
+\hspace{3mm}5.2  $ty \leftarrow \mbox{MIN}(a.used - 1, ix)$ \\
+\hspace{3mm}5.3  $tx \leftarrow ix - ty$ \\
+\hspace{3mm}5.4  $iy \leftarrow \mbox{MIN}(a.used - tx, ty + 1)$ \\
+\hspace{3mm}5.5  $iy \leftarrow \mbox{MIN}(iy, \lfloor \left (ty - tx + 1 \right )/2 \rfloor)$ \\
+\hspace{3mm}5.6  for $iz$ from $0$ to $iz - 1$ do \\
+\hspace{6mm}5.6.1  $\_ \hat W \leftarrow \_ \hat W + a_{tx + iz}a_{ty - iz}$ \\
+\hspace{3mm}5.7  $\_ \hat W \leftarrow 2 \cdot \_ \hat W  + \hat W1$ \\
+\hspace{3mm}5.8  if $ix$ is even then \\
+\hspace{6mm}5.8.1  $\_ \hat W \leftarrow \_ \hat W + \left ( a_{\lfloor ix/2 \rfloor}\right )^2$ \\
+\hspace{3mm}5.9  $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$ \\
+\hspace{3mm}5.10  $\hat W1 \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\
+\\
+6.  $oldused \leftarrow b.used$ \\
+7.  $b.used \leftarrow 2 \cdot a.used$ \\
+8.  for $ix$ from $0$ to $pa - 1$ do \\
+\hspace{3mm}8.1  $b_{ix} \leftarrow W_{ix}$ \\
+9.  for $ix$ from $pa$ to $oldused - 1$ do \\
+\hspace{3mm}9.1  $b_{ix} \leftarrow 0$ \\
+10.  Clamp excess digits from $b$.  (\textit{mp\_clamp}) \\
+11.  Return(\textit{MP\_OKAY}). \\ 
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm fast\_s\_mp\_sqr}
+\end{figure}
+
+\textbf{Algorithm fast\_s\_mp\_sqr.}
+This algorithm computes the square of an input using the Comba technique.  It is designed to be a replacement for algorithm 
+s\_mp\_sqr when the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$.  
+This algorithm is very similar to the Comba multiplier except with a few key differences we shall make note of.
+
+First, we have an accumulator and carry variables $\_ \hat W$ and $\hat W1$ respectively.  This is because the inner loop
+products are to be doubled.  If we had added the previous carry in we would be doubling too much.  Next we perform an
+addition MIN condition on $iy$ (step 5.5) to prevent overlapping digits.  For example, $a_3 \cdot a_5$ is equal
+$a_5 \cdot a_3$.  Whereas in the multiplication case we would have $5 < a.used$ and $3 \ge 0$ is maintained since we double the sum
+of the products just outside the inner loop we have to avoid doing this.  This is also a good thing since we perform
+fewer multiplications and the routine ends up being faster.
+
+Finally the last difference is the addition of the ``square'' term outside the inner loop (step 5.8).  We add in the square
+only to even outputs and it is the square of the term at the $\lfloor ix / 2 \rfloor$ position.
+
+EXAM,bn_fast_s_mp_sqr.c
+
+This implementation is essentially a copy of Comba multiplication with the appropriate changes added to make it faster for 
+the special case of squaring.  
+
+\subsection{Polynomial Basis Squaring}
+The same algorithm that performs optimal polynomial basis multiplication can be used to perform polynomial basis squaring.  The minor exception
+is that $\zeta_y = f(y)g(y)$ is actually equivalent to $\zeta_y = f(y)^2$ since $f(y) = g(y)$.  Instead of performing $2n + 1$
+multiplications to find the $\zeta$ relations, squaring operations are performed instead.  
+
+\subsection{Karatsuba Squaring}
+Let $f(x) = ax + b$ represent the polynomial basis representation of a number to square.  
+Let $h(x) = \left ( f(x) \right )^2$ represent the square of the polynomial.  The Karatsuba equation can be modified to square a 
+number with the following equation.
+
+\begin{equation}
+h(x) = a^2x^2 + \left (a^2 + b^2 - (a - b)^2 \right )x + b^2
+\end{equation}
+
+Upon closer inspection this equation only requires the calculation of three half-sized squares: $a^2$, $b^2$ and $(a - b)^2$.  As in 
+Karatsuba multiplication, this algorithm can be applied recursively on the input and will achieve an asymptotic running time of 
+$O \left ( n^{lg(3)} \right )$.
+
+If the asymptotic times of Karatsuba squaring and multiplication are the same, why not simply use the multiplication algorithm 
+instead?  The answer to this arises from the cutoff point for squaring.  As in multiplication there exists a cutoff point, at which the 
+time required for a Comba based squaring and a Karatsuba based squaring meet.  Due to the overhead inherent in the Karatsuba method, the cutoff 
+point is fairly high.  For example, on an AMD Athlon XP processor with $\beta = 2^{28}$, the cutoff point is around 127 digits.  
+
+Consider squaring a 200 digit number with this technique.  It will be split into two 100 digit halves which are subsequently squared.  
+The 100 digit halves will not be squared using Karatsuba, but instead using the faster Comba based squaring algorithm.  If Karatsuba multiplication
+were used instead, the 100 digit numbers would be squared with a slower Comba based multiplication.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_karatsuba\_sqr}. \\
+\textbf{Input}.   mp\_int $a$ \\
+\textbf{Output}.  $b \leftarrow a^2$ \\
+\hline \\
+1.  Initialize the following temporary mp\_ints:  $x0$, $x1$, $t1$, $t2$, $x0x0$ and $x1x1$. \\
+2.  If any of the initializations on step 1 failed return(\textit{MP\_MEM}). \\
+\\
+Split the input.  e.g. $a = x1\beta^B + x0$ \\
+3.  $B \leftarrow \lfloor a.used / 2 \rfloor$ \\
+4.  $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+5.  $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_lshd}) \\
+\\
+Calculate the three squares. \\
+6.  $x0x0 \leftarrow x0^2$ (\textit{mp\_sqr}) \\
+7.  $x1x1 \leftarrow x1^2$ \\
+8.  $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
+9.  $t1 \leftarrow t1^2$ \\
+\\
+Compute the middle term. \\
+10.  $t2 \leftarrow x0x0 + x1x1$ (\textit{s\_mp\_add}) \\
+11.  $t1 \leftarrow t2 - t1$ \\
+\\
+Compute final product. \\
+12.  $t1 \leftarrow t1\beta^B$ (\textit{mp\_lshd}) \\
+13.  $x1x1 \leftarrow x1x1\beta^{2B}$ \\
+14.  $t1 \leftarrow t1 + x0x0$ \\
+15.  $b \leftarrow t1 + x1x1$ \\
+16.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_karatsuba\_sqr}
+\end{figure}
+
+\textbf{Algorithm mp\_karatsuba\_sqr.}
+This algorithm computes the square of an input $a$ using the Karatsuba technique.  This algorithm is very similar to the Karatsuba based
+multiplication algorithm with the exception that the three half-size multiplications have been replaced with three half-size squarings.
+
+The radix point for squaring is simply placed exactly in the middle of the digits when the input has an odd number of digits, otherwise it is
+placed just below the middle.  Step 3, 4 and 5 compute the two halves required using $B$
+as the radix point.  The first two squares in steps 6 and 7 are rather straightforward while the last square is of a more compact form.
+
+By expanding $\left (x1 - x0 \right )^2$, the $x1^2$ and $x0^2$ terms in the middle disappear, that is $x1^2 + x0^2 - (x1 - x0)^2 = 2 \cdot x0 \cdot x1$.
+Now if $5n$ single precision additions and a squaring of $n$-digits is faster than multiplying two $n$-digit numbers and doubling then
+this method is faster.  Assuming no further recursions occur, the difference can be estimated with the following inequality.
+
+Let $p$ represent the cost of a single precision addition and $q$ the cost of a single precision multiplication both in terms of time\footnote{Or
+machine clock cycles.}. 
+
+\begin{equation}
+5pn +{{q(n^2 + n)} \over 2} \le pn + qn^2
+\end{equation}
+
+For example, on an AMD Athlon XP processor $p = {1 \over 3}$ and $q = 6$.  This implies that the following inequality should hold.
+\begin{center}
+\begin{tabular}{rcl}
+${5n \over 3} + 3n^2 + 3n$     & $<$ & ${n \over 3} + 6n^2$ \\
+${5 \over 3} + 3n + 3$     & $<$ & ${1 \over 3} + 6n$ \\
+${13 \over 9}$     & $<$ & $n$ \\
+\end{tabular}
+\end{center}
+
+This results in a cutoff point around $n = 2$.  As a consequence it is actually faster to compute the middle term the ``long way'' on processors
+where multiplication is substantially slower\footnote{On the Athlon there is a 1:17 ratio between clock cycles for addition and multiplication.  On
+the Intel P4 processor this ratio is 1:29 making this method even more beneficial.  The only common exception is the ARMv4 processor which has a
+ratio of 1:7.  } than simpler operations such as addition.  
+
+EXAM,bn_mp_karatsuba_sqr.c
+
+This implementation is largely based on the implementation of algorithm mp\_karatsuba\_mul.  It uses the same inline style to copy and 
+shift the input into the two halves.  The loop from line @54,{@ to line @70,}@ has been modified since only one input exists.  The \textbf{used}
+count of both $x0$ and $x1$ is fixed up and $x0$ is clamped before the calculations begin.  At this point $x1$ and $x0$ are valid equivalents
+to the respective halves as if mp\_rshd and mp\_mod\_2d had been used.  
+
+By inlining the copy and shift operations the cutoff point for Karatsuba multiplication can be lowered.  On the Athlon the cutoff point
+is exactly at the point where Comba squaring can no longer be used (\textit{128 digits}).  On slower processors such as the Intel P4
+it is actually below the Comba limit (\textit{at 110 digits}).
+
+This routine uses the same error trap coding style as mp\_karatsuba\_sqr.  As the temporary variables are initialized errors are 
+redirected to the error trap higher up.  If the algorithm completes without error the error code is set to \textbf{MP\_OKAY} and 
+mp\_clears are executed normally.
+
+\subsection{Toom-Cook Squaring}
+The Toom-Cook squaring algorithm mp\_toom\_sqr is heavily based on the algorithm mp\_toom\_mul with the exception that squarings are used
+instead of multiplication to find the five relations.  The reader is encouraged to read the description of the latter algorithm and try to 
+derive their own Toom-Cook squaring algorithm.  
+
+\subsection{High Level Squaring}
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_sqr}. \\
+\textbf{Input}.   mp\_int $a$ \\
+\textbf{Output}.  $b \leftarrow a^2$ \\
+\hline \\
+1.  If $a.used \ge TOOM\_SQR\_CUTOFF$ then  \\
+\hspace{3mm}1.1  $b \leftarrow a^2$ using algorithm mp\_toom\_sqr \\
+2.  else if $a.used \ge KARATSUBA\_SQR\_CUTOFF$ then \\
+\hspace{3mm}2.1  $b \leftarrow a^2$ using algorithm mp\_karatsuba\_sqr \\
+3.  else \\
+\hspace{3mm}3.1  $digs \leftarrow a.used + b.used + 1$ \\
+\hspace{3mm}3.2  If $digs < MP\_ARRAY$ and $a.used \le \delta$ then \\
+\hspace{6mm}3.2.1  $b \leftarrow a^2$ using algorithm fast\_s\_mp\_sqr.  \\
+\hspace{3mm}3.3  else \\
+\hspace{6mm}3.3.1  $b \leftarrow a^2$ using algorithm s\_mp\_sqr.  \\
+4.  $b.sign \leftarrow MP\_ZPOS$ \\
+5.  Return the result of the unsigned squaring performed. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_sqr}
+\end{figure}
+
+\textbf{Algorithm mp\_sqr.}
+This algorithm computes the square of the input using one of four different algorithms.  If the input is very large and has at least
+\textbf{TOOM\_SQR\_CUTOFF} or \textbf{KARATSUBA\_SQR\_CUTOFF} digits then either the Toom-Cook or the Karatsuba Squaring algorithm is used.  If
+neither of the polynomial basis algorithms should be used then either the Comba or baseline algorithm is used.  
+
+EXAM,bn_mp_sqr.c
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 3 \right ] $ & Devise an efficient algorithm for selection of the radix point to handle inputs \\
+                      & that have different number of digits in Karatsuba multiplication. \\
+                      & \\
+$\left [ 2 \right ] $ & In ~SQUARE~ the fact that every column of a squaring is made up \\
+                      & of double products and at most one square is stated.  Prove this statement. \\
+                      & \\                      
+$\left [ 3 \right ] $ & Prove the equation for Karatsuba squaring. \\
+                      & \\
+$\left [ 1 \right ] $ & Prove that Karatsuba squaring requires $O \left (n^{lg(3)} \right )$ time. \\
+                      & \\ 
+$\left [ 2 \right ] $ & Determine the minimal ratio between addition and multiplication clock cycles \\
+                      & required for equation $6.7$ to be true.  \\
+                      & \\
+$\left [ 3 \right ] $ & Implement a threaded version of Comba multiplication (and squaring) where you \\
+                      & compute subsets of the columns in each thread.  Determine a cutoff point where \\
+                      & it is effective and add the logic to mp\_mul() and mp\_sqr(). \\
+                      &\\
+$\left [ 4 \right ] $ & Same as the previous but also modify the Karatsuba and Toom-Cook.  You must \\
+                      & increase the throughput of mp\_exptmod() for random odd moduli in the range \\
+                      & $512 \ldots 4096$ bits significantly ($> 2x$) to complete this challenge. \\
+                      & \\
+\end{tabular}
+
+\chapter{Modular Reduction}
+MARK,REDUCTION
+\section{Basics of Modular Reduction}
+\index{modular residue}
+Modular reduction is an operation that arises quite often within public key cryptography algorithms and various number theoretic algorithms, 
+such as factoring.  Modular reduction algorithms are the third class of algorithms of the ``multipliers'' set.  A number $a$ is said to be \textit{reduced}
+modulo another number $b$ by finding the remainder of the division $a/b$.  Full integer division with remainder is a topic to be covered 
+in~\ref{sec:division}.
+
+Modular reduction is equivalent to solving for $r$ in the following equation.  $a = bq + r$ where $q = \lfloor a/b \rfloor$.  The result 
+$r$ is said to be ``congruent to $a$ modulo $b$'' which is also written as $r \equiv a \mbox{ (mod }b\mbox{)}$.  In other vernacular $r$ is known as the 
+``modular residue'' which leads to ``quadratic residue''\footnote{That's fancy talk for $b \equiv a^2 \mbox{ (mod }p\mbox{)}$.} and
+other forms of residues.  
+
+Modular reductions are normally used to create either finite groups, rings or fields.  The most common usage for performance driven modular reductions 
+is in modular exponentiation algorithms.  That is to compute $d = a^b \mbox{ (mod }c\mbox{)}$ as fast as possible.  This operation is used in the 
+RSA and Diffie-Hellman public key algorithms, for example.  Modular multiplication and squaring also appears as a fundamental operation in 
+elliptic curve cryptographic algorithms.  As will be discussed in the subsequent chapter there exist fast algorithms for computing modular 
+exponentiations without having to perform (\textit{in this example}) $b - 1$ multiplications.  These algorithms will produce partial results in the 
+range $0 \le x < c^2$ which can be taken advantage of to create several efficient algorithms.   They have also been used to create redundancy check 
+algorithms known as CRCs, error correction codes such as Reed-Solomon and solve a variety of number theoeretic problems.  
+
+\section{The Barrett Reduction}
+The Barrett reduction algorithm \cite{BARRETT} was inspired by fast division algorithms which multiply by the reciprocal to emulate
+division.  Barretts observation was that the residue $c$ of $a$ modulo $b$ is equal to 
+
+\begin{equation}
+c = a - b \cdot \lfloor a/b \rfloor
+\end{equation}
+
+Since algorithms such as modular exponentiation would be using the same modulus extensively, typical DSP\footnote{It is worth noting that Barrett's paper 
+targeted the DSP56K processor.}  intuition would indicate the next step would be to replace $a/b$ by a multiplication by the reciprocal.  However, 
+DSP intuition on its own will not work as these numbers are considerably larger than the precision of common DSP floating point data types.  
+It would take another common optimization to optimize the algorithm.
+
+\subsection{Fixed Point Arithmetic}
+The trick used to optimize the above equation is based on a technique of emulating floating point data types with fixed precision integers.  Fixed
+point arithmetic would become very popular as it greatly optimize the ``3d-shooter'' genre of games in the mid 1990s when floating point units were 
+fairly slow if not unavailable.   The idea behind fixed point arithmetic is to take a normal $k$-bit integer data type and break it into $p$-bit 
+integer and a $q$-bit fraction part (\textit{where $p+q = k$}).  
+
+In this system a $k$-bit integer $n$ would actually represent $n/2^q$.  For example, with $q = 4$ the integer $n = 37$ would actually represent the
+value $2.3125$.  To multiply two fixed point numbers the integers are multiplied using traditional arithmetic and subsequently normalized by 
+moving the implied decimal point back to where it should be.  For example, with $q = 4$ to multiply the integers $9$ and $5$ they must be converted 
+to fixed point first by multiplying by $2^q$.  Let $a = 9(2^q)$ represent the fixed point representation of $9$ and $b = 5(2^q)$ represent the 
+fixed point representation of $5$.  The product $ab$ is equal to $45(2^{2q})$ which when normalized by dividing by $2^q$ produces $45(2^q)$.  
+
+This technique became popular since a normal integer multiplication and logical shift right are the only required operations to perform a multiplication
+of two fixed point numbers.  Using fixed point arithmetic, division can be easily approximated by multiplying by the reciprocal.  If $2^q$ is 
+equivalent to one than $2^q/b$ is equivalent to the fixed point approximation of $1/b$ using real arithmetic.  Using this fact dividing an integer 
+$a$ by another integer $b$ can be achieved with the following expression.
+
+\begin{equation}
+\lfloor a / b \rfloor \mbox{ }\approx\mbox{ } \lfloor (a \cdot \lfloor 2^q / b \rfloor)/2^q \rfloor
+\end{equation}
+
+The precision of the division is proportional to the value of $q$.  If the divisor $b$ is used frequently as is the case with 
+modular exponentiation pre-computing $2^q/b$ will allow a division to be performed with a multiplication and a right shift.  Both operations
+are considerably faster than division on most processors.  
+
+Consider dividing $19$ by $5$.  The correct result is $\lfloor 19/5 \rfloor = 3$.  With $q = 3$ the reciprocal is $\lfloor 2^q/5 \rfloor = 1$ which
+leads to a product of $19$ which when divided by $2^q$ produces $2$.  However, with $q = 4$ the reciprocal is $\lfloor 2^q/5 \rfloor = 3$ and
+the result of the emulated division is $\lfloor 3 \cdot 19 / 2^q \rfloor = 3$ which is correct.  The value of $2^q$ must be close to or ideally
+larger than the dividend.  In effect if $a$ is the dividend then $q$ should allow $0 \le \lfloor a/2^q \rfloor \le 1$ in order for this approach
+to work correctly.  Plugging this form of divison into the original equation the following modular residue equation arises.
+
+\begin{equation}
+c = a - b \cdot \lfloor (a \cdot \lfloor 2^q / b \rfloor)/2^q \rfloor
+\end{equation}
+
+Using the notation from \cite{BARRETT} the value of $\lfloor 2^q / b \rfloor$ will be represented by the $\mu$ symbol.  Using the $\mu$
+variable also helps re-inforce the idea that it is meant to be computed once and re-used.
+
+\begin{equation}
+c = a - b \cdot \lfloor (a \cdot \mu)/2^q \rfloor
+\end{equation}
+
+Provided that $2^q \ge a$ this algorithm will produce a quotient that is either exactly correct or off by a value of one.  In the context of Barrett
+reduction the value of $a$ is bound by $0 \le a \le (b - 1)^2$ meaning that $2^q \ge b^2$ is sufficient to ensure the reciprocal will have enough
+precision.  
+
+Let $n$ represent the number of digits in $b$.  This algorithm requires approximately $2n^2$ single precision multiplications to produce the quotient and 
+another $n^2$ single precision multiplications to find the residue.  In total $3n^2$ single precision multiplications are required to 
+reduce the number.  
+
+For example, if $b = 1179677$ and $q = 41$ ($2^q > b^2$), then the reciprocal $\mu$ is equal to $\lfloor 2^q / b \rfloor = 1864089$.  Consider reducing
+$a = 180388626447$ modulo $b$ using the above reduction equation.  The quotient using the new formula is $\lfloor (a \cdot \mu) / 2^q \rfloor = 152913$.
+By subtracting $152913b$ from $a$ the correct residue $a \equiv 677346 \mbox{ (mod }b\mbox{)}$ is found.
+
+\subsection{Choosing a Radix Point}
+Using the fixed point representation a modular reduction can be performed with $3n^2$ single precision multiplications.  If that were the best
+that could be achieved a full division\footnote{A division requires approximately $O(2cn^2)$ single precision multiplications for a small value of $c$.  
+See~\ref{sec:division} for further details.} might as well be used in its place.  The key to optimizing the reduction is to reduce the precision of
+the initial multiplication that finds the quotient.  
+
+Let $a$ represent the number of which the residue is sought.  Let $b$ represent the modulus used to find the residue.  Let $m$ represent
+the number of digits in $b$.  For the purposes of this discussion we will assume that the number of digits in $a$ is $2m$, which is generally true if 
+two $m$-digit numbers have been multiplied.  Dividing $a$ by $b$ is the same as dividing a $2m$ digit integer by a $m$ digit integer.  Digits below the 
+$m - 1$'th digit of $a$ will contribute at most a value of $1$ to the quotient because $\beta^k < b$ for any $0 \le k \le m - 1$.  Another way to
+express this is by re-writing $a$ as two parts.  If $a' \equiv a \mbox{ (mod }b^m\mbox{)}$ and $a'' = a - a'$ then 
+${a \over b} \equiv {{a' + a''} \over b}$ which is equivalent to ${a' \over b} + {a'' \over b}$.  Since $a'$ is bound to be less than $b$ the quotient
+is bound by $0 \le {a' \over b} < 1$.
+
+Since the digits of $a'$ do not contribute much to the quotient the observation is that they might as well be zero.  However, if the digits 
+``might as well be zero'' they might as well not be there in the first place.  Let $q_0 = \lfloor a/\beta^{m-1} \rfloor$ represent the input
+with the irrelevant digits trimmed.  Now the modular reduction is trimmed to the almost equivalent equation
+
+\begin{equation}
+c = a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor
+\end{equation}
+
+Note that the original divisor $2^q$ has been replaced with $\beta^{m+1}$ where in this case $q$ is a multiple of $lg(\beta)$. Also note that the 
+exponent on the divisor when added to the amount $q_0$ was shifted by equals $2m$.  If the optimization had not been performed the divisor 
+would have the exponent $2m$ so in the end the exponents do ``add up''. Using the above equation the quotient 
+$\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ can be off from the true quotient by at most two.  The original fixed point quotient can be off
+by as much as one (\textit{provided the radix point is chosen suitably}) and now that the lower irrelevent digits have been trimmed the quotient
+can be off by an additional value of one for a total of at most two.  This implies that 
+$0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$.  By first subtracting $b$ times the quotient and then conditionally subtracting 
+$b$ once or twice the residue is found.
+
+The quotient is now found using $(m + 1)(m) = m^2 + m$ single precision multiplications and the residue with an additional $m^2$ single
+precision multiplications, ignoring the subtractions required.  In total $2m^2 + m$ single precision multiplications are required to find the residue.  
+This is considerably faster than the original attempt.
+
+For example, let $\beta = 10$ represent the radix of the digits.  Let $b = 9999$ represent the modulus which implies $m = 4$. Let $a = 99929878$ 
+represent the value of which the residue is desired.  In this case $q = 8$ since $10^7 < 9999^2$ meaning that $\mu = \lfloor \beta^{q}/b \rfloor = 10001$.  
+With the new observation the multiplicand for the quotient is equal to $q_0 = \lfloor a / \beta^{m - 1} \rfloor = 99929$.  The quotient is then 
+$\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor = 9993$.  Subtracting $9993b$ from $a$ and the correct residue $a \equiv 9871 \mbox{ (mod }b\mbox{)}$ 
+is found.  
+
+\subsection{Trimming the Quotient}
+So far the reduction algorithm has been optimized from $3m^2$ single precision multiplications down to $2m^2 + m$ single precision multiplications.  As 
+it stands now the algorithm is already fairly fast compared to a full integer division algorithm.  However, there is still room for
+optimization.  
+
+After the first multiplication inside the quotient ($q_0 \cdot \mu$) the value is shifted right by $m + 1$ places effectively nullifying the lower
+half of the product.  It would be nice to be able to remove those digits from the product to effectively cut down the number of single precision 
+multiplications.  If the number of digits in the modulus $m$ is far less than $\beta$ a full product is not required for the algorithm to work properly.  
+In fact the lower $m - 2$ digits will not affect the upper half of the product at all and do not need to be computed.  
+
+The value of $\mu$ is a $m$-digit number and $q_0$ is a $m + 1$ digit number.  Using a full multiplier $(m + 1)(m) = m^2 + m$ single precision
+multiplications would be required.  Using a multiplier that will only produce digits at and above the $m - 1$'th digit reduces the number
+of single precision multiplications to ${m^2 + m} \over 2$ single precision multiplications.  
+
+\subsection{Trimming the Residue}
+After the quotient has been calculated it is used to reduce the input.  As previously noted the algorithm is not exact and it can be off by a small
+multiple of the modulus, that is $0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$.  If $b$ is $m$ digits than the 
+result of reduction equation is a value of at most $m + 1$ digits (\textit{provided $3 < \beta$}) implying that the upper $m - 1$ digits are
+implicitly zero.  
+
+The next optimization arises from this very fact.  Instead of computing $b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ using a full
+$O(m^2)$ multiplication algorithm only the lower $m+1$ digits of the product have to be computed.  Similarly the value of $a$ can
+be reduced modulo $\beta^{m+1}$ before the multiple of $b$ is subtracted which simplifes the subtraction as well.  A multiplication that produces 
+only the lower $m+1$ digits requires ${m^2 + 3m - 2} \over 2$ single precision multiplications.  
+
+With both optimizations in place the algorithm is the algorithm Barrett proposed.  It requires $m^2 + 2m - 1$ single precision multiplications which
+is considerably faster than the straightforward $3m^2$ method.  
+
+\subsection{The Barrett Algorithm}
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce}. \\
+\textbf{Input}.   mp\_int $a$, mp\_int $b$ and $\mu = \lfloor \beta^{2m}/b \rfloor, m = \lceil lg_{\beta}(b) \rceil, (0 \le a < b^2, b > 1)$ \\
+\textbf{Output}.  $a \mbox{ (mod }b\mbox{)}$ \\
+\hline \\
+Let $m$ represent the number of digits in $b$.  \\
+1.  Make a copy of $a$ and store it in $q$.  (\textit{mp\_init\_copy}) \\
+2.  $q \leftarrow \lfloor q / \beta^{m - 1} \rfloor$ (\textit{mp\_rshd}) \\
+\\
+Produce the quotient. \\
+3.  $q \leftarrow q \cdot \mu$  (\textit{note: only produce digits at or above $m-1$}) \\
+4.  $q \leftarrow \lfloor q / \beta^{m + 1} \rfloor$ \\
+\\
+Subtract the multiple of modulus from the input. \\
+5.  $a \leftarrow a \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+6.  $q \leftarrow q \cdot b \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{s\_mp\_mul\_digs}) \\
+7.  $a \leftarrow a - q$ (\textit{mp\_sub}) \\
+\\
+Add $\beta^{m+1}$ if a carry occured. \\
+8.  If $a < 0$ then (\textit{mp\_cmp\_d}) \\
+\hspace{3mm}8.1  $q \leftarrow 1$ (\textit{mp\_set}) \\
+\hspace{3mm}8.2  $q \leftarrow q \cdot \beta^{m+1}$ (\textit{mp\_lshd}) \\
+\hspace{3mm}8.3  $a \leftarrow a + q$ \\
+\\
+Now subtract the modulus if the residue is too large (e.g. quotient too small). \\
+9.  While $a \ge b$ do (\textit{mp\_cmp}) \\
+\hspace{3mm}9.1  $c \leftarrow a - b$ \\
+10.  Clear $q$. \\
+11.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce.}
+This algorithm will reduce the input $a$ modulo $b$ in place using the Barrett algorithm.  It is loosely based on algorithm 14.42 of HAC
+\cite[pp.  602]{HAC} which is based on the paper from Paul Barrett \cite{BARRETT}.  The algorithm has several restrictions and assumptions which must 
+be adhered to for the algorithm to work.
+
+First the modulus $b$ is assumed to be positive and greater than one.  If the modulus were less than or equal to one than subtracting
+a multiple of it would either accomplish nothing or actually enlarge the input.  The input $a$ must be in the range $0 \le a < b^2$ in order
+for the quotient to have enough precision.  If $a$ is the product of two numbers that were already reduced modulo $b$, this will not be a problem.
+Technically the algorithm will still work if $a \ge b^2$ but it will take much longer to finish.  The value of $\mu$ is passed as an argument to this 
+algorithm and is assumed to be calculated and stored before the algorithm is used.  
+
+Recall that the multiplication for the quotient on step 3 must only produce digits at or above the $m-1$'th position.  An algorithm called 
+$s\_mp\_mul\_high\_digs$ which has not been presented is used to accomplish this task.  The algorithm is based on $s\_mp\_mul\_digs$ except that
+instead of stopping at a given level of precision it starts at a given level of precision.  This optimal algorithm can only be used if the number
+of digits in $b$ is very much smaller than $\beta$.  
+
+While it is known that 
+$a \ge b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ only the lower $m+1$ digits are being used to compute the residue, so an implied 
+``borrow'' from the higher digits might leave a negative result.  After the multiple of the modulus has been subtracted from $a$ the residue must be 
+fixed up in case it is negative.  The invariant $\beta^{m+1}$ must be added to the residue to make it positive again.  
+
+The while loop at step 9 will subtract $b$ until the residue is less than $b$.  If the algorithm is performed correctly this step is 
+performed at most twice, and on average once. However, if $a \ge b^2$ than it will iterate substantially more times than it should.
+
+EXAM,bn_mp_reduce.c
+
+The first multiplication that determines the quotient can be performed by only producing the digits from $m - 1$ and up.  This essentially halves
+the number of single precision multiplications required.  However, the optimization is only safe if $\beta$ is much larger than the number of digits
+in the modulus.  In the source code this is evaluated on lines @36,if@ to @44,}@ where algorithm s\_mp\_mul\_high\_digs is used when it is
+safe to do so.  
+
+\subsection{The Barrett Setup Algorithm}
+In order to use algorithm mp\_reduce the value of $\mu$ must be calculated in advance.  Ideally this value should be computed once and stored for
+future use so that the Barrett algorithm can be used without delay.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce\_setup}. \\
+\textbf{Input}.   mp\_int $a$ ($a > 1$)  \\
+\textbf{Output}.  $\mu \leftarrow \lfloor \beta^{2m}/a \rfloor$ \\
+\hline \\
+1.  $\mu \leftarrow 2^{2 \cdot lg(\beta) \cdot  m}$ (\textit{mp\_2expt}) \\
+2.  $\mu \leftarrow \lfloor \mu / b \rfloor$ (\textit{mp\_div}) \\
+3.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce\_setup}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce\_setup.}
+This algorithm computes the reciprocal $\mu$ required for Barrett reduction.  First $\beta^{2m}$ is calculated as $2^{2 \cdot lg(\beta) \cdot  m}$ which
+is equivalent and much faster.  The final value is computed by taking the integer quotient of $\lfloor \mu / b \rfloor$.
+
+EXAM,bn_mp_reduce_setup.c
+
+This simple routine calculates the reciprocal $\mu$ required by Barrett reduction.  Note the extended usage of algorithm mp\_div where the variable
+which would received the remainder is passed as NULL.  As will be discussed in~\ref{sec:division} the division routine allows both the quotient and the 
+remainder to be passed as NULL meaning to ignore the value.  
+
+\section{The Montgomery Reduction}
+Montgomery reduction\footnote{Thanks to Niels Ferguson for his insightful explanation of the algorithm.} \cite{MONT} is by far the most interesting 
+form of reduction in common use.  It computes a modular residue which is not actually equal to the residue of the input yet instead equal to a 
+residue times a constant.  However, as perplexing as this may sound the algorithm is relatively simple and very efficient.  
+
+Throughout this entire section the variable $n$ will represent the modulus used to form the residue.  As will be discussed shortly the value of
+$n$ must be odd.  The variable $x$ will represent the quantity of which the residue is sought.  Similar to the Barrett algorithm the input
+is restricted to $0 \le x < n^2$.  To begin the description some simple number theory facts must be established.
+
+\textbf{Fact 1.}  Adding $n$ to $x$ does not change the residue since in effect it adds one to the quotient $\lfloor x / n \rfloor$.  Another way
+to explain this is that $n$ is (\textit{or multiples of $n$ are}) congruent to zero modulo $n$.  Adding zero will not change the value of the residue.  
+
+\textbf{Fact 2.}  If $x$ is even then performing a division by two in $\Z$ is congruent to $x \cdot 2^{-1} \mbox{ (mod }n\mbox{)}$.  Actually
+this is an application of the fact that if $x$ is evenly divisible by any $k \in \Z$ then division in $\Z$ will be congruent to 
+multiplication by $k^{-1}$ modulo $n$.  
+
+From these two simple facts the following simple algorithm can be derived.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Montgomery Reduction}. \\
+\textbf{Input}.   Integer $x$, $n$ and $k$ \\
+\textbf{Output}.  $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+1.  for $t$ from $1$ to $k$ do \\
+\hspace{3mm}1.1  If $x$ is odd then \\
+\hspace{6mm}1.1.1  $x \leftarrow x + n$ \\
+\hspace{3mm}1.2  $x \leftarrow x/2$ \\
+2.  Return $x$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Montgomery Reduction}
+\end{figure}
+
+The algorithm reduces the input one bit at a time using the two congruencies stated previously.  Inside the loop $n$, which is odd, is
+added to $x$ if $x$ is odd.  This forces $x$ to be even which allows the division by two in $\Z$ to be congruent to a modular division by two.  Since
+$x$ is assumed to be initially much larger than $n$ the addition of $n$ will contribute an insignificant magnitude to $x$.  Let $r$ represent the 
+final result of the Montgomery algorithm.  If $k > lg(n)$ and $0 \le x < n^2$ then the final result is limited to 
+$0 \le r < \lfloor x/2^k \rfloor + n$.  As a result at most a single subtraction is required to get the residue desired.
+
+\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|l|}
+\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} \\
+\hline $1$ & $x + n = 5812$, $x/2 = 2906$ \\
+\hline $2$ & $x/2 = 1453$ \\
+\hline $3$ & $x + n = 1710$, $x/2 = 855$ \\
+\hline $4$ & $x + n = 1112$, $x/2 = 556$ \\
+\hline $5$ & $x/2 = 278$ \\
+\hline $6$ & $x/2 = 139$ \\
+\hline $7$ & $x + n = 396$, $x/2 = 198$ \\
+\hline $8$ & $x/2 = 99$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Example of Montgomery Reduction (I)}
+\label{fig:MONT1}
+\end{figure}
+
+Consider the example in figure~\ref{fig:MONT1} which reduces $x = 5555$ modulo $n = 257$ when $k = 8$.  The result of the algorithm $r = 99$ is
+congruent to the value of $2^{-8} \cdot 5555 \mbox{ (mod }257\mbox{)}$.  When $r$ is multiplied by $2^8$ modulo $257$ the correct residue 
+$r \equiv 158$ is produced.  
+
+Let $k = \lfloor lg(n) \rfloor + 1$ represent the number of bits in $n$.  The current algorithm requires $2k^2$ single precision shifts
+and $k^2$ single precision additions.  At this rate the algorithm is most certainly slower than Barrett reduction and not terribly useful.  
+Fortunately there exists an alternative representation of the algorithm.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Montgomery Reduction} (modified I). \\
+\textbf{Input}.   Integer $x$, $n$ and $k$ \\
+\textbf{Output}.  $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+1.  for $t$ from $0$ to $k - 1$ do \\
+\hspace{3mm}1.1  If the $t$'th bit of $x$ is one then \\
+\hspace{6mm}1.1.1  $x \leftarrow x + 2^tn$ \\
+2.  Return $x/2^k$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Montgomery Reduction (modified I)}
+\end{figure}
+
+This algorithm is equivalent since $2^tn$ is a multiple of $n$ and the lower $k$ bits of $x$ are zero by step 2.  The number of single
+precision shifts has now been reduced from $2k^2$ to $k^2 + k$ which is only a small improvement.
+
+\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|l|r|}
+\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} & \textbf{Result ($x$) in Binary} \\
+\hline -- & $5555$ & $1010110110011$ \\
+\hline $1$ & $x + 2^{0}n = 5812$ &  $1011010110100$ \\
+\hline $2$ & $5812$ & $1011010110100$ \\
+\hline $3$ & $x + 2^{2}n = 6840$ & $1101010111000$ \\
+\hline $4$ & $x + 2^{3}n = 8896$ & $10001011000000$ \\
+\hline $5$ & $8896$ & $10001011000000$ \\
+\hline $6$ & $8896$ & $10001011000000$ \\
+\hline $7$ & $x + 2^{6}n = 25344$ & $110001100000000$ \\
+\hline $8$ & $25344$ & $110001100000000$ \\
+\hline -- & $x/2^k = 99$ & \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Example of Montgomery Reduction (II)}
+\label{fig:MONT2}
+\end{figure}
+
+Figure~\ref{fig:MONT2} demonstrates the modified algorithm reducing $x = 5555$ modulo $n = 257$ with $k = 8$. 
+With this algorithm a single shift right at the end is the only right shift required to reduce the input instead of $k$ right shifts inside the 
+loop.  Note that for the iterations $t = 2, 5, 6$ and $8$ where the result $x$ is not changed.  In those iterations the $t$'th bit of $x$ is 
+zero and the appropriate multiple of $n$ does not need to be added to force the $t$'th bit of the result to zero.  
+
+\subsection{Digit Based Montgomery Reduction}
+Instead of computing the reduction on a bit-by-bit basis it is actually much faster to compute it on digit-by-digit basis.  Consider the
+previous algorithm re-written to compute the Montgomery reduction in this new fashion.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Montgomery Reduction} (modified II). \\
+\textbf{Input}.   Integer $x$, $n$ and $k$ \\
+\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+1.  for $t$ from $0$ to $k - 1$ do \\
+\hspace{3mm}1.1  $x \leftarrow x + \mu n \beta^t$ \\
+2.  Return $x/\beta^k$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Montgomery Reduction (modified II)}
+\end{figure}
+
+The value $\mu n \beta^t$ is a multiple of the modulus $n$ meaning that it will not change the residue.  If the first digit of 
+the value $\mu n \beta^t$ equals the negative (modulo $\beta$) of the $t$'th digit of $x$ then the addition will result in a zero digit.  This
+problem breaks down to solving the following congruency.  
+
+\begin{center}
+\begin{tabular}{rcl}
+$x_t + \mu n_0$ & $\equiv$ & $0 \mbox{ (mod }\beta\mbox{)}$ \\
+$\mu n_0$ & $\equiv$ & $-x_t \mbox{ (mod }\beta\mbox{)}$ \\
+$\mu$ & $\equiv$ & $-x_t/n_0 \mbox{ (mod }\beta\mbox{)}$ \\
+\end{tabular}
+\end{center}
+
+In each iteration of the loop on step 1 a new value of $\mu$ must be calculated.  The value of $-1/n_0 \mbox{ (mod }\beta\mbox{)}$ is used 
+extensively in this algorithm and should be precomputed.  Let $\rho$ represent the negative of the modular inverse of $n_0$ modulo $\beta$.  
+
+For example, let $\beta = 10$ represent the radix.  Let $n = 17$ represent the modulus which implies $k = 2$ and $\rho \equiv 7$.  Let $x = 33$ 
+represent the value to reduce.
+
+\newpage\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|c|}
+\hline \textbf{Step ($t$)} & \textbf{Value of $x$} & \textbf{Value of $\mu$} \\
+\hline --                 & $33$ & --\\
+\hline $0$                 & $33 + \mu n = 50$ & $1$ \\
+\hline $1$                 & $50 + \mu n \beta = 900$ & $5$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Example of Montgomery Reduction}
+\end{figure}
+
+The final result $900$ is then divided by $\beta^k$ to produce the final result $9$.  The first observation is that $9 \nequiv x \mbox{ (mod }n\mbox{)}$ 
+which implies the result is not the modular residue of $x$ modulo $n$.  However, recall that the residue is actually multiplied by $\beta^{-k}$ in
+the algorithm.  To get the true residue the value must be multiplied by $\beta^k$.  In this case $\beta^k \equiv 15 \mbox{ (mod }n\mbox{)}$ and
+the correct residue is $9 \cdot 15 \equiv 16 \mbox{ (mod }n\mbox{)}$.  
+
+\subsection{Baseline Montgomery Reduction}
+The baseline Montgomery reduction algorithm will produce the residue for any size input.  It is designed to be a catch-all algororithm for 
+Montgomery reductions.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_montgomery\_reduce}. \\
+\textbf{Input}.   mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }n\mbox{)}$. \\
+\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\
+\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+1.  $digs \leftarrow 2n.used + 1$ \\
+2.  If $digs < MP\_ARRAY$ and $m.used < \delta$ then \\
+\hspace{3mm}2.1  Use algorithm fast\_mp\_montgomery\_reduce instead. \\
+\\
+Setup $x$ for the reduction. \\
+3.  If $x.alloc < digs$ then grow $x$ to $digs$ digits. \\
+4.  $x.used \leftarrow digs$ \\
+\\
+Eliminate the lower $k$ digits. \\
+5.  For $ix$ from $0$ to $k - 1$ do \\
+\hspace{3mm}5.1  $\mu \leftarrow x_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}5.2  $u \leftarrow 0$ \\
+\hspace{3mm}5.3  For $iy$ from $0$ to $k - 1$ do \\
+\hspace{6mm}5.3.1  $\hat r \leftarrow \mu n_{iy} + x_{ix + iy} + u$ \\
+\hspace{6mm}5.3.2  $x_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}5.3.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+\hspace{3mm}5.4  While $u > 0$ do \\
+\hspace{6mm}5.4.1  $iy \leftarrow iy + 1$ \\
+\hspace{6mm}5.4.2  $x_{ix + iy} \leftarrow x_{ix + iy} + u$ \\
+\hspace{6mm}5.4.3  $u \leftarrow \lfloor x_{ix+iy} / \beta \rfloor$ \\
+\hspace{6mm}5.4.4  $x_{ix + iy} \leftarrow x_{ix+iy} \mbox{ (mod }\beta\mbox{)}$ \\
+\\
+Divide by $\beta^k$ and fix up as required. \\
+6.  $x \leftarrow \lfloor x / \beta^k \rfloor$ \\
+7.  If $x \ge n$ then \\
+\hspace{3mm}7.1  $x \leftarrow x - n$ \\
+8.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_montgomery\_reduce}
+\end{figure}
+
+\textbf{Algorithm mp\_montgomery\_reduce.}
+This algorithm reduces the input $x$ modulo $n$ in place using the Montgomery reduction algorithm.  The algorithm is loosely based
+on algorithm 14.32 of \cite[pp.601]{HAC} except it merges the multiplication of $\mu n \beta^t$ with the addition in the inner loop.  The
+restrictions on this algorithm are fairly easy to adapt to.  First $0 \le x < n^2$ bounds the input to numbers in the same range as 
+for the Barrett algorithm.  Additionally if $n > 1$ and $n$ is odd there will exist a modular inverse $\rho$.  $\rho$ must be calculated in
+advance of this algorithm.  Finally the variable $k$ is fixed and a pseudonym for $n.used$.  
+
+Step 2 decides whether a faster Montgomery algorithm can be used.  It is based on the Comba technique meaning that there are limits on
+the size of the input.  This algorithm is discussed in ~COMBARED~.
+
+Step 5 is the main reduction loop of the algorithm.  The value of $\mu$ is calculated once per iteration in the outer loop.  The inner loop
+calculates $x + \mu n \beta^{ix}$ by multiplying $\mu n$ and adding the result to $x$ shifted by $ix$ digits.  Both the addition and
+multiplication are performed in the same loop to save time and memory.  Step 5.4 will handle any additional carries that escape the inner loop.
+
+Using a quick inspection this algorithm requires $n$ single precision multiplications for the outer loop and $n^2$ single precision multiplications 
+in the inner loop.  In total $n^2 + n$ single precision multiplications which compares favourably to Barrett at $n^2 + 2n - 1$ single precision
+multiplications.  
+
+EXAM,bn_mp_montgomery_reduce.c
+
+This is the baseline implementation of the Montgomery reduction algorithm.  Lines @30,digs@ to @35,}@ determine if the Comba based
+routine can be used instead.  Line @47,mu@ computes the value of $\mu$ for that particular iteration of the outer loop.  
+
+The multiplication $\mu n \beta^{ix}$ is performed in one step in the inner loop.  The alias $tmpx$ refers to the $ix$'th digit of $x$ and
+the alias $tmpn$ refers to the modulus $n$.  
+
+\subsection{Faster ``Comba'' Montgomery Reduction}
+MARK,COMBARED
+
+The Montgomery reduction requires fewer single precision multiplications than a Barrett reduction, however it is much slower due to the serial
+nature of the inner loop.  The Barrett reduction algorithm requires two slightly modified multipliers which can be implemented with the Comba
+technique.  The Montgomery reduction algorithm cannot directly use the Comba technique to any significant advantage since the inner loop calculates
+a $k \times 1$ product $k$ times. 
+
+The biggest obstacle is that at the $ix$'th iteration of the outer loop the value of $x_{ix}$ is required to calculate $\mu$.  This means the 
+carries from $0$ to $ix - 1$ must have been propagated upwards to form a valid $ix$'th digit.  The solution as it turns out is very simple.  
+Perform a Comba like multiplier and inside the outer loop just after the inner loop fix up the $ix + 1$'th digit by forwarding the carry.  
+
+With this change in place the Montgomery reduction algorithm can be performed with a Comba style multiplication loop which substantially increases
+the speed of the algorithm.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{fast\_mp\_montgomery\_reduce}. \\
+\textbf{Input}.   mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }n\mbox{)}$. \\
+\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\
+\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+Place an array of \textbf{MP\_WARRAY} mp\_word variables called $\hat W$ on the stack. \\
+1.  if $x.alloc < n.used + 1$ then grow $x$ to $n.used + 1$ digits. \\
+Copy the digits of $x$ into the array $\hat W$ \\
+2.  For $ix$ from $0$ to $x.used - 1$ do \\
+\hspace{3mm}2.1  $\hat W_{ix} \leftarrow x_{ix}$ \\
+3.  For $ix$ from $x.used$ to $2n.used - 1$ do \\
+\hspace{3mm}3.1  $\hat W_{ix} \leftarrow 0$ \\
+Elimiate the lower $k$ digits. \\
+4.  for $ix$ from $0$ to $n.used - 1$ do \\
+\hspace{3mm}4.1  $\mu \leftarrow \hat W_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}4.2  For $iy$ from $0$ to $n.used - 1$ do \\
+\hspace{6mm}4.2.1  $\hat W_{iy + ix} \leftarrow \hat W_{iy + ix} + \mu \cdot n_{iy}$ \\
+\hspace{3mm}4.3  $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\
+Propagate carries upwards. \\
+5.  for $ix$ from $n.used$ to $2n.used + 1$ do \\
+\hspace{3mm}5.1  $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\
+Shift right and reduce modulo $\beta$ simultaneously. \\
+6.  for $ix$ from $0$ to $n.used + 1$ do \\
+\hspace{3mm}6.1  $x_{ix} \leftarrow \hat W_{ix + n.used} \mbox{ (mod }\beta\mbox{)}$ \\
+Zero excess digits and fixup $x$. \\
+7.  if $x.used > n.used + 1$ then do \\
+\hspace{3mm}7.1  for $ix$ from $n.used + 1$ to $x.used - 1$ do \\
+\hspace{6mm}7.1.1  $x_{ix} \leftarrow 0$ \\
+8.  $x.used \leftarrow n.used + 1$ \\
+9.  Clamp excessive digits of $x$. \\
+10.  If $x \ge n$ then \\
+\hspace{3mm}10.1  $x \leftarrow x - n$ \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm fast\_mp\_montgomery\_reduce}
+\end{figure}
+
+\textbf{Algorithm fast\_mp\_montgomery\_reduce.}
+This algorithm will compute the Montgomery reduction of $x$ modulo $n$ using the Comba technique.  It is on most computer platforms significantly
+faster than algorithm mp\_montgomery\_reduce and algorithm mp\_reduce (\textit{Barrett reduction}).  The algorithm has the same restrictions
+on the input as the baseline reduction algorithm.  An additional two restrictions are imposed on this algorithm.  The number of digits $k$ in the 
+the modulus $n$ must not violate $MP\_WARRAY > 2k +1$ and $n < \delta$.   When $\beta = 2^{28}$ this algorithm can be used to reduce modulo
+a modulus of at most $3,556$ bits in length.  
+
+As in the other Comba reduction algorithms there is a $\hat W$ array which stores the columns of the product.  It is initially filled with the
+contents of $x$ with the excess digits zeroed.  The reduction loop is very similar the to the baseline loop at heart.  The multiplication on step
+4.1 can be single precision only since $ab \mbox{ (mod }\beta\mbox{)} \equiv (a \mbox{ mod }\beta)(b \mbox{ mod }\beta)$.  Some multipliers such
+as those on the ARM processors take a variable length time to complete depending on the number of bytes of result it must produce.  By performing
+a single precision multiplication instead half the amount of time is spent.
+
+Also note that digit $\hat W_{ix}$ must have the carry from the $ix - 1$'th digit propagated upwards in order for this to work.  That is what step
+4.3 will do.  In effect over the $n.used$ iterations of the outer loop the $n.used$'th lower columns all have the their carries propagated forwards.  Note
+how the upper bits of those same words are not reduced modulo $\beta$.  This is because those values will be discarded shortly and there is no
+point.
+
+Step 5 will propagate the remainder of the carries upwards.  On step 6 the columns are reduced modulo $\beta$ and shifted simultaneously as they are
+stored in the destination $x$.  
+
+EXAM,bn_fast_mp_montgomery_reduce.c
+
+The $\hat W$ array is first filled with digits of $x$ on line @49,for@ then the rest of the digits are zeroed on line @54,for@.  Both loops share
+the same alias variables to make the code easier to read.  
+
+The value of $\mu$ is calculated in an interesting fashion.  First the value $\hat W_{ix}$ is reduced modulo $\beta$ and cast to a mp\_digit.  This
+forces the compiler to use a single precision multiplication and prevents any concerns about loss of precision.   Line @101,>>@ fixes the carry 
+for the next iteration of the loop by propagating the carry from $\hat W_{ix}$ to $\hat W_{ix+1}$.
+
+The for loop on line @113,for@ propagates the rest of the carries upwards through the columns.  The for loop on line @126,for@ reduces the columns
+modulo $\beta$ and shifts them $k$ places at the same time.  The alias $\_ \hat W$ actually refers to the array $\hat W$ starting at the $n.used$'th
+digit, that is $\_ \hat W_{t} = \hat W_{n.used + t}$.  
+
+\subsection{Montgomery Setup}
+To calculate the variable $\rho$ a relatively simple algorithm will be required.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_montgomery\_setup}. \\
+\textbf{Input}.   mp\_int $n$ ($n > 1$ and $(n, 2) = 1$) \\
+\textbf{Output}.  $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$ \\
+\hline \\
+1.  $b \leftarrow n_0$ \\
+2.  If $b$ is even return(\textit{MP\_VAL}) \\
+3.  $x \leftarrow ((b + 2) \mbox{ AND } 4) << 1) + b$ \\
+4.  for $k$ from 0 to $\lceil lg(lg(\beta)) \rceil - 2$ do \\
+\hspace{3mm}4.1  $x \leftarrow x \cdot (2 - bx)$ \\
+5.  $\rho \leftarrow \beta - x \mbox{ (mod }\beta\mbox{)}$ \\
+6.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_montgomery\_setup} 
+\end{figure}
+
+\textbf{Algorithm mp\_montgomery\_setup.}
+This algorithm will calculate the value of $\rho$ required within the Montgomery reduction algorithms.  It uses a very interesting trick 
+to calculate $1/n_0$ when $\beta$ is a power of two.  
+
+EXAM,bn_mp_montgomery_setup.c
+
+This source code computes the value of $\rho$ required to perform Montgomery reduction.  It has been modified to avoid performing excess
+multiplications when $\beta$ is not the default 28-bits.  
+
+\section{The Diminished Radix Algorithm}
+The Diminished Radix method of modular reduction \cite{DRMET} is a fairly clever technique which can be more efficient than either the Barrett
+or Montgomery methods for certain forms of moduli.  The technique is based on the following simple congruence.
+
+\begin{equation}
+(x \mbox{ mod } n) + k \lfloor x / n \rfloor \equiv x \mbox{ (mod }(n - k)\mbox{)}
+\end{equation}
+
+This observation was used in the MMB \cite{MMB} block cipher to create a diffusion primitive.  It used the fact that if $n = 2^{31}$ and $k=1$ that 
+then a x86 multiplier could produce the 62-bit product and use  the ``shrd'' instruction to perform a double-precision right shift.  The proof
+of the above equation is very simple.  First write $x$ in the product form.
+
+\begin{equation}
+x = qn + r
+\end{equation}
+
+Now reduce both sides modulo $(n - k)$.
+
+\begin{equation}
+x \equiv qk + r  \mbox{ (mod }(n-k)\mbox{)}
+\end{equation}
+
+The variable $n$ reduces modulo $n - k$ to $k$.  By putting $q = \lfloor x/n \rfloor$ and $r = x \mbox{ mod } n$ 
+into the equation the original congruence is reproduced, thus concluding the proof.  The following algorithm is based on this observation.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Diminished Radix Reduction}. \\
+\textbf{Input}.   Integer $x$, $n$, $k$ \\
+\textbf{Output}.  $x \mbox{ mod } (n - k)$ \\
+\hline \\
+1.  $q \leftarrow \lfloor x / n \rfloor$ \\
+2.  $q \leftarrow k \cdot q$ \\
+3.  $x \leftarrow x \mbox{ (mod }n\mbox{)}$ \\
+4.  $x \leftarrow x + q$ \\
+5.  If $x \ge (n - k)$ then \\
+\hspace{3mm}5.1  $x \leftarrow x - (n - k)$ \\
+\hspace{3mm}5.2  Goto step 1. \\
+6.  Return $x$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Diminished Radix Reduction}
+\label{fig:DR}
+\end{figure}
+
+This algorithm will reduce $x$ modulo $n - k$ and return the residue.  If $0 \le x < (n - k)^2$ then the algorithm will loop almost always
+once or twice and occasionally three times.  For simplicity sake the value of $x$ is bounded by the following simple polynomial.
+
+\begin{equation} 
+0 \le x < n^2 + k^2 - 2nk
+\end{equation}
+
+The true bound is  $0 \le x < (n - k - 1)^2$ but this has quite a few more terms.  The value of $q$ after step 1 is bounded by the following.
+
+\begin{equation}
+q < n - 2k - k^2/n
+\end{equation}
+
+Since $k^2$ is going to be considerably smaller than $n$ that term will always be zero.  The value of $x$ after step 3 is bounded trivially as
+$0 \le x < n$.  By step four the sum $x + q$ is bounded by 
+
+\begin{equation}
+0 \le q + x < (k + 1)n - 2k^2 - 1
+\end{equation}
+
+With a second pass $q$ will be loosely bounded by $0 \le q < k^2$ after step 2 while $x$ will still be loosely bounded by $0 \le x < n$ after step 3.  After the second pass it is highly unlike that the
+sum in step 4 will exceed $n - k$.  In practice fewer than three passes of the algorithm are required to reduce virtually every input in the 
+range $0 \le x < (n - k - 1)^2$.  
+
+\begin{figure}
+\begin{small}
+\begin{center}
+\begin{tabular}{|l|}
+\hline
+$x = 123456789, n = 256, k = 3$ \\
+\hline $q \leftarrow \lfloor x/n \rfloor = 482253$ \\
+$q \leftarrow q*k = 1446759$ \\
+$x \leftarrow x \mbox{ mod } n = 21$ \\
+$x \leftarrow x + q = 1446780$ \\
+$x \leftarrow x - (n - k) = 1446527$ \\
+\hline 
+$q \leftarrow \lfloor x/n \rfloor = 5650$ \\
+$q \leftarrow q*k = 16950$ \\
+$x \leftarrow x \mbox{ mod } n = 127$ \\
+$x \leftarrow x + q = 17077$ \\
+$x \leftarrow x - (n - k) = 16824$ \\
+\hline 
+$q \leftarrow \lfloor x/n \rfloor = 65$ \\
+$q \leftarrow q*k = 195$ \\
+$x \leftarrow x \mbox{ mod } n = 184$ \\
+$x \leftarrow x + q = 379$ \\
+$x \leftarrow x - (n - k) = 126$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Example Diminished Radix Reduction}
+\label{fig:EXDR}
+\end{figure}
+
+Figure~\ref{fig:EXDR} demonstrates the reduction of $x = 123456789$ modulo $n - k = 253$ when $n = 256$ and $k = 3$.  Note that even while $x$
+is considerably larger than $(n - k - 1)^2 = 63504$ the algorithm still converges on the modular residue exceedingly fast.  In this case only
+three passes were required to find the residue $x \equiv 126$.
+
+
+\subsection{Choice of Moduli}
+On the surface this algorithm looks like a very expensive algorithm.  It requires a couple of subtractions followed by multiplication and other
+modular reductions.  The usefulness of this algorithm becomes exceedingly clear when an appropriate modulus is chosen.
+
+Division in general is a very expensive operation to perform.  The one exception is when the division is by a power of the radix of representation used.  
+Division by ten for example is simple for pencil and paper mathematics since it amounts to shifting the decimal place to the right.  Similarly division 
+by two (\textit{or powers of two}) is very simple for binary computers to perform.  It would therefore seem logical to choose $n$ of the form $2^p$ 
+which would imply that $\lfloor x / n \rfloor$ is a simple shift of $x$ right $p$ bits.  
+
+However, there is one operation related to division of power of twos that is even faster than this.  If $n = \beta^p$ then the division may be 
+performed by moving whole digits to the right $p$ places.  In practice division by $\beta^p$ is much faster than division by $2^p$ for any $p$.  
+Also with the choice of $n = \beta^p$ reducing $x$ modulo $n$ merely requires zeroing the digits above the $p-1$'th digit of $x$.  
+
+Throughout the next section the term ``restricted modulus'' will refer to a modulus of the form $\beta^p - k$ whereas the term ``unrestricted
+modulus'' will refer to a modulus of the form $2^p - k$.  The word ``restricted'' in this case refers to the fact that it is based on the 
+$2^p$ logic except $p$ must be a multiple of $lg(\beta)$.  
+
+\subsection{Choice of $k$}
+Now that division and reduction (\textit{step 1 and 3 of figure~\ref{fig:DR}}) have been optimized to simple digit operations the multiplication by $k$
+in step 2 is the most expensive operation.  Fortunately the choice of $k$ is not terribly limited.  For all intents and purposes it might
+as well be a single digit.  The smaller the value of $k$ is the faster the algorithm will be.  
+
+\subsection{Restricted Diminished Radix Reduction}
+The restricted Diminished Radix algorithm can quickly reduce an input modulo a modulus of the form $n = \beta^p - k$.  This algorithm can reduce 
+an input $x$ within the range $0 \le x < n^2$ using only a couple passes of the algorithm demonstrated in figure~\ref{fig:DR}.  The implementation
+of this algorithm has been optimized to avoid additional overhead associated with a division by $\beta^p$, the multiplication by $k$ or the addition 
+of $x$ and $q$.  The resulting algorithm is very efficient and can lead to substantial improvements over Barrett and Montgomery reduction when modular 
+exponentiations are performed.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_dr\_reduce}. \\
+\textbf{Input}.   mp\_int $x$, $n$ and a mp\_digit $k = \beta - n_0$ \\
+\hspace{11.5mm}($0 \le x < n^2$, $n > 1$, $0 < k < \beta$) \\
+\textbf{Output}.  $x \mbox{ mod } n$ \\
+\hline \\
+1.  $m \leftarrow n.used$ \\
+2.  If $x.alloc < 2m$ then grow $x$ to $2m$ digits. \\
+3.  $\mu \leftarrow 0$ \\
+4.  for $i$ from $0$ to $m - 1$ do \\
+\hspace{3mm}4.1  $\hat r \leftarrow k \cdot x_{m+i} + x_{i} + \mu$ \\
+\hspace{3mm}4.2  $x_{i} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}4.3  $\mu \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+5.  $x_{m} \leftarrow \mu$ \\
+6.  for $i$ from $m + 1$ to $x.used - 1$ do \\
+\hspace{3mm}6.1  $x_{i} \leftarrow 0$ \\
+7.  Clamp excess digits of $x$. \\
+8.  If $x \ge n$ then \\
+\hspace{3mm}8.1  $x \leftarrow x - n$ \\
+\hspace{3mm}8.2  Goto step 3. \\
+9.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_dr\_reduce}
+\end{figure}
+
+\textbf{Algorithm mp\_dr\_reduce.}
+This algorithm will perform the Dimished Radix reduction of $x$ modulo $n$.  It has similar restrictions to that of the Barrett reduction
+with the addition that $n$ must be of the form $n = \beta^m - k$ where $0 < k <\beta$.  
+
+This algorithm essentially implements the pseudo-code in figure~\ref{fig:DR} except with a slight optimization.  The division by $\beta^m$, multiplication by $k$
+and addition of $x \mbox{ mod }\beta^m$ are all performed simultaneously inside the loop on step 4.  The division by $\beta^m$ is emulated by accessing
+the term at the $m+i$'th position which is subsequently multiplied by $k$ and added to the term at the $i$'th position.  After the loop the $m$'th
+digit is set to the carry and the upper digits are zeroed.  Steps 5 and 6 emulate the reduction modulo $\beta^m$ that should have happend to 
+$x$ before the addition of the multiple of the upper half.  
+
+At step 8 if $x$ is still larger than $n$ another pass of the algorithm is required.  First $n$ is subtracted from $x$ and then the algorithm resumes
+at step 3.  
+
+EXAM,bn_mp_dr_reduce.c
+
+The first step is to grow $x$ as required to $2m$ digits since the reduction is performed in place on $x$.  The label on line @49,top:@ is where
+the algorithm will resume if further reduction passes are required.  In theory it could be placed at the top of the function however, the size of
+the modulus and question of whether $x$ is large enough are invariant after the first pass meaning that it would be a waste of time.  
+
+The aliases $tmpx1$ and $tmpx2$ refer to the digits of $x$ where the latter is offset by $m$ digits.  By reading digits from $x$ offset by $m$ digits
+a division by $\beta^m$ can be simulated virtually for free.  The loop on line @61,for@ performs the bulk of the work (\textit{corresponds to step 4 of algorithm 7.11})
+in this algorithm.
+
+By line @68,mu@ the pointer $tmpx1$ points to the $m$'th digit of $x$ which is where the final carry will be placed.  Similarly by line @71,for@ the 
+same pointer will point to the $m+1$'th digit where the zeroes will be placed.  
+
+Since the algorithm is only valid if both $x$ and $n$ are greater than zero an unsigned comparison suffices to determine if another pass is required.  
+With the same logic at line @82,sub@ the value of $x$ is known to be greater than or equal to $n$ meaning that an unsigned subtraction can be used
+as well.  Since the destination of the subtraction is the larger of the inputs the call to algorithm s\_mp\_sub cannot fail and the return code
+does not need to be checked.
+
+\subsubsection{Setup}
+To setup the restricted Diminished Radix algorithm the value $k = \beta - n_0$ is required.  This algorithm is not really complicated but provided for
+completeness.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_dr\_setup}. \\
+\textbf{Input}.   mp\_int $n$ \\
+\textbf{Output}.  $k = \beta - n_0$ \\
+\hline \\
+1.  $k \leftarrow \beta - n_0$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_dr\_setup}
+\end{figure}
+
+EXAM,bn_mp_dr_setup.c
+
+\subsubsection{Modulus Detection}
+Another algorithm which will be useful is the ability to detect a restricted Diminished Radix modulus.  An integer is said to be
+of restricted Diminished Radix form if all of the digits are equal to $\beta - 1$ except the trailing digit which may be any value.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_dr\_is\_modulus}. \\
+\textbf{Input}.   mp\_int $n$ \\
+\textbf{Output}.  $1$ if $n$ is in D.R form, $0$ otherwise \\
+\hline
+1.  If $n.used < 2$ then return($0$). \\
+2.  for $ix$ from $1$ to $n.used - 1$ do \\
+\hspace{3mm}2.1  If $n_{ix} \ne \beta - 1$ return($0$). \\
+3.  Return($1$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_dr\_is\_modulus}
+\end{figure}
+
+\textbf{Algorithm mp\_dr\_is\_modulus.}
+This algorithm determines if a value is in Diminished Radix form.  Step 1 rejects obvious cases where fewer than two digits are
+in the mp\_int.  Step 2 tests all but the first digit to see if they are equal to $\beta - 1$.  If the algorithm manages to get to
+step 3 then $n$ must be of Diminished Radix form.
+
+EXAM,bn_mp_dr_is_modulus.c
+
+\subsection{Unrestricted Diminished Radix Reduction}
+The unrestricted Diminished Radix algorithm allows modular reductions to be performed when the modulus is of the form $2^p - k$.  This algorithm
+is a straightforward adaptation of algorithm~\ref{fig:DR}.
+
+In general the restricted Diminished Radix reduction algorithm is much faster since it has considerably lower overhead.  However, this new
+algorithm is much faster than either Montgomery or Barrett reduction when the moduli are of the appropriate form.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce\_2k}. \\
+\textbf{Input}.   mp\_int $a$ and $n$.  mp\_digit $k$  \\
+\hspace{11.5mm}($a \ge 0$, $n > 1$, $0 < k < \beta$, $n + k$ is a power of two) \\
+\textbf{Output}.  $a \mbox{ (mod }n\mbox{)}$ \\
+\hline
+1.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
+2.  While $a \ge n$ do \\
+\hspace{3mm}2.1  $q \leftarrow \lfloor a / 2^p \rfloor$ (\textit{mp\_div\_2d}) \\
+\hspace{3mm}2.2  $a \leftarrow a \mbox{ (mod }2^p\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+\hspace{3mm}2.3  $q \leftarrow q \cdot k$ (\textit{mp\_mul\_d}) \\
+\hspace{3mm}2.4  $a \leftarrow a - q$ (\textit{s\_mp\_sub}) \\
+\hspace{3mm}2.5  If $a \ge n$ then do \\
+\hspace{6mm}2.5.1  $a \leftarrow a - n$ \\
+3.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce\_2k}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce\_2k.}
+This algorithm quickly reduces an input $a$ modulo an unrestricted Diminished Radix modulus $n$.  Division by $2^p$ is emulated with a right
+shift which makes the algorithm fairly inexpensive to use.  
+
+EXAM,bn_mp_reduce_2k.c
+
+The algorithm mp\_count\_bits calculates the number of bits in an mp\_int which is used to find the initial value of $p$.  The call to mp\_div\_2d
+on line @31,mp_div_2d@ calculates both the quotient $q$ and the remainder $a$ required.  By doing both in a single function call the code size
+is kept fairly small.  The multiplication by $k$ is only performed if $k > 1$. This allows reductions modulo $2^p - 1$ to be performed without
+any multiplications.  
+
+The unsigned s\_mp\_add, mp\_cmp\_mag and s\_mp\_sub are used in place of their full sign counterparts since the inputs are only valid if they are 
+positive.  By using the unsigned versions the overhead is kept to a minimum.  
+
+\subsubsection{Unrestricted Setup}
+To setup this reduction algorithm the value of $k = 2^p - n$ is required.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce\_2k\_setup}. \\
+\textbf{Input}.   mp\_int $n$   \\
+\textbf{Output}.  $k = 2^p - n$ \\
+\hline
+1.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
+2.  $x \leftarrow 2^p$ (\textit{mp\_2expt}) \\
+3.  $x \leftarrow x - n$ (\textit{mp\_sub}) \\
+4.  $k \leftarrow x_0$ \\
+5.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce\_2k\_setup}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce\_2k\_setup.}
+This algorithm computes the value of $k$ required for the algorithm mp\_reduce\_2k.  By making a temporary variable $x$ equal to $2^p$ a subtraction
+is sufficient to solve for $k$.  Alternatively if $n$ has more than one digit the value of $k$ is simply $\beta - n_0$.  
+
+EXAM,bn_mp_reduce_2k_setup.c
+
+\subsubsection{Unrestricted Detection}
+An integer $n$ is a valid unrestricted Diminished Radix modulus if either of the following are true.
+
+\begin{enumerate}
+\item  The number has only one digit.
+\item  The number has more than one digit and every bit from the $\beta$'th to the most significant is one.
+\end{enumerate}
+
+If either condition is true than there is a power of two $2^p$ such that $0 < 2^p - n < \beta$.   If the input is only
+one digit than it will always be of the correct form.  Otherwise all of the bits above the first digit must be one.  This arises from the fact
+that there will be value of $k$ that when added to the modulus causes a carry in the first digit which propagates all the way to the most
+significant bit.  The resulting sum will be a power of two.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce\_is\_2k}. \\
+\textbf{Input}.   mp\_int $n$   \\
+\textbf{Output}.  $1$ if of proper form, $0$ otherwise \\
+\hline
+1.  If $n.used = 0$ then return($0$). \\
+2.  If $n.used = 1$ then return($1$). \\
+3.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
+4.  for $x$ from $lg(\beta)$ to $p$ do \\
+\hspace{3mm}4.1  If the ($x \mbox{ mod }lg(\beta)$)'th bit of the $\lfloor x / lg(\beta) \rfloor$ of $n$ is zero then return($0$). \\
+5.  Return($1$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce\_is\_2k}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce\_is\_2k.}
+This algorithm quickly determines if a modulus is of the form required for algorithm mp\_reduce\_2k to function properly.  
+
+EXAM,bn_mp_reduce_is_2k.c
+
+
+
+\section{Algorithm Comparison}
+So far three very different algorithms for modular reduction have been discussed.  Each of the algorithms have their own strengths and weaknesses
+that makes having such a selection very useful.  The following table sumarizes the three algorithms along with comparisons of work factors.  Since
+all three algorithms have the restriction that $0 \le x < n^2$ and $n > 1$ those limitations are not included in the table.  
+
+\begin{center}
+\begin{small}
+\begin{tabular}{|c|c|c|c|c|c|}
+\hline \textbf{Method} & \textbf{Work Required} & \textbf{Limitations} & \textbf{$m = 8$} & \textbf{$m = 32$} & \textbf{$m = 64$} \\
+\hline Barrett    & $m^2 + 2m - 1$ & None              & $79$ & $1087$ & $4223$ \\
+\hline Montgomery & $m^2 + m$      & $n$ must be odd   & $72$ & $1056$ & $4160$ \\
+\hline D.R.       & $2m$           & $n = \beta^m - k$ & $16$ & $64$   & $128$  \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+
+In theory Montgomery and Barrett reductions would require roughly the same amount of time to complete.  However, in practice since Montgomery
+reduction can be written as a single function with the Comba technique it is much faster.  Barrett reduction suffers from the overhead of
+calling the half precision multipliers, addition and division by $\beta$ algorithms.
+
+For almost every cryptographic algorithm Montgomery reduction is the algorithm of choice.  The one set of algorithms where Diminished Radix reduction truly
+shines are based on the discrete logarithm problem such as Diffie-Hellman \cite{DH} and ElGamal \cite{ELGAMAL}.  In these algorithms
+primes of the form $\beta^m - k$ can be found and shared amongst users.  These primes will allow the Diminished Radix algorithm to be used in
+modular exponentiation to greatly speed up the operation.
+
+
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 3 \right ]$ & Prove that the ``trick'' in algorithm mp\_montgomery\_setup actually \\
+                     & calculates the correct value of $\rho$. \\
+                     & \\
+$\left [ 2 \right ]$ & Devise an algorithm to reduce modulo $n + k$ for small $k$ quickly.  \\
+                     & \\
+$\left [ 4 \right ]$ & Prove that the pseudo-code algorithm ``Diminished Radix Reduction'' \\
+                     & (\textit{figure~\ref{fig:DR}}) terminates.  Also prove the probability that it will \\
+                     & terminate within $1 \le k \le 10$ iterations. \\
+                     & \\
+\end{tabular}                     
+
+
+\chapter{Exponentiation}
+Exponentiation is the operation of raising one variable to the power of another, for example, $a^b$.  A variant of exponentiation, computed
+in a finite field or ring, is called modular exponentiation.  This latter style of operation is typically used in public key 
+cryptosystems such as RSA and Diffie-Hellman.  The ability to quickly compute modular exponentiations is of great benefit to any
+such cryptosystem and many methods have been sought to speed it up.
+
+\section{Exponentiation Basics}
+A trivial algorithm would simply multiply $a$ against itself $b - 1$ times to compute the exponentiation desired.  However, as $b$ grows in size
+the number of multiplications becomes prohibitive.  Imagine what would happen if $b$ $\approx$ $2^{1024}$ as is the case when computing an RSA signature
+with a $1024$-bit key.  Such a calculation could never be completed as it would take simply far too long.
+
+Fortunately there is a very simple algorithm based on the laws of exponents.  Recall that $lg_a(a^b) = b$ and that $lg_a(a^ba^c) = b + c$ which
+are two trivial relationships between the base and the exponent.  Let $b_i$ represent the $i$'th bit of $b$ starting from the least 
+significant bit.  If $b$ is a $k$-bit integer than the following equation is true.
+
+\begin{equation}
+a^b = \prod_{i=0}^{k-1} a^{2^i \cdot b_i}
+\end{equation}
+
+By taking the base $a$ logarithm of both sides of the equation the following equation is the result.
+
+\begin{equation}
+b = \sum_{i=0}^{k-1}2^i \cdot b_i
+\end{equation}
+
+The term $a^{2^i}$ can be found from the $i - 1$'th term by squaring the term since $\left ( a^{2^i} \right )^2$ is equal to
+$a^{2^{i+1}}$.  This observation forms the basis of essentially all fast exponentiation algorithms.  It requires $k$ squarings and on average
+$k \over 2$ multiplications to compute the result.  This is indeed quite an improvement over simply multiplying by $a$ a total of $b-1$ times.
+
+While this current method is a considerable speed up there are further improvements to be made.  For example, the $a^{2^i}$ term does not need to 
+be computed in an auxilary variable.  Consider the following equivalent algorithm.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Left to Right Exponentiation}. \\
+\textbf{Input}.   Integer $a$, $b$ and $k$ \\
+\textbf{Output}.  $c = a^b$ \\
+\hline \\
+1.  $c \leftarrow 1$ \\
+2.  for $i$ from $k - 1$ to $0$ do \\
+\hspace{3mm}2.1  $c \leftarrow c^2$ \\
+\hspace{3mm}2.2  $c \leftarrow c \cdot a^{b_i}$ \\
+3.  Return $c$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Left to Right Exponentiation}
+\label{fig:LTOR}
+\end{figure}
+
+This algorithm starts from the most significant bit and works towards the least significant bit.  When the $i$'th bit of $b$ is set $a$ is
+multiplied against the current product.  In each iteration the product is squared which doubles the exponent of the individual terms of the
+product.  
+
+For example, let $b = 101100_2 \equiv 44_{10}$.  The following chart demonstrates the actions of the algorithm.
+
+\newpage\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|}
+\hline \textbf{Value of $i$} & \textbf{Value of $c$} \\
+\hline - & $1$ \\
+\hline $5$ & $a$ \\
+\hline $4$ & $a^2$ \\
+\hline $3$ & $a^4 \cdot a$ \\
+\hline $2$ & $a^8 \cdot a^2 \cdot a$ \\
+\hline $1$ & $a^{16} \cdot a^4 \cdot a^2$ \\
+\hline $0$ & $a^{32} \cdot a^8 \cdot a^4$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Example of Left to Right Exponentiation}
+\end{figure}
+
+When the product $a^{32} \cdot a^8 \cdot a^4$ is simplified it is equal $a^{44}$ which is the desired exponentiation.  This particular algorithm is 
+called ``Left to Right'' because it reads the exponent in that order.  All of the exponentiation algorithms that will be presented are of this nature.  
+
+\subsection{Single Digit Exponentiation}
+The first algorithm in the series of exponentiation algorithms will be an unbounded algorithm where the exponent is a single digit.  It is intended 
+to be used when a small power of an input is required (\textit{e.g. $a^5$}).  It is faster than simply multiplying $b - 1$ times for all values of 
+$b$ that are greater than three.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_expt\_d}. \\
+\textbf{Input}.   mp\_int $a$ and mp\_digit $b$ \\
+\textbf{Output}.  $c = a^b$ \\
+\hline \\
+1.  $g \leftarrow a$ (\textit{mp\_init\_copy}) \\
+2.  $c \leftarrow 1$ (\textit{mp\_set}) \\
+3.  for $x$ from 1 to $lg(\beta)$ do \\
+\hspace{3mm}3.1  $c \leftarrow c^2$ (\textit{mp\_sqr}) \\
+\hspace{3mm}3.2  If $b$ AND $2^{lg(\beta) - 1} \ne 0$ then \\
+\hspace{6mm}3.2.1  $c \leftarrow c \cdot g$ (\textit{mp\_mul}) \\
+\hspace{3mm}3.3  $b \leftarrow b << 1$ \\
+4.  Clear $g$. \\
+5.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_expt\_d}
+\end{figure}
+
+\textbf{Algorithm mp\_expt\_d.}
+This algorithm computes the value of $a$ raised to the power of a single digit $b$.  It uses the left to right exponentiation algorithm to
+quickly compute the exponentiation.  It is loosely based on algorithm 14.79 of HAC \cite[pp. 615]{HAC} with the difference that the 
+exponent is a fixed width.  
+
+A copy of $a$ is made first to allow destination variable $c$ be the same as the source variable $a$.  The result is set to the initial value of 
+$1$ in the subsequent step.
+
+Inside the loop the exponent is read from the most significant bit first down to the least significant bit.  First $c$ is invariably squared
+on step 3.1.  In the following step if the most significant bit of $b$ is one the copy of $a$ is multiplied against $c$.  The value
+of $b$ is shifted left one bit to make the next bit down from the most signficant bit the new most significant bit.  In effect each
+iteration of the loop moves the bits of the exponent $b$ upwards to the most significant location.
+
+EXAM,bn_mp_expt_d.c
+
+Line @29,mp_set@ sets the initial value of the result to $1$.  Next the loop on line @31,for@ steps through each bit of the exponent starting from
+the most significant down towards the least significant. The invariant squaring operation placed on line @333,mp_sqr@ is performed first.  After 
+the squaring the result $c$ is multiplied by the base $g$ if and only if the most significant bit of the exponent is set.  The shift on line
+@47,<<@ moves all of the bits of the exponent upwards towards the most significant location.  
+
+\section{$k$-ary Exponentiation}
+When calculating an exponentiation the most time consuming bottleneck is the multiplications which are in general a small factor
+slower than squaring.  Recall from the previous algorithm that $b_{i}$ refers to the $i$'th bit of the exponent $b$.  Suppose instead it referred to
+the $i$'th $k$-bit digit of the exponent of $b$.  For $k = 1$ the definitions are synonymous and for $k > 1$ algorithm~\ref{fig:KARY}
+computes the same exponentiation.  A group of $k$ bits from the exponent is called a \textit{window}.  That is it is a small window on only a
+portion of the entire exponent.  Consider the following modification to the basic left to right exponentiation algorithm.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{$k$-ary Exponentiation}. \\
+\textbf{Input}.   Integer $a$, $b$, $k$ and $t$ \\
+\textbf{Output}.  $c = a^b$ \\
+\hline \\
+1.  $c \leftarrow 1$ \\
+2.  for $i$ from $t - 1$ to $0$ do \\
+\hspace{3mm}2.1  $c \leftarrow c^{2^k} $ \\
+\hspace{3mm}2.2  Extract the $i$'th $k$-bit word from $b$ and store it in $g$. \\
+\hspace{3mm}2.3  $c \leftarrow c \cdot a^g$ \\
+3.  Return $c$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{$k$-ary Exponentiation}
+\label{fig:KARY}
+\end{figure}
+
+The squaring on step 2.1 can be calculated by squaring the value $c$ successively $k$ times.  If the values of $a^g$ for $0 < g < 2^k$ have been
+precomputed this algorithm requires only $t$ multiplications and $tk$ squarings.  The table can be generated with $2^{k - 1} - 1$ squarings and
+$2^{k - 1} + 1$ multiplications.  This algorithm assumes that the number of bits in the exponent is evenly divisible by $k$.  
+However, when it is not the remaining $0 < x \le k - 1$ bits can be handled with algorithm~\ref{fig:LTOR}.
+
+Suppose $k = 4$ and $t = 100$.  This modified algorithm will require $109$ multiplications and $408$ squarings to compute the exponentiation.  The
+original algorithm would on average have required $200$ multiplications and $400$ squrings to compute the same value.  The total number of squarings
+has increased slightly but the number of multiplications has nearly halved.
+
+\subsection{Optimal Values of $k$}
+An optimal value of $k$ will minimize $2^{k} + \lceil n / k \rceil + n - 1$ for a fixed number of bits in the exponent $n$.  The simplest
+approach is to brute force search amongst the values $k = 2, 3, \ldots, 8$ for the lowest result.  Table~\ref{fig:OPTK} lists optimal values of $k$
+for various exponent sizes and compares the number of multiplication and squarings required against algorithm~\ref{fig:LTOR}.  
+
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+\begin{tabular}{|c|c|c|c|c|c|}
+\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with ~\ref{fig:LTOR}} \\
+\hline $16$ & $2$ & $27$ & $24$ \\
+\hline $32$ & $3$ & $49$ & $48$ \\
+\hline $64$ & $3$ & $92$ & $96$ \\
+\hline $128$ & $4$ & $175$ & $192$ \\
+\hline $256$ & $4$ & $335$ & $384$ \\
+\hline $512$ & $5$ & $645$ & $768$ \\
+\hline $1024$ & $6$ & $1257$ & $1536$ \\
+\hline $2048$ & $6$ & $2452$ & $3072$ \\
+\hline $4096$ & $7$ & $4808$ & $6144$ \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Optimal Values of $k$ for $k$-ary Exponentiation}
+\label{fig:OPTK}
+\end{figure}
+
+\subsection{Sliding-Window Exponentiation}
+A simple modification to the previous algorithm is only generate the upper half of the table in the range $2^{k-1} \le g < 2^k$.  Essentially
+this is a table for all values of $g$ where the most significant bit of $g$ is a one.  However, in order for this to be allowed in the 
+algorithm values of $g$ in the range $0 \le g < 2^{k-1}$ must be avoided.  
+
+Table~\ref{fig:OPTK2} lists optimal values of $k$ for various exponent sizes and compares the work required against algorithm~\ref{fig:KARY}.  
+
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+\begin{tabular}{|c|c|c|c|c|c|}
+\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with ~\ref{fig:KARY}} \\
+\hline $16$ & $3$ & $24$ & $27$ \\
+\hline $32$ & $3$ & $45$ & $49$ \\
+\hline $64$ & $4$ & $87$ & $92$ \\
+\hline $128$ & $4$ & $167$ & $175$ \\
+\hline $256$ & $5$ & $322$ & $335$ \\
+\hline $512$ & $6$ & $628$ & $645$ \\
+\hline $1024$ & $6$ & $1225$ & $1257$ \\
+\hline $2048$ & $7$ & $2403$ & $2452$ \\
+\hline $4096$ & $8$ & $4735$ & $4808$ \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Optimal Values of $k$ for Sliding Window Exponentiation}
+\label{fig:OPTK2}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Sliding Window $k$-ary Exponentiation}. \\
+\textbf{Input}.   Integer $a$, $b$, $k$ and $t$ \\
+\textbf{Output}.  $c = a^b$ \\
+\hline \\
+1.  $c \leftarrow 1$ \\
+2.  for $i$ from $t - 1$ to $0$ do \\
+\hspace{3mm}2.1  If the $i$'th bit of $b$ is a zero then \\
+\hspace{6mm}2.1.1   $c \leftarrow c^2$ \\
+\hspace{3mm}2.2  else do \\
+\hspace{6mm}2.2.1  $c \leftarrow c^{2^k}$ \\
+\hspace{6mm}2.2.2  Extract the $k$ bits from $(b_{i}b_{i-1}\ldots b_{i-(k-1)})$ and store it in $g$. \\
+\hspace{6mm}2.2.3  $c \leftarrow c \cdot a^g$ \\
+\hspace{6mm}2.2.4  $i \leftarrow i - k$ \\
+3.  Return $c$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Sliding Window $k$-ary Exponentiation}
+\end{figure}
+
+Similar to the previous algorithm this algorithm must have a special handler when fewer than $k$ bits are left in the exponent.  While this
+algorithm requires the same number of squarings it can potentially have fewer multiplications.  The pre-computed table $a^g$ is also half
+the size as the previous table.  
+
+Consider the exponent $b = 111101011001000_2 \equiv 31432_{10}$ with $k = 3$ using both algorithms.  The first algorithm will divide the exponent up as 
+the following five $3$-bit words $b \equiv \left ( 111, 101, 011, 001, 000 \right )_{2}$.  The second algorithm will break the 
+exponent as $b \equiv \left ( 111, 101, 0, 110, 0, 100, 0 \right )_{2}$.  The single digit $0$ in the second representation are where
+a single squaring took place instead of a squaring and multiplication.  In total the first method requires $10$ multiplications and $18$ 
+squarings.  The second method requires $8$ multiplications and $18$ squarings.  
+
+In general the sliding window method is never slower than the generic $k$-ary method and often it is slightly faster.  
+
+\section{Modular Exponentiation}
+
+Modular exponentiation is essentially computing the power of a base within a finite field or ring.  For example, computing 
+$d \equiv a^b \mbox{ (mod }c\mbox{)}$ is a modular exponentiation.  Instead of first computing $a^b$ and then reducing it 
+modulo $c$ the intermediate result is reduced modulo $c$ after every squaring or multiplication operation.  
+
+This guarantees that any intermediate result is bounded by $0 \le d \le c^2 - 2c + 1$ and can be reduced modulo $c$ quickly using
+one of the algorithms presented in ~REDUCTION~.  
+
+Before the actual modular exponentiation algorithm can be written a wrapper algorithm must be written first.  This algorithm
+will allow the exponent $b$ to be negative which is computed as $c \equiv \left (1 / a \right )^{\vert b \vert} \mbox{(mod }d\mbox{)}$. The
+value of $(1/a) \mbox{ mod }c$ is computed using the modular inverse (\textit{see \ref{sec;modinv}}).  If no inverse exists the algorithm
+terminates with an error.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_exptmod}. \\
+\textbf{Input}.   mp\_int $a$, $b$ and $c$ \\
+\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
+\hline \\
+1.  If $c.sign = MP\_NEG$ return(\textit{MP\_VAL}). \\
+2.  If $b.sign = MP\_NEG$ then \\
+\hspace{3mm}2.1  $g' \leftarrow g^{-1} \mbox{ (mod }c\mbox{)}$ \\
+\hspace{3mm}2.2  $x' \leftarrow \vert x \vert$ \\
+\hspace{3mm}2.3  Compute $d \equiv g'^{x'} \mbox{ (mod }c\mbox{)}$ via recursion. \\
+3.  if $p$ is odd \textbf{OR} $p$ is a D.R. modulus then \\
+\hspace{3mm}3.1  Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm mp\_exptmod\_fast. \\
+4.  else \\
+\hspace{3mm}4.1  Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm s\_mp\_exptmod. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_exptmod}
+\end{figure}
+
+\textbf{Algorithm mp\_exptmod.}
+The first algorithm which actually performs modular exponentiation is algorithm s\_mp\_exptmod.  It is a sliding window $k$-ary algorithm 
+which uses Barrett reduction to reduce the product modulo $p$.  The second algorithm mp\_exptmod\_fast performs the same operation 
+except it uses either Montgomery or Diminished Radix reduction.  The two latter reduction algorithms are clumped in the same exponentiation
+algorithm since their arguments are essentially the same (\textit{two mp\_ints and one mp\_digit}).  
+
+EXAM,bn_mp_exptmod.c
+
+In order to keep the algorithms in a known state the first step on line @29,if@ is to reject any negative modulus as input.  If the exponent is
+negative the algorithm tries to perform a modular exponentiation with the modular inverse of the base $G$.  The temporary variable $tmpG$ is assigned
+the modular inverse of $G$ and $tmpX$ is assigned the absolute value of $X$.  The algorithm will recuse with these new values with a positive
+exponent.
+
+If the exponent is positive the algorithm resumes the exponentiation.  Line @63,dr_@ determines if the modulus is of the restricted Diminished Radix 
+form.  If it is not line @65,reduce@ attempts to determine if it is of a unrestricted Diminished Radix form.  The integer $dr$ will take on one
+of three values.
+
+\begin{enumerate}
+\item $dr = 0$ means that the modulus is not of either restricted or unrestricted Diminished Radix form.
+\item $dr = 1$ means that the modulus is of restricted Diminished Radix form.
+\item $dr = 2$ means that the modulus is of unrestricted Diminished Radix form.
+\end{enumerate}
+
+Line @69,if@ determines if the fast modular exponentiation algorithm can be used.  It is allowed if $dr \ne 0$ or if the modulus is odd.  Otherwise,
+the slower s\_mp\_exptmod algorithm is used which uses Barrett reduction.  
+
+\subsection{Barrett Modular Exponentiation}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_exptmod}. \\
+\textbf{Input}.   mp\_int $a$, $b$ and $c$ \\
+\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
+\hline \\
+1.  $k \leftarrow lg(x)$ \\
+2.  $winsize \leftarrow  \left \lbrace \begin{array}{ll}
+                              2 &  \mbox{if }k \le 7 \\
+                              3 &  \mbox{if }7 < k \le 36 \\
+                              4 &  \mbox{if }36 < k \le 140 \\
+                              5 &  \mbox{if }140 < k \le 450 \\
+                              6 &  \mbox{if }450 < k \le 1303 \\
+                              7 &  \mbox{if }1303 < k \le 3529 \\
+                              8 &  \mbox{if }3529 < k \\
+                              \end{array} \right .$ \\
+3.  Initialize $2^{winsize}$ mp\_ints in an array named $M$ and one mp\_int named $\mu$ \\
+4.  Calculate the $\mu$ required for Barrett Reduction (\textit{mp\_reduce\_setup}). \\
+5.  $M_1 \leftarrow g \mbox{ (mod }p\mbox{)}$ \\
+\\
+Setup the table of small powers of $g$.  First find $g^{2^{winsize}}$ and then all multiples of it. \\
+6.  $k \leftarrow 2^{winsize - 1}$ \\
+7.  $M_{k} \leftarrow M_1$ \\
+8.  for $ix$ from 0 to $winsize - 2$ do \\
+\hspace{3mm}8.1  $M_k \leftarrow \left ( M_k \right )^2$ (\textit{mp\_sqr})  \\
+\hspace{3mm}8.2  $M_k \leftarrow M_k \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\
+9.  for $ix$ from $2^{winsize - 1} + 1$ to $2^{winsize} - 1$ do \\
+\hspace{3mm}9.1  $M_{ix} \leftarrow M_{ix - 1} \cdot M_{1}$ (\textit{mp\_mul}) \\
+\hspace{3mm}9.2  $M_{ix} \leftarrow M_{ix} \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\
+10.  $res \leftarrow 1$ \\
+\\
+Start Sliding Window. \\
+11.  $mode \leftarrow 0, bitcnt \leftarrow 1, buf \leftarrow 0, digidx \leftarrow x.used - 1, bitcpy \leftarrow 0, bitbuf \leftarrow 0$ \\
+12.  Loop \\
+\hspace{3mm}12.1  $bitcnt \leftarrow bitcnt - 1$ \\
+\hspace{3mm}12.2  If $bitcnt = 0$ then do \\
+\hspace{6mm}12.2.1  If $digidx = -1$ goto step 13. \\
+\hspace{6mm}12.2.2  $buf \leftarrow x_{digidx}$ \\
+\hspace{6mm}12.2.3  $digidx \leftarrow digidx - 1$ \\
+\hspace{6mm}12.2.4  $bitcnt \leftarrow lg(\beta)$ \\
+Continued on next page. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm s\_mp\_exptmod}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_exptmod} (\textit{continued}). \\
+\textbf{Input}.   mp\_int $a$, $b$ and $c$ \\
+\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
+\hline \\
+\hspace{3mm}12.3  $y \leftarrow (buf >> (lg(\beta) - 1))$ AND $1$ \\
+\hspace{3mm}12.4  $buf \leftarrow buf << 1$ \\
+\hspace{3mm}12.5  if $mode = 0$ and $y = 0$ then goto step 12. \\
+\hspace{3mm}12.6  if $mode = 1$ and $y = 0$ then do \\
+\hspace{6mm}12.6.1  $res \leftarrow res^2$ \\
+\hspace{6mm}12.6.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+\hspace{6mm}12.6.3  Goto step 12. \\
+\hspace{3mm}12.7  $bitcpy \leftarrow bitcpy + 1$ \\
+\hspace{3mm}12.8  $bitbuf \leftarrow bitbuf + (y << (winsize - bitcpy))$ \\
+\hspace{3mm}12.9  $mode \leftarrow 2$ \\
+\hspace{3mm}12.10  If $bitcpy = winsize$ then do \\
+\hspace{6mm}Window is full so perform the squarings and single multiplication. \\
+\hspace{6mm}12.10.1  for $ix$ from $0$ to $winsize -1$ do \\
+\hspace{9mm}12.10.1.1  $res \leftarrow res^2$ \\
+\hspace{9mm}12.10.1.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+\hspace{6mm}12.10.2  $res \leftarrow res \cdot M_{bitbuf}$ \\
+\hspace{6mm}12.10.3  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+\hspace{6mm}Reset the window. \\
+\hspace{6mm}12.10.4  $bitcpy \leftarrow 0, bitbuf \leftarrow 0, mode \leftarrow 1$ \\
+\\
+No more windows left.  Check for residual bits of exponent. \\
+13.  If $mode = 2$ and $bitcpy > 0$ then do \\
+\hspace{3mm}13.1  for $ix$ form $0$ to $bitcpy - 1$ do \\
+\hspace{6mm}13.1.1  $res \leftarrow res^2$ \\
+\hspace{6mm}13.1.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+\hspace{6mm}13.1.3  $bitbuf \leftarrow bitbuf << 1$ \\
+\hspace{6mm}13.1.4  If $bitbuf$ AND $2^{winsize} \ne 0$ then do \\
+\hspace{9mm}13.1.4.1  $res \leftarrow res \cdot M_{1}$ \\
+\hspace{9mm}13.1.4.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+14.  $y \leftarrow res$ \\
+15.  Clear $res$, $mu$ and the $M$ array. \\
+16.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm s\_mp\_exptmod (continued)}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_exptmod.}
+This algorithm computes the $x$'th power of $g$ modulo $p$ and stores the result in $y$.  It takes advantage of the Barrett reduction
+algorithm to keep the product small throughout the algorithm.
+
+The first two steps determine the optimal window size based on the number of bits in the exponent.  The larger the exponent the 
+larger the window size becomes.  After a window size $winsize$ has been chosen an array of $2^{winsize}$ mp\_int variables is allocated.  This
+table will hold the values of $g^x \mbox{ (mod }p\mbox{)}$ for $2^{winsize - 1} \le x < 2^{winsize}$.  
+
+After the table is allocated the first power of $g$ is found.  Since $g \ge p$ is allowed it must be first reduced modulo $p$ to make
+the rest of the algorithm more efficient.  The first element of the table at $2^{winsize - 1}$ is found by squaring $M_1$ successively $winsize - 2$
+times.  The rest of the table elements are found by multiplying the previous element by $M_1$ modulo $p$.
+
+Now that the table is available the sliding window may begin.  The following list describes the functions of all the variables in the window.
+\begin{enumerate}
+\item The variable $mode$ dictates how the bits of the exponent are interpreted.  
+\begin{enumerate}
+   \item When $mode = 0$ the bits are ignored since no non-zero bit of the exponent has been seen yet.  For example, if the exponent were simply 
+         $1$ then there would be $lg(\beta) - 1$ zero bits before the first non-zero bit.  In this case bits are ignored until a non-zero bit is found.  
+   \item When $mode = 1$ a non-zero bit has been seen before and a new $winsize$-bit window has not been formed yet.  In this mode leading $0$ bits 
+         are read and a single squaring is performed.  If a non-zero bit is read a new window is created.  
+   \item When $mode = 2$ the algorithm is in the middle of forming a window and new bits are appended to the window from the most significant bit
+         downwards.
+\end{enumerate}
+\item The variable $bitcnt$ indicates how many bits are left in the current digit of the exponent left to be read.  When it reaches zero a new digit
+      is fetched from the exponent.
+\item The variable $buf$ holds the currently read digit of the exponent. 
+\item The variable $digidx$ is an index into the exponents digits.  It starts at the leading digit $x.used - 1$ and moves towards the trailing digit.
+\item The variable $bitcpy$ indicates how many bits are in the currently formed window.  When it reaches $winsize$ the window is flushed and
+      the appropriate operations performed.
+\item The variable $bitbuf$ holds the current bits of the window being formed.  
+\end{enumerate}
+
+All of step 12 is the window processing loop.  It will iterate while there are digits available form the exponent to read.  The first step
+inside this loop is to extract a new digit if no more bits are available in the current digit.  If there are no bits left a new digit is
+read and if there are no digits left than the loop terminates.  
+
+After a digit is made available step 12.3 will extract the most significant bit of the current digit and move all other bits in the digit
+upwards.  In effect the digit is read from most significant bit to least significant bit and since the digits are read from leading to 
+trailing edges the entire exponent is read from most significant bit to least significant bit.
+
+At step 12.5 if the $mode$ and currently extracted bit $y$ are both zero the bit is ignored and the next bit is read.  This prevents the 
+algorithm from having to perform trivial squaring and reduction operations before the first non-zero bit is read.  Step 12.6 and 12.7-10 handle
+the two cases of $mode = 1$ and $mode = 2$ respectively.  
+
+FIGU,expt_state,Sliding Window State Diagram
+
+By step 13 there are no more digits left in the exponent.  However, there may be partial bits in the window left.  If $mode = 2$ then 
+a Left-to-Right algorithm is used to process the remaining few bits.  
+
+EXAM,bn_s_mp_exptmod.c
+
+Lines @26,if@ through @40,}@ determine the optimal window size based on the length of the exponent in bits.  The window divisions are sorted
+from smallest to greatest so that in each \textbf{if} statement only one condition must be tested.  For example, by the \textbf{if} statement 
+on line @32,if@ the value of $x$ is already known to be greater than $140$.  
+
+The conditional piece of code beginning on line @42,ifdef@ allows the window size to be restricted to five bits.  This logic is used to ensure
+the table of precomputed powers of $G$ remains relatively small.  
+
+The for loop on line @49,for@ initializes the $M$ array while lines @59,mp_init@ and @62,mp_reduce@ compute the value of $\mu$ required for
+Barrett reduction.  
+
+-- More later.
+
+\section{Quick Power of Two}
+Calculating $b = 2^a$ can be performed much quicker than with any of the previous algorithms.  Recall that a logical shift left $m << k$ is
+equivalent to $m \cdot 2^k$.  By this logic when $m = 1$ a quick power of two can be achieved.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_2expt}. \\
+\textbf{Input}.   integer $b$ \\
+\textbf{Output}.  $a \leftarrow 2^b$ \\
+\hline \\
+1.  $a \leftarrow 0$ \\
+2.  If $a.alloc < \lfloor b / lg(\beta) \rfloor + 1$ then grow $a$ appropriately. \\
+3.  $a.used \leftarrow \lfloor b / lg(\beta) \rfloor + 1$ \\
+4.  $a_{\lfloor b / lg(\beta) \rfloor} \leftarrow 1 << (b \mbox{ mod } lg(\beta))$ \\
+5.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_2expt}
+\end{figure}
+
+\textbf{Algorithm mp\_2expt.}
+
+EXAM,bn_mp_2expt.c
+
+\chapter{Higher Level Algorithms}
+
+This chapter discusses the various higher level algorithms that are required to complete a well rounded multiple precision integer package.  These
+routines are less performance oriented than the algorithms of chapters five, six and seven but are no less important.  
+
+The first section describes a method of integer division with remainder that is universally well known.  It provides the signed division logic
+for the package.  The subsequent section discusses a set of algorithms which allow a single digit to be the 2nd operand for a variety of operations.  
+These algorithms serve mostly to simplify other algorithms where small constants are required.  The last two sections discuss how to manipulate 
+various representations of integers.  For example, converting from an mp\_int to a string of character.
+
+\section{Integer Division with Remainder}
+\label{sec:division}
+
+Integer division aside from modular exponentiation is the most intensive algorithm to compute.  Like addition, subtraction and multiplication
+the basis of this algorithm is the long-hand division algorithm taught to school children.  Throughout this discussion several common variables
+will be used.  Let $x$ represent the divisor and $y$ represent the dividend.  Let $q$ represent the integer quotient $\lfloor y / x \rfloor$ and 
+let $r$ represent the remainder $r = y - x \lfloor y / x \rfloor$.  The following simple algorithm will be used to start the discussion.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Radix-$\beta$ Integer Division}. \\
+\textbf{Input}.   integer $x$ and $y$ \\
+\textbf{Output}.  $q = \lfloor y/x\rfloor, r = y - xq$ \\
+\hline \\
+1.  $q \leftarrow 0$ \\
+2.  $n \leftarrow \vert \vert y \vert \vert - \vert \vert x \vert \vert$ \\
+3.  for $t$ from $n$ down to $0$ do \\
+\hspace{3mm}3.1  Maximize $k$ such that $kx\beta^t$ is less than or equal to $y$ and $(k + 1)x\beta^t$ is greater. \\
+\hspace{3mm}3.2  $q \leftarrow q + k\beta^t$ \\
+\hspace{3mm}3.3  $y \leftarrow y - kx\beta^t$ \\
+4.  $r \leftarrow y$ \\
+5.  Return($q, r$) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Radix-$\beta$ Integer Division}
+\label{fig:raddiv}
+\end{figure}
+
+As children we are taught this very simple algorithm for the case of $\beta = 10$.  Almost instinctively several optimizations are taught for which
+their reason of existing are never explained.  For this example let $y = 5471$ represent the dividend and $x = 23$ represent the divisor.
+
+To find the first digit of the quotient the value of $k$ must be maximized such that $kx\beta^t$ is less than or equal to $y$ and 
+simultaneously $(k + 1)x\beta^t$ is greater than $y$.  Implicitly $k$ is the maximum value the $t$'th digit of the quotient may have.  The habitual method
+used to find the maximum is to ``eyeball'' the two numbers, typically only the leading digits and quickly estimate a quotient.  By only using leading
+digits a much simpler division may be used to form an educated guess at what the value must be.  In this case $k = \lfloor 54/23\rfloor = 2$ quickly 
+arises as a possible  solution.  Indeed $2x\beta^2 = 4600$ is less than $y = 5471$ and simultaneously $(k + 1)x\beta^2 = 6900$ is larger than $y$.  
+As a  result $k\beta^2$ is added to the quotient which now equals $q = 200$ and $4600$ is subtracted from $y$ to give a remainder of $y = 841$.
+
+Again this process is repeated to produce the quotient digit $k = 3$ which makes the quotient $q = 200 + 3\beta = 230$ and the remainder 
+$y = 841 - 3x\beta = 181$.  Finally the last iteration of the loop produces $k = 7$ which leads to the quotient $q = 230 + 7 = 237$ and the
+remainder $y = 181 - 7x = 20$.  The final quotient and remainder found are $q = 237$ and $r = y = 20$ which are indeed correct since 
+$237 \cdot 23 + 20 = 5471$ is true.  
+
+\subsection{Quotient Estimation}
+\label{sec:divest}
+As alluded to earlier the quotient digit $k$ can be estimated from only the leading digits of both the divisor and dividend.  When $p$ leading
+digits are used from both the divisor and dividend to form an estimation the accuracy of the estimation rises as $p$ grows.  Technically
+speaking the estimation is based on assuming the lower $\vert \vert y \vert \vert - p$ and $\vert \vert x \vert \vert - p$ lower digits of the
+dividend and divisor are zero.  
+
+The value of the estimation may off by a few values in either direction and in general is fairly correct.  A simplification \cite[pp. 271]{TAOCPV2}
+of the estimation technique is to use $t + 1$ digits of the dividend and $t$ digits of the divisor, in particularly when $t = 1$.  The estimate 
+using this technique is never too small.  For the following proof let $t = \vert \vert y \vert \vert - 1$ and $s = \vert \vert x \vert \vert - 1$ 
+represent the most significant digits of the dividend and divisor respectively.
+
+\textbf{Proof.}\textit{  The quotient $\hat k = \lfloor (y_t\beta + y_{t-1}) / x_s \rfloor$ is greater than or equal to 
+$k = \lfloor y / (x \cdot \beta^{\vert \vert y \vert \vert - \vert \vert x \vert \vert - 1}) \rfloor$. }
+The first obvious case is when $\hat k = \beta - 1$ in which case the proof is concluded since the real quotient cannot be larger.  For all other 
+cases $\hat k = \lfloor (y_t\beta + y_{t-1}) / x_s \rfloor$ and $\hat k x_s \ge y_t\beta + y_{t-1} - x_s + 1$.  The latter portion of the inequalility
+$-x_s + 1$ arises from the fact that a truncated integer division will give the same quotient for at most $x_s - 1$ values.  Next a series of 
+inequalities will prove the hypothesis.
+
+\begin{equation}
+y - \hat k x \le y - \hat k x_s\beta^s
+\end{equation}
+
+This is trivially true since $x \ge x_s\beta^s$.  Next we replace $\hat kx_s\beta^s$ by the previous inequality for $\hat kx_s$.  
+
+\begin{equation}
+y - \hat k x \le y_t\beta^t + \ldots + y_0 - (y_t\beta^t + y_{t-1}\beta^{t-1} - x_s\beta^t + \beta^s)
+\end{equation}
+
+By simplifying the previous inequality the following inequality is formed.
+
+\begin{equation}
+y - \hat k x \le y_{t-2}\beta^{t-2} + \ldots + y_0 + x_s\beta^s - \beta^s
+\end{equation}
+
+Subsequently,
+
+\begin{equation}
+y_{t-2}\beta^{t-2} + \ldots +  y_0  + x_s\beta^s - \beta^s < x_s\beta^s \le x
+\end{equation}
+
+Which proves that $y - \hat kx \le x$ and by consequence $\hat k \ge k$ which concludes the proof.  \textbf{QED}
+
+
+\subsection{Normalized Integers}
+For the purposes of division a normalized input is when the divisors leading digit $x_n$ is greater than or equal to $\beta / 2$.  By multiplying both
+$x$ and $y$ by $j = \lfloor (\beta / 2) / x_n \rfloor$ the quotient remains unchanged and the remainder is simply $j$ times the original
+remainder.  The purpose of normalization is to ensure the leading digit of the divisor is sufficiently large such that the estimated quotient will
+lie in the domain of a single digit.  Consider the maximum dividend $(\beta - 1) \cdot \beta + (\beta - 1)$ and the minimum divisor $\beta / 2$.  
+
+\begin{equation} 
+{{\beta^2 - 1} \over { \beta / 2}} \le 2\beta - {2 \over \beta} 
+\end{equation}
+
+At most the quotient approaches $2\beta$, however, in practice this will not occur since that would imply the previous quotient digit was too small.  
+
+\subsection{Radix-$\beta$ Division with Remainder}
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div}. \\
+\textbf{Input}.   mp\_int $a, b$ \\
+\textbf{Output}.  $c = \lfloor a/b \rfloor$, $d = a - bc$ \\
+\hline \\
+1.  If $b = 0$ return(\textit{MP\_VAL}). \\
+2.  If $\vert a \vert < \vert b \vert$ then do \\
+\hspace{3mm}2.1  $d \leftarrow a$ \\
+\hspace{3mm}2.2  $c \leftarrow 0$ \\
+\hspace{3mm}2.3  Return(\textit{MP\_OKAY}). \\
+\\
+Setup the quotient to receive the digits. \\
+3.  Grow $q$ to $a.used + 2$ digits. \\
+4.  $q \leftarrow 0$ \\
+5.  $x \leftarrow \vert a \vert , y \leftarrow \vert b \vert$ \\
+6.  $sign \leftarrow  \left \lbrace \begin{array}{ll}
+                              MP\_ZPOS &  \mbox{if }a.sign = b.sign \\
+                              MP\_NEG  &  \mbox{otherwise} \\
+                              \end{array} \right .$ \\
+\\
+Normalize the inputs such that the leading digit of $y$ is greater than or equal to $\beta / 2$. \\
+7.  $norm \leftarrow (lg(\beta) - 1) - (\lceil lg(y) \rceil \mbox{ (mod }lg(\beta)\mbox{)})$ \\
+8.  $x \leftarrow x \cdot 2^{norm}, y \leftarrow y \cdot 2^{norm}$ \\
+\\
+Find the leading digit of the quotient. \\
+9.  $n \leftarrow x.used - 1, t \leftarrow y.used - 1$ \\
+10.  $y \leftarrow y \cdot \beta^{n - t}$ \\
+11.  While ($x \ge y$) do \\
+\hspace{3mm}11.1  $q_{n - t} \leftarrow q_{n - t} + 1$ \\
+\hspace{3mm}11.2  $x \leftarrow x - y$ \\
+12.  $y \leftarrow \lfloor y / \beta^{n-t} \rfloor$ \\
+\\
+Continued on the next page. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div} (continued). \\
+\textbf{Input}.   mp\_int $a, b$ \\
+\textbf{Output}.  $c = \lfloor a/b \rfloor$, $d = a - bc$ \\
+\hline \\
+Now find the remainder fo the digits. \\
+13.  for $i$ from $n$ down to $(t + 1)$ do \\
+\hspace{3mm}13.1  If $i > x.used$ then jump to the next iteration of this loop. \\
+\hspace{3mm}13.2  If $x_{i} = y_{t}$ then \\
+\hspace{6mm}13.2.1  $q_{i - t - 1} \leftarrow \beta - 1$ \\
+\hspace{3mm}13.3  else \\
+\hspace{6mm}13.3.1  $\hat r \leftarrow x_{i} \cdot \beta + x_{i - 1}$ \\
+\hspace{6mm}13.3.2  $\hat r \leftarrow \lfloor \hat r / y_{t} \rfloor$ \\
+\hspace{6mm}13.3.3  $q_{i - t - 1} \leftarrow \hat r$ \\
+\hspace{3mm}13.4  $q_{i - t - 1} \leftarrow q_{i - t - 1} + 1$ \\
+\\
+Fixup quotient estimation. \\
+\hspace{3mm}13.5  Loop \\
+\hspace{6mm}13.5.1  $q_{i - t - 1} \leftarrow q_{i - t - 1} - 1$ \\
+\hspace{6mm}13.5.2  t$1 \leftarrow 0$ \\
+\hspace{6mm}13.5.3  t$1_0 \leftarrow y_{t - 1}, $ t$1_1 \leftarrow y_t,$ t$1.used \leftarrow 2$ \\
+\hspace{6mm}13.5.4  $t1 \leftarrow t1 \cdot q_{i - t - 1}$ \\
+\hspace{6mm}13.5.5  t$2_0 \leftarrow x_{i - 2}, $ t$2_1 \leftarrow x_{i - 1}, $ t$2_2 \leftarrow x_i, $ t$2.used \leftarrow 3$ \\
+\hspace{6mm}13.5.6  If $\vert t1 \vert > \vert t2 \vert$ then goto step 13.5. \\
+\hspace{3mm}13.6  t$1 \leftarrow y \cdot q_{i - t - 1}$ \\
+\hspace{3mm}13.7  t$1 \leftarrow $ t$1 \cdot \beta^{i - t - 1}$ \\
+\hspace{3mm}13.8  $x \leftarrow x - $ t$1$ \\
+\hspace{3mm}13.9  If $x.sign = MP\_NEG$ then \\
+\hspace{6mm}13.10  t$1 \leftarrow y$ \\
+\hspace{6mm}13.11  t$1 \leftarrow $ t$1 \cdot \beta^{i - t - 1}$ \\
+\hspace{6mm}13.12  $x \leftarrow x + $ t$1$ \\
+\hspace{6mm}13.13  $q_{i - t - 1} \leftarrow q_{i - t - 1} - 1$ \\
+\\
+Finalize the result. \\
+14.  Clamp excess digits of $q$ \\
+15.  $c \leftarrow q, c.sign \leftarrow sign$ \\
+16.  $x.sign \leftarrow a.sign$ \\
+17.  $d \leftarrow \lfloor x / 2^{norm} \rfloor$ \\
+18.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div (continued)}
+\end{figure}
+\textbf{Algorithm mp\_div.}
+This algorithm will calculate quotient and remainder from an integer division given a dividend and divisor.  The algorithm is a signed
+division and will produce a fully qualified quotient and remainder.
+
+First the divisor $b$ must be non-zero which is enforced in step one.  If the divisor is larger than the dividend than the quotient is implicitly 
+zero and the remainder is the dividend.  
+
+After the first two trivial cases of inputs are handled the variable $q$ is setup to receive the digits of the quotient.  Two unsigned copies of the
+divisor $y$ and dividend $x$ are made as well.  The core of the division algorithm is an unsigned division and will only work if the values are
+positive.  Now the two values $x$ and $y$ must be normalized such that the leading digit of $y$ is greater than or equal to $\beta / 2$.  
+This is performed by shifting both to the left by enough bits to get the desired normalization.  
+
+At this point the division algorithm can begin producing digits of the quotient.  Recall that maximum value of the estimation used is 
+$2\beta - {2 \over \beta}$ which means that a digit of the quotient must be first produced by another means.  In this case $y$ is shifted
+to the left (\textit{step ten}) so that it has the same number of digits as $x$.  The loop on step eleven will subtract multiples of the 
+shifted copy of $y$ until $x$ is smaller.  Since the leading digit of $y$ is greater than or equal to $\beta/2$ this loop will iterate at most two
+times to produce the desired leading digit of the quotient.  
+
+Now the remainder of the digits can be produced.  The equation $\hat q = \lfloor {{x_i \beta + x_{i-1}}\over y_t} \rfloor$ is used to fairly
+accurately approximate the true quotient digit.  The estimation can in theory produce an estimation as high as $2\beta - {2 \over \beta}$ but by
+induction the upper quotient digit is correct (\textit{as established on step eleven}) and the estimate must be less than $\beta$.  
+
+Recall from section~\ref{sec:divest} that the estimation is never too low but may be too high.  The next step of the estimation process is
+to refine the estimation.  The loop on step 13.5 uses $x_i\beta^2 + x_{i-1}\beta + x_{i-2}$ and $q_{i - t - 1}(y_t\beta + y_{t-1})$ as a higher
+order approximation to adjust the quotient digit.
+
+After both phases of estimation the quotient digit may still be off by a value of one\footnote{This is similar to the error introduced
+by optimizing Barrett reduction.}.  Steps 13.6 and 13.7 subtract the multiple of the divisor from the dividend (\textit{Similar to step 3.3 of
+algorithm~\ref{fig:raddiv}} and then subsequently add a multiple of the divisor if the quotient was too large.  
+
+Now that the quotient has been determine finializing the result is a matter of clamping the quotient, fixing the sizes and de-normalizing the 
+remainder.  An important aspect of this algorithm seemingly overlooked in other descriptions such as that of Algorithm 14.20 HAC \cite[pp. 598]{HAC}
+is that when the estimations are being made (\textit{inside the loop on step 13.5}) that the digits $y_{t-1}$, $x_{i-2}$ and $x_{i-1}$ may lie 
+outside their respective boundaries.  For example, if $t = 0$ or $i \le 1$ then the digits would be undefined.  In those cases the digits should
+respectively be replaced with a zero.  
+
+EXAM,bn_mp_div.c
+
+The implementation of this algorithm differs slightly from the pseudo code presented previously.  In this algorithm either of the quotient $c$ or
+remainder $d$ may be passed as a \textbf{NULL} pointer which indicates their value is not desired.  For example, the C code to call the division
+algorithm with only the quotient is 
+
+\begin{verbatim}
+mp_div(&a, &b, &c, NULL);  /* c = [a/b] */
+\end{verbatim}
+
+Lines @37,if@ and @42,if@ handle the two trivial cases of inputs which are division by zero and dividend smaller than the divisor 
+respectively.  After the two trivial cases all of the temporary variables are initialized.  Line @76,neg@ determines the sign of 
+the quotient and line @77,sign@ ensures that both $x$ and $y$ are positive.  
+
+The number of bits in the leading digit is calculated on line @80,norm@.  Implictly an mp\_int with $r$ digits will require $lg(\beta)(r-1) + k$ bits
+of precision which when reduced modulo $lg(\beta)$ produces the value of $k$.  In this case $k$ is the number of bits in the leading digit which is
+exactly what is required.  For the algorithm to operate $k$ must equal $lg(\beta) - 1$ and when it does not the inputs must be normalized by shifting
+them to the left by $lg(\beta) - 1 - k$ bits.
+
+Throughout the variables $n$ and $t$ will represent the highest digit of $x$ and $y$ respectively.  These are first used to produce the 
+leading digit of the quotient.  The loop beginning on line @113,for@ will produce the remainder of the quotient digits.
+
+The conditional ``continue'' on line @114,if@ is used to prevent the algorithm from reading past the leading edge of $x$ which can occur when the
+algorithm eliminates multiple non-zero digits in a single iteration.  This ensures that $x_i$ is always non-zero since by definition the digits
+above the $i$'th position $x$ must be zero in order for the quotient to be precise\footnote{Precise as far as integer division is concerned.}.  
+
+Lines @142,t1@, @143,t1@ and @150,t2@ through @152,t2@ manually construct the high accuracy estimations by setting the digits of the two mp\_int 
+variables directly.  
+
+\section{Single Digit Helpers}
+
+This section briefly describes a series of single digit helper algorithms which come in handy when working with small constants.  All of 
+the helper functions assume the single digit input is positive and will treat them as such.
+
+\subsection{Single Digit Addition and Subtraction}
+
+Both addition and subtraction are performed by ``cheating'' and using mp\_set followed by the higher level addition or subtraction 
+algorithms.   As a result these algorithms are subtantially simpler with a slight cost in performance.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_add\_d}. \\
+\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
+\textbf{Output}.  $c = a + b$ \\
+\hline \\
+1.  $t \leftarrow b$ (\textit{mp\_set}) \\
+2.  $c \leftarrow a + t$ \\
+3.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_add\_d}
+\end{figure}
+
+\textbf{Algorithm mp\_add\_d.}
+This algorithm initiates a temporary mp\_int with the value of the single digit and uses algorithm mp\_add to add the two values together.
+
+EXAM,bn_mp_add_d.c
+
+Clever use of the letter 't'.
+
+\subsubsection{Subtraction}
+The single digit subtraction algorithm mp\_sub\_d is essentially the same except it uses mp\_sub to subtract the digit from the mp\_int.
+
+\subsection{Single Digit Multiplication}
+Single digit multiplication arises enough in division and radix conversion that it ought to be implement as a special case of the baseline
+multiplication algorithm.  Essentially this algorithm is a modified version of algorithm s\_mp\_mul\_digs where one of the multiplicands
+only has one digit.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul\_d}. \\
+\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
+\textbf{Output}.  $c = ab$ \\
+\hline \\
+1.  $pa \leftarrow a.used$ \\
+2.  Grow $c$ to at least $pa + 1$ digits. \\
+3.  $oldused \leftarrow c.used$ \\
+4.  $c.used \leftarrow pa + 1$ \\
+5.  $c.sign \leftarrow a.sign$ \\
+6.  $\mu \leftarrow 0$ \\
+7.  for $ix$ from $0$ to $pa - 1$ do \\
+\hspace{3mm}7.1  $\hat r \leftarrow \mu + a_{ix}b$ \\
+\hspace{3mm}7.2  $c_{ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}7.3  $\mu \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+8.  $c_{pa} \leftarrow \mu$ \\
+9.  for $ix$ from $pa + 1$ to $oldused$ do \\
+\hspace{3mm}9.1  $c_{ix} \leftarrow 0$ \\
+10.  Clamp excess digits of $c$. \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul\_d}
+\end{figure}
+\textbf{Algorithm mp\_mul\_d.}
+This algorithm quickly multiplies an mp\_int by a small single digit value.  It is specially tailored to the job and has a minimal of overhead.  
+Unlike the full multiplication algorithms this algorithm does not require any significnat temporary storage or memory allocations.  
+
+EXAM,bn_mp_mul_d.c
+
+In this implementation the destination $c$ may point to the same mp\_int as the source $a$ since the result is written after the digit is 
+read from the source.  This function uses pointer aliases $tmpa$ and $tmpc$ for the digits of $a$ and $c$ respectively.  
+
+\subsection{Single Digit Division}
+Like the single digit multiplication algorithm, single digit division is also a fairly common algorithm used in radix conversion.  Since the
+divisor is only a single digit a specialized variant of the division algorithm can be used to compute the quotient.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div\_d}. \\
+\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
+\textbf{Output}.  $c = \lfloor a / b \rfloor, d = a - cb$ \\
+\hline \\
+1.  If $b = 0$ then return(\textit{MP\_VAL}).\\
+2.  If $b = 3$ then use algorithm mp\_div\_3 instead. \\
+3.  Init $q$ to $a.used$ digits.  \\
+4.  $q.used \leftarrow a.used$ \\
+5.  $q.sign \leftarrow a.sign$ \\
+6.  $\hat w \leftarrow 0$ \\
+7.  for $ix$ from $a.used - 1$ down to $0$ do \\
+\hspace{3mm}7.1  $\hat w \leftarrow \hat w \beta + a_{ix}$ \\
+\hspace{3mm}7.2  If $\hat w \ge b$ then \\
+\hspace{6mm}7.2.1  $t \leftarrow \lfloor \hat w / b \rfloor$ \\
+\hspace{6mm}7.2.2  $\hat w \leftarrow \hat w \mbox{ (mod }b\mbox{)}$ \\
+\hspace{3mm}7.3  else\\
+\hspace{6mm}7.3.1  $t \leftarrow 0$ \\
+\hspace{3mm}7.4  $q_{ix} \leftarrow t$ \\
+8.  $d \leftarrow \hat w$ \\
+9.  Clamp excess digits of $q$. \\
+10.  $c \leftarrow q$ \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div\_d}
+\end{figure}
+\textbf{Algorithm mp\_div\_d.}
+This algorithm divides the mp\_int $a$ by the single mp\_digit $b$ using an optimized approach.  Essentially in every iteration of the
+algorithm another digit of the dividend is reduced and another digit of quotient produced.  Provided $b < \beta$ the value of $\hat w$
+after step 7.1 will be limited such that $0 \le \lfloor \hat w / b \rfloor < \beta$.  
+
+If the divisor $b$ is equal to three a variant of this algorithm is used which is called mp\_div\_3.  It replaces the division by three with
+a multiplication by $\lfloor \beta / 3 \rfloor$ and the appropriate shift and residual fixup.  In essence it is much like the Barrett reduction
+from chapter seven.  
+
+EXAM,bn_mp_div_d.c
+
+Like the implementation of algorithm mp\_div this algorithm allows either of the quotient or remainder to be passed as a \textbf{NULL} pointer to
+indicate the respective value is not required.  This allows a trivial single digit modular reduction algorithm, mp\_mod\_d to be created.
+
+The division and remainder on lines @44,/@ and @45,%@ can be replaced often by a single division on most processors.  For example, the 32-bit x86 based 
+processors can divide a 64-bit quantity by a 32-bit quantity and produce the quotient and remainder simultaneously.  Unfortunately the GCC 
+compiler does not recognize that optimization and will actually produce two function calls to find the quotient and remainder respectively.  
+
+\subsection{Single Digit Root Extraction}
+
+Finding the $n$'th root of an integer is fairly easy as far as numerical analysis is concerned.  Algorithms such as the Newton-Raphson approximation 
+(\ref{eqn:newton}) series will converge very quickly to a root for any continuous function $f(x)$.  
+
+\begin{equation}
+x_{i+1} = x_i - {f(x_i) \over f'(x_i)}
+\label{eqn:newton}
+\end{equation}
+
+In this case the $n$'th root is desired and $f(x) = x^n - a$ where $a$ is the integer of which the root is desired.  The derivative of $f(x)$ is 
+simply $f'(x) = nx^{n - 1}$.  Of particular importance is that this algorithm will be used over the integers not over the a more continuous domain
+such as the real numbers.  As a result the root found can be above the true root by few and must be manually adjusted.  Ideally at the end of the 
+algorithm the $n$'th root $b$ of an integer $a$ is desired such that $b^n \le a$.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_n\_root}. \\
+\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
+\textbf{Output}.  $c^b \le a$ \\
+\hline \\
+1.  If $b$ is even and $a.sign = MP\_NEG$ return(\textit{MP\_VAL}). \\
+2.  $sign \leftarrow a.sign$ \\
+3.  $a.sign \leftarrow MP\_ZPOS$ \\
+4.  t$2 \leftarrow 2$ \\
+5.  Loop \\
+\hspace{3mm}5.1  t$1 \leftarrow $ t$2$ \\
+\hspace{3mm}5.2  t$3 \leftarrow $ t$1^{b - 1}$ \\
+\hspace{3mm}5.3  t$2 \leftarrow $ t$3 $ $\cdot$ t$1$ \\
+\hspace{3mm}5.4  t$2 \leftarrow $ t$2 - a$ \\
+\hspace{3mm}5.5  t$3 \leftarrow $ t$3 \cdot b$ \\
+\hspace{3mm}5.6  t$3 \leftarrow \lfloor $t$2 / $t$3 \rfloor$ \\
+\hspace{3mm}5.7  t$2 \leftarrow $ t$1 - $ t$3$ \\
+\hspace{3mm}5.8  If t$1 \ne $ t$2$ then goto step 5.  \\
+6.  Loop \\
+\hspace{3mm}6.1  t$2 \leftarrow $ t$1^b$ \\
+\hspace{3mm}6.2  If t$2 > a$ then \\
+\hspace{6mm}6.2.1  t$1 \leftarrow $ t$1 - 1$ \\
+\hspace{6mm}6.2.2  Goto step 6. \\
+7.  $a.sign \leftarrow sign$ \\
+8.  $c \leftarrow $ t$1$ \\
+9.  $c.sign \leftarrow sign$  \\
+10.  Return(\textit{MP\_OKAY}).  \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_n\_root}
+\end{figure}
+\textbf{Algorithm mp\_n\_root.}
+This algorithm finds the integer $n$'th root of an input using the Newton-Raphson approach.  It is partially optimized based on the observation
+that the numerator of ${f(x) \over f'(x)}$ can be derived from a partial denominator.  That is at first the denominator is calculated by finding
+$x^{b - 1}$.  This value can then be multiplied by $x$ and have $a$ subtracted from it to find the numerator.  This saves a total of $b - 1$ 
+multiplications by t$1$ inside the loop.  
+
+The initial value of the approximation is t$2 = 2$ which allows the algorithm to start with very small values and quickly converge on the
+root.  Ideally this algorithm is meant to find the $n$'th root of an input where $n$ is bounded by $2 \le n \le 5$.  
+
+EXAM,bn_mp_n_root.c
+
+\section{Random Number Generation}
+
+Random numbers come up in a variety of activities from public key cryptography to simple simulations and various randomized algorithms.  Pollard-Rho 
+factoring for example, can make use of random values as starting points to find factors of a composite integer.  In this case the algorithm presented
+is solely for simulations and not intended for cryptographic use.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_rand}. \\
+\textbf{Input}.   An integer $b$ \\
+\textbf{Output}.  A pseudo-random number of $b$ digits \\
+\hline \\
+1.  $a \leftarrow 0$ \\
+2.  If $b \le 0$ return(\textit{MP\_OKAY}) \\
+3.  Pick a non-zero random digit $d$. \\
+4.  $a \leftarrow a + d$ \\
+5.  for $ix$ from 1 to $d - 1$ do \\
+\hspace{3mm}5.1  $a \leftarrow a \cdot \beta$ \\
+\hspace{3mm}5.2  Pick a random digit $d$. \\
+\hspace{3mm}5.3  $a \leftarrow a + d$ \\
+6.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_rand}
+\end{figure}
+\textbf{Algorithm mp\_rand.}
+This algorithm produces a pseudo-random integer of $b$ digits.  By ensuring that the first digit is non-zero the algorithm also guarantees that the
+final result has at least $b$ digits.  It relies heavily on a third-part random number generator which should ideally generate uniformly all of
+the integers from $0$ to $\beta - 1$.  
+
+EXAM,bn_mp_rand.c
+
+\section{Formatted Representations}
+The ability to emit a radix-$n$ textual representation of an integer is useful for interacting with human parties.  For example, the ability to
+be given a string of characters such as ``114585'' and turn it into the radix-$\beta$ equivalent would make it easier to enter numbers
+into a program.
+
+\subsection{Reading Radix-n Input}
+For the purposes of this text we will assume that a simple lower ASCII map (\ref{fig:ASC}) is used for the values of from $0$ to $63$ to 
+printable characters.  For example, when the character ``N'' is read it represents the integer $23$.  The first $16$ characters of the
+map are for the common representations up to hexadecimal.  After that they match the ``base64'' encoding scheme which are suitable chosen
+such that they are printable.  While outputting as base64 may not be too helpful for human operators it does allow communication via non binary
+mediums.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{cc|cc|cc|cc}
+\hline \textbf{Value} & \textbf{Char} & \textbf{Value} & \textbf{Char} & \textbf{Value} & \textbf{Char} &  \textbf{Value} & \textbf{Char} \\
+\hline 
+0 & 0 & 1 & 1 & 2 & 2 & 3 & 3 \\
+4 & 4 & 5 & 5 & 6 & 6 & 7 & 7 \\
+8 & 8 & 9 & 9 & 10 & A & 11 & B \\
+12 & C & 13 & D & 14 & E & 15 & F \\
+16 & G & 17 & H & 18 & I & 19 & J \\
+20 & K & 21 & L & 22 & M & 23 & N \\
+24 & O & 25 & P & 26 & Q & 27 & R \\
+28 & S & 29 & T & 30 & U & 31 & V \\
+32 & W & 33 & X & 34 & Y & 35 & Z \\
+36 & a & 37 & b & 38 & c & 39 & d \\
+40 & e & 41 & f & 42 & g & 43 & h \\
+44 & i & 45 & j & 46 & k & 47 & l \\
+48 & m & 49 & n & 50 & o & 51 & p \\
+52 & q & 53 & r & 54 & s & 55 & t \\
+56 & u & 57 & v & 58 & w & 59 & x \\
+60 & y & 61 & z & 62 & $+$ & 63 & $/$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Lower ASCII Map}
+\label{fig:ASC}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_read\_radix}. \\
+\textbf{Input}.   A string $str$ of length $sn$ and radix $r$. \\
+\textbf{Output}.  The radix-$\beta$ equivalent mp\_int. \\
+\hline \\
+1.  If $r < 2$ or $r > 64$ return(\textit{MP\_VAL}). \\
+2.  $ix \leftarrow 0$ \\
+3.  If $str_0 =$ ``-'' then do \\
+\hspace{3mm}3.1  $ix \leftarrow ix + 1$ \\
+\hspace{3mm}3.2  $sign \leftarrow MP\_NEG$ \\
+4.  else \\
+\hspace{3mm}4.1  $sign \leftarrow MP\_ZPOS$ \\
+5.  $a \leftarrow 0$ \\
+6.  for $iy$ from $ix$ to $sn - 1$ do \\
+\hspace{3mm}6.1  Let $y$ denote the position in the map of $str_{iy}$. \\
+\hspace{3mm}6.2  If $str_{iy}$ is not in the map or $y \ge r$ then goto step 7. \\
+\hspace{3mm}6.3  $a \leftarrow a \cdot r$ \\
+\hspace{3mm}6.4  $a \leftarrow a + y$ \\
+7.  If $a \ne 0$ then $a.sign \leftarrow sign$ \\
+8.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_read\_radix}
+\end{figure}
+\textbf{Algorithm mp\_read\_radix.}
+This algorithm will read an ASCII string and produce the radix-$\beta$ mp\_int representation of the same integer.  A minus symbol ``-'' may precede the 
+string  to indicate the value is negative, otherwise it is assumed to be positive.  The algorithm will read up to $sn$ characters from the input
+and will stop when it reads a character it cannot map the algorithm stops reading characters from the string.  This allows numbers to be embedded
+as part of larger input without any significant problem.
+
+EXAM,bn_mp_read_radix.c
+
+\subsection{Generating Radix-$n$ Output}
+Generating radix-$n$ output is fairly trivial with a division and remainder algorithm.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_toradix}. \\
+\textbf{Input}.   A mp\_int $a$ and an integer $r$\\
+\textbf{Output}.  The radix-$r$ representation of $a$ \\
+\hline \\
+1.  If $r < 2$ or $r > 64$ return(\textit{MP\_VAL}). \\
+2.  If $a = 0$ then $str = $ ``$0$'' and return(\textit{MP\_OKAY}).  \\
+3.  $t \leftarrow a$ \\
+4.  $str \leftarrow$ ``'' \\
+5.  if $t.sign = MP\_NEG$ then \\
+\hspace{3mm}5.1  $str \leftarrow str + $ ``-'' \\
+\hspace{3mm}5.2  $t.sign = MP\_ZPOS$ \\
+6.  While ($t \ne 0$) do \\
+\hspace{3mm}6.1  $d \leftarrow t \mbox{ (mod }r\mbox{)}$ \\
+\hspace{3mm}6.2  $t \leftarrow \lfloor t / r \rfloor$ \\
+\hspace{3mm}6.3  Look up $d$ in the map and store the equivalent character in $y$. \\
+\hspace{3mm}6.4  $str \leftarrow str + y$ \\
+7.  If $str_0 = $``$-$'' then \\
+\hspace{3mm}7.1  Reverse the digits $str_1, str_2, \ldots str_n$. \\
+8.  Otherwise \\
+\hspace{3mm}8.1  Reverse the digits $str_0, str_1, \ldots str_n$. \\
+9.  Return(\textit{MP\_OKAY}).\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_toradix}
+\end{figure}
+\textbf{Algorithm mp\_toradix.}
+This algorithm computes the radix-$r$ representation of an mp\_int $a$.  The ``digits'' of the representation are extracted by reducing 
+successive powers of $\lfloor a / r^k \rfloor$ the input modulo $r$ until $r^k > a$.  Note that instead of actually dividing by $r^k$ in
+each iteration the quotient $\lfloor a / r \rfloor$ is saved for the next iteration.  As a result a series of trivial $n \times 1$ divisions
+are required instead of a series of $n \times k$ divisions.  One design flaw of this approach is that the digits are produced in the reverse order 
+(see~\ref{fig:mpradix}).  To remedy this flaw the digits must be swapped or simply ``reversed''.
+
+\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|c|}
+\hline \textbf{Value of $a$} & \textbf{Value of $d$} & \textbf{Value of $str$} \\
+\hline $1234$ & -- & -- \\
+\hline $123$  & $4$ & ``4'' \\
+\hline $12$   & $3$ & ``43'' \\
+\hline $1$    & $2$ & ``432'' \\
+\hline $0$    & $1$ & ``4321'' \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Example of Algorithm mp\_toradix.}
+\label{fig:mpradix}
+\end{figure}
+
+EXAM,bn_mp_toradix.c
+
+\chapter{Number Theoretic Algorithms}
+This chapter discusses several fundamental number theoretic algorithms such as the greatest common divisor, least common multiple and Jacobi 
+symbol computation.  These algorithms arise as essential components in several key cryptographic algorithms such as the RSA public key algorithm and
+various Sieve based factoring algorithms.
+
+\section{Greatest Common Divisor}
+The greatest common divisor of two integers $a$ and $b$, often denoted as $(a, b)$ is the largest integer $k$ that is a proper divisor of
+both $a$ and $b$.  That is, $k$ is the largest integer such that $0 \equiv a \mbox{ (mod }k\mbox{)}$ and $0 \equiv b \mbox{ (mod }k\mbox{)}$ occur
+simultaneously.
+
+The most common approach (cite) is to reduce one input modulo another.  That is if $a$ and $b$ are divisible by some integer $k$ and if $qa + r = b$ then
+$r$ is also divisible by $k$.  The reduction pattern follows $\left < a , b \right > \rightarrow \left < b, a \mbox{ mod } b \right >$.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Greatest Common Divisor (I)}. \\
+\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
+\textbf{Output}.  The greatest common divisor $(a, b)$.  \\
+\hline \\
+1.  While ($b > 0$) do \\
+\hspace{3mm}1.1  $r \leftarrow a \mbox{ (mod }b\mbox{)}$ \\
+\hspace{3mm}1.2  $a \leftarrow b$ \\
+\hspace{3mm}1.3  $b \leftarrow r$ \\
+2.  Return($a$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Greatest Common Divisor (I)}
+\label{fig:gcd1}
+\end{figure}
+
+This algorithm will quickly converge on the greatest common divisor since the residue $r$ tends diminish rapidly.  However, divisions are
+relatively expensive operations to perform and should ideally be avoided.  There is another approach based on a similar relationship of 
+greatest common divisors.  The faster approach is based on the observation that if $k$ divides both $a$ and $b$ it will also divide $a - b$.  
+In particular, we would like $a - b$ to decrease in magnitude which implies that $b \ge a$.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Greatest Common Divisor (II)}. \\
+\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
+\textbf{Output}.  The greatest common divisor $(a, b)$.  \\
+\hline \\
+1.  While ($b > 0$) do \\
+\hspace{3mm}1.1  Swap $a$ and $b$ such that $a$ is the smallest of the two. \\
+\hspace{3mm}1.2  $b \leftarrow b - a$ \\
+2.  Return($a$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Greatest Common Divisor (II)}
+\label{fig:gcd2}
+\end{figure}
+
+\textbf{Proof} \textit{Algorithm~\ref{fig:gcd2} will return the greatest common divisor of $a$ and $b$.}
+The algorithm in figure~\ref{fig:gcd2} will eventually terminate since $b \ge a$ the subtraction in step 1.2 will be a value less than $b$.  In other
+words in every iteration that tuple $\left < a, b \right >$ decrease in magnitude until eventually $a = b$.  Since both $a$ and $b$ are always 
+divisible by the greatest common divisor (\textit{until the last iteration}) and in the last iteration of the algorithm $b = 0$, therefore, in the 
+second to last iteration of the algorithm $b = a$ and clearly $(a, a) = a$ which concludes the proof.  \textbf{QED}.
+
+As a matter of practicality algorithm \ref{fig:gcd1} decreases far too slowly to be useful.  Specially if $b$ is much larger than $a$ such that 
+$b - a$ is still very much larger than $a$.  A simple addition to the algorithm is to divide $b - a$ by a power of some integer $p$ which does
+not divide the greatest common divisor but will divide $b - a$.  In this case ${b - a} \over p$ is also an integer and still divisible by
+the greatest common divisor.
+
+However, instead of factoring $b - a$ to find a suitable value of $p$ the powers of $p$ can be removed from $a$ and $b$ that are in common first.  
+Then inside the loop whenever $b - a$ is divisible by some power of $p$ it can be safely removed.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Greatest Common Divisor (III)}. \\
+\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
+\textbf{Output}.  The greatest common divisor $(a, b)$.  \\
+\hline \\
+1.  $k \leftarrow 0$ \\
+2.  While $a$ and $b$ are both divisible by $p$ do \\
+\hspace{3mm}2.1  $a \leftarrow \lfloor a / p \rfloor$ \\
+\hspace{3mm}2.2  $b \leftarrow \lfloor b / p \rfloor$ \\
+\hspace{3mm}2.3  $k \leftarrow k + 1$ \\
+3.  While $a$ is divisible by $p$ do \\
+\hspace{3mm}3.1  $a \leftarrow \lfloor a / p \rfloor$ \\
+4.  While $b$ is divisible by $p$ do \\
+\hspace{3mm}4.1  $b \leftarrow \lfloor b / p \rfloor$ \\
+5.  While ($b > 0$) do \\
+\hspace{3mm}5.1  Swap $a$ and $b$ such that $a$ is the smallest of the two. \\
+\hspace{3mm}5.2  $b \leftarrow b - a$ \\
+\hspace{3mm}5.3  While $b$ is divisible by $p$ do \\
+\hspace{6mm}5.3.1  $b \leftarrow \lfloor b / p \rfloor$ \\
+6.  Return($a \cdot p^k$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Greatest Common Divisor (III)}
+\label{fig:gcd3}
+\end{figure}
+
+This algorithm is based on the first except it removes powers of $p$ first and inside the main loop to ensure the tuple $\left < a, b \right >$ 
+decreases more rapidly.  The first loop on step two removes powers of $p$ that are in common.  A count, $k$, is kept which will present a common
+divisor of $p^k$.  After step two the remaining common divisor of $a$ and $b$ cannot be divisible by $p$.  This means that $p$ can be safely 
+divided out of the difference $b - a$ so long as the division leaves no remainder.  
+
+In particular the value of $p$ should be chosen such that the division on step 5.3.1 occur often.  It also helps that division by $p$ be easy
+to compute.  The ideal choice of $p$ is two since division by two amounts to a right logical shift.  Another important observation is that by
+step five both $a$ and $b$ are odd.  Therefore, the diffrence $b - a$ must be even which means that each iteration removes one bit from the 
+largest of the pair.
+
+\subsection{Complete Greatest Common Divisor}
+The algorithms presented so far cannot handle inputs which are zero or negative.  The following algorithm can handle all input cases properly
+and will produce the greatest common divisor.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_gcd}. \\
+\textbf{Input}.   mp\_int $a$ and $b$ \\
+\textbf{Output}.  The greatest common divisor $c = (a, b)$.  \\
+\hline \\
+1.  If $a = 0$ and $b \ne 0$ then \\
+\hspace{3mm}1.1  $c \leftarrow b$ \\
+\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
+2.  If $a \ne 0$ and $b = 0$ then \\
+\hspace{3mm}2.1  $c \leftarrow a$ \\
+\hspace{3mm}2.2  Return(\textit{MP\_OKAY}). \\
+3.  If $a = b = 0$ then \\
+\hspace{3mm}3.1  $c \leftarrow 1$ \\
+\hspace{3mm}3.2  Return(\textit{MP\_OKAY}). \\
+4.  $u \leftarrow \vert a \vert, v \leftarrow \vert b \vert$ \\
+5.  $k \leftarrow 0$ \\
+6.  While $u.used > 0$ and $v.used > 0$ and $u_0 \equiv v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}6.1  $k \leftarrow k + 1$ \\
+\hspace{3mm}6.2  $u \leftarrow \lfloor u / 2 \rfloor$ \\
+\hspace{3mm}6.3  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+7.  While $u.used > 0$ and $u_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}7.1  $u \leftarrow \lfloor u / 2 \rfloor$ \\
+8.  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}8.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+9.  While $v.used > 0$ \\
+\hspace{3mm}9.1  If $\vert u \vert > \vert v \vert$ then \\
+\hspace{6mm}9.1.1  Swap $u$ and $v$. \\
+\hspace{3mm}9.2  $v \leftarrow \vert v \vert - \vert u \vert$ \\
+\hspace{3mm}9.3  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{6mm}9.3.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+10.  $c \leftarrow u \cdot 2^k$ \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_gcd}
+\end{figure}
+\textbf{Algorithm mp\_gcd.}
+This algorithm will produce the greatest common divisor of two mp\_ints $a$ and $b$.  The algorithm was originally based on Algorithm B of
+Knuth \cite[pp. 338]{TAOCPV2} but has been modified to be simpler to explain.  In theory it achieves the same asymptotic working time as
+Algorithm B and in practice this appears to be true.  
+
+The first three steps handle the cases where either one of or both inputs are zero.  If either input is zero the greatest common divisor is the 
+largest input or zero if they are both zero.  If the inputs are not trivial than $u$ and $v$ are assigned the absolute values of 
+$a$ and $b$ respectively and the algorithm will proceed to reduce the pair.
+
+Step six will divide out any common factors of two and keep track of the count in the variable $k$.  After this step two is no longer a
+factor of the remaining greatest common divisor between $u$ and $v$ and can be safely evenly divided out of either whenever they are even.  Step 
+seven and eight ensure that the $u$ and $v$ respectively have no more factors of two.  At most only one of the while loops will iterate since 
+they cannot both be even.
+
+By step nine both of $u$ and $v$ are odd which is required for the inner logic.  First the pair are swapped such that $v$ is equal to
+or greater than $u$.  This ensures that the subtraction on step 9.2 will always produce a positive and even result.  Step 9.3 removes any
+factors of two from the difference $u$ to ensure that in the next iteration of the loop both are once again odd.
+
+After $v = 0$ occurs the variable $u$ has the greatest common divisor of the pair $\left < u, v \right >$ just after step six.  The result
+must be adjusted by multiplying by the common factors of two ($2^k$) removed earlier.  
+
+EXAM,bn_mp_gcd.c
+
+This function makes use of the macros mp\_iszero and mp\_iseven.  The former evaluates to $1$ if the input mp\_int is equivalent to the 
+integer zero otherwise it evaluates to $0$.  The latter evaluates to $1$ if the input mp\_int represents a non-zero even integer otherwise
+it evaluates to $0$.  Note that just because mp\_iseven may evaluate to $0$ does not mean the input is odd, it could also be zero.  The three 
+trivial cases of inputs are handled on lines @25,zero@ through @34,}@.  After those lines the inputs are assumed to be non-zero.
+
+Lines @36,if@ and @40,if@ make local copies $u$ and $v$ of the inputs $a$ and $b$ respectively.  At this point the common factors of two 
+must be divided out of the two inputs.  The while loop on line @49,while@ iterates so long as both are even.  The local integer $k$ is used to
+keep track of how many factors of $2$ are pulled out of both values.  It is assumed that the number of factors will not exceed the maximum 
+value of a C ``int'' data type\footnote{Strictly speaking no array in C may have more than entries than are accessible by an ``int'' so this is not 
+a limitation.}.  
+
+At this point there are no more common factors of two in the two values.  The while loops on lines @60,while@ and @65,while@ remove any independent
+factors of two such that both $u$ and $v$ are guaranteed to be an odd integer before hitting the main body of the algorithm.  The while loop
+on line @71, while@ performs the reduction of the pair until $v$ is equal to zero.  The unsigned comparison and subtraction algorithms are used in
+place of the full signed routines since both values are guaranteed to be positive and the result of the subtraction is guaranteed to be non-negative.
+
+\section{Least Common Multiple}
+The least common multiple of a pair of integers is their product divided by their greatest common divisor.  For two integers $a$ and $b$ the
+least common multiple is normally denoted as $[ a, b ]$ and numerically equivalent to ${ab} \over {(a, b)}$.  For example, if $a = 2 \cdot 2 \cdot 3 = 12$
+and $b = 2 \cdot 3 \cdot 3 \cdot 7 = 126$ the least common multiple is ${126 \over {(12, 126)}} = {126 \over 6} = 21$.
+
+The least common multiple arises often in coding theory as well as number theory.  If two functions have periods of $a$ and $b$ respectively they will
+collide, that is be in synchronous states, after only $[ a, b ]$ iterations.  This is why, for example, random number generators based on 
+Linear Feedback Shift Registers (LFSR) tend to use registers with periods which are co-prime (\textit{e.g. the greatest common divisor is one.}).  
+Similarly in number theory if a composite $n$ has two prime factors $p$ and $q$ then maximal order of any unit of $\Z/n\Z$ will be $[ p - 1, q - 1] $.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_lcm}. \\
+\textbf{Input}.   mp\_int $a$ and $b$ \\
+\textbf{Output}.  The least common multiple $c = [a, b]$.  \\
+\hline \\
+1.  $c \leftarrow (a, b)$ \\
+2.  $t \leftarrow a \cdot b$ \\
+3.  $c \leftarrow \lfloor t / c \rfloor$ \\
+4.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_lcm}
+\end{figure}
+\textbf{Algorithm mp\_lcm.}
+This algorithm computes the least common multiple of two mp\_int inputs $a$ and $b$.  It computes the least common multiple directly by
+dividing the product of the two inputs by their greatest common divisor.
+
+EXAM,bn_mp_lcm.c
+
+\section{Jacobi Symbol Computation}
+To explain the Jacobi Symbol we shall first discuss the Legendre function\footnote{Arrg.  What is the name of this?} off which the Jacobi symbol is 
+defined.  The Legendre function computes whether or not an integer $a$ is a quadratic residue modulo an odd prime $p$.  Numerically it is
+equivalent to equation \ref{eqn:legendre}.
+
+\textit{-- Tom, don't be an ass, cite your source here...!}
+
+\begin{equation}
+a^{(p-1)/2} \equiv \begin{array}{rl}
+                              -1 &  \mbox{if }a\mbox{ is a quadratic non-residue.} \\
+                              0  &  \mbox{if }a\mbox{ divides }p\mbox{.} \\
+                              1  &  \mbox{if }a\mbox{ is a quadratic residue}. 
+                              \end{array} \mbox{ (mod }p\mbox{)}
+\label{eqn:legendre}                              
+\end{equation}
+
+\textbf{Proof.} \textit{Equation \ref{eqn:legendre} correctly identifies the residue status of an integer $a$ modulo a prime $p$.}
+An integer $a$ is a quadratic residue if the following equation has a solution.
+
+\begin{equation}
+x^2 \equiv a \mbox{ (mod }p\mbox{)}
+\label{eqn:root}
+\end{equation}
+
+Consider the following equation.
+
+\begin{equation}
+0 \equiv x^{p-1} - 1 \equiv \left \lbrace \left (x^2 \right )^{(p-1)/2} - a^{(p-1)/2} \right \rbrace + \left ( a^{(p-1)/2} - 1 \right ) \mbox{ (mod }p\mbox{)}
+\label{eqn:rooti}
+\end{equation}
+
+Whether equation \ref{eqn:root} has a solution or not equation \ref{eqn:rooti} is always true.  If $a^{(p-1)/2} - 1 \equiv 0 \mbox{ (mod }p\mbox{)}$
+then the quantity in the braces must be zero.  By reduction,
+
+\begin{eqnarray}
+\left (x^2 \right )^{(p-1)/2} - a^{(p-1)/2} \equiv 0  \nonumber \\
+\left (x^2 \right )^{(p-1)/2} \equiv a^{(p-1)/2} \nonumber \\
+x^2 \equiv a \mbox{ (mod }p\mbox{)} 
+\end{eqnarray}
+
+As a result there must be a solution to the quadratic equation and in turn $a$ must be a quadratic residue.  If $a$ does not divide $p$ and $a$
+is not a quadratic residue then the only other value $a^{(p-1)/2}$ may be congruent to is $-1$ since
+\begin{equation}
+0 \equiv a^{p - 1} - 1 \equiv (a^{(p-1)/2} + 1)(a^{(p-1)/2} - 1) \mbox{ (mod }p\mbox{)}
+\end{equation}
+One of the terms on the right hand side must be zero.  \textbf{QED}
+
+\subsection{Jacobi Symbol}
+The Jacobi symbol is a generalization of the Legendre function for any odd non prime moduli $p$ greater than 2.  If $p = \prod_{i=0}^n p_i$ then
+the Jacobi symbol $\left ( { a \over p } \right )$ is equal to the following equation.
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( { a \over p_0} \right ) \left ( { a \over p_1} \right ) \ldots \left ( { a \over p_n} \right )
+\end{equation}
+
+By inspection if $p$ is prime the Jacobi symbol is equivalent to the Legendre function.  The following facts\footnote{See HAC \cite[pp. 72-74]{HAC} for
+further details.} will be used to derive an efficient Jacobi symbol algorithm.  Where $p$ is an odd integer greater than two and $a, b \in \Z$ the
+following are true.  
+
+\begin{enumerate}
+\item $\left ( { a \over p} \right )$ equals $-1$, $0$ or $1$. 
+\item $\left ( { ab \over p} \right ) = \left ( { a \over p} \right )\left ( { b \over p} \right )$.
+\item If $a \equiv b$ then $\left ( { a \over p} \right ) = \left ( { b \over p} \right )$.
+\item $\left ( { 2 \over p} \right )$ equals $1$ if $p \equiv 1$ or $7 \mbox{ (mod }8\mbox{)}$.  Otherwise, it equals $-1$.
+\item $\left ( { a \over p} \right ) \equiv \left ( { p \over a} \right ) \cdot (-1)^{(p-1)(a-1)/4}$.  More specifically 
+$\left ( { a \over p} \right ) = \left ( { p \over a} \right )$ if $p \equiv a \equiv 1 \mbox{ (mod }4\mbox{)}$.  
+\end{enumerate}
+
+Using these facts if $a = 2^k \cdot a'$ then
+
+\begin{eqnarray}
+\left ( { a \over p } \right ) = \left ( {{2^k} \over p } \right ) \left ( {a' \over p} \right ) \nonumber \\
+                               = \left ( {2 \over p } \right )^k \left ( {a' \over p} \right ) 
+\label{eqn:jacobi}
+\end{eqnarray}
+
+By fact five, 
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( { p \over a } \right ) \cdot (-1)^{(p-1)(a-1)/4} 
+\end{equation}
+
+Subsequently by fact three since $p \equiv (p \mbox{ mod }a) \mbox{ (mod }a\mbox{)}$ then 
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( { {p \mbox{ mod } a} \over a } \right ) \cdot (-1)^{(p-1)(a-1)/4} 
+\end{equation}
+
+By putting both observations into equation \ref{eqn:jacobi} the following simplified equation is formed.
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( {2 \over p } \right )^k \left ( {{p\mbox{ mod }a'} \over a'} \right )  \cdot (-1)^{(p-1)(a'-1)/4} 
+\end{equation}
+
+The value of $\left ( {{p \mbox{ mod }a'} \over a'} \right )$ can be found by using the same equation recursively.  The value of 
+$\left ( {2 \over p } \right )^k$ equals $1$ if $k$ is even otherwise it equals $\left ( {2 \over p } \right )$.  Using this approach the 
+factors of $p$ do not have to be known.  Furthermore, if $(a, p) = 1$ then the algorithm will terminate when the recursion requests the 
+Jacobi symbol computation of $\left ( {1 \over a'} \right )$ which is simply $1$.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_jacobi}. \\
+\textbf{Input}.   mp\_int $a$ and $p$, $a \ge 0$, $p \ge 3$, $p \equiv 1 \mbox{ (mod }2\mbox{)}$ \\
+\textbf{Output}.  The Jacobi symbol $c = \left ( {a \over p } \right )$. \\
+\hline \\
+1.  If $a = 0$ then \\
+\hspace{3mm}1.1  $c \leftarrow 0$ \\
+\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
+2.  If $a = 1$ then \\
+\hspace{3mm}2.1  $c \leftarrow 1$ \\
+\hspace{3mm}2.2  Return(\textit{MP\_OKAY}). \\
+3.  $a' \leftarrow a$ \\
+4.  $k \leftarrow 0$ \\
+5.  While $a'.used > 0$ and $a'_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}5.1  $k \leftarrow k + 1$ \\
+\hspace{3mm}5.2  $a' \leftarrow \lfloor a' / 2 \rfloor$ \\
+6.  If $k \equiv 0 \mbox{ (mod }2\mbox{)}$ then \\
+\hspace{3mm}6.1  $s \leftarrow 1$ \\
+7.  else \\
+\hspace{3mm}7.1  $r \leftarrow p_0 \mbox{ (mod }8\mbox{)}$ \\
+\hspace{3mm}7.2  If $r = 1$ or $r = 7$ then \\
+\hspace{6mm}7.2.1  $s \leftarrow 1$ \\
+\hspace{3mm}7.3  else \\
+\hspace{6mm}7.3.1  $s \leftarrow -1$ \\
+8.  If $p_0 \equiv a'_0 \equiv 3 \mbox{ (mod }4\mbox{)}$ then \\
+\hspace{3mm}8.1  $s \leftarrow -s$ \\
+9.  If $a' \ne 1$ then \\
+\hspace{3mm}9.1  $p' \leftarrow p \mbox{ (mod }a'\mbox{)}$ \\
+\hspace{3mm}9.2  $s \leftarrow s \cdot \mbox{mp\_jacobi}(p', a')$ \\
+10.  $c \leftarrow s$ \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_jacobi}
+\end{figure}
+\textbf{Algorithm mp\_jacobi.}
+This algorithm computes the Jacobi symbol for an arbitrary positive integer $a$ with respect to an odd integer $p$ greater than three.  The algorithm
+is based on algorithm 2.149 of HAC \cite[pp. 73]{HAC}.  
+
+Step numbers one and two handle the trivial cases of $a = 0$ and $a = 1$ respectively.  Step five determines the number of two factors in the
+input $a$.  If $k$ is even than the term $\left ( { 2 \over p } \right )^k$ must always evaluate to one.  If $k$ is odd than the term evaluates to one 
+if $p_0$ is congruent to one or seven modulo eight, otherwise it evaluates to $-1$. After the the $\left ( { 2 \over p } \right )^k$ term is handled 
+the $(-1)^{(p-1)(a'-1)/4}$ is computed and multiplied against the current product $s$.  The latter term evaluates to one if both $p$ and $a'$ 
+are congruent to one modulo four, otherwise it evaluates to negative one.
+
+By step nine if $a'$ does not equal one a recursion is required.  Step 9.1 computes $p' \equiv p \mbox{ (mod }a'\mbox{)}$ and will recurse to compute
+$\left ( {p' \over a'} \right )$ which is multiplied against the current Jacobi product.
+
+EXAM,bn_mp_jacobi.c
+
+As a matter of practicality the variable $a'$ as per the pseudo-code is reprensented by the variable $a1$ since the $'$ symbol is not valid for a C 
+variable name character. 
+
+The two simple cases of $a = 0$ and $a = 1$ are handled at the very beginning to simplify the algorithm.  If the input is non-trivial the algorithm
+has to proceed compute the Jacobi.  The variable $s$ is used to hold the current Jacobi product.  Note that $s$ is merely a C ``int'' data type since
+the values it may obtain are merely $-1$, $0$ and $1$.  
+
+After a local copy of $a$ is made all of the factors of two are divided out and the total stored in $k$.  Technically only the least significant
+bit of $k$ is required, however, it makes the algorithm simpler to follow to perform an addition. In practice an exclusive-or and addition have the same 
+processor requirements and neither is faster than the other.
+
+Line @59, if@ through @70, }@ determines the value of $\left ( { 2 \over p } \right )^k$.  If the least significant bit of $k$ is zero than
+$k$ is even and the value is one.  Otherwise, the value of $s$ depends on which residue class $p$ belongs to modulo eight.  The value of
+$(-1)^{(p-1)(a'-1)/4}$ is compute and multiplied against $s$ on lines @73, if@ through @75, }@.  
+
+Finally, if $a1$ does not equal one the algorithm must recurse and compute $\left ( {p' \over a'} \right )$.  
+
+\textit{-- Comment about default $s$ and such...}
+
+\section{Modular Inverse}
+\label{sec:modinv}
+The modular inverse of a number actually refers to the modular multiplicative inverse.  Essentially for any integer $a$ such that $(a, p) = 1$ there
+exist another integer $b$ such that $ab \equiv 1 \mbox{ (mod }p\mbox{)}$.  The integer $b$ is called the multiplicative inverse of $a$ which is
+denoted as $b = a^{-1}$.  Technically speaking modular inversion is a well defined operation for any finite ring or field not just for rings and 
+fields of integers.  However, the former will be the matter of discussion.
+
+The simplest approach is to compute the algebraic inverse of the input.  That is to compute $b \equiv a^{\Phi(p) - 1}$.  If $\Phi(p)$ is the 
+order of the multiplicative subgroup modulo $p$ then $b$ must be the multiplicative inverse of $a$.  The proof of which is trivial.
+
+\begin{equation}
+ab \equiv a \left (a^{\Phi(p) - 1} \right ) \equiv a^{\Phi(p)} \equiv a^0 \equiv 1 \mbox{ (mod }p\mbox{)}
+\end{equation}
+
+However, as simple as this approach may be it has two serious flaws.  It requires that the value of $\Phi(p)$ be known which if $p$ is composite 
+requires all of the prime factors.  This approach also is very slow as the size of $p$ grows.  
+
+A simpler approach is based on the observation that solving for the multiplicative inverse is equivalent to solving the linear 
+Diophantine\footnote{See LeVeque \cite[pp. 40-43]{LeVeque} for more information.} equation.
+
+\begin{equation}
+ab + pq = 1
+\end{equation}
+
+Where $a$, $b$, $p$ and $q$ are all integers.  If such a pair of integers $ \left < b, q \right >$ exist than $b$ is the multiplicative inverse of 
+$a$ modulo $p$.  The extended Euclidean algorithm (Knuth \cite[pp. 342]{TAOCPV2}) can be used to solve such equations provided $(a, p) = 1$.  
+However, instead of using that algorithm directly a variant known as the binary Extended Euclidean algorithm will be used in its place.  The
+binary approach is very similar to the binary greatest common divisor algorithm except it will produce a full solution to the Diophantine 
+equation.  
+
+\subsection{General Case}
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_invmod}. \\
+\textbf{Input}.   mp\_int $a$ and $b$, $(a, b) = 1$, $p \ge 2$, $0 < a < p$.  \\
+\textbf{Output}.  The modular inverse $c \equiv a^{-1} \mbox{ (mod }b\mbox{)}$. \\
+\hline \\
+1.  If $b \le 0$ then return(\textit{MP\_VAL}). \\
+2.  If $b_0 \equiv 1 \mbox{ (mod }2\mbox{)}$ then use algorithm fast\_mp\_invmod. \\
+3.  $x \leftarrow \vert a \vert, y \leftarrow b$ \\
+4.  If $x_0 \equiv y_0  \equiv 0 \mbox{ (mod }2\mbox{)}$ then return(\textit{MP\_VAL}). \\
+5.  $B \leftarrow 0, C \leftarrow 0, A \leftarrow 1, D \leftarrow 1$ \\
+6.  While $u.used > 0$ and $u_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}6.1  $u \leftarrow \lfloor u / 2 \rfloor$ \\
+\hspace{3mm}6.2  If ($A.used > 0$ and $A_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) or ($B.used > 0$ and $B_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) then \\
+\hspace{6mm}6.2.1  $A \leftarrow A + y$ \\
+\hspace{6mm}6.2.2  $B \leftarrow B - x$ \\
+\hspace{3mm}6.3  $A \leftarrow \lfloor A / 2 \rfloor$ \\
+\hspace{3mm}6.4  $B \leftarrow \lfloor B / 2 \rfloor$ \\
+7.  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}7.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+\hspace{3mm}7.2  If ($C.used > 0$ and $C_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) or ($D.used > 0$ and $D_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) then \\
+\hspace{6mm}7.2.1  $C \leftarrow C + y$ \\
+\hspace{6mm}7.2.2  $D \leftarrow D - x$ \\
+\hspace{3mm}7.3  $C \leftarrow \lfloor C / 2 \rfloor$ \\
+\hspace{3mm}7.4  $D \leftarrow \lfloor D / 2 \rfloor$ \\
+8.  If $u \ge v$ then \\
+\hspace{3mm}8.1  $u \leftarrow u - v$ \\
+\hspace{3mm}8.2  $A \leftarrow A - C$ \\
+\hspace{3mm}8.3  $B \leftarrow B - D$ \\
+9.  else \\
+\hspace{3mm}9.1  $v \leftarrow v - u$ \\
+\hspace{3mm}9.2  $C \leftarrow C - A$ \\
+\hspace{3mm}9.3  $D \leftarrow D - B$ \\
+10.  If $u \ne 0$ goto step 6. \\
+11.  If $v \ne 1$ return(\textit{MP\_VAL}). \\
+12.  While $C \le 0$ do \\
+\hspace{3mm}12.1  $C \leftarrow C + b$ \\
+13.  While $C \ge b$ do \\
+\hspace{3mm}13.1  $C \leftarrow C - b$ \\
+14.  $c \leftarrow C$ \\
+15.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\end{figure}
+\textbf{Algorithm mp\_invmod.}
+This algorithm computes the modular multiplicative inverse of an integer $a$ modulo an integer $b$.  This algorithm is a variation of the 
+extended binary Euclidean algorithm from HAC \cite[pp. 608]{HAC}.  It has been modified to only compute the modular inverse and not a complete
+Diophantine solution.  
+
+If $b \le 0$ than the modulus is invalid and MP\_VAL is returned.  Similarly if both $a$ and $b$ are even then there cannot be a multiplicative
+inverse for $a$ and the error is reported.  
+
+The astute reader will observe that steps seven through nine are very similar to the binary greatest common divisor algorithm mp\_gcd.  In this case
+the other variables to the Diophantine equation are solved.  The algorithm terminates when $u = 0$ in which case the solution is
+
+\begin{equation}
+Ca + Db = v
+\end{equation}
+
+If $v$, the greatest common divisor of $a$ and $b$ is not equal to one then the algorithm will report an error as no inverse exists.  Otherwise, $C$
+is the modular inverse of $a$.  The actual value of $C$ is congruent to, but not necessarily equal to, the ideal modular inverse which should lie 
+within $1 \le a^{-1} < b$.  Step numbers twelve and thirteen adjust the inverse until it is in range.  If the original input $a$ is within $0 < a < p$ 
+then only a couple of additions or subtractions will be required to adjust the inverse.
+
+EXAM,bn_mp_invmod.c
+
+\subsubsection{Odd Moduli}
+
+When the modulus $b$ is odd the variables $A$ and $C$ are fixed and are not required to compute the inverse.  In particular by attempting to solve
+the Diophantine $Cb + Da = 1$ only $B$ and $D$ are required to find the inverse of $a$.  
+
+The algorithm fast\_mp\_invmod is a direct adaptation of algorithm mp\_invmod with all all steps involving either $A$ or $C$ removed.  This 
+optimization will halve the time required to compute the modular inverse.
+
+\section{Primality Tests}
+
+A non-zero integer $a$ is said to be prime if it is not divisible by any other integer excluding one and itself.  For example, $a = 7$ is prime 
+since the integers $2 \ldots 6$ do not evenly divide $a$.  By contrast, $a = 6$ is not prime since $a = 6 = 2 \cdot 3$. 
+
+Prime numbers arise in cryptography considerably as they allow finite fields to be formed.  The ability to determine whether an integer is prime or
+not quickly has been a viable subject in cryptography and number theory for considerable time.  The algorithms that will be presented are all
+probablistic algorithms in that when they report an integer is composite it must be composite.  However, when the algorithms report an integer is
+prime the algorithm may be incorrect.  
+
+As will be discussed it is possible to limit the probability of error so well that for practical purposes the probablity of error might as 
+well be zero.  For the purposes of these discussions let $n$ represent the candidate integer of which the primality is in question.
+
+\subsection{Trial Division}
+
+Trial division means to attempt to evenly divide a candidate integer by small prime integers.  If the candidate can be evenly divided it obviously
+cannot be prime.  By dividing by all primes $1 < p \le \sqrt{n}$ this test can actually prove whether an integer is prime.  However, such a test
+would require a prohibitive amount of time as $n$ grows.
+
+Instead of dividing by every prime, a smaller, more mangeable set of primes may be used instead.  By performing trial division with only a subset
+of the primes less than $\sqrt{n} + 1$ the algorithm cannot prove if a candidate is prime.  However, often it can prove a candidate is not prime.
+
+The benefit of this test is that trial division by small values is fairly efficient.  Specially compared to the other algorithms that will be
+discussed shortly.  The probability that this approach correctly identifies a composite candidate when tested with all primes upto $q$ is given by
+$1 - {1.12 \over ln(q)}$.  The graph (\ref{pic:primality}, will be added later) demonstrates the probability of success for the range 
+$3 \le q \le 100$.  
+
+At approximately $q = 30$ the gain of performing further tests diminishes fairly quickly.  At $q = 90$ further testing is generally not going to 
+be of any practical use.  In the case of LibTomMath the default limit $q = 256$ was chosen since it is not too high and will eliminate 
+approximately $80\%$ of all candidate integers.  The constant \textbf{PRIME\_SIZE} is equal to the number of primes in the test base.  The 
+array \_\_prime\_tab is an array of the first \textbf{PRIME\_SIZE} prime numbers.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_prime\_is\_divisible}. \\
+\textbf{Input}.   mp\_int $a$ \\
+\textbf{Output}.  $c = 1$ if $n$ is divisible by a small prime, otherwise $c = 0$.  \\
+\hline \\
+1.  for $ix$ from $0$ to $PRIME\_SIZE$ do \\
+\hspace{3mm}1.1  $d \leftarrow n \mbox{ (mod }\_\_prime\_tab_{ix}\mbox{)}$ \\
+\hspace{3mm}1.2  If $d = 0$ then \\
+\hspace{6mm}1.2.1  $c \leftarrow 1$ \\
+\hspace{6mm}1.2.2  Return(\textit{MP\_OKAY}). \\
+2.  $c \leftarrow 0$ \\
+3.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_prime\_is\_divisible}
+\end{figure}
+\textbf{Algorithm mp\_prime\_is\_divisible.}
+This algorithm attempts to determine if a candidate integer $n$ is composite by performing trial divisions.  
+
+EXAM,bn_mp_prime_is_divisible.c
+
+The algorithm defaults to a return of $0$ in case an error occurs.  The values in the prime table are all specified to be in the range of a 
+mp\_digit.  The table \_\_prime\_tab is defined in the following file.
+
+EXAM,bn_prime_tab.c
+
+Note that there are two possible tables.  When an mp\_digit is 7-bits long only the primes upto $127$ may be included, otherwise the primes
+upto $1619$ are used.  Note that the value of \textbf{PRIME\_SIZE} is a constant dependent on the size of a mp\_digit. 
+
+\subsection{The Fermat Test}
+The Fermat test is probably one the oldest tests to have a non-trivial probability of success.  It is based on the fact that if $n$ is in 
+fact prime then $a^{n} \equiv a \mbox{ (mod }n\mbox{)}$ for all $0 < a < n$.  The reason being that if $n$ is prime than the order of
+the multiplicative sub group is $n - 1$.  Any base $a$ must have an order which divides $n - 1$ and as such $a^n$ is equivalent to 
+$a^1 = a$.  
+
+If $n$ is composite then any given base $a$ does not have to have a period which divides $n - 1$.  In which case 
+it is possible that $a^n \nequiv a \mbox{ (mod }n\mbox{)}$.  However, this test is not absolute as it is possible that the order
+of a base will divide $n - 1$ which would then be reported as prime.  Such a base yields what is known as a Fermat pseudo-prime.  Several 
+integers known as Carmichael numbers will be a pseudo-prime to all valid bases.  Fortunately such numbers are extremely rare as $n$ grows
+in size.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_prime\_fermat}. \\
+\textbf{Input}.   mp\_int $a$ and $b$, $a \ge 2$, $0 < b < a$.  \\
+\textbf{Output}.  $c = 1$ if $b^a \equiv b \mbox{ (mod }a\mbox{)}$, otherwise $c = 0$.  \\
+\hline \\
+1.  $t \leftarrow b^a \mbox{ (mod }a\mbox{)}$ \\
+2.  If $t = b$ then \\
+\hspace{3mm}2.1  $c = 1$ \\
+3.  else \\
+\hspace{3mm}3.1  $c = 0$ \\
+4.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_prime\_fermat}
+\end{figure}
+\textbf{Algorithm mp\_prime\_fermat.}
+This algorithm determines whether an mp\_int $a$ is a Fermat prime to the base $b$ or not.  It uses a single modular exponentiation to
+determine the result.  
+
+EXAM,bn_mp_prime_fermat.c
+
+\subsection{The Miller-Rabin Test}
+The Miller-Rabin (citation) test is another primality test which has tighter error bounds than the Fermat test specifically with sequentially chosen 
+candidate  integers.  The algorithm is based on the observation that if $n - 1 = 2^kr$ and if $b^r \nequiv \pm 1$ then after upto $k - 1$ squarings the 
+value must be equal to $-1$.  The squarings are stopped as soon as $-1$ is observed.  If the value of $1$ is observed first it means that
+some value not congruent to $\pm 1$ when squared equals one which cannot occur if $n$ is prime.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_prime\_miller\_rabin}. \\
+\textbf{Input}.   mp\_int $a$ and $b$, $a \ge 2$, $0 < b < a$.  \\
+\textbf{Output}.  $c = 1$ if $a$ is a Miller-Rabin prime to the base $a$, otherwise $c = 0$.  \\
+\hline
+1.  $a' \leftarrow a - 1$ \\
+2.  $r  \leftarrow n1$    \\
+3.  $c \leftarrow 0, s  \leftarrow 0$ \\
+4.  While $r.used > 0$ and $r_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}4.1  $s \leftarrow s + 1$ \\
+\hspace{3mm}4.2  $r \leftarrow \lfloor r / 2 \rfloor$ \\
+5.  $y \leftarrow b^r \mbox{ (mod }a\mbox{)}$ \\
+6.  If $y \nequiv \pm 1$ then \\
+\hspace{3mm}6.1  $j \leftarrow 1$ \\
+\hspace{3mm}6.2  While $j \le (s - 1)$ and $y \nequiv a'$ \\
+\hspace{6mm}6.2.1  $y \leftarrow y^2 \mbox{ (mod }a\mbox{)}$ \\
+\hspace{6mm}6.2.2  If $y = 1$ then goto step 8. \\
+\hspace{6mm}6.2.3  $j \leftarrow j + 1$ \\
+\hspace{3mm}6.3  If $y \nequiv a'$ goto step 8. \\
+7.  $c \leftarrow 1$\\
+8.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_prime\_miller\_rabin}
+\end{figure}
+\textbf{Algorithm mp\_prime\_miller\_rabin.}
+This algorithm performs one trial round of the Miller-Rabin algorithm to the base $b$.  It will set $c = 1$ if the algorithm cannot determine
+if $b$ is composite or $c = 0$ if $b$ is provably composite.  The values of $s$ and $r$ are computed such that $a' = a - 1 = 2^sr$.  
+
+If the value $y \equiv b^r$ is congruent to $\pm 1$ then the algorithm cannot prove if $a$ is composite or not.  Otherwise, the algorithm will
+square $y$ upto $s - 1$ times stopping only when $y \equiv -1$.  If $y^2 \equiv 1$ and $y \nequiv \pm 1$ then the algorithm can report that $a$
+is provably composite.  If the algorithm performs $s - 1$ squarings and $y \nequiv -1$ then $a$ is provably composite.  If $a$ is not provably 
+composite then it is \textit{probably} prime.
+
+EXAM,bn_mp_prime_miller_rabin.c
+
+
+
+
+\backmatter
+\appendix
+\begin{thebibliography}{ABCDEF}
+\bibitem[1]{TAOCPV2}
+Donald Knuth, \textit{The Art of Computer Programming}, Third Edition, Volume Two, Seminumerical Algorithms, Addison-Wesley, 1998
+
+\bibitem[2]{HAC}
+A. Menezes, P. van Oorschot, S. Vanstone, \textit{Handbook of Applied Cryptography}, CRC Press, 1996
+
+\bibitem[3]{ROSE}
+Michael Rosing, \textit{Implementing Elliptic Curve Cryptography}, Manning Publications, 1999
+
+\bibitem[4]{COMBA}
+Paul G. Comba, \textit{Exponentiation Cryptosystems on the IBM PC}. IBM Systems Journal 29(4): 526-538 (1990)
+
+\bibitem[5]{KARA}
+A. Karatsuba, Doklay Akad. Nauk SSSR 145 (1962), pp.293-294
+
+\bibitem[6]{KARAP}
+Andre Weimerskirch and Christof Paar, \textit{Generalizations of the Karatsuba Algorithm for Polynomial Multiplication}, Submitted to Design, Codes and Cryptography, March 2002
+
+\bibitem[7]{BARRETT}
+Paul Barrett, \textit{Implementing the Rivest Shamir and Adleman Public Key Encryption Algorithm on a Standard Digital Signal Processor}, Advances in Cryptology, Crypto '86, Springer-Verlag.
+
+\bibitem[8]{MONT}
+P.L.Montgomery. \textit{Modular multiplication without trial division}. Mathematics of Computation, 44(170):519-521, April 1985.
+
+\bibitem[9]{DRMET}
+Chae Hoon Lim and Pil Joong Lee, \textit{Generating Efficient Primes for Discrete Log Cryptosystems}, POSTECH Information Research Laboratories
+
+\bibitem[10]{MMB}
+J. Daemen and R. Govaerts and J. Vandewalle, \textit{Block ciphers based on Modular Arithmetic}, State and {P}rogress in the {R}esearch of {C}ryptography, 1993, pp. 80-89
+
+\bibitem[11]{RSAREF}
+R.L. Rivest, A. Shamir, L. Adleman, \textit{A Method for Obtaining Digital Signatures and Public-Key Cryptosystems}
+
+\bibitem[12]{DHREF}
+Whitfield Diffie, Martin E. Hellman, \textit{New Directions in Cryptography}, IEEE Transactions on Information Theory, 1976
+
+\bibitem[13]{IEEE}
+IEEE Standard for Binary Floating-Point Arithmetic (ANSI/IEEE Std 754-1985)
+
+\bibitem[14]{GMP}
+GNU Multiple Precision (GMP), \url{http://www.swox.com/gmp/}
+
+\bibitem[15]{MPI}
+Multiple Precision Integer Library (MPI), Michael Fromberger, \url{http://thayer.dartmouth.edu/~sting/mpi/}
+
+\bibitem[16]{OPENSSL}
+OpenSSL Cryptographic Toolkit, \url{http://openssl.org}
+
+\bibitem[17]{LIP}
+Large Integer Package, \url{http://home.hetnet.nl/~ecstr/LIP.zip}
+
+\bibitem[18]{ISOC}
+JTC1/SC22/WG14, ISO/IEC 9899:1999, ``A draft rationale for the C99 standard.''
+
+\bibitem[19]{JAVA}
+The Sun Java Website, \url{http://java.sun.com/}
+
+\end{thebibliography}
+
+\input{tommath.ind}
+
+\end{document}
diff --git a/libtommath/tommath.tex b/libtommath/tommath.tex
new file mode 100644
index 0000000..b016010
--- /dev/null
+++ b/libtommath/tommath.tex
@@ -0,0 +1,10814 @@
+\documentclass[b5paper]{book}
+\usepackage{hyperref}
+\usepackage{makeidx}
+\usepackage{amssymb}
+\usepackage{color}
+\usepackage{alltt}
+\usepackage{graphicx}
+\usepackage{layout}
+\def\union{\cup}
+\def\intersect{\cap}
+\def\getsrandom{\stackrel{\rm R}{\gets}}
+\def\cross{\times}
+\def\cat{\hspace{0.5em} \| \hspace{0.5em}}
+\def\catn{$\|$}
+\def\divides{\hspace{0.3em} | \hspace{0.3em}}
+\def\nequiv{\not\equiv}
+\def\approx{\raisebox{0.2ex}{\mbox{\small $\sim$}}}
+\def\lcm{{\rm lcm}}
+\def\gcd{{\rm gcd}}
+\def\log{{\rm log}}
+\def\ord{{\rm ord}}
+\def\abs{{\mathit abs}}
+\def\rep{{\mathit rep}}
+\def\mod{{\mathit\ mod\ }}
+\renewcommand{\pmod}[1]{\ ({\rm mod\ }{#1})}
+\newcommand{\floor}[1]{\left\lfloor{#1}\right\rfloor}
+\newcommand{\ceil}[1]{\left\lceil{#1}\right\rceil}
+\def\Or{{\rm\ or\ }}
+\def\And{{\rm\ and\ }}
+\def\iff{\hspace{1em}\Longleftrightarrow\hspace{1em}}
+\def\implies{\Rightarrow}
+\def\undefined{{\rm ``undefined"}}
+\def\Proof{\vspace{1ex}\noindent {\bf Proof:}\hspace{1em}}
+\let\oldphi\phi
+\def\phi{\varphi}
+\def\Pr{{\rm Pr}}
+\newcommand{\str}[1]{{\mathbf{#1}}}
+\def\F{{\mathbb F}}
+\def\N{{\mathbb N}}
+\def\Z{{\mathbb Z}}
+\def\R{{\mathbb R}}
+\def\C{{\mathbb C}}
+\def\Q{{\mathbb Q}}
+\definecolor{DGray}{gray}{0.5}
+\newcommand{\emailaddr}[1]{\mbox{$<${#1}$>$}}
+\def\twiddle{\raisebox{0.3ex}{\mbox{\tiny $\sim$}}}
+\def\gap{\vspace{0.5ex}}
+\makeindex
+\begin{document}
+\frontmatter
+\pagestyle{empty}
+\title{Multi--Precision Math}
+\author{\mbox{
+%\begin{small}
+\begin{tabular}{c}
+Tom St Denis \\
+Algonquin College \\
+\\
+Mads Rasmussen \\
+Open Communications Security \\
+\\
+Greg Rose \\
+QUALCOMM Australia \\
+\end{tabular}
+%\end{small}
+}
+}
+\maketitle
+This text has been placed in the public domain.  This text corresponds to the v0.35 release of the 
+LibTomMath project.
+
+\begin{alltt}
+Tom St Denis
+111 Banning Rd
+Ottawa, Ontario
+K2L 1C3
+Canada
+
+Phone: 1-613-836-3160
+Email: tomstdenis@iahu.ca
+\end{alltt}
+
+This text is formatted to the international B5 paper size of 176mm wide by 250mm tall using the \LaTeX{} 
+{\em book} macro package and the Perl {\em booker} package.
+
+\tableofcontents
+\listoffigures
+\chapter*{Prefaces}
+When I tell people about my LibTom projects and that I release them as public domain they are often puzzled.  
+They ask why I did it and especially why I continue to work on them for free.  The best I can explain it is ``Because I can.''  
+Which seems odd and perhaps too terse for adult conversation. I often qualify it with ``I am able, I am willing.'' which 
+perhaps explains it better.  I am the first to admit there is not anything that special with what I have done.  Perhaps
+others can see that too and then we would have a society to be proud of.  My LibTom projects are what I am doing to give 
+back to society in the form of tools and knowledge that can help others in their endeavours.
+
+I started writing this book because it was the most logical task to further my goal of open academia.  The LibTomMath source
+code itself was written to be easy to follow and learn from.  There are times, however, where pure C source code does not
+explain the algorithms properly.  Hence this book.  The book literally starts with the foundation of the library and works
+itself outwards to the more complicated algorithms.  The use of both pseudo--code and verbatim source code provides a duality
+of ``theory'' and ``practice'' that the computer science students of the world shall appreciate.  I never deviate too far
+from relatively straightforward algebra and I hope that this book can be a valuable learning asset.
+
+This book and indeed much of the LibTom projects would not exist in their current form if it was not for a plethora
+of kind people donating their time, resources and kind words to help support my work.  Writing a text of significant
+length (along with the source code) is a tiresome and lengthy process.  Currently the LibTom project is four years old,
+comprises of literally thousands of users and over 100,000 lines of source code, TeX and other material.  People like Mads and Greg 
+were there at the beginning to encourage me to work well.  It is amazing how timely validation from others can boost morale to 
+continue the project. Definitely my parents were there for me by providing room and board during the many months of work in 2003.  
+
+To my many friends whom I have met through the years I thank you for the good times and the words of encouragement.  I hope I
+honour your kind gestures with this project.
+
+Open Source.  Open Academia.  Open Minds.
+
+\begin{flushright} Tom St Denis \end{flushright}
+
+\newpage
+I found the opportunity to work with Tom appealing for several reasons, not only could I broaden my own horizons, but also 
+contribute to educate others facing the problem of having to handle big number mathematical calculations.
+
+This book is Tom's child and he has been caring and fostering the project ever since the beginning with a clear mind of 
+how he wanted the project to turn out. I have helped by proofreading the text and we have had several discussions about 
+the layout and language used.
+
+I hold a masters degree in cryptography from the University of Southern Denmark and have always been interested in the 
+practical aspects of cryptography. 
+
+Having worked in the security consultancy business for several years in S\~{a}o Paulo, Brazil, I have been in touch with a 
+great deal of work in which multiple precision mathematics was needed. Understanding the possibilities for speeding up 
+multiple precision calculations is often very important since we deal with outdated machine architecture where modular 
+reductions, for example, become painfully slow.
+
+This text is for people who stop and wonder when first examining algorithms such as RSA for the first time and asks 
+themselves, ``You tell me this is only secure for large numbers, fine; but how do you implement these numbers?''
+
+\begin{flushright}
+Mads Rasmussen
+
+S\~{a}o Paulo - SP
+
+Brazil
+\end{flushright}
+
+\newpage
+It's all because I broke my leg. That just happened to be at about the same time that Tom asked for someone to review the section of the book about 
+Karatsuba multiplication. I was laid up, alone and immobile, and thought ``Why not?'' I vaguely knew what Karatsuba multiplication was, but not 
+really, so I thought I could help, learn, and stop myself from watching daytime cable TV, all at once.
+
+At the time of writing this, I've still not met Tom or Mads in meatspace. I've been following Tom's progress since his first splash on the 
+sci.crypt Usenet news group. I watched him go from a clueless newbie, to the cryptographic equivalent of a reformed smoker, to a real
+contributor to the field, over a period of about two years. I've been impressed with his obvious intelligence, and astounded by his productivity. 
+Of course, he's young enough to be my own child, so he doesn't have my problems with staying awake.
+
+When I reviewed that single section of the book, in its very earliest form, I was very pleasantly surprised. So I decided to collaborate more fully, 
+and at least review all of it, and perhaps write some bits too. There's still a long way to go with it, and I have watched a number of close 
+friends go through the mill of publication, so I think that the way to go is longer than Tom thinks it is. Nevertheless, it's a good effort, 
+and I'm pleased to be involved with it.
+
+\begin{flushright}
+Greg Rose, Sydney, Australia, June 2003. 
+\end{flushright}
+
+\mainmatter
+\pagestyle{headings}
+\chapter{Introduction}
+\section{Multiple Precision Arithmetic}
+
+\subsection{What is Multiple Precision Arithmetic?}
+When we think of long-hand arithmetic such as addition or multiplication we rarely consider the fact that we instinctively
+raise or lower the precision of the numbers we are dealing with.  For example, in decimal we almost immediate can 
+reason that $7$ times $6$ is $42$.  However, $42$ has two digits of precision as opposed to one digit we started with.  
+Further multiplications of say $3$ result in a larger precision result $126$.  In these few examples we have multiple 
+precisions for the numbers we are working with.  Despite the various levels of precision a single subset\footnote{With the occasional optimization.}
+ of algorithms can be designed to accomodate them.  
+
+By way of comparison a fixed or single precision operation would lose precision on various operations.  For example, in
+the decimal system with fixed precision $6 \cdot 7 = 2$.
+
+Essentially at the heart of computer based multiple precision arithmetic are the same long-hand algorithms taught in
+schools to manually add, subtract, multiply and divide.  
+
+\subsection{The Need for Multiple Precision Arithmetic}
+The most prevalent need for multiple precision arithmetic, often referred to as ``bignum'' math, is within the implementation
+of public-key cryptography algorithms.   Algorithms such as RSA \cite{RSAREF} and Diffie-Hellman \cite{DHREF} require 
+integers of significant magnitude to resist known cryptanalytic attacks.  For example, at the time of this writing a 
+typical RSA modulus would be at least greater than $10^{309}$.  However, modern programming languages such as ISO C \cite{ISOC} and 
+Java \cite{JAVA} only provide instrinsic support for integers which are relatively small and single precision.
+
+\begin{figure}[!here]
+\begin{center}
+\begin{tabular}{|r|c|}
+\hline \textbf{Data Type} & \textbf{Range} \\
+\hline char  & $-128 \ldots 127$ \\
+\hline short & $-32768 \ldots 32767$ \\
+\hline long  & $-2147483648 \ldots 2147483647$ \\
+\hline long long & $-9223372036854775808 \ldots 9223372036854775807$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Typical Data Types for the C Programming Language}
+\label{fig:ISOC}
+\end{figure}
+
+The largest data type guaranteed to be provided by the ISO C programming 
+language\footnote{As per the ISO C standard.  However, each compiler vendor is allowed to augment the precision as they 
+see fit.}  can only represent values up to $10^{19}$ as shown in figure \ref{fig:ISOC}. On its own the C language is 
+insufficient to accomodate the magnitude required for the problem at hand.  An RSA modulus of magnitude $10^{19}$ could be 
+trivially factored\footnote{A Pollard-Rho factoring would take only $2^{16}$ time.} on the average desktop computer, 
+rendering any protocol based on the algorithm insecure.  Multiple precision algorithms solve this very problem by 
+extending the range of representable integers while using single precision data types.
+
+Most advancements in fast multiple precision arithmetic stem from the need for faster and more efficient cryptographic 
+primitives.  Faster modular reduction and exponentiation algorithms such as Barrett's algorithm, which have appeared in 
+various cryptographic journals, can render algorithms such as RSA and Diffie-Hellman more efficient.  In fact, several 
+major companies such as RSA Security, Certicom and Entrust have built entire product lines on the implementation and 
+deployment of efficient algorithms.
+
+However, cryptography is not the only field of study that can benefit from fast multiple precision integer routines.  
+Another auxiliary use of multiple precision integers is high precision floating point data types.  
+The basic IEEE \cite{IEEE} standard floating point type is made up of an integer mantissa $q$, an exponent $e$ and a sign bit $s$.  
+Numbers are given in the form $n = q \cdot b^e \cdot -1^s$ where $b = 2$ is the most common base for IEEE.  Since IEEE 
+floating point is meant to be implemented in hardware the precision of the mantissa is often fairly small 
+(\textit{23, 48 and 64 bits}).  The mantissa is merely an integer and a multiple precision integer could be used to create
+a mantissa of much larger precision than hardware alone can efficiently support.  This approach could be useful where 
+scientific applications must minimize the total output error over long calculations.
+
+Yet another use for large integers is within arithmetic on polynomials of large characteristic (i.e. $GF(p)[x]$ for large $p$).
+In fact the library discussed within this text has already been used to form a polynomial basis library\footnote{See \url{http://poly.libtomcrypt.org} for more details.}.
+
+\subsection{Benefits of Multiple Precision Arithmetic}
+\index{precision}
+The benefit of multiple precision representations over single or fixed precision representations is that 
+no precision is lost while representing the result of an operation which requires excess precision.  For example, 
+the product of two $n$-bit integers requires at least $2n$ bits of precision to be represented faithfully.  A multiple 
+precision algorithm would augment the precision of the destination to accomodate the result while a single precision system 
+would truncate excess bits to maintain a fixed level of precision.
+
+It is possible to implement algorithms which require large integers with fixed precision algorithms.  For example, elliptic
+curve cryptography (\textit{ECC}) is often implemented on smartcards by fixing the precision of the integers to the maximum 
+size the system will ever need.  Such an approach can lead to vastly simpler algorithms which can accomodate the 
+integers required even if the host platform cannot natively accomodate them\footnote{For example, the average smartcard 
+processor has an 8 bit accumulator.}.  However, as efficient as such an approach may be, the resulting source code is not
+normally very flexible.  It cannot, at runtime, accomodate inputs of higher magnitude than the designer anticipated.
+
+Multiple precision algorithms have the most overhead of any style of arithmetic.  For the the most part the 
+overhead can be kept to a minimum with careful planning, but overall, it is not well suited for most memory starved
+platforms.  However, multiple precision algorithms do offer the most flexibility in terms of the magnitude of the 
+inputs.  That is, the same algorithms based on multiple precision integers can accomodate any reasonable size input 
+without the designer's explicit forethought.  This leads to lower cost of ownership for the code as it only has to 
+be written and tested once.
+
+\section{Purpose of This Text}
+The purpose of this text is to instruct the reader regarding how to implement efficient multiple precision algorithms.  
+That is to not only explain a limited subset of the core theory behind the algorithms but also the various ``house keeping'' 
+elements that are neglected by authors of other texts on the subject.  Several well reknowned texts \cite{TAOCPV2,HAC} 
+give considerably detailed explanations of the theoretical aspects of algorithms and often very little information 
+regarding the practical implementation aspects.  
+
+In most cases how an algorithm is explained and how it is actually implemented are two very different concepts.  For 
+example, the Handbook of Applied Cryptography (\textit{HAC}), algorithm 14.7 on page 594, gives a relatively simple 
+algorithm for performing multiple precision integer addition.  However, the description lacks any discussion concerning 
+the fact that the two integer inputs may be of differing magnitudes.  As a result the implementation is not as simple
+as the text would lead people to believe.  Similarly the division routine (\textit{algorithm 14.20, pp. 598}) does not 
+discuss how to handle sign or handle the dividend's decreasing magnitude in the main loop (\textit{step \#3}).
+
+Both texts also do not discuss several key optimal algorithms required such as ``Comba'' and Karatsuba multipliers 
+and fast modular inversion, which we consider practical oversights.  These optimal algorithms are vital to achieve 
+any form of useful performance in non-trivial applications.  
+
+To solve this problem the focus of this text is on the practical aspects of implementing a multiple precision integer
+package.  As a case study the ``LibTomMath''\footnote{Available at \url{http://math.libtomcrypt.org}} package is used 
+to demonstrate algorithms with real implementations\footnote{In the ISO C programming language.} that have been field 
+tested and work very well.  The LibTomMath library is freely available on the Internet for all uses and this text 
+discusses a very large portion of the inner workings of the library.
+
+The algorithms that are presented will always include at least one ``pseudo-code'' description followed 
+by the actual C source code that implements the algorithm.  The pseudo-code can be used to implement the same 
+algorithm in other programming languages as the reader sees fit.  
+
+This text shall also serve as a walkthrough of the creation of multiple precision algorithms from scratch.  Showing
+the reader how the algorithms fit together as well as where to start on various taskings.  
+
+\section{Discussion and Notation}
+\subsection{Notation}
+A multiple precision integer of $n$-digits shall be denoted as $x = (x_{n-1}, \ldots, x_1, x_0)_{ \beta }$ and represent
+the integer $x \equiv \sum_{i=0}^{n-1} x_i\beta^i$.  The elements of the array $x$ are said to be the radix $\beta$ digits 
+of the integer.  For example, $x = (1,2,3)_{10}$ would represent the integer 
+$1\cdot 10^2 + 2\cdot10^1 + 3\cdot10^0 = 123$.  
+
+\index{mp\_int}
+The term ``mp\_int'' shall refer to a composite structure which contains the digits of the integer it represents, as well 
+as auxilary data required to manipulate the data.  These additional members are discussed further in section 
+\ref{sec:MPINT}.  For the purposes of this text a ``multiple precision integer'' and an ``mp\_int'' are assumed to be 
+synonymous.  When an algorithm is specified to accept an mp\_int variable it is assumed the various auxliary data members 
+are present as well.  An expression of the type \textit{variablename.item} implies that it should evaluate to the 
+member named ``item'' of the variable.  For example, a string of characters may have a member ``length'' which would 
+evaluate to the number of characters in the string.  If the string $a$ equals ``hello'' then it follows that 
+$a.length = 5$.  
+
+For certain discussions more generic algorithms are presented to help the reader understand the final algorithm used
+to solve a given problem.  When an algorithm is described as accepting an integer input it is assumed the input is 
+a plain integer with no additional multiple-precision members.  That is, algorithms that use integers as opposed to 
+mp\_ints as inputs do not concern themselves with the housekeeping operations required such as memory management.  These 
+algorithms will be used to establish the relevant theory which will subsequently be used to describe a multiple
+precision algorithm to solve the same problem.  
+
+\subsection{Precision Notation}
+The variable $\beta$ represents the radix of a single digit of a multiple precision integer and 
+must be of the form $q^p$ for $q, p \in \Z^+$.  A single precision variable must be able to represent integers in 
+the range $0 \le x < q \beta$ while a double precision variable must be able to represent integers in the range 
+$0 \le x < q \beta^2$.  The extra radix-$q$ factor allows additions and subtractions to proceed without truncation of the 
+carry.  Since all modern computers are binary, it is assumed that $q$ is two.
+
+\index{mp\_digit} \index{mp\_word}
+Within the source code that will be presented for each algorithm, the data type \textbf{mp\_digit} will represent 
+a single precision integer type, while, the data type \textbf{mp\_word} will represent a double precision integer type.  In 
+several algorithms (notably the Comba routines) temporary results will be stored in arrays of double precision mp\_words.  
+For the purposes of this text $x_j$ will refer to the $j$'th digit of a single precision array and $\hat x_j$ will refer to 
+the $j$'th digit of a double precision array.  Whenever an expression is to be assigned to a double precision
+variable it is assumed that all single precision variables are promoted to double precision during the evaluation.  
+Expressions that are assigned to a single precision variable are truncated to fit within the precision of a single
+precision data type.
+
+For example, if $\beta = 10^2$ a single precision data type may represent a value in the 
+range $0 \le x < 10^3$, while a double precision data type may represent a value in the range $0 \le x < 10^5$.  Let
+$a = 23$ and $b = 49$ represent two single precision variables.  The single precision product shall be written
+as $c \leftarrow a \cdot b$ while the double precision product shall be written as $\hat c \leftarrow a \cdot b$.
+In this particular case, $\hat c = 1127$ and $c = 127$.  The most significant digit of the product would not fit 
+in a single precision data type and as a result $c \ne \hat c$.  
+
+\subsection{Algorithm Inputs and Outputs}
+Within the algorithm descriptions all variables are assumed to be scalars of either single or double precision
+as indicated.  The only exception to this rule is when variables have been indicated to be of type mp\_int.  This 
+distinction is important as scalars are often used as array indicies and various other counters.  
+
+\subsection{Mathematical Expressions}
+The $\lfloor \mbox{ } \rfloor$ brackets imply an expression truncated to an integer not greater than the expression 
+itself.  For example, $\lfloor 5.7 \rfloor = 5$.  Similarly the $\lceil \mbox{ } \rceil$ brackets imply an expression
+rounded to an integer not less than the expression itself.  For example, $\lceil 5.1 \rceil = 6$.  Typically when 
+the $/$ division symbol is used the intention is to perform an integer division with truncation.  For example, 
+$5/2 = 2$ which will often be written as $\lfloor 5/2 \rfloor = 2$ for clarity.  When an expression is written as a 
+fraction a real value division is implied, for example ${5 \over 2} = 2.5$.  
+
+The norm of a multiple precision integer, for example $\vert \vert x \vert \vert$, will be used to represent the number of digits in the representation
+of the integer.  For example, $\vert \vert 123 \vert \vert = 3$ and $\vert \vert 79452 \vert \vert = 5$.  
+
+\subsection{Work Effort}
+\index{big-Oh}
+To measure the efficiency of the specified algorithms, a modified big-Oh notation is used.  In this system all 
+single precision operations are considered to have the same cost\footnote{Except where explicitly noted.}.  
+That is a single precision addition, multiplication and division are assumed to take the same time to 
+complete.  While this is generally not true in practice, it will simplify the discussions considerably.
+
+Some algorithms have slight advantages over others which is why some constants will not be removed in 
+the notation.  For example, a normal baseline multiplication (section \ref{sec:basemult}) requires $O(n^2)$ work while a 
+baseline squaring (section \ref{sec:basesquare}) requires $O({{n^2 + n}\over 2})$ work.  In standard big-Oh notation these 
+would both be said to be equivalent to $O(n^2)$.  However, 
+in the context of the this text this is not the case as the magnitude of the inputs will typically be rather small.  As a 
+result small constant factors in the work effort will make an observable difference in algorithm efficiency.
+
+All of the algorithms presented in this text have a polynomial time work level.  That is, of the form 
+$O(n^k)$ for $n, k \in \Z^{+}$.  This will help make useful comparisons in terms of the speed of the algorithms and how 
+various optimizations will help pay off in the long run.
+
+\section{Exercises}
+Within the more advanced chapters a section will be set aside to give the reader some challenging exercises related to
+the discussion at hand.  These exercises are not designed to be prize winning problems, but instead to be thought 
+provoking.  Wherever possible the problems are forward minded, stating problems that will be answered in subsequent 
+chapters.  The reader is encouraged to finish the exercises as they appear to get a better understanding of the 
+subject material.  
+
+That being said, the problems are designed to affirm knowledge of a particular subject matter.  Students in particular
+are encouraged to verify they can answer the problems correctly before moving on.
+
+Similar to the exercises of \cite[pp. ix]{TAOCPV2} these exercises are given a scoring system based on the difficulty of
+the problem.  However, unlike \cite{TAOCPV2} the problems do not get nearly as hard.  The scoring of these 
+exercises ranges from one (the easiest) to five (the hardest).  The following table sumarizes the 
+scoring system used.
+
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+\begin{tabular}{|c|l|}
+\hline $\left [ 1 \right ]$ & An easy problem that should only take the reader a manner of \\
+                            & minutes to solve.  Usually does not involve much computer time \\
+                            & to solve. \\
+\hline $\left [ 2 \right ]$ & An easy problem that involves a marginal amount of computer \\
+                     & time usage.  Usually requires a program to be written to \\
+                     & solve the problem. \\
+\hline $\left [ 3 \right ]$ & A moderately hard problem that requires a non-trivial amount \\
+                     & of work.  Usually involves trivial research and development of \\
+                     & new theory from the perspective of a student. \\
+\hline $\left [ 4 \right ]$ & A moderately hard problem that involves a non-trivial amount \\
+                     & of work and research, the solution to which will demonstrate \\
+                     & a higher mastery of the subject matter. \\
+\hline $\left [ 5 \right ]$ & A hard problem that involves concepts that are difficult for a \\
+                     & novice to solve.  Solutions to these problems will demonstrate a \\
+                     & complete mastery of the given subject. \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Exercise Scoring System}
+\end{figure}
+
+Problems at the first level are meant to be simple questions that the reader can answer quickly without programming a solution or
+devising new theory.  These problems are quick tests to see if the material is understood.  Problems at the second level 
+are also designed to be easy but will require a program or algorithm to be implemented to arrive at the answer.  These
+two levels are essentially entry level questions.  
+
+Problems at the third level are meant to be a bit more difficult than the first two levels.  The answer is often 
+fairly obvious but arriving at an exacting solution requires some thought and skill.  These problems will almost always 
+involve devising a new algorithm or implementing a variation of another algorithm previously presented.  Readers who can
+answer these questions will feel comfortable with the concepts behind the topic at hand.
+
+Problems at the fourth level are meant to be similar to those of the level three questions except they will require 
+additional research to be completed.  The reader will most likely not know the answer right away, nor will the text provide 
+the exact details of the answer until a subsequent chapter.  
+
+Problems at the fifth level are meant to be the hardest 
+problems relative to all the other problems in the chapter.  People who can correctly answer fifth level problems have a 
+mastery of the subject matter at hand.
+
+Often problems will be tied together.  The purpose of this is to start a chain of thought that will be discussed in future chapters.  The reader
+is encouraged to answer the follow-up problems and try to draw the relevance of problems.
+
+\section{Introduction to LibTomMath}
+
+\subsection{What is LibTomMath?}
+LibTomMath is a free and open source multiple precision integer library written entirely in portable ISO C.  By portable it 
+is meant that the library does not contain any code that is computer platform dependent or otherwise problematic to use on 
+any given platform.  
+
+The library has been successfully tested under numerous operating systems including Unix\footnote{All of these
+trademarks belong to their respective rightful owners.}, MacOS, Windows, Linux, PalmOS and on standalone hardware such 
+as the Gameboy Advance.  The library is designed to contain enough functionality to be able to develop applications such 
+as public key cryptosystems and still maintain a relatively small footprint.
+
+\subsection{Goals of LibTomMath}
+
+Libraries which obtain the most efficiency are rarely written in a high level programming language such as C.  However, 
+even though this library is written entirely in ISO C, considerable care has been taken to optimize the algorithm implementations within the 
+library.  Specifically the code has been written to work well with the GNU C Compiler (\textit{GCC}) on both x86 and ARM 
+processors.  Wherever possible, highly efficient algorithms, such as Karatsuba multiplication, sliding window 
+exponentiation and Montgomery reduction have been provided to make the library more efficient.  
+
+Even with the nearly optimal and specialized algorithms that have been included the Application Programing Interface 
+(\textit{API}) has been kept as simple as possible.  Often generic place holder routines will make use of specialized 
+algorithms automatically without the developer's specific attention.  One such example is the generic multiplication 
+algorithm \textbf{mp\_mul()} which will automatically use Toom--Cook, Karatsuba, Comba or baseline multiplication 
+based on the magnitude of the inputs and the configuration of the library.  
+
+Making LibTomMath as efficient as possible is not the only goal of the LibTomMath project.  Ideally the library should 
+be source compatible with another popular library which makes it more attractive for developers to use.  In this case the
+MPI library was used as a API template for all the basic functions.  MPI was chosen because it is another library that fits 
+in the same niche as LibTomMath.  Even though LibTomMath uses MPI as the template for the function names and argument 
+passing conventions, it has been written from scratch by Tom St Denis.
+
+The project is also meant to act as a learning tool for students, the logic being that no easy-to-follow ``bignum'' 
+library exists which can be used to teach computer science students how to perform fast and reliable multiple precision 
+integer arithmetic.  To this end the source code has been given quite a few comments and algorithm discussion points.  
+
+\section{Choice of LibTomMath}
+LibTomMath was chosen as the case study of this text not only because the author of both projects is one and the same but
+for more worthy reasons.  Other libraries such as GMP \cite{GMP}, MPI \cite{MPI}, LIP \cite{LIP} and OpenSSL 
+\cite{OPENSSL} have multiple precision integer arithmetic routines but would not be ideal for this text for 
+reasons that will be explained in the following sub-sections.
+
+\subsection{Code Base}
+The LibTomMath code base is all portable ISO C source code.  This means that there are no platform dependent conditional
+segments of code littered throughout the source.  This clean and uncluttered approach to the library means that a
+developer can more readily discern the true intent of a given section of source code without trying to keep track of
+what conditional code will be used.
+
+The code base of LibTomMath is well organized.  Each function is in its own separate source code file 
+which allows the reader to find a given function very quickly.  On average there are $76$ lines of code per source
+file which makes the source very easily to follow.  By comparison MPI and LIP are single file projects making code tracing
+very hard.  GMP has many conditional code segments which also hinder tracing.  
+
+When compiled with GCC for the x86 processor and optimized for speed the entire library is approximately $100$KiB\footnote{The notation ``KiB'' means $2^{10}$ octets, similarly ``MiB'' means $2^{20}$ octets.}
+ which is fairly small compared to GMP (over $250$KiB).  LibTomMath is slightly larger than MPI (which compiles to about 
+$50$KiB) but LibTomMath is also much faster and more complete than MPI.
+
+\subsection{API Simplicity}
+LibTomMath is designed after the MPI library and shares the API design.  Quite often programs that use MPI will build 
+with LibTomMath without change. The function names correlate directly to the action they perform.  Almost all of the 
+functions share the same parameter passing convention.  The learning curve is fairly shallow with the API provided 
+which is an extremely valuable benefit for the student and developer alike.  
+
+The LIP library is an example of a library with an API that is awkward to work with.  LIP uses function names that are often ``compressed'' to 
+illegible short hand.  LibTomMath does not share this characteristic.  
+
+The GMP library also does not return error codes.  Instead it uses a POSIX.1 \cite{POSIX1} signal system where errors
+are signaled to the host application.  This happens to be the fastest approach but definitely not the most versatile.  In
+effect a math error (i.e. invalid input, heap error, etc) can cause a program to stop functioning which is definitely 
+undersireable in many situations.
+
+\subsection{Optimizations}
+While LibTomMath is certainly not the fastest library (GMP often beats LibTomMath by a factor of two) it does
+feature a set of optimal algorithms for tasks such as modular reduction, exponentiation, multiplication and squaring.  GMP 
+and LIP also feature such optimizations while MPI only uses baseline algorithms with no optimizations.  GMP lacks a few
+of the additional modular reduction optimizations that LibTomMath features\footnote{At the time of this writing GMP
+only had Barrett and Montgomery modular reduction algorithms.}.  
+
+LibTomMath is almost always an order of magnitude faster than the MPI library at computationally expensive tasks such as modular
+exponentiation.  In the grand scheme of ``bignum'' libraries LibTomMath is faster than the average library and usually  
+slower than the best libraries such as GMP and OpenSSL by only a small factor.
+
+\subsection{Portability and Stability}
+LibTomMath will build ``out of the box'' on any platform equipped with a modern version of the GNU C Compiler 
+(\textit{GCC}).  This means that without changes the library will build without configuration or setting up any 
+variables.  LIP and MPI will build ``out of the box'' as well but have numerous known bugs.  Most notably the author of 
+MPI has recently stopped working on his library and LIP has long since been discontinued.  
+
+GMP requires a configuration script to run and will not build out of the box.   GMP and LibTomMath are still in active
+development and are very stable across a variety of platforms.
+
+\subsection{Choice}
+LibTomMath is a relatively compact, well documented, highly optimized and portable library which seems only natural for
+the case study of this text.  Various source files from the LibTomMath project will be included within the text.  However, 
+the reader is encouraged to download their own copy of the library to actually be able to work with the library.  
+
+\chapter{Getting Started}
+\section{Library Basics}
+The trick to writing any useful library of source code is to build a solid foundation and work outwards from it.  First, 
+a problem along with allowable solution parameters should be identified and analyzed.  In this particular case the 
+inability to accomodate multiple precision integers is the problem.  Futhermore, the solution must be written
+as portable source code that is reasonably efficient across several different computer platforms.
+
+After a foundation is formed the remainder of the library can be designed and implemented in a hierarchical fashion.  
+That is, to implement the lowest level dependencies first and work towards the most abstract functions last.  For example, 
+before implementing a modular exponentiation algorithm one would implement a modular reduction algorithm.
+By building outwards from a base foundation instead of using a parallel design methodology the resulting project is 
+highly modular.  Being highly modular is a desirable property of any project as it often means the resulting product
+has a small footprint and updates are easy to perform.  
+
+Usually when I start a project I will begin with the header files.  I define the data types I think I will need and 
+prototype the initial functions that are not dependent on other functions (within the library).  After I 
+implement these base functions I prototype more dependent functions and implement them.   The process repeats until
+I implement all of the functions I require.  For example, in the case of LibTomMath I implemented functions such as 
+mp\_init() well before I implemented mp\_mul() and even further before I implemented mp\_exptmod().  As an example as to 
+why this design works note that the Karatsuba and Toom-Cook multipliers were written \textit{after} the 
+dependent function mp\_exptmod() was written.  Adding the new multiplication algorithms did not require changes to the 
+mp\_exptmod() function itself and lowered the total cost of ownership (\textit{so to speak}) and of development 
+for new algorithms.  This methodology allows new algorithms to be tested in a complete framework with relative ease.
+
+\begin{center}
+\begin{figure}[here]
+\includegraphics{pics/design_process.ps}
+\caption{Design Flow of the First Few Original LibTomMath Functions.}
+\label{pic:design_process}
+\end{figure}
+\end{center}
+
+Only after the majority of the functions were in place did I pursue a less hierarchical approach to auditing and optimizing
+the source code.  For example, one day I may audit the multipliers and the next day the polynomial basis functions.  
+
+It only makes sense to begin the text with the preliminary data types and support algorithms required as well.  
+This chapter discusses the core algorithms of the library which are the dependents for every other algorithm.
+
+\section{What is a Multiple Precision Integer?}
+Recall that most programming languages, in particular ISO C \cite{ISOC}, only have fixed precision data types that on their own cannot 
+be used to represent values larger than their precision will allow. The purpose of multiple precision algorithms is 
+to use fixed precision data types to create and manipulate multiple precision integers which may represent values 
+that are very large.  
+
+As a well known analogy, school children are taught how to form numbers larger than nine by prepending more radix ten digits.  In the decimal system
+the largest single digit value is $9$.  However, by concatenating digits together larger numbers may be represented.  Newly prepended digits 
+(\textit{to the left}) are said to be in a different power of ten column.  That is, the number $123$ can be described as having a $1$ in the hundreds 
+column, $2$ in the tens column and $3$ in the ones column.  Or more formally $123 = 1 \cdot 10^2 + 2 \cdot 10^1 + 3 \cdot 10^0$.  Computer based 
+multiple precision arithmetic is essentially the same concept.  Larger integers are represented by adjoining fixed 
+precision computer words with the exception that a different radix is used.
+
+What most people probably do not think about explicitly are the various other attributes that describe a multiple precision 
+integer.  For example, the integer $154_{10}$ has two immediately obvious properties.  First, the integer is positive, 
+that is the sign of this particular integer is positive as opposed to negative.  Second, the integer has three digits in 
+its representation.  There is an additional property that the integer posesses that does not concern pencil-and-paper 
+arithmetic.  The third property is how many digits placeholders are available to hold the integer.  
+
+The human analogy of this third property is ensuring there is enough space on the paper to write the integer.  For example,
+if one starts writing a large number too far to the right on a piece of paper they will have to erase it and move left.  
+Similarly, computer algorithms must maintain strict control over memory usage to ensure that the digits of an integer
+will not exceed the allowed boundaries.  These three properties make up what is known as a multiple precision 
+integer or mp\_int for short.  
+
+\subsection{The mp\_int Structure}
+\label{sec:MPINT}
+The mp\_int structure is the ISO C based manifestation of what represents a multiple precision integer.  The ISO C standard does not provide for 
+any such data type but it does provide for making composite data types known as structures.  The following is the structure definition 
+used within LibTomMath.
+
+\index{mp\_int}
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+%\begin{verbatim}
+\begin{tabular}{|l|}
+\hline
+typedef struct \{ \\
+\hspace{3mm}int used, alloc, sign;\\
+\hspace{3mm}mp\_digit *dp;\\
+\} \textbf{mp\_int}; \\
+\hline
+\end{tabular}
+%\end{verbatim}
+\end{small}
+\caption{The mp\_int Structure}
+\label{fig:mpint}
+\end{center}
+\end{figure}
+
+The mp\_int structure (fig. \ref{fig:mpint}) can be broken down as follows.
+
+\begin{enumerate}
+\item The \textbf{used} parameter denotes how many digits of the array \textbf{dp} contain the digits used to represent
+a given integer.  The \textbf{used} count must be positive (or zero) and may not exceed the \textbf{alloc} count.  
+
+\item The \textbf{alloc} parameter denotes how 
+many digits are available in the array to use by functions before it has to increase in size.  When the \textbf{used} count 
+of a result would exceed the \textbf{alloc} count all of the algorithms will automatically increase the size of the 
+array to accommodate the precision of the result.  
+
+\item The pointer \textbf{dp} points to a dynamically allocated array of digits that represent the given multiple 
+precision integer.  It is padded with $(\textbf{alloc} - \textbf{used})$ zero digits.  The array is maintained in a least 
+significant digit order.  As a pencil and paper analogy the array is organized such that the right most digits are stored
+first starting at the location indexed by zero\footnote{In C all arrays begin at zero.} in the array.  For example, 
+if \textbf{dp} contains $\lbrace a, b, c, \ldots \rbrace$ where \textbf{dp}$_0 = a$, \textbf{dp}$_1 = b$, \textbf{dp}$_2 = c$, $\ldots$ then 
+it would represent the integer $a + b\beta + c\beta^2 + \ldots$  
+
+\index{MP\_ZPOS} \index{MP\_NEG}
+\item The \textbf{sign} parameter denotes the sign as either zero/positive (\textbf{MP\_ZPOS}) or negative (\textbf{MP\_NEG}).  
+\end{enumerate}
+
+\subsubsection{Valid mp\_int Structures}
+Several rules are placed on the state of an mp\_int structure and are assumed to be followed for reasons of efficiency.  
+The only exceptions are when the structure is passed to initialization functions such as mp\_init() and mp\_init\_copy().
+
+\begin{enumerate}
+\item The value of \textbf{alloc} may not be less than one.  That is \textbf{dp} always points to a previously allocated
+array of digits.
+\item The value of \textbf{used} may not exceed \textbf{alloc} and must be greater than or equal to zero.
+\item The value of \textbf{used} implies the digit at index $(used - 1)$ of the \textbf{dp} array is non-zero.  That is, 
+leading zero digits in the most significant positions must be trimmed.
+   \begin{enumerate}
+   \item Digits in the \textbf{dp} array at and above the \textbf{used} location must be zero.
+   \end{enumerate}
+\item The value of \textbf{sign} must be \textbf{MP\_ZPOS} if \textbf{used} is zero; 
+this represents the mp\_int value of zero.
+\end{enumerate}
+
+\section{Argument Passing}
+A convention of argument passing must be adopted early on in the development of any library.  Making the function 
+prototypes consistent will help eliminate many headaches in the future as the library grows to significant complexity.  
+In LibTomMath the multiple precision integer functions accept parameters from left to right as pointers to mp\_int 
+structures.  That means that the source (input) operands are placed on the left and the destination (output) on the right.   
+Consider the following examples.
+
+\begin{verbatim}
+   mp_mul(&a, &b, &c);   /* c = a * b */
+   mp_add(&a, &b, &a);   /* a = a + b */
+   mp_sqr(&a, &b);       /* b = a * a */
+\end{verbatim}
+
+The left to right order is a fairly natural way to implement the functions since it lets the developer read aloud the
+functions and make sense of them.  For example, the first function would read ``multiply a and b and store in c''.
+
+Certain libraries (\textit{LIP by Lenstra for instance}) accept parameters the other way around, to mimic the order
+of assignment expressions.  That is, the destination (output) is on the left and arguments (inputs) are on the right.  In 
+truth, it is entirely a matter of preference.  In the case of LibTomMath the convention from the MPI library has been 
+adopted.  
+
+Another very useful design consideration, provided for in LibTomMath, is whether to allow argument sources to also be a 
+destination.  For example, the second example (\textit{mp\_add}) adds $a$ to $b$ and stores in $a$.  This is an important 
+feature to implement since it allows the calling functions to cut down on the number of variables it must maintain.  
+However, to implement this feature specific care has to be given to ensure the destination is not modified before the 
+source is fully read.
+
+\section{Return Values}
+A well implemented application, no matter what its purpose, should trap as many runtime errors as possible and return them 
+to the caller.  By catching runtime errors a library can be guaranteed to prevent undefined behaviour.  However, the end 
+developer can still manage to cause a library to crash.  For example, by passing an invalid pointer an application may
+fault by dereferencing memory not owned by the application.
+
+In the case of LibTomMath the only errors that are checked for are related to inappropriate inputs (division by zero for 
+instance) and memory allocation errors.  It will not check that the mp\_int passed to any function is valid nor 
+will it check pointers for validity.  Any function that can cause a runtime error will return an error code as an 
+\textbf{int} data type with one of the following values (fig \ref{fig:errcodes}).
+
+\index{MP\_OKAY} \index{MP\_VAL} \index{MP\_MEM}
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{|l|l|}
+\hline \textbf{Value} & \textbf{Meaning} \\
+\hline \textbf{MP\_OKAY} & The function was successful \\
+\hline \textbf{MP\_VAL}  & One of the input value(s) was invalid \\
+\hline \textbf{MP\_MEM}  & The function ran out of heap memory \\
+\hline
+\end{tabular}
+\end{center}
+\caption{LibTomMath Error Codes}
+\label{fig:errcodes}
+\end{figure}
+
+When an error is detected within a function it should free any memory it allocated, often during the initialization of
+temporary mp\_ints, and return as soon as possible.  The goal is to leave the system in the same state it was when the 
+function was called.  Error checking with this style of API is fairly simple.
+
+\begin{verbatim}
+   int err;
+   if ((err = mp_add(&a, &b, &c)) != MP_OKAY) {
+      printf("Error: %s\n", mp_error_to_string(err));
+      exit(EXIT_FAILURE);
+   }
+\end{verbatim}
+
+The GMP \cite{GMP} library uses C style \textit{signals} to flag errors which is of questionable use.  Not all errors are fatal 
+and it was not deemed ideal by the author of LibTomMath to force developers to have signal handlers for such cases.
+
+\section{Initialization and Clearing}
+The logical starting point when actually writing multiple precision integer functions is the initialization and 
+clearing of the mp\_int structures.  These two algorithms will be used by the majority of the higher level algorithms.
+
+Given the basic mp\_int structure an initialization routine must first allocate memory to hold the digits of
+the integer.  Often it is optimal to allocate a sufficiently large pre-set number of digits even though
+the initial integer will represent zero.  If only a single digit were allocated quite a few subsequent re-allocations
+would occur when operations are performed on the integers.  There is a tradeoff between how many default digits to allocate
+and how many re-allocations are tolerable.  Obviously allocating an excessive amount of digits initially will waste 
+memory and become unmanageable.  
+
+If the memory for the digits has been successfully allocated then the rest of the members of the structure must
+be initialized.  Since the initial state of an mp\_int is to represent the zero integer, the allocated digits must be set
+to zero.  The \textbf{used} count set to zero and \textbf{sign} set to \textbf{MP\_ZPOS}.
+
+\subsection{Initializing an mp\_int}
+An mp\_int is said to be initialized if it is set to a valid, preferably default, state such that all of the members of the
+structure are set to valid values.  The mp\_init algorithm will perform such an action.
+
+\index{mp\_init}
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Allocate memory and initialize $a$ to a known valid mp\_int state.  \\
+\hline \\
+1.  Allocate memory for \textbf{MP\_PREC} digits. \\
+2.  If the allocation failed return(\textit{MP\_MEM}) \\
+3.  for $n$ from $0$ to $MP\_PREC - 1$ do  \\
+\hspace{3mm}3.1  $a_n \leftarrow 0$\\
+4.  $a.sign \leftarrow MP\_ZPOS$\\
+5.  $a.used \leftarrow 0$\\
+6.  $a.alloc \leftarrow MP\_PREC$\\
+7.  Return(\textit{MP\_OKAY})\\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_init}
+\end{figure}
+
+\textbf{Algorithm mp\_init.}
+The purpose of this function is to initialize an mp\_int structure so that the rest of the library can properly
+manipulte it.  It is assumed that the input may not have had any of its members previously initialized which is certainly
+a valid assumption if the input resides on the stack.  
+
+Before any of the members such as \textbf{sign}, \textbf{used} or \textbf{alloc} are initialized the memory for
+the digits is allocated.  If this fails the function returns before setting any of the other members.  The \textbf{MP\_PREC} 
+name represents a constant\footnote{Defined in the ``tommath.h'' header file within LibTomMath.} 
+used to dictate the minimum precision of newly initialized mp\_int integers.  Ideally, it is at least equal to the smallest
+precision number you'll be working with.
+
+Allocating a block of digits at first instead of a single digit has the benefit of lowering the number of usually slow
+heap operations later functions will have to perform in the future.  If \textbf{MP\_PREC} is set correctly the slack 
+memory and the number of heap operations will be trivial.
+
+Once the allocation has been made the digits have to be set to zero as well as the \textbf{used}, \textbf{sign} and
+\textbf{alloc} members initialized.  This ensures that the mp\_int will always represent the default state of zero regardless
+of the original condition of the input.
+
+\textbf{Remark.}
+This function introduces the idiosyncrasy that all iterative loops, commonly initiated with the ``for'' keyword, iterate incrementally
+when the ``to'' keyword is placed between two expressions.  For example, ``for $a$ from $b$ to $c$ do'' means that
+a subsequent expression (or body of expressions) are to be evaluated upto $c - b$ times so long as $b \le c$.  In each
+iteration the variable $a$ is substituted for a new integer that lies inclusively between $b$ and $c$.  If $b > c$ occured
+the loop would not iterate.  By contrast if the ``downto'' keyword were used in place of ``to'' the loop would iterate 
+decrementally.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_init.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* init a new mp_int */
+018   int mp_init (mp_int * a)
+019   \{
+020     int i;
+021   
+022     /* allocate memory required and clear it */
+023     a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * MP_PREC);
+024     if (a->dp == NULL) \{
+025       return MP_MEM;
+026     \}
+027   
+028     /* set the digits to zero */
+029     for (i = 0; i < MP_PREC; i++) \{
+030         a->dp[i] = 0;
+031     \}
+032   
+033     /* set the used to zero, allocated digits to the default precision
+034      * and sign to positive */
+035     a->used  = 0;
+036     a->alloc = MP_PREC;
+037     a->sign  = MP_ZPOS;
+038   
+039     return MP_OKAY;
+040   \}
+041   #endif
+\end{alltt}
+\end{small}
+
+One immediate observation of this initializtion function is that it does not return a pointer to a mp\_int structure.  It 
+is assumed that the caller has already allocated memory for the mp\_int structure, typically on the application stack.  The 
+call to mp\_init() is used only to initialize the members of the structure to a known default state.  
+
+Here we see (line 23) the memory allocation is performed first.  This allows us to exit cleanly and quickly
+if there is an error.  If the allocation fails the routine will return \textbf{MP\_MEM} to the caller to indicate there
+was a memory error.  The function XMALLOC is what actually allocates the memory.  Technically XMALLOC is not a function
+but a macro defined in ``tommath.h``.  By default, XMALLOC will evaluate to malloc() which is the C library's built--in
+memory allocation routine.
+
+In order to assure the mp\_int is in a known state the digits must be set to zero.  On most platforms this could have been
+accomplished by using calloc() instead of malloc().  However,  to correctly initialize a integer type to a given value in a 
+portable fashion you have to actually assign the value.  The for loop (line 29) performs this required
+operation.
+
+After the memory has been successfully initialized the remainder of the members are initialized 
+(lines 33 through 34) to their respective default states.  At this point the algorithm has succeeded and
+a success code is returned to the calling function.  If this function returns \textbf{MP\_OKAY} it is safe to assume the 
+mp\_int structure has been properly initialized and is safe to use with other functions within the library.  
+
+\subsection{Clearing an mp\_int}
+When an mp\_int is no longer required by the application, the memory that has been allocated for its digits must be 
+returned to the application's memory pool with the mp\_clear algorithm.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_clear}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  The memory for $a$ shall be deallocated.  \\
+\hline \\
+1.  If $a$ has been previously freed then return(\textit{MP\_OKAY}). \\
+2.  for $n$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}2.1  $a_n \leftarrow 0$ \\
+3.  Free the memory allocated for the digits of $a$. \\
+4.  $a.used \leftarrow 0$ \\
+5.  $a.alloc \leftarrow 0$ \\
+6.  $a.sign \leftarrow MP\_ZPOS$ \\
+7.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_clear}
+\end{figure}
+
+\textbf{Algorithm mp\_clear.}
+This algorithm accomplishes two goals.  First, it clears the digits and the other mp\_int members.  This ensures that 
+if a developer accidentally re-uses a cleared structure it is less likely to cause problems.  The second goal
+is to free the allocated memory.
+
+The logic behind the algorithm is extended by marking cleared mp\_int structures so that subsequent calls to this
+algorithm will not try to free the memory multiple times.  Cleared mp\_ints are detectable by having a pre-defined invalid 
+digit pointer \textbf{dp} setting.
+
+Once an mp\_int has been cleared the mp\_int structure is no longer in a valid state for any other algorithm
+with the exception of algorithms mp\_init, mp\_init\_copy, mp\_init\_size and mp\_clear.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_clear.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* clear one (frees)  */
+018   void
+019   mp_clear (mp_int * a)
+020   \{
+021     int i;
+022   
+023     /* only do anything if a hasn't been freed previously */
+024     if (a->dp != NULL) \{
+025       /* first zero the digits */
+026       for (i = 0; i < a->used; i++) \{
+027           a->dp[i] = 0;
+028       \}
+029   
+030       /* free ram */
+031       XFREE(a->dp);
+032   
+033       /* reset members to make debugging easier */
+034       a->dp    = NULL;
+035       a->alloc = a->used = 0;
+036       a->sign  = MP_ZPOS;
+037     \}
+038   \}
+039   #endif
+\end{alltt}
+\end{small}
+
+The algorithm only operates on the mp\_int if it hasn't been previously cleared.  The if statement (line 24)
+checks to see if the \textbf{dp} member is not \textbf{NULL}.  If the mp\_int is a valid mp\_int then \textbf{dp} cannot be
+\textbf{NULL} in which case the if statement will evaluate to true.
+
+The digits of the mp\_int are cleared by the for loop (line 26) which assigns a zero to every digit.  Similar to mp\_init()
+the digits are assigned zero instead of using block memory operations (such as memset()) since this is more portable.  
+
+The digits are deallocated off the heap via the XFREE macro.  Similar to XMALLOC the XFREE macro actually evaluates to
+a standard C library function.  In this case the free() function.  Since free() only deallocates the memory the pointer
+still has to be reset to \textbf{NULL} manually (line 34).  
+
+Now that the digits have been cleared and deallocated the other members are set to their final values (lines 35 and 36).
+
+\section{Maintenance Algorithms}
+
+The previous sections describes how to initialize and clear an mp\_int structure.  To further support operations
+that are to be performed on mp\_int structures (such as addition and multiplication) the dependent algorithms must be
+able to augment the precision of an mp\_int and 
+initialize mp\_ints with differing initial conditions.  
+
+These algorithms complete the set of low level algorithms required to work with mp\_int structures in the higher level
+algorithms such as addition, multiplication and modular exponentiation.
+
+\subsection{Augmenting an mp\_int's Precision}
+When storing a value in an mp\_int structure, a sufficient number of digits must be available to accomodate the entire 
+result of an operation without loss of precision.  Quite often the size of the array given by the \textbf{alloc} member 
+is large enough to simply increase the \textbf{used} digit count.  However, when the size of the array is too small it 
+must be re-sized appropriately to accomodate the result.  The mp\_grow algorithm will provide this functionality.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_grow}. \\
+\textbf{Input}.   An mp\_int $a$ and an integer $b$. \\
+\textbf{Output}.  $a$ is expanded to accomodate $b$ digits. \\
+\hline \\
+1.  if $a.alloc \ge b$ then return(\textit{MP\_OKAY}) \\
+2.  $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\
+3.  $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
+4.  Re-allocate the array of digits $a$ to size $v$ \\
+5.  If the allocation failed then return(\textit{MP\_MEM}). \\
+6.  for n from a.alloc to $v - 1$ do  \\
+\hspace{+3mm}6.1  $a_n \leftarrow 0$ \\
+7.  $a.alloc \leftarrow v$ \\
+8.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_grow}
+\end{figure}
+
+\textbf{Algorithm mp\_grow.}
+It is ideal to prevent re-allocations from being performed if they are not required (step one).  This is useful to 
+prevent mp\_ints from growing excessively in code that erroneously calls mp\_grow.  
+
+The requested digit count is padded up to next multiple of \textbf{MP\_PREC} plus an additional \textbf{MP\_PREC} (steps two and three).  
+This helps prevent many trivial reallocations that would grow an mp\_int by trivially small values.  
+
+It is assumed that the reallocation (step four) leaves the lower $a.alloc$ digits of the mp\_int intact.  This is much 
+akin to how the \textit{realloc} function from the standard C library works.  Since the newly allocated digits are 
+assumed to contain undefined values they are initially set to zero.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_grow.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* grow as required */
+018   int mp_grow (mp_int * a, int size)
+019   \{
+020     int     i;
+021     mp_digit *tmp;
+022   
+023     /* if the alloc size is smaller alloc more ram */
+024     if (a->alloc < size) \{
+025       /* ensure there are always at least MP_PREC digits extra on top */
+026       size += (MP_PREC * 2) - (size % MP_PREC);
+027   
+028       /* reallocate the array a->dp
+029        *
+030        * We store the return in a temporary variable
+031        * in case the operation failed we don't want
+032        * to overwrite the dp member of a.
+033        */
+034       tmp = OPT_CAST(mp_digit) XREALLOC (a->dp, sizeof (mp_digit) * size);
+035       if (tmp == NULL) \{
+036         /* reallocation failed but "a" is still valid [can be freed] */
+037         return MP_MEM;
+038       \}
+039   
+040       /* reallocation succeeded so set a->dp */
+041       a->dp = tmp;
+042   
+043       /* zero excess digits */
+044       i        = a->alloc;
+045       a->alloc = size;
+046       for (; i < a->alloc; i++) \{
+047         a->dp[i] = 0;
+048       \}
+049     \}
+050     return MP_OKAY;
+051   \}
+052   #endif
+\end{alltt}
+\end{small}
+
+A quick optimization is to first determine if a memory re-allocation is required at all.  The if statement (line 24) checks
+if the \textbf{alloc} member of the mp\_int is smaller than the requested digit count.  If the count is not larger than \textbf{alloc}
+the function skips the re-allocation part thus saving time.
+
+When a re-allocation is performed it is turned into an optimal request to save time in the future.  The requested digit count is
+padded upwards to 2nd multiple of \textbf{MP\_PREC} larger than \textbf{alloc} (line 26).  The XREALLOC function is used
+to re-allocate the memory.  As per the other functions XREALLOC is actually a macro which evaluates to realloc by default.  The realloc
+function leaves the base of the allocation intact which means the first \textbf{alloc} digits of the mp\_int are the same as before
+the re-allocation.  All	that is left is to clear the newly allocated digits and return.
+
+Note that the re-allocation result is actually stored in a temporary pointer $tmp$.  This is to allow this function to return
+an error with a valid pointer.  Earlier releases of the library stored the result of XREALLOC into the mp\_int $a$.  That would
+result in a memory leak if XREALLOC ever failed.  
+
+\subsection{Initializing Variable Precision mp\_ints}
+Occasionally the number of digits required will be known in advance of an initialization, based on, for example, the size 
+of input mp\_ints to a given algorithm.  The purpose of algorithm mp\_init\_size is similar to mp\_init except that it 
+will allocate \textit{at least} a specified number of digits.  
+
+\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init\_size}. \\
+\textbf{Input}.   An mp\_int $a$ and the requested number of digits $b$. \\
+\textbf{Output}.  $a$ is initialized to hold at least $b$ digits. \\
+\hline \\
+1.  $u \leftarrow b \mbox{ (mod }MP\_PREC\mbox{)}$ \\
+2.  $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
+3.  Allocate $v$ digits. \\
+4.  for $n$ from $0$ to $v - 1$ do \\
+\hspace{3mm}4.1  $a_n \leftarrow 0$ \\
+5.  $a.sign \leftarrow MP\_ZPOS$\\
+6.  $a.used \leftarrow 0$\\
+7.  $a.alloc \leftarrow v$\\
+8.  Return(\textit{MP\_OKAY})\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_init\_size}
+\end{figure}
+
+\textbf{Algorithm mp\_init\_size.}
+This algorithm will initialize an mp\_int structure $a$ like algorithm mp\_init with the exception that the number of 
+digits allocated can be controlled by the second input argument $b$.  The input size is padded upwards so it is a 
+multiple of \textbf{MP\_PREC} plus an additional \textbf{MP\_PREC} digits.  This padding is used to prevent trivial 
+allocations from becoming a bottleneck in the rest of the algorithms.
+
+Like algorithm mp\_init, the mp\_int structure is initialized to a default state representing the integer zero.  This 
+particular algorithm is useful if it is known ahead of time the approximate size of the input.  If the approximation is
+correct no further memory re-allocations are required to work with the mp\_int.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_init\_size.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* init an mp_init for a given size */
+018   int mp_init_size (mp_int * a, int size)
+019   \{
+020     int x;
+021   
+022     /* pad size so there are always extra digits */
+023     size += (MP_PREC * 2) - (size % MP_PREC);    
+024     
+025     /* alloc mem */
+026     a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * size);
+027     if (a->dp == NULL) \{
+028       return MP_MEM;
+029     \}
+030   
+031     /* set the members */
+032     a->used  = 0;
+033     a->alloc = size;
+034     a->sign  = MP_ZPOS;
+035   
+036     /* zero the digits */
+037     for (x = 0; x < size; x++) \{
+038         a->dp[x] = 0;
+039     \}
+040   
+041     return MP_OKAY;
+042   \}
+043   #endif
+\end{alltt}
+\end{small}
+
+The number of digits $b$ requested is padded (line 23) by first augmenting it to the next multiple of 
+\textbf{MP\_PREC} and then adding \textbf{MP\_PREC} to the result.  If the memory can be successfully allocated the 
+mp\_int is placed in a default state representing the integer zero.  Otherwise, the error code \textbf{MP\_MEM} will be 
+returned (line 28).  
+
+The digits are allocated and set to zero at the same time with the calloc() function (line @25,XCALLOC@).  The 
+\textbf{used} count is set to zero, the \textbf{alloc} count set to the padded digit count and the \textbf{sign} flag set 
+to \textbf{MP\_ZPOS} to achieve a default valid mp\_int state (lines 32, 33 and 34).  If the function 
+returns succesfully then it is correct to assume that the mp\_int structure is in a valid state for the remainder of the 
+functions to work with.
+
+\subsection{Multiple Integer Initializations and Clearings}
+Occasionally a function will require a series of mp\_int data types to be made available simultaneously.  
+The purpose of algorithm mp\_init\_multi is to initialize a variable length array of mp\_int structures in a single
+statement.  It is essentially a shortcut to multiple initializations.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init\_multi}. \\
+\textbf{Input}.   Variable length array $V_k$ of mp\_int variables of length $k$. \\
+\textbf{Output}.  The array is initialized such that each mp\_int of $V_k$ is ready to use. \\
+\hline \\
+1.  for $n$ from 0 to $k - 1$ do \\
+\hspace{+3mm}1.1.  Initialize the mp\_int $V_n$ (\textit{mp\_init}) \\
+\hspace{+3mm}1.2.  If initialization failed then do \\
+\hspace{+6mm}1.2.1.  for $j$ from $0$ to $n$ do \\
+\hspace{+9mm}1.2.1.1.  Free the mp\_int $V_j$ (\textit{mp\_clear}) \\
+\hspace{+6mm}1.2.2.   Return(\textit{MP\_MEM}) \\
+2.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_init\_multi}
+\end{figure}
+
+\textbf{Algorithm mp\_init\_multi.}
+The algorithm will initialize the array of mp\_int variables one at a time.  If a runtime error has been detected 
+(\textit{step 1.2}) all of the previously initialized variables are cleared.  The goal is an ``all or nothing'' 
+initialization which allows for quick recovery from runtime errors.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_init\_multi.c
+\vspace{-3mm}
+\begin{alltt}
+016   #include <stdarg.h>
+017   
+018   int mp_init_multi(mp_int *mp, ...) 
+019   \{
+020       mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
+021       int n = 0;                 /* Number of ok inits */
+022       mp_int* cur_arg = mp;
+023       va_list args;
+024   
+025       va_start(args, mp);        /* init args to next argument from caller */
+026       while (cur_arg != NULL) \{
+027           if (mp_init(cur_arg) != MP_OKAY) \{
+028               /* Oops - error! Back-track and mp_clear what we already
+029                  succeeded in init-ing, then return error.
+030               */
+031               va_list clean_args;
+032               
+033               /* end the current list */
+034               va_end(args);
+035               
+036               /* now start cleaning up */            
+037               cur_arg = mp;
+038               va_start(clean_args, mp);
+039               while (n--) \{
+040                   mp_clear(cur_arg);
+041                   cur_arg = va_arg(clean_args, mp_int*);
+042               \}
+043               va_end(clean_args);
+044               res = MP_MEM;
+045               break;
+046           \}
+047           n++;
+048           cur_arg = va_arg(args, mp_int*);
+049       \}
+050       va_end(args);
+051       return res;                /* Assumed ok, if error flagged above. */
+052   \}
+053   
+054   #endif
+\end{alltt}
+\end{small}
+
+This function intializes a variable length list of mp\_int structure pointers.  However, instead of having the mp\_int
+structures in an actual C array they are simply passed as arguments to the function.  This function makes use of the 
+``...'' argument syntax of the C programming language.  The list is terminated with a final \textbf{NULL} argument 
+appended on the right.  
+
+The function uses the ``stdarg.h'' \textit{va} functions to step portably through the arguments to the function.  A count
+$n$ of succesfully initialized mp\_int structures is maintained (line 47) such that if a failure does occur,
+the algorithm can backtrack and free the previously initialized structures (lines 27 to 46).  
+
+
+\subsection{Clamping Excess Digits}
+When a function anticipates a result will be $n$ digits it is simpler to assume this is true within the body of 
+the function instead of checking during the computation.  For example, a multiplication of a $i$ digit number by a 
+$j$ digit produces a result of at most $i + j$ digits.  It is entirely possible that the result is $i + j - 1$ 
+though, with no final carry into the last position.  However, suppose the destination had to be first expanded 
+(\textit{via mp\_grow}) to accomodate $i + j - 1$ digits than further expanded to accomodate the final carry.  
+That would be a considerable waste of time since heap operations are relatively slow.
+
+The ideal solution is to always assume the result is $i + j$ and fix up the \textbf{used} count after the function
+terminates.  This way a single heap operation (\textit{at most}) is required.  However, if the result was not checked
+there would be an excess high order zero digit.  
+
+For example, suppose the product of two integers was $x_n = (0x_{n-1}x_{n-2}...x_0)_{\beta}$.  The leading zero digit 
+will not contribute to the precision of the result.  In fact, through subsequent operations more leading zero digits would
+accumulate to the point the size of the integer would be prohibitive.  As a result even though the precision is very 
+low the representation is excessively large.  
+
+The mp\_clamp algorithm is designed to solve this very problem.  It will trim high-order zeros by decrementing the 
+\textbf{used} count until a non-zero most significant digit is found.  Also in this system, zero is considered to be a 
+positive number which means that if the \textbf{used} count is decremented to zero, the sign must be set to 
+\textbf{MP\_ZPOS}.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_clamp}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Any excess leading zero digits of $a$ are removed \\
+\hline \\
+1.  while $a.used > 0$ and $a_{a.used - 1} = 0$ do \\
+\hspace{+3mm}1.1  $a.used \leftarrow a.used - 1$ \\
+2.  if $a.used = 0$ then do \\
+\hspace{+3mm}2.1  $a.sign \leftarrow MP\_ZPOS$ \\
+\hline \\
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_clamp}
+\end{figure}
+
+\textbf{Algorithm mp\_clamp.}
+As can be expected this algorithm is very simple.  The loop on step one is expected to iterate only once or twice at
+the most.  For example, this will happen in cases where there is not a carry to fill the last position.  Step two fixes the sign for 
+when all of the digits are zero to ensure that the mp\_int is valid at all times.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_clamp.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* trim unused digits 
+018    *
+019    * This is used to ensure that leading zero digits are
+020    * trimed and the leading "used" digit will be non-zero
+021    * Typically very fast.  Also fixes the sign if there
+022    * are no more leading digits
+023    */
+024   void
+025   mp_clamp (mp_int * a)
+026   \{
+027     /* decrease used while the most significant digit is
+028      * zero.
+029      */
+030     while (a->used > 0 && a->dp[a->used - 1] == 0) \{
+031       --(a->used);
+032     \}
+033   
+034     /* reset the sign flag if used == 0 */
+035     if (a->used == 0) \{
+036       a->sign = MP_ZPOS;
+037     \}
+038   \}
+039   #endif
+\end{alltt}
+\end{small}
+
+Note on line 27 how to test for the \textbf{used} count is made on the left of the \&\& operator.  In the C programming
+language the terms to \&\& are evaluated left to right with a boolean short-circuit if any condition fails.  This is 
+important since if the \textbf{used} is zero the test on the right would fetch below the array.  That is obviously 
+undesirable.  The parenthesis on line 30 is used to make sure the \textbf{used} count is decremented and not
+the pointer ``a''.  
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 1 \right ]$ & Discuss the relevance of the \textbf{used} member of the mp\_int structure. \\
+                     & \\
+$\left [ 1 \right ]$ & Discuss the consequences of not using padding when performing allocations.  \\
+                     & \\
+$\left [ 2 \right ]$ & Estimate an ideal value for \textbf{MP\_PREC} when performing 1024-bit RSA \\
+                     & encryption when $\beta = 2^{28}$.  \\
+                     & \\
+$\left [ 1 \right ]$ & Discuss the relevance of the algorithm mp\_clamp.  What does it prevent? \\
+                     & \\
+$\left [ 1 \right ]$ & Give an example of when the algorithm  mp\_init\_copy might be useful. \\
+                     & \\
+\end{tabular}
+
+
+%%%
+% CHAPTER FOUR
+%%%
+
+\chapter{Basic Operations}
+
+\section{Introduction}
+In the previous chapter a series of low level algorithms were established that dealt with initializing and maintaining
+mp\_int structures.  This chapter will discuss another set of seemingly non-algebraic algorithms which will form the low 
+level basis of the entire library.  While these algorithm are relatively trivial it is important to understand how they
+work before proceeding since these algorithms will be used almost intrinsically in the following chapters.
+
+The algorithms in this chapter deal primarily with more ``programmer'' related tasks such as creating copies of
+mp\_int structures, assigning small values to mp\_int structures and comparisons of the values mp\_int structures
+represent.   
+
+\section{Assigning Values to mp\_int Structures}
+\subsection{Copying an mp\_int}
+Assigning the value that a given mp\_int structure represents to another mp\_int structure shall be known as making
+a copy for the purposes of this text.  The copy of the mp\_int will be a separate entity that represents the same
+value as the mp\_int it was copied from.  The mp\_copy algorithm provides this functionality. 
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_copy}. \\
+\textbf{Input}.  An mp\_int $a$ and $b$. \\
+\textbf{Output}.  Store a copy of $a$ in $b$. \\
+\hline \\
+1.  If $b.alloc < a.used$ then grow $b$ to $a.used$ digits.  (\textit{mp\_grow}) \\
+2.  for $n$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}2.1  $b_{n} \leftarrow a_{n}$ \\
+3.  for $n$ from $a.used$ to $b.used - 1$ do \\
+\hspace{3mm}3.1  $b_{n} \leftarrow 0$ \\
+4.  $b.used \leftarrow a.used$ \\
+5.  $b.sign \leftarrow a.sign$ \\
+6.  return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_copy}
+\end{figure}
+
+\textbf{Algorithm mp\_copy.}
+This algorithm copies the mp\_int $a$ such that upon succesful termination of the algorithm the mp\_int $b$ will
+represent the same integer as the mp\_int $a$.  The mp\_int $b$ shall be a complete and distinct copy of the 
+mp\_int $a$ meaing that the mp\_int $a$ can be modified and it shall not affect the value of the mp\_int $b$.
+
+If $b$ does not have enough room for the digits of $a$ it must first have its precision augmented via the mp\_grow 
+algorithm.  The digits of $a$ are copied over the digits of $b$ and any excess digits of $b$ are set to zero (step two
+and three).  The \textbf{used} and \textbf{sign} members of $a$ are finally copied over the respective members of
+$b$.
+
+\textbf{Remark.}  This algorithm also introduces a new idiosyncrasy that will be used throughout the rest of the
+text.  The error return codes of other algorithms are not explicitly checked in the pseudo-code presented.  For example, in 
+step one of the mp\_copy algorithm the return of mp\_grow is not explicitly checked to ensure it succeeded.  Text space is 
+limited so it is assumed that if a algorithm fails it will clear all temporarily allocated mp\_ints and return
+the error code itself.  However, the C code presented will demonstrate all of the error handling logic required to 
+implement the pseudo-code.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_copy.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* copy, b = a */
+018   int
+019   mp_copy (mp_int * a, mp_int * b)
+020   \{
+021     int     res, n;
+022   
+023     /* if dst == src do nothing */
+024     if (a == b) \{
+025       return MP_OKAY;
+026     \}
+027   
+028     /* grow dest */
+029     if (b->alloc < a->used) \{
+030        if ((res = mp_grow (b, a->used)) != MP_OKAY) \{
+031           return res;
+032        \}
+033     \}
+034   
+035     /* zero b and copy the parameters over */
+036     \{
+037       register mp_digit *tmpa, *tmpb;
+038   
+039       /* pointer aliases */
+040   
+041       /* source */
+042       tmpa = a->dp;
+043   
+044       /* destination */
+045       tmpb = b->dp;
+046   
+047       /* copy all the digits */
+048       for (n = 0; n < a->used; n++) \{
+049         *tmpb++ = *tmpa++;
+050       \}
+051   
+052       /* clear high digits */
+053       for (; n < b->used; n++) \{
+054         *tmpb++ = 0;
+055       \}
+056     \}
+057   
+058     /* copy used count and sign */
+059     b->used = a->used;
+060     b->sign = a->sign;
+061     return MP_OKAY;
+062   \}
+063   #endif
+\end{alltt}
+\end{small}
+
+Occasionally a dependent algorithm may copy an mp\_int effectively into itself such as when the input and output
+mp\_int structures passed to a function are one and the same.  For this case it is optimal to return immediately without 
+copying digits (line 24).  
+
+The mp\_int $b$ must have enough digits to accomodate the used digits of the mp\_int $a$.  If $b.alloc$ is less than
+$a.used$ the algorithm mp\_grow is used to augment the precision of $b$ (lines 29 to 33).  In order to
+simplify the inner loop that copies the digits from $a$ to $b$, two aliases $tmpa$ and $tmpb$ point directly at the digits
+of the mp\_ints $a$ and $b$ respectively.  These aliases (lines 42 and 45) allow the compiler to access the digits without first dereferencing the
+mp\_int pointers and then subsequently the pointer to the digits.  
+
+After the aliases are established the digits from $a$ are copied into $b$ (lines 48 to 50) and then the excess 
+digits of $b$ are set to zero (lines 53 to 55).  Both ``for'' loops make use of the pointer aliases and in 
+fact the alias for $b$ is carried through into the second ``for'' loop to clear the excess digits.  This optimization 
+allows the alias to stay in a machine register fairly easy between the two loops.
+
+\textbf{Remarks.}  The use of pointer aliases is an implementation methodology first introduced in this function that will
+be used considerably in other functions.  Technically, a pointer alias is simply a short hand alias used to lower the 
+number of pointer dereferencing operations required to access data.  For example, a for loop may resemble
+
+\begin{alltt}
+for (x = 0; x < 100; x++) \{
+    a->num[4]->dp[x] = 0;
+\}
+\end{alltt}
+
+This could be re-written using aliases as 
+
+\begin{alltt}
+mp_digit *tmpa;
+a = a->num[4]->dp;
+for (x = 0; x < 100; x++) \{
+    *a++ = 0;
+\}
+\end{alltt}
+
+In this case an alias is used to access the 
+array of digits within an mp\_int structure directly.  It may seem that a pointer alias is strictly not required 
+as a compiler may optimize out the redundant pointer operations.  However, there are two dominant reasons to use aliases.
+
+The first reason is that most compilers will not effectively optimize pointer arithmetic.  For example, some optimizations 
+may work for the Microsoft Visual C++ compiler (MSVC) and not for the GNU C Compiler (GCC).  Also some optimizations may 
+work for GCC and not MSVC.  As such it is ideal to find a common ground for as many compilers as possible.  Pointer 
+aliases optimize the code considerably before the compiler even reads the source code which means the end compiled code 
+stands a better chance of being faster.
+
+The second reason is that pointer aliases often can make an algorithm simpler to read.  Consider the first ``for'' 
+loop of the function mp\_copy() re-written to not use pointer aliases.
+
+\begin{alltt}
+    /* copy all the digits */
+    for (n = 0; n < a->used; n++) \{
+      b->dp[n] = a->dp[n];
+    \}
+\end{alltt}
+
+Whether this code is harder to read depends strongly on the individual.  However, it is quantifiably slightly more 
+complicated as there are four variables within the statement instead of just two.
+
+\subsubsection{Nested Statements}
+Another commonly used technique in the source routines is that certain sections of code are nested.  This is used in
+particular with the pointer aliases to highlight code phases.  For example, a Comba multiplier (discussed in chapter six)
+will typically have three different phases.  First the temporaries are initialized, then the columns calculated and 
+finally the carries are propagated.  In this example the middle column production phase will typically be nested as it
+uses temporary variables and aliases the most.
+
+The nesting also simplies the source code as variables that are nested are only valid for their scope.  As a result
+the various temporary variables required do not propagate into other sections of code.
+
+
+\subsection{Creating a Clone}
+Another common operation is to make a local temporary copy of an mp\_int argument.  To initialize an mp\_int 
+and then copy another existing mp\_int into the newly intialized mp\_int will be known as creating a clone.  This is 
+useful within functions that need to modify an argument but do not wish to actually modify the original copy.  The 
+mp\_init\_copy algorithm has been designed to help perform this task.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init\_copy}. \\
+\textbf{Input}.   An mp\_int $a$ and $b$\\
+\textbf{Output}.  $a$ is initialized to be a copy of $b$. \\
+\hline \\
+1.  Init $a$.  (\textit{mp\_init}) \\
+2.  Copy $b$ to $a$.  (\textit{mp\_copy}) \\
+3.  Return the status of the copy operation. \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_init\_copy}
+\end{figure}
+
+\textbf{Algorithm mp\_init\_copy.}
+This algorithm will initialize an mp\_int variable and copy another previously initialized mp\_int variable into it.  As 
+such this algorithm will perform two operations in one step.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_init\_copy.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* creates "a" then copies b into it */
+018   int mp_init_copy (mp_int * a, mp_int * b)
+019   \{
+020     int     res;
+021   
+022     if ((res = mp_init (a)) != MP_OKAY) \{
+023       return res;
+024     \}
+025     return mp_copy (b, a);
+026   \}
+027   #endif
+\end{alltt}
+\end{small}
+
+This will initialize \textbf{a} and make it a verbatim copy of the contents of \textbf{b}.  Note that 
+\textbf{a} will have its own memory allocated which means that \textbf{b} may be cleared after the call
+and \textbf{a} will be left intact.  
+
+\section{Zeroing an Integer}
+Reseting an mp\_int to the default state is a common step in many algorithms.  The mp\_zero algorithm will be the algorithm used to
+perform this task.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_zero}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Zero the contents of $a$ \\
+\hline \\
+1.  $a.used \leftarrow 0$ \\
+2.  $a.sign \leftarrow$ MP\_ZPOS \\
+3.  for $n$ from 0 to $a.alloc - 1$ do \\
+\hspace{3mm}3.1  $a_n \leftarrow 0$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_zero}
+\end{figure}
+
+\textbf{Algorithm mp\_zero.}
+This algorithm simply resets a mp\_int to the default state.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_zero.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* set to zero */
+018   void mp_zero (mp_int * a)
+019   \{
+020     int       n;
+021     mp_digit *tmp;
+022   
+023     a->sign = MP_ZPOS;
+024     a->used = 0;
+025   
+026     tmp = a->dp;
+027     for (n = 0; n < a->alloc; n++) \{
+028        *tmp++ = 0;
+029     \}
+030   \}
+031   #endif
+\end{alltt}
+\end{small}
+
+After the function is completed, all of the digits are zeroed, the \textbf{used} count is zeroed and the 
+\textbf{sign} variable is set to \textbf{MP\_ZPOS}.
+
+\section{Sign Manipulation}
+\subsection{Absolute Value}
+With the mp\_int representation of an integer, calculating the absolute value is trivial.  The mp\_abs algorithm will compute
+the absolute value of an mp\_int.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_abs}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Computes $b = \vert a \vert$ \\
+\hline \\
+1.  Copy $a$ to $b$.  (\textit{mp\_copy}) \\
+2.  If the copy failed return(\textit{MP\_MEM}). \\
+3.  $b.sign \leftarrow MP\_ZPOS$ \\
+4.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_abs}
+\end{figure}
+
+\textbf{Algorithm mp\_abs.}
+This algorithm computes the absolute of an mp\_int input.  First it copies $a$ over $b$.  This is an example of an
+algorithm where the check in mp\_copy that determines if the source and destination are equal proves useful.  This allows,
+for instance, the developer to pass the same mp\_int as the source and destination to this function without addition 
+logic to handle it.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_abs.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* b = |a| 
+018    *
+019    * Simple function copies the input and fixes the sign to positive
+020    */
+021   int
+022   mp_abs (mp_int * a, mp_int * b)
+023   \{
+024     int     res;
+025   
+026     /* copy a to b */
+027     if (a != b) \{
+028        if ((res = mp_copy (a, b)) != MP_OKAY) \{
+029          return res;
+030        \}
+031     \}
+032   
+033     /* force the sign of b to positive */
+034     b->sign = MP_ZPOS;
+035   
+036     return MP_OKAY;
+037   \}
+038   #endif
+\end{alltt}
+\end{small}
+
+This fairly trivial algorithm first eliminates non--required duplications (line 27) and then sets the
+\textbf{sign} flag to \textbf{MP\_ZPOS}.
+
+\subsection{Integer Negation}
+With the mp\_int representation of an integer, calculating the negation is also trivial.  The mp\_neg algorithm will compute
+the negative of an mp\_int input.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_neg}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Computes $b = -a$ \\
+\hline \\
+1.  Copy $a$ to $b$.  (\textit{mp\_copy}) \\
+2.  If the copy failed return(\textit{MP\_MEM}). \\
+3.  If $a.used = 0$ then return(\textit{MP\_OKAY}). \\
+4.  If $a.sign = MP\_ZPOS$ then do \\
+\hspace{3mm}4.1  $b.sign = MP\_NEG$. \\
+5.  else do \\
+\hspace{3mm}5.1  $b.sign = MP\_ZPOS$. \\
+6.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_neg}
+\end{figure}
+
+\textbf{Algorithm mp\_neg.}
+This algorithm computes the negation of an input.  First it copies $a$ over $b$.  If $a$ has no used digits then
+the algorithm returns immediately.  Otherwise it flips the sign flag and stores the result in $b$.  Note that if 
+$a$ had no digits then it must be positive by definition.  Had step three been omitted then the algorithm would return
+zero as negative.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_neg.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* b = -a */
+018   int mp_neg (mp_int * a, mp_int * b)
+019   \{
+020     int     res;
+021     if (a != b) \{
+022        if ((res = mp_copy (a, b)) != MP_OKAY) \{
+023           return res;
+024        \}
+025     \}
+026   
+027     if (mp_iszero(b) != MP_YES) \{
+028        b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+029     \} else \{
+030        b->sign = MP_ZPOS;
+031     \}
+032   
+033     return MP_OKAY;
+034   \}
+035   #endif
+\end{alltt}
+\end{small}
+
+Like mp\_abs() this function avoids non--required duplications (line 21) and then sets the sign.  We
+have to make sure that only non--zero values get a \textbf{sign} of \textbf{MP\_NEG}.  If the mp\_int is zero
+than the \textbf{sign} is hard--coded to \textbf{MP\_ZPOS}.
+
+\section{Small Constants}
+\subsection{Setting Small Constants}
+Often a mp\_int must be set to a relatively small value such as $1$ or $2$.  For these cases the mp\_set algorithm is useful.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_set}. \\
+\textbf{Input}.   An mp\_int $a$ and a digit $b$ \\
+\textbf{Output}.  Make $a$ equivalent to $b$ \\
+\hline \\
+1.  Zero $a$ (\textit{mp\_zero}). \\
+2.  $a_0 \leftarrow b \mbox{ (mod }\beta\mbox{)}$ \\
+3.  $a.used \leftarrow  \left \lbrace \begin{array}{ll}
+                              1 &  \mbox{if }a_0 > 0 \\
+                              0 &  \mbox{if }a_0 = 0 
+                              \end{array} \right .$ \\
+\hline                              
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_set}
+\end{figure}
+
+\textbf{Algorithm mp\_set.}
+This algorithm sets a mp\_int to a small single digit value.  Step number 1 ensures that the integer is reset to the default state.  The
+single digit is set (\textit{modulo $\beta$}) and the \textbf{used} count is adjusted accordingly.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_set.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* set to a digit */
+018   void mp_set (mp_int * a, mp_digit b)
+019   \{
+020     mp_zero (a);
+021     a->dp[0] = b & MP_MASK;
+022     a->used  = (a->dp[0] != 0) ? 1 : 0;
+023   \}
+024   #endif
+\end{alltt}
+\end{small}
+
+First we zero (line 20) the mp\_int to make sure that the other members are initialized for a 
+small positive constant.  mp\_zero() ensures that the \textbf{sign} is positive and the \textbf{used} count
+is zero.  Next we set the digit and reduce it modulo $\beta$ (line 21).  After this step we have to 
+check if the resulting digit is zero or not.  If it is not then we set the \textbf{used} count to one, otherwise
+to zero.
+
+We can quickly reduce modulo $\beta$ since it is of the form $2^k$ and a quick binary AND operation with 
+$2^k - 1$ will perform the same operation.
+
+One important limitation of this function is that it will only set one digit.  The size of a digit is not fixed, meaning source that uses 
+this function should take that into account.  Only trivially small constants can be set using this function.
+
+\subsection{Setting Large Constants}
+To overcome the limitations of the mp\_set algorithm the mp\_set\_int algorithm is ideal.  It accepts a ``long''
+data type as input and will always treat it as a 32-bit integer.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_set\_int}. \\
+\textbf{Input}.   An mp\_int $a$ and a ``long'' integer $b$ \\
+\textbf{Output}.  Make $a$ equivalent to $b$ \\
+\hline \\
+1.  Zero $a$ (\textit{mp\_zero}) \\
+2.  for $n$ from 0 to 7 do \\
+\hspace{3mm}2.1  $a \leftarrow a \cdot 16$ (\textit{mp\_mul2d}) \\
+\hspace{3mm}2.2  $u \leftarrow \lfloor b / 2^{4(7 - n)} \rfloor \mbox{ (mod }16\mbox{)}$\\
+\hspace{3mm}2.3  $a_0 \leftarrow a_0 + u$ \\
+\hspace{3mm}2.4  $a.used \leftarrow a.used + 1$ \\
+3.  Clamp excess used digits (\textit{mp\_clamp}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_set\_int}
+\end{figure}
+
+\textbf{Algorithm mp\_set\_int.}
+The algorithm performs eight iterations of a simple loop where in each iteration four bits from the source are added to the 
+mp\_int.  Step 2.1 will multiply the current result by sixteen making room for four more bits in the less significant positions.  In step 2.2 the
+next four bits from the source are extracted and are added to the mp\_int. The \textbf{used} digit count is 
+incremented to reflect the addition.  The \textbf{used} digit counter is incremented since if any of the leading digits were zero the mp\_int would have
+zero digits used and the newly added four bits would be ignored.
+
+Excess zero digits are trimmed in steps 2.1 and 3 by using higher level algorithms mp\_mul2d and mp\_clamp.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_set\_int.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* set a 32-bit const */
+018   int mp_set_int (mp_int * a, unsigned long b)
+019   \{
+020     int     x, res;
+021   
+022     mp_zero (a);
+023     
+024     /* set four bits at a time */
+025     for (x = 0; x < 8; x++) \{
+026       /* shift the number up four bits */
+027       if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) \{
+028         return res;
+029       \}
+030   
+031       /* OR in the top four bits of the source */
+032       a->dp[0] |= (b >> 28) & 15;
+033   
+034       /* shift the source up to the next four bits */
+035       b <<= 4;
+036   
+037       /* ensure that digits are not clamped off */
+038       a->used += 1;
+039     \}
+040     mp_clamp (a);
+041     return MP_OKAY;
+042   \}
+043   #endif
+\end{alltt}
+\end{small}
+
+This function sets four bits of the number at a time to handle all practical \textbf{DIGIT\_BIT} sizes.  The weird
+addition on line 38 ensures that the newly added in bits are added to the number of digits.  While it may not 
+seem obvious as to why the digit counter does not grow exceedingly large it is because of the shift on line 27 
+as well as the  call to mp\_clamp() on line 40.  Both functions will clamp excess leading digits which keeps 
+the number of used digits low.
+
+\section{Comparisons}
+\subsection{Unsigned Comparisions}
+Comparing a multiple precision integer is performed with the exact same algorithm used to compare two decimal numbers.  For example,
+to compare $1,234$ to $1,264$ the digits are extracted by their positions.  That is we compare $1 \cdot 10^3 + 2 \cdot 10^2 + 3 \cdot 10^1 + 4 \cdot 10^0$
+to $1 \cdot 10^3 + 2 \cdot 10^2 + 6 \cdot 10^1 + 4 \cdot 10^0$ by comparing single digits at a time starting with the highest magnitude 
+positions.  If any leading digit of one integer is greater than a digit in the same position of another integer then obviously it must be greater.  
+
+The first comparision routine that will be developed is the unsigned magnitude compare which will perform a comparison based on the digits of two
+mp\_int variables alone.  It will ignore the sign of the two inputs.  Such a function is useful when an absolute comparison is required or if the 
+signs are known to agree in advance.
+
+To facilitate working with the results of the comparison functions three constants are required.  
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{|r|l|}
+\hline \textbf{Constant} & \textbf{Meaning} \\
+\hline \textbf{MP\_GT} & Greater Than \\
+\hline \textbf{MP\_EQ} & Equal To \\
+\hline \textbf{MP\_LT} & Less Than \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Comparison Return Codes}
+\end{figure}
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_cmp\_mag}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$.  \\
+\textbf{Output}.  Unsigned comparison results ($a$ to the left of $b$). \\
+\hline \\
+1.  If $a.used > b.used$ then return(\textit{MP\_GT}) \\
+2.  If $a.used < b.used$ then return(\textit{MP\_LT}) \\
+3.  for n from $a.used - 1$ to 0 do \\
+\hspace{+3mm}3.1  if $a_n > b_n$ then return(\textit{MP\_GT}) \\
+\hspace{+3mm}3.2  if $a_n < b_n$ then return(\textit{MP\_LT}) \\
+4.  Return(\textit{MP\_EQ}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_cmp\_mag}
+\end{figure}
+
+\textbf{Algorithm mp\_cmp\_mag.}
+By saying ``$a$ to the left of $b$'' it is meant that the comparison is with respect to $a$, that is if $a$ is greater than $b$ it will return
+\textbf{MP\_GT} and similar with respect to when $a = b$ and $a < b$.  The first two steps compare the number of digits used in both $a$ and $b$.  
+Obviously if the digit counts differ there would be an imaginary zero digit in the smaller number where the leading digit of the larger number is.  
+If both have the same number of digits than the actual digits themselves must be compared starting at the leading digit.  
+
+By step three both inputs must have the same number of digits so its safe to start from either $a.used - 1$ or $b.used - 1$ and count down to
+the zero'th digit.  If after all of the digits have been compared, no difference is found, the algorithm returns \textbf{MP\_EQ}.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_cmp\_mag.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* compare maginitude of two ints (unsigned) */
+018   int mp_cmp_mag (mp_int * a, mp_int * b)
+019   \{
+020     int     n;
+021     mp_digit *tmpa, *tmpb;
+022   
+023     /* compare based on # of non-zero digits */
+024     if (a->used > b->used) \{
+025       return MP_GT;
+026     \}
+027     
+028     if (a->used < b->used) \{
+029       return MP_LT;
+030     \}
+031   
+032     /* alias for a */
+033     tmpa = a->dp + (a->used - 1);
+034   
+035     /* alias for b */
+036     tmpb = b->dp + (a->used - 1);
+037   
+038     /* compare based on digits  */
+039     for (n = 0; n < a->used; ++n, --tmpa, --tmpb) \{
+040       if (*tmpa > *tmpb) \{
+041         return MP_GT;
+042       \}
+043   
+044       if (*tmpa < *tmpb) \{
+045         return MP_LT;
+046       \}
+047     \}
+048     return MP_EQ;
+049   \}
+050   #endif
+\end{alltt}
+\end{small}
+
+The two if statements (lines 24 and 28) compare the number of digits in the two inputs.  These two are 
+performed before all of the digits are compared since it is a very cheap test to perform and can potentially save 
+considerable time.  The implementation given is also not valid without those two statements.  $b.alloc$ may be 
+smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the array of digits.
+
+
+
+\subsection{Signed Comparisons}
+Comparing with sign considerations is also fairly critical in several routines (\textit{division for example}).  Based on an unsigned magnitude 
+comparison a trivial signed comparison algorithm can be written.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_cmp}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$ \\
+\textbf{Output}.  Signed Comparison Results ($a$ to the left of $b$) \\
+\hline \\
+1.  if $a.sign = MP\_NEG$ and $b.sign = MP\_ZPOS$ then return(\textit{MP\_LT}) \\
+2.  if $a.sign = MP\_ZPOS$ and $b.sign = MP\_NEG$ then return(\textit{MP\_GT}) \\
+3.  if $a.sign = MP\_NEG$ then \\
+\hspace{+3mm}3.1  Return the unsigned comparison of $b$ and $a$ (\textit{mp\_cmp\_mag}) \\
+4   Otherwise \\
+\hspace{+3mm}4.1  Return the unsigned comparison of $a$ and $b$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_cmp}
+\end{figure}
+
+\textbf{Algorithm mp\_cmp.}
+The first two steps compare the signs of the two inputs.  If the signs do not agree then it can return right away with the appropriate 
+comparison code.  When the signs are equal the digits of the inputs must be compared to determine the correct result.  In step 
+three the unsigned comparision flips the order of the arguments since they are both negative.  For instance, if $-a > -b$ then 
+$\vert a \vert < \vert b \vert$.  Step number four will compare the two when they are both positive.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_cmp.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* compare two ints (signed)*/
+018   int
+019   mp_cmp (mp_int * a, mp_int * b)
+020   \{
+021     /* compare based on sign */
+022     if (a->sign != b->sign) \{
+023        if (a->sign == MP_NEG) \{
+024           return MP_LT;
+025        \} else \{
+026           return MP_GT;
+027        \}
+028     \}
+029     
+030     /* compare digits */
+031     if (a->sign == MP_NEG) \{
+032        /* if negative compare opposite direction */
+033        return mp_cmp_mag(b, a);
+034     \} else \{
+035        return mp_cmp_mag(a, b);
+036     \}
+037   \}
+038   #endif
+\end{alltt}
+\end{small}
+
+The two if statements (lines 22 and 23) perform the initial sign comparison.  If the signs are not the equal then which ever
+has the positive sign is larger.   The inputs are compared (line 31) based on magnitudes.  If the signs were both 
+negative then the unsigned comparison is performed in the opposite direction (line 33).  Otherwise, the signs are assumed to 
+be both positive and a forward direction unsigned comparison is performed.
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 2 \right ]$ & Modify algorithm mp\_set\_int to accept as input a variable length array of bits. \\
+                     & \\
+$\left [ 3 \right ]$ & Give the probability that algorithm mp\_cmp\_mag will have to compare $k$ digits  \\
+                     & of two random digits (of equal magnitude) before a difference is found. \\
+                     & \\
+$\left [ 1 \right ]$ & Suggest a simple method to speed up the implementation of mp\_cmp\_mag based  \\
+                     & on the observations made in the previous problem. \\
+                     &
+\end{tabular}
+
+\chapter{Basic Arithmetic}
+\section{Introduction}
+At this point algorithms for initialization, clearing, zeroing, copying, comparing and setting small constants have been 
+established.  The next logical set of algorithms to develop are addition, subtraction and digit shifting algorithms.  These 
+algorithms make use of the lower level algorithms and are the cruicial building block for the multiplication algorithms.  It is very important 
+that these algorithms are highly optimized.  On their own they are simple $O(n)$ algorithms but they can be called from higher level algorithms 
+which easily places them at $O(n^2)$ or even $O(n^3)$ work levels.  
+
+All of the algorithms within this chapter make use of the logical bit shift operations denoted by $<<$ and $>>$ for left and right 
+logical shifts respectively.  A logical shift is analogous to sliding the decimal point of radix-10 representations.  For example, the real 
+number $0.9345$ is equivalent to $93.45\%$ which is found by sliding the the decimal two places to the right (\textit{multiplying by $\beta^2 = 10^2$}).  
+Algebraically a binary logical shift is equivalent to a division or multiplication by a power of two.  
+For example, $a << k = a \cdot 2^k$ while $a >> k = \lfloor a/2^k \rfloor$.
+
+One significant difference between a logical shift and the way decimals are shifted is that digits below the zero'th position are removed
+from the number.  For example, consider $1101_2 >> 1$ using decimal notation this would produce $110.1_2$.  However, with a logical shift the 
+result is $110_2$.  
+
+\section{Addition and Subtraction}
+In common twos complement fixed precision arithmetic negative numbers are easily represented by subtraction from the modulus.  For example, with 32-bit integers
+$a - b\mbox{ (mod }2^{32}\mbox{)}$ is the same as $a + (2^{32} - b) \mbox{ (mod }2^{32}\mbox{)}$  since $2^{32} \equiv 0 \mbox{ (mod }2^{32}\mbox{)}$.  
+As a result subtraction can be performed with a trivial series of logical operations and an addition.
+
+However, in multiple precision arithmetic negative numbers are not represented in the same way.  Instead a sign flag is used to keep track of the
+sign of the integer.  As a result signed addition and subtraction are actually implemented as conditional usage of lower level addition or 
+subtraction algorithms with the sign fixed up appropriately.
+
+The lower level algorithms will add or subtract integers without regard to the sign flag.  That is they will add or subtract the magnitude of
+the integers respectively.
+
+\subsection{Low Level Addition}
+An unsigned addition of multiple precision integers is performed with the same long-hand algorithm used to add decimal numbers.  That is to add the 
+trailing digits first and propagate the resulting carry upwards.  Since this is a lower level algorithm the name will have a ``s\_'' prefix.  
+Historically that convention stems from the MPI library where ``s\_'' stood for static functions that were hidden from the developer entirely.
+
+\newpage
+\begin{figure}[!here]
+\begin{center}
+\begin{small}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_add}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$ \\
+\textbf{Output}.  The unsigned addition $c = \vert a \vert + \vert b \vert$. \\
+\hline \\
+1.  if $a.used > b.used$ then \\
+\hspace{+3mm}1.1  $min \leftarrow b.used$ \\
+\hspace{+3mm}1.2  $max \leftarrow a.used$ \\
+\hspace{+3mm}1.3  $x   \leftarrow a$ \\
+2.  else  \\
+\hspace{+3mm}2.1  $min \leftarrow a.used$ \\
+\hspace{+3mm}2.2  $max \leftarrow b.used$ \\
+\hspace{+3mm}2.3  $x   \leftarrow b$ \\
+3.  If $c.alloc < max + 1$ then grow $c$ to hold at least $max + 1$ digits (\textit{mp\_grow}) \\
+4.  $oldused \leftarrow c.used$ \\
+5.  $c.used \leftarrow max + 1$ \\
+6.  $u \leftarrow 0$ \\
+7.  for $n$ from $0$ to $min - 1$ do \\
+\hspace{+3mm}7.1  $c_n \leftarrow a_n + b_n + u$ \\
+\hspace{+3mm}7.2  $u \leftarrow c_n >> lg(\beta)$ \\
+\hspace{+3mm}7.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+8.  if $min \ne max$ then do \\
+\hspace{+3mm}8.1  for $n$ from $min$ to $max - 1$ do \\
+\hspace{+6mm}8.1.1  $c_n \leftarrow x_n + u$ \\
+\hspace{+6mm}8.1.2  $u \leftarrow c_n >> lg(\beta)$ \\
+\hspace{+6mm}8.1.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+9.  $c_{max} \leftarrow u$ \\
+10.  if $olduse > max$ then \\
+\hspace{+3mm}10.1  for $n$ from $max + 1$ to $oldused - 1$ do \\
+\hspace{+6mm}10.1.1  $c_n \leftarrow 0$ \\
+11.  Clamp excess digits in $c$.  (\textit{mp\_clamp}) \\
+12.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Algorithm s\_mp\_add}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_add.}
+This algorithm is loosely based on algorithm 14.7 of HAC \cite[pp. 594]{HAC} but has been extended to allow the inputs to have different magnitudes.  
+Coincidentally the description of algorithm A in Knuth \cite[pp. 266]{TAOCPV2} shares the same deficiency as the algorithm from \cite{HAC}.  Even the 
+MIX pseudo  machine code presented by Knuth \cite[pp. 266-267]{TAOCPV2} is incapable of handling inputs which are of different magnitudes.
+
+The first thing that has to be accomplished is to sort out which of the two inputs is the largest.  The addition logic
+will simply add all of the smallest input to the largest input and store that first part of the result in the
+destination.  Then it will apply a simpler addition loop to excess digits of the larger input.
+
+The first two steps will handle sorting the inputs such that $min$ and $max$ hold the digit counts of the two 
+inputs.  The variable $x$ will be an mp\_int alias for the largest input or the second input $b$ if they have the
+same number of digits.  After the inputs are sorted the destination $c$ is grown as required to accomodate the sum 
+of the two inputs.  The original \textbf{used} count of $c$ is copied and set to the new used count.  
+
+At this point the first addition loop will go through as many digit positions that both inputs have.  The carry
+variable $\mu$ is set to zero outside the loop.  Inside the loop an ``addition'' step requires three statements to produce
+one digit of the summand.  First
+two digits from $a$ and $b$ are added together along with the carry $\mu$.  The carry of this step is extracted and stored
+in $\mu$ and finally the digit of the result $c_n$ is truncated within the range $0 \le c_n < \beta$.
+
+Now all of the digit positions that both inputs have in common have been exhausted.  If $min \ne max$ then $x$ is an alias
+for one of the inputs that has more digits.  A simplified addition loop is then used to essentially copy the remaining digits
+and the carry to the destination.
+
+The final carry is stored in $c_{max}$ and digits above $max$ upto $oldused$ are zeroed which completes the addition.
+
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_add.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* low level addition, based on HAC pp.594, Algorithm 14.7 */
+018   int
+019   s_mp_add (mp_int * a, mp_int * b, mp_int * c)
+020   \{
+021     mp_int *x;
+022     int     olduse, res, min, max;
+023   
+024     /* find sizes, we let |a| <= |b| which means we have to sort
+025      * them.  "x" will point to the input with the most digits
+026      */
+027     if (a->used > b->used) \{
+028       min = b->used;
+029       max = a->used;
+030       x = a;
+031     \} else \{
+032       min = a->used;
+033       max = b->used;
+034       x = b;
+035     \}
+036   
+037     /* init result */
+038     if (c->alloc < max + 1) \{
+039       if ((res = mp_grow (c, max + 1)) != MP_OKAY) \{
+040         return res;
+041       \}
+042     \}
+043   
+044     /* get old used digit count and set new one */
+045     olduse = c->used;
+046     c->used = max + 1;
+047   
+048     \{
+049       register mp_digit u, *tmpa, *tmpb, *tmpc;
+050       register int i;
+051   
+052       /* alias for digit pointers */
+053   
+054       /* first input */
+055       tmpa = a->dp;
+056   
+057       /* second input */
+058       tmpb = b->dp;
+059   
+060       /* destination */
+061       tmpc = c->dp;
+062   
+063       /* zero the carry */
+064       u = 0;
+065       for (i = 0; i < min; i++) \{
+066         /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
+067         *tmpc = *tmpa++ + *tmpb++ + u;
+068   
+069         /* U = carry bit of T[i] */
+070         u = *tmpc >> ((mp_digit)DIGIT_BIT);
+071   
+072         /* take away carry bit from T[i] */
+073         *tmpc++ &= MP_MASK;
+074       \}
+075   
+076       /* now copy higher words if any, that is in A+B 
+077        * if A or B has more digits add those in 
+078        */
+079       if (min != max) \{
+080         for (; i < max; i++) \{
+081           /* T[i] = X[i] + U */
+082           *tmpc = x->dp[i] + u;
+083   
+084           /* U = carry bit of T[i] */
+085           u = *tmpc >> ((mp_digit)DIGIT_BIT);
+086   
+087           /* take away carry bit from T[i] */
+088           *tmpc++ &= MP_MASK;
+089         \}
+090       \}
+091   
+092       /* add carry */
+093       *tmpc++ = u;
+094   
+095       /* clear digits above oldused */
+096       for (i = c->used; i < olduse; i++) \{
+097         *tmpc++ = 0;
+098       \}
+099     \}
+100   
+101     mp_clamp (c);
+102     return MP_OKAY;
+103   \}
+104   #endif
+\end{alltt}
+\end{small}
+
+We first sort (lines 27 to 35) the inputs based on magnitude and determine the $min$ and $max$ variables.
+Note that $x$ is a pointer to an mp\_int assigned to the largest input, in effect it is a local alias.  Next we
+grow the destination (37 to 42) ensure that it can accomodate the result of the addition. 
+
+Similar to the implementation of mp\_copy this function uses the braced code and local aliases coding style.  The three aliases that are on 
+lines 55, 58 and 61 represent the two inputs and destination variables respectively.  These aliases are used to ensure the
+compiler does not have to dereference $a$, $b$ or $c$ (respectively) to access the digits of the respective mp\_int.
+
+The initial carry $u$ will be cleared (line 64), note that $u$ is of type mp\_digit which ensures type 
+compatibility within the implementation.  The initial addition (line 65 to 74) adds digits from
+both inputs until the smallest input runs out of digits.  Similarly the conditional addition loop
+(line 80 to 90) adds the remaining digits from the larger of the two inputs.  The addition is finished 
+with the final carry being stored in $tmpc$ (line 93).  Note the ``++'' operator within the same expression.
+After line 93, $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$.  This is useful
+for the next loop (line 96 to 99) which set any old upper digits to zero.
+
+\subsection{Low Level Subtraction}
+The low level unsigned subtraction algorithm is very similar to the low level unsigned addition algorithm.  The principle difference is that the
+unsigned subtraction algorithm requires the result to be positive.  That is when computing $a - b$ the condition $\vert a \vert \ge \vert b\vert$ must 
+be met for this algorithm to function properly.  Keep in mind this low level algorithm is not meant to be used in higher level algorithms directly.  
+This algorithm as will be shown can be used to create functional signed addition and subtraction algorithms.
+
+
+For this algorithm a new variable is required to make the description simpler.  Recall from section 1.3.1 that a mp\_digit must be able to represent
+the range $0 \le x < 2\beta$ for the algorithms to work correctly.  However, it is allowable that a mp\_digit represent a larger range of values.  For 
+this algorithm we will assume that the variable $\gamma$ represents the number of bits available in a 
+mp\_digit (\textit{this implies $2^{\gamma} > \beta$}).  
+
+For example, the default for LibTomMath is to use a ``unsigned long'' for the mp\_digit ``type'' while $\beta = 2^{28}$.  In ISO C an ``unsigned long''
+data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma \ge 32$.
+
+\newpage\begin{figure}[!here]
+\begin{center}
+\begin{small}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_sub}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$ ($\vert a \vert \ge \vert b \vert$) \\
+\textbf{Output}.  The unsigned subtraction $c = \vert a \vert - \vert b \vert$. \\
+\hline \\
+1.  $min \leftarrow b.used$ \\
+2.  $max \leftarrow a.used$ \\
+3.  If $c.alloc < max$ then grow $c$ to hold at least $max$ digits.  (\textit{mp\_grow}) \\
+4.  $oldused \leftarrow c.used$ \\ 
+5.  $c.used \leftarrow max$ \\
+6.  $u \leftarrow 0$ \\
+7.  for $n$ from $0$ to $min - 1$ do \\
+\hspace{3mm}7.1  $c_n \leftarrow a_n - b_n - u$ \\
+\hspace{3mm}7.2  $u   \leftarrow c_n >> (\gamma - 1)$ \\
+\hspace{3mm}7.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+8.  if $min < max$ then do \\
+\hspace{3mm}8.1  for $n$ from $min$ to $max - 1$ do \\
+\hspace{6mm}8.1.1  $c_n \leftarrow a_n - u$ \\
+\hspace{6mm}8.1.2  $u   \leftarrow c_n >> (\gamma - 1)$ \\
+\hspace{6mm}8.1.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+9. if $oldused > max$ then do \\
+\hspace{3mm}9.1  for $n$ from $max$ to $oldused - 1$ do \\
+\hspace{6mm}9.1.1  $c_n \leftarrow 0$ \\
+10. Clamp excess digits of $c$.  (\textit{mp\_clamp}). \\
+11. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Algorithm s\_mp\_sub}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_sub.}
+This algorithm performs the unsigned subtraction of two mp\_int variables under the restriction that the result must be positive.  That is when
+passing variables $a$ and $b$ the condition that $\vert a \vert \ge \vert b \vert$ must be met for the algorithm to function correctly.  This
+algorithm is loosely based on algorithm 14.9 \cite[pp. 595]{HAC} and is similar to algorithm S in \cite[pp. 267]{TAOCPV2} as well.  As was the case
+of the algorithm s\_mp\_add both other references lack discussion concerning various practical details such as when the inputs differ in magnitude.
+
+The initial sorting of the inputs is trivial in this algorithm since $a$ is guaranteed to have at least the same magnitude of $b$.  Steps 1 and 2 
+set the $min$ and $max$ variables.  Unlike the addition routine there is guaranteed to be no carry which means that the final result can be at 
+most $max$ digits in length as opposed to $max + 1$.  Similar to the addition algorithm the \textbf{used} count of $c$ is copied locally and 
+set to the maximal count for the operation.
+
+The subtraction loop that begins on step seven is essentially the same as the addition loop of algorithm s\_mp\_add except single precision 
+subtraction is used instead.  Note the use of the $\gamma$ variable to extract the carry (\textit{also known as the borrow}) within the subtraction 
+loops.  Under the assumption that two's complement single precision arithmetic is used this will successfully extract the desired carry.  
+
+For example, consider subtracting $0101_2$ from $0100_2$ where $\gamma = 4$ and $\beta = 2$.  The least significant bit will force a carry upwards to 
+the third bit which will be set to zero after the borrow.  After the very first bit has been subtracted $4 - 1 \equiv 0011_2$ will remain,  When the 
+third bit of $0101_2$ is subtracted from the result it will cause another carry.  In this case though the carry will be forced to propagate all the 
+way to the most significant bit.  
+
+Recall that $\beta < 2^{\gamma}$.  This means that if a carry does occur just before the $lg(\beta)$'th bit it will propagate all the way to the most 
+significant bit.  Thus, the high order bits of the mp\_digit that are not part of the actual digit will either be all zero, or all one. All that
+is needed is a single zero or one bit for the carry.  Therefore a single logical shift right by $\gamma - 1$ positions is sufficient to extract the 
+carry.  This method of carry extraction may seem awkward but the reason for it becomes apparent when the implementation is discussed.  
+
+If $b$ has a smaller magnitude than $a$ then step 9 will force the carry and copy operation to propagate through the larger input $a$ into $c$.  Step
+10 will ensure that any leading digits of $c$ above the $max$'th position are zeroed.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_sub.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
+018   int
+019   s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
+020   \{
+021     int     olduse, res, min, max;
+022   
+023     /* find sizes */
+024     min = b->used;
+025     max = a->used;
+026   
+027     /* init result */
+028     if (c->alloc < max) \{
+029       if ((res = mp_grow (c, max)) != MP_OKAY) \{
+030         return res;
+031       \}
+032     \}
+033     olduse = c->used;
+034     c->used = max;
+035   
+036     \{
+037       register mp_digit u, *tmpa, *tmpb, *tmpc;
+038       register int i;
+039   
+040       /* alias for digit pointers */
+041       tmpa = a->dp;
+042       tmpb = b->dp;
+043       tmpc = c->dp;
+044   
+045       /* set carry to zero */
+046       u = 0;
+047       for (i = 0; i < min; i++) \{
+048         /* T[i] = A[i] - B[i] - U */
+049         *tmpc = *tmpa++ - *tmpb++ - u;
+050   
+051         /* U = carry bit of T[i]
+052          * Note this saves performing an AND operation since
+053          * if a carry does occur it will propagate all the way to the
+054          * MSB.  As a result a single shift is enough to get the carry
+055          */
+056         u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
+057   
+058         /* Clear carry from T[i] */
+059         *tmpc++ &= MP_MASK;
+060       \}
+061   
+062       /* now copy higher words if any, e.g. if A has more digits than B  */
+063       for (; i < max; i++) \{
+064         /* T[i] = A[i] - U */
+065         *tmpc = *tmpa++ - u;
+066   
+067         /* U = carry bit of T[i] */
+068         u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
+069   
+070         /* Clear carry from T[i] */
+071         *tmpc++ &= MP_MASK;
+072       \}
+073   
+074       /* clear digits above used (since we may not have grown result above) */
+      
+075       for (i = c->used; i < olduse; i++) \{
+076         *tmpc++ = 0;
+077       \}
+078     \}
+079   
+080     mp_clamp (c);
+081     return MP_OKAY;
+082   \}
+083   
+084   #endif
+\end{alltt}
+\end{small}
+
+Like low level addition we ``sort'' the inputs.  Except in this case the sorting is hardcoded 
+(lines 24 and 25).  In reality the $min$ and $max$ variables are only aliases and are only 
+used to make the source code easier to read.  Again the pointer alias optimization is used 
+within this algorithm.  The aliases $tmpa$, $tmpb$ and $tmpc$ are initialized
+(lines 41, 42 and 43) for $a$, $b$ and $c$ respectively.
+
+The first subtraction loop (lines 46 through 60) subtract digits from both inputs until the smaller of
+the two inputs has been exhausted.  As remarked earlier there is an implementation reason for using the ``awkward'' 
+method of extracting the carry (line 56).  The traditional method for extracting the carry would be to shift 
+by $lg(\beta)$ positions and logically AND the least significant bit.  The AND operation is required because all of 
+the bits above the $\lg(\beta)$'th bit will be set to one after a carry occurs from subtraction.  This carry 
+extraction requires two relatively cheap operations to extract the carry.  The other method is to simply shift the 
+most significant bit to the least significant bit thus extracting the carry with a single cheap operation.  This 
+optimization only works on twos compliment machines which is a safe assumption to make.
+
+If $a$ has a larger magnitude than $b$ an additional loop (lines 63 through 72) is required to propagate 
+the carry through $a$ and copy the result to $c$.  
+
+\subsection{High Level Addition}
+Now that both lower level addition and subtraction algorithms have been established an effective high level signed addition algorithm can be
+established.  This high level addition algorithm will be what other algorithms and developers will use to perform addition of mp\_int data 
+types.  
+
+Recall from section 5.2 that an mp\_int represents an integer with an unsigned mantissa (\textit{the array of digits}) and a \textbf{sign} 
+flag.  A high level addition is actually performed as a series of eight separate cases which can be optimized down to three unique cases.
+
+\begin{figure}[!here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_add}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$  \\
+\textbf{Output}.  The signed addition $c = a + b$. \\
+\hline \\
+1.  if $a.sign = b.sign$ then do \\
+\hspace{3mm}1.1  $c.sign \leftarrow a.sign$  \\
+\hspace{3mm}1.2  $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add})\\
+2.  else do \\
+\hspace{3mm}2.1  if $\vert a \vert < \vert b \vert$ then do (\textit{mp\_cmp\_mag})  \\
+\hspace{6mm}2.1.1  $c.sign \leftarrow b.sign$ \\
+\hspace{6mm}2.1.2  $c \leftarrow \vert b \vert - \vert a \vert$ (\textit{s\_mp\_sub}) \\
+\hspace{3mm}2.2  else do \\
+\hspace{6mm}2.2.1  $c.sign \leftarrow a.sign$ \\
+\hspace{6mm}2.2.2  $c \leftarrow \vert a \vert - \vert b \vert$ \\
+3.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_add}
+\end{figure}
+
+\textbf{Algorithm mp\_add.}
+This algorithm performs the signed addition of two mp\_int variables.  There is no reference algorithm to draw upon from 
+either \cite{TAOCPV2} or \cite{HAC} since they both only provide unsigned operations.  The algorithm is fairly 
+straightforward but restricted since subtraction can only produce positive results.
+
+\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|}
+\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert > \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\
+\hline $+$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $+$ & $+$ & No  & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $-$ & No  & $c = a + b$ & $a.sign$ \\
+\hline &&&&\\
+
+\hline $+$ & $-$ & No  & $c = b - a$ & $b.sign$ \\
+\hline $-$ & $+$ & No  & $c = b - a$ & $b.sign$ \\
+
+\hline &&&&\\
+
+\hline $+$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\
+\hline $-$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\
+
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Addition Guide Chart}
+\label{fig:AddChart}
+\end{figure}
+
+Figure~\ref{fig:AddChart} lists all of the eight possible input combinations and is sorted to show that only three 
+specific cases need to be handled.  The return code of the unsigned operations at step 1.2, 2.1.2 and 2.2.2 are 
+forwarded to step three to check for errors.  This simplifies the description of the algorithm considerably and best 
+follows how the implementation actually was achieved.
+
+Also note how the \textbf{sign} is set before the unsigned addition or subtraction is performed.  Recall from the descriptions of algorithms
+s\_mp\_add and s\_mp\_sub that the mp\_clamp function is used at the end to trim excess digits.  The mp\_clamp algorithm will set the \textbf{sign}
+to \textbf{MP\_ZPOS} when the \textbf{used} digit count reaches zero.
+
+For example, consider performing $-a + a$ with algorithm mp\_add.  By the description of the algorithm the sign is set to \textbf{MP\_NEG} which would
+produce a result of $-0$.  However, since the sign is set first then the unsigned addition is performed the subsequent usage of algorithm mp\_clamp 
+within algorithm s\_mp\_add will force $-0$ to become $0$.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_add.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* high level addition (handles signs) */
+018   int mp_add (mp_int * a, mp_int * b, mp_int * c)
+019   \{
+020     int     sa, sb, res;
+021   
+022     /* get sign of both inputs */
+023     sa = a->sign;
+024     sb = b->sign;
+025   
+026     /* handle two cases, not four */
+027     if (sa == sb) \{
+028       /* both positive or both negative */
+029       /* add their magnitudes, copy the sign */
+030       c->sign = sa;
+031       res = s_mp_add (a, b, c);
+032     \} else \{
+033       /* one positive, the other negative */
+034       /* subtract the one with the greater magnitude from */
+035       /* the one of the lesser magnitude.  The result gets */
+036       /* the sign of the one with the greater magnitude. */
+037       if (mp_cmp_mag (a, b) == MP_LT) \{
+038         c->sign = sb;
+039         res = s_mp_sub (b, a, c);
+040       \} else \{
+041         c->sign = sa;
+042         res = s_mp_sub (a, b, c);
+043       \}
+044     \}
+045     return res;
+046   \}
+047   
+048   #endif
+\end{alltt}
+\end{small}
+
+The source code follows the algorithm fairly closely.  The most notable new source code addition is the usage of the $res$ integer variable which
+is used to pass result of the unsigned operations forward.  Unlike in the algorithm, the variable $res$ is merely returned as is without
+explicitly checking it and returning the constant \textbf{MP\_OKAY}.  The observation is this algorithm will succeed or fail only if the lower
+level functions do so.  Returning their return code is sufficient.
+
+\subsection{High Level Subtraction}
+The high level signed subtraction algorithm is essentially the same as the high level signed addition algorithm.  
+
+\newpage\begin{figure}[!here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_sub}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$  \\
+\textbf{Output}.  The signed subtraction $c = a - b$. \\
+\hline \\
+1.  if $a.sign \ne b.sign$ then do \\
+\hspace{3mm}1.1  $c.sign \leftarrow a.sign$ \\
+\hspace{3mm}1.2  $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add}) \\
+2.  else do \\
+\hspace{3mm}2.1  if $\vert a \vert \ge \vert b \vert$ then do (\textit{mp\_cmp\_mag}) \\
+\hspace{6mm}2.1.1  $c.sign \leftarrow a.sign$ \\
+\hspace{6mm}2.1.2  $c \leftarrow \vert a \vert  - \vert b \vert$ (\textit{s\_mp\_sub}) \\
+\hspace{3mm}2.2  else do \\
+\hspace{6mm}2.2.1  $c.sign \leftarrow  \left \lbrace \begin{array}{ll}
+                              MP\_ZPOS &  \mbox{if }a.sign = MP\_NEG \\
+                              MP\_NEG  &  \mbox{otherwise} \\
+                              \end{array} \right .$ \\
+\hspace{6mm}2.2.2  $c \leftarrow \vert b \vert  - \vert a \vert$ \\
+3.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_sub}
+\end{figure}
+
+\textbf{Algorithm mp\_sub.}
+This algorithm performs the signed subtraction of two inputs.  Similar to algorithm mp\_add there is no reference in either \cite{TAOCPV2} or 
+\cite{HAC}.  Also this algorithm is restricted by algorithm s\_mp\_sub.  Chart \ref{fig:SubChart} lists the eight possible inputs and
+the operations required.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|}
+\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert \ge \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\
+\hline $+$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $+$ & $-$ & No  & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $+$ & No  & $c = a + b$ & $a.sign$ \\
+\hline &&&& \\
+\hline $+$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\
+\hline $-$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\
+\hline &&&& \\
+\hline $+$ & $+$ & No  & $c = b - a$ & $\mbox{opposite of }a.sign$ \\
+\hline $-$ & $-$ & No  & $c = b - a$ & $\mbox{opposite of }a.sign$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Subtraction Guide Chart}
+\label{fig:SubChart}
+\end{figure}
+
+Similar to the case of algorithm mp\_add the \textbf{sign} is set first before the unsigned addition or subtraction.  That is to prevent the 
+algorithm from producing $-a - -a = -0$ as a result.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_sub.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* high level subtraction (handles signs) */
+018   int
+019   mp_sub (mp_int * a, mp_int * b, mp_int * c)
+020   \{
+021     int     sa, sb, res;
+022   
+023     sa = a->sign;
+024     sb = b->sign;
+025   
+026     if (sa != sb) \{
+027       /* subtract a negative from a positive, OR */
+028       /* subtract a positive from a negative. */
+029       /* In either case, ADD their magnitudes, */
+030       /* and use the sign of the first number. */
+031       c->sign = sa;
+032       res = s_mp_add (a, b, c);
+033     \} else \{
+034       /* subtract a positive from a positive, OR */
+035       /* subtract a negative from a negative. */
+036       /* First, take the difference between their */
+037       /* magnitudes, then... */
+038       if (mp_cmp_mag (a, b) != MP_LT) \{
+039         /* Copy the sign from the first */
+040         c->sign = sa;
+041         /* The first has a larger or equal magnitude */
+042         res = s_mp_sub (a, b, c);
+043       \} else \{
+044         /* The result has the *opposite* sign from */
+045         /* the first number. */
+046         c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+047         /* The second has a larger magnitude */
+048         res = s_mp_sub (b, a, c);
+049       \}
+050     \}
+051     return res;
+052   \}
+053   
+054   #endif
+\end{alltt}
+\end{small}
+
+Much like the implementation of algorithm mp\_add the variable $res$ is used to catch the return code of the unsigned addition or subtraction operations
+and forward it to the end of the function.  On line 38 the ``not equal to'' \textbf{MP\_LT} expression is used to emulate a 
+``greater than or equal to'' comparison.  
+
+\section{Bit and Digit Shifting}
+It is quite common to think of a multiple precision integer as a polynomial in $x$, that is $y = f(\beta)$ where $f(x) = \sum_{i=0}^{n-1} a_i x^i$.  
+This notation arises within discussion of Montgomery and Diminished Radix Reduction as well as Karatsuba multiplication and squaring.  
+
+In order to facilitate operations on polynomials in $x$ as above a series of simple ``digit'' algorithms have to be established.  That is to shift
+the digits left or right as well to shift individual bits of the digits left and right.  It is important to note that not all ``shift'' operations
+are on radix-$\beta$ digits.  
+
+\subsection{Multiplication by Two}
+
+In a binary system where the radix is a power of two multiplication by two not only arises often in other algorithms it is a fairly efficient 
+operation to perform.  A single precision logical shift left is sufficient to multiply a single digit by two.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul\_2}. \\
+\textbf{Input}.   One mp\_int $a$ \\
+\textbf{Output}.  $b = 2a$. \\
+\hline \\
+1.  If $b.alloc < a.used + 1$ then grow $b$ to hold $a.used + 1$ digits.  (\textit{mp\_grow}) \\
+2.  $oldused \leftarrow b.used$ \\
+3.  $b.used \leftarrow a.used$ \\
+4.  $r \leftarrow 0$ \\
+5.  for $n$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}5.1  $rr \leftarrow a_n >> (lg(\beta) - 1)$ \\
+\hspace{3mm}5.2  $b_n \leftarrow (a_n << 1) + r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}5.3  $r \leftarrow rr$ \\
+6.  If $r \ne 0$ then do \\
+\hspace{3mm}6.1  $b_{n + 1} \leftarrow r$ \\
+\hspace{3mm}6.2  $b.used \leftarrow b.used + 1$ \\
+7.  If $b.used < oldused - 1$ then do \\
+\hspace{3mm}7.1  for $n$ from $b.used$ to $oldused - 1$ do \\
+\hspace{6mm}7.1.1  $b_n \leftarrow 0$ \\
+8.  $b.sign \leftarrow a.sign$ \\
+9.  Return(\textit{MP\_OKAY}).\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul\_2}
+\end{figure}
+
+\textbf{Algorithm mp\_mul\_2.}
+This algorithm will quickly multiply a mp\_int by two provided $\beta$ is a power of two.  Neither \cite{TAOCPV2} nor \cite{HAC} describe such 
+an algorithm despite the fact it arises often in other algorithms.  The algorithm is setup much like the lower level algorithm s\_mp\_add since 
+it is for all intents and purposes equivalent to the operation $b = \vert a \vert + \vert a \vert$.  
+
+Step 1 and 2 grow the input as required to accomodate the maximum number of \textbf{used} digits in the result.  The initial \textbf{used} count
+is set to $a.used$ at step 4.  Only if there is a final carry will the \textbf{used} count require adjustment.
+
+Step 6 is an optimization implementation of the addition loop for this specific case.  That is since the two values being added together 
+are the same there is no need to perform two reads from the digits of $a$.  Step 6.1 performs a single precision shift on the current digit $a_n$ to
+obtain what will be the carry for the next iteration.  Step 6.2 calculates the $n$'th digit of the result as single precision shift of $a_n$ plus
+the previous carry.  Recall from section 4.1 that $a_n << 1$ is equivalent to $a_n \cdot 2$.  An iteration of the addition loop is finished with 
+forwarding the carry to the next iteration.
+
+Step 7 takes care of any final carry by setting the $a.used$'th digit of the result to the carry and augmenting the \textbf{used} count of $b$.  
+Step 8 clears any leading digits of $b$ in case it originally had a larger magnitude than $a$.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_mul\_2.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* b = a*2 */
+018   int mp_mul_2(mp_int * a, mp_int * b)
+019   \{
+020     int     x, res, oldused;
+021   
+022     /* grow to accomodate result */
+023     if (b->alloc < a->used + 1) \{
+024       if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) \{
+025         return res;
+026       \}
+027     \}
+028   
+029     oldused = b->used;
+030     b->used = a->used;
+031   
+032     \{
+033       register mp_digit r, rr, *tmpa, *tmpb;
+034   
+035       /* alias for source */
+036       tmpa = a->dp;
+037       
+038       /* alias for dest */
+039       tmpb = b->dp;
+040   
+041       /* carry */
+042       r = 0;
+043       for (x = 0; x < a->used; x++) \{
+044       
+045         /* get what will be the *next* carry bit from the 
+046          * MSB of the current digit 
+047          */
+048         rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
+049         
+050         /* now shift up this digit, add in the carry [from the previous] */
+051         *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
+052         
+053         /* copy the carry that would be from the source 
+054          * digit into the next iteration 
+055          */
+056         r = rr;
+057       \}
+058   
+059       /* new leading digit? */
+060       if (r != 0) \{
+061         /* add a MSB which is always 1 at this point */
+062         *tmpb = 1;
+063         ++(b->used);
+064       \}
+065   
+066       /* now zero any excess digits on the destination 
+067        * that we didn't write to 
+068        */
+069       tmpb = b->dp + b->used;
+070       for (x = b->used; x < oldused; x++) \{
+071         *tmpb++ = 0;
+072       \}
+073     \}
+074     b->sign = a->sign;
+075     return MP_OKAY;
+076   \}
+077   #endif
+\end{alltt}
+\end{small}
+
+This implementation is essentially an optimized implementation of s\_mp\_add for the case of doubling an input.  The only noteworthy difference
+is the use of the logical shift operator on line 51 to perform a single precision doubling.  
+
+\subsection{Division by Two}
+A division by two can just as easily be accomplished with a logical shift right as multiplication by two can be with a logical shift left.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div\_2}. \\
+\textbf{Input}.   One mp\_int $a$ \\
+\textbf{Output}.  $b = a/2$. \\
+\hline \\
+1.  If $b.alloc < a.used$ then grow $b$ to hold $a.used$ digits.  (\textit{mp\_grow}) \\
+2.  If the reallocation failed return(\textit{MP\_MEM}). \\
+3.  $oldused \leftarrow b.used$ \\
+4.  $b.used \leftarrow a.used$ \\
+5.  $r \leftarrow 0$ \\
+6.  for $n$ from $b.used - 1$ to $0$ do \\
+\hspace{3mm}6.1  $rr \leftarrow a_n \mbox{ (mod }2\mbox{)}$\\
+\hspace{3mm}6.2  $b_n \leftarrow (a_n >> 1) + (r << (lg(\beta) - 1)) \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}6.3  $r \leftarrow rr$ \\
+7.  If $b.used < oldused - 1$ then do \\
+\hspace{3mm}7.1  for $n$ from $b.used$ to $oldused - 1$ do \\
+\hspace{6mm}7.1.1  $b_n \leftarrow 0$ \\
+8.  $b.sign \leftarrow a.sign$ \\
+9.  Clamp excess digits of $b$.  (\textit{mp\_clamp}) \\
+10.  Return(\textit{MP\_OKAY}).\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div\_2}
+\end{figure}
+
+\textbf{Algorithm mp\_div\_2.}
+This algorithm will divide an mp\_int by two using logical shifts to the right.  Like mp\_mul\_2 it uses a modified low level addition
+core as the basis of the algorithm.  Unlike mp\_mul\_2 the shift operations work from the leading digit to the trailing digit.  The algorithm
+could be written to work from the trailing digit to the leading digit however, it would have to stop one short of $a.used - 1$ digits to prevent
+reading past the end of the array of digits.
+
+Essentially the loop at step 6 is similar to that of mp\_mul\_2 except the logical shifts go in the opposite direction and the carry is at the 
+least significant bit not the most significant bit.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_div\_2.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* b = a/2 */
+018   int mp_div_2(mp_int * a, mp_int * b)
+019   \{
+020     int     x, res, oldused;
+021   
+022     /* copy */
+023     if (b->alloc < a->used) \{
+024       if ((res = mp_grow (b, a->used)) != MP_OKAY) \{
+025         return res;
+026       \}
+027     \}
+028   
+029     oldused = b->used;
+030     b->used = a->used;
+031     \{
+032       register mp_digit r, rr, *tmpa, *tmpb;
+033   
+034       /* source alias */
+035       tmpa = a->dp + b->used - 1;
+036   
+037       /* dest alias */
+038       tmpb = b->dp + b->used - 1;
+039   
+040       /* carry */
+041       r = 0;
+042       for (x = b->used - 1; x >= 0; x--) \{
+043         /* get the carry for the next iteration */
+044         rr = *tmpa & 1;
+045   
+046         /* shift the current digit, add in carry and store */
+047         *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
+048   
+049         /* forward carry to next iteration */
+050         r = rr;
+051       \}
+052   
+053       /* zero excess digits */
+054       tmpb = b->dp + b->used;
+055       for (x = b->used; x < oldused; x++) \{
+056         *tmpb++ = 0;
+057       \}
+058     \}
+059     b->sign = a->sign;
+060     mp_clamp (b);
+061     return MP_OKAY;
+062   \}
+063   #endif
+\end{alltt}
+\end{small}
+
+\section{Polynomial Basis Operations}
+Recall from section 4.3 that any integer can be represented as a polynomial in $x$ as $y = f(\beta)$.  Such a representation is also known as
+the polynomial basis \cite[pp. 48]{ROSE}. Given such a notation a multiplication or division by $x$ amounts to shifting whole digits a single 
+place.  The need for such operations arises in several other higher level algorithms such as Barrett and Montgomery reduction, integer
+division and Karatsuba multiplication.  
+
+Converting from an array of digits to polynomial basis is very simple.  Consider the integer $y \equiv (a_2, a_1, a_0)_{\beta}$ and recall that
+$y = \sum_{i=0}^{2} a_i \beta^i$.  Simply replace $\beta$ with $x$ and the expression is in polynomial basis.  For example, $f(x) = 8x + 9$ is the
+polynomial basis representation for $89$ using radix ten.  That is, $f(10) = 8(10) + 9 = 89$.  
+
+\subsection{Multiplication by $x$}
+
+Given a polynomial in $x$ such as $f(x) = a_n x^n + a_{n-1} x^{n-1} + ... + a_0$ multiplying by $x$ amounts to shifting the coefficients up one 
+degree.  In this case $f(x) \cdot x = a_n x^{n+1} + a_{n-1} x^n + ... + a_0 x$.  From a scalar basis point of view multiplying by $x$ is equivalent to
+multiplying by the integer $\beta$.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_lshd}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $a \leftarrow a \cdot \beta^b$ (equivalent to multiplication by $x^b$). \\
+\hline \\
+1.  If $b \le 0$ then return(\textit{MP\_OKAY}). \\
+2.  If $a.alloc < a.used + b$ then grow $a$ to at least $a.used + b$ digits.  (\textit{mp\_grow}). \\
+3.  If the reallocation failed return(\textit{MP\_MEM}). \\
+4.  $a.used \leftarrow a.used + b$ \\
+5.  $i \leftarrow a.used - 1$ \\
+6.  $j \leftarrow a.used - 1 - b$ \\
+7.  for $n$ from $a.used - 1$ to $b$ do \\
+\hspace{3mm}7.1  $a_{i} \leftarrow a_{j}$ \\
+\hspace{3mm}7.2  $i \leftarrow i - 1$ \\
+\hspace{3mm}7.3  $j \leftarrow j - 1$ \\
+8.  for $n$ from 0 to $b - 1$ do \\
+\hspace{3mm}8.1  $a_n \leftarrow 0$ \\
+9.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_lshd}
+\end{figure}
+
+\textbf{Algorithm mp\_lshd.}
+This algorithm multiplies an mp\_int by the $b$'th power of $x$.  This is equivalent to multiplying by $\beta^b$.  The algorithm differs 
+from the other algorithms presented so far as it performs the operation in place instead storing the result in a separate location.  The
+motivation behind this change is due to the way this function is typically used.  Algorithms such as mp\_add store the result in an optionally
+different third mp\_int because the original inputs are often still required.  Algorithm mp\_lshd (\textit{and similarly algorithm mp\_rshd}) is
+typically used on values where the original value is no longer required.  The algorithm will return success immediately if 
+$b \le 0$ since the rest of algorithm is only valid when $b > 0$.  
+
+First the destination $a$ is grown as required to accomodate the result.  The counters $i$ and $j$ are used to form a \textit{sliding window} over
+the digits of $a$ of length $b$.  The head of the sliding window is at $i$ (\textit{the leading digit}) and the tail at $j$ (\textit{the trailing digit}).  
+The loop on step 7 copies the digit from the tail to the head.  In each iteration the window is moved down one digit.   The last loop on 
+step 8 sets the lower $b$ digits to zero.
+
+\newpage
+\begin{center}
+\begin{figure}[here]
+\includegraphics{pics/sliding_window.ps}
+\caption{Sliding Window Movement}
+\label{pic:sliding_window}
+\end{figure}
+\end{center}
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_lshd.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* shift left a certain amount of digits */
+018   int mp_lshd (mp_int * a, int b)
+019   \{
+020     int     x, res;
+021   
+022     /* if its less than zero return */
+023     if (b <= 0) \{
+024       return MP_OKAY;
+025     \}
+026   
+027     /* grow to fit the new digits */
+028     if (a->alloc < a->used + b) \{
+029        if ((res = mp_grow (a, a->used + b)) != MP_OKAY) \{
+030          return res;
+031        \}
+032     \}
+033   
+034     \{
+035       register mp_digit *top, *bottom;
+036   
+037       /* increment the used by the shift amount then copy upwards */
+038       a->used += b;
+039   
+040       /* top */
+041       top = a->dp + a->used - 1;
+042   
+043       /* base */
+044       bottom = a->dp + a->used - 1 - b;
+045   
+046       /* much like mp_rshd this is implemented using a sliding window
+047        * except the window goes the otherway around.  Copying from
+048        * the bottom to the top.  see bn_mp_rshd.c for more info.
+049        */
+050       for (x = a->used - 1; x >= b; x--) \{
+051         *top-- = *bottom--;
+052       \}
+053   
+054       /* zero the lower digits */
+055       top = a->dp;
+056       for (x = 0; x < b; x++) \{
+057         *top++ = 0;
+058       \}
+059     \}
+060     return MP_OKAY;
+061   \}
+062   #endif
+\end{alltt}
+\end{small}
+
+The if statement (line 23) ensures that the $b$ variable is greater than zero since we do not interpret negative
+shift counts properly.  The \textbf{used} count is incremented by $b$ before the copy loop begins.  This elminates 
+the need for an additional variable in the for loop.  The variable $top$ (line 41) is an alias
+for the leading digit while $bottom$ (line 44) is an alias for the trailing edge.  The aliases form a 
+window of exactly $b$ digits over the input.  
+
+\subsection{Division by $x$}
+
+Division by powers of $x$ is easily achieved by shifting the digits right and removing any that will end up to the right of the zero'th digit.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_rshd}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $a \leftarrow a / \beta^b$ (Divide by $x^b$). \\
+\hline \\
+1.  If $b \le 0$ then return. \\
+2.  If $a.used \le b$ then do \\
+\hspace{3mm}2.1  Zero $a$.  (\textit{mp\_zero}). \\
+\hspace{3mm}2.2  Return. \\
+3.  $i \leftarrow 0$ \\
+4.  $j \leftarrow b$ \\
+5.  for $n$ from 0 to $a.used - b - 1$ do \\
+\hspace{3mm}5.1  $a_i \leftarrow a_j$ \\
+\hspace{3mm}5.2  $i \leftarrow i + 1$ \\
+\hspace{3mm}5.3  $j \leftarrow j + 1$ \\
+6.  for $n$ from $a.used - b$ to $a.used - 1$ do \\
+\hspace{3mm}6.1  $a_n \leftarrow 0$ \\
+7.  $a.used \leftarrow a.used - b$ \\
+8.  Return. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_rshd}
+\end{figure}
+
+\textbf{Algorithm mp\_rshd.}
+This algorithm divides the input in place by the $b$'th power of $x$.  It is analogous to dividing by a $\beta^b$ but much quicker since
+it does not require single precision division.  This algorithm does not actually return an error code as it cannot fail.  
+
+If the input $b$ is less than one the algorithm quickly returns without performing any work.  If the \textbf{used} count is less than or equal
+to the shift count $b$ then it will simply zero the input and return.
+
+After the trivial cases of inputs have been handled the sliding window is setup.  Much like the case of algorithm mp\_lshd a sliding window that
+is $b$ digits wide is used to copy the digits.  Unlike mp\_lshd the window slides in the opposite direction from the trailing to the leading digit.  
+Also the digits are copied from the leading to the trailing edge.
+
+Once the window copy is complete the upper digits must be zeroed and the \textbf{used} count decremented.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_rshd.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* shift right a certain amount of digits */
+018   void mp_rshd (mp_int * a, int b)
+019   \{
+020     int     x;
+021   
+022     /* if b <= 0 then ignore it */
+023     if (b <= 0) \{
+024       return;
+025     \}
+026   
+027     /* if b > used then simply zero it and return */
+028     if (a->used <= b) \{
+029       mp_zero (a);
+030       return;
+031     \}
+032   
+033     \{
+034       register mp_digit *bottom, *top;
+035   
+036       /* shift the digits down */
+037   
+038       /* bottom */
+039       bottom = a->dp;
+040   
+041       /* top [offset into digits] */
+042       top = a->dp + b;
+043   
+044       /* this is implemented as a sliding window where 
+045        * the window is b-digits long and digits from 
+046        * the top of the window are copied to the bottom
+047        *
+048        * e.g.
+049   
+050        b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
+051                    /\symbol{92}                   |      ---->
+052                     \symbol{92}-------------------/      ---->
+053        */
+054       for (x = 0; x < (a->used - b); x++) \{
+055         *bottom++ = *top++;
+056       \}
+057   
+058       /* zero the top digits */
+059       for (; x < a->used; x++) \{
+060         *bottom++ = 0;
+061       \}
+062     \}
+063     
+064     /* remove excess digits */
+065     a->used -= b;
+066   \}
+067   #endif
+\end{alltt}
+\end{small}
+
+The only noteworthy element of this routine is the lack of a return type since it cannot fail.  Like mp\_lshd() we
+form a sliding window except we copy in the other direction.  After the window (line 59) we then zero
+the upper digits of the input to make sure the result is correct.
+
+\section{Powers of Two}
+
+Now that algorithms for moving single bits as well as whole digits exist algorithms for moving the ``in between'' distances are required.  For 
+example, to quickly multiply by $2^k$ for any $k$ without using a full multiplier algorithm would prove useful.  Instead of performing single
+shifts $k$ times to achieve a multiplication by $2^{\pm k}$ a mixture of whole digit shifting and partial digit shifting is employed.  
+
+\subsection{Multiplication by Power of Two}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul\_2d}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $c \leftarrow a \cdot 2^b$. \\
+\hline \\
+1.  $c \leftarrow a$.  (\textit{mp\_copy}) \\
+2.  If $c.alloc < c.used + \lfloor b / lg(\beta) \rfloor + 2$ then grow $c$ accordingly. \\
+3.  If the reallocation failed return(\textit{MP\_MEM}). \\
+4.  If $b \ge lg(\beta)$ then \\
+\hspace{3mm}4.1  $c \leftarrow c \cdot \beta^{\lfloor b / lg(\beta) \rfloor}$ (\textit{mp\_lshd}). \\
+\hspace{3mm}4.2  If step 4.1 failed return(\textit{MP\_MEM}). \\
+5.  $d \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
+6.  If $d \ne 0$ then do \\
+\hspace{3mm}6.1  $mask \leftarrow 2^d$ \\
+\hspace{3mm}6.2  $r \leftarrow 0$ \\
+\hspace{3mm}6.3  for $n$ from $0$ to $c.used - 1$ do \\
+\hspace{6mm}6.3.1  $rr \leftarrow c_n >> (lg(\beta) - d) \mbox{ (mod }mask\mbox{)}$ \\
+\hspace{6mm}6.3.2  $c_n \leftarrow (c_n << d) + r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}6.3.3  $r \leftarrow rr$ \\
+\hspace{3mm}6.4  If $r > 0$ then do \\
+\hspace{6mm}6.4.1  $c_{c.used} \leftarrow r$ \\
+\hspace{6mm}6.4.2  $c.used \leftarrow c.used + 1$ \\
+7.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul\_2d}
+\end{figure}
+
+\textbf{Algorithm mp\_mul\_2d.}
+This algorithm multiplies $a$ by $2^b$ and stores the result in $c$.  The algorithm uses algorithm mp\_lshd and a derivative of algorithm mp\_mul\_2 to
+quickly compute the product.
+
+First the algorithm will multiply $a$ by $x^{\lfloor b / lg(\beta) \rfloor}$ which will ensure that the remainder multiplicand is less than 
+$\beta$.  For example, if $b = 37$ and $\beta = 2^{28}$ then this step will multiply by $x$ leaving a multiplication by $2^{37 - 28} = 2^{9}$ 
+left.
+
+After the digits have been shifted appropriately at most $lg(\beta) - 1$ shifts are left to perform.  Step 5 calculates the number of remaining shifts 
+required.  If it is non-zero a modified shift loop is used to calculate the remaining product.  
+Essentially the loop is a generic version of algorith mp\_mul2 designed to handle any shift count in the range $1 \le x < lg(\beta)$.  The $mask$
+variable is used to extract the upper $d$ bits to form the carry for the next iteration.  
+
+This algorithm is loosely measured as a $O(2n)$ algorithm which means that if the input is $n$-digits that it takes $2n$ ``time'' to 
+complete.  It is possible to optimize this algorithm down to a $O(n)$ algorithm at a cost of making the algorithm slightly harder to follow.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_mul\_2d.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* shift left by a certain bit count */
+018   int mp_mul_2d (mp_int * a, int b, mp_int * c)
+019   \{
+020     mp_digit d;
+021     int      res;
+022   
+023     /* copy */
+024     if (a != c) \{
+025        if ((res = mp_copy (a, c)) != MP_OKAY) \{
+026          return res;
+027        \}
+028     \}
+029   
+030     if (c->alloc < (int)(c->used + b/DIGIT_BIT + 1)) \{
+031        if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) \{
+032          return res;
+033        \}
+034     \}
+035   
+036     /* shift by as many digits in the bit count */
+037     if (b >= (int)DIGIT_BIT) \{
+038       if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) \{
+039         return res;
+040       \}
+041     \}
+042   
+043     /* shift any bit count < DIGIT_BIT */
+044     d = (mp_digit) (b % DIGIT_BIT);
+045     if (d != 0) \{
+046       register mp_digit *tmpc, shift, mask, r, rr;
+047       register int x;
+048   
+049       /* bitmask for carries */
+050       mask = (((mp_digit)1) << d) - 1;
+051   
+052       /* shift for msbs */
+053       shift = DIGIT_BIT - d;
+054   
+055       /* alias */
+056       tmpc = c->dp;
+057   
+058       /* carry */
+059       r    = 0;
+060       for (x = 0; x < c->used; x++) \{
+061         /* get the higher bits of the current word */
+062         rr = (*tmpc >> shift) & mask;
+063   
+064         /* shift the current word and OR in the carry */
+065         *tmpc = ((*tmpc << d) | r) & MP_MASK;
+066         ++tmpc;
+067   
+068         /* set the carry to the carry bits of the current word */
+069         r = rr;
+070       \}
+071       
+072       /* set final carry */
+073       if (r != 0) \{
+074          c->dp[(c->used)++] = r;
+075       \}
+076     \}
+077     mp_clamp (c);
+078     return MP_OKAY;
+079   \}
+080   #endif
+\end{alltt}
+\end{small}
+
+The shifting is performed in--place which means the first step (line 24) is to copy the input to the 
+destination.  We avoid calling mp\_copy() by making sure the mp\_ints are different.  The destination then
+has to be grown (line 31) to accomodate the result.
+
+If the shift count $b$ is larger than $lg(\beta)$ then a call to mp\_lshd() is used to handle all of the multiples 
+of $lg(\beta)$.  Leaving only a remaining shift of $lg(\beta) - 1$ or fewer bits left.  Inside the actual shift 
+loop (lines 45 to 76) we make use of pre--computed values $shift$ and $mask$.   These are used to
+extract the carry bit(s) to pass into the next iteration of the loop.  The $r$ and $rr$ variables form a 
+chain between consecutive iterations to propagate the carry.  
+
+\subsection{Division by Power of Two}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div\_2d}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $c \leftarrow \lfloor a / 2^b \rfloor, d \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\
+\hline \\
+1.  If $b \le 0$ then do \\
+\hspace{3mm}1.1  $c \leftarrow a$ (\textit{mp\_copy}) \\
+\hspace{3mm}1.2  $d \leftarrow 0$ (\textit{mp\_zero}) \\
+\hspace{3mm}1.3  Return(\textit{MP\_OKAY}). \\
+2.  $c \leftarrow a$ \\
+3.  $d \leftarrow a \mbox{ (mod }2^b\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+4.  If $b \ge lg(\beta)$ then do \\
+\hspace{3mm}4.1  $c \leftarrow \lfloor c/\beta^{\lfloor b/lg(\beta) \rfloor} \rfloor$ (\textit{mp\_rshd}). \\
+5.  $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
+6.  If $k \ne 0$ then do \\
+\hspace{3mm}6.1  $mask \leftarrow 2^k$ \\
+\hspace{3mm}6.2  $r \leftarrow 0$ \\
+\hspace{3mm}6.3  for $n$ from $c.used - 1$ to $0$ do \\
+\hspace{6mm}6.3.1  $rr \leftarrow c_n \mbox{ (mod }mask\mbox{)}$ \\
+\hspace{6mm}6.3.2  $c_n \leftarrow (c_n >> k) + (r << (lg(\beta) - k))$ \\
+\hspace{6mm}6.3.3  $r \leftarrow rr$ \\
+7.  Clamp excess digits of $c$.  (\textit{mp\_clamp}) \\
+8.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div\_2d}
+\end{figure}
+
+\textbf{Algorithm mp\_div\_2d.}
+This algorithm will divide an input $a$ by $2^b$ and produce the quotient and remainder.  The algorithm is designed much like algorithm 
+mp\_mul\_2d by first using whole digit shifts then single precision shifts.  This algorithm will also produce the remainder of the division
+by using algorithm mp\_mod\_2d.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_div\_2d.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* shift right by a certain bit count (store quotient in c, optional remaind
+      er in d) */
+018   int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
+019   \{
+020     mp_digit D, r, rr;
+021     int     x, res;
+022     mp_int  t;
+023   
+024   
+025     /* if the shift count is <= 0 then we do no work */
+026     if (b <= 0) \{
+027       res = mp_copy (a, c);
+028       if (d != NULL) \{
+029         mp_zero (d);
+030       \}
+031       return res;
+032     \}
+033   
+034     if ((res = mp_init (&t)) != MP_OKAY) \{
+035       return res;
+036     \}
+037   
+038     /* get the remainder */
+039     if (d != NULL) \{
+040       if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) \{
+041         mp_clear (&t);
+042         return res;
+043       \}
+044     \}
+045   
+046     /* copy */
+047     if ((res = mp_copy (a, c)) != MP_OKAY) \{
+048       mp_clear (&t);
+049       return res;
+050     \}
+051   
+052     /* shift by as many digits in the bit count */
+053     if (b >= (int)DIGIT_BIT) \{
+054       mp_rshd (c, b / DIGIT_BIT);
+055     \}
+056   
+057     /* shift any bit count < DIGIT_BIT */
+058     D = (mp_digit) (b % DIGIT_BIT);
+059     if (D != 0) \{
+060       register mp_digit *tmpc, mask, shift;
+061   
+062       /* mask */
+063       mask = (((mp_digit)1) << D) - 1;
+064   
+065       /* shift for lsb */
+066       shift = DIGIT_BIT - D;
+067   
+068       /* alias */
+069       tmpc = c->dp + (c->used - 1);
+070   
+071       /* carry */
+072       r = 0;
+073       for (x = c->used - 1; x >= 0; x--) \{
+074         /* get the lower  bits of this word in a temp */
+075         rr = *tmpc & mask;
+076   
+077         /* shift the current word and mix in the carry bits from the previous 
+      word */
+078         *tmpc = (*tmpc >> D) | (r << shift);
+079         --tmpc;
+080   
+081         /* set the carry to the carry bits of the current word found above */
+082         r = rr;
+083       \}
+084     \}
+085     mp_clamp (c);
+086     if (d != NULL) \{
+087       mp_exch (&t, d);
+088     \}
+089     mp_clear (&t);
+090     return MP_OKAY;
+091   \}
+092   #endif
+\end{alltt}
+\end{small}
+
+The implementation of algorithm mp\_div\_2d is slightly different than the algorithm specifies.  The remainder $d$ may be optionally 
+ignored by passing \textbf{NULL} as the pointer to the mp\_int variable.    The temporary mp\_int variable $t$ is used to hold the 
+result of the remainder operation until the end.  This allows $d$ and $a$ to represent the same mp\_int without modifying $a$ before
+the quotient is obtained.
+
+The remainder of the source code is essentially the same as the source code for mp\_mul\_2d.  The only significant difference is
+the direction of the shifts.
+
+\subsection{Remainder of Division by Power of Two}
+
+The last algorithm in the series of polynomial basis power of two algorithms is calculating the remainder of division by $2^b$.  This
+algorithm benefits from the fact that in twos complement arithmetic $a \mbox{ (mod }2^b\mbox{)}$ is the same as $a$ AND $2^b - 1$.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mod\_2d}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $c \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\
+\hline \\
+1.  If $b \le 0$ then do \\
+\hspace{3mm}1.1  $c \leftarrow 0$ (\textit{mp\_zero}) \\
+\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
+2.  If $b > a.used \cdot lg(\beta)$ then do \\
+\hspace{3mm}2.1  $c \leftarrow a$ (\textit{mp\_copy}) \\
+\hspace{3mm}2.2  Return the result of step 2.1. \\
+3.  $c \leftarrow a$ \\
+4.  If step 3 failed return(\textit{MP\_MEM}). \\
+5.  for $n$ from $\lceil b / lg(\beta) \rceil$ to $c.used$ do \\
+\hspace{3mm}5.1  $c_n \leftarrow 0$ \\
+6.  $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
+7.  $c_{\lfloor b / lg(\beta) \rfloor} \leftarrow c_{\lfloor b / lg(\beta) \rfloor} \mbox{ (mod }2^{k}\mbox{)}$. \\
+8.  Clamp excess digits of $c$.  (\textit{mp\_clamp}) \\
+9.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mod\_2d}
+\end{figure}
+
+\textbf{Algorithm mp\_mod\_2d.}
+This algorithm will quickly calculate the value of $a \mbox{ (mod }2^b\mbox{)}$.  First if $b$ is less than or equal to zero the 
+result is set to zero.  If $b$ is greater than the number of bits in $a$ then it simply copies $a$ to $c$ and returns.  Otherwise, $a$ 
+is copied to $b$, leading digits are removed and the remaining leading digit is trimed to the exact bit count.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_mod\_2d.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* calc a value mod 2**b */
+018   int
+019   mp_mod_2d (mp_int * a, int b, mp_int * c)
+020   \{
+021     int     x, res;
+022   
+023     /* if b is <= 0 then zero the int */
+024     if (b <= 0) \{
+025       mp_zero (c);
+026       return MP_OKAY;
+027     \}
+028   
+029     /* if the modulus is larger than the value than return */
+030     if (b >= (int) (a->used * DIGIT_BIT)) \{
+031       res = mp_copy (a, c);
+032       return res;
+033     \}
+034   
+035     /* copy */
+036     if ((res = mp_copy (a, c)) != MP_OKAY) \{
+037       return res;
+038     \}
+039   
+040     /* zero digits above the last digit of the modulus */
+041     for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x+
+      +) \{
+042       c->dp[x] = 0;
+043     \}
+044     /* clear the digit that is not completely outside/inside the modulus */
+045     c->dp[b / DIGIT_BIT] &=
+046       (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digi
+      t) 1));
+047     mp_clamp (c);
+048     return MP_OKAY;
+049   \}
+050   #endif
+\end{alltt}
+\end{small}
+
+We first avoid cases of $b \le 0$ by simply mp\_zero()'ing the destination in such cases.  Next if $2^b$ is larger
+than the input we just mp\_copy() the input and return right away.  After this point we know we must actually
+perform some work to produce the remainder.
+
+Recalling that reducing modulo $2^k$ and a binary ``and'' with $2^k - 1$ are numerically equivalent we can quickly reduce 
+the number.  First we zero any digits above the last digit in $2^b$ (line 41).  Next we reduce the 
+leading digit of both (line 45) and then mp\_clamp().
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 3 \right ] $ & Devise an algorithm that performs $a \cdot 2^b$ for generic values of $b$ \\
+                      & in $O(n)$ time. \\
+                      &\\
+$\left [ 3 \right ] $ & Devise an efficient algorithm to multiply by small low hamming  \\
+                      & weight values such as $3$, $5$ and $9$.  Extend it to handle all values \\
+                      & upto $64$ with a hamming weight less than three. \\
+                      &\\
+$\left [ 2 \right ] $ & Modify the preceding algorithm to handle values of the form \\
+                      & $2^k - 1$ as well. \\
+                      &\\
+$\left [ 3 \right ] $ & Using only algorithms mp\_mul\_2, mp\_div\_2 and mp\_add create an \\
+                      & algorithm to multiply two integers in roughly $O(2n^2)$ time for \\
+                      & any $n$-bit input.  Note that the time of addition is ignored in the \\
+                      & calculation.  \\
+                      & \\
+$\left [ 5 \right ] $ & Improve the previous algorithm to have a working time of at most \\
+                      & $O \left (2^{(k-1)}n + \left ({2n^2 \over k} \right ) \right )$ for an appropriate choice of $k$.  Again ignore \\
+                      & the cost of addition. \\
+                      & \\
+$\left [ 2 \right ] $ & Devise a chart to find optimal values of $k$ for the previous problem \\
+                      & for $n = 64 \ldots 1024$ in steps of $64$. \\
+                      & \\
+$\left [ 2 \right ] $ & Using only algorithms mp\_abs and mp\_sub devise another method for \\
+                      & calculating the result of a signed comparison. \\
+                      &
+\end{tabular}
+
+\chapter{Multiplication and Squaring}
+\section{The Multipliers}
+For most number theoretic problems including certain public key cryptographic algorithms, the ``multipliers'' form the most important subset of 
+algorithms of any multiple precision integer package.  The set of multiplier algorithms include integer multiplication, squaring and modular reduction 
+where in each of the algorithms single precision multiplication is the dominant operation performed.  This chapter will discuss integer multiplication 
+and squaring, leaving modular reductions for the subsequent chapter.  
+
+The importance of the multiplier algorithms is for the most part driven by the fact that certain popular public key algorithms are based on modular 
+exponentiation, that is computing $d \equiv a^b \mbox{ (mod }c\mbox{)}$ for some arbitrary choice of $a$, $b$, $c$ and $d$.  During a modular
+exponentiation the majority\footnote{Roughly speaking a modular exponentiation will spend about 40\% of the time performing modular reductions, 
+35\% of the time performing squaring and 25\% of the time performing multiplications.} of the processor time is spent performing single precision 
+multiplications.
+
+For centuries general purpose multiplication has required a lengthly $O(n^2)$ process, whereby each digit of one multiplicand has to be multiplied 
+against every digit of the other multiplicand.  Traditional long-hand multiplication is based on this process;  while the techniques can differ the 
+overall algorithm used is essentially the same.  Only ``recently'' have faster algorithms been studied.  First Karatsuba multiplication was discovered in 
+1962.  This algorithm can multiply two numbers with considerably fewer single precision multiplications when compared to the long-hand approach.  
+This technique led to the discovery of polynomial basis algorithms (\textit{good reference?}) and subquently Fourier Transform based solutions.  
+
+\section{Multiplication}
+\subsection{The Baseline Multiplication}
+\label{sec:basemult}
+\index{baseline multiplication}
+Computing the product of two integers in software can be achieved using a trivial adaptation of the standard $O(n^2)$ long-hand multiplication
+algorithm that school children are taught.  The algorithm is considered an $O(n^2)$ algorithm since for two $n$-digit inputs $n^2$ single precision 
+multiplications are required.  More specifically for a $m$ and $n$ digit input $m \cdot n$ single precision multiplications are required.  To 
+simplify most discussions, it will be assumed that the inputs have comparable number of digits.  
+
+The ``baseline multiplication'' algorithm is designed to act as the ``catch-all'' algorithm, only to be used when the faster algorithms cannot be 
+used.  This algorithm does not use any particularly interesting optimizations and should ideally be avoided if possible.    One important 
+facet of this algorithm, is that it has been modified to only produce a certain amount of output digits as resolution.  The importance of this 
+modification will become evident during the discussion of Barrett modular reduction.  Recall that for a $n$ and $m$ digit input the product 
+will be at most $n + m$ digits.  Therefore, this algorithm can be reduced to a full multiplier by having it produce $n + m$ digits of the product.  
+
+Recall from sub-section 4.2.2 the definition of $\gamma$ as the number of bits in the type \textbf{mp\_digit}.  We shall now extend the variable set to 
+include $\alpha$ which shall represent the number of bits in the type \textbf{mp\_word}.  This implies that $2^{\alpha} > 2 \cdot \beta^2$.  The 
+constant $\delta = 2^{\alpha - 2lg(\beta)}$ will represent the maximal weight of any column in a product (\textit{see sub-section 5.2.2 for more information}).
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_mul\_digs}. \\
+\textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
+\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
+\hline \\
+1.  If min$(a.used, b.used) < \delta$ then do \\
+\hspace{3mm}1.1  Calculate $c = \vert a \vert \cdot \vert b \vert$ by the Comba method (\textit{see algorithm~\ref{fig:COMBAMULT}}).  \\
+\hspace{3mm}1.2  Return the result of step 1.1 \\
+\\
+Allocate and initialize a temporary mp\_int. \\
+2.  Init $t$ to be of size $digs$ \\
+3.  If step 2 failed return(\textit{MP\_MEM}). \\
+4.  $t.used \leftarrow digs$ \\
+\\
+Compute the product. \\
+5.  for $ix$ from $0$ to $a.used - 1$ do \\
+\hspace{3mm}5.1  $u \leftarrow 0$ \\
+\hspace{3mm}5.2  $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
+\hspace{3mm}5.3  If $pb < 1$ then goto step 6. \\
+\hspace{3mm}5.4  for $iy$ from $0$ to $pb - 1$ do \\
+\hspace{6mm}5.4.1  $\hat r \leftarrow t_{iy + ix} + a_{ix} \cdot b_{iy} + u$ \\
+\hspace{6mm}5.4.2  $t_{iy + ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}5.4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+\hspace{3mm}5.5  if $ix + pb < digs$ then do \\
+\hspace{6mm}5.5.1  $t_{ix + pb} \leftarrow u$ \\
+6.  Clamp excess digits of $t$. \\
+7.  Swap $c$ with $t$ \\
+8.  Clear $t$ \\
+9.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm s\_mp\_mul\_digs}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_mul\_digs.}
+This algorithm computes the unsigned product of two inputs $a$ and $b$, limited to an output precision of $digs$ digits.  While it may seem
+a bit awkward to modify the function from its simple $O(n^2)$ description, the usefulness of partial multipliers will arise in a subsequent 
+algorithm.  The algorithm is loosely based on algorithm 14.12 from \cite[pp. 595]{HAC} and is similar to Algorithm M of Knuth \cite[pp. 268]{TAOCPV2}.  
+Algorithm s\_mp\_mul\_digs differs from these cited references since it can produce a variable output precision regardless of the precision of the 
+inputs.
+
+The first thing this algorithm checks for is whether a Comba multiplier can be used instead.   If the minimum digit count of either
+input is less than $\delta$, then the Comba method may be used instead.    After the Comba method is ruled out, the baseline algorithm begins.  A 
+temporary mp\_int variable $t$ is used to hold the intermediate result of the product.  This allows the algorithm to be used to 
+compute products when either $a = c$ or $b = c$ without overwriting the inputs.  
+
+All of step 5 is the infamous $O(n^2)$ multiplication loop slightly modified to only produce upto $digs$ digits of output.  The $pb$ variable
+is given the count of digits to read from $b$ inside the nested loop.  If $pb \le 1$ then no more output digits can be produced and the algorithm
+will exit the loop.  The best way to think of the loops are as a series of $pb \times 1$ multiplications.    That is, in each pass of the 
+innermost loop $a_{ix}$ is multiplied against $b$ and the result is added (\textit{with an appropriate shift}) to $t$.  
+
+For example, consider multiplying $576$ by $241$.  That is equivalent to computing $10^0(1)(576) + 10^1(4)(576) + 10^2(2)(576)$ which is best
+visualized in the following table.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|c|l|}
+\hline   &&          & 5 & 7 & 6 & \\
+\hline   $\times$&&  & 2 & 4 & 1 & \\
+\hline &&&&&&\\
+  &&          & 5 & 7 & 6 & $10^0(1)(576)$ \\
+  &2 &   3    & 6 & 1 & 6 & $10^1(4)(576) + 10^0(1)(576)$ \\
+  1 & 3 & 8 & 8 & 1 & 6 &   $10^2(2)(576) + 10^1(4)(576) + 10^0(1)(576)$ \\
+\hline  
+\end{tabular}
+\end{center}
+\caption{Long-Hand Multiplication Diagram}
+\end{figure}
+
+Each row of the product is added to the result after being shifted to the left (\textit{multiplied by a power of the radix}) by the appropriate 
+count.  That is in pass $ix$ of the inner loop the product is added starting at the $ix$'th digit of the reult.
+
+Step 5.4.1 introduces the hat symbol (\textit{e.g. $\hat r$}) which represents a double precision variable.  The multiplication on that step
+is assumed to be a double wide output single precision multiplication.  That is, two single precision variables are multiplied to produce a
+double precision result.  The step is somewhat optimized from a long-hand multiplication algorithm because the carry from the addition in step
+5.4.1 is propagated through the nested loop.  If the carry was not propagated immediately it would overflow the single precision digit 
+$t_{ix+iy}$ and the result would be lost.  
+
+At step 5.5 the nested loop is finished and any carry that was left over should be forwarded.  The carry does not have to be added to the $ix+pb$'th
+digit since that digit is assumed to be zero at this point.  However, if $ix + pb \ge digs$ the carry is not set as it would make the result
+exceed the precision requested.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_mul\_digs.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* multiplies |a| * |b| and only computes upto digs digits of result
+018    * HAC pp. 595, Algorithm 14.12  Modified so you can control how 
+019    * many digits of output are created.
+020    */
+021   int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+022   \{
+023     mp_int  t;
+024     int     res, pa, pb, ix, iy;
+025     mp_digit u;
+026     mp_word r;
+027     mp_digit tmpx, *tmpt, *tmpy;
+028   
+029     /* can we use the fast multiplier? */
+030     if (((digs) < MP_WARRAY) &&
+031         MIN (a->used, b->used) < 
+032             (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{
+033       return fast_s_mp_mul_digs (a, b, c, digs);
+034     \}
+035   
+036     if ((res = mp_init_size (&t, digs)) != MP_OKAY) \{
+037       return res;
+038     \}
+039     t.used = digs;
+040   
+041     /* compute the digits of the product directly */
+042     pa = a->used;
+043     for (ix = 0; ix < pa; ix++) \{
+044       /* set the carry to zero */
+045       u = 0;
+046   
+047       /* limit ourselves to making digs digits of output */
+048       pb = MIN (b->used, digs - ix);
+049   
+050       /* setup some aliases */
+051       /* copy of the digit from a used within the nested loop */
+052       tmpx = a->dp[ix];
+053       
+054       /* an alias for the destination shifted ix places */
+055       tmpt = t.dp + ix;
+056       
+057       /* an alias for the digits of b */
+058       tmpy = b->dp;
+059   
+060       /* compute the columns of the output and propagate the carry */
+061       for (iy = 0; iy < pb; iy++) \{
+062         /* compute the column as a mp_word */
+063         r       = ((mp_word)*tmpt) +
+064                   ((mp_word)tmpx) * ((mp_word)*tmpy++) +
+065                   ((mp_word) u);
+066   
+067         /* the new column is the lower part of the result */
+068         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+069   
+070         /* get the carry word from the result */
+071         u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+072       \}
+073       /* set carry if it is placed below digs */
+074       if (ix + iy < digs) \{
+075         *tmpt = u;
+076       \}
+077     \}
+078   
+079     mp_clamp (&t);
+080     mp_exch (&t, c);
+081   
+082     mp_clear (&t);
+083     return MP_OKAY;
+084   \}
+085   #endif
+\end{alltt}
+\end{small}
+
+First we determine (line 30) if the Comba method can be used first since it's faster.  The conditions for 
+sing the Comba routine are that min$(a.used, b.used) < \delta$ and the number of digits of output is less than 
+\textbf{MP\_WARRAY}.  This new constant is used to control the stack usage in the Comba routines.  By default it is 
+set to $\delta$ but can be reduced when memory is at a premium.
+
+If we cannot use the Comba method we proceed to setup the baseline routine.  We allocate the the destination mp\_int
+$t$ (line 36) to the exact size of the output to avoid further re--allocations.  At this point we now 
+begin the $O(n^2)$ loop.
+
+This implementation of multiplication has the caveat that it can be trimmed to only produce a variable number of
+digits as output.  In each iteration of the outer loop the $pb$ variable is set (line 48) to the maximum 
+number of inner loop iterations.  
+
+Inside the inner loop we calculate $\hat r$ as the mp\_word product of the two mp\_digits and the addition of the
+carry from the previous iteration.  A particularly important observation is that most modern optimizing 
+C compilers (GCC for instance) can recognize that a $N \times N \rightarrow 2N$ multiplication is all that 
+is required for the product.  In x86 terms for example, this means using the MUL instruction.
+
+Each digit of the product is stored in turn (line 68) and the carry propagated (line 71) to the 
+next iteration.
+
+\subsection{Faster Multiplication by the ``Comba'' Method}
+
+One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be 
+computed and propagated upwards.  This makes the nested loop very sequential and hard to unroll and implement 
+in parallel.  The ``Comba'' \cite{COMBA} method is named after little known (\textit{in cryptographic venues}) Paul G. 
+Comba who described a method of implementing fast multipliers that do not require nested carry fixup operations.  As an 
+interesting aside it seems that Paul Barrett describes a similar technique in his 1986 paper \cite{BARRETT} written 
+five years before.
+
+At the heart of the Comba technique is once again the long-hand algorithm.  Except in this case a slight 
+twist is placed on how the columns of the result are produced.  In the standard long-hand algorithm rows of products 
+are produced then added together to form the final result.  In the baseline algorithm the columns are added together 
+after each iteration to get the result instantaneously.  
+
+In the Comba algorithm the columns of the result are produced entirely independently of each other.  That is at 
+the $O(n^2)$ level a simple multiplication and addition step is performed.  The carries of the columns are propagated 
+after the nested loop to reduce the amount of work requiored. Succintly the first step of the algorithm is to compute 
+the product vector $\vec x$ as follows. 
+
+\begin{equation}
+\vec x_n = \sum_{i+j = n} a_ib_j, \forall n \in \lbrace 0, 1, 2, \ldots, i + j \rbrace
+\end{equation}
+
+Where $\vec x_n$ is the $n'th$ column of the output vector.  Consider the following example which computes the vector $\vec x$ for the multiplication
+of $576$ and $241$.  
+
+\newpage\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|c|}
+  \hline &          & 5 & 7 & 6 & First Input\\
+  \hline $\times$ & & 2 & 4 & 1 & Second Input\\
+\hline            &                        & $1 \cdot 5 = 5$   & $1 \cdot 7 = 7$   & $1 \cdot 6 = 6$ & First pass \\
+                  &  $4 \cdot 5 = 20$      & $4 \cdot 7+5=33$  & $4 \cdot 6+7=31$  & 6               & Second pass \\
+   $2 \cdot 5 = 10$ &  $2 \cdot 7 + 20 = 34$ & $2 \cdot 6+33=45$ & 31                & 6             & Third pass \\
+\hline 10 & 34 & 45 & 31 & 6 & Final Result \\   
+\hline   
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Comba Multiplication Diagram}
+\end{figure}
+
+At this point the vector $x = \left < 10, 34, 45, 31, 6 \right >$ is the result of the first step of the Comba multipler.  
+Now the columns must be fixed by propagating the carry upwards.  The resultant vector will have one extra dimension over the input vector which is
+congruent to adding a leading zero digit.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Comba Fixup}. \\
+\textbf{Input}.   Vector $\vec x$ of dimension $k$ \\
+\textbf{Output}.  Vector $\vec x$ such that the carries have been propagated. \\
+\hline \\
+1.  for $n$ from $0$ to $k - 1$ do \\
+\hspace{3mm}1.1 $\vec x_{n+1} \leftarrow \vec x_{n+1} + \lfloor \vec x_{n}/\beta \rfloor$ \\
+\hspace{3mm}1.2 $\vec x_{n} \leftarrow \vec x_{n} \mbox{ (mod }\beta\mbox{)}$ \\
+2.  Return($\vec x$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Comba Fixup}
+\end{figure}
+
+With that algorithm and $k = 5$ and $\beta = 10$ the following vector is produced $\vec x= \left < 1, 3, 8, 8, 1, 6 \right >$.  In this case 
+$241 \cdot 576$ is in fact $138816$ and the procedure succeeded.  If the algorithm is correct and as will be demonstrated shortly more
+efficient than the baseline algorithm why not simply always use this algorithm?
+
+\subsubsection{Column Weight.}
+At the nested $O(n^2)$ level the Comba method adds the product of two single precision variables to each column of the output 
+independently.  A serious obstacle is if the carry is lost, due to lack of precision before the algorithm has a chance to fix
+the carries.  For example, in the multiplication of two three-digit numbers the third column of output will be the sum of
+three single precision multiplications.  If the precision of the accumulator for the output digits is less then $3 \cdot (\beta - 1)^2$ then
+an overflow can occur and the carry information will be lost.  For any $m$ and $n$ digit inputs the maximum weight of any column is 
+min$(m, n)$ which is fairly obvious.
+
+The maximum number of terms in any column of a product is known as the ``column weight'' and strictly governs when the algorithm can be used.  Recall
+from earlier that a double precision type has $\alpha$ bits of resolution and a single precision digit has $lg(\beta)$ bits of precision.  Given these
+two quantities we must not violate the following
+
+\begin{equation}
+k \cdot \left (\beta - 1 \right )^2 < 2^{\alpha}
+\end{equation}
+
+Which reduces to 
+
+\begin{equation}
+k \cdot \left ( \beta^2 - 2\beta + 1 \right ) < 2^{\alpha}
+\end{equation}
+
+Let $\rho = lg(\beta)$ represent the number of bits in a single precision digit.  By further re-arrangement of the equation the final solution is
+found.
+
+\begin{equation}
+k  < {{2^{\alpha}} \over {\left (2^{2\rho} - 2^{\rho + 1} + 1 \right )}}
+\end{equation}
+
+The defaults for LibTomMath are $\beta = 2^{28}$ and $\alpha = 2^{64}$ which means that $k$ is bounded by $k < 257$.  In this configuration 
+the smaller input may not have more than $256$ digits if the Comba method is to be used.  This is quite satisfactory for most applications since 
+$256$ digits would allow for numbers in the range of $0 \le x < 2^{7168}$ which, is much larger than most public key cryptographic algorithms require.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{fast\_s\_mp\_mul\_digs}. \\
+\textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
+\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
+\hline \\
+Place an array of \textbf{MP\_WARRAY} single precision digits named $W$ on the stack. \\
+1.  If $c.alloc < digs$ then grow $c$ to $digs$ digits. (\textit{mp\_grow}) \\
+2.  If step 1 failed return(\textit{MP\_MEM}).\\
+\\
+3.  $pa \leftarrow \mbox{MIN}(digs, a.used + b.used)$ \\
+\\
+4.  $\_ \hat W \leftarrow 0$ \\
+5.  for $ix$ from 0 to $pa - 1$ do \\
+\hspace{3mm}5.1  $ty \leftarrow \mbox{MIN}(b.used - 1, ix)$ \\
+\hspace{3mm}5.2  $tx \leftarrow ix - ty$ \\
+\hspace{3mm}5.3  $iy \leftarrow \mbox{MIN}(a.used - tx, ty + 1)$ \\
+\hspace{3mm}5.4  for $iz$ from 0 to $iy - 1$ do \\
+\hspace{6mm}5.4.1  $\_ \hat W \leftarrow \_ \hat W + a_{tx+iy}b_{ty-iy}$ \\
+\hspace{3mm}5.5  $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$\\
+\hspace{3mm}5.6  $\_ \hat W \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\
+6.  $W_{pa} \leftarrow \_ \hat W (\mbox{mod }\beta)$ \\
+\\
+7.  $oldused \leftarrow c.used$ \\
+8.  $c.used \leftarrow digs$ \\
+9.  for $ix$ from $0$ to $pa$ do \\
+\hspace{3mm}9.1  $c_{ix} \leftarrow W_{ix}$ \\
+10.  for $ix$ from $pa + 1$ to $oldused - 1$ do \\
+\hspace{3mm}10.1 $c_{ix} \leftarrow 0$ \\
+\\
+11.  Clamp $c$. \\
+12.  Return MP\_OKAY. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm fast\_s\_mp\_mul\_digs}
+\label{fig:COMBAMULT}
+\end{figure}
+
+\textbf{Algorithm fast\_s\_mp\_mul\_digs.}
+This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision.
+
+The outer loop of this algorithm is more complicated than that of the baseline multiplier.  This is because on the inside of the 
+loop we want to produce one column per pass.  This allows the accumulator $\_ \hat W$ to be placed in CPU registers and
+reduce the memory bandwidth to two \textbf{mp\_digit} reads per iteration.
+
+The $ty$ variable is set to the minimum count of $ix$ or the number of digits in $b$.  That way if $a$ has more digits than
+$b$ this will be limited to $b.used - 1$.  The $tx$ variable is set to the to the distance past $b.used$ the variable
+$ix$ is.  This is used for the immediately subsequent statement where we find $iy$.  
+
+The variable $iy$ is the minimum digits we can read from either $a$ or $b$ before running out.  Computing one column at a time
+means we have to scan one integer upwards and the other downwards.  $a$ starts at $tx$ and $b$ starts at $ty$.  In each
+pass we are producing the $ix$'th output column and we note that $tx + ty = ix$.  As we move $tx$ upwards we have to 
+move $ty$ downards so the equality remains valid.  The $iy$ variable is the number of iterations until 
+$tx \ge a.used$ or $ty < 0$ occurs.
+
+After every inner pass we store the lower half of the accumulator into $W_{ix}$ and then propagate the carry of the accumulator
+into the next round by dividing $\_ \hat W$ by $\beta$.
+
+To measure the benefits of the Comba method over the baseline method consider the number of operations that are required.  If the 
+cost in terms of time of a multiply and addition is $p$ and the cost of a carry propagation is $q$ then a baseline multiplication would require 
+$O \left ((p + q)n^2 \right )$ time to multiply two $n$-digit numbers.  The Comba method requires only $O(pn^2 + qn)$ time, however in practice, 
+the speed increase is actually much more.  With $O(n)$ space the algorithm can be reduced to $O(pn + qn)$ time by implementing the $n$ multiply
+and addition operations in the nested loop in parallel.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_fast\_s\_mp\_mul\_digs.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* Fast (comba) multiplier
+018    *
+019    * This is the fast column-array [comba] multiplier.  It is 
+020    * designed to compute the columns of the product first 
+021    * then handle the carries afterwards.  This has the effect 
+022    * of making the nested loops that compute the columns very
+023    * simple and schedulable on super-scalar processors.
+024    *
+025    * This has been modified to produce a variable number of 
+026    * digits of output so if say only a half-product is required 
+027    * you don't have to compute the upper half (a feature 
+028    * required for fast Barrett reduction).
+029    *
+030    * Based on Algorithm 14.12 on pp.595 of HAC.
+031    *
+032    */
+033   int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+034   \{
+035     int     olduse, res, pa, ix, iz;
+036     mp_digit W[MP_WARRAY];
+037     register mp_word  _W;
+038   
+039     /* grow the destination as required */
+040     if (c->alloc < digs) \{
+041       if ((res = mp_grow (c, digs)) != MP_OKAY) \{
+042         return res;
+043       \}
+044     \}
+045   
+046     /* number of output digits to produce */
+047     pa = MIN(digs, a->used + b->used);
+048   
+049     /* clear the carry */
+050     _W = 0;
+051     for (ix = 0; ix < pa; ix++) \{ 
+052         int      tx, ty;
+053         int      iy;
+054         mp_digit *tmpx, *tmpy;
+055   
+056         /* get offsets into the two bignums */
+057         ty = MIN(b->used-1, ix);
+058         tx = ix - ty;
+059   
+060         /* setup temp aliases */
+061         tmpx = a->dp + tx;
+062         tmpy = b->dp + ty;
+063   
+064         /* this is the number of times the loop will iterrate, essentially 
+065            while (tx++ < a->used && ty-- >= 0) \{ ... \}
+066          */
+067         iy = MIN(a->used-tx, ty+1);
+068   
+069         /* execute loop */
+070         for (iz = 0; iz < iy; ++iz) \{
+071            _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
+072         \}
+073   
+074         /* store term */
+075         W[ix] = ((mp_digit)_W) & MP_MASK;
+076   
+077         /* make next carry */
+078         _W = _W >> ((mp_word)DIGIT_BIT);
+079     \}
+080   
+081     /* store final carry */
+082     W[ix] = (mp_digit)(_W & MP_MASK);
+083   
+084     /* setup dest */
+085     olduse  = c->used;
+086     c->used = pa;
+087   
+088     \{
+089       register mp_digit *tmpc;
+090       tmpc = c->dp;
+091       for (ix = 0; ix < pa+1; ix++) \{
+092         /* now extract the previous digit [below the carry] */
+093         *tmpc++ = W[ix];
+094       \}
+095   
+096       /* clear unused digits [that existed in the old copy of c] */
+097       for (; ix < olduse; ix++) \{
+098         *tmpc++ = 0;
+099       \}
+100     \}
+101     mp_clamp (c);
+102     return MP_OKAY;
+103   \}
+104   #endif
+\end{alltt}
+\end{small}
+
+As per the pseudo--code we first calculate $pa$ (line 47) as the number of digits to output.  Next we begin the outer loop
+to produce the individual columns of the product.  We use the two aliases $tmpx$ and $tmpy$ (lines 61, 62) to point
+inside the two multiplicands quickly.  
+
+The inner loop (lines 70 to 72) of this implementation is where the tradeoff come into play.  Originally this comba 
+implementation was ``row--major'' which means it adds to each of the columns in each pass.  After the outer loop it would then fix 
+the carries.  This was very fast except it had an annoying drawback.  You had to read a mp\_word and two mp\_digits and write 
+one mp\_word per iteration.  On processors such as the Athlon XP and P4 this did not matter much since the cache bandwidth 
+is very high and it can keep the ALU fed with data.  It did, however, matter on older and embedded cpus where cache is often 
+slower and also often doesn't exist.  This new algorithm only performs two reads per iteration under the assumption that the 
+compiler has aliased $\_ \hat W$ to a CPU register.
+
+After the inner loop we store the current accumulator in $W$ and shift $\_ \hat W$ (lines 75, 78) to forward it as 
+a carry for the next pass.  After the outer loop we use the final carry (line 82) as the last digit of the product.  
+
+\subsection{Polynomial Basis Multiplication}
+To break the $O(n^2)$ barrier in multiplication requires a completely different look at integer multiplication.  In the following algorithms
+the use of polynomial basis representation for two integers $a$ and $b$ as $f(x) = \sum_{i=0}^{n} a_i x^i$ and  
+$g(x) = \sum_{i=0}^{n} b_i x^i$ respectively, is required.  In this system both $f(x)$ and $g(x)$ have $n + 1$ terms and are of the $n$'th degree.
+ 
+The product $a \cdot b \equiv f(x)g(x)$ is the polynomial $W(x) = \sum_{i=0}^{2n} w_i x^i$.  The coefficients $w_i$ will
+directly yield the desired product when $\beta$ is substituted for $x$.  The direct solution to solve for the $2n + 1$ coefficients
+requires $O(n^2)$ time and would in practice be slower than the Comba technique.
+
+However, numerical analysis theory indicates that only $2n + 1$ distinct points in $W(x)$ are required to determine the values of the $2n + 1$ unknown 
+coefficients.   This means by finding $\zeta_y = W(y)$ for $2n + 1$ small values of $y$ the coefficients of $W(x)$ can be found with 
+Gaussian elimination.  This technique is also occasionally refered to as the \textit{interpolation technique} (\textit{references please...}) since in 
+effect an interpolation based on $2n + 1$ points will yield a polynomial equivalent to $W(x)$.  
+
+The coefficients of the polynomial $W(x)$ are unknown which makes finding $W(y)$ for any value of $y$ impossible.  However, since 
+$W(x) = f(x)g(x)$ the equivalent $\zeta_y = f(y) g(y)$ can be used in its place.  The benefit of this technique stems from the 
+fact that $f(y)$ and $g(y)$ are much smaller than either $a$ or $b$ respectively.  As a result finding the $2n + 1$ relations required 
+by multiplying $f(y)g(y)$ involves multiplying integers that are much smaller than either of the inputs.
+
+When picking points to gather relations there are always three obvious points to choose, $y = 0, 1$ and $ \infty$.  The $\zeta_0$ term
+is simply the product $W(0) = w_0 = a_0 \cdot b_0$.  The $\zeta_1$ term is the product 
+$W(1) = \left (\sum_{i = 0}^{n} a_i \right ) \left (\sum_{i = 0}^{n} b_i \right )$.  The third point $\zeta_{\infty}$ is less obvious but rather
+simple to explain.  The $2n + 1$'th coefficient of $W(x)$ is numerically equivalent to the most significant column in an integer multiplication.  
+The point at $\infty$ is used symbolically to represent the most significant column, that is $W(\infty) = w_{2n} = a_nb_n$.  Note that the 
+points at $y = 0$ and $\infty$ yield the coefficients $w_0$ and $w_{2n}$ directly.
+
+If more points are required they should be of small values and powers of two such as $2^q$ and the related \textit{mirror points} 
+$\left (2^q \right )^{2n}  \cdot \zeta_{2^{-q}}$ for small values of $q$.  The term ``mirror point'' stems from the fact that 
+$\left (2^q \right )^{2n}  \cdot \zeta_{2^{-q}}$ can be calculated in the exact opposite fashion as $\zeta_{2^q}$.  For
+example, when $n = 2$ and $q = 1$ then following two equations are equivalent to the point $\zeta_{2}$ and its mirror.
+
+\begin{eqnarray}
+\zeta_{2}                  = f(2)g(2) = (4a_2 + 2a_1 + a_0)(4b_2 + 2b_1 + b_0) \nonumber \\
+16 \cdot \zeta_{1 \over 2} = 4f({1\over 2}) \cdot 4g({1 \over 2}) = (a_2 + 2a_1 + 4a_0)(b_2 + 2b_1 + 4b_0)
+\end{eqnarray}
+
+Using such points will allow the values of $f(y)$ and $g(y)$ to be independently calculated using only left shifts.  For example, when $n = 2$ the
+polynomial $f(2^q)$ is equal to $2^q((2^qa_2) + a_1) + a_0$.  This technique of polynomial representation is known as Horner's method.  
+
+As a general rule of the algorithm when the inputs are split into $n$ parts each there are $2n - 1$ multiplications.  Each multiplication is of 
+multiplicands that have $n$ times fewer digits than the inputs.  The asymptotic running time of this algorithm is 
+$O \left ( k^{lg_n(2n - 1)} \right )$ for $k$ digit inputs (\textit{assuming they have the same number of digits}).  Figure~\ref{fig:exponent}
+summarizes the exponents for various values of $n$.
+
+\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|c|}
+\hline \textbf{Split into $n$ Parts} & \textbf{Exponent}  & \textbf{Notes}\\
+\hline $2$ & $1.584962501$ & This is Karatsuba Multiplication. \\
+\hline $3$ & $1.464973520$ & This is Toom-Cook Multiplication. \\
+\hline $4$ & $1.403677461$ &\\
+\hline $5$ & $1.365212389$ &\\
+\hline $10$ & $1.278753601$ &\\
+\hline $100$ & $1.149426538$ &\\
+\hline $1000$ & $1.100270931$ &\\
+\hline $10000$ & $1.075252070$ &\\
+\hline
+\end{tabular}
+\end{center}
+\caption{Asymptotic Running Time of Polynomial Basis Multiplication}
+\label{fig:exponent}
+\end{figure}
+
+At first it may seem like a good idea to choose $n = 1000$ since the exponent is approximately $1.1$.  However, the overhead
+of solving for the 2001 terms of $W(x)$ will certainly consume any savings the algorithm could offer for all but exceedingly large
+numbers.  
+
+\subsubsection{Cutoff Point}
+The polynomial basis multiplication algorithms all require fewer single precision multiplications than a straight Comba approach.  However, 
+the algorithms incur an overhead (\textit{at the $O(n)$ work level}) since they require a system of equations to be solved.  This makes the
+polynomial basis approach more costly to use with small inputs.
+
+Let $m$ represent the number of digits in the multiplicands (\textit{assume both multiplicands have the same number of digits}).  There exists a 
+point $y$ such that when $m < y$ the polynomial basis algorithms are more costly than Comba, when $m = y$ they are roughly the same cost and 
+when $m > y$ the Comba methods are slower than the polynomial basis algorithms.  
+
+The exact location of $y$ depends on several key architectural elements of the computer platform in question.
+
+\begin{enumerate}
+\item  The ratio of clock cycles for single precision multiplication versus other simpler operations such as addition, shifting, etc.  For example
+on the AMD Athlon the ratio is roughly $17 : 1$ while on the Intel P4 it is $29 : 1$.  The higher the ratio in favour of multiplication the lower
+the cutoff point $y$ will be.  
+
+\item  The complexity of the linear system of equations (\textit{for the coefficients of $W(x)$}) is.  Generally speaking as the number of splits
+grows the complexity grows substantially.  Ideally solving the system will only involve addition, subtraction and shifting of integers.  This
+directly reflects on the ratio previous mentioned.
+
+\item  To a lesser extent memory bandwidth and function call overheads.  Provided the values are in the processor cache this is less of an
+influence over the cutoff point.
+
+\end{enumerate}
+
+A clean cutoff point separation occurs when a point $y$ is found such that all of the cutoff point conditions are met.  For example, if the point
+is too low then there will be values of $m$ such that $m > y$ and the Comba method is still faster.  Finding the cutoff points is fairly simple when
+a high resolution timer is available.  
+
+\subsection{Karatsuba Multiplication}
+Karatsuba \cite{KARA} multiplication when originally proposed in 1962 was among the first set of algorithms to break the $O(n^2)$ barrier for
+general purpose multiplication.  Given two polynomial basis representations $f(x) = ax + b$ and $g(x) = cx + d$, Karatsuba proved with 
+light algebra \cite{KARAP} that the following polynomial is equivalent to multiplication of the two integers the polynomials represent.
+
+\begin{equation}
+f(x) \cdot g(x) = acx^2 + ((a - b)(c - d) - (ac + bd))x + bd
+\end{equation}
+
+Using the observation that $ac$ and $bd$ could be re-used only three half sized multiplications would be required to produce the product.  Applying
+this algorithm recursively, the work factor becomes $O(n^{lg(3)})$ which is substantially better than the work factor $O(n^2)$ of the Comba technique.  It turns 
+out what Karatsuba did not know or at least did not publish was that this is simply polynomial basis multiplication with the points 
+$\zeta_0$, $\zeta_{\infty}$ and $-\zeta_{-1}$.  Consider the resultant system of equations.
+
+\begin{center}
+\begin{tabular}{rcrcrcrc}
+$\zeta_{0}$ &      $=$ &  &  &  & & $w_0$ \\
+$-\zeta_{-1}$ &    $=$ & $-w_2$ & $+$ & $w_1$ & $-$ & $w_0$ \\
+$\zeta_{\infty}$ & $=$ & $w_2$ &  & &  & \\
+\end{tabular}
+\end{center}
+
+By adding the first and last equation to the equation in the middle the term $w_1$ can be isolated and all three coefficients solved for.  The simplicity
+of this system of equations has made Karatsuba fairly popular.  In fact the cutoff point is often fairly low\footnote{With LibTomMath 0.18 it is 70 and 109 digits for the Intel P4 and AMD Athlon respectively.}
+making it an ideal algorithm to speed up certain public key cryptosystems such as RSA and Diffie-Hellman.  It is worth noting that the point 
+$\zeta_1$ could be substituted for $-\zeta_{-1}$.  In this case the first and third row are subtracted instead of added to the second row.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_karatsuba\_mul}. \\
+\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
+\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert$ \\
+\hline \\
+1.  Init the following mp\_int variables: $x0$, $x1$, $y0$, $y1$, $t1$, $x0y0$, $x1y1$.\\
+2.  If step 2 failed then return(\textit{MP\_MEM}). \\
+\\
+Split the input.  e.g. $a = x1 \cdot \beta^B + x0$ \\
+3.  $B \leftarrow \mbox{min}(a.used, b.used)/2$ \\
+4.  $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+5.  $y0 \leftarrow b \mbox{ (mod }\beta^B\mbox{)}$ \\
+6.  $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_rshd}) \\
+7.  $y1 \leftarrow \lfloor b / \beta^B \rfloor$ \\
+\\
+Calculate the three products. \\
+8.  $x0y0 \leftarrow x0 \cdot y0$ (\textit{mp\_mul}) \\
+9.  $x1y1 \leftarrow x1 \cdot y1$ \\
+10.  $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
+11.  $x0 \leftarrow y1 - y0$ \\
+12.  $t1 \leftarrow t1 \cdot x0$ \\
+\\
+Calculate the middle term. \\
+13.  $x0 \leftarrow x0y0 + x1y1$ \\
+14.  $t1 \leftarrow x0 - t1$ \\
+\\
+Calculate the final product. \\
+15.  $t1 \leftarrow t1 \cdot \beta^B$ (\textit{mp\_lshd}) \\
+16.  $x1y1 \leftarrow x1y1 \cdot \beta^{2B}$ \\
+17.  $t1 \leftarrow x0y0 + t1$ \\
+18.  $c \leftarrow t1 + x1y1$ \\
+19.  Clear all of the temporary variables. \\
+20.  Return(\textit{MP\_OKAY}).\\
+\hline 
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_karatsuba\_mul}
+\end{figure}
+
+\textbf{Algorithm mp\_karatsuba\_mul.}
+This algorithm computes the unsigned product of two inputs using the Karatsuba multiplication algorithm.  It is loosely based on the description
+from Knuth \cite[pp. 294-295]{TAOCPV2}.  
+
+\index{radix point}
+In order to split the two inputs into their respective halves, a suitable \textit{radix point} must be chosen.  The radix point chosen must
+be used for both of the inputs meaning that it must be smaller than the smallest input.  Step 3 chooses the radix point $B$ as half of the 
+smallest input \textbf{used} count.  After the radix point is chosen the inputs are split into lower and upper halves.  Step 4 and 5 
+compute the lower halves.  Step 6 and 7 computer the upper halves.  
+
+After the halves have been computed the three intermediate half-size products must be computed.  Step 8 and 9 compute the trivial products
+$x0 \cdot y0$ and $x1 \cdot y1$.  The mp\_int $x0$ is used as a temporary variable after $x1 - x0$ has been computed.  By using $x0$ instead
+of an additional temporary variable, the algorithm can avoid an addition memory allocation operation.
+
+The remaining steps 13 through 18 compute the Karatsuba polynomial through a variety of digit shifting and addition operations.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_karatsuba\_mul.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* c = |a| * |b| using Karatsuba Multiplication using 
+018    * three half size multiplications
+019    *
+020    * Let B represent the radix [e.g. 2**DIGIT_BIT] and 
+021    * let n represent half of the number of digits in 
+022    * the min(a,b)
+023    *
+024    * a = a1 * B**n + a0
+025    * b = b1 * B**n + b0
+026    *
+027    * Then, a * b => 
+028      a1b1 * B**2n + ((a1 - a0)(b1 - b0) + a0b0 + a1b1) * B + a0b0
+029    *
+030    * Note that a1b1 and a0b0 are used twice and only need to be 
+031    * computed once.  So in total three half size (half # of 
+032    * digit) multiplications are performed, a0b0, a1b1 and 
+033    * (a1-b1)(a0-b0)
+034    *
+035    * Note that a multiplication of half the digits requires
+036    * 1/4th the number of single precision multiplications so in 
+037    * total after one call 25% of the single precision multiplications 
+038    * are saved.  Note also that the call to mp_mul can end up back 
+039    * in this function if the a0, a1, b0, or b1 are above the threshold.  
+040    * This is known as divide-and-conquer and leads to the famous 
+041    * O(N**lg(3)) or O(N**1.584) work which is asymptopically lower than 
+042    * the standard O(N**2) that the baseline/comba methods use.  
+043    * Generally though the overhead of this method doesn't pay off 
+044    * until a certain size (N ~ 80) is reached.
+045    */
+046   int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
+047   \{
+048     mp_int  x0, x1, y0, y1, t1, x0y0, x1y1;
+049     int     B, err;
+050   
+051     /* default the return code to an error */
+052     err = MP_MEM;
+053   
+054     /* min # of digits */
+055     B = MIN (a->used, b->used);
+056   
+057     /* now divide in two */
+058     B = B >> 1;
+059   
+060     /* init copy all the temps */
+061     if (mp_init_size (&x0, B) != MP_OKAY)
+062       goto ERR;
+063     if (mp_init_size (&x1, a->used - B) != MP_OKAY)
+064       goto X0;
+065     if (mp_init_size (&y0, B) != MP_OKAY)
+066       goto X1;
+067     if (mp_init_size (&y1, b->used - B) != MP_OKAY)
+068       goto Y0;
+069   
+070     /* init temps */
+071     if (mp_init_size (&t1, B * 2) != MP_OKAY)
+072       goto Y1;
+073     if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
+074       goto T1;
+075     if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
+076       goto X0Y0;
+077   
+078     /* now shift the digits */
+079     x0.used = y0.used = B;
+080     x1.used = a->used - B;
+081     y1.used = b->used - B;
+082   
+083     \{
+084       register int x;
+085       register mp_digit *tmpa, *tmpb, *tmpx, *tmpy;
+086   
+087       /* we copy the digits directly instead of using higher level functions
+088        * since we also need to shift the digits
+089        */
+090       tmpa = a->dp;
+091       tmpb = b->dp;
+092   
+093       tmpx = x0.dp;
+094       tmpy = y0.dp;
+095       for (x = 0; x < B; x++) \{
+096         *tmpx++ = *tmpa++;
+097         *tmpy++ = *tmpb++;
+098       \}
+099   
+100       tmpx = x1.dp;
+101       for (x = B; x < a->used; x++) \{
+102         *tmpx++ = *tmpa++;
+103       \}
+104   
+105       tmpy = y1.dp;
+106       for (x = B; x < b->used; x++) \{
+107         *tmpy++ = *tmpb++;
+108       \}
+109     \}
+110   
+111     /* only need to clamp the lower words since by definition the 
+112      * upper words x1/y1 must have a known number of digits
+113      */
+114     mp_clamp (&x0);
+115     mp_clamp (&y0);
+116   
+117     /* now calc the products x0y0 and x1y1 */
+118     /* after this x0 is no longer required, free temp [x0==t2]! */
+119     if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)  
+120       goto X1Y1;          /* x0y0 = x0*y0 */
+121     if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
+122       goto X1Y1;          /* x1y1 = x1*y1 */
+123   
+124     /* now calc x1-x0 and y1-y0 */
+125     if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+126       goto X1Y1;          /* t1 = x1 - x0 */
+127     if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
+128       goto X1Y1;          /* t2 = y1 - y0 */
+129     if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
+130       goto X1Y1;          /* t1 = (x1 - x0) * (y1 - y0) */
+131   
+132     /* add x0y0 */
+133     if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
+134       goto X1Y1;          /* t2 = x0y0 + x1y1 */
+135     if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
+136       goto X1Y1;          /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
+137   
+138     /* shift by B */
+139     if (mp_lshd (&t1, B) != MP_OKAY)
+140       goto X1Y1;          /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
+141     if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
+142       goto X1Y1;          /* x1y1 = x1y1 << 2*B */
+143   
+144     if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
+145       goto X1Y1;          /* t1 = x0y0 + t1 */
+146     if (mp_add (&t1, &x1y1, c) != MP_OKAY)
+147       goto X1Y1;          /* t1 = x0y0 + t1 + x1y1 */
+148   
+149     /* Algorithm succeeded set the return code to MP_OKAY */
+150     err = MP_OKAY;
+151   
+152   X1Y1:mp_clear (&x1y1);
+153   X0Y0:mp_clear (&x0y0);
+154   T1:mp_clear (&t1);
+155   Y1:mp_clear (&y1);
+156   Y0:mp_clear (&y0);
+157   X1:mp_clear (&x1);
+158   X0:mp_clear (&x0);
+159   ERR:
+160     return err;
+161   \}
+162   #endif
+\end{alltt}
+\end{small}
+
+The new coding element in this routine, not  seen in previous routines, is the usage of goto statements.  The conventional
+wisdom is that goto statements should be avoided.  This is generally true, however when every single function call can fail, it makes sense
+to handle error recovery with a single piece of code.  Lines 61 to 75 handle initializing all of the temporary variables 
+required.  Note how each of the if statements goes to a different label in case of failure.  This allows the routine to correctly free only
+the temporaries that have been successfully allocated so far.
+
+The temporary variables are all initialized using the mp\_init\_size routine since they are expected to be large.  This saves the 
+additional reallocation that would have been necessary.  Also $x0$, $x1$, $y0$ and $y1$ have to be able to hold at least their respective
+number of digits for the next section of code.
+
+The first algebraic portion of the algorithm is to split the two inputs into their halves.  However, instead of using mp\_mod\_2d and mp\_rshd
+to extract the halves, the respective code has been placed inline within the body of the function.  To initialize the halves, the \textbf{used} and 
+\textbf{sign} members are copied first.  The first for loop on line 101 copies the lower halves.  Since they are both the same magnitude it 
+is simpler to calculate both lower halves in a single loop.  The for loop on lines 106 and 106 calculate the upper halves $x1$ and 
+$y1$ respectively.
+
+By inlining the calculation of the halves, the Karatsuba multiplier has a slightly lower overhead and can be used for smaller magnitude inputs.
+
+When line 150 is reached, the algorithm has completed succesfully.  The ``error status'' variable $err$ is set to \textbf{MP\_OKAY} so that
+the same code that handles errors can be used to clear the temporary variables and return.  
+
+\subsection{Toom-Cook $3$-Way Multiplication}
+Toom-Cook $3$-Way \cite{TOOM} multiplication is essentially the polynomial basis algorithm for $n = 2$ except that the points  are 
+chosen such that $\zeta$ is easy to compute and the resulting system of equations easy to reduce.  Here, the points $\zeta_{0}$, 
+$16 \cdot \zeta_{1 \over 2}$, $\zeta_1$, $\zeta_2$ and $\zeta_{\infty}$ make up the five required points to solve for the coefficients 
+of the $W(x)$.
+
+With the five relations that Toom-Cook specifies, the following system of equations is formed.
+
+\begin{center}
+\begin{tabular}{rcrcrcrcrcr}
+$\zeta_0$                    & $=$ & $0w_4$ & $+$ & $0w_3$ & $+$ & $0w_2$ & $+$ & $0w_1$ & $+$ & $1w_0$  \\
+$16 \cdot \zeta_{1 \over 2}$ & $=$ & $1w_4$ & $+$ & $2w_3$ & $+$ & $4w_2$ & $+$ & $8w_1$ & $+$ & $16w_0$  \\
+$\zeta_1$                    & $=$ & $1w_4$ & $+$ & $1w_3$ & $+$ & $1w_2$ & $+$ & $1w_1$ & $+$ & $1w_0$  \\
+$\zeta_2$                    & $=$ & $16w_4$ & $+$ & $8w_3$ & $+$ & $4w_2$ & $+$ & $2w_1$ & $+$ & $1w_0$  \\
+$\zeta_{\infty}$             & $=$ & $1w_4$ & $+$ & $0w_3$ & $+$ & $0w_2$ & $+$ & $0w_1$ & $+$ & $0w_0$  \\
+\end{tabular}
+\end{center}
+
+A trivial solution to this matrix requires $12$ subtractions, two multiplications by a small power of two, two divisions by a small power
+of two, two divisions by three and one multiplication by three.  All of these $19$ sub-operations require less than quadratic time, meaning that
+the algorithm can be faster than a baseline multiplication.  However, the greater complexity of this algorithm places the cutoff point
+(\textbf{TOOM\_MUL\_CUTOFF}) where Toom-Cook becomes more efficient much higher than the Karatsuba cutoff point.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_toom\_mul}. \\
+\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
+\textbf{Output}.  $c \leftarrow  a  \cdot  b $ \\
+\hline \\
+Split $a$ and $b$ into three pieces.  E.g. $a = a_2 \beta^{2k} + a_1 \beta^{k} + a_0$ \\
+1.  $k \leftarrow \lfloor \mbox{min}(a.used, b.used) / 3 \rfloor$ \\
+2.  $a_0 \leftarrow a \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+3.  $a_1 \leftarrow \lfloor a / \beta^k \rfloor$, $a_1 \leftarrow a_1 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+4.  $a_2 \leftarrow \lfloor a / \beta^{2k} \rfloor$, $a_2 \leftarrow a_2 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+5.  $b_0 \leftarrow a \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+6.  $b_1 \leftarrow \lfloor a / \beta^k \rfloor$, $b_1 \leftarrow b_1 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+7.  $b_2 \leftarrow \lfloor a / \beta^{2k} \rfloor$, $b_2 \leftarrow b_2 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+\\
+Find the five equations for $w_0, w_1, ..., w_4$. \\
+8.  $w_0 \leftarrow a_0 \cdot b_0$ \\
+9.  $w_4 \leftarrow a_2 \cdot b_2$ \\
+10. $tmp_1 \leftarrow 2 \cdot a_0$, $tmp_1 \leftarrow a_1 + tmp_1$, $tmp_1 \leftarrow 2 \cdot tmp_1$, $tmp_1 \leftarrow tmp_1 + a_2$ \\
+11. $tmp_2 \leftarrow 2 \cdot b_0$, $tmp_2 \leftarrow b_1 + tmp_2$, $tmp_2 \leftarrow 2 \cdot tmp_2$, $tmp_2 \leftarrow tmp_2 + b_2$ \\
+12. $w_1 \leftarrow tmp_1 \cdot tmp_2$ \\
+13. $tmp_1 \leftarrow 2 \cdot a_2$, $tmp_1 \leftarrow a_1 + tmp_1$, $tmp_1 \leftarrow 2 \cdot tmp_1$, $tmp_1 \leftarrow tmp_1 + a_0$ \\
+14. $tmp_2 \leftarrow 2 \cdot b_2$, $tmp_2 \leftarrow b_1 + tmp_2$, $tmp_2 \leftarrow 2 \cdot tmp_2$, $tmp_2 \leftarrow tmp_2 + b_0$ \\
+15. $w_3 \leftarrow tmp_1 \cdot tmp_2$ \\
+16. $tmp_1 \leftarrow a_0 + a_1$, $tmp_1 \leftarrow tmp_1 + a_2$, $tmp_2 \leftarrow b_0 + b_1$, $tmp_2 \leftarrow tmp_2 + b_2$ \\
+17. $w_2 \leftarrow tmp_1 \cdot tmp_2$ \\
+\\
+Continued on the next page.\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_toom\_mul}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_toom\_mul} (continued). \\
+\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
+\textbf{Output}.  $c \leftarrow a \cdot  b $ \\
+\hline \\
+Now solve the system of equations. \\
+18. $w_1 \leftarrow w_4 - w_1$, $w_3 \leftarrow w_3 - w_0$ \\
+19. $w_1 \leftarrow \lfloor w_1 / 2 \rfloor$, $w_3 \leftarrow \lfloor w_3 / 2 \rfloor$ \\
+20. $w_2 \leftarrow w_2 - w_0$, $w_2 \leftarrow w_2 - w_4$ \\
+21. $w_1 \leftarrow w_1 - w_2$, $w_3 \leftarrow w_3 - w_2$ \\
+22. $tmp_1 \leftarrow 8 \cdot w_0$, $w_1 \leftarrow w_1 - tmp_1$, $tmp_1 \leftarrow 8 \cdot w_4$, $w_3 \leftarrow w_3 - tmp_1$ \\
+23. $w_2 \leftarrow 3 \cdot w_2$, $w_2 \leftarrow w_2 - w_1$, $w_2 \leftarrow w_2 - w_3$ \\
+24. $w_1 \leftarrow w_1 - w_2$, $w_3 \leftarrow w_3 - w_2$ \\
+25. $w_1 \leftarrow \lfloor w_1 / 3 \rfloor, w_3 \leftarrow \lfloor w_3 / 3 \rfloor$ \\
+\\
+Now substitute $\beta^k$ for $x$ by shifting $w_0, w_1, ..., w_4$. \\
+26. for $n$ from $1$ to $4$ do \\
+\hspace{3mm}26.1  $w_n \leftarrow w_n \cdot \beta^{nk}$ \\
+27. $c \leftarrow w_0 + w_1$, $c \leftarrow c + w_2$, $c \leftarrow c + w_3$, $c \leftarrow c + w_4$ \\
+28. Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_toom\_mul (continued)}
+\end{figure}
+
+\textbf{Algorithm mp\_toom\_mul.}
+This algorithm computes the product of two mp\_int variables $a$ and $b$ using the Toom-Cook approach.  Compared to the Karatsuba multiplication, this 
+algorithm has a lower asymptotic running time of approximately $O(n^{1.464})$ but at an obvious cost in overhead.  In this
+description, several statements have been compounded to save space.  The intention is that the statements are executed from left to right across
+any given step.
+
+The two inputs $a$ and $b$ are first split into three $k$-digit integers $a_0, a_1, a_2$ and $b_0, b_1, b_2$ respectively.  From these smaller
+integers the coefficients of the polynomial basis representations $f(x)$ and $g(x)$ are known and can be used to find the relations required.
+
+The first two relations $w_0$ and $w_4$ are the points $\zeta_{0}$ and $\zeta_{\infty}$ respectively.  The relation $w_1, w_2$ and $w_3$ correspond
+to the points $16 \cdot \zeta_{1 \over 2}, \zeta_{2}$ and $\zeta_{1}$ respectively.  These are found using logical shifts to independently find
+$f(y)$ and $g(y)$ which significantly speeds up the algorithm.
+
+After the five relations $w_0, w_1, \ldots, w_4$ have been computed, the system they represent must be solved in order for the unknown coefficients 
+$w_1, w_2$ and $w_3$ to be isolated.  The steps 18 through 25 perform the system reduction required as previously described.  Each step of
+the reduction represents the comparable matrix operation that would be performed had this been performed by pencil.  For example, step 18 indicates
+that row $1$ must be subtracted from row $4$ and simultaneously row $0$ subtracted from row $3$.  
+
+Once the coeffients have been isolated, the polynomial $W(x) = \sum_{i=0}^{2n} w_i x^i$ is known.  By substituting $\beta^{k}$ for $x$, the integer 
+result $a \cdot b$ is produced.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_toom\_mul.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* multiplication using the Toom-Cook 3-way algorithm 
+018    *
+019    * Much more complicated than Karatsuba but has a lower 
+020    * asymptotic running time of O(N**1.464).  This algorithm is 
+021    * only particularly useful on VERY large inputs 
+022    * (we're talking 1000s of digits here...).
+023   */
+024   int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
+025   \{
+026       mp_int w0, w1, w2, w3, w4, tmp1, tmp2, a0, a1, a2, b0, b1, b2;
+027       int res, B;
+028           
+029       /* init temps */
+030       if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, 
+031                                &a0, &a1, &a2, &b0, &b1, 
+032                                &b2, &tmp1, &tmp2, NULL)) != MP_OKAY) \{
+033          return res;
+034       \}
+035       
+036       /* B */
+037       B = MIN(a->used, b->used) / 3;
+038       
+039       /* a = a2 * B**2 + a1 * B + a0 */
+040       if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) \{
+041          goto ERR;
+042       \}
+043   
+044       if ((res = mp_copy(a, &a1)) != MP_OKAY) \{
+045          goto ERR;
+046       \}
+047       mp_rshd(&a1, B);
+048       mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
+049   
+050       if ((res = mp_copy(a, &a2)) != MP_OKAY) \{
+051          goto ERR;
+052       \}
+053       mp_rshd(&a2, B*2);
+054       
+055       /* b = b2 * B**2 + b1 * B + b0 */
+056       if ((res = mp_mod_2d(b, DIGIT_BIT * B, &b0)) != MP_OKAY) \{
+057          goto ERR;
+058       \}
+059   
+060       if ((res = mp_copy(b, &b1)) != MP_OKAY) \{
+061          goto ERR;
+062       \}
+063       mp_rshd(&b1, B);
+064       mp_mod_2d(&b1, DIGIT_BIT * B, &b1);
+065   
+066       if ((res = mp_copy(b, &b2)) != MP_OKAY) \{
+067          goto ERR;
+068       \}
+069       mp_rshd(&b2, B*2);
+070       
+071       /* w0 = a0*b0 */
+072       if ((res = mp_mul(&a0, &b0, &w0)) != MP_OKAY) \{
+073          goto ERR;
+074       \}
+075       
+076       /* w4 = a2 * b2 */
+077       if ((res = mp_mul(&a2, &b2, &w4)) != MP_OKAY) \{
+078          goto ERR;
+079       \}
+080       
+081       /* w1 = (a2 + 2(a1 + 2a0))(b2 + 2(b1 + 2b0)) */
+082       if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) \{
+083          goto ERR;
+084       \}
+085       if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) \{
+086          goto ERR;
+087       \}
+088       if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) \{
+089          goto ERR;
+090       \}
+091       if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) \{
+092          goto ERR;
+093       \}
+094       
+095       if ((res = mp_mul_2(&b0, &tmp2)) != MP_OKAY) \{
+096          goto ERR;
+097       \}
+098       if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) \{
+099          goto ERR;
+100       \}
+101       if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) \{
+102          goto ERR;
+103       \}
+104       if ((res = mp_add(&tmp2, &b2, &tmp2)) != MP_OKAY) \{
+105          goto ERR;
+106       \}
+107       
+108       if ((res = mp_mul(&tmp1, &tmp2, &w1)) != MP_OKAY) \{
+109          goto ERR;
+110       \}
+111       
+112       /* w3 = (a0 + 2(a1 + 2a2))(b0 + 2(b1 + 2b2)) */
+113       if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) \{
+114          goto ERR;
+115       \}
+116       if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) \{
+117          goto ERR;
+118       \}
+119       if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) \{
+120          goto ERR;
+121       \}
+122       if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) \{
+123          goto ERR;
+124       \}
+125       
+126       if ((res = mp_mul_2(&b2, &tmp2)) != MP_OKAY) \{
+127          goto ERR;
+128       \}
+129       if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) \{
+130          goto ERR;
+131       \}
+132       if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) \{
+133          goto ERR;
+134       \}
+135       if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) \{
+136          goto ERR;
+137       \}
+138       
+139       if ((res = mp_mul(&tmp1, &tmp2, &w3)) != MP_OKAY) \{
+140          goto ERR;
+141       \}
+142       
+143   
+144       /* w2 = (a2 + a1 + a0)(b2 + b1 + b0) */
+145       if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) \{
+146          goto ERR;
+147       \}
+148       if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) \{
+149          goto ERR;
+150       \}
+151       if ((res = mp_add(&b2, &b1, &tmp2)) != MP_OKAY) \{
+152          goto ERR;
+153       \}
+154       if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) \{
+155          goto ERR;
+156       \}
+157       if ((res = mp_mul(&tmp1, &tmp2, &w2)) != MP_OKAY) \{
+158          goto ERR;
+159       \}
+160       
+161       /* now solve the matrix 
+162       
+163          0  0  0  0  1
+164          1  2  4  8  16
+165          1  1  1  1  1
+166          16 8  4  2  1
+167          1  0  0  0  0
+168          
+169          using 12 subtractions, 4 shifts, 
+170                 2 small divisions and 1 small multiplication 
+171        */
+172        
+173        /* r1 - r4 */
+174        if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) \{
+175           goto ERR;
+176        \}
+177        /* r3 - r0 */
+178        if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) \{
+179           goto ERR;
+180        \}
+181        /* r1/2 */
+182        if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) \{
+183           goto ERR;
+184        \}
+185        /* r3/2 */
+186        if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) \{
+187           goto ERR;
+188        \}
+189        /* r2 - r0 - r4 */
+190        if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) \{
+191           goto ERR;
+192        \}
+193        if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) \{
+194           goto ERR;
+195        \}
+196        /* r1 - r2 */
+197        if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) \{
+198           goto ERR;
+199        \}
+200        /* r3 - r2 */
+201        if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) \{
+202           goto ERR;
+203        \}
+204        /* r1 - 8r0 */
+205        if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) \{
+206           goto ERR;
+207        \}
+208        if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) \{
+209           goto ERR;
+210        \}
+211        /* r3 - 8r4 */
+212        if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) \{
+213           goto ERR;
+214        \}
+215        if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) \{
+216           goto ERR;
+217        \}
+218        /* 3r2 - r1 - r3 */
+219        if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) \{
+220           goto ERR;
+221        \}
+222        if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) \{
+223           goto ERR;
+224        \}
+225        if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) \{
+226           goto ERR;
+227        \}
+228        /* r1 - r2 */
+229        if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) \{
+230           goto ERR;
+231        \}
+232        /* r3 - r2 */
+233        if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) \{
+234           goto ERR;
+235        \}
+236        /* r1/3 */
+237        if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) \{
+238           goto ERR;
+239        \}
+240        /* r3/3 */
+241        if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) \{
+242           goto ERR;
+243        \}
+244        
+245        /* at this point shift W[n] by B*n */
+246        if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) \{
+247           goto ERR;
+248        \}
+249        if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) \{
+250           goto ERR;
+251        \}
+252        if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) \{
+253           goto ERR;
+254        \}
+255        if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) \{
+256           goto ERR;
+257        \}     
+258        
+259        if ((res = mp_add(&w0, &w1, c)) != MP_OKAY) \{
+260           goto ERR;
+261        \}
+262        if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) \{
+263           goto ERR;
+264        \}
+265        if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) \{
+266           goto ERR;
+267        \}
+268        if ((res = mp_add(&tmp1, c, c)) != MP_OKAY) \{
+269           goto ERR;
+270        \}     
+271        
+272   ERR:
+273        mp_clear_multi(&w0, &w1, &w2, &w3, &w4, 
+274                       &a0, &a1, &a2, &b0, &b1, 
+275                       &b2, &tmp1, &tmp2, NULL);
+276        return res;
+277   \}     
+278        
+279   #endif
+\end{alltt}
+\end{small}
+
+The first obvious thing to note is that this algorithm is complicated.  The complexity is worth it if you are multiplying very 
+large numbers.  For example, a 10,000 digit multiplication takes approximaly 99,282,205 fewer single precision multiplications with
+Toom--Cook than a Comba or baseline approach (this is a savings of more than 99$\%$).  For most ``crypto'' sized numbers this
+algorithm is not practical as Karatsuba has a much lower cutoff point.
+
+First we split $a$ and $b$ into three roughly equal portions.  This has been accomplished (lines 40 to 69) with 
+combinations of mp\_rshd() and mp\_mod\_2d() function calls.  At this point $a = a2 \cdot \beta^2 + a1 \cdot \beta + a0$ and similiarly
+for $b$.  
+
+Next we compute the five points $w0, w1, w2, w3$ and $w4$.  Recall that $w0$ and $w4$ can be computed directly from the portions so
+we get those out of the way first (lines 72 and 77).  Next we compute $w1, w2$ and $w3$ using Horners method.
+
+After this point we solve for the actual values of $w1, w2$ and $w3$ by reducing the $5 \times 5$ system which is relatively
+straight forward.  
+
+\subsection{Signed Multiplication}
+Now that algorithms to handle multiplications of every useful dimensions have been developed, a rather simple finishing touch is required.  So far all
+of the multiplication algorithms have been unsigned multiplications which leaves only a signed multiplication algorithm to be established.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul}. \\
+\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
+\textbf{Output}.  $c \leftarrow a \cdot b$ \\
+\hline \\
+1.  If $a.sign = b.sign$ then \\
+\hspace{3mm}1.1  $sign = MP\_ZPOS$ \\
+2.  else \\
+\hspace{3mm}2.1  $sign = MP\_ZNEG$ \\
+3.  If min$(a.used, b.used) \ge TOOM\_MUL\_CUTOFF$ then  \\
+\hspace{3mm}3.1  $c \leftarrow a \cdot b$ using algorithm mp\_toom\_mul \\
+4.  else if min$(a.used, b.used) \ge KARATSUBA\_MUL\_CUTOFF$ then \\
+\hspace{3mm}4.1  $c \leftarrow a \cdot b$ using algorithm mp\_karatsuba\_mul \\
+5.  else \\
+\hspace{3mm}5.1  $digs \leftarrow a.used + b.used + 1$ \\
+\hspace{3mm}5.2  If $digs < MP\_ARRAY$ and min$(a.used, b.used) \le \delta$ then \\
+\hspace{6mm}5.2.1  $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm fast\_s\_mp\_mul\_digs.  \\
+\hspace{3mm}5.3  else \\
+\hspace{6mm}5.3.1  $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm s\_mp\_mul\_digs.  \\
+6.  $c.sign \leftarrow sign$ \\
+7.  Return the result of the unsigned multiplication performed. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul}
+\end{figure}
+
+\textbf{Algorithm mp\_mul.}
+This algorithm performs the signed multiplication of two inputs.  It will make use of any of the three unsigned multiplication algorithms 
+available when the input is of appropriate size.  The \textbf{sign} of the result is not set until the end of the algorithm since algorithm
+s\_mp\_mul\_digs will clear it.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_mul.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* high level multiplication (handles sign) */
+018   int mp_mul (mp_int * a, mp_int * b, mp_int * c)
+019   \{
+020     int     res, neg;
+021     neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
+022   
+023     /* use Toom-Cook? */
+024   #ifdef BN_MP_TOOM_MUL_C
+025     if (MIN (a->used, b->used) >= TOOM_MUL_CUTOFF) \{
+026       res = mp_toom_mul(a, b, c);
+027     \} else 
+028   #endif
+029   #ifdef BN_MP_KARATSUBA_MUL_C
+030     /* use Karatsuba? */
+031     if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) \{
+032       res = mp_karatsuba_mul (a, b, c);
+033     \} else 
+034   #endif
+035     \{
+036       /* can we use the fast multiplier?
+037        *
+038        * The fast multiplier can be used if the output will 
+039        * have less than MP_WARRAY digits and the number of 
+040        * digits won't affect carry propagation
+041        */
+042       int     digs = a->used + b->used + 1;
+043   
+044   #ifdef BN_FAST_S_MP_MUL_DIGS_C
+045       if ((digs < MP_WARRAY) &&
+046           MIN(a->used, b->used) <= 
+047           (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{
+048         res = fast_s_mp_mul_digs (a, b, c, digs);
+049       \} else 
+050   #endif
+051   #ifdef BN_S_MP_MUL_DIGS_C
+052         res = s_mp_mul (a, b, c); /* uses s_mp_mul_digs */
+053   #else
+054         res = MP_VAL;
+055   #endif
+056   
+057     \}
+058     c->sign = (c->used > 0) ? neg : MP_ZPOS;
+059     return res;
+060   \}
+061   #endif
+\end{alltt}
+\end{small}
+
+The implementation is rather simplistic and is not particularly noteworthy.  Line 23 computes the sign of the result using the ``?'' 
+operator from the C programming language.  Line 47 computes $\delta$ using the fact that $1 << k$ is equal to $2^k$.  
+
+\section{Squaring}
+\label{sec:basesquare}
+
+Squaring is a special case of multiplication where both multiplicands are equal.  At first it may seem like there is no significant optimization
+available but in fact there is.  Consider the multiplication of $576$ against $241$.  In total there will be nine single precision multiplications
+performed which are $1\cdot 6$, $1 \cdot 7$, $1 \cdot 5$, $4 \cdot 6$, $4 \cdot 7$, $4 \cdot 5$, $2 \cdot  6$, $2 \cdot 7$ and $2 \cdot 5$.  Now consider 
+the multiplication of $123$ against $123$.  The nine products are $3 \cdot 3$, $3 \cdot 2$, $3 \cdot 1$, $2 \cdot 3$, $2 \cdot 2$, $2 \cdot 1$, 
+$1 \cdot 3$, $1 \cdot 2$ and $1 \cdot 1$.  On closer inspection some of the products are equivalent.  For example, $3 \cdot 2 = 2 \cdot 3$ 
+and $3 \cdot 1 = 1 \cdot 3$. 
+
+For any $n$-digit input, there are ${{\left (n^2 + n \right)}\over 2}$ possible unique single precision multiplications required compared to the $n^2$
+required for multiplication.  The following diagram gives an example of the operations required.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{ccccc|c}
+&&1&2&3&\\
+$\times$ &&1&2&3&\\
+\hline && $3 \cdot 1$ & $3 \cdot 2$ & $3 \cdot 3$ & Row 0\\
+       & $2 \cdot 1$  & $2 \cdot 2$ & $2 \cdot 3$ && Row 1 \\
+         $1 \cdot 1$  & $1 \cdot 2$ & $1 \cdot 3$ &&& Row 2 \\
+\end{tabular}
+\end{center}
+\caption{Squaring Optimization Diagram}
+\end{figure}
+
+Starting from zero and numbering the columns from right to left a very simple pattern becomes obvious.  For the purposes of this discussion let $x$
+represent the number being squared.  The first observation is that in row $k$ the $2k$'th column of the product has a $\left (x_k \right)^2$ term in it.  
+
+The second observation is that every column $j$ in row $k$ where $j \ne 2k$ is part of a double product.  Every non-square term of a column will
+appear twice hence the name ``double product''.  Every odd column is made up entirely of double products.  In fact every column is made up of double 
+products and at most one square (\textit{see the exercise section}).  
+
+The third and final observation is that for row $k$ the first unique non-square term, that is, one that hasn't already appeared in an earlier row, 
+occurs at column $2k + 1$.  For example, on row $1$ of the previous squaring, column one is part of the double product with column one from row zero. 
+Column two of row one is a square and column three is the first unique column.
+
+\subsection{The Baseline Squaring Algorithm}
+The baseline squaring algorithm is meant to be a catch-all squaring algorithm.  It will handle any of the input sizes that the faster routines
+will not handle.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_sqr}. \\
+\textbf{Input}.   mp\_int $a$ \\
+\textbf{Output}.  $b \leftarrow a^2$ \\
+\hline \\
+1.  Init a temporary mp\_int of at least $2 \cdot a.used +1$ digits.  (\textit{mp\_init\_size}) \\
+2.  If step 1 failed return(\textit{MP\_MEM}) \\
+3.  $t.used \leftarrow 2 \cdot a.used + 1$ \\
+4.  For $ix$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}Calculate the square. \\
+\hspace{3mm}4.1  $\hat r \leftarrow t_{2ix} + \left (a_{ix} \right )^2$ \\
+\hspace{3mm}4.2  $t_{2ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}Calculate the double products after the square. \\
+\hspace{3mm}4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+\hspace{3mm}4.4  For $iy$ from $ix + 1$ to $a.used - 1$ do \\
+\hspace{6mm}4.4.1  $\hat r \leftarrow 2 \cdot a_{ix}a_{iy} + t_{ix + iy} + u$ \\
+\hspace{6mm}4.4.2  $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}4.4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+\hspace{3mm}Set the last carry. \\
+\hspace{3mm}4.5  While $u > 0$ do \\
+\hspace{6mm}4.5.1  $iy \leftarrow iy + 1$ \\
+\hspace{6mm}4.5.2  $\hat r \leftarrow t_{ix + iy} + u$ \\
+\hspace{6mm}4.5.3  $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}4.5.4  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+5.  Clamp excess digits of $t$.  (\textit{mp\_clamp}) \\
+6.  Exchange $b$ and $t$. \\
+7.  Clear $t$ (\textit{mp\_clear}) \\
+8.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm s\_mp\_sqr}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_sqr.}
+This algorithm computes the square of an input using the three observations on squaring.  It is based fairly faithfully on  algorithm 14.16 of HAC
+\cite[pp.596-597]{HAC}.  Similar to algorithm s\_mp\_mul\_digs, a temporary mp\_int is allocated to hold the result of the squaring.  This allows the 
+destination mp\_int to be the same as the source mp\_int.
+
+The outer loop of this algorithm begins on step 4. It is best to think of the outer loop as walking down the rows of the partial results, while
+the inner loop computes the columns of the partial result.  Step 4.1 and 4.2 compute the square term for each row, and step 4.3 and 4.4 propagate
+the carry and compute the double products.  
+
+The requirement that a mp\_word be able to represent the range $0 \le x < 2 \beta^2$ arises from this
+very algorithm.  The product $a_{ix}a_{iy}$ will lie in the range $0 \le x \le \beta^2 - 2\beta + 1$ which is obviously less than $\beta^2$ meaning that
+when it is multiplied by two, it can be properly represented by a mp\_word.
+
+Similar to algorithm s\_mp\_mul\_digs, after every pass of the inner loop, the destination is correctly set to the sum of all of the partial 
+results calculated so far.  This involves expensive carry propagation which will be eliminated in the next algorithm.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_sqr.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
+018   int s_mp_sqr (mp_int * a, mp_int * b)
+019   \{
+020     mp_int  t;
+021     int     res, ix, iy, pa;
+022     mp_word r;
+023     mp_digit u, tmpx, *tmpt;
+024   
+025     pa = a->used;
+026     if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) \{
+027       return res;
+028     \}
+029   
+030     /* default used is maximum possible size */
+031     t.used = 2*pa + 1;
+032   
+033     for (ix = 0; ix < pa; ix++) \{
+034       /* first calculate the digit at 2*ix */
+035       /* calculate double precision result */
+036       r = ((mp_word) t.dp[2*ix]) +
+037           ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
+038   
+039       /* store lower part in result */
+040       t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
+041   
+042       /* get the carry */
+043       u           = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+044   
+045       /* left hand side of A[ix] * A[iy] */
+046       tmpx        = a->dp[ix];
+047   
+048       /* alias for where to store the results */
+049       tmpt        = t.dp + (2*ix + 1);
+050       
+051       for (iy = ix + 1; iy < pa; iy++) \{
+052         /* first calculate the product */
+053         r       = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
+054   
+055         /* now calculate the double precision result, note we use
+056          * addition instead of *2 since it's easier to optimize
+057          */
+058         r       = ((mp_word) *tmpt) + r + r + ((mp_word) u);
+059   
+060         /* store lower part */
+061         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+062   
+063         /* get carry */
+064         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+065       \}
+066       /* propagate upwards */
+067       while (u != ((mp_digit) 0)) \{
+068         r       = ((mp_word) *tmpt) + ((mp_word) u);
+069         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+070         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+071       \}
+072     \}
+073   
+074     mp_clamp (&t);
+075     mp_exch (&t, b);
+076     mp_clear (&t);
+077     return MP_OKAY;
+078   \}
+079   #endif
+\end{alltt}
+\end{small}
+
+Inside the outer loop (line 33) the square term is calculated on line 36.  The carry (line 43) has been
+extracted from the mp\_word accumulator using a right shift.  Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized 
+(lines 46 and 49) to simplify the inner loop.  The doubling is performed using two
+additions (line 58) since it is usually faster than shifting, if not at least as fast.  
+
+The important observation is that the inner loop does not begin at $iy = 0$ like for multiplication.  As such the inner loops
+get progressively shorter as the algorithm proceeds.  This is what leads to the savings compared to using a multiplication to
+square a number. 
+
+\subsection{Faster Squaring by the ``Comba'' Method}
+A major drawback to the baseline method is the requirement for single precision shifting inside the $O(n^2)$ nested loop.  Squaring has an additional
+drawback that it must double the product inside the inner loop as well.  As for multiplication, the Comba technique can be used to eliminate these
+performance hazards.
+
+The first obvious solution is to make an array of mp\_words which will hold all of the columns.  This will indeed eliminate all of the carry
+propagation operations from the inner loop.  However, the inner product must still be doubled $O(n^2)$ times.  The solution stems from the simple fact
+that $2a + 2b + 2c = 2(a + b + c)$.  That is the sum of all of the double products is equal to double the sum of all the products.  For example,
+$ab + ba + ac + ca = 2ab + 2ac = 2(ab + ac)$.  
+
+However, we cannot simply double all of the columns, since the squares appear only once per row.  The most practical solution is to have two 
+mp\_word arrays.  One array will hold the squares and the other array will hold the double products.  With both arrays the doubling and 
+carry propagation can be moved to a $O(n)$ work level outside the $O(n^2)$ level.  In this case, we have an even simpler solution in mind.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{fast\_s\_mp\_sqr}. \\
+\textbf{Input}.   mp\_int $a$ \\
+\textbf{Output}.  $b \leftarrow a^2$ \\
+\hline \\
+Place an array of \textbf{MP\_WARRAY} mp\_digits named $W$ on the stack. \\
+1.  If $b.alloc < 2a.used + 1$ then grow $b$ to $2a.used + 1$ digits.  (\textit{mp\_grow}). \\
+2.  If step 1 failed return(\textit{MP\_MEM}). \\
+\\
+3.  $pa \leftarrow 2 \cdot a.used$ \\
+4.  $\hat W1 \leftarrow 0$ \\
+5.  for $ix$ from $0$ to $pa - 1$ do \\
+\hspace{3mm}5.1  $\_ \hat W \leftarrow 0$ \\
+\hspace{3mm}5.2  $ty \leftarrow \mbox{MIN}(a.used - 1, ix)$ \\
+\hspace{3mm}5.3  $tx \leftarrow ix - ty$ \\
+\hspace{3mm}5.4  $iy \leftarrow \mbox{MIN}(a.used - tx, ty + 1)$ \\
+\hspace{3mm}5.5  $iy \leftarrow \mbox{MIN}(iy, \lfloor \left (ty - tx + 1 \right )/2 \rfloor)$ \\
+\hspace{3mm}5.6  for $iz$ from $0$ to $iz - 1$ do \\
+\hspace{6mm}5.6.1  $\_ \hat W \leftarrow \_ \hat W + a_{tx + iz}a_{ty - iz}$ \\
+\hspace{3mm}5.7  $\_ \hat W \leftarrow 2 \cdot \_ \hat W  + \hat W1$ \\
+\hspace{3mm}5.8  if $ix$ is even then \\
+\hspace{6mm}5.8.1  $\_ \hat W \leftarrow \_ \hat W + \left ( a_{\lfloor ix/2 \rfloor}\right )^2$ \\
+\hspace{3mm}5.9  $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$ \\
+\hspace{3mm}5.10  $\hat W1 \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\
+\\
+6.  $oldused \leftarrow b.used$ \\
+7.  $b.used \leftarrow 2 \cdot a.used$ \\
+8.  for $ix$ from $0$ to $pa - 1$ do \\
+\hspace{3mm}8.1  $b_{ix} \leftarrow W_{ix}$ \\
+9.  for $ix$ from $pa$ to $oldused - 1$ do \\
+\hspace{3mm}9.1  $b_{ix} \leftarrow 0$ \\
+10.  Clamp excess digits from $b$.  (\textit{mp\_clamp}) \\
+11.  Return(\textit{MP\_OKAY}). \\ 
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm fast\_s\_mp\_sqr}
+\end{figure}
+
+\textbf{Algorithm fast\_s\_mp\_sqr.}
+This algorithm computes the square of an input using the Comba technique.  It is designed to be a replacement for algorithm 
+s\_mp\_sqr when the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$.  
+This algorithm is very similar to the Comba multiplier except with a few key differences we shall make note of.
+
+First, we have an accumulator and carry variables $\_ \hat W$ and $\hat W1$ respectively.  This is because the inner loop
+products are to be doubled.  If we had added the previous carry in we would be doubling too much.  Next we perform an
+addition MIN condition on $iy$ (step 5.5) to prevent overlapping digits.  For example, $a_3 \cdot a_5$ is equal
+$a_5 \cdot a_3$.  Whereas in the multiplication case we would have $5 < a.used$ and $3 \ge 0$ is maintained since we double the sum
+of the products just outside the inner loop we have to avoid doing this.  This is also a good thing since we perform
+fewer multiplications and the routine ends up being faster.
+
+Finally the last difference is the addition of the ``square'' term outside the inner loop (step 5.8).  We add in the square
+only to even outputs and it is the square of the term at the $\lfloor ix / 2 \rfloor$ position.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_fast\_s\_mp\_sqr.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* the jist of squaring...
+018    * you do like mult except the offset of the tmpx [one that 
+019    * starts closer to zero] can't equal the offset of tmpy.  
+020    * So basically you set up iy like before then you min it with
+021    * (ty-tx) so that it never happens.  You double all those 
+022    * you add in the inner loop
+023   
+024   After that loop you do the squares and add them in.
+025   */
+026   
+027   int fast_s_mp_sqr (mp_int * a, mp_int * b)
+028   \{
+029     int       olduse, res, pa, ix, iz;
+030     mp_digit   W[MP_WARRAY], *tmpx;
+031     mp_word   W1;
+032   
+033     /* grow the destination as required */
+034     pa = a->used + a->used;
+035     if (b->alloc < pa) \{
+036       if ((res = mp_grow (b, pa)) != MP_OKAY) \{
+037         return res;
+038       \}
+039     \}
+040   
+041     /* number of output digits to produce */
+042     W1 = 0;
+043     for (ix = 0; ix < pa; ix++) \{ 
+044         int      tx, ty, iy;
+045         mp_word  _W;
+046         mp_digit *tmpy;
+047   
+048         /* clear counter */
+049         _W = 0;
+050   
+051         /* get offsets into the two bignums */
+052         ty = MIN(a->used-1, ix);
+053         tx = ix - ty;
+054   
+055         /* setup temp aliases */
+056         tmpx = a->dp + tx;
+057         tmpy = a->dp + ty;
+058   
+059         /* this is the number of times the loop will iterrate, essentially
+060            while (tx++ < a->used && ty-- >= 0) \{ ... \}
+061          */
+062         iy = MIN(a->used-tx, ty+1);
+063   
+064         /* now for squaring tx can never equal ty 
+065          * we halve the distance since they approach at a rate of 2x
+066          * and we have to round because odd cases need to be executed
+067          */
+068         iy = MIN(iy, (ty-tx+1)>>1);
+069   
+070         /* execute loop */
+071         for (iz = 0; iz < iy; iz++) \{
+072            _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
+073         \}
+074   
+075         /* double the inner product and add carry */
+076         _W = _W + _W + W1;
+077   
+078         /* even columns have the square term in them */
+079         if ((ix&1) == 0) \{
+080            _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
+081         \}
+082   
+083         /* store it */
+084         W[ix] = (mp_digit)(_W & MP_MASK);
+085   
+086         /* make next carry */
+087         W1 = _W >> ((mp_word)DIGIT_BIT);
+088     \}
+089   
+090     /* setup dest */
+091     olduse  = b->used;
+092     b->used = a->used+a->used;
+093   
+094     \{
+095       mp_digit *tmpb;
+096       tmpb = b->dp;
+097       for (ix = 0; ix < pa; ix++) \{
+098         *tmpb++ = W[ix] & MP_MASK;
+099       \}
+100   
+101       /* clear unused digits [that existed in the old copy of c] */
+102       for (; ix < olduse; ix++) \{
+103         *tmpb++ = 0;
+104       \}
+105     \}
+106     mp_clamp (b);
+107     return MP_OKAY;
+108   \}
+109   #endif
+\end{alltt}
+\end{small}
+
+This implementation is essentially a copy of Comba multiplication with the appropriate changes added to make it faster for 
+the special case of squaring.  
+
+\subsection{Polynomial Basis Squaring}
+The same algorithm that performs optimal polynomial basis multiplication can be used to perform polynomial basis squaring.  The minor exception
+is that $\zeta_y = f(y)g(y)$ is actually equivalent to $\zeta_y = f(y)^2$ since $f(y) = g(y)$.  Instead of performing $2n + 1$
+multiplications to find the $\zeta$ relations, squaring operations are performed instead.  
+
+\subsection{Karatsuba Squaring}
+Let $f(x) = ax + b$ represent the polynomial basis representation of a number to square.  
+Let $h(x) = \left ( f(x) \right )^2$ represent the square of the polynomial.  The Karatsuba equation can be modified to square a 
+number with the following equation.
+
+\begin{equation}
+h(x) = a^2x^2 + \left (a^2 + b^2 - (a - b)^2 \right )x + b^2
+\end{equation}
+
+Upon closer inspection this equation only requires the calculation of three half-sized squares: $a^2$, $b^2$ and $(a - b)^2$.  As in 
+Karatsuba multiplication, this algorithm can be applied recursively on the input and will achieve an asymptotic running time of 
+$O \left ( n^{lg(3)} \right )$.
+
+If the asymptotic times of Karatsuba squaring and multiplication are the same, why not simply use the multiplication algorithm 
+instead?  The answer to this arises from the cutoff point for squaring.  As in multiplication there exists a cutoff point, at which the 
+time required for a Comba based squaring and a Karatsuba based squaring meet.  Due to the overhead inherent in the Karatsuba method, the cutoff 
+point is fairly high.  For example, on an AMD Athlon XP processor with $\beta = 2^{28}$, the cutoff point is around 127 digits.  
+
+Consider squaring a 200 digit number with this technique.  It will be split into two 100 digit halves which are subsequently squared.  
+The 100 digit halves will not be squared using Karatsuba, but instead using the faster Comba based squaring algorithm.  If Karatsuba multiplication
+were used instead, the 100 digit numbers would be squared with a slower Comba based multiplication.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_karatsuba\_sqr}. \\
+\textbf{Input}.   mp\_int $a$ \\
+\textbf{Output}.  $b \leftarrow a^2$ \\
+\hline \\
+1.  Initialize the following temporary mp\_ints:  $x0$, $x1$, $t1$, $t2$, $x0x0$ and $x1x1$. \\
+2.  If any of the initializations on step 1 failed return(\textit{MP\_MEM}). \\
+\\
+Split the input.  e.g. $a = x1\beta^B + x0$ \\
+3.  $B \leftarrow \lfloor a.used / 2 \rfloor$ \\
+4.  $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+5.  $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_lshd}) \\
+\\
+Calculate the three squares. \\
+6.  $x0x0 \leftarrow x0^2$ (\textit{mp\_sqr}) \\
+7.  $x1x1 \leftarrow x1^2$ \\
+8.  $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
+9.  $t1 \leftarrow t1^2$ \\
+\\
+Compute the middle term. \\
+10.  $t2 \leftarrow x0x0 + x1x1$ (\textit{s\_mp\_add}) \\
+11.  $t1 \leftarrow t2 - t1$ \\
+\\
+Compute final product. \\
+12.  $t1 \leftarrow t1\beta^B$ (\textit{mp\_lshd}) \\
+13.  $x1x1 \leftarrow x1x1\beta^{2B}$ \\
+14.  $t1 \leftarrow t1 + x0x0$ \\
+15.  $b \leftarrow t1 + x1x1$ \\
+16.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_karatsuba\_sqr}
+\end{figure}
+
+\textbf{Algorithm mp\_karatsuba\_sqr.}
+This algorithm computes the square of an input $a$ using the Karatsuba technique.  This algorithm is very similar to the Karatsuba based
+multiplication algorithm with the exception that the three half-size multiplications have been replaced with three half-size squarings.
+
+The radix point for squaring is simply placed exactly in the middle of the digits when the input has an odd number of digits, otherwise it is
+placed just below the middle.  Step 3, 4 and 5 compute the two halves required using $B$
+as the radix point.  The first two squares in steps 6 and 7 are rather straightforward while the last square is of a more compact form.
+
+By expanding $\left (x1 - x0 \right )^2$, the $x1^2$ and $x0^2$ terms in the middle disappear, that is $x1^2 + x0^2 - (x1 - x0)^2 = 2 \cdot x0 \cdot x1$.
+Now if $5n$ single precision additions and a squaring of $n$-digits is faster than multiplying two $n$-digit numbers and doubling then
+this method is faster.  Assuming no further recursions occur, the difference can be estimated with the following inequality.
+
+Let $p$ represent the cost of a single precision addition and $q$ the cost of a single precision multiplication both in terms of time\footnote{Or
+machine clock cycles.}. 
+
+\begin{equation}
+5pn +{{q(n^2 + n)} \over 2} \le pn + qn^2
+\end{equation}
+
+For example, on an AMD Athlon XP processor $p = {1 \over 3}$ and $q = 6$.  This implies that the following inequality should hold.
+\begin{center}
+\begin{tabular}{rcl}
+${5n \over 3} + 3n^2 + 3n$     & $<$ & ${n \over 3} + 6n^2$ \\
+${5 \over 3} + 3n + 3$     & $<$ & ${1 \over 3} + 6n$ \\
+${13 \over 9}$     & $<$ & $n$ \\
+\end{tabular}
+\end{center}
+
+This results in a cutoff point around $n = 2$.  As a consequence it is actually faster to compute the middle term the ``long way'' on processors
+where multiplication is substantially slower\footnote{On the Athlon there is a 1:17 ratio between clock cycles for addition and multiplication.  On
+the Intel P4 processor this ratio is 1:29 making this method even more beneficial.  The only common exception is the ARMv4 processor which has a
+ratio of 1:7.  } than simpler operations such as addition.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_karatsuba\_sqr.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* Karatsuba squaring, computes b = a*a using three 
+018    * half size squarings
+019    *
+020    * See comments of karatsuba_mul for details.  It 
+021    * is essentially the same algorithm but merely 
+022    * tuned to perform recursive squarings.
+023    */
+024   int mp_karatsuba_sqr (mp_int * a, mp_int * b)
+025   \{
+026     mp_int  x0, x1, t1, t2, x0x0, x1x1;
+027     int     B, err;
+028   
+029     err = MP_MEM;
+030   
+031     /* min # of digits */
+032     B = a->used;
+033   
+034     /* now divide in two */
+035     B = B >> 1;
+036   
+037     /* init copy all the temps */
+038     if (mp_init_size (&x0, B) != MP_OKAY)
+039       goto ERR;
+040     if (mp_init_size (&x1, a->used - B) != MP_OKAY)
+041       goto X0;
+042   
+043     /* init temps */
+044     if (mp_init_size (&t1, a->used * 2) != MP_OKAY)
+045       goto X1;
+046     if (mp_init_size (&t2, a->used * 2) != MP_OKAY)
+047       goto T1;
+048     if (mp_init_size (&x0x0, B * 2) != MP_OKAY)
+049       goto T2;
+050     if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY)
+051       goto X0X0;
+052   
+053     \{
+054       register int x;
+055       register mp_digit *dst, *src;
+056   
+057       src = a->dp;
+058   
+059       /* now shift the digits */
+060       dst = x0.dp;
+061       for (x = 0; x < B; x++) \{
+062         *dst++ = *src++;
+063       \}
+064   
+065       dst = x1.dp;
+066       for (x = B; x < a->used; x++) \{
+067         *dst++ = *src++;
+068       \}
+069     \}
+070   
+071     x0.used = B;
+072     x1.used = a->used - B;
+073   
+074     mp_clamp (&x0);
+075   
+076     /* now calc the products x0*x0 and x1*x1 */
+077     if (mp_sqr (&x0, &x0x0) != MP_OKAY)
+078       goto X1X1;           /* x0x0 = x0*x0 */
+079     if (mp_sqr (&x1, &x1x1) != MP_OKAY)
+080       goto X1X1;           /* x1x1 = x1*x1 */
+081   
+082     /* now calc (x1-x0)**2 */
+083     if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+084       goto X1X1;           /* t1 = x1 - x0 */
+085     if (mp_sqr (&t1, &t1) != MP_OKAY)
+086       goto X1X1;           /* t1 = (x1 - x0) * (x1 - x0) */
+087   
+088     /* add x0y0 */
+089     if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
+090       goto X1X1;           /* t2 = x0x0 + x1x1 */
+091     if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
+092       goto X1X1;           /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */
+093   
+094     /* shift by B */
+095     if (mp_lshd (&t1, B) != MP_OKAY)
+096       goto X1X1;           /* t1 = (x0x0 + x1x1 - (x1-x0)*(x1-x0))<<B */
+097     if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
+098       goto X1X1;           /* x1x1 = x1x1 << 2*B */
+099   
+100     if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
+101       goto X1X1;           /* t1 = x0x0 + t1 */
+102     if (mp_add (&t1, &x1x1, b) != MP_OKAY)
+103       goto X1X1;           /* t1 = x0x0 + t1 + x1x1 */
+104   
+105     err = MP_OKAY;
+106   
+107   X1X1:mp_clear (&x1x1);
+108   X0X0:mp_clear (&x0x0);
+109   T2:mp_clear (&t2);
+110   T1:mp_clear (&t1);
+111   X1:mp_clear (&x1);
+112   X0:mp_clear (&x0);
+113   ERR:
+114     return err;
+115   \}
+116   #endif
+\end{alltt}
+\end{small}
+
+This implementation is largely based on the implementation of algorithm mp\_karatsuba\_mul.  It uses the same inline style to copy and 
+shift the input into the two halves.  The loop from line 53 to line 69 has been modified since only one input exists.  The \textbf{used}
+count of both $x0$ and $x1$ is fixed up and $x0$ is clamped before the calculations begin.  At this point $x1$ and $x0$ are valid equivalents
+to the respective halves as if mp\_rshd and mp\_mod\_2d had been used.  
+
+By inlining the copy and shift operations the cutoff point for Karatsuba multiplication can be lowered.  On the Athlon the cutoff point
+is exactly at the point where Comba squaring can no longer be used (\textit{128 digits}).  On slower processors such as the Intel P4
+it is actually below the Comba limit (\textit{at 110 digits}).
+
+This routine uses the same error trap coding style as mp\_karatsuba\_sqr.  As the temporary variables are initialized errors are 
+redirected to the error trap higher up.  If the algorithm completes without error the error code is set to \textbf{MP\_OKAY} and 
+mp\_clears are executed normally.
+
+\subsection{Toom-Cook Squaring}
+The Toom-Cook squaring algorithm mp\_toom\_sqr is heavily based on the algorithm mp\_toom\_mul with the exception that squarings are used
+instead of multiplication to find the five relations.  The reader is encouraged to read the description of the latter algorithm and try to 
+derive their own Toom-Cook squaring algorithm.  
+
+\subsection{High Level Squaring}
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_sqr}. \\
+\textbf{Input}.   mp\_int $a$ \\
+\textbf{Output}.  $b \leftarrow a^2$ \\
+\hline \\
+1.  If $a.used \ge TOOM\_SQR\_CUTOFF$ then  \\
+\hspace{3mm}1.1  $b \leftarrow a^2$ using algorithm mp\_toom\_sqr \\
+2.  else if $a.used \ge KARATSUBA\_SQR\_CUTOFF$ then \\
+\hspace{3mm}2.1  $b \leftarrow a^2$ using algorithm mp\_karatsuba\_sqr \\
+3.  else \\
+\hspace{3mm}3.1  $digs \leftarrow a.used + b.used + 1$ \\
+\hspace{3mm}3.2  If $digs < MP\_ARRAY$ and $a.used \le \delta$ then \\
+\hspace{6mm}3.2.1  $b \leftarrow a^2$ using algorithm fast\_s\_mp\_sqr.  \\
+\hspace{3mm}3.3  else \\
+\hspace{6mm}3.3.1  $b \leftarrow a^2$ using algorithm s\_mp\_sqr.  \\
+4.  $b.sign \leftarrow MP\_ZPOS$ \\
+5.  Return the result of the unsigned squaring performed. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_sqr}
+\end{figure}
+
+\textbf{Algorithm mp\_sqr.}
+This algorithm computes the square of the input using one of four different algorithms.  If the input is very large and has at least
+\textbf{TOOM\_SQR\_CUTOFF} or \textbf{KARATSUBA\_SQR\_CUTOFF} digits then either the Toom-Cook or the Karatsuba Squaring algorithm is used.  If
+neither of the polynomial basis algorithms should be used then either the Comba or baseline algorithm is used.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_sqr.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* computes b = a*a */
+018   int
+019   mp_sqr (mp_int * a, mp_int * b)
+020   \{
+021     int     res;
+022   
+023   #ifdef BN_MP_TOOM_SQR_C
+024     /* use Toom-Cook? */
+025     if (a->used >= TOOM_SQR_CUTOFF) \{
+026       res = mp_toom_sqr(a, b);
+027     /* Karatsuba? */
+028     \} else 
+029   #endif
+030   #ifdef BN_MP_KARATSUBA_SQR_C
+031   if (a->used >= KARATSUBA_SQR_CUTOFF) \{
+032       res = mp_karatsuba_sqr (a, b);
+033     \} else 
+034   #endif
+035     \{
+036   #ifdef BN_FAST_S_MP_SQR_C
+037       /* can we use the fast comba multiplier? */
+038       if ((a->used * 2 + 1) < MP_WARRAY && 
+039            a->used < 
+040            (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) \{
+041         res = fast_s_mp_sqr (a, b);
+042       \} else
+043   #endif
+044   #ifdef BN_S_MP_SQR_C
+045         res = s_mp_sqr (a, b);
+046   #else
+047         res = MP_VAL;
+048   #endif
+049     \}
+050     b->sign = MP_ZPOS;
+051     return res;
+052   \}
+053   #endif
+\end{alltt}
+\end{small}
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 3 \right ] $ & Devise an efficient algorithm for selection of the radix point to handle inputs \\
+                      & that have different number of digits in Karatsuba multiplication. \\
+                      & \\
+$\left [ 2 \right ] $ & In section 5.3 the fact that every column of a squaring is made up \\
+                      & of double products and at most one square is stated.  Prove this statement. \\
+                      & \\                      
+$\left [ 3 \right ] $ & Prove the equation for Karatsuba squaring. \\
+                      & \\
+$\left [ 1 \right ] $ & Prove that Karatsuba squaring requires $O \left (n^{lg(3)} \right )$ time. \\
+                      & \\ 
+$\left [ 2 \right ] $ & Determine the minimal ratio between addition and multiplication clock cycles \\
+                      & required for equation $6.7$ to be true.  \\
+                      & \\
+$\left [ 3 \right ] $ & Implement a threaded version of Comba multiplication (and squaring) where you \\
+                      & compute subsets of the columns in each thread.  Determine a cutoff point where \\
+                      & it is effective and add the logic to mp\_mul() and mp\_sqr(). \\
+                      &\\
+$\left [ 4 \right ] $ & Same as the previous but also modify the Karatsuba and Toom-Cook.  You must \\
+                      & increase the throughput of mp\_exptmod() for random odd moduli in the range \\
+                      & $512 \ldots 4096$ bits significantly ($> 2x$) to complete this challenge. \\
+                      & \\
+\end{tabular}
+
+\chapter{Modular Reduction}
+\section{Basics of Modular Reduction}
+\index{modular residue}
+Modular reduction is an operation that arises quite often within public key cryptography algorithms and various number theoretic algorithms, 
+such as factoring.  Modular reduction algorithms are the third class of algorithms of the ``multipliers'' set.  A number $a$ is said to be \textit{reduced}
+modulo another number $b$ by finding the remainder of the division $a/b$.  Full integer division with remainder is a topic to be covered 
+in~\ref{sec:division}.
+
+Modular reduction is equivalent to solving for $r$ in the following equation.  $a = bq + r$ where $q = \lfloor a/b \rfloor$.  The result 
+$r$ is said to be ``congruent to $a$ modulo $b$'' which is also written as $r \equiv a \mbox{ (mod }b\mbox{)}$.  In other vernacular $r$ is known as the 
+``modular residue'' which leads to ``quadratic residue''\footnote{That's fancy talk for $b \equiv a^2 \mbox{ (mod }p\mbox{)}$.} and
+other forms of residues.  
+
+Modular reductions are normally used to create either finite groups, rings or fields.  The most common usage for performance driven modular reductions 
+is in modular exponentiation algorithms.  That is to compute $d = a^b \mbox{ (mod }c\mbox{)}$ as fast as possible.  This operation is used in the 
+RSA and Diffie-Hellman public key algorithms, for example.  Modular multiplication and squaring also appears as a fundamental operation in 
+elliptic curve cryptographic algorithms.  As will be discussed in the subsequent chapter there exist fast algorithms for computing modular 
+exponentiations without having to perform (\textit{in this example}) $b - 1$ multiplications.  These algorithms will produce partial results in the 
+range $0 \le x < c^2$ which can be taken advantage of to create several efficient algorithms.   They have also been used to create redundancy check 
+algorithms known as CRCs, error correction codes such as Reed-Solomon and solve a variety of number theoeretic problems.  
+
+\section{The Barrett Reduction}
+The Barrett reduction algorithm \cite{BARRETT} was inspired by fast division algorithms which multiply by the reciprocal to emulate
+division.  Barretts observation was that the residue $c$ of $a$ modulo $b$ is equal to 
+
+\begin{equation}
+c = a - b \cdot \lfloor a/b \rfloor
+\end{equation}
+
+Since algorithms such as modular exponentiation would be using the same modulus extensively, typical DSP\footnote{It is worth noting that Barrett's paper 
+targeted the DSP56K processor.}  intuition would indicate the next step would be to replace $a/b$ by a multiplication by the reciprocal.  However, 
+DSP intuition on its own will not work as these numbers are considerably larger than the precision of common DSP floating point data types.  
+It would take another common optimization to optimize the algorithm.
+
+\subsection{Fixed Point Arithmetic}
+The trick used to optimize the above equation is based on a technique of emulating floating point data types with fixed precision integers.  Fixed
+point arithmetic would become very popular as it greatly optimize the ``3d-shooter'' genre of games in the mid 1990s when floating point units were 
+fairly slow if not unavailable.   The idea behind fixed point arithmetic is to take a normal $k$-bit integer data type and break it into $p$-bit 
+integer and a $q$-bit fraction part (\textit{where $p+q = k$}).  
+
+In this system a $k$-bit integer $n$ would actually represent $n/2^q$.  For example, with $q = 4$ the integer $n = 37$ would actually represent the
+value $2.3125$.  To multiply two fixed point numbers the integers are multiplied using traditional arithmetic and subsequently normalized by 
+moving the implied decimal point back to where it should be.  For example, with $q = 4$ to multiply the integers $9$ and $5$ they must be converted 
+to fixed point first by multiplying by $2^q$.  Let $a = 9(2^q)$ represent the fixed point representation of $9$ and $b = 5(2^q)$ represent the 
+fixed point representation of $5$.  The product $ab$ is equal to $45(2^{2q})$ which when normalized by dividing by $2^q$ produces $45(2^q)$.  
+
+This technique became popular since a normal integer multiplication and logical shift right are the only required operations to perform a multiplication
+of two fixed point numbers.  Using fixed point arithmetic, division can be easily approximated by multiplying by the reciprocal.  If $2^q$ is 
+equivalent to one than $2^q/b$ is equivalent to the fixed point approximation of $1/b$ using real arithmetic.  Using this fact dividing an integer 
+$a$ by another integer $b$ can be achieved with the following expression.
+
+\begin{equation}
+\lfloor a / b \rfloor \mbox{ }\approx\mbox{ } \lfloor (a \cdot \lfloor 2^q / b \rfloor)/2^q \rfloor
+\end{equation}
+
+The precision of the division is proportional to the value of $q$.  If the divisor $b$ is used frequently as is the case with 
+modular exponentiation pre-computing $2^q/b$ will allow a division to be performed with a multiplication and a right shift.  Both operations
+are considerably faster than division on most processors.  
+
+Consider dividing $19$ by $5$.  The correct result is $\lfloor 19/5 \rfloor = 3$.  With $q = 3$ the reciprocal is $\lfloor 2^q/5 \rfloor = 1$ which
+leads to a product of $19$ which when divided by $2^q$ produces $2$.  However, with $q = 4$ the reciprocal is $\lfloor 2^q/5 \rfloor = 3$ and
+the result of the emulated division is $\lfloor 3 \cdot 19 / 2^q \rfloor = 3$ which is correct.  The value of $2^q$ must be close to or ideally
+larger than the dividend.  In effect if $a$ is the dividend then $q$ should allow $0 \le \lfloor a/2^q \rfloor \le 1$ in order for this approach
+to work correctly.  Plugging this form of divison into the original equation the following modular residue equation arises.
+
+\begin{equation}
+c = a - b \cdot \lfloor (a \cdot \lfloor 2^q / b \rfloor)/2^q \rfloor
+\end{equation}
+
+Using the notation from \cite{BARRETT} the value of $\lfloor 2^q / b \rfloor$ will be represented by the $\mu$ symbol.  Using the $\mu$
+variable also helps re-inforce the idea that it is meant to be computed once and re-used.
+
+\begin{equation}
+c = a - b \cdot \lfloor (a \cdot \mu)/2^q \rfloor
+\end{equation}
+
+Provided that $2^q \ge a$ this algorithm will produce a quotient that is either exactly correct or off by a value of one.  In the context of Barrett
+reduction the value of $a$ is bound by $0 \le a \le (b - 1)^2$ meaning that $2^q \ge b^2$ is sufficient to ensure the reciprocal will have enough
+precision.  
+
+Let $n$ represent the number of digits in $b$.  This algorithm requires approximately $2n^2$ single precision multiplications to produce the quotient and 
+another $n^2$ single precision multiplications to find the residue.  In total $3n^2$ single precision multiplications are required to 
+reduce the number.  
+
+For example, if $b = 1179677$ and $q = 41$ ($2^q > b^2$), then the reciprocal $\mu$ is equal to $\lfloor 2^q / b \rfloor = 1864089$.  Consider reducing
+$a = 180388626447$ modulo $b$ using the above reduction equation.  The quotient using the new formula is $\lfloor (a \cdot \mu) / 2^q \rfloor = 152913$.
+By subtracting $152913b$ from $a$ the correct residue $a \equiv 677346 \mbox{ (mod }b\mbox{)}$ is found.
+
+\subsection{Choosing a Radix Point}
+Using the fixed point representation a modular reduction can be performed with $3n^2$ single precision multiplications.  If that were the best
+that could be achieved a full division\footnote{A division requires approximately $O(2cn^2)$ single precision multiplications for a small value of $c$.  
+See~\ref{sec:division} for further details.} might as well be used in its place.  The key to optimizing the reduction is to reduce the precision of
+the initial multiplication that finds the quotient.  
+
+Let $a$ represent the number of which the residue is sought.  Let $b$ represent the modulus used to find the residue.  Let $m$ represent
+the number of digits in $b$.  For the purposes of this discussion we will assume that the number of digits in $a$ is $2m$, which is generally true if 
+two $m$-digit numbers have been multiplied.  Dividing $a$ by $b$ is the same as dividing a $2m$ digit integer by a $m$ digit integer.  Digits below the 
+$m - 1$'th digit of $a$ will contribute at most a value of $1$ to the quotient because $\beta^k < b$ for any $0 \le k \le m - 1$.  Another way to
+express this is by re-writing $a$ as two parts.  If $a' \equiv a \mbox{ (mod }b^m\mbox{)}$ and $a'' = a - a'$ then 
+${a \over b} \equiv {{a' + a''} \over b}$ which is equivalent to ${a' \over b} + {a'' \over b}$.  Since $a'$ is bound to be less than $b$ the quotient
+is bound by $0 \le {a' \over b} < 1$.
+
+Since the digits of $a'$ do not contribute much to the quotient the observation is that they might as well be zero.  However, if the digits 
+``might as well be zero'' they might as well not be there in the first place.  Let $q_0 = \lfloor a/\beta^{m-1} \rfloor$ represent the input
+with the irrelevant digits trimmed.  Now the modular reduction is trimmed to the almost equivalent equation
+
+\begin{equation}
+c = a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor
+\end{equation}
+
+Note that the original divisor $2^q$ has been replaced with $\beta^{m+1}$ where in this case $q$ is a multiple of $lg(\beta)$. Also note that the 
+exponent on the divisor when added to the amount $q_0$ was shifted by equals $2m$.  If the optimization had not been performed the divisor 
+would have the exponent $2m$ so in the end the exponents do ``add up''. Using the above equation the quotient 
+$\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ can be off from the true quotient by at most two.  The original fixed point quotient can be off
+by as much as one (\textit{provided the radix point is chosen suitably}) and now that the lower irrelevent digits have been trimmed the quotient
+can be off by an additional value of one for a total of at most two.  This implies that 
+$0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$.  By first subtracting $b$ times the quotient and then conditionally subtracting 
+$b$ once or twice the residue is found.
+
+The quotient is now found using $(m + 1)(m) = m^2 + m$ single precision multiplications and the residue with an additional $m^2$ single
+precision multiplications, ignoring the subtractions required.  In total $2m^2 + m$ single precision multiplications are required to find the residue.  
+This is considerably faster than the original attempt.
+
+For example, let $\beta = 10$ represent the radix of the digits.  Let $b = 9999$ represent the modulus which implies $m = 4$. Let $a = 99929878$ 
+represent the value of which the residue is desired.  In this case $q = 8$ since $10^7 < 9999^2$ meaning that $\mu = \lfloor \beta^{q}/b \rfloor = 10001$.  
+With the new observation the multiplicand for the quotient is equal to $q_0 = \lfloor a / \beta^{m - 1} \rfloor = 99929$.  The quotient is then 
+$\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor = 9993$.  Subtracting $9993b$ from $a$ and the correct residue $a \equiv 9871 \mbox{ (mod }b\mbox{)}$ 
+is found.  
+
+\subsection{Trimming the Quotient}
+So far the reduction algorithm has been optimized from $3m^2$ single precision multiplications down to $2m^2 + m$ single precision multiplications.  As 
+it stands now the algorithm is already fairly fast compared to a full integer division algorithm.  However, there is still room for
+optimization.  
+
+After the first multiplication inside the quotient ($q_0 \cdot \mu$) the value is shifted right by $m + 1$ places effectively nullifying the lower
+half of the product.  It would be nice to be able to remove those digits from the product to effectively cut down the number of single precision 
+multiplications.  If the number of digits in the modulus $m$ is far less than $\beta$ a full product is not required for the algorithm to work properly.  
+In fact the lower $m - 2$ digits will not affect the upper half of the product at all and do not need to be computed.  
+
+The value of $\mu$ is a $m$-digit number and $q_0$ is a $m + 1$ digit number.  Using a full multiplier $(m + 1)(m) = m^2 + m$ single precision
+multiplications would be required.  Using a multiplier that will only produce digits at and above the $m - 1$'th digit reduces the number
+of single precision multiplications to ${m^2 + m} \over 2$ single precision multiplications.  
+
+\subsection{Trimming the Residue}
+After the quotient has been calculated it is used to reduce the input.  As previously noted the algorithm is not exact and it can be off by a small
+multiple of the modulus, that is $0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$.  If $b$ is $m$ digits than the 
+result of reduction equation is a value of at most $m + 1$ digits (\textit{provided $3 < \beta$}) implying that the upper $m - 1$ digits are
+implicitly zero.  
+
+The next optimization arises from this very fact.  Instead of computing $b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ using a full
+$O(m^2)$ multiplication algorithm only the lower $m+1$ digits of the product have to be computed.  Similarly the value of $a$ can
+be reduced modulo $\beta^{m+1}$ before the multiple of $b$ is subtracted which simplifes the subtraction as well.  A multiplication that produces 
+only the lower $m+1$ digits requires ${m^2 + 3m - 2} \over 2$ single precision multiplications.  
+
+With both optimizations in place the algorithm is the algorithm Barrett proposed.  It requires $m^2 + 2m - 1$ single precision multiplications which
+is considerably faster than the straightforward $3m^2$ method.  
+
+\subsection{The Barrett Algorithm}
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce}. \\
+\textbf{Input}.   mp\_int $a$, mp\_int $b$ and $\mu = \lfloor \beta^{2m}/b \rfloor, m = \lceil lg_{\beta}(b) \rceil, (0 \le a < b^2, b > 1)$ \\
+\textbf{Output}.  $a \mbox{ (mod }b\mbox{)}$ \\
+\hline \\
+Let $m$ represent the number of digits in $b$.  \\
+1.  Make a copy of $a$ and store it in $q$.  (\textit{mp\_init\_copy}) \\
+2.  $q \leftarrow \lfloor q / \beta^{m - 1} \rfloor$ (\textit{mp\_rshd}) \\
+\\
+Produce the quotient. \\
+3.  $q \leftarrow q \cdot \mu$  (\textit{note: only produce digits at or above $m-1$}) \\
+4.  $q \leftarrow \lfloor q / \beta^{m + 1} \rfloor$ \\
+\\
+Subtract the multiple of modulus from the input. \\
+5.  $a \leftarrow a \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+6.  $q \leftarrow q \cdot b \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{s\_mp\_mul\_digs}) \\
+7.  $a \leftarrow a - q$ (\textit{mp\_sub}) \\
+\\
+Add $\beta^{m+1}$ if a carry occured. \\
+8.  If $a < 0$ then (\textit{mp\_cmp\_d}) \\
+\hspace{3mm}8.1  $q \leftarrow 1$ (\textit{mp\_set}) \\
+\hspace{3mm}8.2  $q \leftarrow q \cdot \beta^{m+1}$ (\textit{mp\_lshd}) \\
+\hspace{3mm}8.3  $a \leftarrow a + q$ \\
+\\
+Now subtract the modulus if the residue is too large (e.g. quotient too small). \\
+9.  While $a \ge b$ do (\textit{mp\_cmp}) \\
+\hspace{3mm}9.1  $c \leftarrow a - b$ \\
+10.  Clear $q$. \\
+11.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce.}
+This algorithm will reduce the input $a$ modulo $b$ in place using the Barrett algorithm.  It is loosely based on algorithm 14.42 of HAC
+\cite[pp.  602]{HAC} which is based on the paper from Paul Barrett \cite{BARRETT}.  The algorithm has several restrictions and assumptions which must 
+be adhered to for the algorithm to work.
+
+First the modulus $b$ is assumed to be positive and greater than one.  If the modulus were less than or equal to one than subtracting
+a multiple of it would either accomplish nothing or actually enlarge the input.  The input $a$ must be in the range $0 \le a < b^2$ in order
+for the quotient to have enough precision.  If $a$ is the product of two numbers that were already reduced modulo $b$, this will not be a problem.
+Technically the algorithm will still work if $a \ge b^2$ but it will take much longer to finish.  The value of $\mu$ is passed as an argument to this 
+algorithm and is assumed to be calculated and stored before the algorithm is used.  
+
+Recall that the multiplication for the quotient on step 3 must only produce digits at or above the $m-1$'th position.  An algorithm called 
+$s\_mp\_mul\_high\_digs$ which has not been presented is used to accomplish this task.  The algorithm is based on $s\_mp\_mul\_digs$ except that
+instead of stopping at a given level of precision it starts at a given level of precision.  This optimal algorithm can only be used if the number
+of digits in $b$ is very much smaller than $\beta$.  
+
+While it is known that 
+$a \ge b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ only the lower $m+1$ digits are being used to compute the residue, so an implied 
+``borrow'' from the higher digits might leave a negative result.  After the multiple of the modulus has been subtracted from $a$ the residue must be 
+fixed up in case it is negative.  The invariant $\beta^{m+1}$ must be added to the residue to make it positive again.  
+
+The while loop at step 9 will subtract $b$ until the residue is less than $b$.  If the algorithm is performed correctly this step is 
+performed at most twice, and on average once. However, if $a \ge b^2$ than it will iterate substantially more times than it should.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* reduces x mod m, assumes 0 < x < m**2, mu is 
+018    * precomputed via mp_reduce_setup.
+019    * From HAC pp.604 Algorithm 14.42
+020    */
+021   int mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
+022   \{
+023     mp_int  q;
+024     int     res, um = m->used;
+025   
+026     /* q = x */
+027     if ((res = mp_init_copy (&q, x)) != MP_OKAY) \{
+028       return res;
+029     \}
+030   
+031     /* q1 = x / b**(k-1)  */
+032     mp_rshd (&q, um - 1);         
+033   
+034     /* according to HAC this optimization is ok */
+035     if (((unsigned long) um) > (((mp_digit)1) << (DIGIT_BIT - 1))) \{
+036       if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) \{
+037         goto CLEANUP;
+038       \}
+039     \} else \{
+040   #ifdef BN_S_MP_MUL_HIGH_DIGS_C
+041       if ((res = s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) \{
+042         goto CLEANUP;
+043       \}
+044   #elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
+045       if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) \{
+046         goto CLEANUP;
+047       \}
+048   #else 
+049       \{ 
+050         res = MP_VAL;
+051         goto CLEANUP;
+052       \}
+053   #endif
+054     \}
+055   
+056     /* q3 = q2 / b**(k+1) */
+057     mp_rshd (&q, um + 1);         
+058   
+059     /* x = x mod b**(k+1), quick (no division) */
+060     if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) \{
+061       goto CLEANUP;
+062     \}
+063   
+064     /* q = q * m mod b**(k+1), quick (no division) */
+065     if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) \{
+066       goto CLEANUP;
+067     \}
+068   
+069     /* x = x - q */
+070     if ((res = mp_sub (x, &q, x)) != MP_OKAY) \{
+071       goto CLEANUP;
+072     \}
+073   
+074     /* If x < 0, add b**(k+1) to it */
+075     if (mp_cmp_d (x, 0) == MP_LT) \{
+076       mp_set (&q, 1);
+077       if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
+078         goto CLEANUP;
+079       if ((res = mp_add (x, &q, x)) != MP_OKAY)
+080         goto CLEANUP;
+081     \}
+082   
+083     /* Back off if it's too big */
+084     while (mp_cmp (x, m) != MP_LT) \{
+085       if ((res = s_mp_sub (x, m, x)) != MP_OKAY) \{
+086         goto CLEANUP;
+087       \}
+088     \}
+089     
+090   CLEANUP:
+091     mp_clear (&q);
+092   
+093     return res;
+094   \}
+095   #endif
+\end{alltt}
+\end{small}
+
+The first multiplication that determines the quotient can be performed by only producing the digits from $m - 1$ and up.  This essentially halves
+the number of single precision multiplications required.  However, the optimization is only safe if $\beta$ is much larger than the number of digits
+in the modulus.  In the source code this is evaluated on lines 36 to 43 where algorithm s\_mp\_mul\_high\_digs is used when it is
+safe to do so.  
+
+\subsection{The Barrett Setup Algorithm}
+In order to use algorithm mp\_reduce the value of $\mu$ must be calculated in advance.  Ideally this value should be computed once and stored for
+future use so that the Barrett algorithm can be used without delay.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce\_setup}. \\
+\textbf{Input}.   mp\_int $a$ ($a > 1$)  \\
+\textbf{Output}.  $\mu \leftarrow \lfloor \beta^{2m}/a \rfloor$ \\
+\hline \\
+1.  $\mu \leftarrow 2^{2 \cdot lg(\beta) \cdot  m}$ (\textit{mp\_2expt}) \\
+2.  $\mu \leftarrow \lfloor \mu / b \rfloor$ (\textit{mp\_div}) \\
+3.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce\_setup}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce\_setup.}
+This algorithm computes the reciprocal $\mu$ required for Barrett reduction.  First $\beta^{2m}$ is calculated as $2^{2 \cdot lg(\beta) \cdot  m}$ which
+is equivalent and much faster.  The final value is computed by taking the integer quotient of $\lfloor \mu / b \rfloor$.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_setup.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* pre-calculate the value required for Barrett reduction
+018    * For a given modulus "b" it calulates the value required in "a"
+019    */
+020   int mp_reduce_setup (mp_int * a, mp_int * b)
+021   \{
+022     int     res;
+023     
+024     if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) \{
+025       return res;
+026     \}
+027     return mp_div (a, b, a, NULL);
+028   \}
+029   #endif
+\end{alltt}
+\end{small}
+
+This simple routine calculates the reciprocal $\mu$ required by Barrett reduction.  Note the extended usage of algorithm mp\_div where the variable
+which would received the remainder is passed as NULL.  As will be discussed in~\ref{sec:division} the division routine allows both the quotient and the 
+remainder to be passed as NULL meaning to ignore the value.  
+
+\section{The Montgomery Reduction}
+Montgomery reduction\footnote{Thanks to Niels Ferguson for his insightful explanation of the algorithm.} \cite{MONT} is by far the most interesting 
+form of reduction in common use.  It computes a modular residue which is not actually equal to the residue of the input yet instead equal to a 
+residue times a constant.  However, as perplexing as this may sound the algorithm is relatively simple and very efficient.  
+
+Throughout this entire section the variable $n$ will represent the modulus used to form the residue.  As will be discussed shortly the value of
+$n$ must be odd.  The variable $x$ will represent the quantity of which the residue is sought.  Similar to the Barrett algorithm the input
+is restricted to $0 \le x < n^2$.  To begin the description some simple number theory facts must be established.
+
+\textbf{Fact 1.}  Adding $n$ to $x$ does not change the residue since in effect it adds one to the quotient $\lfloor x / n \rfloor$.  Another way
+to explain this is that $n$ is (\textit{or multiples of $n$ are}) congruent to zero modulo $n$.  Adding zero will not change the value of the residue.  
+
+\textbf{Fact 2.}  If $x$ is even then performing a division by two in $\Z$ is congruent to $x \cdot 2^{-1} \mbox{ (mod }n\mbox{)}$.  Actually
+this is an application of the fact that if $x$ is evenly divisible by any $k \in \Z$ then division in $\Z$ will be congruent to 
+multiplication by $k^{-1}$ modulo $n$.  
+
+From these two simple facts the following simple algorithm can be derived.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Montgomery Reduction}. \\
+\textbf{Input}.   Integer $x$, $n$ and $k$ \\
+\textbf{Output}.  $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+1.  for $t$ from $1$ to $k$ do \\
+\hspace{3mm}1.1  If $x$ is odd then \\
+\hspace{6mm}1.1.1  $x \leftarrow x + n$ \\
+\hspace{3mm}1.2  $x \leftarrow x/2$ \\
+2.  Return $x$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Montgomery Reduction}
+\end{figure}
+
+The algorithm reduces the input one bit at a time using the two congruencies stated previously.  Inside the loop $n$, which is odd, is
+added to $x$ if $x$ is odd.  This forces $x$ to be even which allows the division by two in $\Z$ to be congruent to a modular division by two.  Since
+$x$ is assumed to be initially much larger than $n$ the addition of $n$ will contribute an insignificant magnitude to $x$.  Let $r$ represent the 
+final result of the Montgomery algorithm.  If $k > lg(n)$ and $0 \le x < n^2$ then the final result is limited to 
+$0 \le r < \lfloor x/2^k \rfloor + n$.  As a result at most a single subtraction is required to get the residue desired.
+
+\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|l|}
+\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} \\
+\hline $1$ & $x + n = 5812$, $x/2 = 2906$ \\
+\hline $2$ & $x/2 = 1453$ \\
+\hline $3$ & $x + n = 1710$, $x/2 = 855$ \\
+\hline $4$ & $x + n = 1112$, $x/2 = 556$ \\
+\hline $5$ & $x/2 = 278$ \\
+\hline $6$ & $x/2 = 139$ \\
+\hline $7$ & $x + n = 396$, $x/2 = 198$ \\
+\hline $8$ & $x/2 = 99$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Example of Montgomery Reduction (I)}
+\label{fig:MONT1}
+\end{figure}
+
+Consider the example in figure~\ref{fig:MONT1} which reduces $x = 5555$ modulo $n = 257$ when $k = 8$.  The result of the algorithm $r = 99$ is
+congruent to the value of $2^{-8} \cdot 5555 \mbox{ (mod }257\mbox{)}$.  When $r$ is multiplied by $2^8$ modulo $257$ the correct residue 
+$r \equiv 158$ is produced.  
+
+Let $k = \lfloor lg(n) \rfloor + 1$ represent the number of bits in $n$.  The current algorithm requires $2k^2$ single precision shifts
+and $k^2$ single precision additions.  At this rate the algorithm is most certainly slower than Barrett reduction and not terribly useful.  
+Fortunately there exists an alternative representation of the algorithm.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Montgomery Reduction} (modified I). \\
+\textbf{Input}.   Integer $x$, $n$ and $k$ \\
+\textbf{Output}.  $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+1.  for $t$ from $0$ to $k - 1$ do \\
+\hspace{3mm}1.1  If the $t$'th bit of $x$ is one then \\
+\hspace{6mm}1.1.1  $x \leftarrow x + 2^tn$ \\
+2.  Return $x/2^k$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Montgomery Reduction (modified I)}
+\end{figure}
+
+This algorithm is equivalent since $2^tn$ is a multiple of $n$ and the lower $k$ bits of $x$ are zero by step 2.  The number of single
+precision shifts has now been reduced from $2k^2$ to $k^2 + k$ which is only a small improvement.
+
+\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|l|r|}
+\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} & \textbf{Result ($x$) in Binary} \\
+\hline -- & $5555$ & $1010110110011$ \\
+\hline $1$ & $x + 2^{0}n = 5812$ &  $1011010110100$ \\
+\hline $2$ & $5812$ & $1011010110100$ \\
+\hline $3$ & $x + 2^{2}n = 6840$ & $1101010111000$ \\
+\hline $4$ & $x + 2^{3}n = 8896$ & $10001011000000$ \\
+\hline $5$ & $8896$ & $10001011000000$ \\
+\hline $6$ & $8896$ & $10001011000000$ \\
+\hline $7$ & $x + 2^{6}n = 25344$ & $110001100000000$ \\
+\hline $8$ & $25344$ & $110001100000000$ \\
+\hline -- & $x/2^k = 99$ & \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Example of Montgomery Reduction (II)}
+\label{fig:MONT2}
+\end{figure}
+
+Figure~\ref{fig:MONT2} demonstrates the modified algorithm reducing $x = 5555$ modulo $n = 257$ with $k = 8$. 
+With this algorithm a single shift right at the end is the only right shift required to reduce the input instead of $k$ right shifts inside the 
+loop.  Note that for the iterations $t = 2, 5, 6$ and $8$ where the result $x$ is not changed.  In those iterations the $t$'th bit of $x$ is 
+zero and the appropriate multiple of $n$ does not need to be added to force the $t$'th bit of the result to zero.  
+
+\subsection{Digit Based Montgomery Reduction}
+Instead of computing the reduction on a bit-by-bit basis it is actually much faster to compute it on digit-by-digit basis.  Consider the
+previous algorithm re-written to compute the Montgomery reduction in this new fashion.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Montgomery Reduction} (modified II). \\
+\textbf{Input}.   Integer $x$, $n$ and $k$ \\
+\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+1.  for $t$ from $0$ to $k - 1$ do \\
+\hspace{3mm}1.1  $x \leftarrow x + \mu n \beta^t$ \\
+2.  Return $x/\beta^k$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Montgomery Reduction (modified II)}
+\end{figure}
+
+The value $\mu n \beta^t$ is a multiple of the modulus $n$ meaning that it will not change the residue.  If the first digit of 
+the value $\mu n \beta^t$ equals the negative (modulo $\beta$) of the $t$'th digit of $x$ then the addition will result in a zero digit.  This
+problem breaks down to solving the following congruency.  
+
+\begin{center}
+\begin{tabular}{rcl}
+$x_t + \mu n_0$ & $\equiv$ & $0 \mbox{ (mod }\beta\mbox{)}$ \\
+$\mu n_0$ & $\equiv$ & $-x_t \mbox{ (mod }\beta\mbox{)}$ \\
+$\mu$ & $\equiv$ & $-x_t/n_0 \mbox{ (mod }\beta\mbox{)}$ \\
+\end{tabular}
+\end{center}
+
+In each iteration of the loop on step 1 a new value of $\mu$ must be calculated.  The value of $-1/n_0 \mbox{ (mod }\beta\mbox{)}$ is used 
+extensively in this algorithm and should be precomputed.  Let $\rho$ represent the negative of the modular inverse of $n_0$ modulo $\beta$.  
+
+For example, let $\beta = 10$ represent the radix.  Let $n = 17$ represent the modulus which implies $k = 2$ and $\rho \equiv 7$.  Let $x = 33$ 
+represent the value to reduce.
+
+\newpage\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|c|}
+\hline \textbf{Step ($t$)} & \textbf{Value of $x$} & \textbf{Value of $\mu$} \\
+\hline --                 & $33$ & --\\
+\hline $0$                 & $33 + \mu n = 50$ & $1$ \\
+\hline $1$                 & $50 + \mu n \beta = 900$ & $5$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Example of Montgomery Reduction}
+\end{figure}
+
+The final result $900$ is then divided by $\beta^k$ to produce the final result $9$.  The first observation is that $9 \nequiv x \mbox{ (mod }n\mbox{)}$ 
+which implies the result is not the modular residue of $x$ modulo $n$.  However, recall that the residue is actually multiplied by $\beta^{-k}$ in
+the algorithm.  To get the true residue the value must be multiplied by $\beta^k$.  In this case $\beta^k \equiv 15 \mbox{ (mod }n\mbox{)}$ and
+the correct residue is $9 \cdot 15 \equiv 16 \mbox{ (mod }n\mbox{)}$.  
+
+\subsection{Baseline Montgomery Reduction}
+The baseline Montgomery reduction algorithm will produce the residue for any size input.  It is designed to be a catch-all algororithm for 
+Montgomery reductions.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_montgomery\_reduce}. \\
+\textbf{Input}.   mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }n\mbox{)}$. \\
+\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\
+\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+1.  $digs \leftarrow 2n.used + 1$ \\
+2.  If $digs < MP\_ARRAY$ and $m.used < \delta$ then \\
+\hspace{3mm}2.1  Use algorithm fast\_mp\_montgomery\_reduce instead. \\
+\\
+Setup $x$ for the reduction. \\
+3.  If $x.alloc < digs$ then grow $x$ to $digs$ digits. \\
+4.  $x.used \leftarrow digs$ \\
+\\
+Eliminate the lower $k$ digits. \\
+5.  For $ix$ from $0$ to $k - 1$ do \\
+\hspace{3mm}5.1  $\mu \leftarrow x_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}5.2  $u \leftarrow 0$ \\
+\hspace{3mm}5.3  For $iy$ from $0$ to $k - 1$ do \\
+\hspace{6mm}5.3.1  $\hat r \leftarrow \mu n_{iy} + x_{ix + iy} + u$ \\
+\hspace{6mm}5.3.2  $x_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}5.3.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+\hspace{3mm}5.4  While $u > 0$ do \\
+\hspace{6mm}5.4.1  $iy \leftarrow iy + 1$ \\
+\hspace{6mm}5.4.2  $x_{ix + iy} \leftarrow x_{ix + iy} + u$ \\
+\hspace{6mm}5.4.3  $u \leftarrow \lfloor x_{ix+iy} / \beta \rfloor$ \\
+\hspace{6mm}5.4.4  $x_{ix + iy} \leftarrow x_{ix+iy} \mbox{ (mod }\beta\mbox{)}$ \\
+\\
+Divide by $\beta^k$ and fix up as required. \\
+6.  $x \leftarrow \lfloor x / \beta^k \rfloor$ \\
+7.  If $x \ge n$ then \\
+\hspace{3mm}7.1  $x \leftarrow x - n$ \\
+8.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_montgomery\_reduce}
+\end{figure}
+
+\textbf{Algorithm mp\_montgomery\_reduce.}
+This algorithm reduces the input $x$ modulo $n$ in place using the Montgomery reduction algorithm.  The algorithm is loosely based
+on algorithm 14.32 of \cite[pp.601]{HAC} except it merges the multiplication of $\mu n \beta^t$ with the addition in the inner loop.  The
+restrictions on this algorithm are fairly easy to adapt to.  First $0 \le x < n^2$ bounds the input to numbers in the same range as 
+for the Barrett algorithm.  Additionally if $n > 1$ and $n$ is odd there will exist a modular inverse $\rho$.  $\rho$ must be calculated in
+advance of this algorithm.  Finally the variable $k$ is fixed and a pseudonym for $n.used$.  
+
+Step 2 decides whether a faster Montgomery algorithm can be used.  It is based on the Comba technique meaning that there are limits on
+the size of the input.  This algorithm is discussed in sub-section 6.3.3.
+
+Step 5 is the main reduction loop of the algorithm.  The value of $\mu$ is calculated once per iteration in the outer loop.  The inner loop
+calculates $x + \mu n \beta^{ix}$ by multiplying $\mu n$ and adding the result to $x$ shifted by $ix$ digits.  Both the addition and
+multiplication are performed in the same loop to save time and memory.  Step 5.4 will handle any additional carries that escape the inner loop.
+
+Using a quick inspection this algorithm requires $n$ single precision multiplications for the outer loop and $n^2$ single precision multiplications 
+in the inner loop.  In total $n^2 + n$ single precision multiplications which compares favourably to Barrett at $n^2 + 2n - 1$ single precision
+multiplications.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_montgomery\_reduce.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* computes xR**-1 == x (mod N) via Montgomery Reduction */
+018   int
+019   mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
+020   \{
+021     int     ix, res, digs;
+022     mp_digit mu;
+023   
+024     /* can the fast reduction [comba] method be used?
+025      *
+026      * Note that unlike in mul you're safely allowed *less*
+027      * than the available columns [255 per default] since carries
+028      * are fixed up in the inner loop.
+029      */
+030     digs = n->used * 2 + 1;
+031     if ((digs < MP_WARRAY) &&
+032         n->used <
+033         (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{
+034       return fast_mp_montgomery_reduce (x, n, rho);
+035     \}
+036   
+037     /* grow the input as required */
+038     if (x->alloc < digs) \{
+039       if ((res = mp_grow (x, digs)) != MP_OKAY) \{
+040         return res;
+041       \}
+042     \}
+043     x->used = digs;
+044   
+045     for (ix = 0; ix < n->used; ix++) \{
+046       /* mu = ai * rho mod b
+047        *
+048        * The value of rho must be precalculated via
+049        * montgomery_setup() such that
+050        * it equals -1/n0 mod b this allows the
+051        * following inner loop to reduce the
+052        * input one digit at a time
+053        */
+054       mu = (mp_digit) (((mp_word)x->dp[ix]) * ((mp_word)rho) & MP_MASK);
+055   
+056       /* a = a + mu * m * b**i */
+057       \{
+058         register int iy;
+059         register mp_digit *tmpn, *tmpx, u;
+060         register mp_word r;
+061   
+062         /* alias for digits of the modulus */
+063         tmpn = n->dp;
+064   
+065         /* alias for the digits of x [the input] */
+066         tmpx = x->dp + ix;
+067   
+068         /* set the carry to zero */
+069         u = 0;
+070   
+071         /* Multiply and add in place */
+072         for (iy = 0; iy < n->used; iy++) \{
+073           /* compute product and sum */
+074           r       = ((mp_word)mu) * ((mp_word)*tmpn++) +
+075                     ((mp_word) u) + ((mp_word) * tmpx);
+076   
+077           /* get carry */
+078           u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+079   
+080           /* fix digit */
+081           *tmpx++ = (mp_digit)(r & ((mp_word) MP_MASK));
+082         \}
+083         /* At this point the ix'th digit of x should be zero */
+084   
+085   
+086         /* propagate carries upwards as required*/
+087         while (u) \{
+088           *tmpx   += u;
+089           u        = *tmpx >> DIGIT_BIT;
+090           *tmpx++ &= MP_MASK;
+091         \}
+092       \}
+093     \}
+094   
+095     /* at this point the n.used'th least
+096      * significant digits of x are all zero
+097      * which means we can shift x to the
+098      * right by n.used digits and the
+099      * residue is unchanged.
+100      */
+101   
+102     /* x = x/b**n.used */
+103     mp_clamp(x);
+104     mp_rshd (x, n->used);
+105   
+106     /* if x >= n then x = x - n */
+107     if (mp_cmp_mag (x, n) != MP_LT) \{
+108       return s_mp_sub (x, n, x);
+109     \}
+110   
+111     return MP_OKAY;
+112   \}
+113   #endif
+\end{alltt}
+\end{small}
+
+This is the baseline implementation of the Montgomery reduction algorithm.  Lines 30 to 35 determine if the Comba based
+routine can be used instead.  Line 48 computes the value of $\mu$ for that particular iteration of the outer loop.  
+
+The multiplication $\mu n \beta^{ix}$ is performed in one step in the inner loop.  The alias $tmpx$ refers to the $ix$'th digit of $x$ and
+the alias $tmpn$ refers to the modulus $n$.  
+
+\subsection{Faster ``Comba'' Montgomery Reduction}
+
+The Montgomery reduction requires fewer single precision multiplications than a Barrett reduction, however it is much slower due to the serial
+nature of the inner loop.  The Barrett reduction algorithm requires two slightly modified multipliers which can be implemented with the Comba
+technique.  The Montgomery reduction algorithm cannot directly use the Comba technique to any significant advantage since the inner loop calculates
+a $k \times 1$ product $k$ times. 
+
+The biggest obstacle is that at the $ix$'th iteration of the outer loop the value of $x_{ix}$ is required to calculate $\mu$.  This means the 
+carries from $0$ to $ix - 1$ must have been propagated upwards to form a valid $ix$'th digit.  The solution as it turns out is very simple.  
+Perform a Comba like multiplier and inside the outer loop just after the inner loop fix up the $ix + 1$'th digit by forwarding the carry.  
+
+With this change in place the Montgomery reduction algorithm can be performed with a Comba style multiplication loop which substantially increases
+the speed of the algorithm.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{fast\_mp\_montgomery\_reduce}. \\
+\textbf{Input}.   mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }n\mbox{)}$. \\
+\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\
+\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+Place an array of \textbf{MP\_WARRAY} mp\_word variables called $\hat W$ on the stack. \\
+1.  if $x.alloc < n.used + 1$ then grow $x$ to $n.used + 1$ digits. \\
+Copy the digits of $x$ into the array $\hat W$ \\
+2.  For $ix$ from $0$ to $x.used - 1$ do \\
+\hspace{3mm}2.1  $\hat W_{ix} \leftarrow x_{ix}$ \\
+3.  For $ix$ from $x.used$ to $2n.used - 1$ do \\
+\hspace{3mm}3.1  $\hat W_{ix} \leftarrow 0$ \\
+Elimiate the lower $k$ digits. \\
+4.  for $ix$ from $0$ to $n.used - 1$ do \\
+\hspace{3mm}4.1  $\mu \leftarrow \hat W_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}4.2  For $iy$ from $0$ to $n.used - 1$ do \\
+\hspace{6mm}4.2.1  $\hat W_{iy + ix} \leftarrow \hat W_{iy + ix} + \mu \cdot n_{iy}$ \\
+\hspace{3mm}4.3  $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\
+Propagate carries upwards. \\
+5.  for $ix$ from $n.used$ to $2n.used + 1$ do \\
+\hspace{3mm}5.1  $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\
+Shift right and reduce modulo $\beta$ simultaneously. \\
+6.  for $ix$ from $0$ to $n.used + 1$ do \\
+\hspace{3mm}6.1  $x_{ix} \leftarrow \hat W_{ix + n.used} \mbox{ (mod }\beta\mbox{)}$ \\
+Zero excess digits and fixup $x$. \\
+7.  if $x.used > n.used + 1$ then do \\
+\hspace{3mm}7.1  for $ix$ from $n.used + 1$ to $x.used - 1$ do \\
+\hspace{6mm}7.1.1  $x_{ix} \leftarrow 0$ \\
+8.  $x.used \leftarrow n.used + 1$ \\
+9.  Clamp excessive digits of $x$. \\
+10.  If $x \ge n$ then \\
+\hspace{3mm}10.1  $x \leftarrow x - n$ \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm fast\_mp\_montgomery\_reduce}
+\end{figure}
+
+\textbf{Algorithm fast\_mp\_montgomery\_reduce.}
+This algorithm will compute the Montgomery reduction of $x$ modulo $n$ using the Comba technique.  It is on most computer platforms significantly
+faster than algorithm mp\_montgomery\_reduce and algorithm mp\_reduce (\textit{Barrett reduction}).  The algorithm has the same restrictions
+on the input as the baseline reduction algorithm.  An additional two restrictions are imposed on this algorithm.  The number of digits $k$ in the 
+the modulus $n$ must not violate $MP\_WARRAY > 2k +1$ and $n < \delta$.   When $\beta = 2^{28}$ this algorithm can be used to reduce modulo
+a modulus of at most $3,556$ bits in length.  
+
+As in the other Comba reduction algorithms there is a $\hat W$ array which stores the columns of the product.  It is initially filled with the
+contents of $x$ with the excess digits zeroed.  The reduction loop is very similar the to the baseline loop at heart.  The multiplication on step
+4.1 can be single precision only since $ab \mbox{ (mod }\beta\mbox{)} \equiv (a \mbox{ mod }\beta)(b \mbox{ mod }\beta)$.  Some multipliers such
+as those on the ARM processors take a variable length time to complete depending on the number of bytes of result it must produce.  By performing
+a single precision multiplication instead half the amount of time is spent.
+
+Also note that digit $\hat W_{ix}$ must have the carry from the $ix - 1$'th digit propagated upwards in order for this to work.  That is what step
+4.3 will do.  In effect over the $n.used$ iterations of the outer loop the $n.used$'th lower columns all have the their carries propagated forwards.  Note
+how the upper bits of those same words are not reduced modulo $\beta$.  This is because those values will be discarded shortly and there is no
+point.
+
+Step 5 will propagate the remainder of the carries upwards.  On step 6 the columns are reduced modulo $\beta$ and shifted simultaneously as they are
+stored in the destination $x$.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_fast\_mp\_montgomery\_reduce.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* computes xR**-1 == x (mod N) via Montgomery Reduction
+018    *
+019    * This is an optimized implementation of montgomery_reduce
+020    * which uses the comba method to quickly calculate the columns of the
+021    * reduction.
+022    *
+023    * Based on Algorithm 14.32 on pp.601 of HAC.
+024   */
+025   int fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
+026   \{
+027     int     ix, res, olduse;
+028     mp_word W[MP_WARRAY];
+029   
+030     /* get old used count */
+031     olduse = x->used;
+032   
+033     /* grow a as required */
+034     if (x->alloc < n->used + 1) \{
+035       if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) \{
+036         return res;
+037       \}
+038     \}
+039   
+040     /* first we have to get the digits of the input into
+041      * an array of double precision words W[...]
+042      */
+043     \{
+044       register mp_word *_W;
+045       register mp_digit *tmpx;
+046   
+047       /* alias for the W[] array */
+048       _W   = W;
+049   
+050       /* alias for the digits of  x*/
+051       tmpx = x->dp;
+052   
+053       /* copy the digits of a into W[0..a->used-1] */
+054       for (ix = 0; ix < x->used; ix++) \{
+055         *_W++ = *tmpx++;
+056       \}
+057   
+058       /* zero the high words of W[a->used..m->used*2] */
+059       for (; ix < n->used * 2 + 1; ix++) \{
+060         *_W++ = 0;
+061       \}
+062     \}
+063   
+064     /* now we proceed to zero successive digits
+065      * from the least significant upwards
+066      */
+067     for (ix = 0; ix < n->used; ix++) \{
+068       /* mu = ai * m' mod b
+069        *
+070        * We avoid a double precision multiplication (which isn't required)
+071        * by casting the value down to a mp_digit.  Note this requires
+072        * that W[ix-1] have  the carry cleared (see after the inner loop)
+073        */
+074       register mp_digit mu;
+075       mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
+076   
+077       /* a = a + mu * m * b**i
+078        *
+079        * This is computed in place and on the fly.  The multiplication
+080        * by b**i is handled by offseting which columns the results
+081        * are added to.
+082        *
+083        * Note the comba method normally doesn't handle carries in the
+084        * inner loop In this case we fix the carry from the previous
+085        * column since the Montgomery reduction requires digits of the
+086        * result (so far) [see above] to work.  This is
+087        * handled by fixing up one carry after the inner loop.  The
+088        * carry fixups are done in order so after these loops the
+089        * first m->used words of W[] have the carries fixed
+090        */
+091       \{
+092         register int iy;
+093         register mp_digit *tmpn;
+094         register mp_word *_W;
+095   
+096         /* alias for the digits of the modulus */
+097         tmpn = n->dp;
+098   
+099         /* Alias for the columns set by an offset of ix */
+100         _W = W + ix;
+101   
+102         /* inner loop */
+103         for (iy = 0; iy < n->used; iy++) \{
+104             *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
+105         \}
+106       \}
+107   
+108       /* now fix carry for next digit, W[ix+1] */
+109       W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
+110     \}
+111   
+112     /* now we have to propagate the carries and
+113      * shift the words downward [all those least
+114      * significant digits we zeroed].
+115      */
+116     \{
+117       register mp_digit *tmpx;
+118       register mp_word *_W, *_W1;
+119   
+120       /* nox fix rest of carries */
+121   
+122       /* alias for current word */
+123       _W1 = W + ix;
+124   
+125       /* alias for next word, where the carry goes */
+126       _W = W + ++ix;
+127   
+128       for (; ix <= n->used * 2 + 1; ix++) \{
+129         *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
+130       \}
+131   
+132       /* copy out, A = A/b**n
+133        *
+134        * The result is A/b**n but instead of converting from an
+135        * array of mp_word to mp_digit than calling mp_rshd
+136        * we just copy them in the right order
+137        */
+138   
+139       /* alias for destination word */
+140       tmpx = x->dp;
+141   
+142       /* alias for shifted double precision result */
+143       _W = W + n->used;
+144   
+145       for (ix = 0; ix < n->used + 1; ix++) \{
+146         *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
+147       \}
+148   
+149       /* zero oldused digits, if the input a was larger than
+150        * m->used+1 we'll have to clear the digits
+151        */
+152       for (; ix < olduse; ix++) \{
+153         *tmpx++ = 0;
+154       \}
+155     \}
+156   
+157     /* set the max used and clamp */
+158     x->used = n->used + 1;
+159     mp_clamp (x);
+160   
+161     /* if A >= m then A = A - m */
+162     if (mp_cmp_mag (x, n) != MP_LT) \{
+163       return s_mp_sub (x, n, x);
+164     \}
+165     return MP_OKAY;
+166   \}
+167   #endif
+\end{alltt}
+\end{small}
+
+The $\hat W$ array is first filled with digits of $x$ on line 50 then the rest of the digits are zeroed on line 54.  Both loops share
+the same alias variables to make the code easier to read.  
+
+The value of $\mu$ is calculated in an interesting fashion.  First the value $\hat W_{ix}$ is reduced modulo $\beta$ and cast to a mp\_digit.  This
+forces the compiler to use a single precision multiplication and prevents any concerns about loss of precision.   Line 109 fixes the carry 
+for the next iteration of the loop by propagating the carry from $\hat W_{ix}$ to $\hat W_{ix+1}$.
+
+The for loop on line 108 propagates the rest of the carries upwards through the columns.  The for loop on line 125 reduces the columns
+modulo $\beta$ and shifts them $k$ places at the same time.  The alias $\_ \hat W$ actually refers to the array $\hat W$ starting at the $n.used$'th
+digit, that is $\_ \hat W_{t} = \hat W_{n.used + t}$.  
+
+\subsection{Montgomery Setup}
+To calculate the variable $\rho$ a relatively simple algorithm will be required.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_montgomery\_setup}. \\
+\textbf{Input}.   mp\_int $n$ ($n > 1$ and $(n, 2) = 1$) \\
+\textbf{Output}.  $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$ \\
+\hline \\
+1.  $b \leftarrow n_0$ \\
+2.  If $b$ is even return(\textit{MP\_VAL}) \\
+3.  $x \leftarrow ((b + 2) \mbox{ AND } 4) << 1) + b$ \\
+4.  for $k$ from 0 to $\lceil lg(lg(\beta)) \rceil - 2$ do \\
+\hspace{3mm}4.1  $x \leftarrow x \cdot (2 - bx)$ \\
+5.  $\rho \leftarrow \beta - x \mbox{ (mod }\beta\mbox{)}$ \\
+6.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_montgomery\_setup} 
+\end{figure}
+
+\textbf{Algorithm mp\_montgomery\_setup.}
+This algorithm will calculate the value of $\rho$ required within the Montgomery reduction algorithms.  It uses a very interesting trick 
+to calculate $1/n_0$ when $\beta$ is a power of two.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_montgomery\_setup.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* setups the montgomery reduction stuff */
+018   int
+019   mp_montgomery_setup (mp_int * n, mp_digit * rho)
+020   \{
+021     mp_digit x, b;
+022   
+023   /* fast inversion mod 2**k
+024    *
+025    * Based on the fact that
+026    *
+027    * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
+028    *                    =>  2*X*A - X*X*A*A = 1
+029    *                    =>  2*(1) - (1)     = 1
+030    */
+031     b = n->dp[0];
+032   
+033     if ((b & 1) == 0) \{
+034       return MP_VAL;
+035     \}
+036   
+037     x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
+038     x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
+039   #if !defined(MP_8BIT)
+040     x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
+041   #endif
+042   #if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT))
+043     x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
+044   #endif
+045   #ifdef MP_64BIT
+046     x *= 2 - b * x;               /* here x*a==1 mod 2**64 */
+047   #endif
+048   
+049     /* rho = -1/m mod b */
+050     *rho = (((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
+051   
+052     return MP_OKAY;
+053   \}
+054   #endif
+\end{alltt}
+\end{small}
+
+This source code computes the value of $\rho$ required to perform Montgomery reduction.  It has been modified to avoid performing excess
+multiplications when $\beta$ is not the default 28-bits.  
+
+\section{The Diminished Radix Algorithm}
+The Diminished Radix method of modular reduction \cite{DRMET} is a fairly clever technique which can be more efficient than either the Barrett
+or Montgomery methods for certain forms of moduli.  The technique is based on the following simple congruence.
+
+\begin{equation}
+(x \mbox{ mod } n) + k \lfloor x / n \rfloor \equiv x \mbox{ (mod }(n - k)\mbox{)}
+\end{equation}
+
+This observation was used in the MMB \cite{MMB} block cipher to create a diffusion primitive.  It used the fact that if $n = 2^{31}$ and $k=1$ that 
+then a x86 multiplier could produce the 62-bit product and use  the ``shrd'' instruction to perform a double-precision right shift.  The proof
+of the above equation is very simple.  First write $x$ in the product form.
+
+\begin{equation}
+x = qn + r
+\end{equation}
+
+Now reduce both sides modulo $(n - k)$.
+
+\begin{equation}
+x \equiv qk + r  \mbox{ (mod }(n-k)\mbox{)}
+\end{equation}
+
+The variable $n$ reduces modulo $n - k$ to $k$.  By putting $q = \lfloor x/n \rfloor$ and $r = x \mbox{ mod } n$ 
+into the equation the original congruence is reproduced, thus concluding the proof.  The following algorithm is based on this observation.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Diminished Radix Reduction}. \\
+\textbf{Input}.   Integer $x$, $n$, $k$ \\
+\textbf{Output}.  $x \mbox{ mod } (n - k)$ \\
+\hline \\
+1.  $q \leftarrow \lfloor x / n \rfloor$ \\
+2.  $q \leftarrow k \cdot q$ \\
+3.  $x \leftarrow x \mbox{ (mod }n\mbox{)}$ \\
+4.  $x \leftarrow x + q$ \\
+5.  If $x \ge (n - k)$ then \\
+\hspace{3mm}5.1  $x \leftarrow x - (n - k)$ \\
+\hspace{3mm}5.2  Goto step 1. \\
+6.  Return $x$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Diminished Radix Reduction}
+\label{fig:DR}
+\end{figure}
+
+This algorithm will reduce $x$ modulo $n - k$ and return the residue.  If $0 \le x < (n - k)^2$ then the algorithm will loop almost always
+once or twice and occasionally three times.  For simplicity sake the value of $x$ is bounded by the following simple polynomial.
+
+\begin{equation} 
+0 \le x < n^2 + k^2 - 2nk
+\end{equation}
+
+The true bound is  $0 \le x < (n - k - 1)^2$ but this has quite a few more terms.  The value of $q$ after step 1 is bounded by the following.
+
+\begin{equation}
+q < n - 2k - k^2/n
+\end{equation}
+
+Since $k^2$ is going to be considerably smaller than $n$ that term will always be zero.  The value of $x$ after step 3 is bounded trivially as
+$0 \le x < n$.  By step four the sum $x + q$ is bounded by 
+
+\begin{equation}
+0 \le q + x < (k + 1)n - 2k^2 - 1
+\end{equation}
+
+With a second pass $q$ will be loosely bounded by $0 \le q < k^2$ after step 2 while $x$ will still be loosely bounded by $0 \le x < n$ after step 3.  After the second pass it is highly unlike that the
+sum in step 4 will exceed $n - k$.  In practice fewer than three passes of the algorithm are required to reduce virtually every input in the 
+range $0 \le x < (n - k - 1)^2$.  
+
+\begin{figure}
+\begin{small}
+\begin{center}
+\begin{tabular}{|l|}
+\hline
+$x = 123456789, n = 256, k = 3$ \\
+\hline $q \leftarrow \lfloor x/n \rfloor = 482253$ \\
+$q \leftarrow q*k = 1446759$ \\
+$x \leftarrow x \mbox{ mod } n = 21$ \\
+$x \leftarrow x + q = 1446780$ \\
+$x \leftarrow x - (n - k) = 1446527$ \\
+\hline 
+$q \leftarrow \lfloor x/n \rfloor = 5650$ \\
+$q \leftarrow q*k = 16950$ \\
+$x \leftarrow x \mbox{ mod } n = 127$ \\
+$x \leftarrow x + q = 17077$ \\
+$x \leftarrow x - (n - k) = 16824$ \\
+\hline 
+$q \leftarrow \lfloor x/n \rfloor = 65$ \\
+$q \leftarrow q*k = 195$ \\
+$x \leftarrow x \mbox{ mod } n = 184$ \\
+$x \leftarrow x + q = 379$ \\
+$x \leftarrow x - (n - k) = 126$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Example Diminished Radix Reduction}
+\label{fig:EXDR}
+\end{figure}
+
+Figure~\ref{fig:EXDR} demonstrates the reduction of $x = 123456789$ modulo $n - k = 253$ when $n = 256$ and $k = 3$.  Note that even while $x$
+is considerably larger than $(n - k - 1)^2 = 63504$ the algorithm still converges on the modular residue exceedingly fast.  In this case only
+three passes were required to find the residue $x \equiv 126$.
+
+
+\subsection{Choice of Moduli}
+On the surface this algorithm looks like a very expensive algorithm.  It requires a couple of subtractions followed by multiplication and other
+modular reductions.  The usefulness of this algorithm becomes exceedingly clear when an appropriate modulus is chosen.
+
+Division in general is a very expensive operation to perform.  The one exception is when the division is by a power of the radix of representation used.  
+Division by ten for example is simple for pencil and paper mathematics since it amounts to shifting the decimal place to the right.  Similarly division 
+by two (\textit{or powers of two}) is very simple for binary computers to perform.  It would therefore seem logical to choose $n$ of the form $2^p$ 
+which would imply that $\lfloor x / n \rfloor$ is a simple shift of $x$ right $p$ bits.  
+
+However, there is one operation related to division of power of twos that is even faster than this.  If $n = \beta^p$ then the division may be 
+performed by moving whole digits to the right $p$ places.  In practice division by $\beta^p$ is much faster than division by $2^p$ for any $p$.  
+Also with the choice of $n = \beta^p$ reducing $x$ modulo $n$ merely requires zeroing the digits above the $p-1$'th digit of $x$.  
+
+Throughout the next section the term ``restricted modulus'' will refer to a modulus of the form $\beta^p - k$ whereas the term ``unrestricted
+modulus'' will refer to a modulus of the form $2^p - k$.  The word ``restricted'' in this case refers to the fact that it is based on the 
+$2^p$ logic except $p$ must be a multiple of $lg(\beta)$.  
+
+\subsection{Choice of $k$}
+Now that division and reduction (\textit{step 1 and 3 of figure~\ref{fig:DR}}) have been optimized to simple digit operations the multiplication by $k$
+in step 2 is the most expensive operation.  Fortunately the choice of $k$ is not terribly limited.  For all intents and purposes it might
+as well be a single digit.  The smaller the value of $k$ is the faster the algorithm will be.  
+
+\subsection{Restricted Diminished Radix Reduction}
+The restricted Diminished Radix algorithm can quickly reduce an input modulo a modulus of the form $n = \beta^p - k$.  This algorithm can reduce 
+an input $x$ within the range $0 \le x < n^2$ using only a couple passes of the algorithm demonstrated in figure~\ref{fig:DR}.  The implementation
+of this algorithm has been optimized to avoid additional overhead associated with a division by $\beta^p$, the multiplication by $k$ or the addition 
+of $x$ and $q$.  The resulting algorithm is very efficient and can lead to substantial improvements over Barrett and Montgomery reduction when modular 
+exponentiations are performed.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_dr\_reduce}. \\
+\textbf{Input}.   mp\_int $x$, $n$ and a mp\_digit $k = \beta - n_0$ \\
+\hspace{11.5mm}($0 \le x < n^2$, $n > 1$, $0 < k < \beta$) \\
+\textbf{Output}.  $x \mbox{ mod } n$ \\
+\hline \\
+1.  $m \leftarrow n.used$ \\
+2.  If $x.alloc < 2m$ then grow $x$ to $2m$ digits. \\
+3.  $\mu \leftarrow 0$ \\
+4.  for $i$ from $0$ to $m - 1$ do \\
+\hspace{3mm}4.1  $\hat r \leftarrow k \cdot x_{m+i} + x_{i} + \mu$ \\
+\hspace{3mm}4.2  $x_{i} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}4.3  $\mu \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+5.  $x_{m} \leftarrow \mu$ \\
+6.  for $i$ from $m + 1$ to $x.used - 1$ do \\
+\hspace{3mm}6.1  $x_{i} \leftarrow 0$ \\
+7.  Clamp excess digits of $x$. \\
+8.  If $x \ge n$ then \\
+\hspace{3mm}8.1  $x \leftarrow x - n$ \\
+\hspace{3mm}8.2  Goto step 3. \\
+9.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_dr\_reduce}
+\end{figure}
+
+\textbf{Algorithm mp\_dr\_reduce.}
+This algorithm will perform the Dimished Radix reduction of $x$ modulo $n$.  It has similar restrictions to that of the Barrett reduction
+with the addition that $n$ must be of the form $n = \beta^m - k$ where $0 < k <\beta$.  
+
+This algorithm essentially implements the pseudo-code in figure~\ref{fig:DR} except with a slight optimization.  The division by $\beta^m$, multiplication by $k$
+and addition of $x \mbox{ mod }\beta^m$ are all performed simultaneously inside the loop on step 4.  The division by $\beta^m$ is emulated by accessing
+the term at the $m+i$'th position which is subsequently multiplied by $k$ and added to the term at the $i$'th position.  After the loop the $m$'th
+digit is set to the carry and the upper digits are zeroed.  Steps 5 and 6 emulate the reduction modulo $\beta^m$ that should have happend to 
+$x$ before the addition of the multiple of the upper half.  
+
+At step 8 if $x$ is still larger than $n$ another pass of the algorithm is required.  First $n$ is subtracted from $x$ and then the algorithm resumes
+at step 3.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_dr\_reduce.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
+018    *
+019    * Based on algorithm from the paper
+020    *
+021    * "Generating Efficient Primes for Discrete Log Cryptosystems"
+022    *                 Chae Hoon Lim, Pil Joong Lee,
+023    *          POSTECH Information Research Laboratories
+024    *
+025    * The modulus must be of a special format [see manual]
+026    *
+027    * Has been modified to use algorithm 7.10 from the LTM book instead
+028    *
+029    * Input x must be in the range 0 <= x <= (n-1)**2
+030    */
+031   int
+032   mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
+033   \{
+034     int      err, i, m;
+035     mp_word  r;
+036     mp_digit mu, *tmpx1, *tmpx2;
+037   
+038     /* m = digits in modulus */
+039     m = n->used;
+040   
+041     /* ensure that "x" has at least 2m digits */
+042     if (x->alloc < m + m) \{
+043       if ((err = mp_grow (x, m + m)) != MP_OKAY) \{
+044         return err;
+045       \}
+046     \}
+047   
+048   /* top of loop, this is where the code resumes if
+049    * another reduction pass is required.
+050    */
+051   top:
+052     /* aliases for digits */
+053     /* alias for lower half of x */
+054     tmpx1 = x->dp;
+055   
+056     /* alias for upper half of x, or x/B**m */
+057     tmpx2 = x->dp + m;
+058   
+059     /* set carry to zero */
+060     mu = 0;
+061   
+062     /* compute (x mod B**m) + k * [x/B**m] inline and inplace */
+063     for (i = 0; i < m; i++) \{
+064         r         = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu;
+065         *tmpx1++  = (mp_digit)(r & MP_MASK);
+066         mu        = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
+067     \}
+068   
+069     /* set final carry */
+070     *tmpx1++ = mu;
+071   
+072     /* zero words above m */
+073     for (i = m + 1; i < x->used; i++) \{
+074         *tmpx1++ = 0;
+075     \}
+076   
+077     /* clamp, sub and return */
+078     mp_clamp (x);
+079   
+080     /* if x >= n then subtract and reduce again
+081      * Each successive "recursion" makes the input smaller and smaller.
+082      */
+083     if (mp_cmp_mag (x, n) != MP_LT) \{
+084       s_mp_sub(x, n, x);
+085       goto top;
+086     \}
+087     return MP_OKAY;
+088   \}
+089   #endif
+\end{alltt}
+\end{small}
+
+The first step is to grow $x$ as required to $2m$ digits since the reduction is performed in place on $x$.  The label on line 51 is where
+the algorithm will resume if further reduction passes are required.  In theory it could be placed at the top of the function however, the size of
+the modulus and question of whether $x$ is large enough are invariant after the first pass meaning that it would be a waste of time.  
+
+The aliases $tmpx1$ and $tmpx2$ refer to the digits of $x$ where the latter is offset by $m$ digits.  By reading digits from $x$ offset by $m$ digits
+a division by $\beta^m$ can be simulated virtually for free.  The loop on line 63 performs the bulk of the work (\textit{corresponds to step 4 of algorithm 7.11})
+in this algorithm.
+
+By line 70 the pointer $tmpx1$ points to the $m$'th digit of $x$ which is where the final carry will be placed.  Similarly by line 73 the 
+same pointer will point to the $m+1$'th digit where the zeroes will be placed.  
+
+Since the algorithm is only valid if both $x$ and $n$ are greater than zero an unsigned comparison suffices to determine if another pass is required.  
+With the same logic at line 84 the value of $x$ is known to be greater than or equal to $n$ meaning that an unsigned subtraction can be used
+as well.  Since the destination of the subtraction is the larger of the inputs the call to algorithm s\_mp\_sub cannot fail and the return code
+does not need to be checked.
+
+\subsubsection{Setup}
+To setup the restricted Diminished Radix algorithm the value $k = \beta - n_0$ is required.  This algorithm is not really complicated but provided for
+completeness.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_dr\_setup}. \\
+\textbf{Input}.   mp\_int $n$ \\
+\textbf{Output}.  $k = \beta - n_0$ \\
+\hline \\
+1.  $k \leftarrow \beta - n_0$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_dr\_setup}
+\end{figure}
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_dr\_setup.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* determines the setup value */
+018   void mp_dr_setup(mp_int *a, mp_digit *d)
+019   \{
+020      /* the casts are required if DIGIT_BIT is one less than
+021       * the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
+022       */
+023      *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) - 
+024           ((mp_word)a->dp[0]));
+025   \}
+026   
+027   #endif
+\end{alltt}
+\end{small}
+
+\subsubsection{Modulus Detection}
+Another algorithm which will be useful is the ability to detect a restricted Diminished Radix modulus.  An integer is said to be
+of restricted Diminished Radix form if all of the digits are equal to $\beta - 1$ except the trailing digit which may be any value.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_dr\_is\_modulus}. \\
+\textbf{Input}.   mp\_int $n$ \\
+\textbf{Output}.  $1$ if $n$ is in D.R form, $0$ otherwise \\
+\hline
+1.  If $n.used < 2$ then return($0$). \\
+2.  for $ix$ from $1$ to $n.used - 1$ do \\
+\hspace{3mm}2.1  If $n_{ix} \ne \beta - 1$ return($0$). \\
+3.  Return($1$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_dr\_is\_modulus}
+\end{figure}
+
+\textbf{Algorithm mp\_dr\_is\_modulus.}
+This algorithm determines if a value is in Diminished Radix form.  Step 1 rejects obvious cases where fewer than two digits are
+in the mp\_int.  Step 2 tests all but the first digit to see if they are equal to $\beta - 1$.  If the algorithm manages to get to
+step 3 then $n$ must be of Diminished Radix form.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_dr\_is\_modulus.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* determines if a number is a valid DR modulus */
+018   int mp_dr_is_modulus(mp_int *a)
+019   \{
+020      int ix;
+021   
+022      /* must be at least two digits */
+023      if (a->used < 2) \{
+024         return 0;
+025      \}
+026   
+027      /* must be of the form b**k - a [a <= b] so all
+028       * but the first digit must be equal to -1 (mod b).
+029       */
+030      for (ix = 1; ix < a->used; ix++) \{
+031          if (a->dp[ix] != MP_MASK) \{
+032             return 0;
+033          \}
+034      \}
+035      return 1;
+036   \}
+037   
+038   #endif
+\end{alltt}
+\end{small}
+
+\subsection{Unrestricted Diminished Radix Reduction}
+The unrestricted Diminished Radix algorithm allows modular reductions to be performed when the modulus is of the form $2^p - k$.  This algorithm
+is a straightforward adaptation of algorithm~\ref{fig:DR}.
+
+In general the restricted Diminished Radix reduction algorithm is much faster since it has considerably lower overhead.  However, this new
+algorithm is much faster than either Montgomery or Barrett reduction when the moduli are of the appropriate form.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce\_2k}. \\
+\textbf{Input}.   mp\_int $a$ and $n$.  mp\_digit $k$  \\
+\hspace{11.5mm}($a \ge 0$, $n > 1$, $0 < k < \beta$, $n + k$ is a power of two) \\
+\textbf{Output}.  $a \mbox{ (mod }n\mbox{)}$ \\
+\hline
+1.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
+2.  While $a \ge n$ do \\
+\hspace{3mm}2.1  $q \leftarrow \lfloor a / 2^p \rfloor$ (\textit{mp\_div\_2d}) \\
+\hspace{3mm}2.2  $a \leftarrow a \mbox{ (mod }2^p\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+\hspace{3mm}2.3  $q \leftarrow q \cdot k$ (\textit{mp\_mul\_d}) \\
+\hspace{3mm}2.4  $a \leftarrow a - q$ (\textit{s\_mp\_sub}) \\
+\hspace{3mm}2.5  If $a \ge n$ then do \\
+\hspace{6mm}2.5.1  $a \leftarrow a - n$ \\
+3.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce\_2k}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce\_2k.}
+This algorithm quickly reduces an input $a$ modulo an unrestricted Diminished Radix modulus $n$.  Division by $2^p$ is emulated with a right
+shift which makes the algorithm fairly inexpensive to use.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_2k.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* reduces a modulo n where n is of the form 2**p - d */
+018   int mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
+019   \{
+020      mp_int q;
+021      int    p, res;
+022      
+023      if ((res = mp_init(&q)) != MP_OKAY) \{
+024         return res;
+025      \}
+026      
+027      p = mp_count_bits(n);    
+028   top:
+029      /* q = a/2**p, a = a mod 2**p */
+030      if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) \{
+031         goto ERR;
+032      \}
+033      
+034      if (d != 1) \{
+035         /* q = q * d */
+036         if ((res = mp_mul_d(&q, d, &q)) != MP_OKAY) \{ 
+037            goto ERR;
+038         \}
+039      \}
+040      
+041      /* a = a + q */
+042      if ((res = s_mp_add(a, &q, a)) != MP_OKAY) \{
+043         goto ERR;
+044      \}
+045      
+046      if (mp_cmp_mag(a, n) != MP_LT) \{
+047         s_mp_sub(a, n, a);
+048         goto top;
+049      \}
+050      
+051   ERR:
+052      mp_clear(&q);
+053      return res;
+054   \}
+055   
+056   #endif
+\end{alltt}
+\end{small}
+
+The algorithm mp\_count\_bits calculates the number of bits in an mp\_int which is used to find the initial value of $p$.  The call to mp\_div\_2d
+on line 30 calculates both the quotient $q$ and the remainder $a$ required.  By doing both in a single function call the code size
+is kept fairly small.  The multiplication by $k$ is only performed if $k > 1$. This allows reductions modulo $2^p - 1$ to be performed without
+any multiplications.  
+
+The unsigned s\_mp\_add, mp\_cmp\_mag and s\_mp\_sub are used in place of their full sign counterparts since the inputs are only valid if they are 
+positive.  By using the unsigned versions the overhead is kept to a minimum.  
+
+\subsubsection{Unrestricted Setup}
+To setup this reduction algorithm the value of $k = 2^p - n$ is required.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce\_2k\_setup}. \\
+\textbf{Input}.   mp\_int $n$   \\
+\textbf{Output}.  $k = 2^p - n$ \\
+\hline
+1.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
+2.  $x \leftarrow 2^p$ (\textit{mp\_2expt}) \\
+3.  $x \leftarrow x - n$ (\textit{mp\_sub}) \\
+4.  $k \leftarrow x_0$ \\
+5.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce\_2k\_setup}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce\_2k\_setup.}
+This algorithm computes the value of $k$ required for the algorithm mp\_reduce\_2k.  By making a temporary variable $x$ equal to $2^p$ a subtraction
+is sufficient to solve for $k$.  Alternatively if $n$ has more than one digit the value of $k$ is simply $\beta - n_0$.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_2k\_setup.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* determines the setup value */
+018   int mp_reduce_2k_setup(mp_int *a, mp_digit *d)
+019   \{
+020      int res, p;
+021      mp_int tmp;
+022      
+023      if ((res = mp_init(&tmp)) != MP_OKAY) \{
+024         return res;
+025      \}
+026      
+027      p = mp_count_bits(a);
+028      if ((res = mp_2expt(&tmp, p)) != MP_OKAY) \{
+029         mp_clear(&tmp);
+030         return res;
+031      \}
+032      
+033      if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) \{
+034         mp_clear(&tmp);
+035         return res;
+036      \}
+037      
+038      *d = tmp.dp[0];
+039      mp_clear(&tmp);
+040      return MP_OKAY;
+041   \}
+042   #endif
+\end{alltt}
+\end{small}
+
+\subsubsection{Unrestricted Detection}
+An integer $n$ is a valid unrestricted Diminished Radix modulus if either of the following are true.
+
+\begin{enumerate}
+\item  The number has only one digit.
+\item  The number has more than one digit and every bit from the $\beta$'th to the most significant is one.
+\end{enumerate}
+
+If either condition is true than there is a power of two $2^p$ such that $0 < 2^p - n < \beta$.   If the input is only
+one digit than it will always be of the correct form.  Otherwise all of the bits above the first digit must be one.  This arises from the fact
+that there will be value of $k$ that when added to the modulus causes a carry in the first digit which propagates all the way to the most
+significant bit.  The resulting sum will be a power of two.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce\_is\_2k}. \\
+\textbf{Input}.   mp\_int $n$   \\
+\textbf{Output}.  $1$ if of proper form, $0$ otherwise \\
+\hline
+1.  If $n.used = 0$ then return($0$). \\
+2.  If $n.used = 1$ then return($1$). \\
+3.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
+4.  for $x$ from $lg(\beta)$ to $p$ do \\
+\hspace{3mm}4.1  If the ($x \mbox{ mod }lg(\beta)$)'th bit of the $\lfloor x / lg(\beta) \rfloor$ of $n$ is zero then return($0$). \\
+5.  Return($1$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce\_is\_2k}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce\_is\_2k.}
+This algorithm quickly determines if a modulus is of the form required for algorithm mp\_reduce\_2k to function properly.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_is\_2k.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* determines if mp_reduce_2k can be used */
+018   int mp_reduce_is_2k(mp_int *a)
+019   \{
+020      int ix, iy, iw;
+021      mp_digit iz;
+022      
+023      if (a->used == 0) \{
+024         return MP_NO;
+025      \} else if (a->used == 1) \{
+026         return MP_YES;
+027      \} else if (a->used > 1) \{
+028         iy = mp_count_bits(a);
+029         iz = 1;
+030         iw = 1;
+031       
+032         /* Test every bit from the second digit up, must be 1 */
+033         for (ix = DIGIT_BIT; ix < iy; ix++) \{
+034             if ((a->dp[iw] & iz) == 0) \{
+035                return MP_NO;
+036             \}
+037             iz <<= 1;
+038             if (iz > (mp_digit)MP_MASK) \{
+039                ++iw;
+040                iz = 1;
+041             \}
+042         \}
+043      \}
+044      return MP_YES;
+045   \}
+046   
+047   #endif
+\end{alltt}
+\end{small}
+
+
+
+\section{Algorithm Comparison}
+So far three very different algorithms for modular reduction have been discussed.  Each of the algorithms have their own strengths and weaknesses
+that makes having such a selection very useful.  The following table sumarizes the three algorithms along with comparisons of work factors.  Since
+all three algorithms have the restriction that $0 \le x < n^2$ and $n > 1$ those limitations are not included in the table.  
+
+\begin{center}
+\begin{small}
+\begin{tabular}{|c|c|c|c|c|c|}
+\hline \textbf{Method} & \textbf{Work Required} & \textbf{Limitations} & \textbf{$m = 8$} & \textbf{$m = 32$} & \textbf{$m = 64$} \\
+\hline Barrett    & $m^2 + 2m - 1$ & None              & $79$ & $1087$ & $4223$ \\
+\hline Montgomery & $m^2 + m$      & $n$ must be odd   & $72$ & $1056$ & $4160$ \\
+\hline D.R.       & $2m$           & $n = \beta^m - k$ & $16$ & $64$   & $128$  \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+
+In theory Montgomery and Barrett reductions would require roughly the same amount of time to complete.  However, in practice since Montgomery
+reduction can be written as a single function with the Comba technique it is much faster.  Barrett reduction suffers from the overhead of
+calling the half precision multipliers, addition and division by $\beta$ algorithms.
+
+For almost every cryptographic algorithm Montgomery reduction is the algorithm of choice.  The one set of algorithms where Diminished Radix reduction truly
+shines are based on the discrete logarithm problem such as Diffie-Hellman \cite{DH} and ElGamal \cite{ELGAMAL}.  In these algorithms
+primes of the form $\beta^m - k$ can be found and shared amongst users.  These primes will allow the Diminished Radix algorithm to be used in
+modular exponentiation to greatly speed up the operation.
+
+
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 3 \right ]$ & Prove that the ``trick'' in algorithm mp\_montgomery\_setup actually \\
+                     & calculates the correct value of $\rho$. \\
+                     & \\
+$\left [ 2 \right ]$ & Devise an algorithm to reduce modulo $n + k$ for small $k$ quickly.  \\
+                     & \\
+$\left [ 4 \right ]$ & Prove that the pseudo-code algorithm ``Diminished Radix Reduction'' \\
+                     & (\textit{figure~\ref{fig:DR}}) terminates.  Also prove the probability that it will \\
+                     & terminate within $1 \le k \le 10$ iterations. \\
+                     & \\
+\end{tabular}                     
+
+
+\chapter{Exponentiation}
+Exponentiation is the operation of raising one variable to the power of another, for example, $a^b$.  A variant of exponentiation, computed
+in a finite field or ring, is called modular exponentiation.  This latter style of operation is typically used in public key 
+cryptosystems such as RSA and Diffie-Hellman.  The ability to quickly compute modular exponentiations is of great benefit to any
+such cryptosystem and many methods have been sought to speed it up.
+
+\section{Exponentiation Basics}
+A trivial algorithm would simply multiply $a$ against itself $b - 1$ times to compute the exponentiation desired.  However, as $b$ grows in size
+the number of multiplications becomes prohibitive.  Imagine what would happen if $b$ $\approx$ $2^{1024}$ as is the case when computing an RSA signature
+with a $1024$-bit key.  Such a calculation could never be completed as it would take simply far too long.
+
+Fortunately there is a very simple algorithm based on the laws of exponents.  Recall that $lg_a(a^b) = b$ and that $lg_a(a^ba^c) = b + c$ which
+are two trivial relationships between the base and the exponent.  Let $b_i$ represent the $i$'th bit of $b$ starting from the least 
+significant bit.  If $b$ is a $k$-bit integer than the following equation is true.
+
+\begin{equation}
+a^b = \prod_{i=0}^{k-1} a^{2^i \cdot b_i}
+\end{equation}
+
+By taking the base $a$ logarithm of both sides of the equation the following equation is the result.
+
+\begin{equation}
+b = \sum_{i=0}^{k-1}2^i \cdot b_i
+\end{equation}
+
+The term $a^{2^i}$ can be found from the $i - 1$'th term by squaring the term since $\left ( a^{2^i} \right )^2$ is equal to
+$a^{2^{i+1}}$.  This observation forms the basis of essentially all fast exponentiation algorithms.  It requires $k$ squarings and on average
+$k \over 2$ multiplications to compute the result.  This is indeed quite an improvement over simply multiplying by $a$ a total of $b-1$ times.
+
+While this current method is a considerable speed up there are further improvements to be made.  For example, the $a^{2^i}$ term does not need to 
+be computed in an auxilary variable.  Consider the following equivalent algorithm.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Left to Right Exponentiation}. \\
+\textbf{Input}.   Integer $a$, $b$ and $k$ \\
+\textbf{Output}.  $c = a^b$ \\
+\hline \\
+1.  $c \leftarrow 1$ \\
+2.  for $i$ from $k - 1$ to $0$ do \\
+\hspace{3mm}2.1  $c \leftarrow c^2$ \\
+\hspace{3mm}2.2  $c \leftarrow c \cdot a^{b_i}$ \\
+3.  Return $c$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Left to Right Exponentiation}
+\label{fig:LTOR}
+\end{figure}
+
+This algorithm starts from the most significant bit and works towards the least significant bit.  When the $i$'th bit of $b$ is set $a$ is
+multiplied against the current product.  In each iteration the product is squared which doubles the exponent of the individual terms of the
+product.  
+
+For example, let $b = 101100_2 \equiv 44_{10}$.  The following chart demonstrates the actions of the algorithm.
+
+\newpage\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|}
+\hline \textbf{Value of $i$} & \textbf{Value of $c$} \\
+\hline - & $1$ \\
+\hline $5$ & $a$ \\
+\hline $4$ & $a^2$ \\
+\hline $3$ & $a^4 \cdot a$ \\
+\hline $2$ & $a^8 \cdot a^2 \cdot a$ \\
+\hline $1$ & $a^{16} \cdot a^4 \cdot a^2$ \\
+\hline $0$ & $a^{32} \cdot a^8 \cdot a^4$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Example of Left to Right Exponentiation}
+\end{figure}
+
+When the product $a^{32} \cdot a^8 \cdot a^4$ is simplified it is equal $a^{44}$ which is the desired exponentiation.  This particular algorithm is 
+called ``Left to Right'' because it reads the exponent in that order.  All of the exponentiation algorithms that will be presented are of this nature.  
+
+\subsection{Single Digit Exponentiation}
+The first algorithm in the series of exponentiation algorithms will be an unbounded algorithm where the exponent is a single digit.  It is intended 
+to be used when a small power of an input is required (\textit{e.g. $a^5$}).  It is faster than simply multiplying $b - 1$ times for all values of 
+$b$ that are greater than three.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_expt\_d}. \\
+\textbf{Input}.   mp\_int $a$ and mp\_digit $b$ \\
+\textbf{Output}.  $c = a^b$ \\
+\hline \\
+1.  $g \leftarrow a$ (\textit{mp\_init\_copy}) \\
+2.  $c \leftarrow 1$ (\textit{mp\_set}) \\
+3.  for $x$ from 1 to $lg(\beta)$ do \\
+\hspace{3mm}3.1  $c \leftarrow c^2$ (\textit{mp\_sqr}) \\
+\hspace{3mm}3.2  If $b$ AND $2^{lg(\beta) - 1} \ne 0$ then \\
+\hspace{6mm}3.2.1  $c \leftarrow c \cdot g$ (\textit{mp\_mul}) \\
+\hspace{3mm}3.3  $b \leftarrow b << 1$ \\
+4.  Clear $g$. \\
+5.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_expt\_d}
+\end{figure}
+
+\textbf{Algorithm mp\_expt\_d.}
+This algorithm computes the value of $a$ raised to the power of a single digit $b$.  It uses the left to right exponentiation algorithm to
+quickly compute the exponentiation.  It is loosely based on algorithm 14.79 of HAC \cite[pp. 615]{HAC} with the difference that the 
+exponent is a fixed width.  
+
+A copy of $a$ is made first to allow destination variable $c$ be the same as the source variable $a$.  The result is set to the initial value of 
+$1$ in the subsequent step.
+
+Inside the loop the exponent is read from the most significant bit first down to the least significant bit.  First $c$ is invariably squared
+on step 3.1.  In the following step if the most significant bit of $b$ is one the copy of $a$ is multiplied against $c$.  The value
+of $b$ is shifted left one bit to make the next bit down from the most signficant bit the new most significant bit.  In effect each
+iteration of the loop moves the bits of the exponent $b$ upwards to the most significant location.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_expt\_d.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* calculate c = a**b  using a square-multiply algorithm */
+018   int mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
+019   \{
+020     int     res, x;
+021     mp_int  g;
+022   
+023     if ((res = mp_init_copy (&g, a)) != MP_OKAY) \{
+024       return res;
+025     \}
+026   
+027     /* set initial result */
+028     mp_set (c, 1);
+029   
+030     for (x = 0; x < (int) DIGIT_BIT; x++) \{
+031       /* square */
+032       if ((res = mp_sqr (c, c)) != MP_OKAY) \{
+033         mp_clear (&g);
+034         return res;
+035       \}
+036   
+037       /* if the bit is set multiply */
+038       if ((b & (mp_digit) (((mp_digit)1) << (DIGIT_BIT - 1))) != 0) \{
+039         if ((res = mp_mul (c, &g, c)) != MP_OKAY) \{
+040            mp_clear (&g);
+041            return res;
+042         \}
+043       \}
+044   
+045       /* shift to next bit */
+046       b <<= 1;
+047     \}
+048   
+049     mp_clear (&g);
+050     return MP_OKAY;
+051   \}
+052   #endif
+\end{alltt}
+\end{small}
+
+Line 28 sets the initial value of the result to $1$.  Next the loop on line 30 steps through each bit of the exponent starting from
+the most significant down towards the least significant. The invariant squaring operation placed on line 32 is performed first.  After 
+the squaring the result $c$ is multiplied by the base $g$ if and only if the most significant bit of the exponent is set.  The shift on line
+46 moves all of the bits of the exponent upwards towards the most significant location.  
+
+\section{$k$-ary Exponentiation}
+When calculating an exponentiation the most time consuming bottleneck is the multiplications which are in general a small factor
+slower than squaring.  Recall from the previous algorithm that $b_{i}$ refers to the $i$'th bit of the exponent $b$.  Suppose instead it referred to
+the $i$'th $k$-bit digit of the exponent of $b$.  For $k = 1$ the definitions are synonymous and for $k > 1$ algorithm~\ref{fig:KARY}
+computes the same exponentiation.  A group of $k$ bits from the exponent is called a \textit{window}.  That is it is a small window on only a
+portion of the entire exponent.  Consider the following modification to the basic left to right exponentiation algorithm.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{$k$-ary Exponentiation}. \\
+\textbf{Input}.   Integer $a$, $b$, $k$ and $t$ \\
+\textbf{Output}.  $c = a^b$ \\
+\hline \\
+1.  $c \leftarrow 1$ \\
+2.  for $i$ from $t - 1$ to $0$ do \\
+\hspace{3mm}2.1  $c \leftarrow c^{2^k} $ \\
+\hspace{3mm}2.2  Extract the $i$'th $k$-bit word from $b$ and store it in $g$. \\
+\hspace{3mm}2.3  $c \leftarrow c \cdot a^g$ \\
+3.  Return $c$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{$k$-ary Exponentiation}
+\label{fig:KARY}
+\end{figure}
+
+The squaring on step 2.1 can be calculated by squaring the value $c$ successively $k$ times.  If the values of $a^g$ for $0 < g < 2^k$ have been
+precomputed this algorithm requires only $t$ multiplications and $tk$ squarings.  The table can be generated with $2^{k - 1} - 1$ squarings and
+$2^{k - 1} + 1$ multiplications.  This algorithm assumes that the number of bits in the exponent is evenly divisible by $k$.  
+However, when it is not the remaining $0 < x \le k - 1$ bits can be handled with algorithm~\ref{fig:LTOR}.
+
+Suppose $k = 4$ and $t = 100$.  This modified algorithm will require $109$ multiplications and $408$ squarings to compute the exponentiation.  The
+original algorithm would on average have required $200$ multiplications and $400$ squrings to compute the same value.  The total number of squarings
+has increased slightly but the number of multiplications has nearly halved.
+
+\subsection{Optimal Values of $k$}
+An optimal value of $k$ will minimize $2^{k} + \lceil n / k \rceil + n - 1$ for a fixed number of bits in the exponent $n$.  The simplest
+approach is to brute force search amongst the values $k = 2, 3, \ldots, 8$ for the lowest result.  Table~\ref{fig:OPTK} lists optimal values of $k$
+for various exponent sizes and compares the number of multiplication and squarings required against algorithm~\ref{fig:LTOR}.  
+
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+\begin{tabular}{|c|c|c|c|c|c|}
+\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with ~\ref{fig:LTOR}} \\
+\hline $16$ & $2$ & $27$ & $24$ \\
+\hline $32$ & $3$ & $49$ & $48$ \\
+\hline $64$ & $3$ & $92$ & $96$ \\
+\hline $128$ & $4$ & $175$ & $192$ \\
+\hline $256$ & $4$ & $335$ & $384$ \\
+\hline $512$ & $5$ & $645$ & $768$ \\
+\hline $1024$ & $6$ & $1257$ & $1536$ \\
+\hline $2048$ & $6$ & $2452$ & $3072$ \\
+\hline $4096$ & $7$ & $4808$ & $6144$ \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Optimal Values of $k$ for $k$-ary Exponentiation}
+\label{fig:OPTK}
+\end{figure}
+
+\subsection{Sliding-Window Exponentiation}
+A simple modification to the previous algorithm is only generate the upper half of the table in the range $2^{k-1} \le g < 2^k$.  Essentially
+this is a table for all values of $g$ where the most significant bit of $g$ is a one.  However, in order for this to be allowed in the 
+algorithm values of $g$ in the range $0 \le g < 2^{k-1}$ must be avoided.  
+
+Table~\ref{fig:OPTK2} lists optimal values of $k$ for various exponent sizes and compares the work required against algorithm~\ref{fig:KARY}.  
+
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+\begin{tabular}{|c|c|c|c|c|c|}
+\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with ~\ref{fig:KARY}} \\
+\hline $16$ & $3$ & $24$ & $27$ \\
+\hline $32$ & $3$ & $45$ & $49$ \\
+\hline $64$ & $4$ & $87$ & $92$ \\
+\hline $128$ & $4$ & $167$ & $175$ \\
+\hline $256$ & $5$ & $322$ & $335$ \\
+\hline $512$ & $6$ & $628$ & $645$ \\
+\hline $1024$ & $6$ & $1225$ & $1257$ \\
+\hline $2048$ & $7$ & $2403$ & $2452$ \\
+\hline $4096$ & $8$ & $4735$ & $4808$ \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Optimal Values of $k$ for Sliding Window Exponentiation}
+\label{fig:OPTK2}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Sliding Window $k$-ary Exponentiation}. \\
+\textbf{Input}.   Integer $a$, $b$, $k$ and $t$ \\
+\textbf{Output}.  $c = a^b$ \\
+\hline \\
+1.  $c \leftarrow 1$ \\
+2.  for $i$ from $t - 1$ to $0$ do \\
+\hspace{3mm}2.1  If the $i$'th bit of $b$ is a zero then \\
+\hspace{6mm}2.1.1   $c \leftarrow c^2$ \\
+\hspace{3mm}2.2  else do \\
+\hspace{6mm}2.2.1  $c \leftarrow c^{2^k}$ \\
+\hspace{6mm}2.2.2  Extract the $k$ bits from $(b_{i}b_{i-1}\ldots b_{i-(k-1)})$ and store it in $g$. \\
+\hspace{6mm}2.2.3  $c \leftarrow c \cdot a^g$ \\
+\hspace{6mm}2.2.4  $i \leftarrow i - k$ \\
+3.  Return $c$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Sliding Window $k$-ary Exponentiation}
+\end{figure}
+
+Similar to the previous algorithm this algorithm must have a special handler when fewer than $k$ bits are left in the exponent.  While this
+algorithm requires the same number of squarings it can potentially have fewer multiplications.  The pre-computed table $a^g$ is also half
+the size as the previous table.  
+
+Consider the exponent $b = 111101011001000_2 \equiv 31432_{10}$ with $k = 3$ using both algorithms.  The first algorithm will divide the exponent up as 
+the following five $3$-bit words $b \equiv \left ( 111, 101, 011, 001, 000 \right )_{2}$.  The second algorithm will break the 
+exponent as $b \equiv \left ( 111, 101, 0, 110, 0, 100, 0 \right )_{2}$.  The single digit $0$ in the second representation are where
+a single squaring took place instead of a squaring and multiplication.  In total the first method requires $10$ multiplications and $18$ 
+squarings.  The second method requires $8$ multiplications and $18$ squarings.  
+
+In general the sliding window method is never slower than the generic $k$-ary method and often it is slightly faster.  
+
+\section{Modular Exponentiation}
+
+Modular exponentiation is essentially computing the power of a base within a finite field or ring.  For example, computing 
+$d \equiv a^b \mbox{ (mod }c\mbox{)}$ is a modular exponentiation.  Instead of first computing $a^b$ and then reducing it 
+modulo $c$ the intermediate result is reduced modulo $c$ after every squaring or multiplication operation.  
+
+This guarantees that any intermediate result is bounded by $0 \le d \le c^2 - 2c + 1$ and can be reduced modulo $c$ quickly using
+one of the algorithms presented in chapter six.  
+
+Before the actual modular exponentiation algorithm can be written a wrapper algorithm must be written first.  This algorithm
+will allow the exponent $b$ to be negative which is computed as $c \equiv \left (1 / a \right )^{\vert b \vert} \mbox{(mod }d\mbox{)}$. The
+value of $(1/a) \mbox{ mod }c$ is computed using the modular inverse (\textit{see \ref{sec;modinv}}).  If no inverse exists the algorithm
+terminates with an error.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_exptmod}. \\
+\textbf{Input}.   mp\_int $a$, $b$ and $c$ \\
+\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
+\hline \\
+1.  If $c.sign = MP\_NEG$ return(\textit{MP\_VAL}). \\
+2.  If $b.sign = MP\_NEG$ then \\
+\hspace{3mm}2.1  $g' \leftarrow g^{-1} \mbox{ (mod }c\mbox{)}$ \\
+\hspace{3mm}2.2  $x' \leftarrow \vert x \vert$ \\
+\hspace{3mm}2.3  Compute $d \equiv g'^{x'} \mbox{ (mod }c\mbox{)}$ via recursion. \\
+3.  if $p$ is odd \textbf{OR} $p$ is a D.R. modulus then \\
+\hspace{3mm}3.1  Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm mp\_exptmod\_fast. \\
+4.  else \\
+\hspace{3mm}4.1  Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm s\_mp\_exptmod. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_exptmod}
+\end{figure}
+
+\textbf{Algorithm mp\_exptmod.}
+The first algorithm which actually performs modular exponentiation is algorithm s\_mp\_exptmod.  It is a sliding window $k$-ary algorithm 
+which uses Barrett reduction to reduce the product modulo $p$.  The second algorithm mp\_exptmod\_fast performs the same operation 
+except it uses either Montgomery or Diminished Radix reduction.  The two latter reduction algorithms are clumped in the same exponentiation
+algorithm since their arguments are essentially the same (\textit{two mp\_ints and one mp\_digit}).  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_exptmod.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   
+018   /* this is a shell function that calls either the normal or Montgomery
+019    * exptmod functions.  Originally the call to the montgomery code was
+020    * embedded in the normal function but that wasted alot of stack space
+021    * for nothing (since 99% of the time the Montgomery code would be called)
+022    */
+023   int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+024   \{
+025     int dr;
+026   
+027     /* modulus P must be positive */
+028     if (P->sign == MP_NEG) \{
+029        return MP_VAL;
+030     \}
+031   
+032     /* if exponent X is negative we have to recurse */
+033     if (X->sign == MP_NEG) \{
+034   #ifdef BN_MP_INVMOD_C
+035        mp_int tmpG, tmpX;
+036        int err;
+037   
+038        /* first compute 1/G mod P */
+039        if ((err = mp_init(&tmpG)) != MP_OKAY) \{
+040           return err;
+041        \}
+042        if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) \{
+043           mp_clear(&tmpG);
+044           return err;
+045        \}
+046   
+047        /* now get |X| */
+048        if ((err = mp_init(&tmpX)) != MP_OKAY) \{
+049           mp_clear(&tmpG);
+050           return err;
+051        \}
+052        if ((err = mp_abs(X, &tmpX)) != MP_OKAY) \{
+053           mp_clear_multi(&tmpG, &tmpX, NULL);
+054           return err;
+055        \}
+056   
+057        /* and now compute (1/G)**|X| instead of G**X [X < 0] */
+058        err = mp_exptmod(&tmpG, &tmpX, P, Y);
+059        mp_clear_multi(&tmpG, &tmpX, NULL);
+060        return err;
+061   #else 
+062        /* no invmod */
+063        return MP_VAL;
+064   #endif
+065     \}
+066   
+067   /* modified diminished radix reduction */
+068   #if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C)
+069     if (mp_reduce_is_2k_l(P) == MP_YES) \{
+070        return s_mp_exptmod(G, X, P, Y, 1);
+071     \}
+072   #endif
+073   
+074   #ifdef BN_MP_DR_IS_MODULUS_C
+075     /* is it a DR modulus? */
+076     dr = mp_dr_is_modulus(P);
+077   #else
+078     /* default to no */
+079     dr = 0;
+080   #endif
+081   
+082   #ifdef BN_MP_REDUCE_IS_2K_C
+083     /* if not, is it a unrestricted DR modulus? */
+084     if (dr == 0) \{
+085        dr = mp_reduce_is_2k(P) << 1;
+086     \}
+087   #endif
+088       
+089     /* if the modulus is odd or dr != 0 use the montgomery method */
+090   #ifdef BN_MP_EXPTMOD_FAST_C
+091     if (mp_isodd (P) == 1 || dr !=  0) \{
+092       return mp_exptmod_fast (G, X, P, Y, dr);
+093     \} else \{
+094   #endif
+095   #ifdef BN_S_MP_EXPTMOD_C
+096       /* otherwise use the generic Barrett reduction technique */
+097       return s_mp_exptmod (G, X, P, Y, 0);
+098   #else
+099       /* no exptmod for evens */
+100       return MP_VAL;
+101   #endif
+102   #ifdef BN_MP_EXPTMOD_FAST_C
+103     \}
+104   #endif
+105   \}
+106   
+107   #endif
+\end{alltt}
+\end{small}
+
+In order to keep the algorithms in a known state the first step on line 28 is to reject any negative modulus as input.  If the exponent is
+negative the algorithm tries to perform a modular exponentiation with the modular inverse of the base $G$.  The temporary variable $tmpG$ is assigned
+the modular inverse of $G$ and $tmpX$ is assigned the absolute value of $X$.  The algorithm will recuse with these new values with a positive
+exponent.
+
+If the exponent is positive the algorithm resumes the exponentiation.  Line 76 determines if the modulus is of the restricted Diminished Radix 
+form.  If it is not line 69 attempts to determine if it is of a unrestricted Diminished Radix form.  The integer $dr$ will take on one
+of three values.
+
+\begin{enumerate}
+\item $dr = 0$ means that the modulus is not of either restricted or unrestricted Diminished Radix form.
+\item $dr = 1$ means that the modulus is of restricted Diminished Radix form.
+\item $dr = 2$ means that the modulus is of unrestricted Diminished Radix form.
+\end{enumerate}
+
+Line 69 determines if the fast modular exponentiation algorithm can be used.  It is allowed if $dr \ne 0$ or if the modulus is odd.  Otherwise,
+the slower s\_mp\_exptmod algorithm is used which uses Barrett reduction.  
+
+\subsection{Barrett Modular Exponentiation}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_exptmod}. \\
+\textbf{Input}.   mp\_int $a$, $b$ and $c$ \\
+\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
+\hline \\
+1.  $k \leftarrow lg(x)$ \\
+2.  $winsize \leftarrow  \left \lbrace \begin{array}{ll}
+                              2 &  \mbox{if }k \le 7 \\
+                              3 &  \mbox{if }7 < k \le 36 \\
+                              4 &  \mbox{if }36 < k \le 140 \\
+                              5 &  \mbox{if }140 < k \le 450 \\
+                              6 &  \mbox{if }450 < k \le 1303 \\
+                              7 &  \mbox{if }1303 < k \le 3529 \\
+                              8 &  \mbox{if }3529 < k \\
+                              \end{array} \right .$ \\
+3.  Initialize $2^{winsize}$ mp\_ints in an array named $M$ and one mp\_int named $\mu$ \\
+4.  Calculate the $\mu$ required for Barrett Reduction (\textit{mp\_reduce\_setup}). \\
+5.  $M_1 \leftarrow g \mbox{ (mod }p\mbox{)}$ \\
+\\
+Setup the table of small powers of $g$.  First find $g^{2^{winsize}}$ and then all multiples of it. \\
+6.  $k \leftarrow 2^{winsize - 1}$ \\
+7.  $M_{k} \leftarrow M_1$ \\
+8.  for $ix$ from 0 to $winsize - 2$ do \\
+\hspace{3mm}8.1  $M_k \leftarrow \left ( M_k \right )^2$ (\textit{mp\_sqr})  \\
+\hspace{3mm}8.2  $M_k \leftarrow M_k \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\
+9.  for $ix$ from $2^{winsize - 1} + 1$ to $2^{winsize} - 1$ do \\
+\hspace{3mm}9.1  $M_{ix} \leftarrow M_{ix - 1} \cdot M_{1}$ (\textit{mp\_mul}) \\
+\hspace{3mm}9.2  $M_{ix} \leftarrow M_{ix} \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\
+10.  $res \leftarrow 1$ \\
+\\
+Start Sliding Window. \\
+11.  $mode \leftarrow 0, bitcnt \leftarrow 1, buf \leftarrow 0, digidx \leftarrow x.used - 1, bitcpy \leftarrow 0, bitbuf \leftarrow 0$ \\
+12.  Loop \\
+\hspace{3mm}12.1  $bitcnt \leftarrow bitcnt - 1$ \\
+\hspace{3mm}12.2  If $bitcnt = 0$ then do \\
+\hspace{6mm}12.2.1  If $digidx = -1$ goto step 13. \\
+\hspace{6mm}12.2.2  $buf \leftarrow x_{digidx}$ \\
+\hspace{6mm}12.2.3  $digidx \leftarrow digidx - 1$ \\
+\hspace{6mm}12.2.4  $bitcnt \leftarrow lg(\beta)$ \\
+Continued on next page. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm s\_mp\_exptmod}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_exptmod} (\textit{continued}). \\
+\textbf{Input}.   mp\_int $a$, $b$ and $c$ \\
+\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
+\hline \\
+\hspace{3mm}12.3  $y \leftarrow (buf >> (lg(\beta) - 1))$ AND $1$ \\
+\hspace{3mm}12.4  $buf \leftarrow buf << 1$ \\
+\hspace{3mm}12.5  if $mode = 0$ and $y = 0$ then goto step 12. \\
+\hspace{3mm}12.6  if $mode = 1$ and $y = 0$ then do \\
+\hspace{6mm}12.6.1  $res \leftarrow res^2$ \\
+\hspace{6mm}12.6.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+\hspace{6mm}12.6.3  Goto step 12. \\
+\hspace{3mm}12.7  $bitcpy \leftarrow bitcpy + 1$ \\
+\hspace{3mm}12.8  $bitbuf \leftarrow bitbuf + (y << (winsize - bitcpy))$ \\
+\hspace{3mm}12.9  $mode \leftarrow 2$ \\
+\hspace{3mm}12.10  If $bitcpy = winsize$ then do \\
+\hspace{6mm}Window is full so perform the squarings and single multiplication. \\
+\hspace{6mm}12.10.1  for $ix$ from $0$ to $winsize -1$ do \\
+\hspace{9mm}12.10.1.1  $res \leftarrow res^2$ \\
+\hspace{9mm}12.10.1.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+\hspace{6mm}12.10.2  $res \leftarrow res \cdot M_{bitbuf}$ \\
+\hspace{6mm}12.10.3  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+\hspace{6mm}Reset the window. \\
+\hspace{6mm}12.10.4  $bitcpy \leftarrow 0, bitbuf \leftarrow 0, mode \leftarrow 1$ \\
+\\
+No more windows left.  Check for residual bits of exponent. \\
+13.  If $mode = 2$ and $bitcpy > 0$ then do \\
+\hspace{3mm}13.1  for $ix$ form $0$ to $bitcpy - 1$ do \\
+\hspace{6mm}13.1.1  $res \leftarrow res^2$ \\
+\hspace{6mm}13.1.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+\hspace{6mm}13.1.3  $bitbuf \leftarrow bitbuf << 1$ \\
+\hspace{6mm}13.1.4  If $bitbuf$ AND $2^{winsize} \ne 0$ then do \\
+\hspace{9mm}13.1.4.1  $res \leftarrow res \cdot M_{1}$ \\
+\hspace{9mm}13.1.4.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+14.  $y \leftarrow res$ \\
+15.  Clear $res$, $mu$ and the $M$ array. \\
+16.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm s\_mp\_exptmod (continued)}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_exptmod.}
+This algorithm computes the $x$'th power of $g$ modulo $p$ and stores the result in $y$.  It takes advantage of the Barrett reduction
+algorithm to keep the product small throughout the algorithm.
+
+The first two steps determine the optimal window size based on the number of bits in the exponent.  The larger the exponent the 
+larger the window size becomes.  After a window size $winsize$ has been chosen an array of $2^{winsize}$ mp\_int variables is allocated.  This
+table will hold the values of $g^x \mbox{ (mod }p\mbox{)}$ for $2^{winsize - 1} \le x < 2^{winsize}$.  
+
+After the table is allocated the first power of $g$ is found.  Since $g \ge p$ is allowed it must be first reduced modulo $p$ to make
+the rest of the algorithm more efficient.  The first element of the table at $2^{winsize - 1}$ is found by squaring $M_1$ successively $winsize - 2$
+times.  The rest of the table elements are found by multiplying the previous element by $M_1$ modulo $p$.
+
+Now that the table is available the sliding window may begin.  The following list describes the functions of all the variables in the window.
+\begin{enumerate}
+\item The variable $mode$ dictates how the bits of the exponent are interpreted.  
+\begin{enumerate}
+   \item When $mode = 0$ the bits are ignored since no non-zero bit of the exponent has been seen yet.  For example, if the exponent were simply 
+         $1$ then there would be $lg(\beta) - 1$ zero bits before the first non-zero bit.  In this case bits are ignored until a non-zero bit is found.  
+   \item When $mode = 1$ a non-zero bit has been seen before and a new $winsize$-bit window has not been formed yet.  In this mode leading $0$ bits 
+         are read and a single squaring is performed.  If a non-zero bit is read a new window is created.  
+   \item When $mode = 2$ the algorithm is in the middle of forming a window and new bits are appended to the window from the most significant bit
+         downwards.
+\end{enumerate}
+\item The variable $bitcnt$ indicates how many bits are left in the current digit of the exponent left to be read.  When it reaches zero a new digit
+      is fetched from the exponent.
+\item The variable $buf$ holds the currently read digit of the exponent. 
+\item The variable $digidx$ is an index into the exponents digits.  It starts at the leading digit $x.used - 1$ and moves towards the trailing digit.
+\item The variable $bitcpy$ indicates how many bits are in the currently formed window.  When it reaches $winsize$ the window is flushed and
+      the appropriate operations performed.
+\item The variable $bitbuf$ holds the current bits of the window being formed.  
+\end{enumerate}
+
+All of step 12 is the window processing loop.  It will iterate while there are digits available form the exponent to read.  The first step
+inside this loop is to extract a new digit if no more bits are available in the current digit.  If there are no bits left a new digit is
+read and if there are no digits left than the loop terminates.  
+
+After a digit is made available step 12.3 will extract the most significant bit of the current digit and move all other bits in the digit
+upwards.  In effect the digit is read from most significant bit to least significant bit and since the digits are read from leading to 
+trailing edges the entire exponent is read from most significant bit to least significant bit.
+
+At step 12.5 if the $mode$ and currently extracted bit $y$ are both zero the bit is ignored and the next bit is read.  This prevents the 
+algorithm from having to perform trivial squaring and reduction operations before the first non-zero bit is read.  Step 12.6 and 12.7-10 handle
+the two cases of $mode = 1$ and $mode = 2$ respectively.  
+
+\begin{center}
+\begin{figure}[here]
+\includegraphics{pics/expt_state.ps}
+\caption{Sliding Window State Diagram}
+\label{pic:expt_state}
+\end{figure}
+\end{center}
+
+By step 13 there are no more digits left in the exponent.  However, there may be partial bits in the window left.  If $mode = 2$ then 
+a Left-to-Right algorithm is used to process the remaining few bits.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_exptmod.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   #ifdef MP_LOW_MEM
+018      #define TAB_SIZE 32
+019   #else
+020      #define TAB_SIZE 256
+021   #endif
+022   
+023   int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmod
+      e)
+024   \{
+025     mp_int  M[TAB_SIZE], res, mu;
+026     mp_digit buf;
+027     int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+028     int (*redux)(mp_int*,mp_int*,mp_int*);
+029   
+030     /* find window size */
+031     x = mp_count_bits (X);
+032     if (x <= 7) \{
+033       winsize = 2;
+034     \} else if (x <= 36) \{
+035       winsize = 3;
+036     \} else if (x <= 140) \{
+037       winsize = 4;
+038     \} else if (x <= 450) \{
+039       winsize = 5;
+040     \} else if (x <= 1303) \{
+041       winsize = 6;
+042     \} else if (x <= 3529) \{
+043       winsize = 7;
+044     \} else \{
+045       winsize = 8;
+046     \}
+047   
+048   #ifdef MP_LOW_MEM
+049       if (winsize > 5) \{
+050          winsize = 5;
+051       \}
+052   #endif
+053   
+054     /* init M array */
+055     /* init first cell */
+056     if ((err = mp_init(&M[1])) != MP_OKAY) \{
+057        return err; 
+058     \}
+059   
+060     /* now init the second half of the array */
+061     for (x = 1<<(winsize-1); x < (1 << winsize); x++) \{
+062       if ((err = mp_init(&M[x])) != MP_OKAY) \{
+063         for (y = 1<<(winsize-1); y < x; y++) \{
+064           mp_clear (&M[y]);
+065         \}
+066         mp_clear(&M[1]);
+067         return err;
+068       \}
+069     \}
+070   
+071     /* create mu, used for Barrett reduction */
+072     if ((err = mp_init (&mu)) != MP_OKAY) \{
+073       goto LBL_M;
+074     \}
+075     
+076     if (redmode == 0) \{
+077        if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) \{
+078           goto LBL_MU;
+079        \}
+080        redux = mp_reduce;
+081     \} else \{
+082        if ((err = mp_reduce_2k_setup_l (P, &mu)) != MP_OKAY) \{
+083           goto LBL_MU;
+084        \}
+085        redux = mp_reduce_2k_l;
+086     \}    
+087   
+088     /* create M table
+089      *
+090      * The M table contains powers of the base, 
+091      * e.g. M[x] = G**x mod P
+092      *
+093      * The first half of the table is not 
+094      * computed though accept for M[0] and M[1]
+095      */
+096     if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) \{
+097       goto LBL_MU;
+098     \}
+099   
+100     /* compute the value at M[1<<(winsize-1)] by squaring 
+101      * M[1] (winsize-1) times 
+102      */
+103     if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) \{
+104       goto LBL_MU;
+105     \}
+106   
+107     for (x = 0; x < (winsize - 1); x++) \{
+108       /* square it */
+109       if ((err = mp_sqr (&M[1 << (winsize - 1)], 
+110                          &M[1 << (winsize - 1)])) != MP_OKAY) \{
+111         goto LBL_MU;
+112       \}
+113   
+114       /* reduce modulo P */
+115       if ((err = redux (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) \{
+116         goto LBL_MU;
+117       \}
+118     \}
+119   
+120     /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
+121      * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
+122      */
+123     for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) \{
+124       if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) \{
+125         goto LBL_MU;
+126       \}
+127       if ((err = redux (&M[x], P, &mu)) != MP_OKAY) \{
+128         goto LBL_MU;
+129       \}
+130     \}
+131   
+132     /* setup result */
+133     if ((err = mp_init (&res)) != MP_OKAY) \{
+134       goto LBL_MU;
+135     \}
+136     mp_set (&res, 1);
+137   
+138     /* set initial mode and bit cnt */
+139     mode   = 0;
+140     bitcnt = 1;
+141     buf    = 0;
+142     digidx = X->used - 1;
+143     bitcpy = 0;
+144     bitbuf = 0;
+145   
+146     for (;;) \{
+147       /* grab next digit as required */
+148       if (--bitcnt == 0) \{
+149         /* if digidx == -1 we are out of digits */
+150         if (digidx == -1) \{
+151           break;
+152         \}
+153         /* read next digit and reset the bitcnt */
+154         buf    = X->dp[digidx--];
+155         bitcnt = (int) DIGIT_BIT;
+156       \}
+157   
+158       /* grab the next msb from the exponent */
+159       y     = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
+160       buf <<= (mp_digit)1;
+161   
+162       /* if the bit is zero and mode == 0 then we ignore it
+163        * These represent the leading zero bits before the first 1 bit
+164        * in the exponent.  Technically this opt is not required but it
+165        * does lower the # of trivial squaring/reductions used
+166        */
+167       if (mode == 0 && y == 0) \{
+168         continue;
+169       \}
+170   
+171       /* if the bit is zero and mode == 1 then we square */
+172       if (mode == 1 && y == 0) \{
+173         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
+174           goto LBL_RES;
+175         \}
+176         if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
+177           goto LBL_RES;
+178         \}
+179         continue;
+180       \}
+181   
+182       /* else we add it to the window */
+183       bitbuf |= (y << (winsize - ++bitcpy));
+184       mode    = 2;
+185   
+186       if (bitcpy == winsize) \{
+187         /* ok window is filled so square as required and multiply  */
+188         /* square first */
+189         for (x = 0; x < winsize; x++) \{
+190           if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
+191             goto LBL_RES;
+192           \}
+193           if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
+194             goto LBL_RES;
+195           \}
+196         \}
+197   
+198         /* then multiply */
+199         if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) \{
+200           goto LBL_RES;
+201         \}
+202         if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
+203           goto LBL_RES;
+204         \}
+205   
+206         /* empty window and reset */
+207         bitcpy = 0;
+208         bitbuf = 0;
+209         mode   = 1;
+210       \}
+211     \}
+212   
+213     /* if bits remain then square/multiply */
+214     if (mode == 2 && bitcpy > 0) \{
+215       /* square then multiply if the bit is set */
+216       for (x = 0; x < bitcpy; x++) \{
+217         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
+218           goto LBL_RES;
+219         \}
+220         if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
+221           goto LBL_RES;
+222         \}
+223   
+224         bitbuf <<= 1;
+225         if ((bitbuf & (1 << winsize)) != 0) \{
+226           /* then multiply */
+227           if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) \{
+228             goto LBL_RES;
+229           \}
+230           if ((err = redux (&res, P, &mu)) != MP_OKAY) \{
+231             goto LBL_RES;
+232           \}
+233         \}
+234       \}
+235     \}
+236   
+237     mp_exch (&res, Y);
+238     err = MP_OKAY;
+239   LBL_RES:mp_clear (&res);
+240   LBL_MU:mp_clear (&mu);
+241   LBL_M:
+242     mp_clear(&M[1]);
+243     for (x = 1<<(winsize-1); x < (1 << winsize); x++) \{
+244       mp_clear (&M[x]);
+245     \}
+246     return err;
+247   \}
+248   #endif
+\end{alltt}
+\end{small}
+
+Lines 21 through 40 determine the optimal window size based on the length of the exponent in bits.  The window divisions are sorted
+from smallest to greatest so that in each \textbf{if} statement only one condition must be tested.  For example, by the \textbf{if} statement 
+on line 32 the value of $x$ is already known to be greater than $140$.  
+
+The conditional piece of code beginning on line 48 allows the window size to be restricted to five bits.  This logic is used to ensure
+the table of precomputed powers of $G$ remains relatively small.  
+
+The for loop on line 61 initializes the $M$ array while lines 62 and 77 compute the value of $\mu$ required for
+Barrett reduction.  
+
+-- More later.
+
+\section{Quick Power of Two}
+Calculating $b = 2^a$ can be performed much quicker than with any of the previous algorithms.  Recall that a logical shift left $m << k$ is
+equivalent to $m \cdot 2^k$.  By this logic when $m = 1$ a quick power of two can be achieved.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_2expt}. \\
+\textbf{Input}.   integer $b$ \\
+\textbf{Output}.  $a \leftarrow 2^b$ \\
+\hline \\
+1.  $a \leftarrow 0$ \\
+2.  If $a.alloc < \lfloor b / lg(\beta) \rfloor + 1$ then grow $a$ appropriately. \\
+3.  $a.used \leftarrow \lfloor b / lg(\beta) \rfloor + 1$ \\
+4.  $a_{\lfloor b / lg(\beta) \rfloor} \leftarrow 1 << (b \mbox{ mod } lg(\beta))$ \\
+5.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_2expt}
+\end{figure}
+
+\textbf{Algorithm mp\_2expt.}
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_2expt.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* computes a = 2**b 
+018    *
+019    * Simple algorithm which zeroes the int, grows it then just sets one bit
+020    * as required.
+021    */
+022   int
+023   mp_2expt (mp_int * a, int b)
+024   \{
+025     int     res;
+026   
+027     /* zero a as per default */
+028     mp_zero (a);
+029   
+030     /* grow a to accomodate the single bit */
+031     if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) \{
+032       return res;
+033     \}
+034   
+035     /* set the used count of where the bit will go */
+036     a->used = b / DIGIT_BIT + 1;
+037   
+038     /* put the single bit in its place */
+039     a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
+040   
+041     return MP_OKAY;
+042   \}
+043   #endif
+\end{alltt}
+\end{small}
+
+\chapter{Higher Level Algorithms}
+
+This chapter discusses the various higher level algorithms that are required to complete a well rounded multiple precision integer package.  These
+routines are less performance oriented than the algorithms of chapters five, six and seven but are no less important.  
+
+The first section describes a method of integer division with remainder that is universally well known.  It provides the signed division logic
+for the package.  The subsequent section discusses a set of algorithms which allow a single digit to be the 2nd operand for a variety of operations.  
+These algorithms serve mostly to simplify other algorithms where small constants are required.  The last two sections discuss how to manipulate 
+various representations of integers.  For example, converting from an mp\_int to a string of character.
+
+\section{Integer Division with Remainder}
+\label{sec:division}
+
+Integer division aside from modular exponentiation is the most intensive algorithm to compute.  Like addition, subtraction and multiplication
+the basis of this algorithm is the long-hand division algorithm taught to school children.  Throughout this discussion several common variables
+will be used.  Let $x$ represent the divisor and $y$ represent the dividend.  Let $q$ represent the integer quotient $\lfloor y / x \rfloor$ and 
+let $r$ represent the remainder $r = y - x \lfloor y / x \rfloor$.  The following simple algorithm will be used to start the discussion.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Radix-$\beta$ Integer Division}. \\
+\textbf{Input}.   integer $x$ and $y$ \\
+\textbf{Output}.  $q = \lfloor y/x\rfloor, r = y - xq$ \\
+\hline \\
+1.  $q \leftarrow 0$ \\
+2.  $n \leftarrow \vert \vert y \vert \vert - \vert \vert x \vert \vert$ \\
+3.  for $t$ from $n$ down to $0$ do \\
+\hspace{3mm}3.1  Maximize $k$ such that $kx\beta^t$ is less than or equal to $y$ and $(k + 1)x\beta^t$ is greater. \\
+\hspace{3mm}3.2  $q \leftarrow q + k\beta^t$ \\
+\hspace{3mm}3.3  $y \leftarrow y - kx\beta^t$ \\
+4.  $r \leftarrow y$ \\
+5.  Return($q, r$) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Radix-$\beta$ Integer Division}
+\label{fig:raddiv}
+\end{figure}
+
+As children we are taught this very simple algorithm for the case of $\beta = 10$.  Almost instinctively several optimizations are taught for which
+their reason of existing are never explained.  For this example let $y = 5471$ represent the dividend and $x = 23$ represent the divisor.
+
+To find the first digit of the quotient the value of $k$ must be maximized such that $kx\beta^t$ is less than or equal to $y$ and 
+simultaneously $(k + 1)x\beta^t$ is greater than $y$.  Implicitly $k$ is the maximum value the $t$'th digit of the quotient may have.  The habitual method
+used to find the maximum is to ``eyeball'' the two numbers, typically only the leading digits and quickly estimate a quotient.  By only using leading
+digits a much simpler division may be used to form an educated guess at what the value must be.  In this case $k = \lfloor 54/23\rfloor = 2$ quickly 
+arises as a possible  solution.  Indeed $2x\beta^2 = 4600$ is less than $y = 5471$ and simultaneously $(k + 1)x\beta^2 = 6900$ is larger than $y$.  
+As a  result $k\beta^2$ is added to the quotient which now equals $q = 200$ and $4600$ is subtracted from $y$ to give a remainder of $y = 841$.
+
+Again this process is repeated to produce the quotient digit $k = 3$ which makes the quotient $q = 200 + 3\beta = 230$ and the remainder 
+$y = 841 - 3x\beta = 181$.  Finally the last iteration of the loop produces $k = 7$ which leads to the quotient $q = 230 + 7 = 237$ and the
+remainder $y = 181 - 7x = 20$.  The final quotient and remainder found are $q = 237$ and $r = y = 20$ which are indeed correct since 
+$237 \cdot 23 + 20 = 5471$ is true.  
+
+\subsection{Quotient Estimation}
+\label{sec:divest}
+As alluded to earlier the quotient digit $k$ can be estimated from only the leading digits of both the divisor and dividend.  When $p$ leading
+digits are used from both the divisor and dividend to form an estimation the accuracy of the estimation rises as $p$ grows.  Technically
+speaking the estimation is based on assuming the lower $\vert \vert y \vert \vert - p$ and $\vert \vert x \vert \vert - p$ lower digits of the
+dividend and divisor are zero.  
+
+The value of the estimation may off by a few values in either direction and in general is fairly correct.  A simplification \cite[pp. 271]{TAOCPV2}
+of the estimation technique is to use $t + 1$ digits of the dividend and $t$ digits of the divisor, in particularly when $t = 1$.  The estimate 
+using this technique is never too small.  For the following proof let $t = \vert \vert y \vert \vert - 1$ and $s = \vert \vert x \vert \vert - 1$ 
+represent the most significant digits of the dividend and divisor respectively.
+
+\textbf{Proof.}\textit{  The quotient $\hat k = \lfloor (y_t\beta + y_{t-1}) / x_s \rfloor$ is greater than or equal to 
+$k = \lfloor y / (x \cdot \beta^{\vert \vert y \vert \vert - \vert \vert x \vert \vert - 1}) \rfloor$. }
+The first obvious case is when $\hat k = \beta - 1$ in which case the proof is concluded since the real quotient cannot be larger.  For all other 
+cases $\hat k = \lfloor (y_t\beta + y_{t-1}) / x_s \rfloor$ and $\hat k x_s \ge y_t\beta + y_{t-1} - x_s + 1$.  The latter portion of the inequalility
+$-x_s + 1$ arises from the fact that a truncated integer division will give the same quotient for at most $x_s - 1$ values.  Next a series of 
+inequalities will prove the hypothesis.
+
+\begin{equation}
+y - \hat k x \le y - \hat k x_s\beta^s
+\end{equation}
+
+This is trivially true since $x \ge x_s\beta^s$.  Next we replace $\hat kx_s\beta^s$ by the previous inequality for $\hat kx_s$.  
+
+\begin{equation}
+y - \hat k x \le y_t\beta^t + \ldots + y_0 - (y_t\beta^t + y_{t-1}\beta^{t-1} - x_s\beta^t + \beta^s)
+\end{equation}
+
+By simplifying the previous inequality the following inequality is formed.
+
+\begin{equation}
+y - \hat k x \le y_{t-2}\beta^{t-2} + \ldots + y_0 + x_s\beta^s - \beta^s
+\end{equation}
+
+Subsequently,
+
+\begin{equation}
+y_{t-2}\beta^{t-2} + \ldots +  y_0  + x_s\beta^s - \beta^s < x_s\beta^s \le x
+\end{equation}
+
+Which proves that $y - \hat kx \le x$ and by consequence $\hat k \ge k$ which concludes the proof.  \textbf{QED}
+
+
+\subsection{Normalized Integers}
+For the purposes of division a normalized input is when the divisors leading digit $x_n$ is greater than or equal to $\beta / 2$.  By multiplying both
+$x$ and $y$ by $j = \lfloor (\beta / 2) / x_n \rfloor$ the quotient remains unchanged and the remainder is simply $j$ times the original
+remainder.  The purpose of normalization is to ensure the leading digit of the divisor is sufficiently large such that the estimated quotient will
+lie in the domain of a single digit.  Consider the maximum dividend $(\beta - 1) \cdot \beta + (\beta - 1)$ and the minimum divisor $\beta / 2$.  
+
+\begin{equation} 
+{{\beta^2 - 1} \over { \beta / 2}} \le 2\beta - {2 \over \beta} 
+\end{equation}
+
+At most the quotient approaches $2\beta$, however, in practice this will not occur since that would imply the previous quotient digit was too small.  
+
+\subsection{Radix-$\beta$ Division with Remainder}
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div}. \\
+\textbf{Input}.   mp\_int $a, b$ \\
+\textbf{Output}.  $c = \lfloor a/b \rfloor$, $d = a - bc$ \\
+\hline \\
+1.  If $b = 0$ return(\textit{MP\_VAL}). \\
+2.  If $\vert a \vert < \vert b \vert$ then do \\
+\hspace{3mm}2.1  $d \leftarrow a$ \\
+\hspace{3mm}2.2  $c \leftarrow 0$ \\
+\hspace{3mm}2.3  Return(\textit{MP\_OKAY}). \\
+\\
+Setup the quotient to receive the digits. \\
+3.  Grow $q$ to $a.used + 2$ digits. \\
+4.  $q \leftarrow 0$ \\
+5.  $x \leftarrow \vert a \vert , y \leftarrow \vert b \vert$ \\
+6.  $sign \leftarrow  \left \lbrace \begin{array}{ll}
+                              MP\_ZPOS &  \mbox{if }a.sign = b.sign \\
+                              MP\_NEG  &  \mbox{otherwise} \\
+                              \end{array} \right .$ \\
+\\
+Normalize the inputs such that the leading digit of $y$ is greater than or equal to $\beta / 2$. \\
+7.  $norm \leftarrow (lg(\beta) - 1) - (\lceil lg(y) \rceil \mbox{ (mod }lg(\beta)\mbox{)})$ \\
+8.  $x \leftarrow x \cdot 2^{norm}, y \leftarrow y \cdot 2^{norm}$ \\
+\\
+Find the leading digit of the quotient. \\
+9.  $n \leftarrow x.used - 1, t \leftarrow y.used - 1$ \\
+10.  $y \leftarrow y \cdot \beta^{n - t}$ \\
+11.  While ($x \ge y$) do \\
+\hspace{3mm}11.1  $q_{n - t} \leftarrow q_{n - t} + 1$ \\
+\hspace{3mm}11.2  $x \leftarrow x - y$ \\
+12.  $y \leftarrow \lfloor y / \beta^{n-t} \rfloor$ \\
+\\
+Continued on the next page. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div} (continued). \\
+\textbf{Input}.   mp\_int $a, b$ \\
+\textbf{Output}.  $c = \lfloor a/b \rfloor$, $d = a - bc$ \\
+\hline \\
+Now find the remainder fo the digits. \\
+13.  for $i$ from $n$ down to $(t + 1)$ do \\
+\hspace{3mm}13.1  If $i > x.used$ then jump to the next iteration of this loop. \\
+\hspace{3mm}13.2  If $x_{i} = y_{t}$ then \\
+\hspace{6mm}13.2.1  $q_{i - t - 1} \leftarrow \beta - 1$ \\
+\hspace{3mm}13.3  else \\
+\hspace{6mm}13.3.1  $\hat r \leftarrow x_{i} \cdot \beta + x_{i - 1}$ \\
+\hspace{6mm}13.3.2  $\hat r \leftarrow \lfloor \hat r / y_{t} \rfloor$ \\
+\hspace{6mm}13.3.3  $q_{i - t - 1} \leftarrow \hat r$ \\
+\hspace{3mm}13.4  $q_{i - t - 1} \leftarrow q_{i - t - 1} + 1$ \\
+\\
+Fixup quotient estimation. \\
+\hspace{3mm}13.5  Loop \\
+\hspace{6mm}13.5.1  $q_{i - t - 1} \leftarrow q_{i - t - 1} - 1$ \\
+\hspace{6mm}13.5.2  t$1 \leftarrow 0$ \\
+\hspace{6mm}13.5.3  t$1_0 \leftarrow y_{t - 1}, $ t$1_1 \leftarrow y_t,$ t$1.used \leftarrow 2$ \\
+\hspace{6mm}13.5.4  $t1 \leftarrow t1 \cdot q_{i - t - 1}$ \\
+\hspace{6mm}13.5.5  t$2_0 \leftarrow x_{i - 2}, $ t$2_1 \leftarrow x_{i - 1}, $ t$2_2 \leftarrow x_i, $ t$2.used \leftarrow 3$ \\
+\hspace{6mm}13.5.6  If $\vert t1 \vert > \vert t2 \vert$ then goto step 13.5. \\
+\hspace{3mm}13.6  t$1 \leftarrow y \cdot q_{i - t - 1}$ \\
+\hspace{3mm}13.7  t$1 \leftarrow $ t$1 \cdot \beta^{i - t - 1}$ \\
+\hspace{3mm}13.8  $x \leftarrow x - $ t$1$ \\
+\hspace{3mm}13.9  If $x.sign = MP\_NEG$ then \\
+\hspace{6mm}13.10  t$1 \leftarrow y$ \\
+\hspace{6mm}13.11  t$1 \leftarrow $ t$1 \cdot \beta^{i - t - 1}$ \\
+\hspace{6mm}13.12  $x \leftarrow x + $ t$1$ \\
+\hspace{6mm}13.13  $q_{i - t - 1} \leftarrow q_{i - t - 1} - 1$ \\
+\\
+Finalize the result. \\
+14.  Clamp excess digits of $q$ \\
+15.  $c \leftarrow q, c.sign \leftarrow sign$ \\
+16.  $x.sign \leftarrow a.sign$ \\
+17.  $d \leftarrow \lfloor x / 2^{norm} \rfloor$ \\
+18.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div (continued)}
+\end{figure}
+\textbf{Algorithm mp\_div.}
+This algorithm will calculate quotient and remainder from an integer division given a dividend and divisor.  The algorithm is a signed
+division and will produce a fully qualified quotient and remainder.
+
+First the divisor $b$ must be non-zero which is enforced in step one.  If the divisor is larger than the dividend than the quotient is implicitly 
+zero and the remainder is the dividend.  
+
+After the first two trivial cases of inputs are handled the variable $q$ is setup to receive the digits of the quotient.  Two unsigned copies of the
+divisor $y$ and dividend $x$ are made as well.  The core of the division algorithm is an unsigned division and will only work if the values are
+positive.  Now the two values $x$ and $y$ must be normalized such that the leading digit of $y$ is greater than or equal to $\beta / 2$.  
+This is performed by shifting both to the left by enough bits to get the desired normalization.  
+
+At this point the division algorithm can begin producing digits of the quotient.  Recall that maximum value of the estimation used is 
+$2\beta - {2 \over \beta}$ which means that a digit of the quotient must be first produced by another means.  In this case $y$ is shifted
+to the left (\textit{step ten}) so that it has the same number of digits as $x$.  The loop on step eleven will subtract multiples of the 
+shifted copy of $y$ until $x$ is smaller.  Since the leading digit of $y$ is greater than or equal to $\beta/2$ this loop will iterate at most two
+times to produce the desired leading digit of the quotient.  
+
+Now the remainder of the digits can be produced.  The equation $\hat q = \lfloor {{x_i \beta + x_{i-1}}\over y_t} \rfloor$ is used to fairly
+accurately approximate the true quotient digit.  The estimation can in theory produce an estimation as high as $2\beta - {2 \over \beta}$ but by
+induction the upper quotient digit is correct (\textit{as established on step eleven}) and the estimate must be less than $\beta$.  
+
+Recall from section~\ref{sec:divest} that the estimation is never too low but may be too high.  The next step of the estimation process is
+to refine the estimation.  The loop on step 13.5 uses $x_i\beta^2 + x_{i-1}\beta + x_{i-2}$ and $q_{i - t - 1}(y_t\beta + y_{t-1})$ as a higher
+order approximation to adjust the quotient digit.
+
+After both phases of estimation the quotient digit may still be off by a value of one\footnote{This is similar to the error introduced
+by optimizing Barrett reduction.}.  Steps 13.6 and 13.7 subtract the multiple of the divisor from the dividend (\textit{Similar to step 3.3 of
+algorithm~\ref{fig:raddiv}} and then subsequently add a multiple of the divisor if the quotient was too large.  
+
+Now that the quotient has been determine finializing the result is a matter of clamping the quotient, fixing the sizes and de-normalizing the 
+remainder.  An important aspect of this algorithm seemingly overlooked in other descriptions such as that of Algorithm 14.20 HAC \cite[pp. 598]{HAC}
+is that when the estimations are being made (\textit{inside the loop on step 13.5}) that the digits $y_{t-1}$, $x_{i-2}$ and $x_{i-1}$ may lie 
+outside their respective boundaries.  For example, if $t = 0$ or $i \le 1$ then the digits would be undefined.  In those cases the digits should
+respectively be replaced with a zero.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_div.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   #ifdef BN_MP_DIV_SMALL
+018   
+019   /* slower bit-bang division... also smaller */
+020   int mp_div(mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+021   \{
+022      mp_int ta, tb, tq, q;
+023      int    res, n, n2;
+024   
+025     /* is divisor zero ? */
+026     if (mp_iszero (b) == 1) \{
+027       return MP_VAL;
+028     \}
+029   
+030     /* if a < b then q=0, r = a */
+031     if (mp_cmp_mag (a, b) == MP_LT) \{
+032       if (d != NULL) \{
+033         res = mp_copy (a, d);
+034       \} else \{
+035         res = MP_OKAY;
+036       \}
+037       if (c != NULL) \{
+038         mp_zero (c);
+039       \}
+040       return res;
+041     \}
+042       
+043     /* init our temps */
+044     if ((res = mp_init_multi(&ta, &tb, &tq, &q, NULL) != MP_OKAY)) \{
+045        return res;
+046     \}
+047   
+048   
+049     mp_set(&tq, 1);
+050     n = mp_count_bits(a) - mp_count_bits(b);
+051     if (((res = mp_abs(a, &ta)) != MP_OKAY) ||
+052         ((res = mp_abs(b, &tb)) != MP_OKAY) || 
+053         ((res = mp_mul_2d(&tb, n, &tb)) != MP_OKAY) ||
+054         ((res = mp_mul_2d(&tq, n, &tq)) != MP_OKAY)) \{
+055         goto LBL_ERR;
+056     \}
+057   
+058     while (n-- >= 0) \{
+059        if (mp_cmp(&tb, &ta) != MP_GT) \{
+060           if (((res = mp_sub(&ta, &tb, &ta)) != MP_OKAY) ||
+061               ((res = mp_add(&q, &tq, &q)) != MP_OKAY)) \{
+062              goto LBL_ERR;
+063           \}
+064        \}
+065        if (((res = mp_div_2d(&tb, 1, &tb, NULL)) != MP_OKAY) ||
+066            ((res = mp_div_2d(&tq, 1, &tq, NULL)) != MP_OKAY)) \{
+067              goto LBL_ERR;
+068        \}
+069     \}
+070   
+071     /* now q == quotient and ta == remainder */
+072     n  = a->sign;
+073     n2 = (a->sign == b->sign ? MP_ZPOS : MP_NEG);
+074     if (c != NULL) \{
+075        mp_exch(c, &q);
+076        c->sign  = (mp_iszero(c) == MP_YES) ? MP_ZPOS : n2;
+077     \}
+078     if (d != NULL) \{
+079        mp_exch(d, &ta);
+080        d->sign = (mp_iszero(d) == MP_YES) ? MP_ZPOS : n;
+081     \}
+082   LBL_ERR:
+083      mp_clear_multi(&ta, &tb, &tq, &q, NULL);
+084      return res;
+085   \}
+086   
+087   #else
+088   
+089   /* integer signed division. 
+090    * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
+091    * HAC pp.598 Algorithm 14.20
+092    *
+093    * Note that the description in HAC is horribly 
+094    * incomplete.  For example, it doesn't consider 
+095    * the case where digits are removed from 'x' in 
+096    * the inner loop.  It also doesn't consider the 
+097    * case that y has fewer than three digits, etc..
+098    *
+099    * The overall algorithm is as described as 
+100    * 14.20 from HAC but fixed to treat these cases.
+101   */
+102   int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+103   \{
+104     mp_int  q, x, y, t1, t2;
+105     int     res, n, t, i, norm, neg;
+106   
+107     /* is divisor zero ? */
+108     if (mp_iszero (b) == 1) \{
+109       return MP_VAL;
+110     \}
+111   
+112     /* if a < b then q=0, r = a */
+113     if (mp_cmp_mag (a, b) == MP_LT) \{
+114       if (d != NULL) \{
+115         res = mp_copy (a, d);
+116       \} else \{
+117         res = MP_OKAY;
+118       \}
+119       if (c != NULL) \{
+120         mp_zero (c);
+121       \}
+122       return res;
+123     \}
+124   
+125     if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) \{
+126       return res;
+127     \}
+128     q.used = a->used + 2;
+129   
+130     if ((res = mp_init (&t1)) != MP_OKAY) \{
+131       goto LBL_Q;
+132     \}
+133   
+134     if ((res = mp_init (&t2)) != MP_OKAY) \{
+135       goto LBL_T1;
+136     \}
+137   
+138     if ((res = mp_init_copy (&x, a)) != MP_OKAY) \{
+139       goto LBL_T2;
+140     \}
+141   
+142     if ((res = mp_init_copy (&y, b)) != MP_OKAY) \{
+143       goto LBL_X;
+144     \}
+145   
+146     /* fix the sign */
+147     neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
+148     x.sign = y.sign = MP_ZPOS;
+149   
+150     /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
+151     norm = mp_count_bits(&y) % DIGIT_BIT;
+152     if (norm < (int)(DIGIT_BIT-1)) \{
+153        norm = (DIGIT_BIT-1) - norm;
+154        if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) \{
+155          goto LBL_Y;
+156        \}
+157        if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) \{
+158          goto LBL_Y;
+159        \}
+160     \} else \{
+161        norm = 0;
+162     \}
+163   
+164     /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
+165     n = x.used - 1;
+166     t = y.used - 1;
+167   
+168     /* while (x >= y*b**n-t) do \{ q[n-t] += 1; x -= y*b**\{n-t\} \} */
+169     if ((res = mp_lshd (&y, n - t)) != MP_OKAY) \{ /* y = y*b**\{n-t\} */
+170       goto LBL_Y;
+171     \}
+172   
+173     while (mp_cmp (&x, &y) != MP_LT) \{
+174       ++(q.dp[n - t]);
+175       if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) \{
+176         goto LBL_Y;
+177       \}
+178     \}
+179   
+180     /* reset y by shifting it back down */
+181     mp_rshd (&y, n - t);
+182   
+183     /* step 3. for i from n down to (t + 1) */
+184     for (i = n; i >= (t + 1); i--) \{
+185       if (i > x.used) \{
+186         continue;
+187       \}
+188   
+189       /* step 3.1 if xi == yt then set q\{i-t-1\} to b-1, 
+190        * otherwise set q\{i-t-1\} to (xi*b + x\{i-1\})/yt */
+191       if (x.dp[i] == y.dp[t]) \{
+192         q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
+193       \} else \{
+194         mp_word tmp;
+195         tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
+196         tmp |= ((mp_word) x.dp[i - 1]);
+197         tmp /= ((mp_word) y.dp[t]);
+198         if (tmp > (mp_word) MP_MASK)
+199           tmp = MP_MASK;
+200         q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
+201       \}
+202   
+203       /* while (q\{i-t-1\} * (yt * b + y\{t-1\})) > 
+204                xi * b**2 + xi-1 * b + xi-2 
+205        
+206          do q\{i-t-1\} -= 1; 
+207       */
+208       q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
+209       do \{
+210         q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
+211   
+212         /* find left hand */
+213         mp_zero (&t1);
+214         t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
+215         t1.dp[1] = y.dp[t];
+216         t1.used = 2;
+217         if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) \{
+218           goto LBL_Y;
+219         \}
+220   
+221         /* find right hand */
+222         t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
+223         t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
+224         t2.dp[2] = x.dp[i];
+225         t2.used = 3;
+226       \} while (mp_cmp_mag(&t1, &t2) == MP_GT);
+227   
+228       /* step 3.3 x = x - q\{i-t-1\} * y * b**\{i-t-1\} */
+229       if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) \{
+230         goto LBL_Y;
+231       \}
+232   
+233       if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) \{
+234         goto LBL_Y;
+235       \}
+236   
+237       if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) \{
+238         goto LBL_Y;
+239       \}
+240   
+241       /* if x < 0 then \{ x = x + y*b**\{i-t-1\}; q\{i-t-1\} -= 1; \} */
+242       if (x.sign == MP_NEG) \{
+243         if ((res = mp_copy (&y, &t1)) != MP_OKAY) \{
+244           goto LBL_Y;
+245         \}
+246         if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) \{
+247           goto LBL_Y;
+248         \}
+249         if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) \{
+250           goto LBL_Y;
+251         \}
+252   
+253         q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
+254       \}
+255     \}
+256   
+257     /* now q is the quotient and x is the remainder 
+258      * [which we have to normalize] 
+259      */
+260     
+261     /* get sign before writing to c */
+262     x.sign = x.used == 0 ? MP_ZPOS : a->sign;
+263   
+264     if (c != NULL) \{
+265       mp_clamp (&q);
+266       mp_exch (&q, c);
+267       c->sign = neg;
+268     \}
+269   
+270     if (d != NULL) \{
+271       mp_div_2d (&x, norm, &x, NULL);
+272       mp_exch (&x, d);
+273     \}
+274   
+275     res = MP_OKAY;
+276   
+277   LBL_Y:mp_clear (&y);
+278   LBL_X:mp_clear (&x);
+279   LBL_T2:mp_clear (&t2);
+280   LBL_T1:mp_clear (&t1);
+281   LBL_Q:mp_clear (&q);
+282     return res;
+283   \}
+284   
+285   #endif
+286   
+287   #endif
+\end{alltt}
+\end{small}
+
+The implementation of this algorithm differs slightly from the pseudo code presented previously.  In this algorithm either of the quotient $c$ or
+remainder $d$ may be passed as a \textbf{NULL} pointer which indicates their value is not desired.  For example, the C code to call the division
+algorithm with only the quotient is 
+
+\begin{verbatim}
+mp_div(&a, &b, &c, NULL);  /* c = [a/b] */
+\end{verbatim}
+
+Lines 37 and 44 handle the two trivial cases of inputs which are division by zero and dividend smaller than the divisor 
+respectively.  After the two trivial cases all of the temporary variables are initialized.  Line 105 determines the sign of 
+the quotient and line 76 ensures that both $x$ and $y$ are positive.  
+
+The number of bits in the leading digit is calculated on line 105.  Implictly an mp\_int with $r$ digits will require $lg(\beta)(r-1) + k$ bits
+of precision which when reduced modulo $lg(\beta)$ produces the value of $k$.  In this case $k$ is the number of bits in the leading digit which is
+exactly what is required.  For the algorithm to operate $k$ must equal $lg(\beta) - 1$ and when it does not the inputs must be normalized by shifting
+them to the left by $lg(\beta) - 1 - k$ bits.
+
+Throughout the variables $n$ and $t$ will represent the highest digit of $x$ and $y$ respectively.  These are first used to produce the 
+leading digit of the quotient.  The loop beginning on line 183 will produce the remainder of the quotient digits.
+
+The conditional ``continue'' on line 114 is used to prevent the algorithm from reading past the leading edge of $x$ which can occur when the
+algorithm eliminates multiple non-zero digits in a single iteration.  This ensures that $x_i$ is always non-zero since by definition the digits
+above the $i$'th position $x$ must be zero in order for the quotient to be precise\footnote{Precise as far as integer division is concerned.}.  
+
+Lines 130, 130 and 134 through 134 manually construct the high accuracy estimations by setting the digits of the two mp\_int 
+variables directly.  
+
+\section{Single Digit Helpers}
+
+This section briefly describes a series of single digit helper algorithms which come in handy when working with small constants.  All of 
+the helper functions assume the single digit input is positive and will treat them as such.
+
+\subsection{Single Digit Addition and Subtraction}
+
+Both addition and subtraction are performed by ``cheating'' and using mp\_set followed by the higher level addition or subtraction 
+algorithms.   As a result these algorithms are subtantially simpler with a slight cost in performance.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_add\_d}. \\
+\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
+\textbf{Output}.  $c = a + b$ \\
+\hline \\
+1.  $t \leftarrow b$ (\textit{mp\_set}) \\
+2.  $c \leftarrow a + t$ \\
+3.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_add\_d}
+\end{figure}
+
+\textbf{Algorithm mp\_add\_d.}
+This algorithm initiates a temporary mp\_int with the value of the single digit and uses algorithm mp\_add to add the two values together.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_add\_d.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* single digit addition */
+018   int
+019   mp_add_d (mp_int * a, mp_digit b, mp_int * c)
+020   \{
+021     int     res, ix, oldused;
+022     mp_digit *tmpa, *tmpc, mu;
+023   
+024     /* grow c as required */
+025     if (c->alloc < a->used + 1) \{
+026        if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) \{
+027           return res;
+028        \}
+029     \}
+030   
+031     /* if a is negative and |a| >= b, call c = |a| - b */
+032     if (a->sign == MP_NEG && (a->used > 1 || a->dp[0] >= b)) \{
+033        /* temporarily fix sign of a */
+034        a->sign = MP_ZPOS;
+035   
+036        /* c = |a| - b */
+037        res = mp_sub_d(a, b, c);
+038   
+039        /* fix sign  */
+040        a->sign = c->sign = MP_NEG;
+041   
+042        return res;
+043     \}
+044   
+045     /* old number of used digits in c */
+046     oldused = c->used;
+047   
+048     /* sign always positive */
+049     c->sign = MP_ZPOS;
+050   
+051     /* source alias */
+052     tmpa    = a->dp;
+053   
+054     /* destination alias */
+055     tmpc    = c->dp;
+056   
+057     /* if a is positive */
+058     if (a->sign == MP_ZPOS) \{
+059        /* add digit, after this we're propagating
+060         * the carry.
+061         */
+062        *tmpc   = *tmpa++ + b;
+063        mu      = *tmpc >> DIGIT_BIT;
+064        *tmpc++ &= MP_MASK;
+065   
+066        /* now handle rest of the digits */
+067        for (ix = 1; ix < a->used; ix++) \{
+068           *tmpc   = *tmpa++ + mu;
+069           mu      = *tmpc >> DIGIT_BIT;
+070           *tmpc++ &= MP_MASK;
+071        \}
+072        /* set final carry */
+073        ix++;
+074        *tmpc++  = mu;
+075   
+076        /* setup size */
+077        c->used = a->used + 1;
+078     \} else \{
+079        /* a was negative and |a| < b */
+080        c->used  = 1;
+081   
+082        /* the result is a single digit */
+083        if (a->used == 1) \{
+084           *tmpc++  =  b - a->dp[0];
+085        \} else \{
+086           *tmpc++  =  b;
+087        \}
+088   
+089        /* setup count so the clearing of oldused
+090         * can fall through correctly
+091         */
+092        ix       = 1;
+093     \}
+094   
+095     /* now zero to oldused */
+096     while (ix++ < oldused) \{
+097        *tmpc++ = 0;
+098     \}
+099     mp_clamp(c);
+100   
+101     return MP_OKAY;
+102   \}
+103   
+104   #endif
+\end{alltt}
+\end{small}
+
+Clever use of the letter 't'.
+
+\subsubsection{Subtraction}
+The single digit subtraction algorithm mp\_sub\_d is essentially the same except it uses mp\_sub to subtract the digit from the mp\_int.
+
+\subsection{Single Digit Multiplication}
+Single digit multiplication arises enough in division and radix conversion that it ought to be implement as a special case of the baseline
+multiplication algorithm.  Essentially this algorithm is a modified version of algorithm s\_mp\_mul\_digs where one of the multiplicands
+only has one digit.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul\_d}. \\
+\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
+\textbf{Output}.  $c = ab$ \\
+\hline \\
+1.  $pa \leftarrow a.used$ \\
+2.  Grow $c$ to at least $pa + 1$ digits. \\
+3.  $oldused \leftarrow c.used$ \\
+4.  $c.used \leftarrow pa + 1$ \\
+5.  $c.sign \leftarrow a.sign$ \\
+6.  $\mu \leftarrow 0$ \\
+7.  for $ix$ from $0$ to $pa - 1$ do \\
+\hspace{3mm}7.1  $\hat r \leftarrow \mu + a_{ix}b$ \\
+\hspace{3mm}7.2  $c_{ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}7.3  $\mu \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+8.  $c_{pa} \leftarrow \mu$ \\
+9.  for $ix$ from $pa + 1$ to $oldused$ do \\
+\hspace{3mm}9.1  $c_{ix} \leftarrow 0$ \\
+10.  Clamp excess digits of $c$. \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul\_d}
+\end{figure}
+\textbf{Algorithm mp\_mul\_d.}
+This algorithm quickly multiplies an mp\_int by a small single digit value.  It is specially tailored to the job and has a minimal of overhead.  
+Unlike the full multiplication algorithms this algorithm does not require any significnat temporary storage or memory allocations.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_mul\_d.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* multiply by a digit */
+018   int
+019   mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
+020   \{
+021     mp_digit u, *tmpa, *tmpc;
+022     mp_word  r;
+023     int      ix, res, olduse;
+024   
+025     /* make sure c is big enough to hold a*b */
+026     if (c->alloc < a->used + 1) \{
+027       if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) \{
+028         return res;
+029       \}
+030     \}
+031   
+032     /* get the original destinations used count */
+033     olduse = c->used;
+034   
+035     /* set the sign */
+036     c->sign = a->sign;
+037   
+038     /* alias for a->dp [source] */
+039     tmpa = a->dp;
+040   
+041     /* alias for c->dp [dest] */
+042     tmpc = c->dp;
+043   
+044     /* zero carry */
+045     u = 0;
+046   
+047     /* compute columns */
+048     for (ix = 0; ix < a->used; ix++) \{
+049       /* compute product and carry sum for this term */
+050       r       = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
+051   
+052       /* mask off higher bits to get a single digit */
+053       *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
+054   
+055       /* send carry into next iteration */
+056       u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+057     \}
+058   
+059     /* store final carry [if any] and increment ix offset  */
+060     *tmpc++ = u;
+061     ++ix;
+062   
+063     /* now zero digits above the top */
+064     while (ix++ < olduse) \{
+065        *tmpc++ = 0;
+066     \}
+067   
+068     /* set used count */
+069     c->used = a->used + 1;
+070     mp_clamp(c);
+071   
+072     return MP_OKAY;
+073   \}
+074   #endif
+\end{alltt}
+\end{small}
+
+In this implementation the destination $c$ may point to the same mp\_int as the source $a$ since the result is written after the digit is 
+read from the source.  This function uses pointer aliases $tmpa$ and $tmpc$ for the digits of $a$ and $c$ respectively.  
+
+\subsection{Single Digit Division}
+Like the single digit multiplication algorithm, single digit division is also a fairly common algorithm used in radix conversion.  Since the
+divisor is only a single digit a specialized variant of the division algorithm can be used to compute the quotient.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div\_d}. \\
+\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
+\textbf{Output}.  $c = \lfloor a / b \rfloor, d = a - cb$ \\
+\hline \\
+1.  If $b = 0$ then return(\textit{MP\_VAL}).\\
+2.  If $b = 3$ then use algorithm mp\_div\_3 instead. \\
+3.  Init $q$ to $a.used$ digits.  \\
+4.  $q.used \leftarrow a.used$ \\
+5.  $q.sign \leftarrow a.sign$ \\
+6.  $\hat w \leftarrow 0$ \\
+7.  for $ix$ from $a.used - 1$ down to $0$ do \\
+\hspace{3mm}7.1  $\hat w \leftarrow \hat w \beta + a_{ix}$ \\
+\hspace{3mm}7.2  If $\hat w \ge b$ then \\
+\hspace{6mm}7.2.1  $t \leftarrow \lfloor \hat w / b \rfloor$ \\
+\hspace{6mm}7.2.2  $\hat w \leftarrow \hat w \mbox{ (mod }b\mbox{)}$ \\
+\hspace{3mm}7.3  else\\
+\hspace{6mm}7.3.1  $t \leftarrow 0$ \\
+\hspace{3mm}7.4  $q_{ix} \leftarrow t$ \\
+8.  $d \leftarrow \hat w$ \\
+9.  Clamp excess digits of $q$. \\
+10.  $c \leftarrow q$ \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div\_d}
+\end{figure}
+\textbf{Algorithm mp\_div\_d.}
+This algorithm divides the mp\_int $a$ by the single mp\_digit $b$ using an optimized approach.  Essentially in every iteration of the
+algorithm another digit of the dividend is reduced and another digit of quotient produced.  Provided $b < \beta$ the value of $\hat w$
+after step 7.1 will be limited such that $0 \le \lfloor \hat w / b \rfloor < \beta$.  
+
+If the divisor $b$ is equal to three a variant of this algorithm is used which is called mp\_div\_3.  It replaces the division by three with
+a multiplication by $\lfloor \beta / 3 \rfloor$ and the appropriate shift and residual fixup.  In essence it is much like the Barrett reduction
+from chapter seven.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_div\_d.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   static int s_is_power_of_two(mp_digit b, int *p)
+018   \{
+019      int x;
+020   
+021      for (x = 1; x < DIGIT_BIT; x++) \{
+022         if (b == (((mp_digit)1)<<x)) \{
+023            *p = x;
+024            return 1;
+025         \}
+026      \}
+027      return 0;
+028   \}
+029   
+030   /* single digit division (based on routine from MPI) */
+031   int mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
+032   \{
+033     mp_int  q;
+034     mp_word w;
+035     mp_digit t;
+036     int     res, ix;
+037   
+038     /* cannot divide by zero */
+039     if (b == 0) \{
+040        return MP_VAL;
+041     \}
+042   
+043     /* quick outs */
+044     if (b == 1 || mp_iszero(a) == 1) \{
+045        if (d != NULL) \{
+046           *d = 0;
+047        \}
+048        if (c != NULL) \{
+049           return mp_copy(a, c);
+050        \}
+051        return MP_OKAY;
+052     \}
+053   
+054     /* power of two ? */
+055     if (s_is_power_of_two(b, &ix) == 1) \{
+056        if (d != NULL) \{
+057           *d = a->dp[0] & ((((mp_digit)1)<<ix) - 1);
+058        \}
+059        if (c != NULL) \{
+060           return mp_div_2d(a, ix, c, NULL);
+061        \}
+062        return MP_OKAY;
+063     \}
+064   
+065   #ifdef BN_MP_DIV_3_C
+066     /* three? */
+067     if (b == 3) \{
+068        return mp_div_3(a, c, d);
+069     \}
+070   #endif
+071   
+072     /* no easy answer [c'est la vie].  Just division */
+073     if ((res = mp_init_size(&q, a->used)) != MP_OKAY) \{
+074        return res;
+075     \}
+076     
+077     q.used = a->used;
+078     q.sign = a->sign;
+079     w = 0;
+080     for (ix = a->used - 1; ix >= 0; ix--) \{
+081        w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
+082        
+083        if (w >= b) \{
+084           t = (mp_digit)(w / b);
+085           w -= ((mp_word)t) * ((mp_word)b);
+086         \} else \{
+087           t = 0;
+088         \}
+089         q.dp[ix] = (mp_digit)t;
+090     \}
+091     
+092     if (d != NULL) \{
+093        *d = (mp_digit)w;
+094     \}
+095     
+096     if (c != NULL) \{
+097        mp_clamp(&q);
+098        mp_exch(&q, c);
+099     \}
+100     mp_clear(&q);
+101     
+102     return res;
+103   \}
+104   
+105   #endif
+\end{alltt}
+\end{small}
+
+Like the implementation of algorithm mp\_div this algorithm allows either of the quotient or remainder to be passed as a \textbf{NULL} pointer to
+indicate the respective value is not required.  This allows a trivial single digit modular reduction algorithm, mp\_mod\_d to be created.
+
+The division and remainder on lines 43 and @45,%@ can be replaced often by a single division on most processors.  For example, the 32-bit x86 based 
+processors can divide a 64-bit quantity by a 32-bit quantity and produce the quotient and remainder simultaneously.  Unfortunately the GCC 
+compiler does not recognize that optimization and will actually produce two function calls to find the quotient and remainder respectively.  
+
+\subsection{Single Digit Root Extraction}
+
+Finding the $n$'th root of an integer is fairly easy as far as numerical analysis is concerned.  Algorithms such as the Newton-Raphson approximation 
+(\ref{eqn:newton}) series will converge very quickly to a root for any continuous function $f(x)$.  
+
+\begin{equation}
+x_{i+1} = x_i - {f(x_i) \over f'(x_i)}
+\label{eqn:newton}
+\end{equation}
+
+In this case the $n$'th root is desired and $f(x) = x^n - a$ where $a$ is the integer of which the root is desired.  The derivative of $f(x)$ is 
+simply $f'(x) = nx^{n - 1}$.  Of particular importance is that this algorithm will be used over the integers not over the a more continuous domain
+such as the real numbers.  As a result the root found can be above the true root by few and must be manually adjusted.  Ideally at the end of the 
+algorithm the $n$'th root $b$ of an integer $a$ is desired such that $b^n \le a$.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_n\_root}. \\
+\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
+\textbf{Output}.  $c^b \le a$ \\
+\hline \\
+1.  If $b$ is even and $a.sign = MP\_NEG$ return(\textit{MP\_VAL}). \\
+2.  $sign \leftarrow a.sign$ \\
+3.  $a.sign \leftarrow MP\_ZPOS$ \\
+4.  t$2 \leftarrow 2$ \\
+5.  Loop \\
+\hspace{3mm}5.1  t$1 \leftarrow $ t$2$ \\
+\hspace{3mm}5.2  t$3 \leftarrow $ t$1^{b - 1}$ \\
+\hspace{3mm}5.3  t$2 \leftarrow $ t$3 $ $\cdot$ t$1$ \\
+\hspace{3mm}5.4  t$2 \leftarrow $ t$2 - a$ \\
+\hspace{3mm}5.5  t$3 \leftarrow $ t$3 \cdot b$ \\
+\hspace{3mm}5.6  t$3 \leftarrow \lfloor $t$2 / $t$3 \rfloor$ \\
+\hspace{3mm}5.7  t$2 \leftarrow $ t$1 - $ t$3$ \\
+\hspace{3mm}5.8  If t$1 \ne $ t$2$ then goto step 5.  \\
+6.  Loop \\
+\hspace{3mm}6.1  t$2 \leftarrow $ t$1^b$ \\
+\hspace{3mm}6.2  If t$2 > a$ then \\
+\hspace{6mm}6.2.1  t$1 \leftarrow $ t$1 - 1$ \\
+\hspace{6mm}6.2.2  Goto step 6. \\
+7.  $a.sign \leftarrow sign$ \\
+8.  $c \leftarrow $ t$1$ \\
+9.  $c.sign \leftarrow sign$  \\
+10.  Return(\textit{MP\_OKAY}).  \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_n\_root}
+\end{figure}
+\textbf{Algorithm mp\_n\_root.}
+This algorithm finds the integer $n$'th root of an input using the Newton-Raphson approach.  It is partially optimized based on the observation
+that the numerator of ${f(x) \over f'(x)}$ can be derived from a partial denominator.  That is at first the denominator is calculated by finding
+$x^{b - 1}$.  This value can then be multiplied by $x$ and have $a$ subtracted from it to find the numerator.  This saves a total of $b - 1$ 
+multiplications by t$1$ inside the loop.  
+
+The initial value of the approximation is t$2 = 2$ which allows the algorithm to start with very small values and quickly converge on the
+root.  Ideally this algorithm is meant to find the $n$'th root of an input where $n$ is bounded by $2 \le n \le 5$.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_n\_root.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* find the n'th root of an integer 
+018    *
+019    * Result found such that (c)**b <= a and (c+1)**b > a 
+020    *
+021    * This algorithm uses Newton's approximation 
+022    * x[i+1] = x[i] - f(x[i])/f'(x[i]) 
+023    * which will find the root in log(N) time where 
+024    * each step involves a fair bit.  This is not meant to 
+025    * find huge roots [square and cube, etc].
+026    */
+027   int mp_n_root (mp_int * a, mp_digit b, mp_int * c)
+028   \{
+029     mp_int  t1, t2, t3;
+030     int     res, neg;
+031   
+032     /* input must be positive if b is even */
+033     if ((b & 1) == 0 && a->sign == MP_NEG) \{
+034       return MP_VAL;
+035     \}
+036   
+037     if ((res = mp_init (&t1)) != MP_OKAY) \{
+038       return res;
+039     \}
+040   
+041     if ((res = mp_init (&t2)) != MP_OKAY) \{
+042       goto LBL_T1;
+043     \}
+044   
+045     if ((res = mp_init (&t3)) != MP_OKAY) \{
+046       goto LBL_T2;
+047     \}
+048   
+049     /* if a is negative fudge the sign but keep track */
+050     neg     = a->sign;
+051     a->sign = MP_ZPOS;
+052   
+053     /* t2 = 2 */
+054     mp_set (&t2, 2);
+055   
+056     do \{
+057       /* t1 = t2 */
+058       if ((res = mp_copy (&t2, &t1)) != MP_OKAY) \{
+059         goto LBL_T3;
+060       \}
+061   
+062       /* t2 = t1 - ((t1**b - a) / (b * t1**(b-1))) */
+063       
+064       /* t3 = t1**(b-1) */
+065       if ((res = mp_expt_d (&t1, b - 1, &t3)) != MP_OKAY) \{   
+066         goto LBL_T3;
+067       \}
+068   
+069       /* numerator */
+070       /* t2 = t1**b */
+071       if ((res = mp_mul (&t3, &t1, &t2)) != MP_OKAY) \{    
+072         goto LBL_T3;
+073       \}
+074   
+075       /* t2 = t1**b - a */
+076       if ((res = mp_sub (&t2, a, &t2)) != MP_OKAY) \{  
+077         goto LBL_T3;
+078       \}
+079   
+080       /* denominator */
+081       /* t3 = t1**(b-1) * b  */
+082       if ((res = mp_mul_d (&t3, b, &t3)) != MP_OKAY) \{    
+083         goto LBL_T3;
+084       \}
+085   
+086       /* t3 = (t1**b - a)/(b * t1**(b-1)) */
+087       if ((res = mp_div (&t2, &t3, &t3, NULL)) != MP_OKAY) \{  
+088         goto LBL_T3;
+089       \}
+090   
+091       if ((res = mp_sub (&t1, &t3, &t2)) != MP_OKAY) \{
+092         goto LBL_T3;
+093       \}
+094     \}  while (mp_cmp (&t1, &t2) != MP_EQ);
+095   
+096     /* result can be off by a few so check */
+097     for (;;) \{
+098       if ((res = mp_expt_d (&t1, b, &t2)) != MP_OKAY) \{
+099         goto LBL_T3;
+100       \}
+101   
+102       if (mp_cmp (&t2, a) == MP_GT) \{
+103         if ((res = mp_sub_d (&t1, 1, &t1)) != MP_OKAY) \{
+104            goto LBL_T3;
+105         \}
+106       \} else \{
+107         break;
+108       \}
+109     \}
+110   
+111     /* reset the sign of a first */
+112     a->sign = neg;
+113   
+114     /* set the result */
+115     mp_exch (&t1, c);
+116   
+117     /* set the sign of the result */
+118     c->sign = neg;
+119   
+120     res = MP_OKAY;
+121   
+122   LBL_T3:mp_clear (&t3);
+123   LBL_T2:mp_clear (&t2);
+124   LBL_T1:mp_clear (&t1);
+125     return res;
+126   \}
+127   #endif
+\end{alltt}
+\end{small}
+
+\section{Random Number Generation}
+
+Random numbers come up in a variety of activities from public key cryptography to simple simulations and various randomized algorithms.  Pollard-Rho 
+factoring for example, can make use of random values as starting points to find factors of a composite integer.  In this case the algorithm presented
+is solely for simulations and not intended for cryptographic use.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_rand}. \\
+\textbf{Input}.   An integer $b$ \\
+\textbf{Output}.  A pseudo-random number of $b$ digits \\
+\hline \\
+1.  $a \leftarrow 0$ \\
+2.  If $b \le 0$ return(\textit{MP\_OKAY}) \\
+3.  Pick a non-zero random digit $d$. \\
+4.  $a \leftarrow a + d$ \\
+5.  for $ix$ from 1 to $d - 1$ do \\
+\hspace{3mm}5.1  $a \leftarrow a \cdot \beta$ \\
+\hspace{3mm}5.2  Pick a random digit $d$. \\
+\hspace{3mm}5.3  $a \leftarrow a + d$ \\
+6.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_rand}
+\end{figure}
+\textbf{Algorithm mp\_rand.}
+This algorithm produces a pseudo-random integer of $b$ digits.  By ensuring that the first digit is non-zero the algorithm also guarantees that the
+final result has at least $b$ digits.  It relies heavily on a third-part random number generator which should ideally generate uniformly all of
+the integers from $0$ to $\beta - 1$.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_rand.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* makes a pseudo-random int of a given size */
+018   int
+019   mp_rand (mp_int * a, int digits)
+020   \{
+021     int     res;
+022     mp_digit d;
+023   
+024     mp_zero (a);
+025     if (digits <= 0) \{
+026       return MP_OKAY;
+027     \}
+028   
+029     /* first place a random non-zero digit */
+030     do \{
+031       d = ((mp_digit) abs (rand ())) & MP_MASK;
+032     \} while (d == 0);
+033   
+034     if ((res = mp_add_d (a, d, a)) != MP_OKAY) \{
+035       return res;
+036     \}
+037   
+038     while (--digits > 0) \{
+039       if ((res = mp_lshd (a, 1)) != MP_OKAY) \{
+040         return res;
+041       \}
+042   
+043       if ((res = mp_add_d (a, ((mp_digit) abs (rand ())), a)) != MP_OKAY) \{
+044         return res;
+045       \}
+046     \}
+047   
+048     return MP_OKAY;
+049   \}
+050   #endif
+\end{alltt}
+\end{small}
+
+\section{Formatted Representations}
+The ability to emit a radix-$n$ textual representation of an integer is useful for interacting with human parties.  For example, the ability to
+be given a string of characters such as ``114585'' and turn it into the radix-$\beta$ equivalent would make it easier to enter numbers
+into a program.
+
+\subsection{Reading Radix-n Input}
+For the purposes of this text we will assume that a simple lower ASCII map (\ref{fig:ASC}) is used for the values of from $0$ to $63$ to 
+printable characters.  For example, when the character ``N'' is read it represents the integer $23$.  The first $16$ characters of the
+map are for the common representations up to hexadecimal.  After that they match the ``base64'' encoding scheme which are suitable chosen
+such that they are printable.  While outputting as base64 may not be too helpful for human operators it does allow communication via non binary
+mediums.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{cc|cc|cc|cc}
+\hline \textbf{Value} & \textbf{Char} & \textbf{Value} & \textbf{Char} & \textbf{Value} & \textbf{Char} &  \textbf{Value} & \textbf{Char} \\
+\hline 
+0 & 0 & 1 & 1 & 2 & 2 & 3 & 3 \\
+4 & 4 & 5 & 5 & 6 & 6 & 7 & 7 \\
+8 & 8 & 9 & 9 & 10 & A & 11 & B \\
+12 & C & 13 & D & 14 & E & 15 & F \\
+16 & G & 17 & H & 18 & I & 19 & J \\
+20 & K & 21 & L & 22 & M & 23 & N \\
+24 & O & 25 & P & 26 & Q & 27 & R \\
+28 & S & 29 & T & 30 & U & 31 & V \\
+32 & W & 33 & X & 34 & Y & 35 & Z \\
+36 & a & 37 & b & 38 & c & 39 & d \\
+40 & e & 41 & f & 42 & g & 43 & h \\
+44 & i & 45 & j & 46 & k & 47 & l \\
+48 & m & 49 & n & 50 & o & 51 & p \\
+52 & q & 53 & r & 54 & s & 55 & t \\
+56 & u & 57 & v & 58 & w & 59 & x \\
+60 & y & 61 & z & 62 & $+$ & 63 & $/$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Lower ASCII Map}
+\label{fig:ASC}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_read\_radix}. \\
+\textbf{Input}.   A string $str$ of length $sn$ and radix $r$. \\
+\textbf{Output}.  The radix-$\beta$ equivalent mp\_int. \\
+\hline \\
+1.  If $r < 2$ or $r > 64$ return(\textit{MP\_VAL}). \\
+2.  $ix \leftarrow 0$ \\
+3.  If $str_0 =$ ``-'' then do \\
+\hspace{3mm}3.1  $ix \leftarrow ix + 1$ \\
+\hspace{3mm}3.2  $sign \leftarrow MP\_NEG$ \\
+4.  else \\
+\hspace{3mm}4.1  $sign \leftarrow MP\_ZPOS$ \\
+5.  $a \leftarrow 0$ \\
+6.  for $iy$ from $ix$ to $sn - 1$ do \\
+\hspace{3mm}6.1  Let $y$ denote the position in the map of $str_{iy}$. \\
+\hspace{3mm}6.2  If $str_{iy}$ is not in the map or $y \ge r$ then goto step 7. \\
+\hspace{3mm}6.3  $a \leftarrow a \cdot r$ \\
+\hspace{3mm}6.4  $a \leftarrow a + y$ \\
+7.  If $a \ne 0$ then $a.sign \leftarrow sign$ \\
+8.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_read\_radix}
+\end{figure}
+\textbf{Algorithm mp\_read\_radix.}
+This algorithm will read an ASCII string and produce the radix-$\beta$ mp\_int representation of the same integer.  A minus symbol ``-'' may precede the 
+string  to indicate the value is negative, otherwise it is assumed to be positive.  The algorithm will read up to $sn$ characters from the input
+and will stop when it reads a character it cannot map the algorithm stops reading characters from the string.  This allows numbers to be embedded
+as part of larger input without any significant problem.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_read\_radix.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* read a string [ASCII] in a given radix */
+018   int mp_read_radix (mp_int * a, const char *str, int radix)
+019   \{
+020     int     y, res, neg;
+021     char    ch;
+022   
+023     /* make sure the radix is ok */
+024     if (radix < 2 || radix > 64) \{
+025       return MP_VAL;
+026     \}
+027   
+028     /* if the leading digit is a 
+029      * minus set the sign to negative. 
+030      */
+031     if (*str == '-') \{
+032       ++str;
+033       neg = MP_NEG;
+034     \} else \{
+035       neg = MP_ZPOS;
+036     \}
+037   
+038     /* set the integer to the default of zero */
+039     mp_zero (a);
+040     
+041     /* process each digit of the string */
+042     while (*str) \{
+043       /* if the radix < 36 the conversion is case insensitive
+044        * this allows numbers like 1AB and 1ab to represent the same  value
+045        * [e.g. in hex]
+046        */
+047       ch = (char) ((radix < 36) ? toupper (*str) : *str);
+048       for (y = 0; y < 64; y++) \{
+049         if (ch == mp_s_rmap[y]) \{
+050            break;
+051         \}
+052       \}
+053   
+054       /* if the char was found in the map 
+055        * and is less than the given radix add it
+056        * to the number, otherwise exit the loop. 
+057        */
+058       if (y < radix) \{
+059         if ((res = mp_mul_d (a, (mp_digit) radix, a)) != MP_OKAY) \{
+060            return res;
+061         \}
+062         if ((res = mp_add_d (a, (mp_digit) y, a)) != MP_OKAY) \{
+063            return res;
+064         \}
+065       \} else \{
+066         break;
+067       \}
+068       ++str;
+069     \}
+070     
+071     /* set the sign only if a != 0 */
+072     if (mp_iszero(a) != 1) \{
+073        a->sign = neg;
+074     \}
+075     return MP_OKAY;
+076   \}
+077   #endif
+\end{alltt}
+\end{small}
+
+\subsection{Generating Radix-$n$ Output}
+Generating radix-$n$ output is fairly trivial with a division and remainder algorithm.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_toradix}. \\
+\textbf{Input}.   A mp\_int $a$ and an integer $r$\\
+\textbf{Output}.  The radix-$r$ representation of $a$ \\
+\hline \\
+1.  If $r < 2$ or $r > 64$ return(\textit{MP\_VAL}). \\
+2.  If $a = 0$ then $str = $ ``$0$'' and return(\textit{MP\_OKAY}).  \\
+3.  $t \leftarrow a$ \\
+4.  $str \leftarrow$ ``'' \\
+5.  if $t.sign = MP\_NEG$ then \\
+\hspace{3mm}5.1  $str \leftarrow str + $ ``-'' \\
+\hspace{3mm}5.2  $t.sign = MP\_ZPOS$ \\
+6.  While ($t \ne 0$) do \\
+\hspace{3mm}6.1  $d \leftarrow t \mbox{ (mod }r\mbox{)}$ \\
+\hspace{3mm}6.2  $t \leftarrow \lfloor t / r \rfloor$ \\
+\hspace{3mm}6.3  Look up $d$ in the map and store the equivalent character in $y$. \\
+\hspace{3mm}6.4  $str \leftarrow str + y$ \\
+7.  If $str_0 = $``$-$'' then \\
+\hspace{3mm}7.1  Reverse the digits $str_1, str_2, \ldots str_n$. \\
+8.  Otherwise \\
+\hspace{3mm}8.1  Reverse the digits $str_0, str_1, \ldots str_n$. \\
+9.  Return(\textit{MP\_OKAY}).\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_toradix}
+\end{figure}
+\textbf{Algorithm mp\_toradix.}
+This algorithm computes the radix-$r$ representation of an mp\_int $a$.  The ``digits'' of the representation are extracted by reducing 
+successive powers of $\lfloor a / r^k \rfloor$ the input modulo $r$ until $r^k > a$.  Note that instead of actually dividing by $r^k$ in
+each iteration the quotient $\lfloor a / r \rfloor$ is saved for the next iteration.  As a result a series of trivial $n \times 1$ divisions
+are required instead of a series of $n \times k$ divisions.  One design flaw of this approach is that the digits are produced in the reverse order 
+(see~\ref{fig:mpradix}).  To remedy this flaw the digits must be swapped or simply ``reversed''.
+
+\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|c|}
+\hline \textbf{Value of $a$} & \textbf{Value of $d$} & \textbf{Value of $str$} \\
+\hline $1234$ & -- & -- \\
+\hline $123$  & $4$ & ``4'' \\
+\hline $12$   & $3$ & ``43'' \\
+\hline $1$    & $2$ & ``432'' \\
+\hline $0$    & $1$ & ``4321'' \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Example of Algorithm mp\_toradix.}
+\label{fig:mpradix}
+\end{figure}
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_toradix.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* stores a bignum as a ASCII string in a given radix (2..64) */
+018   int mp_toradix (mp_int * a, char *str, int radix)
+019   \{
+020     int     res, digs;
+021     mp_int  t;
+022     mp_digit d;
+023     char   *_s = str;
+024   
+025     /* check range of the radix */
+026     if (radix < 2 || radix > 64) \{
+027       return MP_VAL;
+028     \}
+029   
+030     /* quick out if its zero */
+031     if (mp_iszero(a) == 1) \{
+032        *str++ = '0';
+033        *str = '\symbol{92}0';
+034        return MP_OKAY;
+035     \}
+036   
+037     if ((res = mp_init_copy (&t, a)) != MP_OKAY) \{
+038       return res;
+039     \}
+040   
+041     /* if it is negative output a - */
+042     if (t.sign == MP_NEG) \{
+043       ++_s;
+044       *str++ = '-';
+045       t.sign = MP_ZPOS;
+046     \}
+047   
+048     digs = 0;
+049     while (mp_iszero (&t) == 0) \{
+050       if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) \{
+051         mp_clear (&t);
+052         return res;
+053       \}
+054       *str++ = mp_s_rmap[d];
+055       ++digs;
+056     \}
+057   
+058     /* reverse the digits of the string.  In this case _s points
+059      * to the first digit [exluding the sign] of the number]
+060      */
+061     bn_reverse ((unsigned char *)_s, digs);
+062   
+063     /* append a NULL so the string is properly terminated */
+064     *str = '\symbol{92}0';
+065   
+066     mp_clear (&t);
+067     return MP_OKAY;
+068   \}
+069   
+070   #endif
+\end{alltt}
+\end{small}
+
+\chapter{Number Theoretic Algorithms}
+This chapter discusses several fundamental number theoretic algorithms such as the greatest common divisor, least common multiple and Jacobi 
+symbol computation.  These algorithms arise as essential components in several key cryptographic algorithms such as the RSA public key algorithm and
+various Sieve based factoring algorithms.
+
+\section{Greatest Common Divisor}
+The greatest common divisor of two integers $a$ and $b$, often denoted as $(a, b)$ is the largest integer $k$ that is a proper divisor of
+both $a$ and $b$.  That is, $k$ is the largest integer such that $0 \equiv a \mbox{ (mod }k\mbox{)}$ and $0 \equiv b \mbox{ (mod }k\mbox{)}$ occur
+simultaneously.
+
+The most common approach (cite) is to reduce one input modulo another.  That is if $a$ and $b$ are divisible by some integer $k$ and if $qa + r = b$ then
+$r$ is also divisible by $k$.  The reduction pattern follows $\left < a , b \right > \rightarrow \left < b, a \mbox{ mod } b \right >$.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Greatest Common Divisor (I)}. \\
+\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
+\textbf{Output}.  The greatest common divisor $(a, b)$.  \\
+\hline \\
+1.  While ($b > 0$) do \\
+\hspace{3mm}1.1  $r \leftarrow a \mbox{ (mod }b\mbox{)}$ \\
+\hspace{3mm}1.2  $a \leftarrow b$ \\
+\hspace{3mm}1.3  $b \leftarrow r$ \\
+2.  Return($a$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Greatest Common Divisor (I)}
+\label{fig:gcd1}
+\end{figure}
+
+This algorithm will quickly converge on the greatest common divisor since the residue $r$ tends diminish rapidly.  However, divisions are
+relatively expensive operations to perform and should ideally be avoided.  There is another approach based on a similar relationship of 
+greatest common divisors.  The faster approach is based on the observation that if $k$ divides both $a$ and $b$ it will also divide $a - b$.  
+In particular, we would like $a - b$ to decrease in magnitude which implies that $b \ge a$.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Greatest Common Divisor (II)}. \\
+\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
+\textbf{Output}.  The greatest common divisor $(a, b)$.  \\
+\hline \\
+1.  While ($b > 0$) do \\
+\hspace{3mm}1.1  Swap $a$ and $b$ such that $a$ is the smallest of the two. \\
+\hspace{3mm}1.2  $b \leftarrow b - a$ \\
+2.  Return($a$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Greatest Common Divisor (II)}
+\label{fig:gcd2}
+\end{figure}
+
+\textbf{Proof} \textit{Algorithm~\ref{fig:gcd2} will return the greatest common divisor of $a$ and $b$.}
+The algorithm in figure~\ref{fig:gcd2} will eventually terminate since $b \ge a$ the subtraction in step 1.2 will be a value less than $b$.  In other
+words in every iteration that tuple $\left < a, b \right >$ decrease in magnitude until eventually $a = b$.  Since both $a$ and $b$ are always 
+divisible by the greatest common divisor (\textit{until the last iteration}) and in the last iteration of the algorithm $b = 0$, therefore, in the 
+second to last iteration of the algorithm $b = a$ and clearly $(a, a) = a$ which concludes the proof.  \textbf{QED}.
+
+As a matter of practicality algorithm \ref{fig:gcd1} decreases far too slowly to be useful.  Specially if $b$ is much larger than $a$ such that 
+$b - a$ is still very much larger than $a$.  A simple addition to the algorithm is to divide $b - a$ by a power of some integer $p$ which does
+not divide the greatest common divisor but will divide $b - a$.  In this case ${b - a} \over p$ is also an integer and still divisible by
+the greatest common divisor.
+
+However, instead of factoring $b - a$ to find a suitable value of $p$ the powers of $p$ can be removed from $a$ and $b$ that are in common first.  
+Then inside the loop whenever $b - a$ is divisible by some power of $p$ it can be safely removed.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Greatest Common Divisor (III)}. \\
+\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
+\textbf{Output}.  The greatest common divisor $(a, b)$.  \\
+\hline \\
+1.  $k \leftarrow 0$ \\
+2.  While $a$ and $b$ are both divisible by $p$ do \\
+\hspace{3mm}2.1  $a \leftarrow \lfloor a / p \rfloor$ \\
+\hspace{3mm}2.2  $b \leftarrow \lfloor b / p \rfloor$ \\
+\hspace{3mm}2.3  $k \leftarrow k + 1$ \\
+3.  While $a$ is divisible by $p$ do \\
+\hspace{3mm}3.1  $a \leftarrow \lfloor a / p \rfloor$ \\
+4.  While $b$ is divisible by $p$ do \\
+\hspace{3mm}4.1  $b \leftarrow \lfloor b / p \rfloor$ \\
+5.  While ($b > 0$) do \\
+\hspace{3mm}5.1  Swap $a$ and $b$ such that $a$ is the smallest of the two. \\
+\hspace{3mm}5.2  $b \leftarrow b - a$ \\
+\hspace{3mm}5.3  While $b$ is divisible by $p$ do \\
+\hspace{6mm}5.3.1  $b \leftarrow \lfloor b / p \rfloor$ \\
+6.  Return($a \cdot p^k$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Greatest Common Divisor (III)}
+\label{fig:gcd3}
+\end{figure}
+
+This algorithm is based on the first except it removes powers of $p$ first and inside the main loop to ensure the tuple $\left < a, b \right >$ 
+decreases more rapidly.  The first loop on step two removes powers of $p$ that are in common.  A count, $k$, is kept which will present a common
+divisor of $p^k$.  After step two the remaining common divisor of $a$ and $b$ cannot be divisible by $p$.  This means that $p$ can be safely 
+divided out of the difference $b - a$ so long as the division leaves no remainder.  
+
+In particular the value of $p$ should be chosen such that the division on step 5.3.1 occur often.  It also helps that division by $p$ be easy
+to compute.  The ideal choice of $p$ is two since division by two amounts to a right logical shift.  Another important observation is that by
+step five both $a$ and $b$ are odd.  Therefore, the diffrence $b - a$ must be even which means that each iteration removes one bit from the 
+largest of the pair.
+
+\subsection{Complete Greatest Common Divisor}
+The algorithms presented so far cannot handle inputs which are zero or negative.  The following algorithm can handle all input cases properly
+and will produce the greatest common divisor.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_gcd}. \\
+\textbf{Input}.   mp\_int $a$ and $b$ \\
+\textbf{Output}.  The greatest common divisor $c = (a, b)$.  \\
+\hline \\
+1.  If $a = 0$ and $b \ne 0$ then \\
+\hspace{3mm}1.1  $c \leftarrow b$ \\
+\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
+2.  If $a \ne 0$ and $b = 0$ then \\
+\hspace{3mm}2.1  $c \leftarrow a$ \\
+\hspace{3mm}2.2  Return(\textit{MP\_OKAY}). \\
+3.  If $a = b = 0$ then \\
+\hspace{3mm}3.1  $c \leftarrow 1$ \\
+\hspace{3mm}3.2  Return(\textit{MP\_OKAY}). \\
+4.  $u \leftarrow \vert a \vert, v \leftarrow \vert b \vert$ \\
+5.  $k \leftarrow 0$ \\
+6.  While $u.used > 0$ and $v.used > 0$ and $u_0 \equiv v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}6.1  $k \leftarrow k + 1$ \\
+\hspace{3mm}6.2  $u \leftarrow \lfloor u / 2 \rfloor$ \\
+\hspace{3mm}6.3  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+7.  While $u.used > 0$ and $u_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}7.1  $u \leftarrow \lfloor u / 2 \rfloor$ \\
+8.  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}8.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+9.  While $v.used > 0$ \\
+\hspace{3mm}9.1  If $\vert u \vert > \vert v \vert$ then \\
+\hspace{6mm}9.1.1  Swap $u$ and $v$. \\
+\hspace{3mm}9.2  $v \leftarrow \vert v \vert - \vert u \vert$ \\
+\hspace{3mm}9.3  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{6mm}9.3.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+10.  $c \leftarrow u \cdot 2^k$ \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_gcd}
+\end{figure}
+\textbf{Algorithm mp\_gcd.}
+This algorithm will produce the greatest common divisor of two mp\_ints $a$ and $b$.  The algorithm was originally based on Algorithm B of
+Knuth \cite[pp. 338]{TAOCPV2} but has been modified to be simpler to explain.  In theory it achieves the same asymptotic working time as
+Algorithm B and in practice this appears to be true.  
+
+The first three steps handle the cases where either one of or both inputs are zero.  If either input is zero the greatest common divisor is the 
+largest input or zero if they are both zero.  If the inputs are not trivial than $u$ and $v$ are assigned the absolute values of 
+$a$ and $b$ respectively and the algorithm will proceed to reduce the pair.
+
+Step six will divide out any common factors of two and keep track of the count in the variable $k$.  After this step two is no longer a
+factor of the remaining greatest common divisor between $u$ and $v$ and can be safely evenly divided out of either whenever they are even.  Step 
+seven and eight ensure that the $u$ and $v$ respectively have no more factors of two.  At most only one of the while loops will iterate since 
+they cannot both be even.
+
+By step nine both of $u$ and $v$ are odd which is required for the inner logic.  First the pair are swapped such that $v$ is equal to
+or greater than $u$.  This ensures that the subtraction on step 9.2 will always produce a positive and even result.  Step 9.3 removes any
+factors of two from the difference $u$ to ensure that in the next iteration of the loop both are once again odd.
+
+After $v = 0$ occurs the variable $u$ has the greatest common divisor of the pair $\left < u, v \right >$ just after step six.  The result
+must be adjusted by multiplying by the common factors of two ($2^k$) removed earlier.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_gcd.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* Greatest Common Divisor using the binary method */
+018   int mp_gcd (mp_int * a, mp_int * b, mp_int * c)
+019   \{
+020     mp_int  u, v;
+021     int     k, u_lsb, v_lsb, res;
+022   
+023     /* either zero than gcd is the largest */
+024     if (mp_iszero (a) == 1 && mp_iszero (b) == 0) \{
+025       return mp_abs (b, c);
+026     \}
+027     if (mp_iszero (a) == 0 && mp_iszero (b) == 1) \{
+028       return mp_abs (a, c);
+029     \}
+030   
+031     /* optimized.  At this point if a == 0 then
+032      * b must equal zero too
+033      */
+034     if (mp_iszero (a) == 1) \{
+035       mp_zero(c);
+036       return MP_OKAY;
+037     \}
+038   
+039     /* get copies of a and b we can modify */
+040     if ((res = mp_init_copy (&u, a)) != MP_OKAY) \{
+041       return res;
+042     \}
+043   
+044     if ((res = mp_init_copy (&v, b)) != MP_OKAY) \{
+045       goto LBL_U;
+046     \}
+047   
+048     /* must be positive for the remainder of the algorithm */
+049     u.sign = v.sign = MP_ZPOS;
+050   
+051     /* B1.  Find the common power of two for u and v */
+052     u_lsb = mp_cnt_lsb(&u);
+053     v_lsb = mp_cnt_lsb(&v);
+054     k     = MIN(u_lsb, v_lsb);
+055   
+056     if (k > 0) \{
+057        /* divide the power of two out */
+058        if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) \{
+059           goto LBL_V;
+060        \}
+061   
+062        if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) \{
+063           goto LBL_V;
+064        \}
+065     \}
+066   
+067     /* divide any remaining factors of two out */
+068     if (u_lsb != k) \{
+069        if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) \{
+070           goto LBL_V;
+071        \}
+072     \}
+073   
+074     if (v_lsb != k) \{
+075        if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) \{
+076           goto LBL_V;
+077        \}
+078     \}
+079   
+080     while (mp_iszero(&v) == 0) \{
+081        /* make sure v is the largest */
+082        if (mp_cmp_mag(&u, &v) == MP_GT) \{
+083           /* swap u and v to make sure v is >= u */
+084           mp_exch(&u, &v);
+085        \}
+086        
+087        /* subtract smallest from largest */
+088        if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) \{
+089           goto LBL_V;
+090        \}
+091        
+092        /* Divide out all factors of two */
+093        if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) \{
+094           goto LBL_V;
+095        \} 
+096     \} 
+097   
+098     /* multiply by 2**k which we divided out at the beginning */
+099     if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) \{
+100        goto LBL_V;
+101     \}
+102     c->sign = MP_ZPOS;
+103     res = MP_OKAY;
+104   LBL_V:mp_clear (&u);
+105   LBL_U:mp_clear (&v);
+106     return res;
+107   \}
+108   #endif
+\end{alltt}
+\end{small}
+
+This function makes use of the macros mp\_iszero and mp\_iseven.  The former evaluates to $1$ if the input mp\_int is equivalent to the 
+integer zero otherwise it evaluates to $0$.  The latter evaluates to $1$ if the input mp\_int represents a non-zero even integer otherwise
+it evaluates to $0$.  Note that just because mp\_iseven may evaluate to $0$ does not mean the input is odd, it could also be zero.  The three 
+trivial cases of inputs are handled on lines 24 through 37.  After those lines the inputs are assumed to be non-zero.
+
+Lines 34 and 40 make local copies $u$ and $v$ of the inputs $a$ and $b$ respectively.  At this point the common factors of two 
+must be divided out of the two inputs.  The while loop on line 80 iterates so long as both are even.  The local integer $k$ is used to
+keep track of how many factors of $2$ are pulled out of both values.  It is assumed that the number of factors will not exceed the maximum 
+value of a C ``int'' data type\footnote{Strictly speaking no array in C may have more than entries than are accessible by an ``int'' so this is not 
+a limitation.}.  
+
+At this point there are no more common factors of two in the two values.  The while loops on lines 80 and 80 remove any independent
+factors of two such that both $u$ and $v$ are guaranteed to be an odd integer before hitting the main body of the algorithm.  The while loop
+on line 80 performs the reduction of the pair until $v$ is equal to zero.  The unsigned comparison and subtraction algorithms are used in
+place of the full signed routines since both values are guaranteed to be positive and the result of the subtraction is guaranteed to be non-negative.
+
+\section{Least Common Multiple}
+The least common multiple of a pair of integers is their product divided by their greatest common divisor.  For two integers $a$ and $b$ the
+least common multiple is normally denoted as $[ a, b ]$ and numerically equivalent to ${ab} \over {(a, b)}$.  For example, if $a = 2 \cdot 2 \cdot 3 = 12$
+and $b = 2 \cdot 3 \cdot 3 \cdot 7 = 126$ the least common multiple is ${126 \over {(12, 126)}} = {126 \over 6} = 21$.
+
+The least common multiple arises often in coding theory as well as number theory.  If two functions have periods of $a$ and $b$ respectively they will
+collide, that is be in synchronous states, after only $[ a, b ]$ iterations.  This is why, for example, random number generators based on 
+Linear Feedback Shift Registers (LFSR) tend to use registers with periods which are co-prime (\textit{e.g. the greatest common divisor is one.}).  
+Similarly in number theory if a composite $n$ has two prime factors $p$ and $q$ then maximal order of any unit of $\Z/n\Z$ will be $[ p - 1, q - 1] $.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_lcm}. \\
+\textbf{Input}.   mp\_int $a$ and $b$ \\
+\textbf{Output}.  The least common multiple $c = [a, b]$.  \\
+\hline \\
+1.  $c \leftarrow (a, b)$ \\
+2.  $t \leftarrow a \cdot b$ \\
+3.  $c \leftarrow \lfloor t / c \rfloor$ \\
+4.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_lcm}
+\end{figure}
+\textbf{Algorithm mp\_lcm.}
+This algorithm computes the least common multiple of two mp\_int inputs $a$ and $b$.  It computes the least common multiple directly by
+dividing the product of the two inputs by their greatest common divisor.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_lcm.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* computes least common multiple as |a*b|/(a, b) */
+018   int mp_lcm (mp_int * a, mp_int * b, mp_int * c)
+019   \{
+020     int     res;
+021     mp_int  t1, t2;
+022   
+023   
+024     if ((res = mp_init_multi (&t1, &t2, NULL)) != MP_OKAY) \{
+025       return res;
+026     \}
+027   
+028     /* t1 = get the GCD of the two inputs */
+029     if ((res = mp_gcd (a, b, &t1)) != MP_OKAY) \{
+030       goto LBL_T;
+031     \}
+032   
+033     /* divide the smallest by the GCD */
+034     if (mp_cmp_mag(a, b) == MP_LT) \{
+035        /* store quotient in t2 such that t2 * b is the LCM */
+036        if ((res = mp_div(a, &t1, &t2, NULL)) != MP_OKAY) \{
+037           goto LBL_T;
+038        \}
+039        res = mp_mul(b, &t2, c);
+040     \} else \{
+041        /* store quotient in t2 such that t2 * a is the LCM */
+042        if ((res = mp_div(b, &t1, &t2, NULL)) != MP_OKAY) \{
+043           goto LBL_T;
+044        \}
+045        res = mp_mul(a, &t2, c);
+046     \}
+047   
+048     /* fix the sign to positive */
+049     c->sign = MP_ZPOS;
+050   
+051   LBL_T:
+052     mp_clear_multi (&t1, &t2, NULL);
+053     return res;
+054   \}
+055   #endif
+\end{alltt}
+\end{small}
+
+\section{Jacobi Symbol Computation}
+To explain the Jacobi Symbol we shall first discuss the Legendre function\footnote{Arrg.  What is the name of this?} off which the Jacobi symbol is 
+defined.  The Legendre function computes whether or not an integer $a$ is a quadratic residue modulo an odd prime $p$.  Numerically it is
+equivalent to equation \ref{eqn:legendre}.
+
+\textit{-- Tom, don't be an ass, cite your source here...!}
+
+\begin{equation}
+a^{(p-1)/2} \equiv \begin{array}{rl}
+                              -1 &  \mbox{if }a\mbox{ is a quadratic non-residue.} \\
+                              0  &  \mbox{if }a\mbox{ divides }p\mbox{.} \\
+                              1  &  \mbox{if }a\mbox{ is a quadratic residue}. 
+                              \end{array} \mbox{ (mod }p\mbox{)}
+\label{eqn:legendre}                              
+\end{equation}
+
+\textbf{Proof.} \textit{Equation \ref{eqn:legendre} correctly identifies the residue status of an integer $a$ modulo a prime $p$.}
+An integer $a$ is a quadratic residue if the following equation has a solution.
+
+\begin{equation}
+x^2 \equiv a \mbox{ (mod }p\mbox{)}
+\label{eqn:root}
+\end{equation}
+
+Consider the following equation.
+
+\begin{equation}
+0 \equiv x^{p-1} - 1 \equiv \left \lbrace \left (x^2 \right )^{(p-1)/2} - a^{(p-1)/2} \right \rbrace + \left ( a^{(p-1)/2} - 1 \right ) \mbox{ (mod }p\mbox{)}
+\label{eqn:rooti}
+\end{equation}
+
+Whether equation \ref{eqn:root} has a solution or not equation \ref{eqn:rooti} is always true.  If $a^{(p-1)/2} - 1 \equiv 0 \mbox{ (mod }p\mbox{)}$
+then the quantity in the braces must be zero.  By reduction,
+
+\begin{eqnarray}
+\left (x^2 \right )^{(p-1)/2} - a^{(p-1)/2} \equiv 0  \nonumber \\
+\left (x^2 \right )^{(p-1)/2} \equiv a^{(p-1)/2} \nonumber \\
+x^2 \equiv a \mbox{ (mod }p\mbox{)} 
+\end{eqnarray}
+
+As a result there must be a solution to the quadratic equation and in turn $a$ must be a quadratic residue.  If $a$ does not divide $p$ and $a$
+is not a quadratic residue then the only other value $a^{(p-1)/2}$ may be congruent to is $-1$ since
+\begin{equation}
+0 \equiv a^{p - 1} - 1 \equiv (a^{(p-1)/2} + 1)(a^{(p-1)/2} - 1) \mbox{ (mod }p\mbox{)}
+\end{equation}
+One of the terms on the right hand side must be zero.  \textbf{QED}
+
+\subsection{Jacobi Symbol}
+The Jacobi symbol is a generalization of the Legendre function for any odd non prime moduli $p$ greater than 2.  If $p = \prod_{i=0}^n p_i$ then
+the Jacobi symbol $\left ( { a \over p } \right )$ is equal to the following equation.
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( { a \over p_0} \right ) \left ( { a \over p_1} \right ) \ldots \left ( { a \over p_n} \right )
+\end{equation}
+
+By inspection if $p$ is prime the Jacobi symbol is equivalent to the Legendre function.  The following facts\footnote{See HAC \cite[pp. 72-74]{HAC} for
+further details.} will be used to derive an efficient Jacobi symbol algorithm.  Where $p$ is an odd integer greater than two and $a, b \in \Z$ the
+following are true.  
+
+\begin{enumerate}
+\item $\left ( { a \over p} \right )$ equals $-1$, $0$ or $1$. 
+\item $\left ( { ab \over p} \right ) = \left ( { a \over p} \right )\left ( { b \over p} \right )$.
+\item If $a \equiv b$ then $\left ( { a \over p} \right ) = \left ( { b \over p} \right )$.
+\item $\left ( { 2 \over p} \right )$ equals $1$ if $p \equiv 1$ or $7 \mbox{ (mod }8\mbox{)}$.  Otherwise, it equals $-1$.
+\item $\left ( { a \over p} \right ) \equiv \left ( { p \over a} \right ) \cdot (-1)^{(p-1)(a-1)/4}$.  More specifically 
+$\left ( { a \over p} \right ) = \left ( { p \over a} \right )$ if $p \equiv a \equiv 1 \mbox{ (mod }4\mbox{)}$.  
+\end{enumerate}
+
+Using these facts if $a = 2^k \cdot a'$ then
+
+\begin{eqnarray}
+\left ( { a \over p } \right ) = \left ( {{2^k} \over p } \right ) \left ( {a' \over p} \right ) \nonumber \\
+                               = \left ( {2 \over p } \right )^k \left ( {a' \over p} \right ) 
+\label{eqn:jacobi}
+\end{eqnarray}
+
+By fact five, 
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( { p \over a } \right ) \cdot (-1)^{(p-1)(a-1)/4} 
+\end{equation}
+
+Subsequently by fact three since $p \equiv (p \mbox{ mod }a) \mbox{ (mod }a\mbox{)}$ then 
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( { {p \mbox{ mod } a} \over a } \right ) \cdot (-1)^{(p-1)(a-1)/4} 
+\end{equation}
+
+By putting both observations into equation \ref{eqn:jacobi} the following simplified equation is formed.
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( {2 \over p } \right )^k \left ( {{p\mbox{ mod }a'} \over a'} \right )  \cdot (-1)^{(p-1)(a'-1)/4} 
+\end{equation}
+
+The value of $\left ( {{p \mbox{ mod }a'} \over a'} \right )$ can be found by using the same equation recursively.  The value of 
+$\left ( {2 \over p } \right )^k$ equals $1$ if $k$ is even otherwise it equals $\left ( {2 \over p } \right )$.  Using this approach the 
+factors of $p$ do not have to be known.  Furthermore, if $(a, p) = 1$ then the algorithm will terminate when the recursion requests the 
+Jacobi symbol computation of $\left ( {1 \over a'} \right )$ which is simply $1$.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_jacobi}. \\
+\textbf{Input}.   mp\_int $a$ and $p$, $a \ge 0$, $p \ge 3$, $p \equiv 1 \mbox{ (mod }2\mbox{)}$ \\
+\textbf{Output}.  The Jacobi symbol $c = \left ( {a \over p } \right )$. \\
+\hline \\
+1.  If $a = 0$ then \\
+\hspace{3mm}1.1  $c \leftarrow 0$ \\
+\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
+2.  If $a = 1$ then \\
+\hspace{3mm}2.1  $c \leftarrow 1$ \\
+\hspace{3mm}2.2  Return(\textit{MP\_OKAY}). \\
+3.  $a' \leftarrow a$ \\
+4.  $k \leftarrow 0$ \\
+5.  While $a'.used > 0$ and $a'_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}5.1  $k \leftarrow k + 1$ \\
+\hspace{3mm}5.2  $a' \leftarrow \lfloor a' / 2 \rfloor$ \\
+6.  If $k \equiv 0 \mbox{ (mod }2\mbox{)}$ then \\
+\hspace{3mm}6.1  $s \leftarrow 1$ \\
+7.  else \\
+\hspace{3mm}7.1  $r \leftarrow p_0 \mbox{ (mod }8\mbox{)}$ \\
+\hspace{3mm}7.2  If $r = 1$ or $r = 7$ then \\
+\hspace{6mm}7.2.1  $s \leftarrow 1$ \\
+\hspace{3mm}7.3  else \\
+\hspace{6mm}7.3.1  $s \leftarrow -1$ \\
+8.  If $p_0 \equiv a'_0 \equiv 3 \mbox{ (mod }4\mbox{)}$ then \\
+\hspace{3mm}8.1  $s \leftarrow -s$ \\
+9.  If $a' \ne 1$ then \\
+\hspace{3mm}9.1  $p' \leftarrow p \mbox{ (mod }a'\mbox{)}$ \\
+\hspace{3mm}9.2  $s \leftarrow s \cdot \mbox{mp\_jacobi}(p', a')$ \\
+10.  $c \leftarrow s$ \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_jacobi}
+\end{figure}
+\textbf{Algorithm mp\_jacobi.}
+This algorithm computes the Jacobi symbol for an arbitrary positive integer $a$ with respect to an odd integer $p$ greater than three.  The algorithm
+is based on algorithm 2.149 of HAC \cite[pp. 73]{HAC}.  
+
+Step numbers one and two handle the trivial cases of $a = 0$ and $a = 1$ respectively.  Step five determines the number of two factors in the
+input $a$.  If $k$ is even than the term $\left ( { 2 \over p } \right )^k$ must always evaluate to one.  If $k$ is odd than the term evaluates to one 
+if $p_0$ is congruent to one or seven modulo eight, otherwise it evaluates to $-1$. After the the $\left ( { 2 \over p } \right )^k$ term is handled 
+the $(-1)^{(p-1)(a'-1)/4}$ is computed and multiplied against the current product $s$.  The latter term evaluates to one if both $p$ and $a'$ 
+are congruent to one modulo four, otherwise it evaluates to negative one.
+
+By step nine if $a'$ does not equal one a recursion is required.  Step 9.1 computes $p' \equiv p \mbox{ (mod }a'\mbox{)}$ and will recurse to compute
+$\left ( {p' \over a'} \right )$ which is multiplied against the current Jacobi product.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_jacobi.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* computes the jacobi c = (a | n) (or Legendre if n is prime)
+018    * HAC pp. 73 Algorithm 2.149
+019    */
+020   int mp_jacobi (mp_int * a, mp_int * p, int *c)
+021   \{
+022     mp_int  a1, p1;
+023     int     k, s, r, res;
+024     mp_digit residue;
+025   
+026     /* if p <= 0 return MP_VAL */
+027     if (mp_cmp_d(p, 0) != MP_GT) \{
+028        return MP_VAL;
+029     \}
+030   
+031     /* step 1.  if a == 0, return 0 */
+032     if (mp_iszero (a) == 1) \{
+033       *c = 0;
+034       return MP_OKAY;
+035     \}
+036   
+037     /* step 2.  if a == 1, return 1 */
+038     if (mp_cmp_d (a, 1) == MP_EQ) \{
+039       *c = 1;
+040       return MP_OKAY;
+041     \}
+042   
+043     /* default */
+044     s = 0;
+045   
+046     /* step 3.  write a = a1 * 2**k  */
+047     if ((res = mp_init_copy (&a1, a)) != MP_OKAY) \{
+048       return res;
+049     \}
+050   
+051     if ((res = mp_init (&p1)) != MP_OKAY) \{
+052       goto LBL_A1;
+053     \}
+054   
+055     /* divide out larger power of two */
+056     k = mp_cnt_lsb(&a1);
+057     if ((res = mp_div_2d(&a1, k, &a1, NULL)) != MP_OKAY) \{
+058        goto LBL_P1;
+059     \}
+060   
+061     /* step 4.  if e is even set s=1 */
+062     if ((k & 1) == 0) \{
+063       s = 1;
+064     \} else \{
+065       /* else set s=1 if p = 1/7 (mod 8) or s=-1 if p = 3/5 (mod 8) */
+066       residue = p->dp[0] & 7;
+067   
+068       if (residue == 1 || residue == 7) \{
+069         s = 1;
+070       \} else if (residue == 3 || residue == 5) \{
+071         s = -1;
+072       \}
+073     \}
+074   
+075     /* step 5.  if p == 3 (mod 4) *and* a1 == 3 (mod 4) then s = -s */
+076     if ( ((p->dp[0] & 3) == 3) && ((a1.dp[0] & 3) == 3)) \{
+077       s = -s;
+078     \}
+079   
+080     /* if a1 == 1 we're done */
+081     if (mp_cmp_d (&a1, 1) == MP_EQ) \{
+082       *c = s;
+083     \} else \{
+084       /* n1 = n mod a1 */
+085       if ((res = mp_mod (p, &a1, &p1)) != MP_OKAY) \{
+086         goto LBL_P1;
+087       \}
+088       if ((res = mp_jacobi (&p1, &a1, &r)) != MP_OKAY) \{
+089         goto LBL_P1;
+090       \}
+091       *c = s * r;
+092     \}
+093   
+094     /* done */
+095     res = MP_OKAY;
+096   LBL_P1:mp_clear (&p1);
+097   LBL_A1:mp_clear (&a1);
+098     return res;
+099   \}
+100   #endif
+\end{alltt}
+\end{small}
+
+As a matter of practicality the variable $a'$ as per the pseudo-code is reprensented by the variable $a1$ since the $'$ symbol is not valid for a C 
+variable name character. 
+
+The two simple cases of $a = 0$ and $a = 1$ are handled at the very beginning to simplify the algorithm.  If the input is non-trivial the algorithm
+has to proceed compute the Jacobi.  The variable $s$ is used to hold the current Jacobi product.  Note that $s$ is merely a C ``int'' data type since
+the values it may obtain are merely $-1$, $0$ and $1$.  
+
+After a local copy of $a$ is made all of the factors of two are divided out and the total stored in $k$.  Technically only the least significant
+bit of $k$ is required, however, it makes the algorithm simpler to follow to perform an addition. In practice an exclusive-or and addition have the same 
+processor requirements and neither is faster than the other.
+
+Line 61 through 70 determines the value of $\left ( { 2 \over p } \right )^k$.  If the least significant bit of $k$ is zero than
+$k$ is even and the value is one.  Otherwise, the value of $s$ depends on which residue class $p$ belongs to modulo eight.  The value of
+$(-1)^{(p-1)(a'-1)/4}$ is compute and multiplied against $s$ on lines 75 through 73.  
+
+Finally, if $a1$ does not equal one the algorithm must recurse and compute $\left ( {p' \over a'} \right )$.  
+
+\textit{-- Comment about default $s$ and such...}
+
+\section{Modular Inverse}
+\label{sec:modinv}
+The modular inverse of a number actually refers to the modular multiplicative inverse.  Essentially for any integer $a$ such that $(a, p) = 1$ there
+exist another integer $b$ such that $ab \equiv 1 \mbox{ (mod }p\mbox{)}$.  The integer $b$ is called the multiplicative inverse of $a$ which is
+denoted as $b = a^{-1}$.  Technically speaking modular inversion is a well defined operation for any finite ring or field not just for rings and 
+fields of integers.  However, the former will be the matter of discussion.
+
+The simplest approach is to compute the algebraic inverse of the input.  That is to compute $b \equiv a^{\Phi(p) - 1}$.  If $\Phi(p)$ is the 
+order of the multiplicative subgroup modulo $p$ then $b$ must be the multiplicative inverse of $a$.  The proof of which is trivial.
+
+\begin{equation}
+ab \equiv a \left (a^{\Phi(p) - 1} \right ) \equiv a^{\Phi(p)} \equiv a^0 \equiv 1 \mbox{ (mod }p\mbox{)}
+\end{equation}
+
+However, as simple as this approach may be it has two serious flaws.  It requires that the value of $\Phi(p)$ be known which if $p$ is composite 
+requires all of the prime factors.  This approach also is very slow as the size of $p$ grows.  
+
+A simpler approach is based on the observation that solving for the multiplicative inverse is equivalent to solving the linear 
+Diophantine\footnote{See LeVeque \cite[pp. 40-43]{LeVeque} for more information.} equation.
+
+\begin{equation}
+ab + pq = 1
+\end{equation}
+
+Where $a$, $b$, $p$ and $q$ are all integers.  If such a pair of integers $ \left < b, q \right >$ exist than $b$ is the multiplicative inverse of 
+$a$ modulo $p$.  The extended Euclidean algorithm (Knuth \cite[pp. 342]{TAOCPV2}) can be used to solve such equations provided $(a, p) = 1$.  
+However, instead of using that algorithm directly a variant known as the binary Extended Euclidean algorithm will be used in its place.  The
+binary approach is very similar to the binary greatest common divisor algorithm except it will produce a full solution to the Diophantine 
+equation.  
+
+\subsection{General Case}
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_invmod}. \\
+\textbf{Input}.   mp\_int $a$ and $b$, $(a, b) = 1$, $p \ge 2$, $0 < a < p$.  \\
+\textbf{Output}.  The modular inverse $c \equiv a^{-1} \mbox{ (mod }b\mbox{)}$. \\
+\hline \\
+1.  If $b \le 0$ then return(\textit{MP\_VAL}). \\
+2.  If $b_0 \equiv 1 \mbox{ (mod }2\mbox{)}$ then use algorithm fast\_mp\_invmod. \\
+3.  $x \leftarrow \vert a \vert, y \leftarrow b$ \\
+4.  If $x_0 \equiv y_0  \equiv 0 \mbox{ (mod }2\mbox{)}$ then return(\textit{MP\_VAL}). \\
+5.  $B \leftarrow 0, C \leftarrow 0, A \leftarrow 1, D \leftarrow 1$ \\
+6.  While $u.used > 0$ and $u_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}6.1  $u \leftarrow \lfloor u / 2 \rfloor$ \\
+\hspace{3mm}6.2  If ($A.used > 0$ and $A_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) or ($B.used > 0$ and $B_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) then \\
+\hspace{6mm}6.2.1  $A \leftarrow A + y$ \\
+\hspace{6mm}6.2.2  $B \leftarrow B - x$ \\
+\hspace{3mm}6.3  $A \leftarrow \lfloor A / 2 \rfloor$ \\
+\hspace{3mm}6.4  $B \leftarrow \lfloor B / 2 \rfloor$ \\
+7.  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}7.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+\hspace{3mm}7.2  If ($C.used > 0$ and $C_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) or ($D.used > 0$ and $D_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) then \\
+\hspace{6mm}7.2.1  $C \leftarrow C + y$ \\
+\hspace{6mm}7.2.2  $D \leftarrow D - x$ \\
+\hspace{3mm}7.3  $C \leftarrow \lfloor C / 2 \rfloor$ \\
+\hspace{3mm}7.4  $D \leftarrow \lfloor D / 2 \rfloor$ \\
+8.  If $u \ge v$ then \\
+\hspace{3mm}8.1  $u \leftarrow u - v$ \\
+\hspace{3mm}8.2  $A \leftarrow A - C$ \\
+\hspace{3mm}8.3  $B \leftarrow B - D$ \\
+9.  else \\
+\hspace{3mm}9.1  $v \leftarrow v - u$ \\
+\hspace{3mm}9.2  $C \leftarrow C - A$ \\
+\hspace{3mm}9.3  $D \leftarrow D - B$ \\
+10.  If $u \ne 0$ goto step 6. \\
+11.  If $v \ne 1$ return(\textit{MP\_VAL}). \\
+12.  While $C \le 0$ do \\
+\hspace{3mm}12.1  $C \leftarrow C + b$ \\
+13.  While $C \ge b$ do \\
+\hspace{3mm}13.1  $C \leftarrow C - b$ \\
+14.  $c \leftarrow C$ \\
+15.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\end{figure}
+\textbf{Algorithm mp\_invmod.}
+This algorithm computes the modular multiplicative inverse of an integer $a$ modulo an integer $b$.  This algorithm is a variation of the 
+extended binary Euclidean algorithm from HAC \cite[pp. 608]{HAC}.  It has been modified to only compute the modular inverse and not a complete
+Diophantine solution.  
+
+If $b \le 0$ than the modulus is invalid and MP\_VAL is returned.  Similarly if both $a$ and $b$ are even then there cannot be a multiplicative
+inverse for $a$ and the error is reported.  
+
+The astute reader will observe that steps seven through nine are very similar to the binary greatest common divisor algorithm mp\_gcd.  In this case
+the other variables to the Diophantine equation are solved.  The algorithm terminates when $u = 0$ in which case the solution is
+
+\begin{equation}
+Ca + Db = v
+\end{equation}
+
+If $v$, the greatest common divisor of $a$ and $b$ is not equal to one then the algorithm will report an error as no inverse exists.  Otherwise, $C$
+is the modular inverse of $a$.  The actual value of $C$ is congruent to, but not necessarily equal to, the ideal modular inverse which should lie 
+within $1 \le a^{-1} < b$.  Step numbers twelve and thirteen adjust the inverse until it is in range.  If the original input $a$ is within $0 < a < p$ 
+then only a couple of additions or subtractions will be required to adjust the inverse.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_invmod.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* hac 14.61, pp608 */
+018   int mp_invmod (mp_int * a, mp_int * b, mp_int * c)
+019   \{
+020     /* b cannot be negative */
+021     if (b->sign == MP_NEG || mp_iszero(b) == 1) \{
+022       return MP_VAL;
+023     \}
+024   
+025   #ifdef BN_FAST_MP_INVMOD_C
+026     /* if the modulus is odd we can use a faster routine instead */
+027     if (mp_isodd (b) == 1) \{
+028       return fast_mp_invmod (a, b, c);
+029     \}
+030   #endif
+031   
+032   #ifdef BN_MP_INVMOD_SLOW_C
+033     return mp_invmod_slow(a, b, c);
+034   #endif
+035   
+036     return MP_VAL;
+037   \}
+038   #endif
+\end{alltt}
+\end{small}
+
+\subsubsection{Odd Moduli}
+
+When the modulus $b$ is odd the variables $A$ and $C$ are fixed and are not required to compute the inverse.  In particular by attempting to solve
+the Diophantine $Cb + Da = 1$ only $B$ and $D$ are required to find the inverse of $a$.  
+
+The algorithm fast\_mp\_invmod is a direct adaptation of algorithm mp\_invmod with all all steps involving either $A$ or $C$ removed.  This 
+optimization will halve the time required to compute the modular inverse.
+
+\section{Primality Tests}
+
+A non-zero integer $a$ is said to be prime if it is not divisible by any other integer excluding one and itself.  For example, $a = 7$ is prime 
+since the integers $2 \ldots 6$ do not evenly divide $a$.  By contrast, $a = 6$ is not prime since $a = 6 = 2 \cdot 3$. 
+
+Prime numbers arise in cryptography considerably as they allow finite fields to be formed.  The ability to determine whether an integer is prime or
+not quickly has been a viable subject in cryptography and number theory for considerable time.  The algorithms that will be presented are all
+probablistic algorithms in that when they report an integer is composite it must be composite.  However, when the algorithms report an integer is
+prime the algorithm may be incorrect.  
+
+As will be discussed it is possible to limit the probability of error so well that for practical purposes the probablity of error might as 
+well be zero.  For the purposes of these discussions let $n$ represent the candidate integer of which the primality is in question.
+
+\subsection{Trial Division}
+
+Trial division means to attempt to evenly divide a candidate integer by small prime integers.  If the candidate can be evenly divided it obviously
+cannot be prime.  By dividing by all primes $1 < p \le \sqrt{n}$ this test can actually prove whether an integer is prime.  However, such a test
+would require a prohibitive amount of time as $n$ grows.
+
+Instead of dividing by every prime, a smaller, more mangeable set of primes may be used instead.  By performing trial division with only a subset
+of the primes less than $\sqrt{n} + 1$ the algorithm cannot prove if a candidate is prime.  However, often it can prove a candidate is not prime.
+
+The benefit of this test is that trial division by small values is fairly efficient.  Specially compared to the other algorithms that will be
+discussed shortly.  The probability that this approach correctly identifies a composite candidate when tested with all primes upto $q$ is given by
+$1 - {1.12 \over ln(q)}$.  The graph (\ref{pic:primality}, will be added later) demonstrates the probability of success for the range 
+$3 \le q \le 100$.  
+
+At approximately $q = 30$ the gain of performing further tests diminishes fairly quickly.  At $q = 90$ further testing is generally not going to 
+be of any practical use.  In the case of LibTomMath the default limit $q = 256$ was chosen since it is not too high and will eliminate 
+approximately $80\%$ of all candidate integers.  The constant \textbf{PRIME\_SIZE} is equal to the number of primes in the test base.  The 
+array \_\_prime\_tab is an array of the first \textbf{PRIME\_SIZE} prime numbers.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_prime\_is\_divisible}. \\
+\textbf{Input}.   mp\_int $a$ \\
+\textbf{Output}.  $c = 1$ if $n$ is divisible by a small prime, otherwise $c = 0$.  \\
+\hline \\
+1.  for $ix$ from $0$ to $PRIME\_SIZE$ do \\
+\hspace{3mm}1.1  $d \leftarrow n \mbox{ (mod }\_\_prime\_tab_{ix}\mbox{)}$ \\
+\hspace{3mm}1.2  If $d = 0$ then \\
+\hspace{6mm}1.2.1  $c \leftarrow 1$ \\
+\hspace{6mm}1.2.2  Return(\textit{MP\_OKAY}). \\
+2.  $c \leftarrow 0$ \\
+3.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_prime\_is\_divisible}
+\end{figure}
+\textbf{Algorithm mp\_prime\_is\_divisible.}
+This algorithm attempts to determine if a candidate integer $n$ is composite by performing trial divisions.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_prime\_is\_divisible.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* determines if an integers is divisible by one 
+018    * of the first PRIME_SIZE primes or not
+019    *
+020    * sets result to 0 if not, 1 if yes
+021    */
+022   int mp_prime_is_divisible (mp_int * a, int *result)
+023   \{
+024     int     err, ix;
+025     mp_digit res;
+026   
+027     /* default to not */
+028     *result = MP_NO;
+029   
+030     for (ix = 0; ix < PRIME_SIZE; ix++) \{
+031       /* what is a mod LBL_prime_tab[ix] */
+032       if ((err = mp_mod_d (a, ltm_prime_tab[ix], &res)) != MP_OKAY) \{
+033         return err;
+034       \}
+035   
+036       /* is the residue zero? */
+037       if (res == 0) \{
+038         *result = MP_YES;
+039         return MP_OKAY;
+040       \}
+041     \}
+042   
+043     return MP_OKAY;
+044   \}
+045   #endif
+\end{alltt}
+\end{small}
+
+The algorithm defaults to a return of $0$ in case an error occurs.  The values in the prime table are all specified to be in the range of a 
+mp\_digit.  The table \_\_prime\_tab is defined in the following file.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_prime\_tab.c
+\vspace{-3mm}
+\begin{alltt}
+016   const mp_digit ltm_prime_tab[] = \{
+017     0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
+018     0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
+019     0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
+020     0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F,
+021   #ifndef MP_8BIT
+022     0x0083,
+023     0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
+024     0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
+025     0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
+026     0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
+027   
+028     0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
+029     0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
+030     0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
+031     0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
+032     0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
+033     0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
+034     0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
+035     0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
+036   
+037     0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
+038     0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
+039     0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
+040     0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
+041     0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
+042     0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
+043     0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
+044     0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
+045   
+046     0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
+047     0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
+048     0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
+049     0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
+050     0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
+051     0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
+052     0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
+053     0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
+054   #endif
+055   \};
+056   #endif
+\end{alltt}
+\end{small}
+
+Note that there are two possible tables.  When an mp\_digit is 7-bits long only the primes upto $127$ may be included, otherwise the primes
+upto $1619$ are used.  Note that the value of \textbf{PRIME\_SIZE} is a constant dependent on the size of a mp\_digit. 
+
+\subsection{The Fermat Test}
+The Fermat test is probably one the oldest tests to have a non-trivial probability of success.  It is based on the fact that if $n$ is in 
+fact prime then $a^{n} \equiv a \mbox{ (mod }n\mbox{)}$ for all $0 < a < n$.  The reason being that if $n$ is prime than the order of
+the multiplicative sub group is $n - 1$.  Any base $a$ must have an order which divides $n - 1$ and as such $a^n$ is equivalent to 
+$a^1 = a$.  
+
+If $n$ is composite then any given base $a$ does not have to have a period which divides $n - 1$.  In which case 
+it is possible that $a^n \nequiv a \mbox{ (mod }n\mbox{)}$.  However, this test is not absolute as it is possible that the order
+of a base will divide $n - 1$ which would then be reported as prime.  Such a base yields what is known as a Fermat pseudo-prime.  Several 
+integers known as Carmichael numbers will be a pseudo-prime to all valid bases.  Fortunately such numbers are extremely rare as $n$ grows
+in size.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_prime\_fermat}. \\
+\textbf{Input}.   mp\_int $a$ and $b$, $a \ge 2$, $0 < b < a$.  \\
+\textbf{Output}.  $c = 1$ if $b^a \equiv b \mbox{ (mod }a\mbox{)}$, otherwise $c = 0$.  \\
+\hline \\
+1.  $t \leftarrow b^a \mbox{ (mod }a\mbox{)}$ \\
+2.  If $t = b$ then \\
+\hspace{3mm}2.1  $c = 1$ \\
+3.  else \\
+\hspace{3mm}3.1  $c = 0$ \\
+4.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_prime\_fermat}
+\end{figure}
+\textbf{Algorithm mp\_prime\_fermat.}
+This algorithm determines whether an mp\_int $a$ is a Fermat prime to the base $b$ or not.  It uses a single modular exponentiation to
+determine the result.  
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_prime\_fermat.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* performs one Fermat test.
+018    * 
+019    * If "a" were prime then b**a == b (mod a) since the order of
+020    * the multiplicative sub-group would be phi(a) = a-1.  That means
+021    * it would be the same as b**(a mod (a-1)) == b**1 == b (mod a).
+022    *
+023    * Sets result to 1 if the congruence holds, or zero otherwise.
+024    */
+025   int mp_prime_fermat (mp_int * a, mp_int * b, int *result)
+026   \{
+027     mp_int  t;
+028     int     err;
+029   
+030     /* default to composite  */
+031     *result = MP_NO;
+032   
+033     /* ensure b > 1 */
+034     if (mp_cmp_d(b, 1) != MP_GT) \{
+035        return MP_VAL;
+036     \}
+037   
+038     /* init t */
+039     if ((err = mp_init (&t)) != MP_OKAY) \{
+040       return err;
+041     \}
+042   
+043     /* compute t = b**a mod a */
+044     if ((err = mp_exptmod (b, a, a, &t)) != MP_OKAY) \{
+045       goto LBL_T;
+046     \}
+047   
+048     /* is it equal to b? */
+049     if (mp_cmp (&t, b) == MP_EQ) \{
+050       *result = MP_YES;
+051     \}
+052   
+053     err = MP_OKAY;
+054   LBL_T:mp_clear (&t);
+055     return err;
+056   \}
+057   #endif
+\end{alltt}
+\end{small}
+
+\subsection{The Miller-Rabin Test}
+The Miller-Rabin (citation) test is another primality test which has tighter error bounds than the Fermat test specifically with sequentially chosen 
+candidate  integers.  The algorithm is based on the observation that if $n - 1 = 2^kr$ and if $b^r \nequiv \pm 1$ then after upto $k - 1$ squarings the 
+value must be equal to $-1$.  The squarings are stopped as soon as $-1$ is observed.  If the value of $1$ is observed first it means that
+some value not congruent to $\pm 1$ when squared equals one which cannot occur if $n$ is prime.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_prime\_miller\_rabin}. \\
+\textbf{Input}.   mp\_int $a$ and $b$, $a \ge 2$, $0 < b < a$.  \\
+\textbf{Output}.  $c = 1$ if $a$ is a Miller-Rabin prime to the base $a$, otherwise $c = 0$.  \\
+\hline
+1.  $a' \leftarrow a - 1$ \\
+2.  $r  \leftarrow n1$    \\
+3.  $c \leftarrow 0, s  \leftarrow 0$ \\
+4.  While $r.used > 0$ and $r_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}4.1  $s \leftarrow s + 1$ \\
+\hspace{3mm}4.2  $r \leftarrow \lfloor r / 2 \rfloor$ \\
+5.  $y \leftarrow b^r \mbox{ (mod }a\mbox{)}$ \\
+6.  If $y \nequiv \pm 1$ then \\
+\hspace{3mm}6.1  $j \leftarrow 1$ \\
+\hspace{3mm}6.2  While $j \le (s - 1)$ and $y \nequiv a'$ \\
+\hspace{6mm}6.2.1  $y \leftarrow y^2 \mbox{ (mod }a\mbox{)}$ \\
+\hspace{6mm}6.2.2  If $y = 1$ then goto step 8. \\
+\hspace{6mm}6.2.3  $j \leftarrow j + 1$ \\
+\hspace{3mm}6.3  If $y \nequiv a'$ goto step 8. \\
+7.  $c \leftarrow 1$\\
+8.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_prime\_miller\_rabin}
+\end{figure}
+\textbf{Algorithm mp\_prime\_miller\_rabin.}
+This algorithm performs one trial round of the Miller-Rabin algorithm to the base $b$.  It will set $c = 1$ if the algorithm cannot determine
+if $b$ is composite or $c = 0$ if $b$ is provably composite.  The values of $s$ and $r$ are computed such that $a' = a - 1 = 2^sr$.  
+
+If the value $y \equiv b^r$ is congruent to $\pm 1$ then the algorithm cannot prove if $a$ is composite or not.  Otherwise, the algorithm will
+square $y$ upto $s - 1$ times stopping only when $y \equiv -1$.  If $y^2 \equiv 1$ and $y \nequiv \pm 1$ then the algorithm can report that $a$
+is provably composite.  If the algorithm performs $s - 1$ squarings and $y \nequiv -1$ then $a$ is provably composite.  If $a$ is not provably 
+composite then it is \textit{probably} prime.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_prime\_miller\_rabin.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* Miller-Rabin test of "a" to the base of "b" as described in 
+018    * HAC pp. 139 Algorithm 4.24
+019    *
+020    * Sets result to 0 if definitely composite or 1 if probably prime.
+021    * Randomly the chance of error is no more than 1/4 and often 
+022    * very much lower.
+023    */
+024   int mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
+025   \{
+026     mp_int  n1, y, r;
+027     int     s, j, err;
+028   
+029     /* default */
+030     *result = MP_NO;
+031   
+032     /* ensure b > 1 */
+033     if (mp_cmp_d(b, 1) != MP_GT) \{
+034        return MP_VAL;
+035     \}     
+036   
+037     /* get n1 = a - 1 */
+038     if ((err = mp_init_copy (&n1, a)) != MP_OKAY) \{
+039       return err;
+040     \}
+041     if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) \{
+042       goto LBL_N1;
+043     \}
+044   
+045     /* set 2**s * r = n1 */
+046     if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) \{
+047       goto LBL_N1;
+048     \}
+049   
+050     /* count the number of least significant bits
+051      * which are zero
+052      */
+053     s = mp_cnt_lsb(&r);
+054   
+055     /* now divide n - 1 by 2**s */
+056     if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) \{
+057       goto LBL_R;
+058     \}
+059   
+060     /* compute y = b**r mod a */
+061     if ((err = mp_init (&y)) != MP_OKAY) \{
+062       goto LBL_R;
+063     \}
+064     if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) \{
+065       goto LBL_Y;
+066     \}
+067   
+068     /* if y != 1 and y != n1 do */
+069     if (mp_cmp_d (&y, 1) != MP_EQ && mp_cmp (&y, &n1) != MP_EQ) \{
+070       j = 1;
+071       /* while j <= s-1 and y != n1 */
+072       while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) \{
+073         if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) \{
+074            goto LBL_Y;
+075         \}
+076   
+077         /* if y == 1 then composite */
+078         if (mp_cmp_d (&y, 1) == MP_EQ) \{
+079            goto LBL_Y;
+080         \}
+081   
+082         ++j;
+083       \}
+084   
+085       /* if y != n1 then composite */
+086       if (mp_cmp (&y, &n1) != MP_EQ) \{
+087         goto LBL_Y;
+088       \}
+089     \}
+090   
+091     /* probably prime now */
+092     *result = MP_YES;
+093   LBL_Y:mp_clear (&y);
+094   LBL_R:mp_clear (&r);
+095   LBL_N1:mp_clear (&n1);
+096     return err;
+097   \}
+098   #endif
+\end{alltt}
+\end{small}
+
+
+
+
+\backmatter
+\appendix
+\begin{thebibliography}{ABCDEF}
+\bibitem[1]{TAOCPV2}
+Donald Knuth, \textit{The Art of Computer Programming}, Third Edition, Volume Two, Seminumerical Algorithms, Addison-Wesley, 1998
+
+\bibitem[2]{HAC}
+A. Menezes, P. van Oorschot, S. Vanstone, \textit{Handbook of Applied Cryptography}, CRC Press, 1996
+
+\bibitem[3]{ROSE}
+Michael Rosing, \textit{Implementing Elliptic Curve Cryptography}, Manning Publications, 1999
+
+\bibitem[4]{COMBA}
+Paul G. Comba, \textit{Exponentiation Cryptosystems on the IBM PC}. IBM Systems Journal 29(4): 526-538 (1990)
+
+\bibitem[5]{KARA}
+A. Karatsuba, Doklay Akad. Nauk SSSR 145 (1962), pp.293-294
+
+\bibitem[6]{KARAP}
+Andre Weimerskirch and Christof Paar, \textit{Generalizations of the Karatsuba Algorithm for Polynomial Multiplication}, Submitted to Design, Codes and Cryptography, March 2002
+
+\bibitem[7]{BARRETT}
+Paul Barrett, \textit{Implementing the Rivest Shamir and Adleman Public Key Encryption Algorithm on a Standard Digital Signal Processor}, Advances in Cryptology, Crypto '86, Springer-Verlag.
+
+\bibitem[8]{MONT}
+P.L.Montgomery. \textit{Modular multiplication without trial division}. Mathematics of Computation, 44(170):519-521, April 1985.
+
+\bibitem[9]{DRMET}
+Chae Hoon Lim and Pil Joong Lee, \textit{Generating Efficient Primes for Discrete Log Cryptosystems}, POSTECH Information Research Laboratories
+
+\bibitem[10]{MMB}
+J. Daemen and R. Govaerts and J. Vandewalle, \textit{Block ciphers based on Modular Arithmetic}, State and {P}rogress in the {R}esearch of {C}ryptography, 1993, pp. 80-89
+
+\bibitem[11]{RSAREF}
+R.L. Rivest, A. Shamir, L. Adleman, \textit{A Method for Obtaining Digital Signatures and Public-Key Cryptosystems}
+
+\bibitem[12]{DHREF}
+Whitfield Diffie, Martin E. Hellman, \textit{New Directions in Cryptography}, IEEE Transactions on Information Theory, 1976
+
+\bibitem[13]{IEEE}
+IEEE Standard for Binary Floating-Point Arithmetic (ANSI/IEEE Std 754-1985)
+
+\bibitem[14]{GMP}
+GNU Multiple Precision (GMP), \url{http://www.swox.com/gmp/}
+
+\bibitem[15]{MPI}
+Multiple Precision Integer Library (MPI), Michael Fromberger, \url{http://thayer.dartmouth.edu/~sting/mpi/}
+
+\bibitem[16]{OPENSSL}
+OpenSSL Cryptographic Toolkit, \url{http://openssl.org}
+
+\bibitem[17]{LIP}
+Large Integer Package, \url{http://home.hetnet.nl/~ecstr/LIP.zip}
+
+\bibitem[18]{ISOC}
+JTC1/SC22/WG14, ISO/IEC 9899:1999, ``A draft rationale for the C99 standard.''
+
+\bibitem[19]{JAVA}
+The Sun Java Website, \url{http://java.sun.com/}
+
+\end{thebibliography}
+
+\input{tommath.ind}
+
+\end{document}
diff --git a/libtommath/tommath_class.h b/libtommath/tommath_class.h
new file mode 100644
index 0000000..9faf627
--- /dev/null
+++ b/libtommath/tommath_class.h
@@ -0,0 +1,1000 @@
+#if !(defined(LTM1) && defined(LTM2) && defined(LTM3))
+#if defined(LTM2)
+#define LTM3
+#endif
+#if defined(LTM1)
+#define LTM2
+#endif
+#define LTM1
+
+#if defined(LTM_ALL)
+#define BN_ERROR_C
+#define BN_FAST_MP_INVMOD_C
+#define BN_FAST_MP_MONTGOMERY_REDUCE_C
+#define BN_FAST_S_MP_MUL_DIGS_C
+#define BN_FAST_S_MP_MUL_HIGH_DIGS_C
+#define BN_FAST_S_MP_SQR_C
+#define BN_MP_2EXPT_C
+#define BN_MP_ABS_C
+#define BN_MP_ADD_C
+#define BN_MP_ADD_D_C
+#define BN_MP_ADDMOD_C
+#define BN_MP_AND_C
+#define BN_MP_CLAMP_C
+#define BN_MP_CLEAR_C
+#define BN_MP_CLEAR_MULTI_C
+#define BN_MP_CMP_C
+#define BN_MP_CMP_D_C
+#define BN_MP_CMP_MAG_C
+#define BN_MP_CNT_LSB_C
+#define BN_MP_COPY_C
+#define BN_MP_COUNT_BITS_C
+#define BN_MP_DIV_C
+#define BN_MP_DIV_2_C
+#define BN_MP_DIV_2D_C
+#define BN_MP_DIV_3_C
+#define BN_MP_DIV_D_C
+#define BN_MP_DR_IS_MODULUS_C
+#define BN_MP_DR_REDUCE_C
+#define BN_MP_DR_SETUP_C
+#define BN_MP_EXCH_C
+#define BN_MP_EXPT_D_C
+#define BN_MP_EXPTMOD_C
+#define BN_MP_EXPTMOD_FAST_C
+#define BN_MP_EXTEUCLID_C
+#define BN_MP_FREAD_C
+#define BN_MP_FWRITE_C
+#define BN_MP_GCD_C
+#define BN_MP_GET_INT_C
+#define BN_MP_GROW_C
+#define BN_MP_INIT_C
+#define BN_MP_INIT_COPY_C
+#define BN_MP_INIT_MULTI_C
+#define BN_MP_INIT_SET_C
+#define BN_MP_INIT_SET_INT_C
+#define BN_MP_INIT_SIZE_C
+#define BN_MP_INVMOD_C
+#define BN_MP_INVMOD_SLOW_C
+#define BN_MP_IS_SQUARE_C
+#define BN_MP_JACOBI_C
+#define BN_MP_KARATSUBA_MUL_C
+#define BN_MP_KARATSUBA_SQR_C
+#define BN_MP_LCM_C
+#define BN_MP_LSHD_C
+#define BN_MP_MOD_C
+#define BN_MP_MOD_2D_C
+#define BN_MP_MOD_D_C
+#define BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+#define BN_MP_MONTGOMERY_REDUCE_C
+#define BN_MP_MONTGOMERY_SETUP_C
+#define BN_MP_MUL_C
+#define BN_MP_MUL_2_C
+#define BN_MP_MUL_2D_C
+#define BN_MP_MUL_D_C
+#define BN_MP_MULMOD_C
+#define BN_MP_N_ROOT_C
+#define BN_MP_NEG_C
+#define BN_MP_OR_C
+#define BN_MP_PRIME_FERMAT_C
+#define BN_MP_PRIME_IS_DIVISIBLE_C
+#define BN_MP_PRIME_IS_PRIME_C
+#define BN_MP_PRIME_MILLER_RABIN_C
+#define BN_MP_PRIME_NEXT_PRIME_C
+#define BN_MP_PRIME_RABIN_MILLER_TRIALS_C
+#define BN_MP_PRIME_RANDOM_EX_C
+#define BN_MP_RADIX_SIZE_C
+#define BN_MP_RADIX_SMAP_C
+#define BN_MP_RAND_C
+#define BN_MP_READ_RADIX_C
+#define BN_MP_READ_SIGNED_BIN_C
+#define BN_MP_READ_UNSIGNED_BIN_C
+#define BN_MP_REDUCE_C
+#define BN_MP_REDUCE_2K_C
+#define BN_MP_REDUCE_2K_L_C
+#define BN_MP_REDUCE_2K_SETUP_C
+#define BN_MP_REDUCE_2K_SETUP_L_C
+#define BN_MP_REDUCE_IS_2K_C
+#define BN_MP_REDUCE_IS_2K_L_C
+#define BN_MP_REDUCE_SETUP_C
+#define BN_MP_RSHD_C
+#define BN_MP_SET_C
+#define BN_MP_SET_INT_C
+#define BN_MP_SHRINK_C
+#define BN_MP_SIGNED_BIN_SIZE_C
+#define BN_MP_SQR_C
+#define BN_MP_SQRMOD_C
+#define BN_MP_SQRT_C
+#define BN_MP_SUB_C
+#define BN_MP_SUB_D_C
+#define BN_MP_SUBMOD_C
+#define BN_MP_TO_SIGNED_BIN_C
+#define BN_MP_TO_SIGNED_BIN_N_C
+#define BN_MP_TO_UNSIGNED_BIN_C
+#define BN_MP_TO_UNSIGNED_BIN_N_C
+#define BN_MP_TOOM_MUL_C
+#define BN_MP_TOOM_SQR_C
+#define BN_MP_TORADIX_C
+#define BN_MP_TORADIX_N_C
+#define BN_MP_UNSIGNED_BIN_SIZE_C
+#define BN_MP_XOR_C
+#define BN_MP_ZERO_C
+#define BN_PRIME_TAB_C
+#define BN_REVERSE_C
+#define BN_S_MP_ADD_C
+#define BN_S_MP_EXPTMOD_C
+#define BN_S_MP_MUL_DIGS_C
+#define BN_S_MP_MUL_HIGH_DIGS_C
+#define BN_S_MP_SQR_C
+#define BN_S_MP_SUB_C
+#define BNCORE_C
+#endif
+
+#if defined(BN_ERROR_C)
+   #define BN_MP_ERROR_TO_STRING_C
+#endif
+
+#if defined(BN_FAST_MP_INVMOD_C)
+   #define BN_MP_ISEVEN_C
+   #define BN_MP_INIT_MULTI_C
+   #define BN_MP_COPY_C
+   #define BN_MP_MOD_C
+   #define BN_MP_SET_C
+   #define BN_MP_DIV_2_C
+   #define BN_MP_ISODD_C
+   #define BN_MP_SUB_C
+   #define BN_MP_CMP_C
+   #define BN_MP_ISZERO_C
+   #define BN_MP_CMP_D_C
+   #define BN_MP_ADD_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_MULTI_C
+#endif
+
+#if defined(BN_FAST_MP_MONTGOMERY_REDUCE_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_RSHD_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_S_MP_SUB_C
+#endif
+
+#if defined(BN_FAST_S_MP_MUL_DIGS_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_FAST_S_MP_SQR_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_MP_2EXPT_C)
+   #define BN_MP_ZERO_C
+   #define BN_MP_GROW_C
+#endif
+
+#if defined(BN_MP_ABS_C)
+   #define BN_MP_COPY_C
+#endif
+
+#if defined(BN_MP_ADD_C)
+   #define BN_S_MP_ADD_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_S_MP_SUB_C
+#endif
+
+#if defined(BN_MP_ADD_D_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_SUB_D_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_MP_ADDMOD_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_ADD_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_MOD_C
+#endif
+
+#if defined(BN_MP_AND_C)
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_CLAMP_C)
+#endif
+
+#if defined(BN_MP_CLEAR_C)
+#endif
+
+#if defined(BN_MP_CLEAR_MULTI_C)
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_CMP_C)
+   #define BN_MP_CMP_MAG_C
+#endif
+
+#if defined(BN_MP_CMP_D_C)
+#endif
+
+#if defined(BN_MP_CMP_MAG_C)
+#endif
+
+#if defined(BN_MP_CNT_LSB_C)
+   #define BN_MP_ISZERO_C
+#endif
+
+#if defined(BN_MP_COPY_C)
+   #define BN_MP_GROW_C
+#endif
+
+#if defined(BN_MP_COUNT_BITS_C)
+#endif
+
+#if defined(BN_MP_DIV_C)
+   #define BN_MP_ISZERO_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_MP_COPY_C
+   #define BN_MP_ZERO_C
+   #define BN_MP_INIT_MULTI_C
+   #define BN_MP_SET_C
+   #define BN_MP_COUNT_BITS_C
+   #define BN_MP_ABS_C
+   #define BN_MP_MUL_2D_C
+   #define BN_MP_CMP_C
+   #define BN_MP_SUB_C
+   #define BN_MP_ADD_C
+   #define BN_MP_DIV_2D_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_MULTI_C
+   #define BN_MP_INIT_SIZE_C
+   #define BN_MP_INIT_C
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_LSHD_C
+   #define BN_MP_RSHD_C
+   #define BN_MP_MUL_D_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_DIV_2_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_MP_DIV_2D_C)
+   #define BN_MP_COPY_C
+   #define BN_MP_ZERO_C
+   #define BN_MP_INIT_C
+   #define BN_MP_MOD_2D_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_RSHD_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_EXCH_C
+#endif
+
+#if defined(BN_MP_DIV_3_C)
+   #define BN_MP_INIT_SIZE_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_DIV_D_C)
+   #define BN_MP_ISZERO_C
+   #define BN_MP_COPY_C
+   #define BN_MP_DIV_2D_C
+   #define BN_MP_DIV_3_C
+   #define BN_MP_INIT_SIZE_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_DR_IS_MODULUS_C)
+#endif
+
+#if defined(BN_MP_DR_REDUCE_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_S_MP_SUB_C
+#endif
+
+#if defined(BN_MP_DR_SETUP_C)
+#endif
+
+#if defined(BN_MP_EXCH_C)
+#endif
+
+#if defined(BN_MP_EXPT_D_C)
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_SET_C
+   #define BN_MP_SQR_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_MUL_C
+#endif
+
+#if defined(BN_MP_EXPTMOD_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_INVMOD_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_ABS_C
+   #define BN_MP_CLEAR_MULTI_C
+   #define BN_MP_REDUCE_IS_2K_L_C
+   #define BN_S_MP_EXPTMOD_C
+   #define BN_MP_DR_IS_MODULUS_C
+   #define BN_MP_REDUCE_IS_2K_C
+   #define BN_MP_ISODD_C
+   #define BN_MP_EXPTMOD_FAST_C
+#endif
+
+#if defined(BN_MP_EXPTMOD_FAST_C)
+   #define BN_MP_COUNT_BITS_C
+   #define BN_MP_INIT_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_MONTGOMERY_SETUP_C
+   #define BN_FAST_MP_MONTGOMERY_REDUCE_C
+   #define BN_MP_MONTGOMERY_REDUCE_C
+   #define BN_MP_DR_SETUP_C
+   #define BN_MP_DR_REDUCE_C
+   #define BN_MP_REDUCE_2K_SETUP_C
+   #define BN_MP_REDUCE_2K_C
+   #define BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
+   #define BN_MP_MULMOD_C
+   #define BN_MP_SET_C
+   #define BN_MP_MOD_C
+   #define BN_MP_COPY_C
+   #define BN_MP_SQR_C
+   #define BN_MP_MUL_C
+   #define BN_MP_EXCH_C
+#endif
+
+#if defined(BN_MP_EXTEUCLID_C)
+   #define BN_MP_INIT_MULTI_C
+   #define BN_MP_SET_C
+   #define BN_MP_COPY_C
+   #define BN_MP_ISZERO_C
+   #define BN_MP_DIV_C
+   #define BN_MP_MUL_C
+   #define BN_MP_SUB_C
+   #define BN_MP_NEG_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_MULTI_C
+#endif
+
+#if defined(BN_MP_FREAD_C)
+   #define BN_MP_ZERO_C
+   #define BN_MP_S_RMAP_C
+   #define BN_MP_MUL_D_C
+   #define BN_MP_ADD_D_C
+   #define BN_MP_CMP_D_C
+#endif
+
+#if defined(BN_MP_FWRITE_C)
+   #define BN_MP_RADIX_SIZE_C
+   #define BN_MP_TORADIX_C
+#endif
+
+#if defined(BN_MP_GCD_C)
+   #define BN_MP_ISZERO_C
+   #define BN_MP_ABS_C
+   #define BN_MP_ZERO_C
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_CNT_LSB_C
+   #define BN_MP_DIV_2D_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_MP_EXCH_C
+   #define BN_S_MP_SUB_C
+   #define BN_MP_MUL_2D_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_GET_INT_C)
+#endif
+
+#if defined(BN_MP_GROW_C)
+#endif
+
+#if defined(BN_MP_INIT_C)
+#endif
+
+#if defined(BN_MP_INIT_COPY_C)
+   #define BN_MP_COPY_C
+#endif
+
+#if defined(BN_MP_INIT_MULTI_C)
+   #define BN_MP_ERR_C
+   #define BN_MP_INIT_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_INIT_SET_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_SET_C
+#endif
+
+#if defined(BN_MP_INIT_SET_INT_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_SET_INT_C
+#endif
+
+#if defined(BN_MP_INIT_SIZE_C)
+   #define BN_MP_INIT_C
+#endif
+
+#if defined(BN_MP_INVMOD_C)
+   #define BN_MP_ISZERO_C
+   #define BN_MP_ISODD_C
+   #define BN_FAST_MP_INVMOD_C
+   #define BN_MP_INVMOD_SLOW_C
+#endif
+
+#if defined(BN_MP_INVMOD_SLOW_C)
+   #define BN_MP_ISZERO_C
+   #define BN_MP_INIT_MULTI_C
+   #define BN_MP_MOD_C
+   #define BN_MP_COPY_C
+   #define BN_MP_ISEVEN_C
+   #define BN_MP_SET_C
+   #define BN_MP_DIV_2_C
+   #define BN_MP_ISODD_C
+   #define BN_MP_ADD_C
+   #define BN_MP_SUB_C
+   #define BN_MP_CMP_C
+   #define BN_MP_CMP_D_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_MULTI_C
+#endif
+
+#if defined(BN_MP_IS_SQUARE_C)
+   #define BN_MP_MOD_D_C
+   #define BN_MP_INIT_SET_INT_C
+   #define BN_MP_MOD_C
+   #define BN_MP_GET_INT_C
+   #define BN_MP_SQRT_C
+   #define BN_MP_SQR_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_JACOBI_C)
+   #define BN_MP_CMP_D_C
+   #define BN_MP_ISZERO_C
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_CNT_LSB_C
+   #define BN_MP_DIV_2D_C
+   #define BN_MP_MOD_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_KARATSUBA_MUL_C)
+   #define BN_MP_MUL_C
+   #define BN_MP_INIT_SIZE_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_SUB_C
+   #define BN_MP_ADD_C
+   #define BN_MP_LSHD_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_KARATSUBA_SQR_C)
+   #define BN_MP_INIT_SIZE_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_SQR_C
+   #define BN_MP_SUB_C
+   #define BN_S_MP_ADD_C
+   #define BN_MP_LSHD_C
+   #define BN_MP_ADD_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_LCM_C)
+   #define BN_MP_INIT_MULTI_C
+   #define BN_MP_GCD_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_MP_DIV_C
+   #define BN_MP_MUL_C
+   #define BN_MP_CLEAR_MULTI_C
+#endif
+
+#if defined(BN_MP_LSHD_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_RSHD_C
+#endif
+
+#if defined(BN_MP_MOD_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_DIV_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_ADD_C
+   #define BN_MP_EXCH_C
+#endif
+
+#if defined(BN_MP_MOD_2D_C)
+   #define BN_MP_ZERO_C
+   #define BN_MP_COPY_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_MP_MOD_D_C)
+   #define BN_MP_DIV_D_C
+#endif
+
+#if defined(BN_MP_MONTGOMERY_CALC_NORMALIZATION_C)
+   #define BN_MP_COUNT_BITS_C
+   #define BN_MP_2EXPT_C
+   #define BN_MP_SET_C
+   #define BN_MP_MUL_2_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_S_MP_SUB_C
+#endif
+
+#if defined(BN_MP_MONTGOMERY_REDUCE_C)
+   #define BN_FAST_MP_MONTGOMERY_REDUCE_C
+   #define BN_MP_GROW_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_RSHD_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_S_MP_SUB_C
+#endif
+
+#if defined(BN_MP_MONTGOMERY_SETUP_C)
+#endif
+
+#if defined(BN_MP_MUL_C)
+   #define BN_MP_TOOM_MUL_C
+   #define BN_MP_KARATSUBA_MUL_C
+   #define BN_FAST_S_MP_MUL_DIGS_C
+   #define BN_S_MP_MUL_C
+   #define BN_S_MP_MUL_DIGS_C
+#endif
+
+#if defined(BN_MP_MUL_2_C)
+   #define BN_MP_GROW_C
+#endif
+
+#if defined(BN_MP_MUL_2D_C)
+   #define BN_MP_COPY_C
+   #define BN_MP_GROW_C
+   #define BN_MP_LSHD_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_MP_MUL_D_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_MP_MULMOD_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_MUL_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_MOD_C
+#endif
+
+#if defined(BN_MP_N_ROOT_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_SET_C
+   #define BN_MP_COPY_C
+   #define BN_MP_EXPT_D_C
+   #define BN_MP_MUL_C
+   #define BN_MP_SUB_C
+   #define BN_MP_MUL_D_C
+   #define BN_MP_DIV_C
+   #define BN_MP_CMP_C
+   #define BN_MP_SUB_D_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_NEG_C)
+   #define BN_MP_COPY_C
+   #define BN_MP_ISZERO_C
+#endif
+
+#if defined(BN_MP_OR_C)
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_PRIME_FERMAT_C)
+   #define BN_MP_CMP_D_C
+   #define BN_MP_INIT_C
+   #define BN_MP_EXPTMOD_C
+   #define BN_MP_CMP_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_PRIME_IS_DIVISIBLE_C)
+   #define BN_MP_MOD_D_C
+#endif
+
+#if defined(BN_MP_PRIME_IS_PRIME_C)
+   #define BN_MP_CMP_D_C
+   #define BN_MP_PRIME_IS_DIVISIBLE_C
+   #define BN_MP_INIT_C
+   #define BN_MP_SET_C
+   #define BN_MP_PRIME_MILLER_RABIN_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_PRIME_MILLER_RABIN_C)
+   #define BN_MP_CMP_D_C
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_SUB_D_C
+   #define BN_MP_CNT_LSB_C
+   #define BN_MP_DIV_2D_C
+   #define BN_MP_EXPTMOD_C
+   #define BN_MP_CMP_C
+   #define BN_MP_SQRMOD_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_PRIME_NEXT_PRIME_C)
+   #define BN_MP_CMP_D_C
+   #define BN_MP_SET_C
+   #define BN_MP_SUB_D_C
+   #define BN_MP_ISEVEN_C
+   #define BN_MP_MOD_D_C
+   #define BN_MP_INIT_C
+   #define BN_MP_ADD_D_C
+   #define BN_MP_PRIME_MILLER_RABIN_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_PRIME_RABIN_MILLER_TRIALS_C)
+#endif
+
+#if defined(BN_MP_PRIME_RANDOM_EX_C)
+   #define BN_MP_READ_UNSIGNED_BIN_C
+   #define BN_MP_PRIME_IS_PRIME_C
+   #define BN_MP_SUB_D_C
+   #define BN_MP_DIV_2_C
+   #define BN_MP_MUL_2_C
+   #define BN_MP_ADD_D_C
+#endif
+
+#if defined(BN_MP_RADIX_SIZE_C)
+   #define BN_MP_COUNT_BITS_C
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_ISZERO_C
+   #define BN_MP_DIV_D_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_RADIX_SMAP_C)
+   #define BN_MP_S_RMAP_C
+#endif
+
+#if defined(BN_MP_RAND_C)
+   #define BN_MP_ZERO_C
+   #define BN_MP_ADD_D_C
+   #define BN_MP_LSHD_C
+#endif
+
+#if defined(BN_MP_READ_RADIX_C)
+   #define BN_MP_ZERO_C
+   #define BN_MP_S_RMAP_C
+   #define BN_MP_MUL_D_C
+   #define BN_MP_ADD_D_C
+   #define BN_MP_ISZERO_C
+#endif
+
+#if defined(BN_MP_READ_SIGNED_BIN_C)
+   #define BN_MP_READ_UNSIGNED_BIN_C
+#endif
+
+#if defined(BN_MP_READ_UNSIGNED_BIN_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_ZERO_C
+   #define BN_MP_MUL_2D_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_MP_REDUCE_C)
+   #define BN_MP_REDUCE_SETUP_C
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_RSHD_C
+   #define BN_MP_MUL_C
+   #define BN_S_MP_MUL_HIGH_DIGS_C
+   #define BN_FAST_S_MP_MUL_HIGH_DIGS_C
+   #define BN_MP_MOD_2D_C
+   #define BN_S_MP_MUL_DIGS_C
+   #define BN_MP_SUB_C
+   #define BN_MP_CMP_D_C
+   #define BN_MP_SET_C
+   #define BN_MP_LSHD_C
+   #define BN_MP_ADD_C
+   #define BN_MP_CMP_C
+   #define BN_S_MP_SUB_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_REDUCE_2K_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_COUNT_BITS_C
+   #define BN_MP_DIV_2D_C
+   #define BN_MP_MUL_D_C
+   #define BN_S_MP_ADD_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_S_MP_SUB_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_REDUCE_2K_L_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_COUNT_BITS_C
+   #define BN_MP_DIV_2D_C
+   #define BN_MP_MUL_C
+   #define BN_S_MP_ADD_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_S_MP_SUB_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_REDUCE_2K_SETUP_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_COUNT_BITS_C
+   #define BN_MP_2EXPT_C
+   #define BN_MP_CLEAR_C
+   #define BN_S_MP_SUB_C
+#endif
+
+#if defined(BN_MP_REDUCE_2K_SETUP_L_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_2EXPT_C
+   #define BN_MP_COUNT_BITS_C
+   #define BN_S_MP_SUB_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_REDUCE_IS_2K_C)
+   #define BN_MP_REDUCE_2K_C
+   #define BN_MP_COUNT_BITS_C
+#endif
+
+#if defined(BN_MP_REDUCE_IS_2K_L_C)
+#endif
+
+#if defined(BN_MP_REDUCE_SETUP_C)
+   #define BN_MP_2EXPT_C
+   #define BN_MP_DIV_C
+#endif
+
+#if defined(BN_MP_RSHD_C)
+   #define BN_MP_ZERO_C
+#endif
+
+#if defined(BN_MP_SET_C)
+   #define BN_MP_ZERO_C
+#endif
+
+#if defined(BN_MP_SET_INT_C)
+   #define BN_MP_ZERO_C
+   #define BN_MP_MUL_2D_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_MP_SHRINK_C)
+#endif
+
+#if defined(BN_MP_SIGNED_BIN_SIZE_C)
+   #define BN_MP_UNSIGNED_BIN_SIZE_C
+#endif
+
+#if defined(BN_MP_SQR_C)
+   #define BN_MP_TOOM_SQR_C
+   #define BN_MP_KARATSUBA_SQR_C
+   #define BN_FAST_S_MP_SQR_C
+   #define BN_S_MP_SQR_C
+#endif
+
+#if defined(BN_MP_SQRMOD_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_SQR_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_MOD_C
+#endif
+
+#if defined(BN_MP_SQRT_C)
+   #define BN_MP_N_ROOT_C
+   #define BN_MP_ISZERO_C
+   #define BN_MP_ZERO_C
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_RSHD_C
+   #define BN_MP_DIV_C
+   #define BN_MP_ADD_C
+   #define BN_MP_DIV_2_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_SUB_C)
+   #define BN_S_MP_ADD_C
+   #define BN_MP_CMP_MAG_C
+   #define BN_S_MP_SUB_C
+#endif
+
+#if defined(BN_MP_SUB_D_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_ADD_D_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_MP_SUBMOD_C)
+   #define BN_MP_INIT_C
+   #define BN_MP_SUB_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_MOD_C
+#endif
+
+#if defined(BN_MP_TO_SIGNED_BIN_C)
+   #define BN_MP_TO_UNSIGNED_BIN_C
+#endif
+
+#if defined(BN_MP_TO_SIGNED_BIN_N_C)
+   #define BN_MP_SIGNED_BIN_SIZE_C
+   #define BN_MP_TO_SIGNED_BIN_C
+#endif
+
+#if defined(BN_MP_TO_UNSIGNED_BIN_C)
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_ISZERO_C
+   #define BN_MP_DIV_2D_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_TO_UNSIGNED_BIN_N_C)
+   #define BN_MP_UNSIGNED_BIN_SIZE_C
+   #define BN_MP_TO_UNSIGNED_BIN_C
+#endif
+
+#if defined(BN_MP_TOOM_MUL_C)
+   #define BN_MP_INIT_MULTI_C
+   #define BN_MP_MOD_2D_C
+   #define BN_MP_COPY_C
+   #define BN_MP_RSHD_C
+   #define BN_MP_MUL_C
+   #define BN_MP_MUL_2_C
+   #define BN_MP_ADD_C
+   #define BN_MP_SUB_C
+   #define BN_MP_DIV_2_C
+   #define BN_MP_MUL_2D_C
+   #define BN_MP_MUL_D_C
+   #define BN_MP_DIV_3_C
+   #define BN_MP_LSHD_C
+   #define BN_MP_CLEAR_MULTI_C
+#endif
+
+#if defined(BN_MP_TOOM_SQR_C)
+   #define BN_MP_INIT_MULTI_C
+   #define BN_MP_MOD_2D_C
+   #define BN_MP_COPY_C
+   #define BN_MP_RSHD_C
+   #define BN_MP_SQR_C
+   #define BN_MP_MUL_2_C
+   #define BN_MP_ADD_C
+   #define BN_MP_SUB_C
+   #define BN_MP_DIV_2_C
+   #define BN_MP_MUL_2D_C
+   #define BN_MP_MUL_D_C
+   #define BN_MP_DIV_3_C
+   #define BN_MP_LSHD_C
+   #define BN_MP_CLEAR_MULTI_C
+#endif
+
+#if defined(BN_MP_TORADIX_C)
+   #define BN_MP_ISZERO_C
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_DIV_D_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_S_RMAP_C
+#endif
+
+#if defined(BN_MP_TORADIX_N_C)
+   #define BN_MP_ISZERO_C
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_DIV_D_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_S_RMAP_C
+#endif
+
+#if defined(BN_MP_UNSIGNED_BIN_SIZE_C)
+   #define BN_MP_COUNT_BITS_C
+#endif
+
+#if defined(BN_MP_XOR_C)
+   #define BN_MP_INIT_COPY_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_MP_ZERO_C)
+#endif
+
+#if defined(BN_PRIME_TAB_C)
+#endif
+
+#if defined(BN_REVERSE_C)
+#endif
+
+#if defined(BN_S_MP_ADD_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BN_S_MP_EXPTMOD_C)
+   #define BN_MP_COUNT_BITS_C
+   #define BN_MP_INIT_C
+   #define BN_MP_CLEAR_C
+   #define BN_MP_REDUCE_SETUP_C
+   #define BN_MP_REDUCE_C
+   #define BN_MP_REDUCE_2K_SETUP_L_C
+   #define BN_MP_REDUCE_2K_L_C
+   #define BN_MP_MOD_C
+   #define BN_MP_COPY_C
+   #define BN_MP_SQR_C
+   #define BN_MP_MUL_C
+   #define BN_MP_SET_C
+   #define BN_MP_EXCH_C
+#endif
+
+#if defined(BN_S_MP_MUL_DIGS_C)
+   #define BN_FAST_S_MP_MUL_DIGS_C
+   #define BN_MP_INIT_SIZE_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_S_MP_MUL_HIGH_DIGS_C)
+   #define BN_FAST_S_MP_MUL_HIGH_DIGS_C
+   #define BN_MP_INIT_SIZE_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_S_MP_SQR_C)
+   #define BN_MP_INIT_SIZE_C
+   #define BN_MP_CLAMP_C
+   #define BN_MP_EXCH_C
+   #define BN_MP_CLEAR_C
+#endif
+
+#if defined(BN_S_MP_SUB_C)
+   #define BN_MP_GROW_C
+   #define BN_MP_CLAMP_C
+#endif
+
+#if defined(BNCORE_C)
+#endif
+
+#ifdef LTM3
+#define LTM_LAST
+#endif
+#include <tommath_superclass.h>
+#include <tommath_class.h>
+#else
+#define LTM_LAST
+#endif
+
+/* Dropbear doesn't need these. */
+#undef BN_MP_KARATSUBA_MUL_C
+#undef BN_MP_KARATSUBA_SQR_C
+#undef BN_MP_TOOM_MUL_C
+#undef BN_MP_TOOM_SQR_C
diff --git a/libtommath/tommath_superclass.h b/libtommath/tommath_superclass.h
new file mode 100644
index 0000000..b50ecb0
--- /dev/null
+++ b/libtommath/tommath_superclass.h
@@ -0,0 +1,72 @@
+/* super class file for PK algos */
+
+/* default ... include all MPI */
+#define LTM_ALL
+
+/* RSA only (does not support DH/DSA/ECC) */
+// #define SC_RSA_1
+
+/* For reference.... On an Athlon64 optimizing for speed...
+
+   LTM's mpi.o with all functions [striped] is 142KiB in size.
+
+*/
+
+/* Works for RSA only, mpi.o is 68KiB */
+#ifdef SC_RSA_1
+   #define BN_MP_SHRINK_C
+   #define BN_MP_LCM_C
+   #define BN_MP_PRIME_RANDOM_EX_C
+   #define BN_MP_INVMOD_C
+   #define BN_MP_GCD_C
+   #define BN_MP_MOD_C
+   #define BN_MP_MULMOD_C
+   #define BN_MP_ADDMOD_C
+   #define BN_MP_EXPTMOD_C
+   #define BN_MP_SET_INT_C
+   #define BN_MP_INIT_MULTI_C
+   #define BN_MP_CLEAR_MULTI_C
+   #define BN_MP_UNSIGNED_BIN_SIZE_C
+   #define BN_MP_TO_UNSIGNED_BIN_C
+   #define BN_MP_MOD_D_C
+   #define BN_MP_PRIME_RABIN_MILLER_TRIALS_C
+   #define BN_REVERSE_C
+   #define BN_PRIME_TAB_C
+
+   /* other modifiers */
+   #define BN_MP_DIV_SMALL                    /* Slower division, not critical */
+
+   /* here we are on the last pass so we turn things off.  The functions classes are still there
+    * but we remove them specifically from the build.  This also invokes tweaks in functions
+    * like removing support for even moduli, etc...
+    */
+#ifdef LTM_LAST
+   #undef  BN_MP_TOOM_MUL_C
+   #undef  BN_MP_TOOM_SQR_C
+   #undef  BN_MP_KARATSUBA_MUL_C
+   #undef  BN_MP_KARATSUBA_SQR_C
+   #undef  BN_MP_REDUCE_C
+   #undef  BN_MP_REDUCE_SETUP_C
+   #undef  BN_MP_DR_IS_MODULUS_C
+   #undef  BN_MP_DR_SETUP_C
+   #undef  BN_MP_DR_REDUCE_C
+   #undef  BN_MP_REDUCE_IS_2K_C
+   #undef  BN_MP_REDUCE_2K_SETUP_C
+   #undef  BN_MP_REDUCE_2K_C
+   #undef  BN_S_MP_EXPTMOD_C
+   #undef  BN_MP_DIV_3_C
+   #undef  BN_S_MP_MUL_HIGH_DIGS_C
+   #undef  BN_FAST_S_MP_MUL_HIGH_DIGS_C
+   #undef  BN_FAST_MP_INVMOD_C
+
+   /* To safely undefine these you have to make sure your RSA key won't exceed the Comba threshold
+    * which is roughly 255 digits [7140 bits for 32-bit machines, 15300 bits for 64-bit machines] 
+    * which means roughly speaking you can handle upto 2536-bit RSA keys with these defined without
+    * trouble.  
+    */
+   #undef  BN_S_MP_MUL_DIGS_C
+   #undef  BN_S_MP_SQR_C
+   #undef  BN_MP_MONTGOMERY_REDUCE_C
+#endif
+
+#endif
diff --git a/listener.c b/listener.c
new file mode 100644
index 0000000..dd90c6b
--- /dev/null
+++ b/listener.c
@@ -0,0 +1,165 @@
+/*
+ * Dropbear SSH
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "listener.h"
+#include "session.h"
+#include "dbutil.h"
+
+void listeners_initialise() {
+
+	/* just one slot to start with */
+	ses.listeners = (struct Listener**)m_malloc(sizeof(struct Listener*));
+	ses.listensize = 1;
+	ses.listeners[0] = NULL;
+
+}
+
+void set_listener_fds(fd_set * readfds) {
+
+	unsigned int i, j;
+	struct Listener *listener;
+
+	/* check each in turn */
+	for (i = 0; i < ses.listensize; i++) {
+		listener = ses.listeners[i];
+		if (listener != NULL) {
+			for (j = 0; j < listener->nsocks; j++) {
+				FD_SET(listener->socks[j], readfds);
+			}
+		}
+	}
+}
+
+
+void handle_listeners(fd_set * readfds) {
+
+	unsigned int i, j;
+	struct Listener *listener;
+	int sock;
+
+	/* check each in turn */
+	for (i = 0; i < ses.listensize; i++) {
+		listener = ses.listeners[i];
+		if (listener != NULL) {
+			for (j = 0; j < listener->nsocks; j++) {
+				sock = listener->socks[j];
+				if (FD_ISSET(sock, readfds)) {
+					listener->acceptor(listener, sock);
+				}
+			}
+		}
+	}
+} /* Woo brace matching */
+
+
+/* acceptor(int fd, void* typedata) is a function to accept connections, 
+ * cleanup(void* typedata) happens when cleaning up */
+struct Listener* new_listener(int socks[], unsigned int nsocks,
+		int type, void* typedata, 
+		void (*acceptor)(struct Listener* listener, int sock), 
+		void (*cleanup)(struct Listener*)) {
+
+	unsigned int i, j;
+	struct Listener *newlisten = NULL;
+	/* try get a new structure to hold it */
+	for (i = 0; i < ses.listensize; i++) {
+		if (ses.listeners[i] == NULL) {
+			break;
+		}
+	}
+
+	/* or create a new one */
+	if (i == ses.listensize) {
+		if (ses.listensize > MAX_LISTENERS) {
+			TRACE(("leave newlistener: too many already"))
+			for (j = 0; j < nsocks; j++) {
+				close(socks[i]);
+			}
+			return NULL;
+		}
+		
+		ses.listeners = (struct Listener**)m_realloc(ses.listeners,
+				(ses.listensize+LISTENER_EXTEND_SIZE)
+				*sizeof(struct Listener*));
+
+		ses.listensize += LISTENER_EXTEND_SIZE;
+
+		for (j = i; j < ses.listensize; j++) {
+			ses.listeners[j] = NULL;
+		}
+	}
+
+	for (j = 0; j < nsocks; j++) {
+		ses.maxfd = MAX(ses.maxfd, socks[j]);
+	}
+
+	TRACE(("new listener num %d ", i))
+
+	newlisten = (struct Listener*)m_malloc(sizeof(struct Listener));
+	newlisten->index = i;
+	newlisten->type = type;
+	newlisten->typedata = typedata;
+	newlisten->nsocks = nsocks;
+	memcpy(newlisten->socks, socks, nsocks * sizeof(int));
+	newlisten->acceptor = acceptor;
+	newlisten->cleanup = cleanup;
+
+	ses.listeners[i] = newlisten;
+	return newlisten;
+}
+
+/* Return the first listener which matches the type-specific comparison
+ * function. Particularly needed for global requests, like tcp */
+struct Listener * get_listener(int type, void* typedata,
+		int (*match)(void*, void*)) {
+
+	unsigned int i;
+	struct Listener* listener;
+
+	for (i = 0, listener = ses.listeners[i]; i < ses.listensize; i++) {
+		if (listener->type == type
+				&& match(typedata, listener->typedata)) {
+			return listener;
+		}
+	}
+
+	return NULL;
+}
+
+void remove_listener(struct Listener* listener) {
+
+	unsigned int j;
+
+	if (listener->cleanup) {
+		listener->cleanup(listener);
+	}
+
+	for (j = 0; j < listener->nsocks; j++) {
+		close(listener->socks[j]);
+	}
+	ses.listeners[listener->index] = NULL;
+	m_free(listener);
+
+}
diff --git a/listener.h b/listener.h
new file mode 100644
index 0000000..5092efd
--- /dev/null
+++ b/listener.h
@@ -0,0 +1,63 @@
+/*
+ * Dropbear SSH
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _LISTENER_H
+#define _LISTENER_H
+
+#define MAX_LISTENERS 20
+#define LISTENER_EXTEND_SIZE 1
+
+struct Listener {
+
+	int socks[DROPBEAR_MAX_SOCKS];
+	unsigned int nsocks;
+
+	int index; /* index in the array of listeners */
+
+	void (*acceptor)(struct Listener*, int sock);
+	void (*cleanup)(struct Listener*);
+
+	int type; /* CHANNEL_ID_X11, CHANNEL_ID_AGENT, 
+				 CHANNEL_ID_TCPDIRECT (for clients),
+				 CHANNEL_ID_TCPFORWARDED (for servers) */
+
+	void *typedata;
+
+};
+
+void listeners_initialise();
+void handle_listeners(fd_set * readfds);
+void set_listener_fds(fd_set * readfds);
+
+struct Listener* new_listener(int socks[], unsigned int nsocks, 
+		int type, void* typedata, 
+		void (*acceptor)(struct Listener* listener, int sock), 
+		void (*cleanup)(struct Listener*));
+
+struct Listener * get_listener(int type, void* typedata,
+		int (*match)(void*, void*));
+
+void remove_listener(struct Listener* listener);
+
+#endif /* _LISTENER_H */
diff --git a/loginrec.c b/loginrec.c
new file mode 100644
index 0000000..f084566
--- /dev/null
+++ b/loginrec.c
@@ -0,0 +1,1394 @@
+/*
+ * Copyright (c) 2000 Andre Lucas.  All rights reserved.
+ * Portions copyright (c) 1998 Todd C. Miller
+ * Portions copyright (c) 1996 Jason Downs
+ * Portions copyright (c) 1996 Theo de Raadt
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ ** loginrec.c:  platform-independent login recording and lastlog retrieval
+ **/
+
+/* For now lastlog code has been removed as it wasn't being used by Dropbear. */
+
+/*
+  The new login code explained
+  ============================
+
+  This code attempts to provide a common interface to login recording
+  (utmp and friends) and last login time retrieval.
+
+  Its primary means of achieving this is to use 'struct logininfo', a
+  union of all the useful fields in the various different types of
+  system login record structures one finds on UNIX variants.
+
+  We depend on autoconf to define which recording methods are to be
+  used, and which fields are contained in the relevant data structures
+  on the local system. Many C preprocessor symbols affect which code
+  gets compiled here.
+
+  The code is designed to make it easy to modify a particular
+  recording method, without affecting other methods nor requiring so
+  many nested conditional compilation blocks as were commonplace in
+  the old code.
+
+  For login recording, we try to use the local system's libraries as
+  these are clearly most likely to work correctly. For utmp systems
+  this usually means login() and logout() or setutent() etc., probably
+  in libutil, along with logwtmp() etc. On these systems, we fall back
+  to writing the files directly if we have to, though this method
+  requires very thorough testing so we do not corrupt local auditing
+  information. These files and their access methods are very system
+  specific indeed.
+
+  For utmpx systems, the corresponding library functions are
+  setutxent() etc. To the author's knowledge, all utmpx systems have
+  these library functions and so no direct write is attempted. If such
+  a system exists and needs support, direct analogues of the [uw]tmp
+  code should suffice.
+
+  Retrieving the time of last login ('lastlog') is in some ways even
+  more problemmatic than login recording. Some systems provide a
+  simple table of all users which we seek based on uid and retrieve a
+  relatively standard structure. Others record the same information in
+  a directory with a separate file, and others don't record the
+  information separately at all. For systems in the latter category,
+  we look backwards in the wtmp or wtmpx file for the last login entry
+  for our user. Naturally this is slower and on busy systems could
+  incur a significant performance penalty.
+
+  Calling the new code
+  --------------------
+
+  In OpenSSH all login recording and retrieval is performed in
+  login.c. Here you'll find working examples. Also, in the logintest.c
+  program there are more examples.
+
+  Internal handler calling method
+  -------------------------------
+
+  When a call is made to login_login() or login_logout(), both
+  routines set a struct logininfo flag defining which action (log in,
+  or log out) is to be taken. They both then call login_write(), which
+  calls whichever of the many structure-specific handlers autoconf
+  selects for the local system.
+
+  The handlers themselves handle system data structure specifics. Both
+  struct utmp and struct utmpx have utility functions (see
+  construct_utmp*()) to try to make it simpler to add extra systems
+  that introduce new features to either structure.
+
+  While it may seem terribly wasteful to replicate so much similar
+  code for each method, experience has shown that maintaining code to
+  write both struct utmp and utmpx in one function, whilst maintaining
+  support for all systems whether they have library support or not, is
+  a difficult and time-consuming task.
+
+  Lastlog support proceeds similarly. Functions login_get_lastlog()
+  (and its OpenSSH-tuned friend login_get_lastlog_time()) call
+  getlast_entry(), which tries one of three methods to find the last
+  login time. It uses local system lastlog support if it can,
+  otherwise it tries wtmp or wtmpx before giving up and returning 0,
+  meaning "tilt".
+
+  Maintenance
+  -----------
+
+  In many cases it's possible to tweak autoconf to select the correct
+  methods for a particular platform, either by improving the detection
+  code (best), or by presetting DISABLE_<method> or CONF_<method>_FILE
+  symbols for the platform.
+
+  Use logintest to check which symbols are defined before modifying
+  configure.ac and loginrec.c. (You have to build logintest yourself
+  with 'make logintest' as it's not built by default.)
+
+  Otherwise, patches to the specific method(s) are very helpful!
+
+*/
+
+/**
+ ** TODO:
+ **   homegrown ttyslot()
+ **   test, test, test
+ **
+ ** Platform status:
+ ** ----------------
+ **
+ ** Known good:
+ **   Linux (Redhat 6.2, Debian)
+ **   Solaris
+ **   HP-UX 10.20 (gcc only)
+ **   IRIX
+ **   NeXT - M68k/HPPA/Sparc (4.2/3.3)
+ **
+ ** Testing required: Please send reports!
+ **   NetBSD
+ **   HP-UX 11
+ **   AIX
+ **
+ ** Platforms with known problems:
+ **   Some variants of Slackware Linux
+ **
+ **/
+
+
+#include "includes.h"
+#include "loginrec.h"
+#include "dbutil.h"
+#include "atomicio.h"
+
+/**
+ ** prototypes for helper functions in this file
+ **/
+
+#if HAVE_UTMP_H
+void set_utmp_time(struct logininfo *li, struct utmp *ut);
+void construct_utmp(struct logininfo *li, struct utmp *ut);
+#endif
+
+#ifdef HAVE_UTMPX_H
+void set_utmpx_time(struct logininfo *li, struct utmpx *ut);
+void construct_utmpx(struct logininfo *li, struct utmpx *ut);
+#endif
+
+int utmp_write_entry(struct logininfo *li);
+int utmpx_write_entry(struct logininfo *li);
+int wtmp_write_entry(struct logininfo *li);
+int wtmpx_write_entry(struct logininfo *li);
+int lastlog_write_entry(struct logininfo *li);
+int syslogin_write_entry(struct logininfo *li);
+
+int wtmp_get_entry(struct logininfo *li);
+int wtmpx_get_entry(struct logininfo *li);
+
+/* pick the shortest string */
+#define MIN_SIZEOF(s1,s2) ( sizeof(s1) < sizeof(s2) ? sizeof(s1) : sizeof(s2) )
+
+/**
+ ** platform-independent login functions
+ **/
+
+/* login_login(struct logininfo *)     -Record a login
+ *
+ * Call with a pointer to a struct logininfo initialised with
+ * login_init_entry() or login_alloc_entry()
+ *
+ * Returns:
+ *  >0 if successful
+ *  0  on failure (will use OpenSSH's logging facilities for diagnostics)
+ */
+int
+login_login (struct logininfo *li)
+{
+	li->type = LTYPE_LOGIN;
+	return login_write(li);
+}
+
+
+/* login_logout(struct logininfo *)     - Record a logout
+ *
+ * Call as with login_login()
+ *
+ * Returns:
+ *  >0 if successful
+ *  0  on failure (will use OpenSSH's logging facilities for diagnostics)
+ */
+int
+login_logout(struct logininfo *li)
+{
+	li->type = LTYPE_LOGOUT;
+	return login_write(li);
+}
+
+
+/* login_alloc_entry(int, char*, char*, char*)    - Allocate and initialise
+ *                                                  a logininfo structure
+ *
+ * This function creates a new struct logininfo, a data structure
+ * meant to carry the information required to portably record login info.
+ *
+ * Returns a pointer to a newly created struct logininfo. If memory
+ * allocation fails, the program halts.
+ */
+struct
+logininfo *login_alloc_entry(int pid, const char *username,
+			     const char *hostname, const char *line)
+{
+	struct logininfo *newli;
+
+	newli = (struct logininfo *) m_malloc (sizeof(*newli));
+	(void)login_init_entry(newli, pid, username, hostname, line);
+	return newli;
+}
+
+
+/* login_free_entry(struct logininfo *)    - free struct memory */
+void
+login_free_entry(struct logininfo *li)
+{
+	m_free(li);
+}
+
+
+/* login_init_entry(struct logininfo *, int, char*, char*, char*)
+ *                                        - initialise a struct logininfo
+ *
+ * Populates a new struct logininfo, a data structure meant to carry
+ * the information required to portably record login info.
+ *
+ * Returns: 1
+ */
+int
+login_init_entry(struct logininfo *li, int pid, const char *username,
+		 const char *hostname, const char *line)
+{
+	struct passwd *pw;
+
+	memset(li, 0, sizeof(*li));
+
+	li->pid = pid;
+
+	/* set the line information */
+	if (line)
+		line_fullname(li->line, line, sizeof(li->line));
+
+	if (username) {
+		strlcpy(li->username, username, sizeof(li->username));
+		pw = getpwnam(li->username);
+		if (pw == NULL)
+			dropbear_exit("login_init_entry: Cannot find user \"%s\"",
+					li->username);
+		li->uid = pw->pw_uid;
+	}
+
+	if (hostname)
+		strlcpy(li->hostname, hostname, sizeof(li->hostname));
+
+	return 1;
+}
+
+/* login_set_current_time(struct logininfo *)    - set the current time
+ *
+ * Set the current time in a logininfo structure. This function is
+ * meant to eliminate the need to deal with system dependencies for
+ * time handling.
+ */
+void
+login_set_current_time(struct logininfo *li)
+{
+	struct timeval tv;
+
+	gettimeofday(&tv, NULL);
+
+	li->tv_sec = tv.tv_sec;
+	li->tv_usec = tv.tv_usec;
+}
+
+/* copy a sockaddr_* into our logininfo */
+void
+login_set_addr(struct logininfo *li, const struct sockaddr *sa,
+	       const unsigned int sa_size)
+{
+	unsigned int bufsize = sa_size;
+
+	/* make sure we don't overrun our union */
+	if (sizeof(li->hostaddr) < sa_size)
+		bufsize = sizeof(li->hostaddr);
+
+	memcpy((void *)&(li->hostaddr.sa), (const void *)sa, bufsize);
+}
+
+
+/**
+ ** login_write: Call low-level recording functions based on autoconf
+ ** results
+ **/
+int
+login_write (struct logininfo *li)
+{
+#ifndef HAVE_CYGWIN
+	if ((int)geteuid() != 0) {
+	  dropbear_log(LOG_WARNING,
+			  "Attempt to write login records by non-root user (aborting)");
+	  return 1;
+	}
+#endif
+
+	/* set the timestamp */
+	login_set_current_time(li);
+#ifdef USE_LOGIN
+	syslogin_write_entry(li);
+#endif
+#ifdef USE_LASTLOG
+	if (li->type == LTYPE_LOGIN) {
+		lastlog_write_entry(li);
+	}
+#endif
+#ifdef USE_UTMP
+	utmp_write_entry(li);
+#endif
+#ifdef USE_WTMP
+	wtmp_write_entry(li);
+#endif
+#ifdef USE_UTMPX
+	utmpx_write_entry(li);
+#endif
+#ifdef USE_WTMPX
+	wtmpx_write_entry(li);
+#endif
+	return 0;
+}
+
+#ifdef LOGIN_NEEDS_UTMPX
+int
+login_utmp_only(struct logininfo *li)
+{
+	li->type = LTYPE_LOGIN; 
+	login_set_current_time(li);
+# ifdef USE_UTMP
+	utmp_write_entry(li);
+# endif
+# ifdef USE_WTMP
+	wtmp_write_entry(li);
+# endif
+# ifdef USE_UTMPX
+	utmpx_write_entry(li);
+# endif
+# ifdef USE_WTMPX
+	wtmpx_write_entry(li);
+# endif
+	return 0;
+}
+#endif
+
+
+
+/*
+ * 'line' string utility functions
+ *
+ * These functions process the 'line' string into one of three forms:
+ *
+ * 1. The full filename (including '/dev')
+ * 2. The stripped name (excluding '/dev')
+ * 3. The abbreviated name (e.g. /dev/ttyp00 -> yp00
+ *                               /dev/pts/1  -> ts/1 )
+ *
+ * Form 3 is used on some systems to identify a .tmp.? entry when
+ * attempting to remove it. Typically both addition and removal is
+ * performed by one application - say, sshd - so as long as the choice
+ * uniquely identifies a terminal it's ok.
+ */
+
+
+/* line_fullname(): add the leading '/dev/' if it doesn't exist make
+ * sure dst has enough space, if not just copy src (ugh) */
+char *
+line_fullname(char *dst, const char *src, size_t dstsize)
+{
+	memset(dst, '\0', dstsize);
+	if ((strncmp(src, "/dev/", 5) == 0) || (dstsize < (strlen(src) + 5))) {
+		strlcpy(dst, src, dstsize);
+	} else {
+		strlcpy(dst, "/dev/", dstsize);
+		strlcat(dst, src, dstsize);
+	}
+	return dst;
+}
+
+/* line_stripname(): strip the leading '/dev' if it exists, return dst */
+char *
+line_stripname(char *dst, const char *src, size_t dstsize)
+{
+	memset(dst, '\0', dstsize);
+	if (strncmp(src, "/dev/", 5) == 0)
+		strlcpy(dst, src + 5, dstsize);
+	else
+		strlcpy(dst, src, dstsize);
+	return dst;
+}
+
+/* line_abbrevname(): Return the abbreviated (usually four-character)
+ * form of the line (Just use the last <dstsize> characters of the
+ * full name.)
+ *
+ * NOTE: use strncpy because we do NOT necessarily want zero
+ * termination */
+char *
+line_abbrevname(char *dst, const char *src, size_t dstsize)
+{
+	size_t len;
+
+	memset(dst, '\0', dstsize);
+
+	/* Always skip prefix if present */
+	if (strncmp(src, "/dev/", 5) == 0)
+		src += 5;
+
+#ifdef WITH_ABBREV_NO_TTY
+	if (strncmp(src, "tty", 3) == 0)
+		src += 3;
+#endif
+
+	len = strlen(src);
+
+	if (len > 0) {
+		if (((int)len - dstsize) > 0)
+			src +=  ((int)len - dstsize);
+
+		/* note: _don't_ change this to strlcpy */
+		strncpy(dst, src, (size_t)dstsize);
+	}
+
+	return dst;
+}
+
+/**
+ ** utmp utility functions
+ **
+ ** These functions manipulate struct utmp, taking system differences
+ ** into account.
+ **/
+
+#if defined(USE_UTMP) || defined (USE_WTMP) || defined (USE_LOGIN)
+
+/* build the utmp structure */
+void
+set_utmp_time(struct logininfo *li, struct utmp *ut)
+{
+# ifdef HAVE_STRUCT_UTMP_UT_TV
+	ut->ut_tv.tv_sec = li->tv_sec;
+	ut->ut_tv.tv_usec = li->tv_usec;
+# else
+#  ifdef HAVE_STRUCT_UTMP_UT_TIME
+	ut->ut_time = li->tv_sec;
+#  endif
+# endif
+}
+
+void
+construct_utmp(struct logininfo *li,
+		    struct utmp *ut)
+{
+# ifdef HAVE_ADDR_V6_IN_UTMP
+	struct sockaddr_in6 *sa6;
+#  endif
+	memset(ut, '\0', sizeof(*ut));
+
+	/* First fill out fields used for both logins and logouts */
+
+# ifdef HAVE_STRUCT_UTMP_UT_ID
+	line_abbrevname(ut->ut_id, li->line, sizeof(ut->ut_id));
+# endif
+
+# ifdef HAVE_STRUCT_UTMP_UT_TYPE
+	/* This is done here to keep utmp constants out of struct logininfo */
+	switch (li->type) {
+	case LTYPE_LOGIN:
+		ut->ut_type = USER_PROCESS;
+#ifdef _UNICOS
+		cray_set_tmpdir(ut);
+#endif
+		break;
+	case LTYPE_LOGOUT:
+		ut->ut_type = DEAD_PROCESS;
+#ifdef _UNICOS
+		cray_retain_utmp(ut, li->pid);
+#endif
+		break;
+	}
+# endif
+	set_utmp_time(li, ut);
+
+	line_stripname(ut->ut_line, li->line, sizeof(ut->ut_line));
+
+# ifdef HAVE_STRUCT_UTMP_UT_PID
+	ut->ut_pid = li->pid;
+# endif
+
+	/* If we're logging out, leave all other fields blank */
+	if (li->type == LTYPE_LOGOUT)
+	  return;
+
+	/*
+	 * These fields are only used when logging in, and are blank
+	 * for logouts.
+	 */
+
+	/* Use strncpy because we don't necessarily want null termination */
+	strncpy(ut->ut_name, li->username, MIN_SIZEOF(ut->ut_name, li->username));
+# ifdef HAVE_STRUCT_UTMP_UT_HOST
+	strncpy(ut->ut_host, li->hostname, MIN_SIZEOF(ut->ut_host, li->hostname));
+# endif
+# ifdef HAVE_STRUCT_UTMP_UT_ADDR
+	/* this is just a 32-bit IP address */
+	if (li->hostaddr.sa.sa_family == AF_INET)
+		ut->ut_addr = li->hostaddr.sa_in.sin_addr.s_addr;
+# endif
+# ifdef HAVE_ADDR_V6_IN_UTMP
+	/* this is just a 128-bit IPv6 address */
+	if (li->hostaddr.sa.sa_family == AF_INET6) {
+		sa6 = ((struct sockaddr_in6 *)&li->hostaddr.sa);
+		memcpy(ut->ut_addr_v6, sa6->sin6_addr.s6_addr, 16);
+		if (IN6_IS_ADDR_V4MAPPED(&sa6->sin6_addr)) {
+			ut->ut_addr_v6[0] = ut->ut_addr_v6[3];
+			ut->ut_addr_v6[1] = 0;
+			ut->ut_addr_v6[2] = 0;
+			ut->ut_addr_v6[3] = 0;
+		}
+	}
+# endif
+}
+#endif /* USE_UTMP || USE_WTMP || USE_LOGIN */
+
+/**
+ ** utmpx utility functions
+ **
+ ** These functions manipulate struct utmpx, accounting for system
+ ** variations.
+ **/
+
+#if defined(USE_UTMPX) || defined (USE_WTMPX)
+/* build the utmpx structure */
+void
+set_utmpx_time(struct logininfo *li, struct utmpx *utx)
+{
+# ifdef HAVE_STRUCT_UTMPX_UT_TV
+	utx->ut_tv.tv_sec = li->tv_sec;
+	utx->ut_tv.tv_usec = li->tv_usec;
+# else /* HAVE_STRUCT_UTMPX_UT_TV */
+#  ifdef HAVE_STRUCT_UTMPX_UT_TIME
+	utx->ut_time = li->tv_sec;
+#  endif /* HAVE_STRUCT_UTMPX_UT_TIME */
+# endif /* HAVE_STRUCT_UTMPX_UT_TV */
+}
+
+void
+construct_utmpx(struct logininfo *li, struct utmpx *utx)
+{
+# ifdef HAVE_ADDR_V6_IN_UTMP
+	struct sockaddr_in6 *sa6;
+#  endif
+	memset(utx, '\0', sizeof(*utx));
+# ifdef HAVE_STRUCT_UTMPX_UT_ID
+	line_abbrevname(utx->ut_id, li->line, sizeof(utx->ut_id));
+# endif
+
+	/* this is done here to keep utmp constants out of loginrec.h */
+	switch (li->type) {
+	case LTYPE_LOGIN:
+		utx->ut_type = USER_PROCESS;
+		break;
+	case LTYPE_LOGOUT:
+		utx->ut_type = DEAD_PROCESS;
+		break;
+	}
+	line_stripname(utx->ut_line, li->line, sizeof(utx->ut_line));
+	set_utmpx_time(li, utx);
+	utx->ut_pid = li->pid;
+	/* strncpy(): Don't necessarily want null termination */
+	strncpy(utx->ut_name, li->username, MIN_SIZEOF(utx->ut_name, li->username));
+
+	if (li->type == LTYPE_LOGOUT)
+		return;
+
+	/*
+	 * These fields are only used when logging in, and are blank
+	 * for logouts.
+	 */
+
+# ifdef HAVE_STRUCT_UTMPX_UT_HOST
+	strncpy(utx->ut_host, li->hostname, MIN_SIZEOF(utx->ut_host, li->hostname));
+# endif
+# ifdef HAVE_STRUCT_UTMPX_UT_ADDR
+	/* this is just a 32-bit IP address */
+	if (li->hostaddr.sa.sa_family == AF_INET)
+		utx->ut_addr = li->hostaddr.sa_in.sin_addr.s_addr;
+# endif
+# ifdef HAVE_ADDR_V6_IN_UTMP
+	/* this is just a 128-bit IPv6 address */
+	if (li->hostaddr.sa.sa_family == AF_INET6) {
+		sa6 = ((struct sockaddr_in6 *)&li->hostaddr.sa);
+		memcpy(ut->ut_addr_v6, sa6->sin6_addr.s6_addr, 16);
+		if (IN6_IS_ADDR_V4MAPPED(&sa6->sin6_addr)) {
+			ut->ut_addr_v6[0] = ut->ut_addr_v6[3];
+			ut->ut_addr_v6[1] = 0;
+			ut->ut_addr_v6[2] = 0;
+			ut->ut_addr_v6[3] = 0;
+		}
+	}
+# endif
+# ifdef HAVE_STRUCT_UTMPX_UT_SYSLEN
+	/* ut_syslen is the length of the utx_host string */
+	utx->ut_syslen = MIN(strlen(li->hostname), sizeof(utx->ut_host));
+# endif
+}
+#endif /* USE_UTMPX || USE_WTMPX */
+
+/**
+ ** Low-level utmp functions
+ **/
+
+/* FIXME: (ATL) utmp_write_direct needs testing */
+#ifdef USE_UTMP
+
+/* if we can, use pututline() etc. */
+# if !defined(DISABLE_PUTUTLINE) && defined(HAVE_SETUTENT) && \
+	defined(HAVE_PUTUTLINE)
+#  define UTMP_USE_LIBRARY
+# endif
+
+
+/* write a utmp entry with the system's help (pututline() and pals) */
+# ifdef UTMP_USE_LIBRARY
+static int
+utmp_write_library(struct logininfo *li, struct utmp *ut)
+{
+	setutent();
+	pututline(ut);
+
+#  ifdef HAVE_ENDUTENT
+	endutent();
+#  endif
+	return 1;
+}
+# else /* UTMP_USE_LIBRARY */
+
+/* write a utmp entry direct to the file */
+/* This is a slightly modification of code in OpenBSD's login.c */
+static int
+utmp_write_direct(struct logininfo *li, struct utmp *ut)
+{
+	struct utmp old_ut;
+	register int fd;
+	int tty;
+
+	/* FIXME: (ATL) ttyslot() needs local implementation */
+
+#if defined(HAVE_GETTTYENT)
+	register struct ttyent *ty;
+
+	tty=0;
+
+	setttyent();
+	while ((struct ttyent *)0 != (ty = getttyent())) {
+		tty++;
+		if (!strncmp(ty->ty_name, ut->ut_line, sizeof(ut->ut_line)))
+			break;
+	}
+	endttyent();
+
+	if((struct ttyent *)0 == ty) {
+		dropbear_log(LOG_WARNING, "utmp_write_entry: tty not found");
+		return(1);
+	}
+#else /* FIXME */
+
+	tty = ttyslot(); /* seems only to work for /dev/ttyp? style names */
+
+#endif /* HAVE_GETTTYENT */
+
+	if (tty > 0 && (fd = open(UTMP_FILE, O_RDWR|O_CREAT, 0644)) >= 0) {
+		(void)lseek(fd, (off_t)(tty * sizeof(struct utmp)), SEEK_SET);
+		/*
+		 * Prevent luser from zero'ing out ut_host.
+		 * If the new ut_line is empty but the old one is not
+		 * and ut_line and ut_name match, preserve the old ut_line.
+		 */
+		if (atomicio(read, fd, &old_ut, sizeof(old_ut)) == sizeof(old_ut) &&
+			(ut->ut_host[0] == '\0') && (old_ut.ut_host[0] != '\0') &&
+			(strncmp(old_ut.ut_line, ut->ut_line, sizeof(ut->ut_line)) == 0) &&
+			(strncmp(old_ut.ut_name, ut->ut_name, sizeof(ut->ut_name)) == 0)) {
+			(void)memcpy(ut->ut_host, old_ut.ut_host, sizeof(ut->ut_host));
+		}
+
+		(void)lseek(fd, (off_t)(tty * sizeof(struct utmp)), SEEK_SET);
+		if (atomicio(write, fd, ut, sizeof(*ut)) != sizeof(*ut))
+			dropbear_log(LOG_WARNING, "utmp_write_direct: error writing %s: %s",
+			    UTMP_FILE, strerror(errno));
+
+		(void)close(fd);
+		return 1;
+	} else {
+		return 0;
+	}
+}
+# endif /* UTMP_USE_LIBRARY */
+
+static int
+utmp_perform_login(struct logininfo *li)
+{
+	struct utmp ut;
+
+	construct_utmp(li, &ut);
+# ifdef UTMP_USE_LIBRARY
+	if (!utmp_write_library(li, &ut)) {
+		dropbear_log(LOG_WARNING, "utmp_perform_login: utmp_write_library() failed");
+		return 0;
+	}
+# else
+	if (!utmp_write_direct(li, &ut)) {
+		dropbear_log(LOG_WARNING, "utmp_perform_login: utmp_write_direct() failed");
+		return 0;
+	}
+# endif
+	return 1;
+}
+
+
+static int
+utmp_perform_logout(struct logininfo *li)
+{
+	struct utmp ut;
+
+	construct_utmp(li, &ut);
+# ifdef UTMP_USE_LIBRARY
+	if (!utmp_write_library(li, &ut)) {
+		dropbear_log(LOG_WARNING, "utmp_perform_logout: utmp_write_library() failed");
+		return 0;
+	}
+# else
+	if (!utmp_write_direct(li, &ut)) {
+		dropbear_log(LOG_WARNING, "utmp_perform_logout: utmp_write_direct() failed");
+		return 0;
+	}
+# endif
+	return 1;
+}
+
+
+int
+utmp_write_entry(struct logininfo *li)
+{
+	switch(li->type) {
+	case LTYPE_LOGIN:
+		return utmp_perform_login(li);
+
+	case LTYPE_LOGOUT:
+		return utmp_perform_logout(li);
+
+	default:
+		dropbear_log(LOG_WARNING, "utmp_write_entry: invalid type field");
+		return 0;
+	}
+}
+#endif /* USE_UTMP */
+
+
+/**
+ ** Low-level utmpx functions
+ **/
+
+/* not much point if we don't want utmpx entries */
+#ifdef USE_UTMPX
+
+/* if we have the wherewithall, use pututxline etc. */
+# if !defined(DISABLE_PUTUTXLINE) && defined(HAVE_SETUTXENT) && \
+	defined(HAVE_PUTUTXLINE)
+#  define UTMPX_USE_LIBRARY
+# endif
+
+
+/* write a utmpx entry with the system's help (pututxline() and pals) */
+# ifdef UTMPX_USE_LIBRARY
+static int
+utmpx_write_library(struct logininfo *li, struct utmpx *utx)
+{
+	setutxent();
+	pututxline(utx);
+
+#  ifdef HAVE_ENDUTXENT
+	endutxent();
+#  endif
+	return 1;
+}
+
+# else /* UTMPX_USE_LIBRARY */
+
+/* write a utmp entry direct to the file */
+static int
+utmpx_write_direct(struct logininfo *li, struct utmpx *utx)
+{
+	dropbear_log(LOG_WARNING, "utmpx_write_direct: not implemented!");
+	return 0;
+}
+# endif /* UTMPX_USE_LIBRARY */
+
+static int
+utmpx_perform_login(struct logininfo *li)
+{
+	struct utmpx utx;
+
+	construct_utmpx(li, &utx);
+# ifdef UTMPX_USE_LIBRARY
+	if (!utmpx_write_library(li, &utx)) {
+		dropbear_log(LOG_WARNING, "utmpx_perform_login: utmp_write_library() failed");
+		return 0;
+	}
+# else
+	if (!utmpx_write_direct(li, &ut)) {
+		dropbear_log(LOG_WARNING, "utmpx_perform_login: utmp_write_direct() failed");
+		return 0;
+	}
+# endif
+	return 1;
+}
+
+
+static int
+utmpx_perform_logout(struct logininfo *li)
+{
+	struct utmpx utx;
+
+	construct_utmpx(li, &utx);
+# ifdef HAVE_STRUCT_UTMPX_UT_ID
+	line_abbrevname(utx.ut_id, li->line, sizeof(utx.ut_id));
+# endif
+# ifdef HAVE_STRUCT_UTMPX_UT_TYPE
+	utx.ut_type = DEAD_PROCESS;
+# endif
+
+# ifdef UTMPX_USE_LIBRARY
+	utmpx_write_library(li, &utx);
+# else
+	utmpx_write_direct(li, &utx);
+# endif
+	return 1;
+}
+
+int
+utmpx_write_entry(struct logininfo *li)
+{
+	switch(li->type) {
+	case LTYPE_LOGIN:
+		return utmpx_perform_login(li);
+	case LTYPE_LOGOUT:
+		return utmpx_perform_logout(li);
+	default:
+		dropbear_log(LOG_WARNING, "utmpx_write_entry: invalid type field");
+		return 0;
+	}
+}
+#endif /* USE_UTMPX */
+
+
+/**
+ ** Low-level wtmp functions
+ **/
+
+#ifdef USE_WTMP
+
+/* write a wtmp entry direct to the end of the file */
+/* This is a slight modification of code in OpenBSD's logwtmp.c */
+static int
+wtmp_write(struct logininfo *li, struct utmp *ut)
+{
+	struct stat buf;
+	int fd, ret = 1;
+
+	if ((fd = open(WTMP_FILE, O_WRONLY|O_APPEND, 0)) < 0) {
+		dropbear_log(LOG_WARNING, "wtmp_write: problem writing %s: %s",
+		    WTMP_FILE, strerror(errno));
+		return 0;
+	}
+	if (fstat(fd, &buf) == 0)
+		if (atomicio(write, fd, ut, sizeof(*ut)) != sizeof(*ut)) {
+			ftruncate(fd, buf.st_size);
+			dropbear_log(LOG_WARNING, "wtmp_write: problem writing %s: %s",
+			    WTMP_FILE, strerror(errno));
+			ret = 0;
+		}
+	(void)close(fd);
+	return ret;
+}
+
+static int
+wtmp_perform_login(struct logininfo *li)
+{
+	struct utmp ut;
+
+	construct_utmp(li, &ut);
+	return wtmp_write(li, &ut);
+}
+
+
+static int
+wtmp_perform_logout(struct logininfo *li)
+{
+	struct utmp ut;
+
+	construct_utmp(li, &ut);
+	return wtmp_write(li, &ut);
+}
+
+
+int
+wtmp_write_entry(struct logininfo *li)
+{
+	switch(li->type) {
+	case LTYPE_LOGIN:
+		return wtmp_perform_login(li);
+	case LTYPE_LOGOUT:
+		return wtmp_perform_logout(li);
+	default:
+		dropbear_log(LOG_WARNING, "wtmp_write_entry: invalid type field");
+		return 0;
+	}
+}
+
+
+/* Notes on fetching login data from wtmp/wtmpx
+ *
+ * Logouts are usually recorded with (amongst other things) a blank
+ * username on a given tty line.  However, some systems (HP-UX is one)
+ * leave all fields set, but change the ut_type field to DEAD_PROCESS.
+ *
+ * Since we're only looking for logins here, we know that the username
+ * must be set correctly. On systems that leave it in, we check for
+ * ut_type==USER_PROCESS (indicating a login.)
+ *
+ * Portability: Some systems may set something other than USER_PROCESS
+ * to indicate a login process. I don't know of any as I write. Also,
+ * it's possible that some systems may both leave the username in
+ * place and not have ut_type.
+ */
+
+/* return true if this wtmp entry indicates a login */
+static int
+wtmp_islogin(struct logininfo *li, struct utmp *ut)
+{
+	if (strncmp(li->username, ut->ut_name,
+		MIN_SIZEOF(li->username, ut->ut_name)) == 0) {
+# ifdef HAVE_STRUCT_UTMP_UT_TYPE
+		if (ut->ut_type & USER_PROCESS)
+			return 1;
+# else
+		return 1;
+# endif
+	}
+	return 0;
+}
+
+int
+wtmp_get_entry(struct logininfo *li)
+{
+	struct stat st;
+	struct utmp ut;
+	int fd, found=0;
+
+	/* Clear the time entries in our logininfo */
+	li->tv_sec = li->tv_usec = 0;
+
+	if ((fd = open(WTMP_FILE, O_RDONLY)) < 0) {
+		dropbear_log(LOG_WARNING, "wtmp_get_entry: problem opening %s: %s",
+		    WTMP_FILE, strerror(errno));
+		return 0;
+	}
+	if (fstat(fd, &st) != 0) {
+		dropbear_log(LOG_WARNING, "wtmp_get_entry: couldn't stat %s: %s",
+		    WTMP_FILE, strerror(errno));
+		close(fd);
+		return 0;
+	}
+
+	/* Seek to the start of the last struct utmp */
+	if (lseek(fd, -(off_t)sizeof(struct utmp), SEEK_END) == -1) {
+		/* Looks like we've got a fresh wtmp file */
+		close(fd);
+		return 0;
+	}
+
+	while (!found) {
+		if (atomicio(read, fd, &ut, sizeof(ut)) != sizeof(ut)) {
+			dropbear_log(LOG_WARNING, "wtmp_get_entry: read of %s failed: %s",
+			    WTMP_FILE, strerror(errno));
+			close (fd);
+			return 0;
+		}
+		if ( wtmp_islogin(li, &ut) ) {
+			found = 1;
+			/* We've already checked for a time in struct
+			 * utmp, in login_getlast(). */
+# ifdef HAVE_STRUCT_UTMP_UT_TIME
+			li->tv_sec = ut.ut_time;
+# else
+#  if HAVE_STRUCT_UTMP_UT_TV
+			li->tv_sec = ut.ut_tv.tv_sec;
+#  endif
+# endif
+			line_fullname(li->line, ut.ut_line,
+				      MIN_SIZEOF(li->line, ut.ut_line));
+# ifdef HAVE_STRUCT_UTMP_UT_HOST
+			strlcpy(li->hostname, ut.ut_host,
+				MIN_SIZEOF(li->hostname, ut.ut_host));
+# endif
+			continue;
+		}
+		/* Seek back 2 x struct utmp */
+		if (lseek(fd, -(off_t)(2 * sizeof(struct utmp)), SEEK_CUR) == -1) {
+			/* We've found the start of the file, so quit */
+			close (fd);
+			return 0;
+		}
+	}
+
+	/* We found an entry. Tidy up and return */
+	close(fd);
+	return 1;
+}
+# endif /* USE_WTMP */
+
+
+/**
+ ** Low-level wtmpx functions
+ **/
+
+#ifdef USE_WTMPX
+/* write a wtmpx entry direct to the end of the file */
+/* This is a slight modification of code in OpenBSD's logwtmp.c */
+static int
+wtmpx_write(struct logininfo *li, struct utmpx *utx)
+{
+	struct stat buf;
+	int fd, ret = 1;
+
+	if ((fd = open(WTMPX_FILE, O_WRONLY|O_APPEND, 0)) < 0) {
+		dropbear_log(LOG_WARNING, "wtmpx_write: problem opening %s: %s",
+		    WTMPX_FILE, strerror(errno));
+		return 0;
+	}
+
+	if (fstat(fd, &buf) == 0)
+		if (atomicio(write, fd, utx, sizeof(*utx)) != sizeof(*utx)) {
+			ftruncate(fd, buf.st_size);
+			dropbear_log(LOG_WARNING, "wtmpx_write: problem writing %s: %s",
+			    WTMPX_FILE, strerror(errno));
+			ret = 0;
+		}
+	(void)close(fd);
+
+	return ret;
+}
+
+
+static int
+wtmpx_perform_login(struct logininfo *li)
+{
+	struct utmpx utx;
+
+	construct_utmpx(li, &utx);
+	return wtmpx_write(li, &utx);
+}
+
+
+static int
+wtmpx_perform_logout(struct logininfo *li)
+{
+	struct utmpx utx;
+
+	construct_utmpx(li, &utx);
+	return wtmpx_write(li, &utx);
+}
+
+
+int
+wtmpx_write_entry(struct logininfo *li)
+{
+	switch(li->type) {
+	case LTYPE_LOGIN:
+		return wtmpx_perform_login(li);
+	case LTYPE_LOGOUT:
+		return wtmpx_perform_logout(li);
+	default:
+		dropbear_log(LOG_WARNING, "wtmpx_write_entry: invalid type field");
+		return 0;
+	}
+}
+
+/* Please see the notes above wtmp_islogin() for information about the
+   next two functions */
+
+/* Return true if this wtmpx entry indicates a login */
+static int
+wtmpx_islogin(struct logininfo *li, struct utmpx *utx)
+{
+	if ( strncmp(li->username, utx->ut_name,
+		MIN_SIZEOF(li->username, utx->ut_name)) == 0 ) {
+# ifdef HAVE_STRUCT_UTMPX_UT_TYPE
+		if (utx->ut_type == USER_PROCESS)
+			return 1;
+# else
+		return 1;
+# endif
+	}
+	return 0;
+}
+
+
+int
+wtmpx_get_entry(struct logininfo *li)
+{
+	struct stat st;
+	struct utmpx utx;
+	int fd, found=0;
+
+	/* Clear the time entries */
+	li->tv_sec = li->tv_usec = 0;
+
+	if ((fd = open(WTMPX_FILE, O_RDONLY)) < 0) {
+		dropbear_log(LOG_WARNING, "wtmpx_get_entry: problem opening %s: %s",
+		    WTMPX_FILE, strerror(errno));
+		return 0;
+	}
+	if (fstat(fd, &st) != 0) {
+		dropbear_log(LOG_WARNING, "wtmpx_get_entry: couldn't stat %s: %s",
+		    WTMPX_FILE, strerror(errno));
+		close(fd);
+		return 0;
+	}
+
+	/* Seek to the start of the last struct utmpx */
+	if (lseek(fd, -(off_t)sizeof(struct utmpx), SEEK_END) == -1 ) {
+		/* probably a newly rotated wtmpx file */
+		close(fd);
+		return 0;
+	}
+
+	while (!found) {
+		if (atomicio(read, fd, &utx, sizeof(utx)) != sizeof(utx)) {
+			dropbear_log(LOG_WARNING, "wtmpx_get_entry: read of %s failed: %s",
+			    WTMPX_FILE, strerror(errno));
+			close (fd);
+			return 0;
+		}
+		/* Logouts are recorded as a blank username on a particular line.
+		 * So, we just need to find the username in struct utmpx */
+		if ( wtmpx_islogin(li, &utx) ) {
+			found = 1;
+# ifdef HAVE_STRUCT_UTMPX_UT_TV
+			li->tv_sec = utx.ut_tv.tv_sec;
+# else
+#  ifdef HAVE_STRUCT_UTMPX_UT_TIME
+			li->tv_sec = utx.ut_time;
+#  endif
+# endif
+			line_fullname(li->line, utx.ut_line, sizeof(li->line));
+# ifdef HAVE_STRUCT_UTMPX_UT_HOST
+			strlcpy(li->hostname, utx.ut_host,
+				MIN_SIZEOF(li->hostname, utx.ut_host));
+# endif
+			continue;
+		}
+		if (lseek(fd, -(off_t)(2 * sizeof(struct utmpx)), SEEK_CUR) == -1) {
+			close (fd);
+			return 0;
+		}
+	}
+
+	close(fd);
+	return 1;
+}
+#endif /* USE_WTMPX */
+
+/**
+ ** Low-level libutil login() functions
+ **/
+
+#ifdef USE_LOGIN
+static int
+syslogin_perform_login(struct logininfo *li)
+{
+	struct utmp *ut;
+
+	if (! (ut = (struct utmp *)malloc(sizeof(*ut)))) {
+		dropbear_log(LOG_WARNING, "syslogin_perform_login: couldn't malloc()");
+		return 0;
+	}
+	construct_utmp(li, ut);
+	login(ut);
+	free(ut);
+
+	return 1;
+}
+
+static int
+syslogin_perform_logout(struct logininfo *li)
+{
+# ifdef HAVE_LOGOUT
+	char line[8];
+
+	(void)line_stripname(line, li->line, sizeof(line));
+
+	if (!logout(line)) {
+		dropbear_log(LOG_WARNING, "syslogin_perform_logout: logout(%s) returned an error: %s", line, strerror(errno));
+#  ifdef HAVE_LOGWTMP
+	} else {
+		logwtmp(line, "", "");
+#  endif
+	}
+	/* FIXME: (ATL - if the need arises) What to do if we have
+	 * login, but no logout?  what if logout but no logwtmp? All
+	 * routines are in libutil so they should all be there,
+	 * but... */
+# endif
+	return 1;
+}
+
+int
+syslogin_write_entry(struct logininfo *li)
+{
+	switch (li->type) {
+	case LTYPE_LOGIN:
+		return syslogin_perform_login(li);
+	case LTYPE_LOGOUT:
+		return syslogin_perform_logout(li);
+	default:
+		dropbear_log(LOG_WARNING, "syslogin_write_entry: Invalid type field");
+		return 0;
+	}
+}
+#endif /* USE_LOGIN */
+
+/* end of file log-syslogin.c */
+
+/**
+ ** Low-level lastlog functions
+ **/
+
+#ifdef USE_LASTLOG
+#define LL_FILE 1
+#define LL_DIR 2
+#define LL_OTHER 3
+
+static void
+lastlog_construct(struct logininfo *li, struct lastlog *last)
+{
+	/* clear the structure */
+	memset(last, '\0', sizeof(*last));
+
+	(void)line_stripname(last->ll_line, li->line, sizeof(last->ll_line));
+	strlcpy(last->ll_host, li->hostname,
+		MIN_SIZEOF(last->ll_host, li->hostname));
+	last->ll_time = li->tv_sec;
+}
+
+static int
+lastlog_filetype(char *filename)
+{
+	struct stat st;
+
+	if (stat(filename, &st) != 0) {
+		dropbear_log(LOG_WARNING, "lastlog_perform_login: Couldn't stat %s: %s", filename,
+			strerror(errno));
+		return 0;
+	}
+	if (S_ISDIR(st.st_mode))
+		return LL_DIR;
+	else if (S_ISREG(st.st_mode))
+		return LL_FILE;
+	else
+		return LL_OTHER;
+}
+
+
+/* open the file (using filemode) and seek to the login entry */
+static int
+lastlog_openseek(struct logininfo *li, int *fd, int filemode)
+{
+	off_t offset;
+	int type;
+	char lastlog_file[1024];
+
+	type = lastlog_filetype(LASTLOG_FILE);
+	switch (type) {
+		case LL_FILE:
+			strlcpy(lastlog_file, LASTLOG_FILE, sizeof(lastlog_file));
+			break;
+		case LL_DIR:
+			snprintf(lastlog_file, sizeof(lastlog_file), "%s/%s",
+				 LASTLOG_FILE, li->username);
+			break;
+		default:
+			dropbear_log(LOG_WARNING, "lastlog_openseek: %.100s is not a file or directory!",
+			    LASTLOG_FILE);
+			return 0;
+	}
+
+	*fd = open(lastlog_file, filemode);
+	if ( *fd < 0) {
+		dropbear_log(LOG_INFO, "lastlog_openseek: Couldn't open %s: %s",
+		    lastlog_file, strerror(errno));
+		return 0;
+	}
+
+	if (type == LL_FILE) {
+		/* find this uid's offset in the lastlog file */
+		offset = (off_t) ((long)li->uid * sizeof(struct lastlog));
+
+		if ( lseek(*fd, offset, SEEK_SET) != offset ) {
+			dropbear_log(LOG_WARNING, "lastlog_openseek: %s->lseek(): %s",
+			 lastlog_file, strerror(errno));
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+static int
+lastlog_perform_login(struct logininfo *li)
+{
+	struct lastlog last;
+	int fd;
+
+	/* create our struct lastlog */
+	lastlog_construct(li, &last);
+
+	if (!lastlog_openseek(li, &fd, O_RDWR|O_CREAT))
+		return(0);
+
+	/* write the entry */
+	if (atomicio(write, fd, &last, sizeof(last)) != sizeof(last)) {
+		close(fd);
+		dropbear_log(LOG_WARNING, "lastlog_write_filemode: Error writing to %s: %s",
+		    LASTLOG_FILE, strerror(errno));
+		return 0;
+	}
+
+	close(fd);
+	return 1;
+}
+
+int
+lastlog_write_entry(struct logininfo *li)
+{
+	switch(li->type) {
+	case LTYPE_LOGIN:
+		return lastlog_perform_login(li);
+	default:
+		dropbear_log(LOG_WARNING, "lastlog_write_entry: Invalid type field");
+		return 0;
+	}
+}
+
+#endif /* USE_LASTLOG */
diff --git a/loginrec.h b/loginrec.h
new file mode 100644
index 0000000..03d26b9
--- /dev/null
+++ b/loginrec.h
@@ -0,0 +1,185 @@
+#ifndef _HAVE_LOGINREC_H_
+#define _HAVE_LOGINREC_H_
+
+/*
+ * Copyright (c) 2000 Andre Lucas.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ ** loginrec.h:  platform-independent login recording and lastlog retrieval
+ **/
+
+#include "includes.h"
+
+/* RCSID("Id: loginrec.h,v 1.2 2004/05/04 10:17:43 matt Exp "); */
+
+/* The following #defines are from OpenSSH's defines.h, required for loginrec */
+
+/* FIXME: put default paths back in */
+#ifndef UTMP_FILE
+#  ifdef _PATH_UTMP
+#    define UTMP_FILE _PATH_UTMP
+#  else
+#    ifdef CONF_UTMP_FILE
+#      define UTMP_FILE CONF_UTMP_FILE
+#    endif
+#  endif
+#endif
+#ifndef WTMP_FILE
+#  ifdef _PATH_WTMP
+#    define WTMP_FILE _PATH_WTMP
+#  else
+#    ifdef CONF_WTMP_FILE
+#      define WTMP_FILE CONF_WTMP_FILE
+#    endif
+#  endif
+#endif
+/* pick up the user's location for lastlog if given */
+#ifndef LASTLOG_FILE
+#  ifdef _PATH_LASTLOG
+#    define LASTLOG_FILE _PATH_LASTLOG
+#  else
+#    ifdef CONF_LASTLOG_FILE
+#      define LASTLOG_FILE CONF_LASTLOG_FILE
+#    endif
+#  endif
+#endif
+
+
+/* The login() library function in libutil is first choice */
+#if defined(HAVE_LOGIN) && !defined(DISABLE_LOGIN)
+#  define USE_LOGIN
+
+#else
+/* Simply select your favourite login types. */
+/* Can't do if-else because some systems use several... <sigh> */
+#  if defined(UTMPX_FILE) && !defined(DISABLE_UTMPX)
+#    define USE_UTMPX
+#  endif
+#  if defined(UTMP_FILE) && !defined(DISABLE_UTMP)
+#    define USE_UTMP
+#  endif
+#  if defined(WTMPX_FILE) && !defined(DISABLE_WTMPX)
+#    define USE_WTMPX
+#  endif
+#  if defined(WTMP_FILE) && !defined(DISABLE_WTMP)
+#    define USE_WTMP
+#  endif
+
+#endif
+
+/* I hope that the presence of LASTLOG_FILE is enough to detect this */
+#if defined(LASTLOG_FILE) && !defined(DISABLE_LASTLOG)
+#  define USE_LASTLOG
+#endif
+
+
+/**
+ ** you should use the login_* calls to work around platform dependencies
+ **/
+
+/*
+ * login_netinfo structure
+ */
+
+union login_netinfo {
+	struct sockaddr sa;
+	struct sockaddr_in sa_in;
+#ifdef HAVE_STRUCT_SOCKADDR_STORAGE
+	struct sockaddr_storage sa_storage;
+#endif
+};
+
+/*
+ *   * logininfo structure  *
+ */
+/* types - different to utmp.h 'type' macros */
+/* (though set to the same value as linux, openbsd and others...) */
+#define LTYPE_LOGIN    7
+#define LTYPE_LOGOUT   8
+
+/* string lengths - set very long */
+#define LINFO_PROGSIZE 64
+#define LINFO_LINESIZE 64
+#define LINFO_NAMESIZE 64
+#define LINFO_HOSTSIZE 256
+
+struct logininfo {
+	char       progname[LINFO_PROGSIZE];     /* name of program (for PAM) */
+	int        progname_null;
+	short int  type;                         /* type of login (LTYPE_*) */
+	int        pid;                          /* PID of login process */
+	int        uid;                          /* UID of this user */
+	char       line[LINFO_LINESIZE];         /* tty/pty name */
+	char       username[LINFO_NAMESIZE];     /* login username */
+	char       hostname[LINFO_HOSTSIZE];     /* remote hostname */
+	/* 'exit_status' structure components */
+	int        exit;                        /* process exit status */
+	int        termination;                 /* process termination status */
+	/* struct timeval (sys/time.h) isn't always available, if it isn't we'll
+	 * use time_t's value as tv_sec and set tv_usec to 0
+	 */
+	unsigned int tv_sec;
+	unsigned int tv_usec;
+	union login_netinfo hostaddr;       /* caller's host address(es) */
+}; /* struct logininfo */
+
+/*
+ * login recording functions
+ */
+
+/** 'public' functions */
+
+struct logininfo *login_alloc_entry(int pid, const char *username,
+				    const char *hostname, const char *line);
+/* free a structure */
+void login_free_entry(struct logininfo *li);
+/* fill out a pre-allocated structure with useful information */
+int login_init_entry(struct logininfo *li, int pid, const char *username,
+		     const char *hostname, const char *line);
+/* place the current time in a logininfo struct */
+void login_set_current_time(struct logininfo *li);
+
+/* record the entry */
+int login_login (struct logininfo *li);
+int login_logout(struct logininfo *li);
+#ifdef LOGIN_NEEDS_UTMPX
+int login_utmp_only(struct logininfo *li);
+#endif
+
+/** End of public functions */
+
+/* record the entry */
+int login_write (struct logininfo *li);
+int login_log_entry(struct logininfo *li);
+
+/* set the network address based on network address type */
+void login_set_addr(struct logininfo *li, const struct sockaddr *sa,
+		    const unsigned int sa_size);
+
+/* produce various forms of the line filename */
+char *line_fullname(char *dst, const char *src, size_t dstsize);
+char *line_stripname(char *dst, const char *src, size_t dstsize);
+char *line_abbrevname(char *dst, const char *src, size_t dstsize);
+
+#endif /* _HAVE_LOGINREC_H_ */
diff --git a/options.h b/options.h
new file mode 100644
index 0000000..1feae40
--- /dev/null
+++ b/options.h
@@ -0,0 +1,401 @@
+/* Dropbear SSH
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved. See LICENSE for the license. */
+
+#ifndef _OPTIONS_H_
+#define _OPTIONS_H_
+
+/******************************************************************
+ * Define compile-time options below - the "#ifndef DROPBEAR_XXX .... #endif"
+ * parts are to allow for commandline -DDROPBEAR_XXX options etc.
+ ******************************************************************/
+
+#ifndef DROPBEAR_DEFPORT
+#define DROPBEAR_DEFPORT "22"
+#endif
+
+/* Default hostkey paths - these can be specified on the command line */
+#ifndef DSS_PRIV_FILENAME
+#define DSS_PRIV_FILENAME "/etc/dropbear/dropbear_dss_host_key"
+#endif
+#ifndef RSA_PRIV_FILENAME
+#define RSA_PRIV_FILENAME "/etc/dropbear/dropbear_rsa_host_key"
+#endif
+
+/* Set NON_INETD_MODE if you require daemon functionality (ie Dropbear listens
+ * on chosen ports and keeps accepting connections. This is the default.
+ *
+ * Set INETD_MODE if you want to be able to run Dropbear with inetd (or
+ * similar), where it will use stdin/stdout for connections, and each process
+ * lasts for a single connection. Dropbear should be invoked with the -i flag
+ * for inetd, and can only accept IPv4 connections.
+ *
+ * Both of these flags can be defined at once, don't compile without at least
+ * one of them. */
+#define NON_INETD_MODE
+#define INETD_MODE
+
+/* Setting this disables the fast exptmod bignum code. It saves ~5kB, but is
+ * perhaps 20% slower for pubkey operations (it is probably worth experimenting
+ * if you want to use this) */
+/*#define NO_FAST_EXPTMOD*/
+
+/* Set this if you want to use the DROPBEAR_SMALL_CODE option. This can save
+several kB in binary size, however will make the symmetrical ciphers (AES, DES
+etc) slower (perhaps by 50%). Recommended for most small systems. */
+#define DROPBEAR_SMALL_CODE
+
+/* Enable X11 Forwarding - server only */
+#define ENABLE_X11FWD
+
+/* Enable TCP Fowarding */
+/* 'Local' is "-L" style (client listening port forwarded via server)
+ * 'Remote' is "-R" style (server listening port forwarded via client) */
+
+#define ENABLE_CLI_LOCALTCPFWD
+#define ENABLE_CLI_REMOTETCPFWD
+
+#define ENABLE_SVR_LOCALTCPFWD
+#define ENABLE_SVR_REMOTETCPFWD
+
+/* Enable Authentication Agent Forwarding - server only for now */
+#define ENABLE_AGENTFWD
+
+/* Encryption - at least one required.
+ * RFC Draft requires 3DES and recommends AES128 for interoperability.
+ * Including multiple keysize variants the same cipher 
+ * (eg AES256 as well as AES128) will result in a minimal size increase.*/
+#define DROPBEAR_AES128_CBC
+#define DROPBEAR_3DES_CBC
+#define DROPBEAR_AES256_CBC
+#define DROPBEAR_BLOWFISH_CBC
+#define DROPBEAR_TWOFISH256_CBC
+#define DROPBEAR_TWOFISH128_CBC
+
+/* Message Integrity - at least one required.
+ * RFC Draft requires sha1 and recommends sha1-96.
+ * sha1-96 may be of use for slow links, as it has a smaller overhead.
+ *
+ * Note: there's no point disabling sha1 to save space, since it's used
+ * for the random number generator and public-key cryptography anyway.
+ * Disabling it here will just stop it from being used as the integrity portion
+ * of the ssh protocol.
+ *
+ * These hashes are also used for public key fingerprints in logs.
+ * If you disable MD5, Dropbear will fall back to SHA1 fingerprints,
+ * which are not the standard form. */
+#define DROPBEAR_SHA1_HMAC
+#define DROPBEAR_SHA1_96_HMAC
+#define DROPBEAR_MD5_HMAC
+
+/* Hostkey/public key algorithms - at least one required, these are used
+ * for hostkey as well as for verifying signatures with pubkey auth.
+ * Removing either of these won't save very much space.
+ * SSH2 RFC Draft requires dss, recommends rsa */
+#define DROPBEAR_RSA
+#define DROPBEAR_DSS
+
+/* RSA can be vulnerable to timing attacks which use the time required for
+ * signing to guess the private key. Blinding avoids this attack, though makes
+ * signing operations slightly slower. */
+#define RSA_BLINDING
+
+/* Define DSS_PROTOK to use PuTTY's method of generating the value k for dss,
+ * rather than just from the random byte source. Undefining this will save you
+ * ~4k in binary size with static uclibc, but your DSS hostkey could be exposed
+ * if the random number source isn't good. In general this isn't required */
+/* #define DSS_PROTOK */
+
+/* Whether to do reverse DNS lookups. */
+#define DO_HOST_LOOKUP
+
+/* Whether to print the message of the day (MOTD). This doesn't add much code
+ * size */
+#define DO_MOTD
+
+/* The MOTD file path */
+#ifndef MOTD_FILENAME
+#define MOTD_FILENAME "/etc/motd"
+#endif
+
+/* Authentication Types - at least one required.
+   RFC Draft requires pubkey auth, and recommends password */
+
+/* Note: PAM auth is quite simple, and only works for PAM modules which just do
+ * a simple "Login: " "Password: " (you can edit the strings in svr-authpam.c).
+ * It's useful for systems like OS X where standard password crypts don't work,
+ * but there's an interface via a PAM module - don't bother using it otherwise.
+ * You can't enable both PASSWORD and PAM. */
+
+#define ENABLE_SVR_PASSWORD_AUTH
+/*#define ENABLE_SVR_PAM_AUTH*/
+#define ENABLE_SVR_PUBKEY_AUTH
+
+#define ENABLE_CLI_PASSWORD_AUTH
+#define ENABLE_CLI_PUBKEY_AUTH
+#define ENABLE_CLI_INTERACT_AUTH
+
+/* Define this (as well as ENABLE_CLI_PASSWORD_AUTH) to allow the use of
+ * a helper program for the ssh client. The helper program should be
+ * specified in the SSH_ASKPASS environment variable, and dbclient
+ * should be run with DISPLAY set and no tty. The program should
+ * return the password on standard output */
+/*#define ENABLE_CLI_ASKPASS_HELPER*/
+
+/* Random device to use - define either DROPBEAR_RANDOM_DEV or
+ * DROPBEAR_PRNGD_SOCKET.
+ * DROPBEAR_RANDOM_DEV is recommended on hosts with a good /dev/(u)random,
+ * otherwise use run prngd (or egd if you want), specifying the socket. 
+ * The device will be queried for a few dozen bytes of seed a couple of times
+ * per session (or more for very long-lived sessions). */
+
+/* If you are lacking entropy on the system then using /dev/urandom
+ * will prevent Dropbear from blocking on the device. This could
+ * however significantly reduce the security of your ssh connections
+ * if the PRNG state becomes guessable - make sure you know what you are
+ * doing if you change this. */
+#define DROPBEAR_RANDOM_DEV "/dev/random"
+
+/* prngd must be manually set up to produce output */
+/*#define DROPBEAR_PRNGD_SOCKET "/var/run/dropbear-rng"*/
+
+/* Specify the number of clients we will allow to be connected but
+ * not yet authenticated. After this limit, connections are rejected */
+/* The first setting is per-IP, to avoid denial of service */
+#ifndef MAX_UNAUTH_PER_IP
+#define MAX_UNAUTH_PER_IP 5
+#endif
+
+/* And then a global limit to avoid chewing memory if connections 
+ * come from many IPs */
+#ifndef MAX_UNAUTH_CLIENTS
+#define MAX_UNAUTH_CLIENTS 30
+#endif
+
+/* Maximum number of failed authentication tries (server option) */
+#ifndef MAX_AUTH_TRIES
+#define MAX_AUTH_TRIES 10
+#endif
+
+/* The file to store the daemon's process ID, for shutdown scripts etc */
+#ifndef DROPBEAR_PIDFILE
+#define DROPBEAR_PIDFILE "/var/run/dropbear.pid"
+#endif
+
+/* The command to invoke for xauth when using X11 forwarding.
+ * "-q" for quiet */
+#ifndef XAUTH_COMMAND
+#define XAUTH_COMMAND "/usr/X11R6/bin/xauth -q"
+#endif
+
+/* if you want to enable running an sftp server (such as the one included with
+ * OpenSSH), set the path below. If the path isn't defined, sftp will not
+ * be enabled */
+#ifndef SFTPSERVER_PATH
+#define SFTPSERVER_PATH "/usr/libexec/sftp-server"
+#endif
+
+/* This is used by the scp binary when used as a client binary. If you're
+ * not using the Dropbear client, you'll need to change it */
+#define _PATH_SSH_PROGRAM "/usr/bin/dbclient"
+
+/* Multi-purpose binary configuration has now moved. Look at the top
+ * of the Makefile for instructions, or INSTALL */
+
+/*******************************************************************
+ * You shouldn't edit below here unless you know you need to.
+ *******************************************************************/
+
+#ifndef DROPBEAR_VERSION
+#define DROPBEAR_VERSION "0.47"
+#endif
+
+#define LOCAL_IDENT "SSH-2.0-dropbear_" DROPBEAR_VERSION
+#define PROGNAME "dropbear"
+
+/* Spec recommends after one hour or 1 gigabyte of data. One hour
+ * is a bit too verbose, so we try 8 hours */
+#ifndef KEX_REKEY_TIMEOUT
+#define KEX_REKEY_TIMEOUT (3600 * 8)
+#endif
+#ifndef KEX_REKEY_DATA
+#define KEX_REKEY_DATA (1<<30) /* 2^30 == 1GB, this value must be < INT_MAX */
+#endif
+/* Close connections to clients which haven't authorised after AUTH_TIMEOUT */
+#ifndef AUTH_TIMEOUT
+#define AUTH_TIMEOUT 300 /* we choose 5 minutes */
+#endif
+
+/* Minimum key sizes for DSS and RSA */
+#ifndef MIN_DSS_KEYLEN
+#define MIN_DSS_KEYLEN 512
+#endif
+#ifndef MIN_RSA_KEYLEN
+#define MIN_RSA_KEYLEN 512
+#endif
+
+#define MAX_BANNER_SIZE 2000 /* this is 25*80 chars, any more is foolish */
+#define MAX_BANNER_LINES 20 /* How many lines the client will display */
+
+/* the number of NAME=VALUE pairs to malloc for environ, if we don't have
+ * the clearenv() function */
+#define ENV_SIZE 100
+
+#define MAX_CMD_LEN 1024 /* max length of a command */
+#define MAX_TERM_LEN 200 /* max length of TERM name */
+
+#define MAX_HOST_LEN 254 /* max hostname len for tcp fwding */
+#define MAX_IP_LEN 15 /* strlen("255.255.255.255") == 15 */
+
+#define DROPBEAR_MAX_PORTS 10 /* max number of ports which can be specified,
+								 ipv4 and ipv6 don't count twice */
+
+#define _PATH_TTY "/dev/tty"
+
+/* Timeouts in seconds */
+#define SELECT_TIMEOUT 20
+
+/* success/failure defines */
+#define DROPBEAR_SUCCESS 0
+#define DROPBEAR_FAILURE -1
+
+/* various algorithm identifiers */
+#define DROPBEAR_KEX_DH_GROUP1 0
+
+#define DROPBEAR_SIGNKEY_ANY 0
+#define DROPBEAR_SIGNKEY_RSA 1
+#define DROPBEAR_SIGNKEY_DSS 2
+#define DROPBEAR_SIGNKEY_NONE 3
+
+#define DROPBEAR_COMP_NONE 0
+#define DROPBEAR_COMP_ZLIB 1
+
+/* Required for pubkey auth */
+#if defined(ENABLE_SVR_PUBKEY_AUTH) || defined(DROPBEAR_CLIENT)
+#define DROPBEAR_SIGNKEY_VERIFY
+#endif
+
+/* SHA1 is 20 bytes == 160 bits */
+#define SHA1_HASH_SIZE 20
+/* SHA512 is 64 bytes == 512 bits */
+#define SHA512_HASH_SIZE 64
+/* MD5 is 16 bytes = 128 bits */
+#define MD5_HASH_SIZE 16
+
+/* largest of MD5 and SHA1 */
+#define MAX_MAC_LEN SHA1_HASH_SIZE
+
+
+#define MAX_KEY_LEN 32 /* 256 bits for aes256 etc */
+#define MAX_IV_LEN 20 /* must be same as max blocksize, 
+						 and >= SHA1_HASH_SIZE */
+#define MAX_MAC_KEY 20
+
+#define MAX_NAME_LEN 64 /* maximum length of a protocol name, isn't
+						   explicitly specified for all protocols (just
+						   for algos) but seems valid */
+
+#define MAX_PROPOSED_ALGO 20
+
+/* size/count limits */
+#define MAX_LISTEN_ADDR 10
+
+#define MAX_PACKET_LEN 35000
+#define MIN_PACKET_LEN 16
+#define MAX_PAYLOAD_LEN 32768
+
+#define MAX_TRANS_PAYLOAD_LEN 32768
+#define MAX_TRANS_PACKET_LEN (MAX_TRANS_PAYLOAD_LEN+50)
+
+#define MAX_TRANS_WINDOW 500000000 /* 500MB is sufficient, stopping overflow */
+#define MAX_TRANS_WIN_INCR 500000000 /* overflow prevention */
+
+#define MAX_STRING_LEN 1400 /* ~= MAX_PROPOSED_ALGO * MAX_NAME_LEN, also
+							   is the max length for a password etc */
+
+/* For a 4096 bit DSS key, empirically determined */
+#define MAX_PUBKEY_SIZE 1700
+/* For a 4096 bit DSS key, empirically determined */
+#define MAX_PRIVKEY_SIZE 1700
+
+/* The maximum size of the bignum portion of the kexhash buffer */
+/* Sect. 8 of the transport draft, K_S + e + f + K */
+#define KEXHASHBUF_MAX_INTS (1700 + 130 + 130 + 130)
+
+#define DROPBEAR_MAX_SOCKS 2 /* IPv4, IPv6 are all we'll get for now. Revisit
+								in a few years time.... */
+
+#define DROPBEAR_MAX_CLI_PASS 1024
+
+#define DROPBEAR_MAX_CLI_INTERACT_PROMPTS 80 /* The number of prompts we'll 
+												accept for keyb-interactive
+												auth */
+
+#if defined(DROPBEAR_AES256_CBC) || defined(DROPBEAR_AES128_CBC)
+#define DROPBEAR_AES_CBC
+#endif
+
+#if defined(DROPBEAR_TWOFISH256_CBC) || defined(DROPBEAR_TWOFISH128_CBC)
+#define DROPBEAR_TWOFISH_CBC
+#endif
+
+#ifndef ENABLE_X11FWD
+#define DISABLE_X11FWD
+#endif
+
+#ifndef ENABLE_AGENTFWD
+#define DISABLE_AGENTFWD
+#endif
+
+#if defined(ENABLE_CLI_REMOTETCPFWD) || defined(ENABLE_CLI_LOCALTCPFWD)
+#define ENABLE_CLI_ANYTCPFWD 
+#endif
+
+#if defined(ENABLE_CLI_LOCALTCPFWD) || defined(ENABLE_SVR_REMOTETCPFWD)
+#define DROPBEAR_TCP_ACCEPT
+#endif
+
+#if defined(ENABLE_CLI_REMOTETCPFWD) || defined(ENABLE_CLI_LOCALTCPFWD) || \
+	defined(ENABLE_SVR_REMOTETCPFWD) || defined(ENABLE_SVR_LOCALTCPFWD) || \
+	defined(ENABLE_AGENTFWD) || defined(ENABLE_X11FWD)
+#define USING_LISTENERS
+#endif
+
+#if defined(DROPBEAR_CLIENT) || defined(ENABLE_SVR_PUBKEY_AUTH)
+#define DROPBEAR_KEY_LINES /* ie we're using authorized_keys or known_hosts */
+#endif
+
+#if defined(ENABLE_SVR_PASSWORD_AUTH) && defined(ENABLE_SVR_PAM_AUTH)
+#error "You can't turn on PASSWORD and PAM auth both at once. Fix it in options.h"
+#endif
+
+#if defined(DROPBEAR_RANDOM_DEV) && defined(DROPBEAR_PRNGD_SOCKET)
+#error "You can't turn on DROPBEAR_PRNGD_SOCKET and DROPBEAR_RANDOM_DEV at once"
+#endif
+
+#if !defined(DROPBEAR_RANDOM_DEV) && !defined(DROPBEAR_PRNGD_SOCKET)
+#error "You must choose one of DROPBEAR_PRNGD_SOCKET or DROPBEAR_RANDOM_DEV in options.h"
+#endif
+
+/* We use dropbear_client and dropbear_server as shortcuts to avoid redundant
+ * code, if we're just compiling as client or server */
+#if defined(DROPBEAR_SERVER) && defined(DROPBEAR_CLIENT)
+
+#define IS_DROPBEAR_SERVER (ses.isserver == 1)
+#define IS_DROPBEAR_CLIENT (ses.isserver == 0)
+
+#elif defined(DROPBEAR_SERVER)
+
+#define IS_DROPBEAR_SERVER 1
+#define IS_DROPBEAR_CLIENT 0
+
+#elif defined(DROPBEAR_CLIENT)
+
+#define IS_DROPBEAR_SERVER 0
+#define IS_DROPBEAR_CLIENT 1
+
+#else
+#error You must compiled with either DROPBEAR_CLIENT or DROPBEAR_SERVER selected
+#endif
+
+#endif /* _OPTIONS_H_ */
diff --git a/packet.c b/packet.c
new file mode 100644
index 0000000..b2c6174
--- /dev/null
+++ b/packet.c
@@ -0,0 +1,613 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "packet.h"
+#include "session.h"
+#include "dbutil.h"
+#include "ssh.h"
+#include "algo.h"
+#include "buffer.h"
+#include "kex.h"
+#include "random.h"
+#include "service.h"
+#include "auth.h"
+#include "channel.h"
+
+static void read_packet_init();
+static void writemac(buffer * outputbuffer, buffer * clearwritebuf);
+static int checkmac(buffer* hashbuf, buffer* readbuf);
+
+#define ZLIB_COMPRESS_INCR 20 /* this is 12 bytes + 0.1% of 8000 bytes */
+#define ZLIB_DECOMPRESS_INCR 100
+#ifndef DISABLE_ZLIB
+static buffer* buf_decompress(buffer* buf, unsigned int len);
+static void buf_compress(buffer * dest, buffer * src, unsigned int len);
+#endif
+
+/* non-blocking function writing out a current encrypted packet */
+void write_packet() {
+
+	int len, written;
+	buffer * writebuf = NULL;
+	
+	TRACE(("enter write_packet"))
+	dropbear_assert(!isempty(&ses.writequeue));
+
+	/* Get the next buffer in the queue of encrypted packets to write*/
+	writebuf = (buffer*)examine(&ses.writequeue);
+
+	len = writebuf->len - writebuf->pos;
+	dropbear_assert(len > 0);
+	/* Try to write as much as possible */
+	written = write(ses.sock, buf_getptr(writebuf, len), len);
+
+	if (written < 0) {
+		if (errno == EINTR) {
+			TRACE(("leave writepacket: EINTR"))
+			return;
+		} else {
+			dropbear_exit("error writing");
+		}
+	} 
+
+	if (written == 0) {
+		ses.remoteclosed();
+	}
+
+	if (written == len) {
+		/* We've finished with the packet, free it */
+		dequeue(&ses.writequeue);
+		buf_free(writebuf);
+		writebuf = NULL;
+	} else {
+		/* More packet left to write, leave it in the queue for later */
+		buf_incrpos(writebuf, written);
+	}
+
+	TRACE(("leave write_packet"))
+}
+
+/* Non-blocking function reading available portion of a packet into the
+ * ses's buffer, decrypting the length if encrypted, decrypting the
+ * full portion if possible */
+void read_packet() {
+
+	int len;
+	unsigned int maxlen;
+	unsigned char blocksize;
+
+	TRACE(("enter read_packet"))
+	blocksize = ses.keys->recv_algo_crypt->blocksize;
+	
+	if (ses.readbuf == NULL || ses.readbuf->len < blocksize) {
+		/* In the first blocksize of a packet */
+
+		/* Read the first blocksize of the packet, so we can decrypt it and
+		 * find the length of the whole packet */
+		read_packet_init();
+
+		/* If we don't have the length of decryptreadbuf, we didn't read
+		 * a whole blocksize and should exit */
+		if (ses.decryptreadbuf->len == 0) {
+			TRACE(("leave read_packet: packetinit done"))
+			return;
+		}
+	}
+
+	/* Attempt to read the remainder of the packet, note that there
+	 * mightn't be any available (EAGAIN) */
+	dropbear_assert(ses.readbuf != NULL);
+	maxlen = ses.readbuf->len - ses.readbuf->pos;
+	len = read(ses.sock, buf_getptr(ses.readbuf, maxlen), maxlen);
+
+	if (len == 0) {
+		ses.remoteclosed();
+	}
+
+	if (len < 0) {
+		if (errno == EINTR || errno == EAGAIN) {
+			TRACE(("leave read_packet: EINTR or EAGAIN"))
+			return;
+		} else {
+			dropbear_exit("error reading: %s", strerror(errno));
+		}
+	}
+
+	buf_incrpos(ses.readbuf, len);
+
+	if ((unsigned int)len == maxlen) {
+		/* The whole packet has been read */
+		decrypt_packet();
+		/* The main select() loop process_packet() to
+		 * handle the packet contents... */
+	}
+	TRACE(("leave read_packet"))
+}
+
+/* Function used to read the initial portion of a packet, and determine the
+ * length. Only called during the first BLOCKSIZE of a packet. */
+static void read_packet_init() {
+
+	unsigned int maxlen;
+	int len;
+	unsigned char blocksize;
+	unsigned char macsize;
+
+
+	blocksize = ses.keys->recv_algo_crypt->blocksize;
+	macsize = ses.keys->recv_algo_mac->hashsize;
+
+	if (ses.readbuf == NULL) {
+		/* start of a new packet */
+		ses.readbuf = buf_new(INIT_READBUF);
+		dropbear_assert(ses.decryptreadbuf == NULL);
+		ses.decryptreadbuf = buf_new(blocksize);
+	}
+
+	maxlen = blocksize - ses.readbuf->pos;
+			
+	/* read the rest of the packet if possible */
+	len = read(ses.sock, buf_getwriteptr(ses.readbuf, maxlen),
+			maxlen);
+	if (len == 0) {
+		ses.remoteclosed();
+	}
+	if (len < 0) {
+		if (errno == EINTR) {
+			TRACE(("leave read_packet_init: EINTR"))
+			return;
+		}
+		dropbear_exit("error reading: %s", strerror(errno));
+	}
+
+	buf_incrwritepos(ses.readbuf, len);
+
+	if ((unsigned int)len != maxlen) {
+		/* don't have enough bytes to determine length, get next time */
+		return;
+	}
+
+	/* now we have the first block, need to get packet length, so we decrypt
+	 * the first block (only need first 4 bytes) */
+	buf_setpos(ses.readbuf, 0);
+	if (ses.keys->recv_algo_crypt->cipherdesc == NULL) {
+		/* copy it */
+		memcpy(buf_getwriteptr(ses.decryptreadbuf, blocksize),
+				buf_getptr(ses.readbuf, blocksize),
+				blocksize);
+	} else {
+		/* decrypt it */
+		if (cbc_decrypt(buf_getptr(ses.readbuf, blocksize), 
+					buf_getwriteptr(ses.decryptreadbuf,blocksize),
+					blocksize,
+					&ses.keys->recv_symmetric_struct) != CRYPT_OK) {
+			dropbear_exit("error decrypting");
+		}
+	}
+	buf_setlen(ses.decryptreadbuf, blocksize);
+	len = buf_getint(ses.decryptreadbuf) + 4 + macsize;
+
+	buf_setpos(ses.readbuf, blocksize);
+
+	/* check packet length */
+	if ((len > MAX_PACKET_LEN) ||
+		(len < MIN_PACKET_LEN + macsize) ||
+		((len - macsize) % blocksize != 0)) {
+		dropbear_exit("bad packet size %d", len);
+	}
+
+	buf_resize(ses.readbuf, len);
+	buf_setlen(ses.readbuf, len);
+
+}
+
+/* handle the received packet */
+void decrypt_packet() {
+
+	unsigned char blocksize;
+	unsigned char macsize;
+	unsigned int padlen;
+	unsigned int len;
+
+	TRACE(("enter decrypt_packet"))
+	blocksize = ses.keys->recv_algo_crypt->blocksize;
+	macsize = ses.keys->recv_algo_mac->hashsize;
+
+	ses.kexstate.datarecv += ses.readbuf->len;
+
+	/* we've already decrypted the first blocksize in read_packet_init */
+	buf_setpos(ses.readbuf, blocksize);
+
+	buf_resize(ses.decryptreadbuf, ses.readbuf->len - macsize);
+	buf_setlen(ses.decryptreadbuf, ses.decryptreadbuf->size);
+	buf_setpos(ses.decryptreadbuf, blocksize);
+
+	/* decrypt if encryption is set, memcpy otherwise */
+	if (ses.keys->recv_algo_crypt->cipherdesc == NULL) {
+		/* copy it */
+		len = ses.readbuf->len - macsize - blocksize;
+		memcpy(buf_getwriteptr(ses.decryptreadbuf, len),
+				buf_getptr(ses.readbuf, len), len);
+	} else {
+		/* decrypt */
+		while (ses.readbuf->pos < ses.readbuf->len - macsize) {
+			if (cbc_decrypt(buf_getptr(ses.readbuf, blocksize), 
+						buf_getwriteptr(ses.decryptreadbuf, blocksize),
+						blocksize,
+						&ses.keys->recv_symmetric_struct) != CRYPT_OK) {
+				dropbear_exit("error decrypting");
+			}
+			buf_incrpos(ses.readbuf, blocksize);
+			buf_incrwritepos(ses.decryptreadbuf, blocksize);
+		}
+	}
+
+	/* check the hmac */
+	buf_setpos(ses.readbuf, ses.readbuf->len - macsize);
+	if (checkmac(ses.readbuf, ses.decryptreadbuf) != DROPBEAR_SUCCESS) {
+		dropbear_exit("Integrity error");
+	}
+
+	/* readbuf no longer required */
+	buf_free(ses.readbuf);
+	ses.readbuf = NULL;
+
+	/* get padding length */
+	buf_setpos(ses.decryptreadbuf, PACKET_PADDING_OFF);
+	padlen = buf_getbyte(ses.decryptreadbuf);
+		
+	/* payload length */
+	/* - 4 - 1 is for LEN and PADLEN values */
+	len = ses.decryptreadbuf->len - padlen - 4 - 1;
+	if ((len > MAX_PAYLOAD_LEN) || (len < 1)) {
+		dropbear_exit("bad packet size");
+	}
+
+	buf_setpos(ses.decryptreadbuf, PACKET_PAYLOAD_OFF);
+
+#ifndef DISABLE_ZLIB
+	if (ses.keys->recv_algo_comp == DROPBEAR_COMP_ZLIB) {
+		/* decompress */
+		ses.payload = buf_decompress(ses.decryptreadbuf, len);
+
+	} else 
+#endif
+	{
+		/* copy payload */
+		ses.payload = buf_new(len);
+		memcpy(ses.payload->data, buf_getptr(ses.decryptreadbuf, len), len);
+		buf_incrlen(ses.payload, len);
+	}
+
+	buf_free(ses.decryptreadbuf);
+	ses.decryptreadbuf = NULL;
+	buf_setpos(ses.payload, 0);
+
+	ses.recvseq++;
+
+	TRACE(("leave decrypt_packet"))
+}
+
+/* Checks the mac in hashbuf, for the data in readbuf.
+ * Returns DROPBEAR_SUCCESS or DROPBEAR_FAILURE */
+static int checkmac(buffer* macbuf, buffer* sourcebuf) {
+
+	unsigned int macsize;
+	hmac_state hmac;
+	unsigned char tempbuf[MAX_MAC_LEN];
+	unsigned long bufsize;
+	unsigned int len;
+
+	macsize = ses.keys->recv_algo_mac->hashsize;
+	if (macsize == 0) {
+		return DROPBEAR_SUCCESS;
+	}
+
+	/* calculate the mac */
+	if (hmac_init(&hmac, 
+				find_hash(ses.keys->recv_algo_mac->hashdesc->name), 
+				ses.keys->recvmackey, 
+				ses.keys->recv_algo_mac->keysize) 
+				!= CRYPT_OK) {
+		dropbear_exit("HMAC error");
+	}
+	
+	/* sequence number */
+	STORE32H(ses.recvseq, tempbuf);
+	if (hmac_process(&hmac, tempbuf, 4) != CRYPT_OK) {
+		dropbear_exit("HMAC error");
+	}
+
+	buf_setpos(sourcebuf, 0);
+	len = sourcebuf->len;
+	if (hmac_process(&hmac, buf_getptr(sourcebuf, len), len) != CRYPT_OK) {
+		dropbear_exit("HMAC error");
+	}
+
+	bufsize = sizeof(tempbuf);
+	if (hmac_done(&hmac, tempbuf, &bufsize) != CRYPT_OK) {
+		dropbear_exit("HMAC error");
+	}
+
+	/* compare the hash */
+	if (memcmp(tempbuf, buf_getptr(macbuf, macsize), macsize) != 0) {
+		return DROPBEAR_FAILURE;
+	} else {
+		return DROPBEAR_SUCCESS;
+	}
+}
+
+#ifndef DISABLE_ZLIB
+/* returns a pointer to a newly created buffer */
+static buffer* buf_decompress(buffer* buf, unsigned int len) {
+
+	int result;
+	buffer * ret;
+	z_streamp zstream;
+
+	zstream = ses.keys->recv_zstream;
+	ret = buf_new(len);
+
+	zstream->avail_in = len;
+	zstream->next_in = buf_getptr(buf, len);
+
+	/* decompress the payload, incrementally resizing the output buffer */
+	while (1) {
+
+		zstream->avail_out = ret->size - ret->pos;
+		zstream->next_out = buf_getwriteptr(ret, zstream->avail_out);
+
+		result = inflate(zstream, Z_SYNC_FLUSH);
+
+		buf_setlen(ret, ret->size - zstream->avail_out);
+		buf_setpos(ret, ret->len);
+
+		if (result != Z_BUF_ERROR && result != Z_OK) {
+			dropbear_exit("zlib error");
+		}
+
+		if (zstream->avail_in == 0 &&
+		   		(zstream->avail_out != 0 || result == Z_BUF_ERROR)) {
+			/* we can only exit if avail_out hasn't all been used,
+			 * and there's no remaining input */
+			return ret;
+		}
+
+		if (zstream->avail_out == 0) {
+			buf_resize(ret, ret->size + ZLIB_DECOMPRESS_INCR);
+		}
+	}
+}
+#endif
+
+
+
+	
+/* encrypt the writepayload, putting into writebuf, ready for write_packet()
+ * to put on the wire */
+void encrypt_packet() {
+
+	unsigned char padlen;
+	unsigned char blocksize, macsize;
+	buffer * writebuf; /* the packet which will go on the wire */
+	buffer * clearwritebuf; /* unencrypted, possibly compressed */
+	
+	TRACE(("enter encrypt_packet()"))
+	TRACE(("encrypt_packet type is %d", ses.writepayload->data[0]))
+	blocksize = ses.keys->trans_algo_crypt->blocksize;
+	macsize = ses.keys->trans_algo_mac->hashsize;
+
+	/* Encrypted packet len is payload+5, then worst case is if we are 3 away
+	 * from a blocksize multiple. In which case we need to pad to the
+	 * multiple, then add another blocksize (or MIN_PACKET_LEN) */
+	clearwritebuf = buf_new((ses.writepayload->len+4+1) + MIN_PACKET_LEN + 3
+#ifndef DISABLE_ZLIB
+			+ ZLIB_COMPRESS_INCR /* bit of a kludge, but we can't know len*/
+#endif
+			);
+	buf_setlen(clearwritebuf, PACKET_PAYLOAD_OFF);
+	buf_setpos(clearwritebuf, PACKET_PAYLOAD_OFF);
+
+	buf_setpos(ses.writepayload, 0);
+
+#ifndef DISABLE_ZLIB
+	/* compression */
+	if (ses.keys->trans_algo_comp == DROPBEAR_COMP_ZLIB) {
+		buf_compress(clearwritebuf, ses.writepayload, ses.writepayload->len);
+	} else
+#endif
+	{
+		memcpy(buf_getwriteptr(clearwritebuf, ses.writepayload->len),
+				buf_getptr(ses.writepayload, ses.writepayload->len),
+				ses.writepayload->len);
+		buf_incrwritepos(clearwritebuf, ses.writepayload->len);
+	}
+
+	/* finished with payload */
+	buf_burn(ses.writepayload); /* XXX This is probably a good idea, and isn't
+								   _that_ likely to hurt performance too badly.
+								   Buffers can have cleartext passwords etc, or
+								   other sensitive data */
+	buf_setpos(ses.writepayload, 0);
+	buf_setlen(ses.writepayload, 0);
+
+	/* length of padding - packet length must be a multiple of blocksize,
+	 * with a minimum of 4 bytes of padding */
+	padlen = blocksize - (clearwritebuf->len) % blocksize;
+	if (padlen < 4) {
+		padlen += blocksize;
+	}
+	/* check for min packet length */
+	if (clearwritebuf->len + padlen < MIN_PACKET_LEN) {
+		padlen += blocksize;
+	}
+
+	buf_setpos(clearwritebuf, 0);
+	/* packet length excluding the packetlength uint32 */
+	buf_putint(clearwritebuf, clearwritebuf->len + padlen - 4);
+
+	/* padding len */
+	buf_putbyte(clearwritebuf, padlen);
+	/* actual padding */
+	buf_setpos(clearwritebuf, clearwritebuf->len);
+	buf_incrlen(clearwritebuf, padlen);
+	genrandom(buf_getptr(clearwritebuf, padlen), padlen);
+
+	/* do the actual encryption */
+	buf_setpos(clearwritebuf, 0);
+	/* create a new writebuffer, this is freed when it has been put on the 
+	 * wire by writepacket() */
+	writebuf = buf_new(clearwritebuf->len + macsize);
+
+	if (ses.keys->trans_algo_crypt->cipherdesc == NULL) {
+		/* copy it */
+		memcpy(buf_getwriteptr(writebuf, clearwritebuf->len),
+				buf_getptr(clearwritebuf, clearwritebuf->len),
+				clearwritebuf->len);
+		buf_incrwritepos(writebuf, clearwritebuf->len);
+	} else {
+		/* encrypt it */
+		while (clearwritebuf->pos < clearwritebuf->len) {
+			if (cbc_encrypt(buf_getptr(clearwritebuf, blocksize),
+						buf_getwriteptr(writebuf, blocksize),
+						blocksize,
+						&ses.keys->trans_symmetric_struct) != CRYPT_OK) {
+				dropbear_exit("error encrypting");
+			}
+			buf_incrpos(clearwritebuf, blocksize);
+			buf_incrwritepos(writebuf, blocksize);
+		}
+	}
+
+	/* now add a hmac and we're done */
+	writemac(writebuf, clearwritebuf);
+
+	/* clearwritebuf is finished with */
+	buf_free(clearwritebuf);
+	clearwritebuf = NULL;
+
+	/* enqueue the packet for sending */
+	buf_setpos(writebuf, 0);
+	enqueue(&ses.writequeue, (void*)writebuf);
+
+	/* Update counts */
+	ses.kexstate.datatrans += writebuf->len;
+	ses.transseq++;
+
+	TRACE(("leave encrypt_packet()"))
+}
+
+
+/* Create the packet mac, and append H(seqno|clearbuf) to the output */
+static void writemac(buffer * outputbuffer, buffer * clearwritebuf) {
+
+	unsigned int macsize;
+	unsigned char seqbuf[4];
+	unsigned char tempbuf[MAX_MAC_LEN];
+	unsigned long bufsize;
+	hmac_state hmac;
+
+	TRACE(("enter writemac"))
+
+	macsize = ses.keys->trans_algo_mac->hashsize;
+	if (macsize > 0) {
+		/* calculate the mac */
+		if (hmac_init(&hmac, 
+					find_hash(ses.keys->trans_algo_mac->hashdesc->name), 
+					ses.keys->transmackey, 
+					ses.keys->trans_algo_mac->keysize) != CRYPT_OK) {
+			dropbear_exit("HMAC error");
+		}
+	
+		/* sequence number */
+		STORE32H(ses.transseq, seqbuf);
+		if (hmac_process(&hmac, seqbuf, 4) != CRYPT_OK) {
+			dropbear_exit("HMAC error");
+		}
+	
+		/* the actual contents */
+		buf_setpos(clearwritebuf, 0);
+		if (hmac_process(&hmac, 
+					buf_getptr(clearwritebuf, 
+						clearwritebuf->len),
+					clearwritebuf->len) != CRYPT_OK) {
+			dropbear_exit("HMAC error");
+		}
+	
+		bufsize = sizeof(tempbuf);
+		if (hmac_done(&hmac, tempbuf, &bufsize) 
+				!= CRYPT_OK) {
+			dropbear_exit("HMAC error");
+		}
+		buf_putbytes(outputbuffer, tempbuf, macsize);
+	}
+	TRACE(("leave writemac"))
+}
+
+#ifndef DISABLE_ZLIB
+/* compresses len bytes from src, outputting to dest (starting from the
+ * respective current positions. */
+static void buf_compress(buffer * dest, buffer * src, unsigned int len) {
+
+	unsigned int endpos = src->pos + len;
+	int result;
+
+	TRACE(("enter buf_compress"))
+
+	while (1) {
+
+		ses.keys->trans_zstream->avail_in = endpos - src->pos;
+		ses.keys->trans_zstream->next_in = 
+			buf_getptr(src, ses.keys->trans_zstream->avail_in);
+
+		ses.keys->trans_zstream->avail_out = dest->size - dest->pos;
+		ses.keys->trans_zstream->next_out =
+			buf_getwriteptr(dest, ses.keys->trans_zstream->avail_out);
+
+		result = deflate(ses.keys->trans_zstream, Z_SYNC_FLUSH);
+
+		buf_setpos(src, endpos - ses.keys->trans_zstream->avail_in);
+		buf_setlen(dest, dest->size - ses.keys->trans_zstream->avail_out);
+		buf_setpos(dest, dest->len);
+
+		if (result != Z_OK) {
+			dropbear_exit("zlib error");
+		}
+
+		if (ses.keys->trans_zstream->avail_in == 0) {
+			break;
+		}
+
+		dropbear_assert(ses.keys->trans_zstream->avail_out == 0);
+
+		/* the buffer has been filled, we must extend. This only happens in
+		 * unusual circumstances where the data grows in size after deflate(),
+		 * but it is possible */
+		buf_resize(dest, dest->size + ZLIB_COMPRESS_INCR);
+
+	}
+	TRACE(("leave buf_compress"))
+}
+#endif
diff --git a/packet.h b/packet.h
new file mode 100644
index 0000000..e9768cd
--- /dev/null
+++ b/packet.h
@@ -0,0 +1,48 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _PACKET_H_
+
+#define _PACKET_H_
+
+#include "includes.h"
+
+void write_packet();
+void read_packet();
+void decrypt_packet();
+void encrypt_packet();
+
+void process_packet();
+
+typedef struct PacketType {
+	unsigned char type; /* SSH_MSG_FOO */
+	void (*handler)();
+} packettype;
+
+#define PACKET_PADDING_OFF 4
+#define PACKET_PAYLOAD_OFF 5
+
+#define INIT_READBUF 200
+
+#endif /* _PACKET_H_ */
diff --git a/process-packet.c b/process-packet.c
new file mode 100644
index 0000000..07fc130
--- /dev/null
+++ b/process-packet.c
@@ -0,0 +1,145 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002-2004 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "packet.h"
+#include "session.h"
+#include "dbutil.h"
+#include "ssh.h"
+#include "algo.h"
+#include "buffer.h"
+#include "kex.h"
+#include "random.h"
+#include "service.h"
+#include "auth.h"
+#include "channel.h"
+
+#define MAX_UNAUTH_PACKET_TYPE SSH_MSG_USERAUTH_PK_OK
+
+static void recv_unimplemented();
+
+/* process a decrypted packet, call the appropriate handler */
+void process_packet() {
+
+	unsigned char type;
+	unsigned int i;
+
+	TRACE(("enter process_packet"))
+
+	type = buf_getbyte(ses.payload);
+	TRACE(("process_packet: packet type = %d", type))
+
+	ses.lastpacket = type;
+
+	/* These packets we can receive at any time */
+	switch(type) {
+
+		case SSH_MSG_IGNORE:
+		case SSH_MSG_DEBUG:
+			TRACE(("received SSH_MSG_IGNORE or SSH_MSG_DEBUG"))
+			goto out;
+
+		case SSH_MSG_UNIMPLEMENTED:
+			/* debugging XXX */
+			TRACE(("SSH_MSG_UNIMPLEMENTED"))
+			dropbear_exit("received SSH_MSG_UNIMPLEMENTED");
+			
+		case SSH_MSG_DISCONNECT:
+			/* TODO cleanup? */
+			dropbear_close("Disconnect received");
+	}
+
+
+	/* This applies for KEX, where the spec says the next packet MUST be
+	 * NEWKEYS */
+	if (ses.requirenext != 0) {
+		if (ses.requirenext != type) {
+			/* TODO send disconnect? */
+			dropbear_exit("unexpected packet type %d, expected %d", type,
+					ses.requirenext);
+		} else {
+			/* Got what we expected */
+			ses.requirenext = 0;
+		}
+	}
+
+	/* Check if we should ignore this packet. Used currently only for
+	 * KEX code, with first_kex_packet_follows */
+	if (ses.ignorenext) {
+		TRACE(("Ignoring packet, type = %d", type))
+		ses.ignorenext = 0;
+		goto out;
+	}
+
+
+	/* Kindly the protocol authors gave all the preauth packets type values
+	 * less-than-or-equal-to 60 ( == MAX_UNAUTH_PACKET_TYPE ).
+	 * NOTE: if the protocol changes and new types are added, revisit this 
+	 * assumption */
+	if ( !ses.authstate.authdone && type > MAX_UNAUTH_PACKET_TYPE ) {
+		dropbear_exit("received message %d before userauth", type);
+	}
+
+	for (i = 0; ; i++) {
+		if (ses.packettypes[i].type == 0) {
+			/* end of list */
+			break;
+		}
+
+		if (ses.packettypes[i].type == type) {
+			ses.packettypes[i].handler();
+			goto out;
+		}
+	}
+
+	
+	/* TODO do something more here? */
+	TRACE(("preauth unknown packet"))
+	recv_unimplemented();
+
+out:
+	buf_burn(ses.payload); /* Clear the memory to avoid swapping it out */
+	buf_free(ses.payload);
+	ses.payload = NULL;
+
+	TRACE(("leave process_packet"))
+}
+
+
+
+/* This must be called directly after receiving the unimplemented packet.
+ * Isn't the most clean implementation, it relies on packet processing
+ * occurring directly after decryption (direct use of ses.recvseq).
+ * This is reasonably valid, since there is only a single decryption buffer */
+static void recv_unimplemented() {
+
+	CHECKCLEARTOWRITE();
+
+	buf_putbyte(ses.writepayload, SSH_MSG_UNIMPLEMENTED);
+	/* the decryption routine increments the sequence number, we must
+	 * decrement */
+	buf_putint(ses.writepayload, ses.recvseq - 1);
+
+	encrypt_packet();
+}
diff --git a/progressmeter.c b/progressmeter.c
new file mode 100644
index 0000000..0d856cb
--- /dev/null
+++ b/progressmeter.c
@@ -0,0 +1,267 @@
+#ifdef PROGRESS_METER
+/*
+ * Copyright (c) 2003 Nils Nordman.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "includes.h"
+/*RCSID("OpenBSD: progressmeter.c,v 1.15 2003/08/31 12:14:22 markus Exp ");*/
+
+#include "progressmeter.h"
+#include "atomicio.h"
+#include "scpmisc.h"
+
+#define DEFAULT_WINSIZE 80
+#define MAX_WINSIZE 512
+#define PADDING 1		/* padding between the progress indicators */
+#define UPDATE_INTERVAL 1	/* update the progress meter every second */
+#define STALL_TIME 5		/* we're stalled after this many seconds */
+
+/* determines whether we can output to the terminal */
+static int can_output(void);
+
+/* formats and inserts the specified size into the given buffer */
+static void format_size(char *, int, off_t);
+static void format_rate(char *, int, off_t);
+
+/* updates the progressmeter to reflect the current state of the transfer */
+void refresh_progress_meter(void);
+
+/* signal handler for updating the progress meter */
+static void update_progress_meter(int);
+
+static time_t start; 		/* start progress */
+static time_t last_update; 	/* last progress update */
+static char *file; 		/* name of the file being transferred */
+static off_t end_pos; 		/* ending position of transfer */
+static off_t cur_pos; 		/* transfer position as of last refresh */
+static volatile off_t *counter;	/* progress counter */
+static long stalled; 		/* how long we have been stalled */
+static int bytes_per_second; 	/* current speed in bytes per second */
+static int win_size; 		/* terminal window size */
+
+/* units for format_size */
+static const char unit[] = " KMGT";
+
+static int
+can_output(void)
+{
+	return (getpgrp() == tcgetpgrp(STDOUT_FILENO));
+}
+
+static void
+format_rate(char *buf, int size, off_t bytes)
+{
+	int i;
+
+	bytes *= 100;
+	for (i = 0; bytes >= 100*1000 && unit[i] != 'T'; i++)
+		bytes = (bytes + 512) / 1024;
+	if (i == 0) {
+		i++;
+		bytes = (bytes + 512) / 1024;
+	}
+	snprintf(buf, size, "%3lld.%1lld%c%s",
+	    (int64_t) bytes / 100,
+	    (int64_t) (bytes + 5) / 10 % 10,
+	    unit[i],
+	    i ? "B" : " ");
+}
+
+static void
+format_size(char *buf, int size, off_t bytes)
+{
+	int i;
+
+	for (i = 0; bytes >= 10000 && unit[i] != 'T'; i++)
+		bytes = (bytes + 512) / 1024;
+	snprintf(buf, size, "%4lld%c%s",
+	    (int64_t) bytes,
+	    unit[i],
+	    i ? "B" : " ");
+}
+
+void
+refresh_progress_meter(void)
+{
+	char buf[MAX_WINSIZE + 1];
+	time_t now;
+	off_t transferred;
+	double elapsed;
+	int percent;
+	int bytes_left;
+	int cur_speed;
+	int hours, minutes, seconds;
+	int i, len;
+	int file_len;
+
+	transferred = *counter - cur_pos;
+	cur_pos = *counter;
+	now = time(NULL);
+	bytes_left = end_pos - cur_pos;
+
+	if (bytes_left > 0)
+		elapsed = now - last_update;
+	else
+		elapsed = now - start;
+
+	/* calculate speed */
+	if (elapsed != 0)
+		cur_speed = (transferred / elapsed);
+	else
+		cur_speed = 0;
+
+#define AGE_FACTOR 0.9
+	if (bytes_per_second != 0) {
+		bytes_per_second = (bytes_per_second * AGE_FACTOR) +
+		    (cur_speed * (1.0 - AGE_FACTOR));
+	} else
+		bytes_per_second = cur_speed;
+
+	/* filename */
+	buf[0] = '\0';
+	file_len = win_size - 35;
+	if (file_len > 0) {
+		len = snprintf(buf, file_len + 1, "\r%s", file);
+		if (len < 0)
+			len = 0;
+		for (i = len;  i < file_len; i++ )
+			buf[i] = ' ';
+		buf[file_len] = '\0';
+	}
+
+	/* percent of transfer done */
+	if (end_pos != 0)
+		percent = ((float)cur_pos / end_pos) * 100;
+	else
+		percent = 100;
+	snprintf(buf + strlen(buf), win_size - strlen(buf),
+	    " %3d%% ", percent);
+
+	/* amount transferred */
+	format_size(buf + strlen(buf), win_size - strlen(buf),
+	    cur_pos);
+	strlcat(buf, " ", win_size);
+
+	/* bandwidth usage */
+	format_rate(buf + strlen(buf), win_size - strlen(buf),
+	    bytes_per_second);
+	strlcat(buf, "/s ", win_size);
+
+	/* ETA */
+	if (!transferred)
+		stalled += elapsed;
+	else
+		stalled = 0;
+
+	if (stalled >= STALL_TIME)
+		strlcat(buf, "- stalled -", win_size);
+	else if (bytes_per_second == 0 && bytes_left)
+		strlcat(buf, "  --:-- ETA", win_size);
+	else {
+		if (bytes_left > 0)
+			seconds = bytes_left / bytes_per_second;
+		else
+			seconds = elapsed;
+
+		hours = seconds / 3600;
+		seconds -= hours * 3600;
+		minutes = seconds / 60;
+		seconds -= minutes * 60;
+
+		if (hours != 0)
+			snprintf(buf + strlen(buf), win_size - strlen(buf),
+			    "%d:%02d:%02d", hours, minutes, seconds);
+		else
+			snprintf(buf + strlen(buf), win_size - strlen(buf),
+			    "  %02d:%02d", minutes, seconds);
+
+		if (bytes_left > 0)
+			strlcat(buf, " ETA", win_size);
+		else
+			strlcat(buf, "    ", win_size);
+	}
+
+	atomicio(vwrite, STDOUT_FILENO, buf, win_size);
+	last_update = now;
+}
+
+static void
+update_progress_meter(int ignore)
+{
+	int save_errno;
+
+	save_errno = errno;
+
+	if (can_output())
+		refresh_progress_meter();
+
+	signal(SIGALRM, update_progress_meter);
+	alarm(UPDATE_INTERVAL);
+	errno = save_errno;
+}
+
+void
+start_progress_meter(char *f, off_t filesize, off_t *stat)
+{
+	struct winsize winsize;
+
+	start = last_update = time(NULL);
+	file = f;
+	end_pos = filesize;
+	cur_pos = 0;
+	counter = stat;
+	stalled = 0;
+	bytes_per_second = 0;
+
+	if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &winsize) != -1 &&
+	    winsize.ws_col != 0) {
+		if (winsize.ws_col > MAX_WINSIZE)
+			win_size = MAX_WINSIZE;
+		else
+			win_size = winsize.ws_col;
+	} else
+		win_size = DEFAULT_WINSIZE;
+	win_size += 1;					/* trailing \0 */
+
+	if (can_output())
+		refresh_progress_meter();
+
+	signal(SIGALRM, update_progress_meter);
+	alarm(UPDATE_INTERVAL);
+}
+
+void
+stop_progress_meter(void)
+{
+	alarm(0);
+
+	if (!can_output())
+		return;
+
+	/* Ensure we complete the progress */
+	if (cur_pos != end_pos)
+		refresh_progress_meter();
+
+	atomicio(vwrite, STDOUT_FILENO, "\n", 1);
+}
+#endif /* PROGRESS_METER */
diff --git a/progressmeter.h b/progressmeter.h
new file mode 100644
index 0000000..bfb9a0b
--- /dev/null
+++ b/progressmeter.h
@@ -0,0 +1,27 @@
+/*	$OpenBSD: progressmeter.h,v 1.1 2003/01/10 08:19:07 fgsch Exp $	*/
+/*
+ * Copyright (c) 2002 Nils Nordman.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+void	start_progress_meter(char *, off_t, off_t *);
+void	stop_progress_meter(void);
diff --git a/queue.c b/queue.c
new file mode 100644
index 0000000..7a80124
--- /dev/null
+++ b/queue.c
@@ -0,0 +1,89 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "dbutil.h"
+#include "queue.h"
+
+void initqueue(struct Queue* queue) {
+
+	queue->head = NULL;
+	queue->tail = NULL;
+	queue->count = 0;
+}
+
+int isempty(struct Queue* queue) {
+
+	return (queue->head == NULL);
+}
+	
+void* dequeue(struct Queue* queue) {
+
+	void* ret;
+	struct Link* oldhead;
+	dropbear_assert(!isempty(queue));
+	
+	ret = queue->head->item;
+	oldhead = queue->head;
+	
+	if (oldhead->link != NULL) {
+		queue->head = oldhead->link;
+	} else {
+		queue->head = NULL;
+		queue->tail = NULL;
+		TRACE(("empty queue dequeing"))
+	}
+
+	m_free(oldhead);
+	queue->count--;
+	return ret;
+}
+
+void *examine(struct Queue* queue) {
+
+	dropbear_assert(!isempty(queue));
+	return queue->head->item;
+}
+
+void enqueue(struct Queue* queue, void* item) {
+
+	struct Link* newlink;
+
+	TRACE(("enter enqueue"))
+	newlink = (struct Link*)m_malloc(sizeof(struct Link));
+
+	newlink->item = item;
+	newlink->link = NULL;
+
+	if (queue->tail != NULL) {
+		queue->tail->link = newlink;
+	}
+	queue->tail = newlink;
+
+	if (queue->head == NULL) {
+		queue->head = newlink;
+	}
+	queue->count++;
+	TRACE(("leave enqueue"))
+}
diff --git a/queue.h b/queue.h
new file mode 100644
index 0000000..80fbb9d
--- /dev/null
+++ b/queue.h
@@ -0,0 +1,49 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _QUEUE_H_
+#define _QUEUE_H_
+
+struct Link {
+
+	void* item;
+	struct Link* link;
+
+};
+
+struct Queue {
+
+	struct Link* head;
+	struct Link* tail;
+	unsigned int count; /* safety value */
+
+};
+
+void initqueue(struct Queue* queue);
+int isempty(struct Queue* queue);
+void* dequeue(struct Queue* queue);
+void *examine(struct Queue* queue);
+void enqueue(struct Queue* queue, void* item);
+
+#endif
diff --git a/random.c b/random.c
new file mode 100644
index 0000000..cbbe016
--- /dev/null
+++ b/random.c
@@ -0,0 +1,241 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "buffer.h"
+#include "dbutil.h"
+#include "bignum.h"
+
+static int donerandinit = 0;
+
+/* this is used to generate unique output from the same hashpool */
+static uint32_t counter = 0;
+#define MAX_COUNTER 1<<31 /* the max value for the counter, so it won't loop */
+
+static unsigned char hashpool[SHA1_HASH_SIZE];
+
+#define INIT_SEED_SIZE 32 /* 256 bits */
+
+static void readrand(unsigned char* buf, unsigned int buflen);
+
+/* The basic setup is we read some data from /dev/(u)random or prngd and hash it
+ * into hashpool. To read data, we hash together current hashpool contents,
+ * and a counter. We feed more data in by hashing the current pool and new
+ * data into the pool.
+ *
+ * It is important to ensure that counter doesn't wrap around before we
+ * feed in new entropy.
+ *
+ */
+
+static void readrand(unsigned char* buf, unsigned int buflen) {
+
+	static int already_blocked = 0;
+	int readfd;
+	unsigned int readpos;
+	int readlen;
+#ifdef DROPBEAR_PRNGD_SOCKET
+	struct sockaddr_un egdsock;
+	char egdcmd[2];
+#endif
+
+#ifdef DROPBEAR_RANDOM_DEV
+	readfd = open(DROPBEAR_RANDOM_DEV, O_RDONLY);
+	if (readfd < 0) {
+		dropbear_exit("couldn't open random device");
+	}
+#endif
+
+#ifdef DROPBEAR_PRNGD_SOCKET
+	memset((void*)&egdsock, 0x0, sizeof(egdsock));
+	egdsock.sun_family = AF_UNIX;
+	strlcpy(egdsock.sun_path, DROPBEAR_PRNGD_SOCKET,
+			sizeof(egdsock.sun_path));
+
+	readfd = socket(PF_UNIX, SOCK_STREAM, 0);
+	if (readfd < 0) {
+		dropbear_exit("couldn't open random device");
+	}
+	/* todo - try various common locations */
+	if (connect(readfd, (struct sockaddr*)&egdsock, 
+			sizeof(struct sockaddr_un)) < 0) {
+		dropbear_exit("couldn't open random device");
+	}
+
+	if (buflen > 255)
+		dropbear_exit("can't request more than 255 bytes from egd");
+	egdcmd[0] = 0x02;	/* blocking read */
+	egdcmd[1] = (unsigned char)buflen;
+	if (write(readfd, egdcmd, 2) < 0)
+		dropbear_exit("can't send command to egd");
+#endif
+
+	/* read the actual random data */
+	readpos = 0;
+	do {
+		if (!already_blocked)
+		{
+			int ret;
+			struct timeval timeout;
+			fd_set read_fds;
+
+			timeout.tv_sec = 2; /* two seconds should be enough */
+			timeout.tv_usec = 0;
+
+			FD_ZERO(&read_fds);
+			FD_SET(readfd, &read_fds);
+			ret = select(readfd + 1, &read_fds, NULL, NULL, &timeout);
+			if (ret == 0)
+			{
+				dropbear_log(LOG_INFO, "Warning: Reading the random source seems to have blocked.\nIf you experience problems, you probably need to find a better entropy source.");
+				already_blocked = 1;
+			}
+		}
+		readlen = read(readfd, &buf[readpos], buflen - readpos);
+		if (readlen <= 0) {
+			if (readlen < 0 && errno == EINTR) {
+				continue;
+			}
+			dropbear_exit("error reading random source");
+		}
+		readpos += readlen;
+	} while (readpos < buflen);
+
+	close (readfd);
+}
+
+/* initialise the prng from /dev/(u)random or prngd */
+void seedrandom() {
+		
+	unsigned char readbuf[INIT_SEED_SIZE];
+
+	hash_state hs;
+
+	/* initialise so that things won't warn about
+     * hashing an undefined buffer */
+	if (!donerandinit) {
+		m_burn(hashpool, sizeof(hashpool));
+	}
+
+	/* get the seed data */
+	readrand(readbuf, sizeof(readbuf));
+
+	/* hash in the new seed data */
+	sha1_init(&hs);
+	sha1_process(&hs, (void*)hashpool, sizeof(hashpool));
+	sha1_process(&hs, (void*)readbuf, sizeof(readbuf));
+	sha1_done(&hs, hashpool);
+
+	counter = 0;
+	donerandinit = 1;
+}
+
+/* hash the current random pool with some unique identifiers
+ * for this process and point-in-time. this is used to separate
+ * the random pools for fork()ed processes. */
+void reseedrandom() {
+
+    pid_t pid;
+    struct timeval tv;
+
+	if (!donerandinit) {
+		dropbear_exit("seedrandom not done");
+	}
+
+    pid = getpid();
+    gettimeofday(&tv, NULL);
+
+	hash_state hs;
+	unsigned char hash[SHA1_HASH_SIZE];
+	sha1_init(&hs);
+	sha1_process(&hs, (void*)hashpool, sizeof(hashpool));
+	sha1_process(&hs, (void*)&pid, sizeof(pid));
+	sha1_process(&hs, (void*)&tv, sizeof(tv));
+	sha1_done(&hs, hashpool);
+}
+
+/* return len bytes of pseudo-random data */
+void genrandom(unsigned char* buf, unsigned int len) {
+
+	hash_state hs;
+	unsigned char hash[SHA1_HASH_SIZE];
+	unsigned int copylen;
+
+	if (!donerandinit) {
+		dropbear_exit("seedrandom not done");
+	}
+
+	while (len > 0) {
+		sha1_init(&hs);
+		sha1_process(&hs, (void*)hashpool, sizeof(hashpool));
+		sha1_process(&hs, (void*)&counter, sizeof(counter));
+		sha1_done(&hs, hash);
+
+		counter++;
+		if (counter > MAX_COUNTER) {
+			seedrandom();
+		}
+
+		copylen = MIN(len, SHA1_HASH_SIZE);
+		memcpy(buf, hash, copylen);
+		len -= copylen;
+		buf += copylen;
+	}
+	m_burn(hash, sizeof(hash));
+}
+
+/* Generates a random mp_int. 
+ * max is a *mp_int specifying an upper bound.
+ * rand must be an initialised *mp_int for the result.
+ * the result rand satisfies:  0 < rand < max 
+ * */
+void gen_random_mpint(mp_int *max, mp_int *rand) {
+
+	unsigned char *randbuf = NULL;
+	unsigned int len = 0;
+	const char masks[] = {0xff, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f};
+
+	const int size_bits = mp_count_bits(max);
+
+	len = size_bits / 8;
+	if ((size_bits % 8) != 0) {
+		len += 1;
+	}
+
+	randbuf = (unsigned char*)m_malloc(len);
+	do {
+		genrandom(randbuf, len);
+		/* Mask out the unrequired bits - mp_read_unsigned_bin expects
+		 * MSB first.*/
+		randbuf[0] &= masks[size_bits % 8];
+
+		bytes_to_mp(rand, randbuf, len);
+
+		/* keep regenerating until we get one satisfying
+		 * 0 < rand < max    */
+	} while ( ( (max != NULL) && (mp_cmp(rand, max) != MP_LT) )
+			|| (mp_cmp_d(rand, 0) != MP_GT) );
+	m_burn(randbuf, len);
+	m_free(randbuf);
+}
diff --git a/random.h b/random.h
new file mode 100644
index 0000000..84a0a39
--- /dev/null
+++ b/random.h
@@ -0,0 +1,36 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _RANDOM_H_
+#define _RANDOM_H_
+
+struct mp_int;
+
+void seedrandom();
+void reseedrandom();
+void genrandom(unsigned char* buf, int len);
+void addrandom(unsigned char* buf, int len);
+void gen_random_mpint(mp_int *max, mp_int *rand);
+
+#endif /* _RANDOM_H_ */
diff --git a/rsa.c b/rsa.c
new file mode 100644
index 0000000..005e4ca
--- /dev/null
+++ b/rsa.c
@@ -0,0 +1,398 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+/* Perform RSA operations on data, including reading keys, signing and
+ * verification.
+ *
+ * The format is specified in rfc2437, Applied Cryptography or The Handbook of
+ * Applied Cryptography detail the general algorithm. */
+
+#include "includes.h"
+#include "dbutil.h"
+#include "bignum.h"
+#include "rsa.h"
+#include "buffer.h"
+#include "ssh.h"
+#include "random.h"
+
+#ifdef DROPBEAR_RSA 
+
+static void rsa_pad_em(rsa_key * key,
+		const unsigned char * data, unsigned int len,
+		mp_int * rsa_em);
+
+/* Load a public rsa key from a buffer, initialising the values.
+ * The key will have the same format as buf_put_rsa_key.
+ * These should be freed with rsa_key_free.
+ * Returns DROPBEAR_SUCCESS or DROPBEAR_FAILURE */
+int buf_get_rsa_pub_key(buffer* buf, rsa_key *key) {
+
+	TRACE(("enter buf_get_rsa_pub_key"))
+	dropbear_assert(key != NULL);
+	key->e = m_malloc(sizeof(mp_int));
+	key->n = m_malloc(sizeof(mp_int));
+	m_mp_init_multi(key->e, key->n, NULL);
+	key->d = NULL;
+	key->p = NULL;
+	key->q = NULL;
+
+	buf_incrpos(buf, 4+SSH_SIGNKEY_RSA_LEN); /* int + "ssh-rsa" */
+
+	if (buf_getmpint(buf, key->e) == DROPBEAR_FAILURE
+	 || buf_getmpint(buf, key->n) == DROPBEAR_FAILURE) {
+		TRACE(("leave buf_get_rsa_pub_key: failure"))
+		return DROPBEAR_FAILURE;
+	}
+
+	if (mp_count_bits(key->n) < MIN_RSA_KEYLEN) {
+		dropbear_log(LOG_WARNING, "rsa key too short");
+		return DROPBEAR_FAILURE;
+	}
+
+	TRACE(("leave buf_get_rsa_pub_key: success"))
+	return DROPBEAR_SUCCESS;
+
+}
+
+/* Same as buf_get_rsa_pub_key, but reads a private "x" key at the end.
+ * Loads a private rsa key from a buffer
+ * Returns DROPBEAR_SUCCESS or DROPBEAR_FAILURE */
+int buf_get_rsa_priv_key(buffer* buf, rsa_key *key) {
+
+	dropbear_assert(key != NULL);
+
+	TRACE(("enter buf_get_rsa_priv_key"))
+
+	if (buf_get_rsa_pub_key(buf, key) == DROPBEAR_FAILURE) {
+		TRACE(("leave buf_get_rsa_priv_key: pub: ret == DROPBEAR_FAILURE"))
+		return DROPBEAR_FAILURE;
+	}
+
+	key->d = m_malloc(sizeof(mp_int));
+	m_mp_init(key->d);
+	if (buf_getmpint(buf, key->d) == DROPBEAR_FAILURE) {
+		TRACE(("leave buf_get_rsa_priv_key: d: ret == DROPBEAR_FAILURE"))
+		return DROPBEAR_FAILURE;
+	}
+
+	/* old Dropbear private keys didn't keep p and q, so we will ignore them*/
+	if (buf->pos == buf->len) {
+		key->p = NULL;
+		key->q = NULL;
+	} else {
+		key->p = m_malloc(sizeof(mp_int));
+		key->q = m_malloc(sizeof(mp_int));
+		m_mp_init_multi(key->p, key->q, NULL);
+
+		if (buf_getmpint(buf, key->p) == DROPBEAR_FAILURE) {
+			TRACE(("leave buf_get_rsa_priv_key: p: ret == DROPBEAR_FAILURE"))
+			return DROPBEAR_FAILURE;
+		}
+
+		if (buf_getmpint(buf, key->q) == DROPBEAR_FAILURE) {
+			TRACE(("leave buf_get_rsa_priv_key: q: ret == DROPBEAR_FAILURE"))
+			return DROPBEAR_FAILURE;
+		}
+	}
+
+	TRACE(("leave buf_get_rsa_priv_key"))
+	return DROPBEAR_SUCCESS;
+}
+	
+
+/* Clear and free the memory used by a public or private key */
+void rsa_key_free(rsa_key *key) {
+
+	TRACE(("enter rsa_key_free"))
+
+	if (key == NULL) {
+		TRACE(("leave rsa_key_free: key == NULL"))
+		return;
+	}
+	if (key->d) {
+		mp_clear(key->d);
+		m_free(key->d);
+	}
+	if (key->e) {
+		mp_clear(key->e);
+		m_free(key->e);
+	}
+	if (key->n) {
+		 mp_clear(key->n);
+		 m_free(key->n);
+	}
+	if (key->p) {
+		mp_clear(key->p);
+		m_free(key->p);
+	}
+	if (key->q) {
+		mp_clear(key->q);
+		m_free(key->q);
+	}
+	m_free(key);
+	TRACE(("leave rsa_key_free"))
+}
+
+/* Put the public rsa key into the buffer in the required format:
+ *
+ * string	"ssh-rsa"
+ * mp_int	e
+ * mp_int	n
+ */
+void buf_put_rsa_pub_key(buffer* buf, rsa_key *key) {
+
+	TRACE(("enter buf_put_rsa_pub_key"))
+	dropbear_assert(key != NULL);
+
+	buf_putstring(buf, SSH_SIGNKEY_RSA, SSH_SIGNKEY_RSA_LEN);
+	buf_putmpint(buf, key->e);
+	buf_putmpint(buf, key->n);
+
+	TRACE(("leave buf_put_rsa_pub_key"))
+
+}
+
+/* Same as buf_put_rsa_pub_key, but with the private "x" key appended */
+void buf_put_rsa_priv_key(buffer* buf, rsa_key *key) {
+
+	TRACE(("enter buf_put_rsa_priv_key"))
+
+	dropbear_assert(key != NULL);
+	buf_put_rsa_pub_key(buf, key);
+	buf_putmpint(buf, key->d);
+
+	/* new versions have p and q, old versions don't */
+	if (key->p) {
+		buf_putmpint(buf, key->p);
+	}
+	if (key->q) {
+		buf_putmpint(buf, key->q);
+	}
+
+
+	TRACE(("leave buf_put_rsa_priv_key"))
+
+}
+
+#ifdef DROPBEAR_SIGNKEY_VERIFY
+/* Verify a signature in buf, made on data by the key given.
+ * Returns DROPBEAR_SUCCESS or DROPBEAR_FAILURE */
+int buf_rsa_verify(buffer * buf, rsa_key *key, const unsigned char* data,
+		unsigned int len) {
+
+	unsigned int slen;
+	DEF_MP_INT(rsa_s);
+	DEF_MP_INT(rsa_mdash);
+	DEF_MP_INT(rsa_em);
+	int ret = DROPBEAR_FAILURE;
+
+	TRACE(("enter buf_rsa_verify"))
+
+	dropbear_assert(key != NULL);
+
+	m_mp_init_multi(&rsa_mdash, &rsa_s, &rsa_em, NULL);
+
+	slen = buf_getint(buf);
+	if (slen != (unsigned int)mp_unsigned_bin_size(key->n)) {
+		TRACE(("bad size"))
+		goto out;
+	}
+
+	if (mp_read_unsigned_bin(&rsa_s, buf_getptr(buf, buf->len - buf->pos),
+				buf->len - buf->pos) != MP_OKAY) {
+		TRACE(("failed reading rsa_s"))
+		goto out;
+	}
+
+	/* check that s <= n-1 */
+	if (mp_cmp(&rsa_s, key->n) != MP_LT) {
+		TRACE(("s > n-1"))
+		goto out;
+	}
+
+	/* create the magic PKCS padded value */
+	rsa_pad_em(key, data, len, &rsa_em);
+
+	if (mp_exptmod(&rsa_s, key->e, key->n, &rsa_mdash) != MP_OKAY) {
+		TRACE(("failed exptmod rsa_s"))
+		goto out;
+	}
+
+	if (mp_cmp(&rsa_em, &rsa_mdash) == MP_EQ) {
+		/* signature is valid */
+		TRACE(("success!"))
+		ret = DROPBEAR_SUCCESS;
+	}
+
+out:
+	mp_clear_multi(&rsa_mdash, &rsa_s, &rsa_em, NULL);
+	TRACE(("leave buf_rsa_verify: ret %d", ret))
+	return ret;
+}
+
+#endif /* DROPBEAR_SIGNKEY_VERIFY */
+
+/* Sign the data presented with key, writing the signature contents
+ * to the buffer */
+void buf_put_rsa_sign(buffer* buf, rsa_key *key, const unsigned char* data,
+		unsigned int len) {
+
+	unsigned int nsize, ssize;
+	unsigned int i;
+	DEF_MP_INT(rsa_s);
+	DEF_MP_INT(rsa_tmp1);
+	DEF_MP_INT(rsa_tmp2);
+	DEF_MP_INT(rsa_tmp3);
+	
+	TRACE(("enter buf_put_rsa_sign"))
+	dropbear_assert(key != NULL);
+
+	m_mp_init_multi(&rsa_s, &rsa_tmp1, &rsa_tmp2, &rsa_tmp3, NULL);
+
+	rsa_pad_em(key, data, len, &rsa_tmp1);
+
+	/* the actual signing of the padded data */
+
+#ifdef RSA_BLINDING
+
+	/* With blinding, s = (r^(-1))((em)*r^e)^d mod n */
+
+	/* generate the r blinding value */
+	/* rsa_tmp2 is r */
+	gen_random_mpint(key->n, &rsa_tmp2);
+
+	/* rsa_tmp1 is em */
+	/* em' = em * r^e mod n */
+
+	mp_exptmod(&rsa_tmp2, key->e, key->n, &rsa_s); /* rsa_s used as a temp var*/
+	mp_invmod(&rsa_tmp2, key->n, &rsa_tmp3);
+	mp_mulmod(&rsa_tmp1, &rsa_s, key->n, &rsa_tmp2);
+
+	/* rsa_tmp2 is em' */
+	/* s' = (em')^d mod n */
+	mp_exptmod(&rsa_tmp2, key->d, key->n, &rsa_tmp1);
+
+	/* rsa_tmp1 is s' */
+	/* rsa_tmp3 is r^(-1) mod n */
+	/* s = (s')r^(-1) mod n */
+	mp_mulmod(&rsa_tmp1, &rsa_tmp3, key->n, &rsa_s);
+
+#else
+
+	/* s = em^d mod n */
+	/* rsa_tmp1 is em */
+	if (mp_exptmod(&rsa_tmp1, key->d, key->n, &rsa_s) != MP_OKAY) {
+		dropbear_exit("rsa error");
+	}
+
+#endif /* RSA_BLINDING */
+
+	mp_clear_multi(&rsa_tmp1, &rsa_tmp2, &rsa_tmp3, NULL);
+	
+	/* create the signature to return */
+	buf_putstring(buf, SSH_SIGNKEY_RSA, SSH_SIGNKEY_RSA_LEN);
+
+	nsize = mp_unsigned_bin_size(key->n);
+
+	/* string rsa_signature_blob length */
+	buf_putint(buf, nsize);
+	/* pad out s to same length as n */
+	ssize = mp_unsigned_bin_size(&rsa_s);
+	dropbear_assert(ssize <= nsize);
+	for (i = 0; i < nsize-ssize; i++) {
+		buf_putbyte(buf, 0x00);
+	}
+
+	if (mp_to_unsigned_bin(&rsa_s, buf_getwriteptr(buf, ssize)) != MP_OKAY) {
+		dropbear_exit("rsa error");
+	}
+	buf_incrwritepos(buf, ssize);
+	mp_clear(&rsa_s);
+
+#if defined(DEBUG_RSA) && defined(DEBUG_TRACE)
+	printhex("RSA sig", buf->data, buf->len);
+#endif
+	
+
+	TRACE(("leave buf_put_rsa_sign"))
+}
+
+/* Creates the message value as expected by PKCS, see rfc2437 etc */
+/* format to be padded to is:
+ * EM = 01 | FF* | 00 | prefix | hash
+ *
+ * where FF is repeated enough times to make EM one byte
+ * shorter than the size of key->n
+ *
+ * prefix is the ASN1 designator prefix,
+ * hex 30 21 30 09 06 05 2B 0E 03 02 1A 05 00 04 14
+ *
+ * rsa_em must be a pointer to an initialised mp_int.
+ */
+static void rsa_pad_em(rsa_key * key,
+		const unsigned char * data, unsigned int len, 
+		mp_int * rsa_em) {
+
+	/* ASN1 designator (including the 0x00 preceding) */
+	const unsigned char rsa_asn1_magic[] = 
+		{0x00, 0x30, 0x21, 0x30, 0x09, 0x06, 0x05, 0x2b, 
+		 0x0e, 0x03, 0x02, 0x1a, 0x05, 0x00, 0x04, 0x14};
+	const unsigned int RSA_ASN1_MAGIC_LEN = 16;
+
+	buffer * rsa_EM = NULL;
+	hash_state hs;
+	unsigned int nsize;
+	
+	dropbear_assert(key != NULL);
+	dropbear_assert(data != NULL);
+	nsize = mp_unsigned_bin_size(key->n);
+
+	rsa_EM = buf_new(nsize-1);
+	/* type byte */
+	buf_putbyte(rsa_EM, 0x01);
+	/* Padding with 0xFF bytes */
+	while(rsa_EM->pos != rsa_EM->size - RSA_ASN1_MAGIC_LEN - SHA1_HASH_SIZE) {
+		buf_putbyte(rsa_EM, 0xff);
+	}
+	/* Magic ASN1 stuff */
+	memcpy(buf_getwriteptr(rsa_EM, RSA_ASN1_MAGIC_LEN),
+			rsa_asn1_magic, RSA_ASN1_MAGIC_LEN);
+	buf_incrwritepos(rsa_EM, RSA_ASN1_MAGIC_LEN);
+
+	/* The hash of the data */
+	sha1_init(&hs);
+	sha1_process(&hs, data, len);
+	sha1_done(&hs, buf_getwriteptr(rsa_EM, SHA1_HASH_SIZE));
+	buf_incrwritepos(rsa_EM, SHA1_HASH_SIZE);
+
+	dropbear_assert(rsa_EM->pos == rsa_EM->size);
+
+	/* Create the mp_int from the encoded bytes */
+	buf_setpos(rsa_EM, 0);
+	bytes_to_mp(rsa_em, buf_getptr(rsa_EM, rsa_EM->size),
+			rsa_EM->size);
+	buf_free(rsa_EM);
+}
+
+#endif /* DROPBEAR_RSA */
diff --git a/rsa.h b/rsa.h
new file mode 100644
index 0000000..545ba9b
--- /dev/null
+++ b/rsa.h
@@ -0,0 +1,61 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _RSA_H_
+#define _RSA_H_
+
+#include "includes.h"
+#include "buffer.h"
+
+#ifdef DROPBEAR_RSA 
+
+#define RSA_SIGNATURE_SIZE 4+7+4+40
+
+struct RSA_key {
+
+	mp_int* n;
+	mp_int* e;
+	mp_int* d;
+	mp_int* p;
+	mp_int* q;
+
+};
+
+typedef struct RSA_key rsa_key;
+
+void buf_put_rsa_sign(buffer* buf, rsa_key *key, const unsigned char* data,
+		unsigned int len);
+#ifdef DROPBEAR_SIGNKEY_VERIFY
+int buf_rsa_verify(buffer * buf, rsa_key *key, const unsigned char* data,
+		unsigned int len);
+#endif
+int buf_get_rsa_pub_key(buffer* buf, rsa_key *key);
+int buf_get_rsa_priv_key(buffer* buf, rsa_key *key);
+void buf_put_rsa_pub_key(buffer* buf, rsa_key *key);
+void buf_put_rsa_priv_key(buffer* buf, rsa_key *key);
+void rsa_key_free(rsa_key *key);
+
+#endif /* DROPBEAR_RSA */
+
+#endif /* _RSA_H_ */
diff --git a/runopts.h b/runopts.h
new file mode 100644
index 0000000..5107a9d
--- /dev/null
+++ b/runopts.h
@@ -0,0 +1,119 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _RUNOPTS_H_
+#define _RUNOPTS_H_
+
+#include "includes.h"
+#include "signkey.h"
+#include "buffer.h"
+#include "auth.h"
+#include "tcpfwd.h"
+
+typedef struct runopts {
+
+#if defined(ENABLE_SVR_REMOTETCPFWD) || defined(ENABLE_CLI_LOCALTCPFWD)
+	int listen_fwd_all;
+#endif
+
+} runopts;
+
+extern runopts opts;
+
+int readhostkey(const char * filename, sign_key * hostkey, int *type);
+
+typedef struct svr_runopts {
+
+	char * rsakeyfile;
+	char * dsskeyfile;
+	char * bannerfile;
+
+	int forkbg;
+	int usingsyslog;
+
+	/* ports is an array of the portcount listening ports */
+	char *ports[DROPBEAR_MAX_PORTS];
+	unsigned int portcount;
+
+	int inetdmode;
+
+	/* Flags indicating whether to use ipv4 and ipv6 */
+	/* not used yet
+	int ipv4;
+	int ipv6;
+	*/
+
+#ifdef DO_MOTD
+	/* whether to print the MOTD */
+	int domotd;
+#endif
+
+	int norootlogin;
+
+	int noauthpass;
+	int norootpass;
+
+#ifdef ENABLE_SVR_REMOTETCPFWD
+	int noremotetcp;
+#endif
+#ifdef ENABLE_SVR_LOCALTCPFWD
+	int nolocaltcp;
+#endif
+
+	sign_key *hostkey;
+	buffer * banner;
+
+} svr_runopts;
+
+extern svr_runopts svr_opts;
+
+void svr_getopts(int argc, char ** argv);
+void loadhostkeys();
+
+typedef struct cli_runopts {
+
+	char *progname;
+	char *remotehost;
+	char *remoteport;
+
+	char *username;
+
+	char *cmd;
+	int wantpty;
+#ifdef ENABLE_CLI_PUBKEY_AUTH
+	struct SignKeyList *privkeys; /* Keys to use for public-key auth */
+#endif
+#ifdef ENABLE_CLI_REMOTETCPFWD
+	struct TCPFwdList * remotefwds;
+#endif
+#ifdef ENABLE_CLI_LOCALTCPFWD
+	struct TCPFwdList * localfwds;
+#endif
+
+} cli_runopts;
+
+extern cli_runopts cli_opts;
+void cli_getopts(int argc, char ** argv);
+
+#endif /* _RUNOPTS_H_ */
diff --git a/scp.c b/scp.c
new file mode 100644
index 0000000..ffc4deb
--- /dev/null
+++ b/scp.c
@@ -0,0 +1,1199 @@
+/*
+ * scp - secure remote copy.  This is basically patched BSD rcp which
+ * uses ssh to do the data transfer (instead of using rcmd).
+ *
+ * NOTE: This version should NOT be suid root.  (This uses ssh to
+ * do the transfer and ssh has the necessary privileges.)
+ *
+ * 1995 Timo Rinne <tri@iki.fi>, Tatu Ylonen <ylo@cs.hut.fi>
+ *
+ * As far as I am concerned, the code I have written for this software
+ * can be used freely for any purpose.  Any derived versions of this
+ * software must be clearly marked as such, and if the derived work is
+ * incompatible with the protocol description in the RFC file, it must be
+ * called by a name other than "ssh" or "Secure Shell".
+ */
+/*
+ * Copyright (c) 1999 Theo de Raadt.  All rights reserved.
+ * Copyright (c) 1999 Aaron Campbell.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Parts from:
+ *
+ * Copyright (c) 1983, 1990, 1992, 1993, 1995
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include "includes.h"
+#include "atomicio.h"
+#include "compat.h"
+#include "scpmisc.h"
+#include "progressmeter.h"
+
+#define _PATH_CP "/bin/cp"
+
+#ifndef TIMEVAL_TO_TIMESPEC
+#define	TIMEVAL_TO_TIMESPEC(tv, ts) {					\
+	(ts)->tv_sec = (tv)->tv_sec;					\
+	(ts)->tv_nsec = (tv)->tv_usec * 1000;				\
+}
+#endif
+
+#ifndef timersub
+#define timersub(tvp, uvp, vvp)					 \
+	do {								\
+		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;	  \
+		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;   \
+		if ((vvp)->tv_usec < 0) {			   \
+			(vvp)->tv_sec--;				\
+			(vvp)->tv_usec += 1000000;		  \
+		}						   \
+	} while (/* CONSTCOND */ 0)
+#endif /* timersub */
+
+
+void bwlimit(int);
+
+/* Struct for addargs */
+arglist args;
+
+/* Bandwidth limit */
+off_t limitbw = 0;
+
+/* Name of current file being transferred. */
+char *curfile;
+
+/* This is set to non-zero to enable verbose mode. */
+int verbose_mode = 0;
+
+#ifdef PROGRESS_METER
+/* This is set to zero if the progressmeter is not desired. */
+int showprogress = 1;
+#endif
+
+/* This is the program to execute for the secured connection. ("ssh" or -S) */
+char *ssh_program = _PATH_SSH_PROGRAM;
+
+/* This is used to store the pid of ssh_program */
+pid_t do_cmd_pid = -1;
+
+static void
+killchild(int signo)
+{
+	if (do_cmd_pid > 1)
+		kill(do_cmd_pid, signo);
+
+	_exit(1);
+}
+
+/*
+ * This function executes the given command as the specified user on the
+ * given host.  This returns < 0 if execution fails, and >= 0 otherwise. This
+ * assigns the input and output file descriptors on success.
+ */
+
+int
+do_cmd(char *host, char *remuser, char *cmd, int *fdin, int *fdout, int argc)
+{
+	int pin[2], pout[2], reserved[2];
+
+	if (verbose_mode)
+		fprintf(stderr,
+		    "Executing: program %s host %s, user %s, command %s\n",
+		    ssh_program, host,
+		    remuser ? remuser : "(unspecified)", cmd);
+
+	/*
+	 * Reserve two descriptors so that the real pipes won't get
+	 * descriptors 0 and 1 because that will screw up dup2 below.
+	 */
+	pipe(reserved);
+
+	/* Create a socket pair for communicating with ssh. */
+	if (pipe(pin) < 0 || pipe(pout) < 0)
+	{
+		fprintf(stderr, "Fatal error: pipe: %s\n", strerror(errno));
+		exit(1);
+	}
+
+	/* Free the reserved descriptors. */
+	close(reserved[0]);
+	close(reserved[1]);
+
+    // uClinux needs to build the args here before vforking,
+    // otherwise we do it later on.
+#ifdef __uClinux__
+	args.list[0] = ssh_program;
+	if (remuser != NULL)
+		addargs(&args, "-l%s", remuser);
+	addargs(&args, "%s", host);
+	addargs(&args, "%s", cmd);
+#endif /* __uClinux__ */
+
+	/* Fork a child to execute the command on the remote host using ssh. */
+#ifdef __uClinux__
+	do_cmd_pid = vfork();
+#else
+	do_cmd_pid = fork();
+#endif /* __uClinux__ */
+	if (do_cmd_pid == 0) {
+		/* Child. */
+		close(pin[1]);
+		close(pout[0]);
+		dup2(pin[0], 0);
+		dup2(pout[1], 1);
+		close(pin[0]);
+		close(pout[1]);
+
+#ifndef __uClinux__
+		args.list[0] = ssh_program;
+		if (remuser != NULL) {
+			addargs(&args, "-l");
+			addargs(&args, "%s", remuser);
+		}
+		addargs(&args, "%s", host);
+		addargs(&args, "%s", cmd);
+#endif
+
+		execvp(ssh_program, args.list);
+		perror(ssh_program);
+		exit(1);
+	} else if (do_cmd_pid == -1) {
+		fprintf(stderr, "Fatal error: fork: %s\n", strerror(errno));
+		exit(1);
+	}
+
+#ifdef __uClinux__
+	/* clean up command */
+	/* pop cmd */
+	free(args->list[--args->num]);
+	args->list[args->num]=NULL;
+	/* pop host */
+	free(args->list[--args->num-1]);
+	args->list[args->num]=NULL;
+	/* pop user */
+	if (remuser != NULL) {
+		free(args->list[--args->num-1]);
+		args->list[args->num]=NULL;
+	}
+#endif /* __uClinux__
+	  
+	/* Parent.  Close the other side, and return the local side. */
+	close(pin[0]);
+	*fdout = pin[1];
+	close(pout[1]);
+	*fdin = pout[0];
+	signal(SIGTERM, killchild);
+	signal(SIGINT, killchild);
+	signal(SIGHUP, killchild);
+	return 0;
+}
+
+typedef struct {
+	int cnt;
+	char *buf;
+} BUF;
+
+BUF *allocbuf(BUF *, int, int);
+void lostconn(int);
+void nospace(void);
+int okname(char *);
+void run_err(const char *,...);
+void verifydir(char *);
+
+struct passwd *pwd;
+uid_t userid;
+int errs, remin, remout;
+int pflag, iamremote, iamrecursive, targetshouldbedirectory;
+
+#define	CMDNEEDS	64
+char cmd[CMDNEEDS];		/* must hold "rcp -r -p -d\0" */
+
+int response(void);
+void rsource(char *, struct stat *);
+void sink(int, char *[]);
+void source(int, char *[]);
+void tolocal(int, char *[]);
+void toremote(char *, int, char *[]);
+void usage(void);
+
+#if defined(DBMULTI_scp) || !defined(DROPBEAR_MULTI)
+#if defined(DBMULTI_scp) && defined(DROPBEAR_MULTI)
+int scp_main(int argc, char **argv)
+#else
+main(int argc, char **argv)
+#endif
+{
+	int ch, fflag, tflag, status;
+	double speed;
+	char *targ, *endp;
+	extern char *optarg;
+	extern int optind;
+
+	args.list = NULL;
+	addargs(&args, "ssh");		/* overwritten with ssh_program */
+	addargs(&args, "-x");
+	addargs(&args, "-oForwardAgent no");
+	addargs(&args, "-oClearAllForwardings yes");
+
+	fflag = tflag = 0;
+	while ((ch = getopt(argc, argv, "dfl:prtvBCc:i:P:q1246S:o:F:")) != -1)
+		switch (ch) {
+		/* User-visible flags. */
+		case '1':
+		case '2':
+		case '4':
+		case '6':
+		case 'C':
+			addargs(&args, "-%c", ch);
+			break;
+		case 'o':
+		case 'c':
+		case 'i':
+		case 'F':
+			addargs(&args, "-%c%s", ch, optarg);
+			break;
+		case 'P':
+			addargs(&args, "-p%s", optarg);
+			break;
+		case 'B':
+			addargs(&args, "-oBatchmode yes");
+			break;
+		case 'l':
+			speed = strtod(optarg, &endp);
+			if (speed <= 0 || *endp != '\0')
+				usage();
+			limitbw = speed * 1024;
+			break;
+		case 'p':
+			pflag = 1;
+			break;
+		case 'r':
+			iamrecursive = 1;
+			break;
+		case 'S':
+			ssh_program = xstrdup(optarg);
+			break;
+		case 'v':
+			addargs(&args, "-v");
+			verbose_mode = 1;
+			break;
+#ifdef PROGRESS_METER
+		case 'q':
+			showprogress = 0;
+			break;
+#endif
+
+		/* Server options. */
+		case 'd':
+			targetshouldbedirectory = 1;
+			break;
+		case 'f':	/* "from" */
+			iamremote = 1;
+			fflag = 1;
+			break;
+		case 't':	/* "to" */
+			iamremote = 1;
+			tflag = 1;
+#ifdef HAVE_CYGWIN
+			setmode(0, O_BINARY);
+#endif
+			break;
+		default:
+			usage();
+		}
+	argc -= optind;
+	argv += optind;
+
+	if ((pwd = getpwuid(userid = getuid())) == NULL) {
+		fprintf(stderr, "unknown user %u", (u_int) userid);
+	}
+
+#ifdef PROGRESS_METER
+	if (!isatty(STDERR_FILENO))
+		showprogress = 0;
+#endif
+
+	remin = STDIN_FILENO;
+	remout = STDOUT_FILENO;
+
+	if (fflag) {
+		/* Follow "protocol", send data. */
+		(void) response();
+		source(argc, argv);
+		exit(errs != 0);
+	}
+	if (tflag) {
+		/* Receive data. */
+		sink(argc, argv);
+		exit(errs != 0);
+	}
+	if (argc < 2)
+		usage();
+	if (argc > 2)
+		targetshouldbedirectory = 1;
+
+	remin = remout = -1;
+	do_cmd_pid = -1;
+	/* Command to be executed on remote system using "ssh". */
+	(void) snprintf(cmd, sizeof cmd, "scp%s%s%s%s",
+	    verbose_mode ? " -v" : "",
+	    iamrecursive ? " -r" : "", pflag ? " -p" : "",
+	    targetshouldbedirectory ? " -d" : "");
+
+	(void) signal(SIGPIPE, lostconn);
+
+	if ((targ = colon(argv[argc - 1])))	/* Dest is remote host. */
+		toremote(targ, argc, argv);
+	else {
+		tolocal(argc, argv);	/* Dest is local host. */
+		if (targetshouldbedirectory)
+			verifydir(argv[argc - 1]);
+	}
+	/*
+	 * Finally check the exit status of the ssh process, if one was forked
+	 * and no error has occured yet
+	 */
+	if (do_cmd_pid != -1 && errs == 0) {
+		if (remin != -1)
+		    (void) close(remin);
+		if (remout != -1)
+		    (void) close(remout);
+		if (waitpid(do_cmd_pid, &status, 0) == -1)
+			errs = 1;
+		else {
+			if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
+				errs = 1;
+		}
+	}
+	exit(errs != 0);
+}
+#endif /* DBMULTI stuff */
+
+void
+toremote(char *targ, int argc, char **argv)
+{
+	int i, len;
+	char *bp, *host, *src, *suser, *thost, *tuser;
+
+	*targ++ = 0;
+	if (*targ == 0)
+		targ = ".";
+
+	if ((thost = strrchr(argv[argc - 1], '@'))) {
+		/* user@host */
+		*thost++ = 0;
+		tuser = argv[argc - 1];
+		if (*tuser == '\0')
+			tuser = NULL;
+	} else {
+		thost = argv[argc - 1];
+		tuser = NULL;
+	}
+
+	for (i = 0; i < argc - 1; i++) {
+		src = colon(argv[i]);
+		if (src) {	/* remote to remote */
+			static char *ssh_options =
+			    "-x -o'ClearAllForwardings yes'";
+			*src++ = 0;
+			if (*src == 0)
+				src = ".";
+			host = strrchr(argv[i], '@');
+			len = strlen(ssh_program) + strlen(argv[i]) +
+			    strlen(src) + (tuser ? strlen(tuser) : 0) +
+			    strlen(thost) + strlen(targ) +
+			    strlen(ssh_options) + CMDNEEDS + 20;
+			bp = xmalloc(len);
+			if (host) {
+				*host++ = 0;
+				host = cleanhostname(host);
+				suser = argv[i];
+				if (*suser == '\0')
+					suser = pwd->pw_name;
+				else if (!okname(suser)) {
+					xfree(bp);
+					continue;
+				}
+				if (tuser && !okname(tuser)) {
+					xfree(bp);
+					continue;
+				}
+				snprintf(bp, len,
+				    "%s%s %s -n "
+				    "-l %s %s %s %s '%s%s%s:%s'",
+				    ssh_program, verbose_mode ? " -v" : "",
+				    ssh_options, suser, host, cmd, src,
+				    tuser ? tuser : "", tuser ? "@" : "",
+				    thost, targ);
+			} else {
+				host = cleanhostname(argv[i]);
+				snprintf(bp, len,
+				    "exec %s%s %s -n %s "
+				    "%s %s '%s%s%s:%s'",
+				    ssh_program, verbose_mode ? " -v" : "",
+				    ssh_options, host, cmd, src,
+				    tuser ? tuser : "", tuser ? "@" : "",
+				    thost, targ);
+			}
+			if (verbose_mode)
+				fprintf(stderr, "Executing: %s\n", bp);
+			(void) system(bp);
+			(void) xfree(bp);
+		} else {	/* local to remote */
+			if (remin == -1) {
+				len = strlen(targ) + CMDNEEDS + 20;
+				bp = xmalloc(len);
+				(void) snprintf(bp, len, "%s -t %s", cmd, targ);
+				host = cleanhostname(thost);
+				if (do_cmd(host, tuser, bp, &remin,
+				    &remout, argc) < 0)
+					exit(1);
+				if (response() < 0)
+					exit(1);
+				(void) xfree(bp);
+			}
+			source(1, argv + i);
+		}
+	}
+}
+
+void
+tolocal(int argc, char **argv)
+{
+	int i, len;
+	char *bp, *host, *src, *suser;
+
+	for (i = 0; i < argc - 1; i++) {
+		if (!(src = colon(argv[i]))) {	/* Local to local. */
+			len = strlen(_PATH_CP) + strlen(argv[i]) +
+			    strlen(argv[argc - 1]) + 20;
+			bp = xmalloc(len);
+			(void) snprintf(bp, len, "exec %s%s%s %s %s", _PATH_CP,
+			    iamrecursive ? " -r" : "", pflag ? " -p" : "",
+			    argv[i], argv[argc - 1]);
+			if (verbose_mode)
+				fprintf(stderr, "Executing: %s\n", bp);
+			if (system(bp))
+				++errs;
+			(void) xfree(bp);
+			continue;
+		}
+		*src++ = 0;
+		if (*src == 0)
+			src = ".";
+		if ((host = strrchr(argv[i], '@')) == NULL) {
+			host = argv[i];
+			suser = NULL;
+		} else {
+			*host++ = 0;
+			suser = argv[i];
+			if (*suser == '\0')
+				suser = pwd->pw_name;
+		}
+		host = cleanhostname(host);
+		len = strlen(src) + CMDNEEDS + 20;
+		bp = xmalloc(len);
+		(void) snprintf(bp, len, "%s -f %s", cmd, src);
+		if (do_cmd(host, suser, bp, &remin, &remout, argc) < 0) {
+			(void) xfree(bp);
+			++errs;
+			continue;
+		}
+		xfree(bp);
+		sink(1, argv + argc - 1);
+		(void) close(remin);
+		remin = remout = -1;
+	}
+}
+
+void
+source(int argc, char **argv)
+{
+	struct stat stb;
+	static BUF buffer;
+	BUF *bp;
+	off_t i, amt, result, statbytes;
+	int fd, haderr, indx;
+	char *last, *name, buf[2048];
+	int len;
+
+	for (indx = 0; indx < argc; ++indx) {
+		name = argv[indx];
+		statbytes = 0;
+		len = strlen(name);
+		while (len > 1 && name[len-1] == '/')
+			name[--len] = '\0';
+		if (strchr(name, '\n') != NULL) {
+			run_err("%s: skipping, filename contains a newline",
+			    name);
+			goto next;
+		}
+		if ((fd = open(name, O_RDONLY, 0)) < 0)
+			goto syserr;
+		if (fstat(fd, &stb) < 0) {
+syserr:			run_err("%s: %s", name, strerror(errno));
+			goto next;
+		}
+		switch (stb.st_mode & S_IFMT) {
+		case S_IFREG:
+			break;
+		case S_IFDIR:
+			if (iamrecursive) {
+				rsource(name, &stb);
+				goto next;
+			}
+			/* FALLTHROUGH */
+		default:
+			run_err("%s: not a regular file", name);
+			goto next;
+		}
+		if ((last = strrchr(name, '/')) == NULL)
+			last = name;
+		else
+			++last;
+		curfile = last;
+		if (pflag) {
+			/*
+			 * Make it compatible with possible future
+			 * versions expecting microseconds.
+			 */
+			(void) snprintf(buf, sizeof buf, "T%lu 0 %lu 0\n",
+			    (u_long) stb.st_mtime,
+			    (u_long) stb.st_atime);
+			(void) atomicio(vwrite, remout, buf, strlen(buf));
+			if (response() < 0)
+				goto next;
+		}
+#define	FILEMODEMASK	(S_ISUID|S_ISGID|S_IRWXU|S_IRWXG|S_IRWXO)
+		snprintf(buf, sizeof buf, "C%04o %lld %s\n",
+		    (u_int) (stb.st_mode & FILEMODEMASK),
+		    (int64_t)stb.st_size, last);
+		if (verbose_mode) {
+			fprintf(stderr, "Sending file modes: %s", buf);
+		}
+		(void) atomicio(vwrite, remout, buf, strlen(buf));
+		if (response() < 0)
+			goto next;
+		if ((bp = allocbuf(&buffer, fd, 2048)) == NULL) {
+next:			(void) close(fd);
+			continue;
+		}
+#ifdef PROGRESS_METER
+		if (showprogress)
+			start_progress_meter(curfile, stb.st_size, &statbytes);
+#endif
+		/* Keep writing after an error so that we stay sync'd up. */
+		for (haderr = i = 0; i < stb.st_size; i += bp->cnt) {
+			amt = bp->cnt;
+			if (i + amt > stb.st_size)
+				amt = stb.st_size - i;
+			if (!haderr) {
+				result = atomicio(read, fd, bp->buf, amt);
+				if (result != amt)
+					haderr = result >= 0 ? EIO : errno;
+			}
+			if (haderr)
+				(void) atomicio(vwrite, remout, bp->buf, amt);
+			else {
+				result = atomicio(vwrite, remout, bp->buf, amt);
+				if (result != amt)
+					haderr = result >= 0 ? EIO : errno;
+				statbytes += result;
+			}
+			if (limitbw)
+				bwlimit(amt);
+		}
+#ifdef PROGRESS_METER
+		if (showprogress)
+			stop_progress_meter();
+#endif
+
+		if (close(fd) < 0 && !haderr)
+			haderr = errno;
+		if (!haderr)
+			(void) atomicio(vwrite, remout, "", 1);
+		else
+			run_err("%s: %s", name, strerror(haderr));
+		(void) response();
+	}
+}
+
+void
+rsource(char *name, struct stat *statp)
+{
+	DIR *dirp;
+	struct dirent *dp;
+	char *last, *vect[1], path[1100];
+
+	if (!(dirp = opendir(name))) {
+		run_err("%s: %s", name, strerror(errno));
+		return;
+	}
+	last = strrchr(name, '/');
+	if (last == 0)
+		last = name;
+	else
+		last++;
+	if (pflag) {
+		(void) snprintf(path, sizeof(path), "T%lu 0 %lu 0\n",
+		    (u_long) statp->st_mtime,
+		    (u_long) statp->st_atime);
+		(void) atomicio(vwrite, remout, path, strlen(path));
+		if (response() < 0) {
+			closedir(dirp);
+			return;
+		}
+	}
+	(void) snprintf(path, sizeof path, "D%04o %d %.1024s\n",
+	    (u_int) (statp->st_mode & FILEMODEMASK), 0, last);
+	if (verbose_mode)
+		fprintf(stderr, "Entering directory: %s", path);
+	(void) atomicio(vwrite, remout, path, strlen(path));
+	if (response() < 0) {
+		closedir(dirp);
+		return;
+	}
+	while ((dp = readdir(dirp)) != NULL) {
+		if (dp->d_ino == 0)
+			continue;
+		if (!strcmp(dp->d_name, ".") || !strcmp(dp->d_name, ".."))
+			continue;
+		if (strlen(name) + 1 + strlen(dp->d_name) >= sizeof(path) - 1) {
+			run_err("%s/%s: name too long", name, dp->d_name);
+			continue;
+		}
+		(void) snprintf(path, sizeof path, "%s/%s", name, dp->d_name);
+		vect[0] = path;
+		source(1, vect);
+	}
+	(void) closedir(dirp);
+	(void) atomicio(vwrite, remout, "E\n", 2);
+	(void) response();
+}
+
+void
+bwlimit(int amount)
+{
+	static struct timeval bwstart, bwend;
+	static int lamt, thresh = 16384;
+	uint64_t wait;
+	struct timespec ts, rm;
+
+	if (!timerisset(&bwstart)) {
+		gettimeofday(&bwstart, NULL);
+		return;
+	}
+
+	lamt += amount;
+	if (lamt < thresh)
+		return;
+
+	gettimeofday(&bwend, NULL);
+	timersub(&bwend, &bwstart, &bwend);
+	if (!timerisset(&bwend))
+		return;
+
+	lamt *= 8;
+	wait = (double)1000000L * lamt / limitbw;
+
+	bwstart.tv_sec = wait / 1000000L;
+	bwstart.tv_usec = wait % 1000000L;
+
+	if (timercmp(&bwstart, &bwend, >)) {
+		timersub(&bwstart, &bwend, &bwend);
+
+		/* Adjust the wait time */
+		if (bwend.tv_sec) {
+			thresh /= 2;
+			if (thresh < 2048)
+				thresh = 2048;
+		} else if (bwend.tv_usec < 100) {
+			thresh *= 2;
+			if (thresh > 32768)
+				thresh = 32768;
+		}
+
+		TIMEVAL_TO_TIMESPEC(&bwend, &ts);
+		while (nanosleep(&ts, &rm) == -1) {
+			if (errno != EINTR)
+				break;
+			ts = rm;
+		}
+	}
+
+	lamt = 0;
+	gettimeofday(&bwstart, NULL);
+}
+
+void
+sink(int argc, char **argv)
+{
+	static BUF buffer;
+	struct stat stb;
+	enum {
+		YES, NO, DISPLAYED
+	} wrerr;
+	BUF *bp;
+	off_t i, j;
+	int amt, count, exists, first, mask, mode, ofd, omode;
+	off_t size, statbytes;
+	int setimes, targisdir, wrerrno = 0;
+	char ch, *cp, *np, *targ, *why, *vect[1], buf[2048];
+	struct timeval tv[2];
+
+#define	atime	tv[0]
+#define	mtime	tv[1]
+#define	SCREWUP(str)	do { why = str; goto screwup; } while (0)
+
+	setimes = targisdir = 0;
+	mask = umask(0);
+	if (!pflag)
+		(void) umask(mask);
+	if (argc != 1) {
+		run_err("ambiguous target");
+		exit(1);
+	}
+	targ = *argv;
+	if (targetshouldbedirectory)
+		verifydir(targ);
+
+	(void) atomicio(vwrite, remout, "", 1);
+	if (stat(targ, &stb) == 0 && S_ISDIR(stb.st_mode))
+		targisdir = 1;
+	for (first = 1;; first = 0) {
+		cp = buf;
+		if (atomicio(read, remin, cp, 1) <= 0)
+			return;
+		if (*cp++ == '\n')
+			SCREWUP("unexpected <newline>");
+		do {
+			if (atomicio(read, remin, &ch, sizeof(ch)) != sizeof(ch))
+				SCREWUP("lost connection");
+			*cp++ = ch;
+		} while (cp < &buf[sizeof(buf) - 1] && ch != '\n');
+		*cp = 0;
+
+		if (buf[0] == '\01' || buf[0] == '\02') {
+			if (iamremote == 0)
+				(void) atomicio(vwrite, STDERR_FILENO,
+				    buf + 1, strlen(buf + 1));
+			if (buf[0] == '\02')
+				exit(1);
+			++errs;
+			continue;
+		}
+		if (buf[0] == 'E') {
+			(void) atomicio(vwrite, remout, "", 1);
+			return;
+		}
+		if (ch == '\n')
+			*--cp = 0;
+
+		cp = buf;
+		if (*cp == 'T') {
+			setimes++;
+			cp++;
+			mtime.tv_sec = strtol(cp, &cp, 10);
+			if (!cp || *cp++ != ' ')
+				SCREWUP("mtime.sec not delimited");
+			mtime.tv_usec = strtol(cp, &cp, 10);
+			if (!cp || *cp++ != ' ')
+				SCREWUP("mtime.usec not delimited");
+			atime.tv_sec = strtol(cp, &cp, 10);
+			if (!cp || *cp++ != ' ')
+				SCREWUP("atime.sec not delimited");
+			atime.tv_usec = strtol(cp, &cp, 10);
+			if (!cp || *cp++ != '\0')
+				SCREWUP("atime.usec not delimited");
+			(void) atomicio(vwrite, remout, "", 1);
+			continue;
+		}
+		if (*cp != 'C' && *cp != 'D') {
+			/*
+			 * Check for the case "rcp remote:foo\* local:bar".
+			 * In this case, the line "No match." can be returned
+			 * by the shell before the rcp command on the remote is
+			 * executed so the ^Aerror_message convention isn't
+			 * followed.
+			 */
+			if (first) {
+				run_err("%s", cp);
+				exit(1);
+			}
+			SCREWUP("expected control record");
+		}
+		mode = 0;
+		for (++cp; cp < buf + 5; cp++) {
+			if (*cp < '0' || *cp > '7')
+				SCREWUP("bad mode");
+			mode = (mode << 3) | (*cp - '0');
+		}
+		if (*cp++ != ' ')
+			SCREWUP("mode not delimited");
+
+		for (size = 0; isdigit(*cp);)
+			size = size * 10 + (*cp++ - '0');
+		if (*cp++ != ' ')
+			SCREWUP("size not delimited");
+		if (targisdir) {
+			static char *namebuf;
+			static int cursize;
+			size_t need;
+
+			need = strlen(targ) + strlen(cp) + 250;
+			if (need > cursize) {
+				if (namebuf)
+					xfree(namebuf);
+				namebuf = xmalloc(need);
+				cursize = need;
+			}
+			(void) snprintf(namebuf, need, "%s%s%s", targ,
+			    strcmp(targ, "/") ? "/" : "", cp);
+			np = namebuf;
+		} else
+			np = targ;
+		curfile = cp;
+		exists = stat(np, &stb) == 0;
+		if (buf[0] == 'D') {
+			int mod_flag = pflag;
+			if (exists) {
+				if (!S_ISDIR(stb.st_mode)) {
+					errno = ENOTDIR;
+					goto bad;
+				}
+				if (pflag)
+					(void) chmod(np, mode);
+			} else {
+				/* Handle copying from a read-only
+				   directory */
+				mod_flag = 1;
+				if (mkdir(np, mode | S_IRWXU) < 0)
+					goto bad;
+			}
+			vect[0] = xstrdup(np);
+			sink(1, vect);
+			if (setimes) {
+				setimes = 0;
+				if (utimes(vect[0], tv) < 0)
+					run_err("%s: set times: %s",
+					    vect[0], strerror(errno));
+			}
+			if (mod_flag)
+				(void) chmod(vect[0], mode);
+			if (vect[0])
+				xfree(vect[0]);
+			continue;
+		}
+		omode = mode;
+		mode |= S_IWRITE;
+		if ((ofd = open(np, O_WRONLY|O_CREAT, mode)) < 0) {
+bad:			run_err("%s: %s", np, strerror(errno));
+			continue;
+		}
+		(void) atomicio(vwrite, remout, "", 1);
+		if ((bp = allocbuf(&buffer, ofd, 4096)) == NULL) {
+			(void) close(ofd);
+			continue;
+		}
+		cp = bp->buf;
+		wrerr = NO;
+
+		statbytes = 0;
+#ifdef PROGRESS_METER
+		if (showprogress)
+			start_progress_meter(curfile, size, &statbytes);
+#endif
+		for (count = i = 0; i < size; i += 4096) {
+			amt = 4096;
+			if (i + amt > size)
+				amt = size - i;
+			count += amt;
+			do {
+				j = read(remin, cp, amt);
+				if (j == -1 && (errno == EINTR ||
+				    errno == EAGAIN)) {
+					continue;
+				} else if (j <= 0) {
+					run_err("%s", j ? strerror(errno) :
+					    "dropped connection");
+					exit(1);
+				}
+				amt -= j;
+				cp += j;
+				statbytes += j;
+			} while (amt > 0);
+		
+			if (limitbw)
+				bwlimit(4096);
+
+			if (count == bp->cnt) {
+				/* Keep reading so we stay sync'd up. */
+				if (wrerr == NO) {
+					j = atomicio(vwrite, ofd, bp->buf, count);
+					if (j != count) {
+						wrerr = YES;
+						wrerrno = j >= 0 ? EIO : errno;
+					}
+				}
+				count = 0;
+				cp = bp->buf;
+			}
+		}
+#ifdef PROGRESS_METER
+		if (showprogress)
+			stop_progress_meter();
+#endif
+		if (count != 0 && wrerr == NO &&
+		    (j = atomicio(vwrite, ofd, bp->buf, count)) != count) {
+			wrerr = YES;
+			wrerrno = j >= 0 ? EIO : errno;
+		}
+		if (wrerr == NO && ftruncate(ofd, size) != 0) {
+			run_err("%s: truncate: %s", np, strerror(errno));
+			wrerr = DISPLAYED;
+		}
+		if (pflag) {
+			if (exists || omode != mode)
+#ifdef HAVE_FCHMOD
+				if (fchmod(ofd, omode))
+#else /* HAVE_FCHMOD */
+				if (chmod(np, omode))
+#endif /* HAVE_FCHMOD */
+					run_err("%s: set mode: %s",
+					    np, strerror(errno));
+		} else {
+			if (!exists && omode != mode)
+#ifdef HAVE_FCHMOD
+				if (fchmod(ofd, omode & ~mask))
+#else /* HAVE_FCHMOD */
+				if (chmod(np, omode & ~mask))
+#endif /* HAVE_FCHMOD */
+					run_err("%s: set mode: %s",
+					    np, strerror(errno));
+		}
+		if (close(ofd) == -1) {
+			wrerr = YES;
+			wrerrno = errno;
+		}
+		(void) response();
+		if (setimes && wrerr == NO) {
+			setimes = 0;
+			if (utimes(np, tv) < 0) {
+				run_err("%s: set times: %s",
+				    np, strerror(errno));
+				wrerr = DISPLAYED;
+			}
+		}
+		switch (wrerr) {
+		case YES:
+			run_err("%s: %s", np, strerror(wrerrno));
+			break;
+		case NO:
+			(void) atomicio(vwrite, remout, "", 1);
+			break;
+		case DISPLAYED:
+			break;
+		}
+	}
+screwup:
+	run_err("protocol error: %s", why);
+	exit(1);
+}
+
+int
+response(void)
+{
+	char ch, *cp, resp, rbuf[2048];
+
+	if (atomicio(read, remin, &resp, sizeof(resp)) != sizeof(resp))
+		lostconn(0);
+
+	cp = rbuf;
+	switch (resp) {
+	case 0:		/* ok */
+		return (0);
+	default:
+		*cp++ = resp;
+		/* FALLTHROUGH */
+	case 1:		/* error, followed by error msg */
+	case 2:		/* fatal error, "" */
+		do {
+			if (atomicio(read, remin, &ch, sizeof(ch)) != sizeof(ch))
+				lostconn(0);
+			*cp++ = ch;
+		} while (cp < &rbuf[sizeof(rbuf) - 1] && ch != '\n');
+
+		if (!iamremote)
+			(void) atomicio(vwrite, STDERR_FILENO, rbuf, cp - rbuf);
+		++errs;
+		if (resp == 1)
+			return (-1);
+		exit(1);
+	}
+	/* NOTREACHED */
+}
+
+void
+usage(void)
+{
+	(void) fprintf(stderr,
+	    "usage: scp [-pqrvBC1246] [-F config] [-S program] [-P port]\n"
+	    "           [-c cipher] [-i identity] [-l limit] [-o option]\n"
+	    "           [[user@]host1:]file1 [...] [[user@]host2:]file2\n");
+	exit(1);
+}
+
+void
+run_err(const char *fmt,...)
+{
+	static FILE *fp;
+	va_list ap;
+
+	++errs;
+	if (fp == NULL && !(fp = fdopen(remout, "w")))
+		return;
+	(void) fprintf(fp, "%c", 0x01);
+	(void) fprintf(fp, "scp: ");
+	va_start(ap, fmt);
+	(void) vfprintf(fp, fmt, ap);
+	va_end(ap);
+	(void) fprintf(fp, "\n");
+	(void) fflush(fp);
+
+	if (!iamremote) {
+		va_start(ap, fmt);
+		vfprintf(stderr, fmt, ap);
+		va_end(ap);
+		fprintf(stderr, "\n");
+	}
+}
+
+void
+verifydir(char *cp)
+{
+	struct stat stb;
+
+	if (!stat(cp, &stb)) {
+		if (S_ISDIR(stb.st_mode))
+			return;
+		errno = ENOTDIR;
+	}
+	run_err("%s: %s", cp, strerror(errno));
+	exit(1);
+}
+
+int
+okname(char *cp0)
+{
+	int c;
+	char *cp;
+
+	cp = cp0;
+	do {
+		c = (int)*cp;
+		if (c & 0200)
+			goto bad;
+		if (!isalpha(c) && !isdigit(c)) {
+			switch (c) {
+			case '\'':
+			case '"':
+			case '`':
+			case ' ':
+			case '#':
+				goto bad;
+			default:
+				break;
+			}
+		}
+	} while (*++cp);
+	return (1);
+
+bad:	fprintf(stderr, "%s: invalid user name\n", cp0);
+	return (0);
+}
+
+BUF *
+allocbuf(BUF *bp, int fd, int blksize)
+{
+	size_t size;
+#ifdef HAVE_STRUCT_STAT_ST_BLKSIZE
+	struct stat stb;
+
+	if (fstat(fd, &stb) < 0) {
+		run_err("fstat: %s", strerror(errno));
+		return (0);
+	}
+	size = roundup(stb.st_blksize, blksize);
+	if (size == 0)
+		size = blksize;
+#else /* HAVE_STRUCT_STAT_ST_BLKSIZE */
+	size = blksize;
+#endif /* HAVE_STRUCT_STAT_ST_BLKSIZE */
+	if (bp->cnt >= size)
+		return (bp);
+	if (bp->buf == NULL)
+		bp->buf = xmalloc(size);
+	else
+		bp->buf = xrealloc(bp->buf, size);
+	memset(bp->buf, 0, size);
+	bp->cnt = size;
+	return (bp);
+}
+
+void
+lostconn(int signo)
+{
+	if (!iamremote)
+		write(STDERR_FILENO, "lost connection\n", 16);
+	if (signo)
+		_exit(1);
+	else
+		exit(1);
+}
diff --git a/scpmisc.c b/scpmisc.c
new file mode 100644
index 0000000..e15bf4d
--- /dev/null
+++ b/scpmisc.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2000 Markus Friedl.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*RCSID("OpenBSD: misc.c,v 1.22 2003/09/18 08:49:45 markus Exp ");*/
+
+/* For xmalloc, xfree etc:
+ * Author: Tatu Ylonen <ylo@cs.hut.fi>
+ * Copyright (c) 1995 Tatu Ylonen <ylo@cs.hut.fi>, Espoo, Finland
+ *                    All rights reserved
+ * Versions of malloc and friends that check their results, and never return
+ * failure (they call fatal if they encounter an error).
+ *
+ * As far as I am concerned, the code I have written for this software
+ * can be used freely for any purpose.  Any derived versions of this
+ * software must be clearly marked as such, and if the derived work is
+ * incompatible with the protocol description in the RFC file, it must be
+ * called by a name other than "ssh" or "Secure Shell".
+ */
+
+/*RCSID("OpenBSD: xmalloc.c,v 1.16 2001/07/23 18:21:46 stevesk Exp ");*/
+
+#include "includes.h"
+#include "scpmisc.h"
+
+void *
+xmalloc(size_t size)
+{
+	void *ptr;
+
+	if (size == 0) {
+		fprintf(stderr, "xmalloc: zero size\n");
+		exit(EXIT_FAILURE);
+	}
+	ptr = malloc(size);
+	if (ptr == NULL) {
+		fprintf(stderr, "xmalloc: out of memory (allocating %lu bytes)\n", (u_long) size);
+		exit(EXIT_FAILURE);
+	}
+	return ptr;
+}
+
+void *
+xrealloc(void *ptr, size_t new_size)
+{
+	void *new_ptr;
+
+	if (new_size == 0) {
+		fprintf(stderr, "xrealloc: zero size\n");
+		exit(EXIT_FAILURE);
+	}
+	if (ptr == NULL)
+		new_ptr = malloc(new_size);
+	else
+		new_ptr = realloc(ptr, new_size);
+	if (new_ptr == NULL) {
+		fprintf(stderr, "xrealloc: out of memory (new_size %lu bytes)\n", (u_long) new_size);
+		exit(EXIT_FAILURE);
+	}
+	return new_ptr;
+}
+
+void
+xfree(void *ptr)
+{
+	if (ptr == NULL) {
+		fprintf(stderr, "xfree: NULL pointer given as argument\n");
+		exit(EXIT_FAILURE);
+	}
+	free(ptr);
+}
+
+char *
+xstrdup(const char *str)
+{
+	size_t len;
+	char *cp;
+
+	len = strlen(str) + 1;
+	cp = xmalloc(len);
+	strncpy(cp, str, len);
+	return cp;
+}
+
+char *
+cleanhostname(char *host)
+{
+	if (*host == '[' && host[strlen(host) - 1] == ']') {
+		host[strlen(host) - 1] = '\0';
+		return (host + 1);
+	} else
+		return host;
+}
+
+char *
+colon(char *cp)
+{
+	int flag = 0;
+
+	if (*cp == ':')		/* Leading colon is part of file name. */
+		return (0);
+	if (*cp == '[')
+		flag = 1;
+
+	for (; *cp; ++cp) {
+		if (*cp == '@' && *(cp+1) == '[')
+			flag = 1;
+		if (*cp == ']' && *(cp+1) == ':' && flag)
+			return (cp+1);
+		if (*cp == ':' && !flag)
+			return (cp);
+		if (*cp == '/')
+			return (0);
+	}
+	return (0);
+}
+
+/* function to assist building execv() arguments */
+void
+addargs(arglist *args, char *fmt, ...)
+{
+	va_list ap;
+	char buf[1024];
+	int nalloc;
+
+	va_start(ap, fmt);
+	vsnprintf(buf, sizeof(buf), fmt, ap);
+	va_end(ap);
+
+	nalloc = args->nalloc;
+	if (args->list == NULL) {
+		nalloc = 32;
+		args->num = 0;
+	} else if (args->num+2 >= nalloc)
+		nalloc *= 2;
+
+	args->list = xrealloc(args->list, nalloc * sizeof(char *));
+	args->nalloc = nalloc;
+	args->list[args->num++] = xstrdup(buf);
+	args->list[args->num] = NULL;
+}
diff --git a/scpmisc.h b/scpmisc.h
new file mode 100644
index 0000000..b6059c6
--- /dev/null
+++ b/scpmisc.h
@@ -0,0 +1,44 @@
+/*	$OpenBSD: misc.h,v 1.12 2002/03/19 10:49:35 markus Exp $	*/
+
+/*
+ * Author: Tatu Ylonen <ylo@cs.hut.fi>
+ * Copyright (c) 1995 Tatu Ylonen <ylo@cs.hut.fi>, Espoo, Finland
+ *                    All rights reserved
+ *
+ * As far as I am concerned, the code I have written for this software
+ * can be used freely for any purpose.  Any derived versions of this
+ * software must be clearly marked as such, and if the derived work is
+ * incompatible with the protocol description in the RFC file, it must be
+ * called by a name other than "ssh" or "Secure Shell".
+ */
+
+/* actually from atomicio, but is only used in scp code */
+#define vwrite (ssize_t (*)(int, void *, size_t))write
+
+char	*chop(char *);
+char	*strdelim(char **);
+void	 set_nonblock(int);
+void	 unset_nonblock(int);
+void	 set_nodelay(int);
+int	 a2port(const char *);
+char	*cleanhostname(char *);
+char	*colon(char *);
+long	 convtime(const char *);
+
+struct passwd *pwcopy(struct passwd *);
+
+typedef struct arglist arglist;
+struct arglist {
+	char    **list;
+	int     num;
+	int     nalloc;
+};
+void	 addargs(arglist *, char *, ...);
+
+/* from xmalloc.h */
+void	*xmalloc(size_t);
+void	*xrealloc(void *, size_t);
+void     xfree(void *);
+char	*xstrdup(const char *);
+
+
diff --git a/service.h b/service.h
new file mode 100644
index 0000000..197d8d1
--- /dev/null
+++ b/service.h
@@ -0,0 +1,32 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _SERVICE_H_
+#define _SERVICE_H_
+
+void recv_msg_service_request(); /* Server */
+void send_msg_service_request(); /* Client */
+void recv_msg_service_accept(); /* Client */
+
+#endif /* _SERVICE_H_ */
diff --git a/session.h b/session.h
new file mode 100644
index 0000000..a4ad45f
--- /dev/null
+++ b/session.h
@@ -0,0 +1,257 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _SESSION_H_
+#define _SESSION_H_
+
+#include "includes.h"
+#include "options.h"
+#include "buffer.h"
+#include "signkey.h"
+#include "kex.h"
+#include "auth.h"
+#include "channel.h"
+#include "queue.h"
+#include "listener.h"
+#include "packet.h"
+#include "tcpfwd.h"
+#include "chansession.h"
+
+extern int sessinitdone; /* Is set to 0 somewhere */
+extern int exitflag;
+
+void common_session_init(int sock, char* remotehost);
+void session_loop(void(*loophandler)());
+void common_session_cleanup();
+void session_identification();
+
+
+/* Server */
+void svr_session(int sock, int childpipe, char *remotehost, char *addrstring);
+void svr_dropbear_exit(int exitcode, const char* format, va_list param);
+void svr_dropbear_log(int priority, const char* format, va_list param);
+
+/* Client */
+void cli_session(int sock, char *remotehost);
+void cli_session_cleanup();
+void cleantext(unsigned char* dirtytext);
+
+struct key_context {
+
+	const struct dropbear_cipher *recv_algo_crypt; /* NULL for none */
+	const struct dropbear_cipher *trans_algo_crypt; /* NULL for none */
+	const struct dropbear_hash *recv_algo_mac; /* NULL for none */
+	const struct dropbear_hash *trans_algo_mac; /* NULL for none */
+	char algo_kex;
+	char algo_hostkey;
+
+	char recv_algo_comp; /* compression */
+	char trans_algo_comp;
+#ifndef DISABLE_ZLIB
+	z_streamp recv_zstream;
+	z_streamp trans_zstream;
+#endif
+
+	/* actual keys */
+	symmetric_CBC recv_symmetric_struct;
+	symmetric_CBC trans_symmetric_struct;
+	unsigned char recvmackey[MAX_MAC_KEY];
+	unsigned char transmackey[MAX_MAC_KEY];
+
+};
+
+struct sshsession {
+
+	/* Is it a client or server? */
+	unsigned char isserver;
+
+	long connecttimeout; /* time to disconnect if we have a timeout (for
+							userauth etc), or 0 for no timeout */
+
+	int sock;
+
+	unsigned char *remotehost; /* the peer hostname */
+
+	unsigned char *remoteident;
+
+	int maxfd; /* the maximum file descriptor to check with select() */
+
+
+	/* Packet buffers/values etc */
+	buffer *writepayload; /* Unencrypted payload to write - this is used
+							 throughout the code, as handlers fill out this
+							 buffer with the packet to send. */
+	struct Queue writequeue; /* A queue of encrypted packets to send */
+	buffer *readbuf; /* Encrypted */
+	buffer *decryptreadbuf; /* Post-decryption */
+	buffer *payload; /* Post-decompression, the actual SSH packet */
+	unsigned int transseq, recvseq; /* Sequence IDs */
+
+	/* Packet-handling flags */
+	const packettype * packettypes; /* Packet handler mappings for this
+										session, see process-packet.c */
+
+	unsigned dataallowed : 1; /* whether we can send data packets or we are in
+								 the middle of a KEX or something */
+
+	unsigned char requirenext; /* byte indicating what packet we require next, 
+								or 0x00 for any */
+
+	unsigned char ignorenext; /* whether to ignore the next packet,
+								 used for kex_follows stuff */
+
+	unsigned char lastpacket; /* What the last received packet type was */
+	
+
+
+	/* KEX/encryption related */
+	struct KEXState kexstate;
+	struct key_context *keys;
+	struct key_context *newkeys;
+	unsigned char *session_id; /* this is the hash from the first kex */
+	/* The below are used temorarily during kex, are freed after use */
+	mp_int * dh_K; /* SSH_MSG_KEXDH_REPLY and sending SSH_MSH_NEWKEYS */
+	unsigned char hash[SHA1_HASH_SIZE]; /* the hash*/
+	buffer* kexhashbuf; /* session hash buffer calculated from various packets*/
+	buffer* transkexinit; /* the kexinit packet we send should be kept so we
+							 can add it to the hash when generating keys */
+
+	algo_type*(*buf_match_algo)(buffer*buf, algo_type localalgos[],
+			int *goodguess); /* The function to use to choose which algorithm
+								to use from the ones presented by the remote
+								side. Is specific to the client/server mode,
+								hence the function-pointer callback.*/
+
+	void(*remoteclosed)(); /* A callback to handle closure of the
+									  remote connection */
+
+
+	struct AuthState authstate; /* Common amongst client and server, since most
+								   struct elements are common */
+
+	/* Channel related */
+	struct Channel ** channels; /* these pointers may be null */
+	unsigned int chansize; /* the number of Channel*s allocated for channels */
+	unsigned int chancount; /* the number of Channel*s in use */
+	const struct ChanType **chantypes; /* The valid channel types */
+
+	
+	/* TCP forwarding - where manage listeners */
+	struct Listener ** listeners;
+	unsigned int listensize;
+
+	/* Whether to allow binding to privileged ports (<1024). This doesn't
+	 * really belong here, but nowhere else fits nicely */
+	int allowprivport;
+
+};
+
+struct serversession {
+
+	/* Server specific options */
+	int childpipe; /* kept open until we successfully authenticate */
+	/* userauth */
+
+	struct ChildPid * childpids; /* array of mappings childpid<->channel */
+	unsigned int childpidsize;
+
+	/* Used to avoid a race in the exit returncode handling - see
+	 * svr-chansession.c for details */
+	struct exitinfo lastexit;
+
+	/* The numeric address they connected from, used for logging */
+	char * addrstring;
+
+};
+
+typedef enum {
+	KEX_NOTHING,
+	KEXINIT_RCVD,
+	KEXDH_INIT_SENT,
+	KEXDONE,
+
+} cli_kex_state;
+
+typedef enum {
+	STATE_NOTHING,
+	SERVICE_AUTH_REQ_SENT,
+	SERVICE_AUTH_ACCEPT_RCVD,
+	SERVICE_CONN_REQ_SENT,
+	SERVICE_CONN_ACCEPT_RCVD,
+	USERAUTH_REQ_SENT,
+	USERAUTH_FAIL_RCVD,
+	USERAUTH_SUCCESS_RCVD,
+	SESSION_RUNNING,
+
+} cli_state;
+
+struct clientsession {
+
+	mp_int *dh_e, *dh_x; /* Used during KEX */
+	cli_kex_state kex_state; /* Used for progressing KEX */
+	cli_state state; /* Used to progress auth/channelsession etc */
+	unsigned donefirstkex : 1; /* Set when we set sentnewkeys, never reset */
+
+	int tty_raw_mode; /* Whether we're in raw mode (and have to clean up) */
+	struct termios saved_tio;
+	int stdincopy;
+	int stdinflags;
+	int stdoutcopy;
+	int stdoutflags;
+	int stderrcopy;
+	int stderrflags;
+
+	int winchange; /* Set to 1 when a windowchange signal happens */
+
+	int lastauthtype; /* either AUTH_TYPE_PUBKEY or AUTH_TYPE_PASSWORD,
+						 for the last type of auth we tried */
+#ifdef ENABLE_CLI_INTERACT_AUTH
+	int auth_interact_failed; /* flag whether interactive auth can still
+								 be used */
+	int interact_request_received; /* flag whether we've received an 
+									  info request from the server for
+									  interactive auth.*/
+#endif
+	struct SignKeyList *lastprivkey;
+
+	int retval; /* What the command exit status was - we emulate it */
+#if 0
+	TODO
+	struct AgentkeyList *agentkeys; /* Keys to use for public-key auth */
+#endif
+
+};
+
+/* Global structs storing the state */
+extern struct sshsession ses;
+
+#ifdef DROPBEAR_SERVER
+extern struct serversession svr_ses;
+#endif /* DROPBEAR_SERVER */
+
+#ifdef DROPBEAR_CLIENT
+extern struct clientsession cli_ses;
+#endif /* DROPBEAR_CLIENT */
+
+#endif /* _SESSION_H_ */
diff --git a/signkey.c b/signkey.c
new file mode 100644
index 0000000..b8afb58
--- /dev/null
+++ b/signkey.c
@@ -0,0 +1,484 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "dbutil.h"
+#include "signkey.h"
+#include "buffer.h"
+#include "ssh.h"
+
+/* malloc a new sign_key and set the dss and rsa keys to NULL */
+sign_key * new_sign_key() {
+
+	sign_key * ret;
+
+	ret = (sign_key*)m_malloc(sizeof(sign_key));
+#ifdef DROPBEAR_DSS
+	ret->dsskey = NULL;
+#endif
+#ifdef DROPBEAR_RSA
+	ret->rsakey = NULL;
+#endif
+	return ret;
+
+}
+
+/* Returns "ssh-dss" or "ssh-rsa" corresponding to the type. Exits fatally
+ * if the type is invalid */
+const char* signkey_name_from_type(int type, int *namelen) {
+
+#ifdef DROPBEAR_RSA
+	if (type == DROPBEAR_SIGNKEY_RSA) {
+		*namelen = SSH_SIGNKEY_RSA_LEN;
+		return SSH_SIGNKEY_RSA;
+	}
+#endif
+#ifdef DROPBEAR_DSS
+	if (type == DROPBEAR_SIGNKEY_DSS) {
+		*namelen = SSH_SIGNKEY_DSS_LEN;
+		return SSH_SIGNKEY_DSS;
+	}
+#endif
+	dropbear_exit("bad key type %d", type);
+	return NULL; /* notreached */
+}
+
+/* Returns DROPBEAR_SIGNKEY_RSA, DROPBEAR_SIGNKEY_DSS, 
+ * or DROPBEAR_SIGNKEY_NONE */
+int signkey_type_from_name(const char* name, int namelen) {
+
+#ifdef DROPBEAR_RSA
+	if (namelen == SSH_SIGNKEY_RSA_LEN
+			&& memcmp(name, SSH_SIGNKEY_RSA, SSH_SIGNKEY_RSA_LEN) == 0) {
+		return DROPBEAR_SIGNKEY_RSA;
+	}
+#endif
+#ifdef DROPBEAR_DSS
+	if (namelen == SSH_SIGNKEY_DSS_LEN
+			&& memcmp(name, SSH_SIGNKEY_DSS, SSH_SIGNKEY_DSS_LEN) == 0) {
+		return DROPBEAR_SIGNKEY_DSS;
+	}
+#endif
+
+	return DROPBEAR_SIGNKEY_NONE;
+}
+
+/* returns DROPBEAR_SUCCESS on success, DROPBEAR_FAILURE on fail.
+ * type should be set by the caller to specify the type to read, and
+ * on return is set to the type read (useful when type = _ANY) */
+int buf_get_pub_key(buffer *buf, sign_key *key, int *type) {
+
+	unsigned char* ident;
+	unsigned int len;
+	int keytype;
+	int ret = DROPBEAR_FAILURE;
+
+	TRACE(("enter buf_get_pub_key"))
+
+	ident = buf_getstring(buf, &len);
+	keytype = signkey_type_from_name(ident, len);
+	m_free(ident);
+
+	if (*type != DROPBEAR_SIGNKEY_ANY && *type != keytype) {
+		return DROPBEAR_FAILURE;
+	}
+
+	*type = keytype;
+
+	/* Rewind the buffer back before "ssh-rsa" etc */
+	buf_incrpos(buf, -len - 4);
+
+#ifdef DROPBEAR_DSS
+	if (keytype == DROPBEAR_SIGNKEY_DSS) {
+		dss_key_free(key->dsskey);
+		key->dsskey = (dss_key*)m_malloc(sizeof(dss_key));
+		ret = buf_get_dss_pub_key(buf, key->dsskey);
+		if (ret == DROPBEAR_FAILURE) {
+			m_free(key->dsskey);
+		}
+	}
+#endif
+#ifdef DROPBEAR_RSA
+	if (keytype == DROPBEAR_SIGNKEY_RSA) {
+		rsa_key_free(key->rsakey);
+		key->rsakey = (rsa_key*)m_malloc(sizeof(rsa_key));
+		ret = buf_get_rsa_pub_key(buf, key->rsakey);
+		if (ret == DROPBEAR_FAILURE) {
+			m_free(key->rsakey);
+		}
+	}
+#endif
+
+	TRACE(("leave buf_get_pub_key"))
+
+	return ret;
+	
+}
+
+/* returns DROPBEAR_SUCCESS on success, DROPBEAR_FAILURE on fail.
+ * type should be set by the caller to specify the type to read, and
+ * on return is set to the type read (useful when type = _ANY) */
+int buf_get_priv_key(buffer *buf, sign_key *key, int *type) {
+
+	unsigned char* ident;
+	unsigned int len;
+	int keytype;
+	int ret = DROPBEAR_FAILURE;
+
+	TRACE(("enter buf_get_priv_key"))
+
+	ident = buf_getstring(buf, &len);
+	keytype = signkey_type_from_name(ident, len);
+	m_free(ident);
+
+	if (*type != DROPBEAR_SIGNKEY_ANY && *type != keytype) {
+		TRACE(("wrong key type: %d %d", *type, keytype))
+		return DROPBEAR_FAILURE;
+	}
+
+	*type = keytype;
+
+	/* Rewind the buffer back before "ssh-rsa" etc */
+	buf_incrpos(buf, -len - 4);
+
+#ifdef DROPBEAR_DSS
+	if (keytype == DROPBEAR_SIGNKEY_DSS) {
+		dss_key_free(key->dsskey);
+		key->dsskey = (dss_key*)m_malloc(sizeof(dss_key));
+		ret = buf_get_dss_priv_key(buf, key->dsskey);
+		if (ret == DROPBEAR_FAILURE) {
+			m_free(key->dsskey);
+		}
+	}
+#endif
+#ifdef DROPBEAR_RSA
+	if (keytype == DROPBEAR_SIGNKEY_RSA) {
+		rsa_key_free(key->rsakey);
+		key->rsakey = (rsa_key*)m_malloc(sizeof(rsa_key));
+		ret = buf_get_rsa_priv_key(buf, key->rsakey);
+		if (ret == DROPBEAR_FAILURE) {
+			m_free(key->rsakey);
+		}
+	}
+#endif
+
+	TRACE(("leave buf_get_priv_key"))
+
+	return ret;
+	
+}
+
+/* type is either DROPBEAR_SIGNKEY_DSS or DROPBEAR_SIGNKEY_RSA */
+void buf_put_pub_key(buffer* buf, sign_key *key, int type) {
+
+	buffer *pubkeys;
+
+	TRACE(("enter buf_put_pub_key"))
+	pubkeys = buf_new(MAX_PUBKEY_SIZE);
+	
+#ifdef DROPBEAR_DSS
+	if (type == DROPBEAR_SIGNKEY_DSS) {
+		buf_put_dss_pub_key(pubkeys, key->dsskey);
+	}
+#endif
+#ifdef DROPBEAR_RSA
+	if (type == DROPBEAR_SIGNKEY_RSA) {
+		buf_put_rsa_pub_key(pubkeys, key->rsakey);
+	}
+#endif
+	if (pubkeys->len == 0) {
+		dropbear_exit("bad key types in buf_put_pub_key");
+	}
+
+	buf_setpos(pubkeys, 0);
+	buf_putstring(buf, buf_getptr(pubkeys, pubkeys->len),
+			pubkeys->len);
+	
+	buf_free(pubkeys);
+	TRACE(("leave buf_put_pub_key"))
+}
+
+/* type is either DROPBEAR_SIGNKEY_DSS or DROPBEAR_SIGNKEY_RSA */
+void buf_put_priv_key(buffer* buf, sign_key *key, int type) {
+
+	TRACE(("enter buf_put_priv_key"))
+	TRACE(("type is %d", type))
+
+#ifdef DROPBEAR_DSS
+	if (type == DROPBEAR_SIGNKEY_DSS) {
+		buf_put_dss_priv_key(buf, key->dsskey);
+	TRACE(("leave buf_put_priv_key: dss done"))
+	return;
+	}
+#endif
+#ifdef DROPBEAR_RSA
+	if (type == DROPBEAR_SIGNKEY_RSA) {
+		buf_put_rsa_priv_key(buf, key->rsakey);
+	TRACE(("leave buf_put_priv_key: rsa done"))
+	return;
+	}
+#endif
+	dropbear_exit("bad key types in put pub key");
+}
+
+void sign_key_free(sign_key *key) {
+
+	TRACE(("enter sign_key_free"))
+
+#ifdef DROPBEAR_DSS
+	dss_key_free(key->dsskey);
+	key->dsskey = NULL;
+#endif
+#ifdef DROPBEAR_RSA
+	rsa_key_free(key->rsakey);
+	key->rsakey = NULL;
+#endif
+
+	m_free(key);
+	TRACE(("leave sign_key_free"))
+}
+
+static char hexdig(unsigned char x) {
+
+	if (x > 0xf)
+		return 'X';
+
+	if (x < 10)
+		return '0' + x;
+	else
+		return 'a' + x - 10;
+}
+
+/* Since we're not sure if we'll have md5 or sha1, we present both.
+ * MD5 is used in preference, but sha1 could still be useful */
+#ifdef DROPBEAR_MD5_HMAC
+static char * sign_key_md5_fingerprint(unsigned char* keyblob,
+		unsigned int keybloblen) {
+
+	char * ret;
+	hash_state hs;
+	unsigned char hash[MD5_HASH_SIZE];
+	unsigned int i;
+	unsigned int buflen;
+
+	md5_init(&hs);
+
+	/* skip the size int of the string - this is a bit messy */
+	md5_process(&hs, keyblob, keybloblen);
+
+	md5_done(&hs, hash);
+
+	/* "md5 hexfingerprinthere\0", each hex digit is "AB:" etc */
+	buflen = 4 + 3*MD5_HASH_SIZE;
+	ret = (char*)m_malloc(buflen);
+
+	memset(ret, 'Z', buflen);
+	strcpy(ret, "md5 ");
+
+	for (i = 0; i < MD5_HASH_SIZE; i++) {
+		unsigned int pos = 4 + i*3;
+		ret[pos] = hexdig(hash[i] >> 4);
+		ret[pos+1] = hexdig(hash[i] & 0x0f);
+		ret[pos+2] = ':';
+	}
+	ret[buflen-1] = 0x0;
+
+	return ret;
+}
+
+#else /* use SHA1 rather than MD5 for fingerprint */
+static char * sign_key_sha1_fingerprint(unsigned char* keyblob, 
+		unsigned int keybloblen) {
+
+	char * ret;
+	hash_state hs;
+	unsigned char hash[SHA1_HASH_SIZE];
+	unsigned int i;
+	unsigned int buflen;
+
+	sha1_init(&hs);
+
+	/* skip the size int of the string - this is a bit messy */
+	sha1_process(&hs, keyblob, keybloblen);
+
+	sha1_done(&hs, hash);
+
+	/* "sha1 hexfingerprinthere\0", each hex digit is "AB:" etc */
+	buflen = 5 + 3*SHA1_HASH_SIZE;
+	ret = (char*)m_malloc(buflen);
+
+	strcpy(ret, "sha1 ");
+
+	for (i = 0; i < SHA1_HASH_SIZE; i++) {
+		unsigned int pos = 5 + 3*i;
+		ret[pos] = hexdig(hash[i] >> 4);
+		ret[pos+1] = hexdig(hash[i] & 0x0f);
+		ret[pos+2] = ':';
+	}
+	ret[buflen-1] = 0x0;
+
+	return ret;
+}
+
+#endif /* MD5/SHA1 switch */
+
+/* This will return a freshly malloced string, containing a fingerprint
+ * in either sha1 or md5 */
+char * sign_key_fingerprint(unsigned char* keyblob, unsigned int keybloblen) {
+
+#ifdef DROPBEAR_MD5_HMAC
+	return sign_key_md5_fingerprint(keyblob, keybloblen);
+#else
+	return sign_key_sha1_fingerprint(keyblob, keybloblen);
+#endif
+}
+
+void buf_put_sign(buffer* buf, sign_key *key, int type, 
+		const unsigned char *data, unsigned int len) {
+
+	buffer *sigblob;
+
+	sigblob = buf_new(MAX_PUBKEY_SIZE);
+
+#ifdef DROPBEAR_DSS
+	if (type == DROPBEAR_SIGNKEY_DSS) {
+		buf_put_dss_sign(sigblob, key->dsskey, data, len);
+	}
+#endif
+#ifdef DROPBEAR_RSA
+	if (type == DROPBEAR_SIGNKEY_RSA) {
+		buf_put_rsa_sign(sigblob, key->rsakey, data, len);
+	}
+#endif
+	if (sigblob->len == 0) {
+		dropbear_exit("non-matching signing type");
+	}
+
+	buf_setpos(sigblob, 0);
+	buf_putstring(buf, buf_getptr(sigblob, sigblob->len),
+			sigblob->len);
+			
+	buf_free(sigblob);
+
+}
+
+#ifdef DROPBEAR_SIGNKEY_VERIFY
+/* Return DROPBEAR_SUCCESS or DROPBEAR_FAILURE.
+ * If FAILURE is returned, the position of
+ * buf is undefined. If SUCCESS is returned, buf will be positioned after the
+ * signature blob */
+int buf_verify(buffer * buf, sign_key *key, const unsigned char *data,
+		unsigned int len) {
+	
+	unsigned int bloblen;
+	unsigned char * ident = NULL;
+	unsigned int identlen = 0;
+
+	TRACE(("enter buf_verify"))
+
+	bloblen = buf_getint(buf);
+	ident = buf_getstring(buf, &identlen);
+
+#ifdef DROPBEAR_DSS
+	if (bloblen == DSS_SIGNATURE_SIZE &&
+			memcmp(ident, SSH_SIGNKEY_DSS, identlen) == 0) {
+		m_free(ident);
+		if (key->dsskey == NULL) {
+			dropbear_exit("no dss key to verify signature");
+		}
+		return buf_dss_verify(buf, key->dsskey, data, len);
+	}
+#endif
+
+#ifdef DROPBEAR_RSA
+	if (memcmp(ident, SSH_SIGNKEY_RSA, identlen) == 0) {
+		m_free(ident);
+		if (key->rsakey == NULL) {
+			dropbear_exit("no rsa key to verify signature");
+		}
+		return buf_rsa_verify(buf, key->rsakey, data, len);
+	}
+#endif
+
+	m_free(ident);
+	dropbear_exit("non-matching signing type");
+	return DROPBEAR_FAILURE;
+}
+#endif /* DROPBEAR_SIGNKEY_VERIFY */
+
+#ifdef DROPBEAR_KEY_LINES /* ie we're using authorized_keys or known_hosts */
+
+/* Returns DROPBEAR_SUCCESS or DROPBEAR_FAILURE when given a buffer containing
+ * a key, a key, and a type. The buffer is positioned at the start of the
+ * base64 data, and contains no trailing data */
+int cmp_base64_key(const unsigned char* keyblob, unsigned int keybloblen, 
+					const unsigned char* algoname, unsigned int algolen, 
+					buffer * line) {
+
+	buffer * decodekey = NULL;
+	int ret = DROPBEAR_FAILURE;
+	unsigned int len, filealgolen;
+	unsigned long decodekeylen;
+	unsigned char* filealgo = NULL;
+
+	/* now we have the actual data */
+	len = line->len - line->pos;
+	decodekeylen = len * 2; /* big to be safe */
+	decodekey = buf_new(decodekeylen);
+
+	if (base64_decode(buf_getptr(line, len), len,
+				buf_getwriteptr(decodekey, decodekey->size),
+				&decodekeylen) != CRYPT_OK) {
+		TRACE(("checkpubkey: base64 decode failed"))
+		goto out;
+	}
+	TRACE(("checkpubkey: base64_decode success"))
+	buf_incrlen(decodekey, decodekeylen);
+	
+	/* compare the keys */
+	if ( ( decodekeylen != keybloblen )
+			|| memcmp( buf_getptr(decodekey, decodekey->len),
+						keyblob, decodekey->len) != 0) {
+		TRACE(("checkpubkey: compare failed"))
+		goto out;
+	}
+
+	/* ... and also check that the algo specified and the algo in the key
+	 * itself match */
+	filealgolen = buf_getint(decodekey);
+	filealgo = buf_getptr(decodekey, filealgolen);
+	if (filealgolen != algolen || memcmp(filealgo, algoname, algolen) != 0) {
+		TRACE(("checkpubkey: algo match failed")) 
+		goto out;
+	}
+
+	/* All checks passed */
+	ret = DROPBEAR_SUCCESS;
+
+out:
+	buf_free(decodekey);
+	decodekey = NULL;
+	return ret;
+}
+#endif
diff --git a/signkey.h b/signkey.h
new file mode 100644
index 0000000..8bc7e8f
--- /dev/null
+++ b/signkey.h
@@ -0,0 +1,63 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _SIGNKEY_H_
+#define _SIGNKEY_H_
+
+#include "buffer.h"
+#include "dss.h"
+#include "rsa.h"
+
+struct SIGN_key {
+
+#ifdef DROPBEAR_DSS
+	dss_key * dsskey;
+#endif
+#ifdef DROPBEAR_RSA
+	rsa_key * rsakey;
+#endif
+};
+
+typedef struct SIGN_key sign_key;
+
+sign_key * new_sign_key();
+const char* signkey_name_from_type(int type, int *namelen);
+int signkey_type_from_name(const char* name, int namelen);
+int buf_get_pub_key(buffer *buf, sign_key *key, int *type);
+int buf_get_priv_key(buffer* buf, sign_key *key, int *type);
+void buf_put_pub_key(buffer* buf, sign_key *key, int type);
+void buf_put_priv_key(buffer* buf, sign_key *key, int type);
+void sign_key_free(sign_key *key);
+void buf_put_sign(buffer* buf, sign_key *key, int type, 
+		const unsigned char *data, unsigned int len);
+#ifdef DROPBEAR_SIGNKEY_VERIFY
+int buf_verify(buffer * buf, sign_key *key, const unsigned char *data,
+		unsigned int len);
+char * sign_key_fingerprint(unsigned char* keyblob, unsigned int keybloblen);
+#endif
+int cmp_base64_key(const unsigned char* keyblob, unsigned int keybloblen, 
+					const unsigned char* algoname, unsigned int algolen, 
+					buffer * line);
+
+#endif /* _SIGNKEY_H_ */
diff --git a/ssh.h b/ssh.h
new file mode 100644
index 0000000..26b51bf
--- /dev/null
+++ b/ssh.h
@@ -0,0 +1,107 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+/* This file contains the various numbers in the protocol */
+
+
+/* message numbers */
+#define SSH_MSG_DISCONNECT             1
+#define SSH_MSG_IGNORE                 2
+#define SSH_MSG_UNIMPLEMENTED          3
+#define SSH_MSG_DEBUG                  4
+#define SSH_MSG_SERVICE_REQUEST        5
+#define SSH_MSG_SERVICE_ACCEPT         6
+#define SSH_MSG_KEXINIT                20
+#define SSH_MSG_NEWKEYS                21
+#define SSH_MSG_KEXDH_INIT             30
+#define SSH_MSG_KEXDH_REPLY            31
+
+/* userauth message numbers */
+#define SSH_MSG_USERAUTH_REQUEST            50
+#define SSH_MSG_USERAUTH_FAILURE            51
+#define SSH_MSG_USERAUTH_SUCCESS            52
+#define SSH_MSG_USERAUTH_BANNER             53
+
+/* packets 60-79 are method-specific, aren't one-one mapping */
+#define SSH_MSG_USERAUTH_SPECIFIC_60   60
+
+#define SSH_MSG_USERAUTH_PASSWD_CHANGEREQ   60
+
+#define SSH_MSG_USERAUTH_PK_OK				60
+
+/* keyboard interactive auth */
+#define SSH_MSG_USERAUTH_INFO_REQUEST           60
+#define SSH_MSG_USERAUTH_INFO_RESPONSE          61
+
+
+/* If adding numbers here, check MAX_UNAUTH_PACKET_TYPE in process-packet.c
+ * is still valid */
+
+/* connect message numbers */
+#define SSH_MSG_GLOBAL_REQUEST                  80
+#define SSH_MSG_REQUEST_SUCCESS                 81
+#define SSH_MSG_REQUEST_FAILURE                 82
+#define SSH_MSG_CHANNEL_OPEN                    90
+#define SSH_MSG_CHANNEL_OPEN_CONFIRMATION       91 
+#define SSH_MSG_CHANNEL_OPEN_FAILURE            92
+#define SSH_MSG_CHANNEL_WINDOW_ADJUST           93
+#define SSH_MSG_CHANNEL_DATA                    94
+#define SSH_MSG_CHANNEL_EXTENDED_DATA           95
+#define SSH_MSG_CHANNEL_EOF                     96
+#define SSH_MSG_CHANNEL_CLOSE                   97
+#define SSH_MSG_CHANNEL_REQUEST                 98
+#define SSH_MSG_CHANNEL_SUCCESS                 99
+#define SSH_MSG_CHANNEL_FAILURE                 100
+
+/* extended data types */
+#define SSH_EXTENDED_DATA_STDERR	1
+
+/* disconnect codes */
+#define SSH_DISCONNECT_HOST_NOT_ALLOWED_TO_CONNECT      1
+#define SSH_DISCONNECT_PROTOCOL_ERROR                   2
+#define SSH_DISCONNECT_KEY_EXCHANGE_FAILED              3
+#define SSH_DISCONNECT_RESERVED                         4
+#define SSH_DISCONNECT_MAC_ERROR                        5
+#define SSH_DISCONNECT_COMPRESSION_ERROR                6
+#define SSH_DISCONNECT_SERVICE_NOT_AVAILABLE            7
+#define SSH_DISCONNECT_PROTOCOL_VERSION_NOT_SUPPORTED   8
+#define SSH_DISCONNECT_HOST_KEY_NOT_VERIFIABLE          9
+#define SSH_DISCONNECT_CONNECTION_LOST                 10
+#define SSH_DISCONNECT_BY_APPLICATION                  11
+#define SSH_DISCONNECT_TOO_MANY_CONNECTIONS            12
+#define SSH_DISCONNECT_AUTH_CANCELLED_BY_USER          13
+#define SSH_DISCONNECT_NO_MORE_AUTH_METHODS_AVAILABLE  14
+#define SSH_DISCONNECT_ILLEGAL_USER_NAME               15
+
+/* service types */
+#define SSH_SERVICE_USERAUTH "ssh-userauth"
+#define SSH_SERVICE_USERAUTH_LEN 12
+#define SSH_SERVICE_CONNECTION "ssh-connection"
+#define SSH_SERVICE_CONNECTION_LEN 14
+
+/* public key types */
+#define SSH_SIGNKEY_DSS "ssh-dss"
+#define SSH_SIGNKEY_DSS_LEN 7
+#define SSH_SIGNKEY_RSA "ssh-rsa"
+#define SSH_SIGNKEY_RSA_LEN 7
diff --git a/sshpty.c b/sshpty.c
new file mode 100644
index 0000000..3526ff0
--- /dev/null
+++ b/sshpty.c
@@ -0,0 +1,412 @@
+/*
+ * Dropbear - a SSH2 server
+ *
+ * Copied from OpenSSH-3.5p1 source, modified by Matt Johnston 2003
+ * 
+ * Author: Tatu Ylonen <ylo@cs.hut.fi>
+ * Copyright (c) 1995 Tatu Ylonen <ylo@cs.hut.fi>, Espoo, Finland
+ *                    All rights reserved
+ * Allocating a pseudo-terminal, and making it the controlling tty.
+ *
+ * As far as I am concerned, the code I have written for this software
+ * can be used freely for any purpose.  Any derived versions of this
+ * software must be clearly marked as such, and if the derived work is
+ * incompatible with the protocol description in the RFC file, it must be
+ * called by a name other than "ssh" or "Secure Shell".
+ */
+
+/*RCSID("OpenBSD: sshpty.c,v 1.7 2002/06/24 17:57:20 deraadt Exp ");*/
+
+#include "includes.h"
+#include "dbutil.h"
+#include "errno.h"
+#include "sshpty.h"
+
+/* Pty allocated with _getpty gets broken if we do I_PUSH:es to it. */
+#if defined(HAVE__GETPTY) || defined(HAVE_OPENPTY)
+#undef HAVE_DEV_PTMX
+#endif
+
+#ifdef HAVE_PTY_H
+# include <pty.h>
+#endif
+#if defined(USE_DEV_PTMX) && defined(HAVE_STROPTS_H)
+# include <stropts.h>
+#endif
+
+#ifndef O_NOCTTY
+#define O_NOCTTY 0
+#endif
+
+/*
+ * Allocates and opens a pty.  Returns 0 if no pty could be allocated, or
+ * nonzero if a pty was successfully allocated.  On success, open file
+ * descriptors for the pty and tty sides and the name of the tty side are
+ * returned (the buffer must be able to hold at least 64 characters).
+ */
+
+int
+pty_allocate(int *ptyfd, int *ttyfd, char *namebuf, int namebuflen)
+{
+#if defined(HAVE_OPENPTY)
+	/* exists in recent (4.4) BSDs and OSF/1 */
+	char *name;
+	int i;
+
+	i = openpty(ptyfd, ttyfd, NULL, NULL, NULL);
+	if (i < 0) {
+		dropbear_log(LOG_WARNING, 
+				"pty_allocate: openpty: %.100s", strerror(errno));
+		return 0;
+	}
+	name = ttyname(*ttyfd);
+	if (!name) {
+		dropbear_exit("ttyname fails for openpty device");
+	}
+
+	strlcpy(namebuf, name, namebuflen);	/* possible truncation */
+	return 1;
+#else /* HAVE_OPENPTY */
+#ifdef HAVE__GETPTY
+	/*
+	 * _getpty(3) exists in SGI Irix 4.x, 5.x & 6.x -- it generates more
+	 * pty's automagically when needed
+	 */
+	char *slave;
+
+	slave = _getpty(ptyfd, O_RDWR, 0622, 0);
+	if (slave == NULL) {
+		dropbear_log(LOG_WARNING,
+				"pty_allocate: _getpty: %.100s", strerror(errno));
+		return 0;
+	}
+	strlcpy(namebuf, slave, namebuflen);
+	/* Open the slave side. */
+	*ttyfd = open(namebuf, O_RDWR | O_NOCTTY);
+	if (*ttyfd < 0) {
+		dropbear_log(LOG_WARNING,
+				"pty_allocate error: ttyftd open error");
+		close(*ptyfd);
+		return 0;
+	}
+	return 1;
+#else /* HAVE__GETPTY */
+#if defined(USE_DEV_PTMX)
+	/*
+	 * This code is used e.g. on Solaris 2.x.  (Note that Solaris 2.3
+	 * also has bsd-style ptys, but they simply do not work.)
+	 *
+	 * Linux systems may have the /dev/ptmx device, but this code won't work.
+	 */
+	int ptm;
+	char *pts;
+
+	ptm = open("/dev/ptmx", O_RDWR | O_NOCTTY);
+	if (ptm < 0) {
+		dropbear_log(LOG_WARNING,
+				"pty_allocate: /dev/ptmx: %.100s", strerror(errno));
+		return 0;
+	}
+	if (grantpt(ptm) < 0) {
+		dropbear_log(LOG_WARNING,
+				"grantpt: %.100s", strerror(errno));
+		return 0;
+	}
+	if (unlockpt(ptm) < 0) {
+		dropbear_log(LOG_WARNING,
+				"unlockpt: %.100s", strerror(errno));
+		return 0;
+	}
+	pts = ptsname(ptm);
+	if (pts == NULL) {
+		dropbear_log(LOG_WARNING,
+				"Slave pty side name could not be obtained.");
+	}
+	strlcpy(namebuf, pts, namebuflen);
+	*ptyfd = ptm;
+
+	/* Open the slave side. */
+	*ttyfd = open(namebuf, O_RDWR | O_NOCTTY);
+	if (*ttyfd < 0) {
+		dropbear_log(LOG_ERR,
+			"error opening pts %.100s: %.100s", namebuf, strerror(errno));
+		close(*ptyfd);
+		return 0;
+	}
+#ifndef HAVE_CYGWIN
+	/*
+	 * Push the appropriate streams modules, as described in Solaris pts(7).
+	 * HP-UX pts(7) doesn't have ttcompat module.
+	 */
+	if (ioctl(*ttyfd, I_PUSH, "ptem") < 0) {
+		dropbear_log(LOG_WARNING,
+				"ioctl I_PUSH ptem: %.100s", strerror(errno));
+	}
+	if (ioctl(*ttyfd, I_PUSH, "ldterm") < 0) {
+		dropbear_log(LOG_WARNING,
+			"ioctl I_PUSH ldterm: %.100s", strerror(errno));
+	}
+#ifndef __hpux
+	if (ioctl(*ttyfd, I_PUSH, "ttcompat") < 0) {
+		dropbear_log(LOG_WARNING,
+			"ioctl I_PUSH ttcompat: %.100s", strerror(errno));
+	}
+#endif
+#endif
+	return 1;
+#else /* USE_DEV_PTMX */
+#ifdef HAVE_DEV_PTS_AND_PTC
+	/* AIX-style pty code. */
+	const char *name;
+
+	*ptyfd = open("/dev/ptc", O_RDWR | O_NOCTTY);
+	if (*ptyfd < 0) {
+		dropbear_log(LOG_ERR,
+			"Could not open /dev/ptc: %.100s", strerror(errno));
+		return 0;
+	}
+	name = ttyname(*ptyfd);
+	if (!name) {
+		dropbear_exit("ttyname fails for /dev/ptc device");
+	}
+	strlcpy(namebuf, name, namebuflen);
+	*ttyfd = open(name, O_RDWR | O_NOCTTY);
+	if (*ttyfd < 0) {
+		dropbear_log(LOG_ERR,
+			"Could not open pty slave side %.100s: %.100s",
+		    name, strerror(errno));
+		close(*ptyfd);
+		return 0;
+	}
+	return 1;
+#else /* HAVE_DEV_PTS_AND_PTC */
+
+	/* BSD-style pty code. */
+	char buf[64];
+	int i;
+	const char *ptymajors = "pqrstuvwxyzabcdefghijklmnoABCDEFGHIJKLMNOPQRSTUVWXYZ";
+	const char *ptyminors = "0123456789abcdef";
+	int num_minors = strlen(ptyminors);
+	int num_ptys = strlen(ptymajors) * num_minors;
+	struct termios tio;
+
+	for (i = 0; i < num_ptys; i++) {
+		snprintf(buf, sizeof buf, "/dev/pty%c%c", ptymajors[i / num_minors],
+			 ptyminors[i % num_minors]);
+		snprintf(namebuf, namebuflen, "/dev/tty%c%c",
+		    ptymajors[i / num_minors], ptyminors[i % num_minors]);
+
+		*ptyfd = open(buf, O_RDWR | O_NOCTTY);
+		if (*ptyfd < 0) {
+			/* Try SCO style naming */
+			snprintf(buf, sizeof buf, "/dev/ptyp%d", i);
+			snprintf(namebuf, namebuflen, "/dev/ttyp%d", i);
+			*ptyfd = open(buf, O_RDWR | O_NOCTTY);
+			if (*ptyfd < 0) {
+				continue;
+			}
+		}
+
+		/* Open the slave side. */
+		*ttyfd = open(namebuf, O_RDWR | O_NOCTTY);
+		if (*ttyfd < 0) {
+			dropbear_log(LOG_ERR,
+				"pty_allocate: %.100s: %.100s", namebuf, strerror(errno));
+			close(*ptyfd);
+			return 0;
+		}
+		/* set tty modes to a sane state for broken clients */
+		if (tcgetattr(*ptyfd, &tio) < 0) {
+			dropbear_log(LOG_WARNING,
+				"ptyallocate: tty modes failed: %.100s", strerror(errno));
+		} else {
+			tio.c_lflag |= (ECHO | ISIG | ICANON);
+			tio.c_oflag |= (OPOST | ONLCR);
+			tio.c_iflag |= ICRNL;
+
+			/* Set the new modes for the terminal. */
+			if (tcsetattr(*ptyfd, TCSANOW, &tio) < 0) {
+				dropbear_log(LOG_WARNING,
+					"Setting tty modes for pty failed: %.100s",
+					strerror(errno));
+			}
+		}
+
+		return 1;
+	}
+	dropbear_log(LOG_WARNING, "failed to open any /dev/pty?? devices");
+	return 0;
+#endif /* HAVE_DEV_PTS_AND_PTC */
+#endif /* USE_DEV_PTMX */
+#endif /* HAVE__GETPTY */
+#endif /* HAVE_OPENPTY */
+}
+
+/* Releases the tty.  Its ownership is returned to root, and permissions to 0666. */
+
+void
+pty_release(const char *tty_name)
+{
+	if (chown(tty_name, (uid_t) 0, (gid_t) 0) < 0
+			&& (errno != ENOENT)) {
+		dropbear_log(LOG_ERR,
+				"chown %.100s 0 0 failed: %.100s", tty_name, strerror(errno));
+	}
+	if (chmod(tty_name, (mode_t) 0666) < 0
+			&& (errno != ENOENT)) {
+		dropbear_log(LOG_ERR,
+			"chmod %.100s 0666 failed: %.100s", tty_name, strerror(errno));
+	}
+}
+
+/* Makes the tty the processes controlling tty and sets it to sane modes. */
+
+void
+pty_make_controlling_tty(int *ttyfd, const char *tty_name)
+{
+	int fd;
+#ifdef USE_VHANGUP
+	void *old;
+#endif /* USE_VHANGUP */
+
+	/* Solaris has a problem with TIOCNOTTY for a bg process, so
+	 * we disable the signal which would STOP the process - matt */
+	signal(SIGTTOU, SIG_IGN);
+
+	/* First disconnect from the old controlling tty. */
+#ifdef TIOCNOTTY
+	fd = open(_PATH_TTY, O_RDWR | O_NOCTTY);
+	if (fd >= 0) {
+		(void) ioctl(fd, TIOCNOTTY, NULL);
+		close(fd);
+	}
+#endif /* TIOCNOTTY */
+	if (setsid() < 0) {
+		dropbear_log(LOG_ERR,
+			"setsid: %.100s", strerror(errno));
+	}
+
+	/*
+	 * Verify that we are successfully disconnected from the controlling
+	 * tty.
+	 */
+	fd = open(_PATH_TTY, O_RDWR | O_NOCTTY);
+	if (fd >= 0) {
+		dropbear_log(LOG_ERR,
+				"Failed to disconnect from controlling tty.\n");
+		close(fd);
+	}
+	/* Make it our controlling tty. */
+#ifdef TIOCSCTTY
+	if (ioctl(*ttyfd, TIOCSCTTY, NULL) < 0) {
+		dropbear_log(LOG_ERR,
+			"ioctl(TIOCSCTTY): %.100s", strerror(errno));
+	}
+#endif /* TIOCSCTTY */
+#ifdef HAVE_NEWS4
+	if (setpgrp(0,0) < 0) {
+		dropbear_log(LOG_ERR,
+			error("SETPGRP %s",strerror(errno)));
+	}
+#endif /* HAVE_NEWS4 */
+#ifdef USE_VHANGUP
+	old = mysignal(SIGHUP, SIG_IGN);
+	vhangup();
+	mysignal(SIGHUP, old);
+#endif /* USE_VHANGUP */
+	fd = open(tty_name, O_RDWR);
+	if (fd < 0) {
+		dropbear_log(LOG_ERR,
+			"%.100s: %.100s", tty_name, strerror(errno));
+	} else {
+#ifdef USE_VHANGUP
+		close(*ttyfd);
+		*ttyfd = fd;
+#else /* USE_VHANGUP */
+		close(fd);
+#endif /* USE_VHANGUP */
+	}
+	/* Verify that we now have a controlling tty. */
+	fd = open(_PATH_TTY, O_WRONLY);
+	if (fd < 0) {
+		dropbear_log(LOG_ERR,
+			"open /dev/tty failed - could not set controlling tty: %.100s",
+		    strerror(errno));
+	} else {
+		close(fd);
+	}
+}
+
+/* Changes the window size associated with the pty. */
+
+void
+pty_change_window_size(int ptyfd, int row, int col,
+	int xpixel, int ypixel)
+{
+	struct winsize w;
+
+	w.ws_row = row;
+	w.ws_col = col;
+	w.ws_xpixel = xpixel;
+	w.ws_ypixel = ypixel;
+	(void) ioctl(ptyfd, TIOCSWINSZ, &w);
+}
+
+void
+pty_setowner(struct passwd *pw, const char *tty_name)
+{
+	struct group *grp;
+	gid_t gid;
+	mode_t mode;
+	struct stat st;
+
+	/* Determine the group to make the owner of the tty. */
+	grp = getgrnam("tty");
+	if (grp) {
+		gid = grp->gr_gid;
+		mode = S_IRUSR | S_IWUSR | S_IWGRP;
+	} else {
+		gid = pw->pw_gid;
+		mode = S_IRUSR | S_IWUSR | S_IWGRP | S_IWOTH;
+	}
+
+	/*
+	 * Change owner and mode of the tty as required.
+	 * Warn but continue if filesystem is read-only and the uids match/
+	 * tty is owned by root.
+	 */
+	if (stat(tty_name, &st)) {
+		dropbear_exit("pty_setowner: stat(%.101s) failed: %.100s",
+				tty_name, strerror(errno));
+	}
+
+	if (st.st_uid != pw->pw_uid || st.st_gid != gid) {
+		if (chown(tty_name, pw->pw_uid, gid) < 0) {
+			if (errno == EROFS &&
+			    (st.st_uid == pw->pw_uid || st.st_uid == 0)) {
+				dropbear_log(LOG_ERR,
+					"chown(%.100s, %u, %u) failed: %.100s",
+						tty_name, (unsigned int)pw->pw_uid, (unsigned int)gid,
+						strerror(errno));
+			} else {
+				dropbear_exit("chown(%.100s, %u, %u) failed: %.100s",
+				    tty_name, (unsigned int)pw->pw_uid, (unsigned int)gid,
+				    strerror(errno));
+			}
+		}
+	}
+
+	if ((st.st_mode & (S_IRWXU|S_IRWXG|S_IRWXO)) != mode) {
+		if (chmod(tty_name, mode) < 0) {
+			if (errno == EROFS &&
+			    (st.st_mode & (S_IRGRP | S_IROTH)) == 0) {
+				dropbear_log(LOG_ERR,
+					"chmod(%.100s, 0%o) failed: %.100s",
+					tty_name, mode, strerror(errno));
+			} else {
+				dropbear_exit("chmod(%.100s, 0%o) failed: %.100s",
+				    tty_name, mode, strerror(errno));
+			}
+		}
+	}
+}
diff --git a/sshpty.h b/sshpty.h
new file mode 100644
index 0000000..cf72072
--- /dev/null
+++ b/sshpty.h
@@ -0,0 +1,28 @@
+/*	$OpenBSD: sshpty.h,v 1.4 2002/03/04 17:27:39 stevesk Exp $	*/
+
+/*
+ * Copied from openssh-3.5p1 source by Matt Johnston 2003
+ *
+ * Author: Tatu Ylonen <ylo@cs.hut.fi>
+ * Copyright (c) 1995 Tatu Ylonen <ylo@cs.hut.fi>, Espoo, Finland
+ *                    All rights reserved
+ * Functions for allocating a pseudo-terminal and making it the controlling
+ * tty.
+ *
+ * As far as I am concerned, the code I have written for this software
+ * can be used freely for any purpose.  Any derived versions of this
+ * software must be clearly marked as such, and if the derived work is
+ * incompatible with the protocol description in the RFC file, it must be
+ * called by a name other than "ssh" or "Secure Shell".
+ */
+
+#ifndef SSHPTY_H
+#define SSHPTY_H
+
+int	 pty_allocate(int *, int *, char *, int);
+void	 pty_release(const char *);
+void	 pty_make_controlling_tty(int *, const char *);
+void	 pty_change_window_size(int, int, int, int, int);
+void	 pty_setowner(struct passwd *, const char *);
+
+#endif /* SSHPTY_H */
diff --git a/svr-agentfwd.c b/svr-agentfwd.c
new file mode 100644
index 0000000..5127158
--- /dev/null
+++ b/svr-agentfwd.c
@@ -0,0 +1,266 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+/* This file (agentfwd.c) handles authentication agent forwarding, for OpenSSH
+ * style agents. */
+
+#include "includes.h"
+
+#ifndef DISABLE_AGENTFWD
+
+#include "agentfwd.h"
+#include "session.h"
+#include "ssh.h"
+#include "dbutil.h"
+#include "chansession.h"
+#include "channel.h"
+#include "packet.h"
+#include "buffer.h"
+#include "random.h"
+#include "listener.h"
+
+#define AGENTDIRPREFIX "/tmp/dropbear-"
+
+static int send_msg_channel_open_agent(int fd);
+static int bindagent(int fd, struct ChanSess * chansess);
+static void agentaccept(struct Listener * listener, int sock);
+
+/* Handles client requests to start agent forwarding, sets up listening socket.
+ * Returns DROPBEAR_SUCCESS or DROPBEAR_FAILURE */
+int agentreq(struct ChanSess * chansess) {
+
+	int fd;
+
+	if (chansess->agentlistener != NULL) {
+		return DROPBEAR_FAILURE;
+	}
+
+	/* create listening socket */
+	fd = socket(PF_UNIX, SOCK_STREAM, 0);
+	if (fd < 0) {
+		goto fail;
+	}
+
+	/* create the unix socket dir and file */
+	if (bindagent(fd, chansess) == DROPBEAR_FAILURE) {
+		goto fail;
+	}
+
+	/* listen */
+	if (listen(fd, 20) < 0) {
+		goto fail;
+	}
+
+	/* set non-blocking */
+	setnonblocking(fd);
+
+	/* pass if off to listener */
+	chansess->agentlistener = new_listener( &fd, 1, 0, chansess, 
+								agentaccept, NULL);
+
+	if (chansess->agentlistener == NULL) {
+		goto fail;
+	}
+
+	return DROPBEAR_SUCCESS;
+
+fail:
+	/* cleanup */
+	agentcleanup(chansess);
+
+	return DROPBEAR_FAILURE;
+}
+
+/* accepts a connection on the forwarded socket and opens a new channel for it
+ * back to the client */
+/* returns DROPBEAR_SUCCESS or DROPBEAR_FAILURE */
+static void agentaccept(struct Listener *UNUSED(listener), int sock) {
+
+	int fd;
+
+	fd = accept(sock, NULL, NULL);
+	if (fd < 0) {
+		TRACE(("accept failed"))
+		return;
+	}
+
+	if (send_msg_channel_open_agent(fd) != DROPBEAR_SUCCESS) {
+		close(fd);
+	}
+
+}
+
+/* set up the environment variable pointing to the socket. This is called
+ * just before command/shell execution, after dropping priveleges */
+void agentset(struct ChanSess * chansess) {
+
+	char *path = NULL;
+	int len;
+
+	if (chansess->agentlistener == NULL) {
+		return;
+	}
+
+	/* 2 for "/" and "\0" */
+	len = strlen(chansess->agentdir) + strlen(chansess->agentfile) + 2;
+
+	path = m_malloc(len);
+	snprintf(path, len, "%s/%s", chansess->agentdir, chansess->agentfile);
+	addnewvar("SSH_AUTH_SOCK", path);
+	m_free(path);
+}
+
+/* close the socket, remove the socket-file */
+void agentcleanup(struct ChanSess * chansess) {
+
+	char *path = NULL;
+	uid_t uid;
+	gid_t gid;
+	int len;
+
+	if (chansess->agentlistener != NULL) {
+		remove_listener(chansess->agentlistener);
+		chansess->agentlistener = NULL;
+	}
+
+	if (chansess->agentfile != NULL && chansess->agentdir != NULL) {
+
+		/* Remove the dir as the user. That way they can't cause problems except
+		 * for themselves */
+		uid = getuid();
+		gid = getgid();
+		if ((setegid(ses.authstate.pw->pw_gid)) < 0 ||
+			(seteuid(ses.authstate.pw->pw_uid)) < 0) {
+			dropbear_exit("failed to set euid");
+		}
+
+		/* 2 for "/" and "\0" */
+		len = strlen(chansess->agentdir) + strlen(chansess->agentfile) + 2;
+
+		path = m_malloc(len);
+		snprintf(path, len, "%s/%s", chansess->agentdir, chansess->agentfile);
+		unlink(path);
+		m_free(path);
+
+		rmdir(chansess->agentdir);
+
+		if ((seteuid(uid)) < 0 ||
+			(setegid(gid)) < 0) {
+			dropbear_exit("failed to revert euid");
+		}
+
+		m_free(chansess->agentfile);
+		m_free(chansess->agentdir);
+	}
+
+}
+
+static const struct ChanType chan_agent = {
+	0, /* sepfds */
+	"auth-agent@openssh.com",
+	NULL,
+	NULL,
+	NULL,
+	NULL
+};
+
+
+/* helper for accepting an agent request */
+static int send_msg_channel_open_agent(int fd) {
+
+	if (send_msg_channel_open_init(fd, &chan_agent) == DROPBEAR_SUCCESS) {
+		encrypt_packet();
+		return DROPBEAR_SUCCESS;
+	} else {
+		return DROPBEAR_FAILURE;
+	}
+}
+
+/* helper for creating the agent socket-file
+   returns DROPBEAR_SUCCESS or DROPBEAR_FAILURE */
+static int bindagent(int fd, struct ChanSess * chansess) {
+
+	struct sockaddr_un addr;
+	unsigned int prefix;
+	char path[sizeof(addr.sun_path)], sockfile[sizeof(addr.sun_path)];
+	mode_t mode;
+	int i;
+	uid_t uid;
+	gid_t gid;
+	int ret = DROPBEAR_FAILURE;
+
+	/* drop to user privs to make the dir/file */
+	uid = getuid();
+	gid = getgid();
+	if ((setegid(ses.authstate.pw->pw_gid)) < 0 ||
+		(seteuid(ses.authstate.pw->pw_uid)) < 0) {
+		dropbear_exit("failed to set euid");
+	}
+
+	memset((void*)&addr, 0x0, sizeof(addr));
+	addr.sun_family = AF_UNIX;
+
+	mode = S_IRWXU;
+
+	for (i = 0; i < 20; i++) {
+		genrandom((unsigned char*)&prefix, sizeof(prefix));
+		/* we want 32 bits (8 hex digits) - "/tmp/dropbear-f19c62c0" */
+		snprintf(path, sizeof(path), AGENTDIRPREFIX "%.8x", prefix);
+
+		if (mkdir(path, mode) == 0) {
+			goto bindsocket;
+		}
+		if (errno != EEXIST) {
+			break;
+		}
+	}
+	/* couldn't make a dir */
+	goto out;
+
+bindsocket:
+	/* Format is "/tmp/dropbear-0246dead/auth-d00f7654-23".
+	 * The "23" is the file desc, the random data is to avoid collisions
+	 * between subsequent user processes reusing socket fds (odds are now
+	 * 1/(2^64) */
+	genrandom((unsigned char*)&prefix, sizeof(prefix));
+	snprintf(sockfile, sizeof(sockfile), "auth-%.8x-%d", prefix, fd);
+			
+	snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s", path, sockfile);
+
+	if (bind(fd, (struct sockaddr*)&addr, sizeof(addr)) == 0) {
+		chansess->agentdir = m_strdup(path);
+		chansess->agentfile = m_strdup(sockfile);
+		ret = DROPBEAR_SUCCESS;
+	}
+
+
+out:
+	if ((seteuid(uid)) < 0 ||
+		(setegid(gid)) < 0) {
+		dropbear_exit("failed to revert euid");
+	}
+	return ret;
+}
+
+#endif
diff --git a/svr-algo.c b/svr-algo.c
new file mode 100644
index 0000000..c0b7823
--- /dev/null
+++ b/svr-algo.c
@@ -0,0 +1,100 @@
+/*
+ * Dropbear - a SSH2 server
+ * SSH client implementation
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * Copyright (c) 2004 by Mihnea Stoenescu
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "algo.h"
+#include "dbutil.h"
+
+/* match the first algorithm in the comma-separated list in buf which is
+ * also in localalgos[], or return NULL on failure.
+ * (*goodguess) is set to 1 if the preferred client/server algos match,
+ * 0 otherwise. This is used for checking if the kexalgo/hostkeyalgos are
+ * guessed correctly */
+algo_type * svr_buf_match_algo(buffer* buf, algo_type localalgos[],
+		int *goodguess)
+{
+
+	unsigned char * algolist = NULL;
+	unsigned char * remotealgos[MAX_PROPOSED_ALGO];
+	unsigned int len;
+	unsigned int count, i, j;
+	algo_type * ret = NULL;
+
+	*goodguess = 0;
+
+	/* get the comma-separated list from the buffer ie "algo1,algo2,algo3" */
+	algolist = buf_getstring(buf, &len);
+	/* Debug this */
+	TRACE(("buf_match_algo: %s", algolist))
+	if (len > MAX_PROPOSED_ALGO*(MAX_NAME_LEN+1)) {
+		goto out; /* just a sanity check, no other use */
+	}
+
+	/* remotealgos will contain a list of the strings parsed out */
+	/* We will have at least one string (even if it's just "") */
+	remotealgos[0] = algolist;
+	count = 1;
+	/* Iterate through, replacing ','s with NULs, to split it into
+	 * words. */
+	for (i = 0; i < len; i++) {
+		if (algolist[i] == '\0') {
+			/* someone is trying something strange */
+			goto out;
+		}
+		if (algolist[i] == ',') {
+			algolist[i] = '\0';
+			remotealgos[count] = &algolist[i+1];
+			count++;
+		}
+		if (count == MAX_PROPOSED_ALGO) {
+			break;
+		}
+	}
+
+	/* iterate and find the first match */
+	for (i = 0; i < count; i++) {
+
+		len = strlen(remotealgos[i]);
+
+		for (j = 0; localalgos[j].name != NULL; j++) {
+			if (localalgos[j].usable) {
+				if (len == strlen(localalgos[j].name) &&
+						strncmp(localalgos[j].name, remotealgos[i], len) == 0) {
+					/* set if it was a good guess */
+					if (i == 0 && j == 0) {
+						*goodguess = 1;
+					}
+					/* set the algo to return */
+					ret = &localalgos[j];
+					goto out;
+				}
+			}
+		}
+	}
+
+out:
+	m_free(algolist);
+	return ret;
+}
diff --git a/svr-auth.c b/svr-auth.c
new file mode 100644
index 0000000..f0fca38
--- /dev/null
+++ b/svr-auth.c
@@ -0,0 +1,373 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+/* This file (auth.c) handles authentication requests, passing it to the
+ * particular type (auth-passwd, auth-pubkey). */
+
+#include "includes.h"
+#include "dbutil.h"
+#include "session.h"
+#include "buffer.h"
+#include "ssh.h"
+#include "packet.h"
+#include "auth.h"
+#include "runopts.h"
+
+static void authclear();
+static int checkusername(unsigned char *username, unsigned int userlen);
+static void send_msg_userauth_banner();
+
+/* initialise the first time for a session, resetting all parameters */
+void svr_authinitialise() {
+
+	ses.authstate.failcount = 0;
+	authclear();
+	
+}
+
+/* Reset the auth state, but don't reset the failcount. This is for if the
+ * user decides to try with a different username etc, and is also invoked
+ * on initialisation */
+static void authclear() {
+	
+	memset(&ses.authstate, 0, sizeof(ses.authstate));
+#ifdef ENABLE_SVR_PUBKEY_AUTH
+	ses.authstate.authtypes |= AUTH_TYPE_PUBKEY;
+#endif
+#if defined(ENABLE_SVR_PASSWORD_AUTH) || defined(ENABLE_SVR_PAM_AUTH)
+	if (!svr_opts.noauthpass) {
+		ses.authstate.authtypes |= AUTH_TYPE_PASSWORD;
+	}
+#endif
+
+}
+
+/* Send a banner message if specified to the client. The client might
+ * ignore this, but possibly serves as a legal "no trespassing" sign */
+static void send_msg_userauth_banner() {
+
+	TRACE(("enter send_msg_userauth_banner"))
+	if (svr_opts.banner == NULL) {
+		TRACE(("leave send_msg_userauth_banner: banner is NULL"))
+		return;
+	}
+
+	CHECKCLEARTOWRITE();
+
+	buf_putbyte(ses.writepayload, SSH_MSG_USERAUTH_BANNER);
+	buf_putstring(ses.writepayload, buf_getptr(svr_opts.banner,
+				svr_opts.banner->len), svr_opts.banner->len);
+	buf_putstring(ses.writepayload, "en", 2);
+
+	encrypt_packet();
+	buf_free(svr_opts.banner);
+	svr_opts.banner = NULL;
+
+	TRACE(("leave send_msg_userauth_banner"))
+}
+
+/* handle a userauth request, check validity, pass to password or pubkey
+ * checking, and handle success or failure */
+void recv_msg_userauth_request() {
+
+	unsigned char *username = NULL, *servicename = NULL, *methodname = NULL;
+	unsigned int userlen, servicelen, methodlen;
+
+	TRACE(("enter recv_msg_userauth_request"))
+
+	/* ignore packets if auth is already done */
+	if (ses.authstate.authdone == 1) {
+		TRACE(("leave recv_msg_userauth_request: authdone already"))
+		return;
+	}
+
+	/* send the banner if it exists, it will only exist once */
+	if (svr_opts.banner) {
+		send_msg_userauth_banner();
+	}
+
+	
+	username = buf_getstring(ses.payload, &userlen);
+	servicename = buf_getstring(ses.payload, &servicelen);
+	methodname = buf_getstring(ses.payload, &methodlen);
+
+	/* only handle 'ssh-connection' currently */
+	if (servicelen != SSH_SERVICE_CONNECTION_LEN
+			&& (strncmp(servicename, SSH_SERVICE_CONNECTION,
+					SSH_SERVICE_CONNECTION_LEN) != 0)) {
+		
+		/* TODO - disconnect here */
+		m_free(username);
+		m_free(servicename);
+		m_free(methodname);
+		dropbear_exit("unknown service in auth");
+	}
+
+	/* user wants to know what methods are supported */
+	if (methodlen == AUTH_METHOD_NONE_LEN &&
+			strncmp(methodname, AUTH_METHOD_NONE,
+				AUTH_METHOD_NONE_LEN) == 0) {
+		TRACE(("recv_msg_userauth_request: 'none' request"))
+		send_msg_userauth_failure(0, 0);
+		goto out;
+	}
+	
+	/* check username is good before continuing */
+	if (checkusername(username, userlen) == DROPBEAR_FAILURE) {
+		/* username is invalid/no shell/etc - send failure */
+		TRACE(("sending checkusername failure"))
+		send_msg_userauth_failure(0, 1);
+		goto out;
+	}
+
+#ifdef ENABLE_SVR_PASSWORD_AUTH
+	if (!svr_opts.noauthpass &&
+			!(svr_opts.norootpass && ses.authstate.pw->pw_uid == 0) ) {
+		/* user wants to try password auth */
+		if (methodlen == AUTH_METHOD_PASSWORD_LEN &&
+				strncmp(methodname, AUTH_METHOD_PASSWORD,
+					AUTH_METHOD_PASSWORD_LEN) == 0) {
+			svr_auth_password();
+			goto out;
+		}
+	}
+#endif
+
+#ifdef ENABLE_SVR_PAM_AUTH
+	if (!svr_opts.noauthpass &&
+			!(svr_opts.norootpass && ses.authstate.pw->pw_uid == 0) ) {
+		/* user wants to try password auth */
+		if (methodlen == AUTH_METHOD_PASSWORD_LEN &&
+				strncmp(methodname, AUTH_METHOD_PASSWORD,
+					AUTH_METHOD_PASSWORD_LEN) == 0) {
+			svr_auth_pam();
+			goto out;
+		}
+	}
+#endif
+
+#ifdef ENABLE_SVR_PUBKEY_AUTH
+	/* user wants to try pubkey auth */
+	if (methodlen == AUTH_METHOD_PUBKEY_LEN &&
+			strncmp(methodname, AUTH_METHOD_PUBKEY,
+				AUTH_METHOD_PUBKEY_LEN) == 0) {
+		svr_auth_pubkey();
+		goto out;
+	}
+#endif
+
+	/* nothing matched, we just fail */
+	send_msg_userauth_failure(0, 1);
+
+out:
+
+	m_free(username);
+	m_free(servicename);
+	m_free(methodname);
+}
+
+/* Check that the username exists, has a non-empty password, and has a valid
+ * shell.
+ * returns DROPBEAR_SUCCESS on valid username, DROPBEAR_FAILURE on failure */
+static int checkusername(unsigned char *username, unsigned int userlen) {
+
+	char* listshell = NULL;
+	char* usershell = NULL;
+	
+	TRACE(("enter checkusername"))
+	if (userlen > MAX_USERNAME_LEN) {
+		return DROPBEAR_FAILURE;
+	}
+
+	/* new user or username has changed */
+	if (ses.authstate.username == NULL ||
+		strcmp(username, ses.authstate.username) != 0) {
+			/* the username needs resetting */
+			if (ses.authstate.username != NULL) {
+				dropbear_log(LOG_WARNING, "client trying multiple usernames from %s",
+							svr_ses.addrstring);
+				m_free(ses.authstate.username);
+			}
+			authclear();
+			ses.authstate.pw = getpwnam((char*)username);
+			ses.authstate.username = m_strdup(username);
+			m_free(ses.authstate.printableuser);
+	}
+
+	/* check that user exists */
+	if (ses.authstate.pw == NULL) {
+		TRACE(("leave checkusername: user '%s' doesn't exist", username))
+		dropbear_log(LOG_WARNING,
+				"login attempt for nonexistent user from %s",
+				svr_ses.addrstring);
+		send_msg_userauth_failure(0, 1);
+		return DROPBEAR_FAILURE;
+	}
+
+	/* We can set it once we know its a real user */
+	ses.authstate.printableuser = m_strdup(ses.authstate.pw->pw_name);
+
+	/* check for non-root if desired */
+	if (svr_opts.norootlogin && ses.authstate.pw->pw_uid == 0) {
+		TRACE(("leave checkusername: root login disabled"))
+		dropbear_log(LOG_WARNING, "root login rejected");
+		send_msg_userauth_failure(0, 1);
+		return DROPBEAR_FAILURE;
+	}
+
+	/* check for an empty password */
+	if (ses.authstate.pw->pw_passwd[0] == '\0') {
+		TRACE(("leave checkusername: empty pword"))
+		dropbear_log(LOG_WARNING, "user '%s' has blank password, rejected",
+				ses.authstate.printableuser);
+		send_msg_userauth_failure(0, 1);
+		return DROPBEAR_FAILURE;
+	}
+
+	TRACE(("shell is %s", ses.authstate.pw->pw_shell))
+
+	/* check that the shell is set */
+	usershell = ses.authstate.pw->pw_shell;
+	if (usershell[0] == '\0') {
+		/* empty shell in /etc/passwd means /bin/sh according to passwd(5) */
+		usershell = "/bin/sh";
+	}
+
+	/* check the shell is valid. If /etc/shells doesn't exist, getusershell()
+	 * should return some standard shells like "/bin/sh" and "/bin/csh" (this
+	 * is platform-specific) */
+	setusershell();
+	while ((listshell = getusershell()) != NULL) {
+		TRACE(("test shell is '%s'", listshell))
+		if (strcmp(listshell, usershell) == 0) {
+			/* have a match */
+			goto goodshell;
+		}
+	}
+	/* no matching shell */
+	endusershell();
+	TRACE(("no matching shell"))
+	dropbear_log(LOG_WARNING, "user '%s' has invalid shell, rejected",
+				ses.authstate.printableuser);
+	send_msg_userauth_failure(0, 1);
+	return DROPBEAR_FAILURE;
+	
+goodshell:
+	endusershell();
+	TRACE(("matching shell"))
+
+	TRACE(("uid = %d", ses.authstate.pw->pw_uid))
+	TRACE(("leave checkusername"))
+	return DROPBEAR_SUCCESS;
+
+}
+
+/* Send a failure message to the client, in responds to a userauth_request.
+ * Partial indicates whether to set the "partial success" flag,
+ * incrfail is whether to count this failure in the failure count (which
+ * is limited. This function also handles disconnection after too many
+ * failures */
+void send_msg_userauth_failure(int partial, int incrfail) {
+
+	buffer *typebuf = NULL;
+
+	TRACE(("enter send_msg_userauth_failure"))
+
+	CHECKCLEARTOWRITE();
+	
+	buf_putbyte(ses.writepayload, SSH_MSG_USERAUTH_FAILURE);
+
+	/* put a list of allowed types */
+	typebuf = buf_new(30); /* long enough for PUBKEY and PASSWORD */
+
+	if (ses.authstate.authtypes & AUTH_TYPE_PUBKEY) {
+		buf_putbytes(typebuf, AUTH_METHOD_PUBKEY, AUTH_METHOD_PUBKEY_LEN);
+		if (ses.authstate.authtypes & AUTH_TYPE_PASSWORD) {
+			buf_putbyte(typebuf, ',');
+		}
+	}
+	
+	if (ses.authstate.authtypes & AUTH_TYPE_PASSWORD) {
+		buf_putbytes(typebuf, AUTH_METHOD_PASSWORD, AUTH_METHOD_PASSWORD_LEN);
+	}
+
+	buf_setpos(typebuf, 0);
+	buf_putstring(ses.writepayload, buf_getptr(typebuf, typebuf->len),
+			typebuf->len);
+	buf_free(typebuf);
+
+	buf_putbyte(ses.writepayload, partial ? 1 : 0);
+	encrypt_packet();
+
+	TRACE(("auth fail: methods %d, '%s'", ses.authstate.authtypes,
+				buf_getptr(typebuf, typebuf->len)));
+
+	if (incrfail) {
+		usleep(300000); /* XXX improve this */
+		ses.authstate.failcount++;
+	}
+
+	if (ses.authstate.failcount >= MAX_AUTH_TRIES) {
+		char * userstr;
+		/* XXX - send disconnect ? */
+		TRACE(("Max auth tries reached, exiting"))
+
+		if (ses.authstate.printableuser == NULL) {
+			userstr = "is invalid";
+		} else {
+			userstr = ses.authstate.printableuser;
+		}
+		dropbear_exit("Max auth tries reached - user '%s' from %s",
+				userstr, svr_ses.addrstring);
+	}
+	
+	TRACE(("leave send_msg_userauth_failure"))
+}
+
+/* Send a success message to the user, and set the "authdone" flag */
+void send_msg_userauth_success() {
+
+	TRACE(("enter send_msg_userauth_success"))
+
+	CHECKCLEARTOWRITE();
+
+	buf_putbyte(ses.writepayload, SSH_MSG_USERAUTH_SUCCESS);
+	encrypt_packet();
+
+	ses.authstate.authdone = 1;
+	ses.connecttimeout = 0;
+
+
+	if (ses.authstate.pw->pw_uid == 0) {
+		ses.allowprivport = 1;
+	}
+
+	/* Remove from the list of pre-auth sockets. Should be m_close(), since if
+	 * we fail, we might end up leaking connection slots, and disallow new
+	 * logins - a nasty situation. */							
+	m_close(svr_ses.childpipe);
+
+	TRACE(("leave send_msg_userauth_success"))
+
+}
diff --git a/svr-authpam.c b/svr-authpam.c
new file mode 100644
index 0000000..4e257ae
--- /dev/null
+++ b/svr-authpam.c
@@ -0,0 +1,258 @@
+/*
+ * Dropbear SSH
+ * 
+ * Copyright (c) 2004 Martin Carlsson
+ * Portions (c) 2004 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+/* Validates a user password using PAM */
+
+#include "includes.h"
+#include "session.h"
+#include "buffer.h"
+#include "dbutil.h"
+#include "auth.h"
+
+#if defined(HAVE_SECURITY_PAM_APPL_H)
+#include <security/pam_appl.h>
+#elif defined (HAVE_PAM_PAM_APPL_H)
+#include <pam/pam_appl.h>
+#endif
+
+#ifdef ENABLE_SVR_PAM_AUTH
+
+struct UserDataS {
+	char* user;
+	char* passwd;
+};
+
+/* PAM conversation function - for now we only handle one message */
+int 
+pamConvFunc(int num_msg, 
+		const struct pam_message **msg,
+		struct pam_response **respp, 
+		void *appdata_ptr) {
+
+	int rc = PAM_SUCCESS;
+	struct pam_response* resp = NULL;
+	struct UserDataS* userDatap = (struct UserDataS*) appdata_ptr;
+	unsigned int msg_len = 0;
+	unsigned int i = 0;
+
+	const char* message = (*msg)->msg;
+
+	/* make a copy we can strip */
+	char * compare_message = m_strdup(message);
+
+	TRACE(("enter pamConvFunc"))
+
+	if (num_msg != 1) {
+		/* If you're getting here - Dropbear probably can't support your pam
+		 * modules. This whole file is a bit of a hack around lack of
+		 * asynchronocity in PAM anyway. */
+		dropbear_log(LOG_INFO, "pamConvFunc() called with >1 messages: not supported.");
+		return PAM_CONV_ERR;
+	}
+	
+	TRACE(("msg_style is %d", (*msg)->msg_style))
+	if (compare_message) {
+		TRACE(("message is '%s'", compare_message))
+	} else {
+		TRACE(("null message"))
+	}
+
+
+	/* Make the string lowercase. */
+	msg_len = strlen(compare_message);
+	for (i = 0; i < msg_len; i++) {
+		compare_message[i] = tolower(compare_message[i]);
+	}
+
+	/* If the string ends with ": ", remove the space. 
+	   ie "login: " vs "login:" */
+	if (msg_len > 2 
+			&& compare_message[msg_len-2] == ':' 
+			&& compare_message[msg_len-1] == ' ') {
+		compare_message[msg_len-1] = '\0';
+	}
+
+	switch((*msg)->msg_style) {
+
+		case PAM_PROMPT_ECHO_OFF:
+
+			if (!(strcmp(compare_message, "password:") == 0)) {
+				/* We don't recognise the prompt as asking for a password,
+				   so can't handle it. Add more above as required for
+				   different pam modules/implementations */
+				dropbear_log(LOG_NOTICE, "PAM unknown prompt %s (no echo)",
+						compare_message);
+				rc = PAM_CONV_ERR;
+				break;
+			}
+
+			/* You have to read the PAM module-writers' docs (do we look like
+			 * module writers? no.) to find out that the module will
+			 * free the pam_response and its resp element - ie we _must_ malloc
+			 * it here */
+			resp = (struct pam_response*) m_malloc(sizeof(struct pam_response));
+			memset(resp, 0, sizeof(struct pam_response));
+
+			resp->resp = m_strdup(userDatap->passwd);
+			m_burn(userDatap->passwd, strlen(userDatap->passwd));
+			(*respp) = resp;
+			break;
+
+
+		case PAM_PROMPT_ECHO_ON:
+
+			if (!((strcmp(compare_message, "login:" ) == 0) 
+				|| (strcmp(compare_message, "please enter username:") == 0))) {
+				/* We don't recognise the prompt as asking for a username,
+				   so can't handle it. Add more above as required for
+				   different pam modules/implementations */
+				dropbear_log(LOG_NOTICE, "PAM unknown prompt %s (with echo)",
+						compare_message);
+				rc = PAM_CONV_ERR;
+				break;
+			}
+
+			/* You have to read the PAM module-writers' docs (do we look like
+			 * module writers? no.) to find out that the module will
+			 * free the pam_response and its resp element - ie we _must_ malloc
+			 * it here */
+			resp = (struct pam_response*) m_malloc(sizeof(struct pam_response));
+			memset(resp, 0, sizeof(struct pam_response));
+
+			resp->resp = m_strdup(userDatap->user);
+			TRACE(("userDatap->user='%s'", userDatap->user))
+			(*respp) = resp;
+			break;
+
+		default:
+			TRACE(("Unknown message type"))
+			rc = PAM_CONV_ERR;
+			break;      
+	}
+
+	m_free(compare_message);
+	TRACE(("leave pamConvFunc, rc %d", rc))
+
+	return rc;
+}
+
+/* Process a password auth request, sending success or failure messages as
+ * appropriate. To the client it looks like it's doing normal password auth (as
+ * opposed to keyboard-interactive or something), so the pam module has to be
+ * fairly standard (ie just "what's your username, what's your password, OK").
+ *
+ * Keyboard interactive would be a lot nicer, but since PAM is synchronous, it
+ * gets very messy trying to send the interactive challenges, and read the
+ * interactive responses, over the network. */
+void svr_auth_pam() {
+
+	struct UserDataS userData = {NULL, NULL};
+	struct pam_conv pamConv = {
+		pamConvFunc,
+		&userData /* submitted to pamvConvFunc as appdata_ptr */ 
+	};
+
+	pam_handle_t* pamHandlep = NULL;
+
+	unsigned char * password = NULL;
+	unsigned int passwordlen;
+
+	int rc = PAM_SUCCESS;
+	unsigned char changepw;
+
+	/* check if client wants to change password */
+	changepw = buf_getbool(ses.payload);
+	if (changepw) {
+		/* not implemented by this server */
+		send_msg_userauth_failure(0, 1);
+		goto cleanup;
+	}
+
+	password = buf_getstring(ses.payload, &passwordlen);
+
+	/* used to pass data to the PAM conversation function - don't bother with
+	 * strdup() etc since these are touched only by our own conversation
+	 * function (above) which takes care of it */
+	userData.user = ses.authstate.printableuser;
+	userData.passwd = password;
+
+	/* Init pam */
+	if ((rc = pam_start("sshd", NULL, &pamConv, &pamHandlep)) != PAM_SUCCESS) {
+		dropbear_log(LOG_WARNING, "pam_start() failed, rc=%d, %s\n", 
+				rc, pam_strerror(pamHandlep, rc));
+		goto cleanup;
+	}
+
+	/* just to set it to something */
+	if ((rc = pam_set_item(pamHandlep, PAM_TTY, "ssh") != PAM_SUCCESS)) {
+		dropbear_log(LOG_WARNING, "pam_set_item() failed, rc=%d, %s\n", 
+				rc, pam_strerror(pamHandlep, rc));
+		goto cleanup;
+	}
+
+	(void) pam_fail_delay(pamHandlep, 0 /* musec_delay */);
+
+	/* (void) pam_set_item(pamHandlep, PAM_FAIL_DELAY, (void*) pamDelayFunc); */
+
+	if ((rc = pam_authenticate(pamHandlep, 0)) != PAM_SUCCESS) {
+		dropbear_log(LOG_WARNING, "pam_authenticate() failed, rc=%d, %s\n", 
+				rc, pam_strerror(pamHandlep, rc));
+		dropbear_log(LOG_WARNING,
+				"bad PAM password attempt for '%s' from %s",
+				ses.authstate.printableuser,
+				svr_ses.addrstring);
+		send_msg_userauth_failure(0, 1);
+		goto cleanup;
+	}
+
+	if ((rc = pam_acct_mgmt(pamHandlep, 0)) != PAM_SUCCESS) {
+		dropbear_log(LOG_WARNING, "pam_acct_mgmt() failed, rc=%d, %s\n", 
+				rc, pam_strerror(pamHandlep, rc));
+		dropbear_log(LOG_WARNING,
+				"bad PAM password attempt for '%s' from %s",
+				ses.authstate.printableuser,
+				svr_ses.addrstring);
+		send_msg_userauth_failure(0, 1);
+		goto cleanup;
+	}
+
+	/* successful authentication */
+	dropbear_log(LOG_NOTICE, "PAM password auth succeeded for '%s' from %s",
+			ses.authstate.printableuser,
+			svr_ses.addrstring);
+	send_msg_userauth_success();
+
+cleanup:
+	if (password != NULL) {
+		m_burn(password, passwordlen);
+		m_free(password);
+	}
+	if (pamHandlep != NULL) {
+		TRACE(("pam_end"))
+		(void) pam_end(pamHandlep, 0 /* pam_status */);
+	}
+}
+
+#endif /* ENABLE_SVR_PAM_AUTH */
diff --git a/svr-authpasswd.c b/svr-authpasswd.c
new file mode 100644
index 0000000..5be1e2a
--- /dev/null
+++ b/svr-authpasswd.c
@@ -0,0 +1,105 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+/* Validates a user password */
+
+#include "includes.h"
+#include "session.h"
+#include "buffer.h"
+#include "dbutil.h"
+#include "auth.h"
+
+#ifdef ENABLE_SVR_PASSWORD_AUTH
+
+/* Process a password auth request, sending success or failure messages as
+ * appropriate */
+void svr_auth_password() {
+	
+#ifdef HAVE_SHADOW_H
+	struct spwd *spasswd = NULL;
+#endif
+	char * passwdcrypt = NULL; /* the crypt from /etc/passwd or /etc/shadow */
+	char * testcrypt = NULL; /* crypt generated from the user's password sent */
+	unsigned char * password;
+	unsigned int passwordlen;
+
+	unsigned int changepw;
+
+	passwdcrypt = ses.authstate.pw->pw_passwd;
+#ifdef HAVE_SHADOW_H
+	/* get the shadow password if possible */
+	spasswd = getspnam(ses.authstate.printableuser);
+	if (spasswd != NULL && spasswd->sp_pwdp != NULL) {
+		passwdcrypt = spasswd->sp_pwdp;
+	}
+#endif
+
+#ifdef DEBUG_HACKCRYPT
+	/* debugging crypt for non-root testing with shadows */
+	passwdcrypt = DEBUG_HACKCRYPT;
+#endif
+
+	/* check for empty password - need to do this again here
+	 * since the shadow password may differ to that tested
+	 * in auth.c */
+	if (passwdcrypt[0] == '\0') {
+		dropbear_log(LOG_WARNING, "user '%s' has blank password, rejected",
+				ses.authstate.printableuser);
+		send_msg_userauth_failure(0, 1);
+		return;
+	}
+
+	/* check if client wants to change password */
+	changepw = buf_getbool(ses.payload);
+	if (changepw) {
+		/* not implemented by this server */
+		send_msg_userauth_failure(0, 1);
+		return;
+	}
+
+	password = buf_getstring(ses.payload, &passwordlen);
+
+	/* the first bytes of passwdcrypt are the salt */
+	testcrypt = crypt((char*)password, passwdcrypt);
+	m_burn(password, passwordlen);
+	m_free(password);
+
+	if (strcmp(testcrypt, passwdcrypt) == 0) {
+		/* successful authentication */
+		dropbear_log(LOG_NOTICE, 
+				"password auth succeeded for '%s' from %s",
+				ses.authstate.printableuser,
+				svr_ses.addrstring);
+		send_msg_userauth_success();
+	} else {
+		dropbear_log(LOG_WARNING,
+				"bad password attempt for '%s' from %s",
+				ses.authstate.printableuser,
+				svr_ses.addrstring);
+		send_msg_userauth_failure(0, 1);
+	}
+
+}
+
+#endif
diff --git a/svr-authpubkey.c b/svr-authpubkey.c
new file mode 100644
index 0000000..3942bd5
--- /dev/null
+++ b/svr-authpubkey.c
@@ -0,0 +1,347 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+/* Process a pubkey auth request */
+
+#include "includes.h"
+#include "session.h"
+#include "dbutil.h"
+#include "buffer.h"
+#include "signkey.h"
+#include "auth.h"
+#include "ssh.h"
+#include "packet.h"
+#include "algo.h"
+
+#ifdef ENABLE_SVR_PUBKEY_AUTH
+
+#define MIN_AUTHKEYS_LINE 10 /* "ssh-rsa AB" - short but doesn't matter */
+#define MAX_AUTHKEYS_LINE 4200 /* max length of a line in authkeys */
+
+static int checkpubkey(unsigned char* algo, unsigned int algolen,
+		unsigned char* keyblob, unsigned int keybloblen);
+static int checkpubkeyperms();
+static void send_msg_userauth_pk_ok(unsigned char* algo, unsigned int algolen,
+		unsigned char* keyblob, unsigned int keybloblen);
+static int checkfileperm(char * filename);
+
+/* process a pubkey auth request, sending success or failure message as
+ * appropriate */
+void svr_auth_pubkey() {
+
+	unsigned char testkey; /* whether we're just checking if a key is usable */
+	unsigned char* algo = NULL; /* pubkey algo */
+	unsigned int algolen;
+	unsigned char* keyblob = NULL;
+	unsigned int keybloblen;
+	buffer * signbuf = NULL;
+	sign_key * key = NULL;
+	char* fp = NULL;
+	int type = -1;
+
+	TRACE(("enter pubkeyauth"))
+
+	/* 0 indicates user just wants to check if key can be used, 1 is an
+	 * actual attempt*/
+	testkey = (buf_getbool(ses.payload) == 0);
+
+	algo = buf_getstring(ses.payload, &algolen);
+	keybloblen = buf_getint(ses.payload);
+	keyblob = buf_getptr(ses.payload, keybloblen);
+
+	/* check if the key is valid */
+	if (checkpubkey(algo, algolen, keyblob, keybloblen) == DROPBEAR_FAILURE) {
+		send_msg_userauth_failure(0, 0);
+		goto out;
+	}
+
+	/* let them know that the key is ok to use */
+	if (testkey) {
+		send_msg_userauth_pk_ok(algo, algolen, keyblob, keybloblen);
+		goto out;
+	}
+
+	/* now we can actually verify the signature */
+	
+	/* get the key */
+	key = new_sign_key();
+	type = DROPBEAR_SIGNKEY_ANY;
+	if (buf_get_pub_key(ses.payload, key, &type) == DROPBEAR_FAILURE) {
+		send_msg_userauth_failure(0, 1);
+		goto out;
+	}
+
+	/* create the data which has been signed - this a string containing
+	 * session_id, concatenated with the payload packet up to the signature */
+	signbuf = buf_new(ses.payload->pos + 4 + SHA1_HASH_SIZE);
+	buf_putstring(signbuf, ses.session_id, SHA1_HASH_SIZE);
+	buf_putbytes(signbuf, ses.payload->data, ses.payload->pos);
+	buf_setpos(signbuf, 0);
+
+	/* ... and finally verify the signature */
+	fp = sign_key_fingerprint(keyblob, keybloblen);
+	if (buf_verify(ses.payload, key, buf_getptr(signbuf, signbuf->len),
+				signbuf->len) == DROPBEAR_SUCCESS) {
+		dropbear_log(LOG_NOTICE,
+				"pubkey auth succeeded for '%s' with key %s from %s",
+				ses.authstate.printableuser, fp, svr_ses.addrstring);
+		send_msg_userauth_success();
+	} else {
+		dropbear_log(LOG_WARNING,
+				"pubkey auth bad signature for '%s' with key %s from %s",
+				ses.authstate.printableuser, fp, svr_ses.addrstring);
+		send_msg_userauth_failure(0, 1);
+	}
+	m_free(fp);
+
+out:
+	/* cleanup stuff */
+	if (signbuf) {
+		buf_free(signbuf);
+	}
+	if (algo) {
+		m_free(algo);
+	}
+	if (key) {
+		sign_key_free(key);
+		key = NULL;
+	}
+	TRACE(("leave pubkeyauth"))
+}
+
+/* Reply that the key is valid for auth, this is sent when the user sends
+ * a straight copy of their pubkey to test, to avoid having to perform
+ * expensive signing operations with a worthless key */
+static void send_msg_userauth_pk_ok(unsigned char* algo, unsigned int algolen,
+		unsigned char* keyblob, unsigned int keybloblen) {
+
+	TRACE(("enter send_msg_userauth_pk_ok"))
+	CHECKCLEARTOWRITE();
+
+	buf_putbyte(ses.writepayload, SSH_MSG_USERAUTH_PK_OK);
+	buf_putstring(ses.writepayload, algo, algolen);
+	buf_putstring(ses.writepayload, keyblob, keybloblen);
+
+	encrypt_packet();
+	TRACE(("leave send_msg_userauth_pk_ok"))
+
+}
+
+/* Checks whether a specified publickey (and associated algorithm) is an
+ * acceptable key for authentication */
+/* Returns DROPBEAR_SUCCESS if key is ok for auth, DROPBEAR_FAILURE otherwise */
+static int checkpubkey(unsigned char* algo, unsigned int algolen,
+		unsigned char* keyblob, unsigned int keybloblen) {
+
+	FILE * authfile = NULL;
+	char * filename = NULL;
+	int ret = DROPBEAR_FAILURE;
+	buffer * line = NULL;
+	unsigned int len, pos;
+	
+	TRACE(("enter checkpubkey"))
+
+	/* check that we can use the algo */
+	if (have_algo(algo, algolen, sshhostkey) == DROPBEAR_FAILURE) {
+		dropbear_log(LOG_WARNING,
+				"pubkey auth attempt with unknown algo for '%s' from %s",
+				ses.authstate.printableuser, svr_ses.addrstring);
+		goto out;
+	}
+
+	/* check file permissions, also whether file exists */
+	if (checkpubkeyperms() == DROPBEAR_FAILURE) {
+		TRACE(("bad authorized_keys permissions, or file doesn't exist"))
+		goto out;
+	}
+
+	/* we don't need to check pw and pw_dir for validity, since
+	 * its been done in checkpubkeyperms. */
+	len = strlen(ses.authstate.pw->pw_dir);
+	/* allocate max required pathname storage,
+	 * = path + "/.ssh/authorized_keys" + '\0' = pathlen + 22 */
+	filename = m_malloc(len + 22);
+	snprintf(filename, len + 22, "%s/.ssh/authorized_keys", 
+				ses.authstate.pw->pw_dir);
+
+	/* open the file */
+	authfile = fopen(filename, "r");
+	if (authfile == NULL) {
+		goto out;
+	}
+	TRACE(("checkpubkey: opened authorized_keys OK"))
+
+	line = buf_new(MAX_AUTHKEYS_LINE);
+
+	/* iterate through the lines */
+	do {
+
+		if (buf_getline(line, authfile) == DROPBEAR_FAILURE) {
+			/* EOF reached */
+			TRACE(("checkpubkey: authorized_keys EOF reached"))
+			break;
+		}
+
+		if (line->len < MIN_AUTHKEYS_LINE) {
+			TRACE(("checkpubkey: line too short"))
+			continue; /* line is too short for it to be a valid key */
+		}
+
+		/* check the key type - this also stops us from using keys
+		 * which have options with them */
+		if (strncmp(buf_getptr(line, algolen), algo, algolen) != 0) {
+			continue;
+		}
+		buf_incrpos(line, algolen);
+		
+		/* check for space (' ') character */
+		if (buf_getbyte(line) != ' ') {
+			TRACE(("checkpubkey: space character expected, isn't there"))
+			continue;
+		}
+
+		/* truncate the line at the space after the base64 data */
+		pos = line->pos;
+		for (len = 0; line->pos < line->len; len++) {
+			if (buf_getbyte(line) == ' ') break;
+		}	
+		buf_setpos(line, pos);
+		buf_setlen(line, line->pos + len);
+
+		TRACE(("checkpubkey: line pos = %d len = %d", line->pos, line->len))
+
+		ret = cmp_base64_key(keyblob, keybloblen, algo, algolen, line);
+		if (ret == DROPBEAR_SUCCESS) {
+			break;
+		}
+
+		/* We continue to the next line otherwise */
+		
+	} while (1);
+
+out:
+	if (authfile) {
+		fclose(authfile);
+	}
+	if (line) {
+		buf_free(line);
+	}
+	m_free(filename);
+	TRACE(("leave checkpubkey: ret=%d", ret))
+	return ret;
+}
+
+
+/* Returns DROPBEAR_SUCCESS if file permissions for pubkeys are ok,
+ * DROPBEAR_FAILURE otherwise.
+ * Checks that the user's homedir, ~/.ssh, and
+ * ~/.ssh/authorized_keys are all owned by either root or the user, and are
+ * g-w, o-w */
+static int checkpubkeyperms() {
+
+	char* filename = NULL; 
+	int ret = DROPBEAR_FAILURE;
+	unsigned int len;
+
+	TRACE(("enter checkpubkeyperms"))
+
+	if (ses.authstate.pw->pw_dir == NULL) {
+		goto out;
+	}
+
+	if ((len = strlen(ses.authstate.pw->pw_dir)) == 0) {
+		goto out;
+	}
+
+	/* allocate max required pathname storage,
+	 * = path + "/.ssh/authorized_keys" + '\0' = pathlen + 22 */
+	filename = m_malloc(len + 22);
+	strncpy(filename, ses.authstate.pw->pw_dir, len+1);
+
+	/* check ~ */
+	if (checkfileperm(filename) != DROPBEAR_SUCCESS) {
+		goto out;
+	}
+
+	/* check ~/.ssh */
+	strncat(filename, "/.ssh", 5); /* strlen("/.ssh") == 5 */
+	if (checkfileperm(filename) != DROPBEAR_SUCCESS) {
+		goto out;
+	}
+
+	/* now check ~/.ssh/authorized_keys */
+	strncat(filename, "/authorized_keys", 16);
+	if (checkfileperm(filename) != DROPBEAR_SUCCESS) {
+		goto out;
+	}
+
+	/* file looks ok, return success */
+	ret = DROPBEAR_SUCCESS;
+	
+out:
+	m_free(filename);
+
+	TRACE(("leave checkpubkeyperms"))
+	return ret;
+}
+
+/* Checks that a file is owned by the user or root, and isn't writable by
+ * group or other */
+/* returns DROPBEAR_SUCCESS or DROPBEAR_FAILURE */
+static int checkfileperm(char * filename) {
+	struct stat filestat;
+	int badperm = 0;
+
+	TRACE(("enter checkfileperm(%s)", filename))
+
+	if (stat(filename, &filestat) != 0) {
+		TRACE(("leave checkfileperm: stat() != 0"))
+		return DROPBEAR_FAILURE;
+	}
+	/* check ownership - user or root only*/
+	if (filestat.st_uid != ses.authstate.pw->pw_uid
+			&& filestat.st_uid != 0) {
+		badperm = 1;
+		TRACE(("wrong ownership"))
+	}
+	/* check permissions - don't want group or others +w */
+	if (filestat.st_mode & (S_IWGRP | S_IWOTH)) {
+		badperm = 1;
+		TRACE(("wrong perms"))
+	}
+	if (badperm) {
+		if (!ses.authstate.perm_warn) {
+			ses.authstate.perm_warn = 1;
+			dropbear_log(LOG_INFO, "%s must be owned by user or root, and not writable by others", filename);
+		}
+		TRACE(("leave checkfileperm: failure perms/owner"))
+		return DROPBEAR_FAILURE;
+	}
+
+	TRACE(("leave checkfileperm: success"))
+	return DROPBEAR_SUCCESS;
+}
+
+
+#endif 
diff --git a/svr-chansession.c b/svr-chansession.c
new file mode 100644
index 0000000..0916e7e
--- /dev/null
+++ b/svr-chansession.c
@@ -0,0 +1,1003 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "packet.h"
+#include "buffer.h"
+#include "session.h"
+#include "dbutil.h"
+#include "channel.h"
+#include "chansession.h"
+#include "sshpty.h"
+#include "termcodes.h"
+#include "ssh.h"
+#include "random.h"
+#include "utmp.h"
+#include "x11fwd.h"
+#include "agentfwd.h"
+#include "runopts.h"
+
+/* Handles sessions (either shells or programs) requested by the client */
+
+static int sessioncommand(struct Channel *channel, struct ChanSess *chansess,
+		int iscmd, int issubsys);
+static int sessionpty(struct ChanSess * chansess);
+static int sessionsignal(struct ChanSess *chansess);
+static int noptycommand(struct Channel *channel, struct ChanSess *chansess);
+static int ptycommand(struct Channel *channel, struct ChanSess *chansess);
+static int sessionwinchange(struct ChanSess *chansess);
+static void execchild(struct ChanSess *chansess);
+static void addchildpid(struct ChanSess *chansess, pid_t pid);
+static void sesssigchild_handler(int val);
+static void closechansess(struct Channel *channel);
+static int newchansess(struct Channel *channel);
+static void chansessionrequest(struct Channel *channel);
+
+static void send_exitsignalstatus(struct Channel *channel);
+static void send_msg_chansess_exitstatus(struct Channel * channel,
+		struct ChanSess * chansess);
+static void send_msg_chansess_exitsignal(struct Channel * channel,
+		struct ChanSess * chansess);
+static int sesscheckclose(struct Channel *channel);
+static void get_termmodes(struct ChanSess *chansess);
+
+
+/* required to clear environment */
+extern char** environ;
+
+static int sesscheckclose(struct Channel *channel) {
+	struct ChanSess *chansess = (struct ChanSess*)channel->typedata;
+	return chansess->exit.exitpid >= 0;
+}
+
+/* Handler for childs exiting, store the state for return to the client */
+
+/* There's a particular race we have to watch out for: if the forked child
+ * executes, exits, and this signal-handler is called, all before the parent
+ * gets to run, then the childpids[] array won't have the pid in it. Hence we
+ * use the svr_ses.lastexit struct to hold the exit, which is then compared by
+ * the parent when it runs. This work correctly at least in the case of a
+ * single shell spawned (ie the usual case) */
+static void sesssigchild_handler(int UNUSED(dummy)) {
+
+	int status;
+	pid_t pid;
+	unsigned int i;
+	struct sigaction sa_chld;
+	struct exitinfo *exit = NULL;
+
+	TRACE(("enter sigchld handler"))
+	while ((pid = waitpid(-1, &status, WNOHANG)) > 0) {
+		/* find the corresponding chansess */
+		for (i = 0; i < svr_ses.childpidsize; i++) {
+			if (svr_ses.childpids[i].pid == pid) {
+
+				exit = &svr_ses.childpids[i].chansess->exit;
+				break;
+			}
+		}
+
+		/* If the pid wasn't matched, then we might have hit the race mentioned
+		 * above. So we just store the info for the parent to deal with */
+		if (i == svr_ses.childpidsize) {
+			exit = &svr_ses.lastexit;
+		}
+
+		exit->exitpid = pid;
+		if (WIFEXITED(status)) {
+			exit->exitstatus = WEXITSTATUS(status);
+		}
+		if (WIFSIGNALED(status)) {
+			exit->exitsignal = WTERMSIG(status);
+#if !defined(AIX) && defined(WCOREDUMP)
+			exit->exitcore = WCOREDUMP(status);
+#else
+			exit->exitcore = 0;
+#endif
+		} else {
+			/* we use this to determine how pid exited */
+			exit->exitsignal = -1;
+		}
+		exit = NULL;
+	}
+
+	
+	sa_chld.sa_handler = sesssigchild_handler;
+	sa_chld.sa_flags = SA_NOCLDSTOP;
+	sigaction(SIGCHLD, &sa_chld, NULL);
+	TRACE(("leave sigchld handler"))
+}
+
+/* send the exit status or the signal causing termination for a session */
+/* XXX server */
+static void send_exitsignalstatus(struct Channel *channel) {
+
+	struct ChanSess *chansess = (struct ChanSess*)channel->typedata;
+
+	if (chansess->exit.exitpid >= 0) {
+		if (chansess->exit.exitsignal > 0) {
+			send_msg_chansess_exitsignal(channel, chansess);
+		} else {
+			send_msg_chansess_exitstatus(channel, chansess);
+		}
+	}
+}
+
+/* send the exitstatus to the client */
+static void send_msg_chansess_exitstatus(struct Channel * channel,
+		struct ChanSess * chansess) {
+
+	dropbear_assert(chansess->exit.exitpid != -1);
+	dropbear_assert(chansess->exit.exitsignal == -1);
+
+	CHECKCLEARTOWRITE();
+
+	buf_putbyte(ses.writepayload, SSH_MSG_CHANNEL_REQUEST);
+	buf_putint(ses.writepayload, channel->remotechan);
+	buf_putstring(ses.writepayload, "exit-status", 11);
+	buf_putbyte(ses.writepayload, 0); /* boolean FALSE */
+	buf_putint(ses.writepayload, chansess->exit.exitstatus);
+
+	encrypt_packet();
+
+}
+
+/* send the signal causing the exit to the client */
+static void send_msg_chansess_exitsignal(struct Channel * channel,
+		struct ChanSess * chansess) {
+
+	int i;
+	char* signame = NULL;
+
+	dropbear_assert(chansess->exit.exitpid != -1);
+	dropbear_assert(chansess->exit.exitsignal > 0);
+
+	CHECKCLEARTOWRITE();
+
+	/* we check that we can match a signal name, otherwise
+	 * don't send anything */
+	for (i = 0; signames[i].name != NULL; i++) {
+		if (signames[i].signal == chansess->exit.exitsignal) {
+			signame = signames[i].name;
+			break;
+		}
+	}
+
+	if (signame == NULL) {
+		return;
+	}
+
+	buf_putbyte(ses.writepayload, SSH_MSG_CHANNEL_REQUEST);
+	buf_putint(ses.writepayload, channel->remotechan);
+	buf_putstring(ses.writepayload, "exit-signal", 11);
+	buf_putbyte(ses.writepayload, 0); /* boolean FALSE */
+	buf_putstring(ses.writepayload, signame, strlen(signame));
+	buf_putbyte(ses.writepayload, chansess->exit.exitcore);
+	buf_putstring(ses.writepayload, "", 0); /* error msg */
+	buf_putstring(ses.writepayload, "", 0); /* lang */
+
+	encrypt_packet();
+}
+
+/* set up a session channel */
+static int newchansess(struct Channel *channel) {
+
+	struct ChanSess *chansess;
+
+	dropbear_assert(channel->typedata == NULL);
+
+	chansess = (struct ChanSess*)m_malloc(sizeof(struct ChanSess));
+	chansess->cmd = NULL;
+	chansess->pid = 0;
+
+	/* pty details */
+	chansess->master = -1;
+	chansess->slave = -1;
+	chansess->tty = NULL;
+	chansess->term = NULL;
+
+	chansess->exit.exitpid = -1;
+
+	channel->typedata = chansess;
+
+#ifndef DISABLE_X11FWD
+	chansess->x11listener = NULL;
+	chansess->x11authprot = NULL;
+	chansess->x11authcookie = NULL;
+#endif
+
+#ifndef DISABLE_AGENTFWD
+	chansess->agentlistener = NULL;
+	chansess->agentfile = NULL;
+	chansess->agentdir = NULL;
+#endif
+
+	return 0;
+
+}
+
+/* clean a session channel */
+static void closechansess(struct Channel *channel) {
+
+	struct ChanSess *chansess;
+	unsigned int i;
+	struct logininfo *li;
+
+	chansess = (struct ChanSess*)channel->typedata;
+
+	send_exitsignalstatus(channel);
+
+	TRACE(("enter closechansess"))
+	if (chansess == NULL) {
+		TRACE(("leave closechansess: chansess == NULL"))
+		return;
+	}
+
+	m_free(chansess->cmd);
+	m_free(chansess->term);
+
+	if (chansess->tty) {
+		/* write the utmp/wtmp login record */
+		li = login_alloc_entry(chansess->pid, ses.authstate.username,
+				ses.remotehost, chansess->tty);
+		login_logout(li);
+		login_free_entry(li);
+
+		pty_release(chansess->tty);
+		m_free(chansess->tty);
+	}
+
+#ifndef DISABLE_X11FWD
+	x11cleanup(chansess);
+#endif
+
+#ifndef DISABLE_AGENTFWD
+	agentcleanup(chansess);
+#endif
+
+	/* clear child pid entries */
+	for (i = 0; i < svr_ses.childpidsize; i++) {
+		if (svr_ses.childpids[i].chansess == chansess) {
+			dropbear_assert(svr_ses.childpids[i].pid > 0);
+			TRACE(("closing pid %d", svr_ses.childpids[i].pid))
+			TRACE(("exitpid = %d", chansess->exit.exitpid))
+			svr_ses.childpids[i].pid = -1;
+			svr_ses.childpids[i].chansess = NULL;
+		}
+	}
+				
+	m_free(chansess);
+
+	TRACE(("leave closechansess"))
+}
+
+/* Handle requests for a channel. These can be execution requests,
+ * or x11/authagent forwarding. These are passed to appropriate handlers */
+static void chansessionrequest(struct Channel *channel) {
+
+	unsigned char * type = NULL;
+	unsigned int typelen;
+	unsigned char wantreply;
+	int ret = 1;
+	struct ChanSess *chansess;
+
+	TRACE(("enter chansessionrequest"))
+
+	type = buf_getstring(ses.payload, &typelen);
+	wantreply = buf_getbool(ses.payload);
+
+	if (typelen > MAX_NAME_LEN) {
+		TRACE(("leave chansessionrequest: type too long")) /* XXX send error?*/
+		goto out;
+	}
+
+	chansess = (struct ChanSess*)channel->typedata;
+	dropbear_assert(chansess != NULL);
+	TRACE(("type is %s", type))
+
+	if (strcmp(type, "window-change") == 0) {
+		ret = sessionwinchange(chansess);
+	} else if (strcmp(type, "shell") == 0) {
+		ret = sessioncommand(channel, chansess, 0, 0);
+	} else if (strcmp(type, "pty-req") == 0) {
+		ret = sessionpty(chansess);
+	} else if (strcmp(type, "exec") == 0) {
+		ret = sessioncommand(channel, chansess, 1, 0);
+	} else if (strcmp(type, "subsystem") == 0) {
+		ret = sessioncommand(channel, chansess, 1, 1);
+#ifndef DISABLE_X11FWD
+	} else if (strcmp(type, "x11-req") == 0) {
+		ret = x11req(chansess);
+#endif
+#ifndef DISABLE_AGENTFWD
+	} else if (strcmp(type, "auth-agent-req@openssh.com") == 0) {
+		ret = agentreq(chansess);
+#endif
+	} else if (strcmp(type, "signal") == 0) {
+		ret = sessionsignal(chansess);
+	} else {
+		/* etc, todo "env", "subsystem" */
+	}
+
+out:
+
+	if (wantreply) {
+		if (ret == DROPBEAR_SUCCESS) {
+			send_msg_channel_success(channel);
+		} else {
+			send_msg_channel_failure(channel);
+		}
+	}
+
+	m_free(type);
+	TRACE(("leave chansessionrequest"))
+}
+
+
+/* Send a signal to a session's process as requested by the client*/
+static int sessionsignal(struct ChanSess *chansess) {
+
+	int sig = 0;
+	unsigned char* signame = NULL;
+	int i;
+
+	if (chansess->pid == 0) {
+		/* haven't got a process pid yet */
+		return DROPBEAR_FAILURE;
+	}
+
+	signame = buf_getstring(ses.payload, NULL);
+
+	i = 0;
+	while (signames[i].name != 0) {
+		if (strcmp(signames[i].name, signame) == 0) {
+			sig = signames[i].signal;
+			break;
+		}
+		i++;
+	}
+
+	m_free(signame);
+
+	if (sig == 0) {
+		/* failed */
+		return DROPBEAR_FAILURE;
+	}
+			
+	if (kill(chansess->pid, sig) < 0) {
+		return DROPBEAR_FAILURE;
+	} 
+
+	return DROPBEAR_SUCCESS;
+}
+
+/* Let the process know that the window size has changed, as notified from the
+ * client. Returns DROPBEAR_SUCCESS or DROPBEAR_FAILURE */
+static int sessionwinchange(struct ChanSess *chansess) {
+
+	int termc, termr, termw, termh;
+
+	if (chansess->master < 0) {
+		/* haven't got a pty yet */
+		return DROPBEAR_FAILURE;
+	}
+			
+	termc = buf_getint(ses.payload);
+	termr = buf_getint(ses.payload);
+	termw = buf_getint(ses.payload);
+	termh = buf_getint(ses.payload);
+	
+	pty_change_window_size(chansess->master, termr, termc, termw, termh);
+
+	return DROPBEAR_FAILURE;
+}
+
+static void get_termmodes(struct ChanSess *chansess) {
+
+	struct termios termio;
+	unsigned char opcode;
+	unsigned int value;
+	const struct TermCode * termcode;
+	unsigned int len;
+
+	TRACE(("enter get_termmodes"))
+
+	/* Term modes */
+	/* We'll ignore errors and continue if we can't set modes.
+	 * We're ignoring baud rates since they seem evil */
+	if (tcgetattr(chansess->master, &termio) == -1) {
+		return;
+	}
+
+	len = buf_getint(ses.payload);
+	TRACE(("term mode str %d p->l %d p->p %d", 
+				len, ses.payload->len , ses.payload->pos));
+	if (len != ses.payload->len - ses.payload->pos) {
+		dropbear_exit("bad term mode string");
+	}
+
+	if (len == 0) {
+		TRACE(("leave get_termmodes: empty terminal modes string"))
+		return;
+	}
+
+	while (((opcode = buf_getbyte(ses.payload)) != 0x00) && opcode <= 159) {
+
+		/* must be before checking type, so that value is consumed even if
+		 * we don't use it */
+		value = buf_getint(ses.payload);
+
+		/* handle types of code */
+		if (opcode > MAX_TERMCODE) {
+			continue;
+		}
+		termcode = &termcodes[(unsigned int)opcode];
+		
+
+		switch (termcode->type) {
+
+			case TERMCODE_NONE:
+				break;
+
+			case TERMCODE_CONTROLCHAR:
+				termio.c_cc[termcode->mapcode] = value;
+				break;
+
+			case TERMCODE_INPUT:
+				if (value) {
+					termio.c_iflag |= termcode->mapcode;
+				} else {
+					termio.c_iflag &= ~(termcode->mapcode);
+				}
+				break;
+
+			case TERMCODE_OUTPUT:
+				if (value) {
+					termio.c_oflag |= termcode->mapcode;
+				} else {
+					termio.c_oflag &= ~(termcode->mapcode);
+				}
+				break;
+
+			case TERMCODE_LOCAL:
+				if (value) {
+					termio.c_lflag |= termcode->mapcode;
+				} else {
+					termio.c_lflag &= ~(termcode->mapcode);
+				}
+				break;
+
+			case TERMCODE_CONTROL:
+				if (value) {
+					termio.c_cflag |= termcode->mapcode;
+				} else {
+					termio.c_cflag &= ~(termcode->mapcode);
+				}
+				break;
+				
+		}
+	}
+	if (tcsetattr(chansess->master, TCSANOW, &termio) < 0) {
+		dropbear_log(LOG_INFO, "error setting terminal attributes");
+	}
+	TRACE(("leave get_termmodes"))
+}
+
+/* Set up a session pty which will be used to execute the shell or program.
+ * The pty is allocated now, and kept for when the shell/program executes.
+ * Returns DROPBEAR_SUCCESS or DROPBEAR_FAILURE */
+static int sessionpty(struct ChanSess * chansess) {
+
+	unsigned int termlen;
+	unsigned char namebuf[65];
+
+	TRACE(("enter sessionpty"))
+	chansess->term = buf_getstring(ses.payload, &termlen);
+	if (termlen > MAX_TERM_LEN) {
+		/* TODO send disconnect ? */
+		TRACE(("leave sessionpty: term len too long"))
+		return DROPBEAR_FAILURE;
+	}
+
+	/* allocate the pty */
+	if (chansess->master != -1) {
+		dropbear_exit("multiple pty requests");
+	}
+	if (pty_allocate(&chansess->master, &chansess->slave, namebuf, 64) == 0) {
+		TRACE(("leave sessionpty: failed to allocate pty"))
+		return DROPBEAR_FAILURE;
+	}
+	
+	chansess->tty = (char*)m_strdup(namebuf);
+	if (!chansess->tty) {
+		dropbear_exit("out of memory"); /* TODO disconnect */
+	}
+
+	pty_setowner(ses.authstate.pw, chansess->tty);
+
+	/* Set up the rows/col counts */
+	sessionwinchange(chansess);
+
+	/* Read the terminal modes */
+	get_termmodes(chansess);
+
+	TRACE(("leave sessionpty"))
+	return DROPBEAR_SUCCESS;
+}
+
+/* Handle a command request from the client. This is used for both shell
+ * and command-execution requests, and passes the command to
+ * noptycommand or ptycommand as appropriate.
+ * Returns DROPBEAR_SUCCESS or DROPBEAR_FAILURE */
+static int sessioncommand(struct Channel *channel, struct ChanSess *chansess,
+		int iscmd, int issubsys) {
+
+	unsigned int cmdlen;
+	int ret;
+
+	TRACE(("enter sessioncommand"))
+
+	if (chansess->cmd != NULL) {
+		/* Note that only one command can _succeed_. The client might try
+		 * one command (which fails), then try another. Ie fallback
+		 * from sftp to scp */
+		return DROPBEAR_FAILURE;
+	}
+
+	if (iscmd) {
+		/* "exec" */
+		chansess->cmd = buf_getstring(ses.payload, &cmdlen);
+
+		if (cmdlen > MAX_CMD_LEN) {
+			m_free(chansess->cmd);
+			/* TODO - send error - too long ? */
+			return DROPBEAR_FAILURE;
+		}
+		if (issubsys) {
+#ifdef SFTPSERVER_PATH
+			if ((cmdlen == 4) && strncmp(chansess->cmd, "sftp", 4) == 0) {
+				m_free(chansess->cmd);
+				chansess->cmd = m_strdup(SFTPSERVER_PATH);
+			} else 
+#endif
+			{
+				m_free(chansess->cmd);
+				return DROPBEAR_FAILURE;
+			}
+		}
+	}
+
+	if (chansess->term == NULL) {
+		/* no pty */
+		ret = noptycommand(channel, chansess);
+	} else {
+		/* want pty */
+		ret = ptycommand(channel, chansess);
+	}
+
+	if (ret == DROPBEAR_FAILURE) {
+		m_free(chansess->cmd);
+	}
+	return ret;
+}
+
+/* Execute a command and set up redirection of stdin/stdout/stderr without a
+ * pty.
+ * Returns DROPBEAR_SUCCESS or DROPBEAR_FAILURE */
+static int noptycommand(struct Channel *channel, struct ChanSess *chansess) {
+
+	int infds[2];
+	int outfds[2];
+	int errfds[2];
+	pid_t pid;
+	unsigned int i;
+
+	TRACE(("enter noptycommand"))
+
+	/* redirect stdin/stdout/stderr */
+	if (pipe(infds) != 0)
+		return DROPBEAR_FAILURE;
+	if (pipe(outfds) != 0)
+		return DROPBEAR_FAILURE;
+	if (pipe(errfds) != 0)
+		return DROPBEAR_FAILURE;
+
+#ifdef __uClinux__
+	pid = vfork();
+#else
+	pid = fork();
+#endif
+
+	if (pid < 0)
+		return DROPBEAR_FAILURE;
+
+	if (!pid) {
+		/* child */
+
+		/* redirect stdin/stdout */
+#define FDIN 0
+#define FDOUT 1
+		if ((dup2(infds[FDIN], STDIN_FILENO) < 0) ||
+			(dup2(outfds[FDOUT], STDOUT_FILENO) < 0) ||
+			(dup2(errfds[FDOUT], STDERR_FILENO) < 0)) {
+			TRACE(("leave noptycommand: error redirecting FDs"))
+			return DROPBEAR_FAILURE;
+		}
+
+		close(infds[FDOUT]);
+		close(infds[FDIN]);
+		close(outfds[FDIN]);
+		close(outfds[FDOUT]);
+		close(errfds[FDIN]);
+		close(errfds[FDOUT]);
+
+		execchild(chansess);
+		/* not reached */
+
+	} else {
+		/* parent */
+		TRACE(("continue noptycommand: parent"))
+		chansess->pid = pid;
+
+		addchildpid(chansess, pid);
+
+		if (svr_ses.lastexit.exitpid != -1) {
+			/* The child probably exited and the signal handler triggered
+			 * possibly before we got around to adding the childpid. So we fill
+			 * out it's data manually */
+			for (i = 0; i < svr_ses.childpidsize; i++) {
+				if (svr_ses.childpids[i].pid == pid) {
+					svr_ses.childpids[i].chansess->exit = svr_ses.lastexit;
+					svr_ses.lastexit.exitpid = -1;
+				}
+			}
+		}
+
+
+		close(infds[FDIN]);
+		close(outfds[FDOUT]);
+		close(errfds[FDOUT]);
+		channel->writefd = infds[FDOUT];
+		channel->readfd = outfds[FDIN];
+		channel->errfd = errfds[FDIN];
+		ses.maxfd = MAX(ses.maxfd, channel->writefd);
+		ses.maxfd = MAX(ses.maxfd, channel->readfd);
+		ses.maxfd = MAX(ses.maxfd, channel->errfd);
+
+		setnonblocking(channel->readfd);
+		setnonblocking(channel->writefd);
+		setnonblocking(channel->errfd);
+
+	}
+#undef FDIN
+#undef FDOUT
+
+	TRACE(("leave noptycommand"))
+	return DROPBEAR_SUCCESS;
+}
+
+/* Execute a command or shell within a pty environment, and set up
+ * redirection as appropriate.
+ * Returns DROPBEAR_SUCCESS or DROPBEAR_FAILURE */
+static int ptycommand(struct Channel *channel, struct ChanSess *chansess) {
+
+	pid_t pid;
+	struct logininfo *li = NULL;
+#ifdef DO_MOTD
+	buffer * motdbuf = NULL;
+	int len;
+	struct stat sb;
+	char *hushpath = NULL;
+#endif
+
+	TRACE(("enter ptycommand"))
+
+	/* we need to have a pty allocated */
+	if (chansess->master == -1 || chansess->tty == NULL) {
+		dropbear_log(LOG_WARNING, "no pty was allocated, couldn't execute");
+		return DROPBEAR_FAILURE;
+	}
+	
+#ifdef __uClinux__
+	pid = vfork();
+#else
+	pid = fork();
+#endif
+	if (pid < 0)
+		return DROPBEAR_FAILURE;
+
+	if (pid == 0) {
+		/* child */
+		
+		/* redirect stdin/stdout/stderr */
+		close(chansess->master);
+
+		pty_make_controlling_tty(&chansess->slave, chansess->tty);
+		
+		if ((dup2(chansess->slave, STDIN_FILENO) < 0) ||
+			(dup2(chansess->slave, STDERR_FILENO) < 0) ||
+			(dup2(chansess->slave, STDOUT_FILENO) < 0)) {
+			TRACE(("leave ptycommand: error redirecting filedesc"))
+			return DROPBEAR_FAILURE;
+		}
+
+		close(chansess->slave);
+
+		/* write the utmp/wtmp login record - must be after changing the
+		 * terminal used for stdout with the dup2 above */
+		li= login_alloc_entry(getpid(), ses.authstate.username,
+				ses.remotehost, chansess->tty);
+		login_login(li);
+		login_free_entry(li);
+
+		m_free(chansess->tty);
+
+#ifdef DO_MOTD
+		if (svr_opts.domotd) {
+			/* don't show the motd if ~/.hushlogin exists */
+
+			/* 11 == strlen("/hushlogin\0") */
+			len = strlen(ses.authstate.pw->pw_dir) + 11; 
+
+			hushpath = m_malloc(len);
+			snprintf(hushpath, len, "%s/hushlogin", ses.authstate.pw->pw_dir);
+
+			if (stat(hushpath, &sb) < 0) {
+				/* more than a screenful is stupid IMHO */
+				motdbuf = buf_new(80 * 25);
+				if (buf_readfile(motdbuf, MOTD_FILENAME) == DROPBEAR_SUCCESS) {
+					buf_setpos(motdbuf, 0);
+					while (motdbuf->pos != motdbuf->len) {
+						len = motdbuf->len - motdbuf->pos;
+						len = write(STDOUT_FILENO, 
+								buf_getptr(motdbuf, len), len);
+						buf_incrpos(motdbuf, len);
+					}
+				}
+				buf_free(motdbuf);
+			}
+			m_free(hushpath);
+		}
+#endif /* DO_MOTD */
+
+		execchild(chansess);
+		/* not reached */
+
+	} else {
+		/* parent */
+		TRACE(("continue ptycommand: parent"))
+		chansess->pid = pid;
+
+		/* add a child pid */
+		addchildpid(chansess, pid);
+
+		close(chansess->slave);
+		channel->writefd = chansess->master;
+		channel->readfd = chansess->master;
+		/* don't need to set stderr here */
+		ses.maxfd = MAX(ses.maxfd, chansess->master);
+
+		setnonblocking(chansess->master);
+
+	}
+
+	TRACE(("leave ptycommand"))
+	return DROPBEAR_SUCCESS;
+}
+
+/* Add the pid of a child to the list for exit-handling */
+static void addchildpid(struct ChanSess *chansess, pid_t pid) {
+
+	unsigned int i;
+	for (i = 0; i < svr_ses.childpidsize; i++) {
+		if (svr_ses.childpids[i].pid == -1) {
+			break;
+		}
+	}
+
+	/* need to increase size */
+	if (i == svr_ses.childpidsize) {
+		svr_ses.childpids = (struct ChildPid*)m_realloc(svr_ses.childpids,
+				sizeof(struct ChildPid) * (svr_ses.childpidsize+1));
+		svr_ses.childpidsize++;
+	}
+	
+	svr_ses.childpids[i].pid = pid;
+	svr_ses.childpids[i].chansess = chansess;
+
+}
+
+/* Clean up, drop to user privileges, set up the environment and execute
+ * the command/shell. This function does not return. */
+static void execchild(struct ChanSess *chansess) {
+
+	char *argv[4];
+	char * usershell = NULL;
+	char * baseshell = NULL;
+	unsigned int i;
+
+    /* with uClinux we'll have vfork()ed, so don't want to overwrite the
+     * hostkey. can't think of a workaround to clear it */
+#ifndef __uClinux__
+	/* wipe the hostkey */
+	sign_key_free(svr_opts.hostkey);
+	svr_opts.hostkey = NULL;
+
+	/* overwrite the prng state */
+	reseedrandom();
+#endif
+
+	/* close file descriptors except stdin/stdout/stderr
+	 * Need to be sure FDs are closed here to avoid reading files as root */
+	for (i = 3; i <= (unsigned int)ses.maxfd; i++) {
+		m_close(i);
+	}
+
+	/* clear environment */
+	/* if we're debugging using valgrind etc, we need to keep the LD_PRELOAD
+	 * etc. This is hazardous, so should only be used for debugging. */
+#ifndef DEBUG_VALGRIND
+#ifdef HAVE_CLEARENV
+	clearenv();
+#else /* don't HAVE_CLEARENV */
+	/* Yay for posix. */
+	if (environ) {
+		environ[0] = NULL;
+	}
+#endif /* HAVE_CLEARENV */
+#endif /* DEBUG_VALGRIND */
+
+	/* We can only change uid/gid as root ... */
+	if (getuid() == 0) {
+
+		if ((setgid(ses.authstate.pw->pw_gid) < 0) ||
+			(initgroups(ses.authstate.pw->pw_name, 
+						ses.authstate.pw->pw_gid) < 0)) {
+			dropbear_exit("error changing user group");
+		}
+		if (setuid(ses.authstate.pw->pw_uid) < 0) {
+			dropbear_exit("error changing user");
+		}
+	} else {
+		/* ... but if the daemon is the same uid as the requested uid, we don't
+		 * need to */
+
+		/* XXX - there is a minor issue here, in that if there are multiple
+		 * usernames with the same uid, but differing groups, then the
+		 * differing groups won't be set (as with initgroups()). The solution
+		 * is for the sysadmin not to give out the UID twice */
+		if (getuid() != ses.authstate.pw->pw_uid) {
+			dropbear_exit("couldn't	change user as non-root");
+		}
+	}
+
+	/* an empty shell should be interpreted as "/bin/sh" */
+	if (ses.authstate.pw->pw_shell[0] == '\0') {
+		usershell = "/bin/sh";
+	} else {
+		usershell = ses.authstate.pw->pw_shell;
+	}
+
+	/* set env vars */
+	addnewvar("USER", ses.authstate.pw->pw_name);
+	addnewvar("LOGNAME", ses.authstate.pw->pw_name);
+	addnewvar("HOME", ses.authstate.pw->pw_dir);
+	addnewvar("SHELL", usershell);
+	if (chansess->term != NULL) {
+		addnewvar("TERM", chansess->term);
+	}
+
+	/* change directory */
+	if (chdir(ses.authstate.pw->pw_dir) < 0) {
+		dropbear_exit("error changing directory");
+	}
+
+#ifndef DISABLE_X11FWD
+	/* set up X11 forwarding if enabled */
+	x11setauth(chansess);
+#endif
+#ifndef DISABLE_AGENTFWD
+	/* set up agent env variable */
+	agentset(chansess);
+#endif
+
+	/* Re-enable SIGPIPE for the executed process */
+	if (signal(SIGPIPE, SIG_DFL) == SIG_ERR) {
+		dropbear_exit("signal() error");
+	}
+
+	baseshell = basename(usershell);
+
+	if (chansess->cmd != NULL) {
+		argv[0] = baseshell;
+	} else {
+		/* a login shell should be "-bash" for "/bin/bash" etc */
+		int len = strlen(baseshell) + 2; /* 2 for "-" */
+		argv[0] = (char*)m_malloc(len);
+		snprintf(argv[0], len, "-%s", baseshell);
+	}
+
+	if (chansess->cmd != NULL) {
+		argv[1] = "-c";
+		argv[2] = chansess->cmd;
+		argv[3] = NULL;
+	} else {
+		/* construct a shell of the form "-bash" etc */
+		argv[1] = NULL;
+	}
+
+	execv(usershell, argv);
+
+	/* only reached on error */
+	dropbear_exit("child failed");
+}
+
+const struct ChanType svrchansess = {
+	0, /* sepfds */
+	"session", /* name */
+	newchansess, /* inithandler */
+	sesscheckclose, /* checkclosehandler */
+	chansessionrequest, /* reqhandler */
+	closechansess, /* closehandler */
+};
+
+
+/* Set up the general chansession environment, in particular child-exit
+ * handling */
+void svr_chansessinitialise() {
+
+	struct sigaction sa_chld;
+
+	/* single child process intially */
+	svr_ses.childpids = (struct ChildPid*)m_malloc(sizeof(struct ChildPid));
+	svr_ses.childpids[0].pid = -1; /* unused */
+	svr_ses.childpids[0].chansess = NULL;
+	svr_ses.childpidsize = 1;
+	svr_ses.lastexit.exitpid = -1; /* Nothing has exited yet */
+	sa_chld.sa_handler = sesssigchild_handler;
+	sa_chld.sa_flags = SA_NOCLDSTOP;
+	if (sigaction(SIGCHLD, &sa_chld, NULL) < 0) {
+		dropbear_exit("signal() error");
+	}
+	
+}
+
+/* add a new environment variable, allocating space for the entry */
+void addnewvar(const char* param, const char* var) {
+
+	char* newvar = NULL;
+	int plen, vlen;
+
+	plen = strlen(param);
+	vlen = strlen(var);
+
+	newvar = m_malloc(plen + vlen + 2); /* 2 is for '=' and '\0' */
+	memcpy(newvar, param, plen);
+	newvar[plen] = '=';
+	memcpy(&newvar[plen+1], var, vlen);
+	newvar[plen+vlen+1] = '\0';
+	if (putenv(newvar) < 0) {
+		dropbear_exit("environ error");
+	}
+}
diff --git a/svr-kex.c b/svr-kex.c
new file mode 100644
index 0000000..a9954bb
--- /dev/null
+++ b/svr-kex.c
@@ -0,0 +1,104 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * Copyright (c) 2004 by Mihnea Stoenescu
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "dbutil.h"
+#include "algo.h"
+#include "buffer.h"
+#include "session.h"
+#include "kex.h"
+#include "ssh.h"
+#include "packet.h"
+#include "bignum.h"
+#include "random.h"
+#include "runopts.h"
+
+
+static void send_msg_kexdh_reply(mp_int *dh_e);
+
+/* Handle a diffie-hellman key exchange initialisation. This involves
+ * calculating a session key reply value, and corresponding hash. These
+ * are carried out by send_msg_kexdh_reply(). recv_msg_kexdh_init() calls
+ * that function, then brings the new keys into use */
+void recv_msg_kexdh_init() {
+
+	DEF_MP_INT(dh_e);
+
+	TRACE(("enter recv_msg_kexdh_init"))
+	if (!ses.kexstate.recvkexinit) {
+		dropbear_exit("Premature kexdh_init message received");
+	}
+
+	m_mp_init(&dh_e);
+	buf_getmpint(ses.payload, &dh_e);
+
+	send_msg_kexdh_reply(&dh_e);
+
+	mp_clear(&dh_e);
+
+	send_msg_newkeys();
+	ses.requirenext = SSH_MSG_NEWKEYS;
+	TRACE(("leave recv_msg_kexdh_init"))
+}
+	
+/* Generate our side of the diffie-hellman key exchange value (dh_f), and
+ * calculate the session key using the diffie-hellman algorithm. Following
+ * that, the session hash is calculated, and signed with RSA or DSS. The
+ * result is sent to the client. 
+ *
+ * See the ietf-secsh-transport draft, section 6, for details */
+static void send_msg_kexdh_reply(mp_int *dh_e) {
+
+	DEF_MP_INT(dh_y);
+	DEF_MP_INT(dh_f);
+
+	TRACE(("enter send_msg_kexdh_reply"))
+	m_mp_init_multi(&dh_y, &dh_f, NULL);
+	
+	gen_kexdh_vals(&dh_f, &dh_y);
+
+	kexdh_comb_key(&dh_f, &dh_y, dh_e, svr_opts.hostkey);
+	mp_clear(&dh_y);
+
+	/* we can start creating the kexdh_reply packet */
+	CHECKCLEARTOWRITE();
+	buf_putbyte(ses.writepayload, SSH_MSG_KEXDH_REPLY);
+	buf_put_pub_key(ses.writepayload, svr_opts.hostkey,
+			ses.newkeys->algo_hostkey);
+
+	/* put f */
+	buf_putmpint(ses.writepayload, &dh_f);
+	mp_clear(&dh_f);
+
+	/* calc the signature */
+	buf_put_sign(ses.writepayload, svr_opts.hostkey, 
+			ses.newkeys->algo_hostkey, ses.hash, SHA1_HASH_SIZE);
+
+	/* the SSH_MSG_KEXDH_REPLY is done */
+	encrypt_packet();
+
+	TRACE(("leave send_msg_kexdh_reply"))
+}
+
diff --git a/svr-main.c b/svr-main.c
new file mode 100644
index 0000000..e59d895
--- /dev/null
+++ b/svr-main.c
@@ -0,0 +1,421 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "dbutil.h"
+#include "session.h"
+#include "buffer.h"
+#include "signkey.h"
+#include "runopts.h"
+
+static size_t listensockets(int *sock, size_t sockcount, int *maxfd);
+static void sigchld_handler(int dummy);
+static void sigsegv_handler(int);
+static void sigintterm_handler(int fish);
+#ifdef INETD_MODE
+static void main_inetd();
+#endif
+#ifdef NON_INETD_MODE
+static void main_noinetd();
+#endif
+static void commonsetup();
+
+#if defined(DBMULTI_dropbear) || !defined(DROPBEAR_MULTI)
+#if defined(DBMULTI_dropbear) && defined(DROPBEAR_MULTI)
+int dropbear_main(int argc, char ** argv)
+#else
+int main(int argc, char ** argv)
+#endif
+{
+	
+
+	_dropbear_exit = svr_dropbear_exit;
+	_dropbear_log = svr_dropbear_log;
+
+	/* get commandline options */
+	svr_getopts(argc, argv);
+
+#ifdef INETD_MODE
+	/* service program mode */
+	if (svr_opts.inetdmode) {
+		main_inetd();
+		/* notreached */
+	}
+#endif
+
+#ifdef NON_INETD_MODE
+	main_noinetd();
+	/* notreached */
+#endif
+
+	dropbear_exit("Compiled without normal mode, can't run without -i\n");
+	return -1;
+}
+#endif
+
+#ifdef INETD_MODE
+static void main_inetd() {
+
+	struct sockaddr_storage remoteaddr;
+	socklen_t remoteaddrlen;
+	char * addrstring = NULL;
+
+	/* Set up handlers, syslog, seed random */
+	commonsetup();
+
+	remoteaddrlen = sizeof(remoteaddr);
+	if (getpeername(0, (struct sockaddr*)&remoteaddr, &remoteaddrlen) < 0) {
+		dropbear_exit("Unable to getpeername: %s", strerror(errno));
+	}
+
+	/* In case our inetd was lax in logging source addresses */
+	addrstring = getaddrstring(&remoteaddr, 1);
+	dropbear_log(LOG_INFO, "Child connection from %s", addrstring);
+
+	/* Don't check the return value - it may just fail since inetd has
+	 * already done setsid() after forking (xinetd on Darwin appears to do
+	 * this */
+	setsid();
+
+	/* Start service program 
+	 * -1 is a dummy childpipe, just something we can close() without 
+	 * mattering. */
+	svr_session(0, -1, getaddrhostname(&remoteaddr), addrstring);
+
+	/* notreached */
+}
+#endif /* INETD_MODE */
+
+#ifdef NON_INETD_MODE
+void main_noinetd() {
+	fd_set fds;
+	struct timeval seltimeout;
+	unsigned int i, j;
+	int val;
+	int maxsock = -1;
+	int listensocks[MAX_LISTEN_ADDR];
+	size_t listensockcount = 0;
+	FILE *pidfile = NULL;
+
+	int childpipes[MAX_UNAUTH_CLIENTS];
+	char * preauth_addrs[MAX_UNAUTH_CLIENTS];
+
+	int childsock;
+	int childpipe[2];
+
+	/* fork */
+	if (svr_opts.forkbg) {
+		int closefds = 0;
+#ifndef DEBUG_TRACE
+		if (!svr_opts.usingsyslog) {
+			closefds = 1;
+		}
+#endif
+		if (daemon(0, closefds) < 0) {
+			dropbear_exit("Failed to daemonize: %s", strerror(errno));
+		}
+	}
+
+	commonsetup();
+
+
+	/* should be done after syslog is working */
+	if (svr_opts.forkbg) {
+		dropbear_log(LOG_INFO, "Running in background");
+	} else {
+		dropbear_log(LOG_INFO, "Not forking");
+	}
+
+	/* create a PID file so that we can be killed easily */
+	pidfile = fopen(DROPBEAR_PIDFILE, "w");
+	if (pidfile) {
+		fprintf(pidfile, "%d\n", getpid());
+		fclose(pidfile);
+	}
+
+	/* sockets to identify pre-authenticated clients */
+	for (i = 0; i < MAX_UNAUTH_CLIENTS; i++) {
+		childpipes[i] = -1;
+	}
+	bzero(preauth_addrs, sizeof(preauth_addrs));
+	
+	/* Set up the listening sockets */
+	/* XXX XXX ports */
+	listensockcount = listensockets(listensocks, MAX_LISTEN_ADDR, &maxsock);
+	if (listensockcount == 0)
+	{
+		dropbear_exit("No listening ports available.");
+	}
+
+	/* incoming connection select loop */
+	for(;;) {
+
+		FD_ZERO(&fds);
+		
+		seltimeout.tv_sec = 60;
+		seltimeout.tv_usec = 0;
+		
+		/* listening sockets */
+		for (i = 0; i < listensockcount; i++) {
+			FD_SET(listensocks[i], &fds);
+		}
+
+		/* pre-authentication clients */
+		for (i = 0; i < MAX_UNAUTH_CLIENTS; i++) {
+			if (childpipes[i] >= 0) {
+				FD_SET(childpipes[i], &fds);
+				maxsock = MAX(maxsock, childpipes[i]);
+			}
+		}
+
+		val = select(maxsock+1, &fds, NULL, NULL, &seltimeout);
+
+		if (exitflag) {
+			unlink(DROPBEAR_PIDFILE);
+			dropbear_exit("Terminated by signal");
+		}
+		
+		if (val == 0) {
+			/* timeout reached */
+			continue;
+		}
+
+		if (val < 0) {
+			if (errno == EINTR) {
+				continue;
+			}
+			dropbear_exit("Listening socket error");
+		}
+
+		/* close fds which have been authed or closed - svr-auth.c handles
+		 * closing the auth sockets on success */
+		for (i = 0; i < MAX_UNAUTH_CLIENTS; i++) {
+			if (childpipes[i] >= 0 && FD_ISSET(childpipes[i], &fds)) {
+				m_close(childpipes[i]);
+				childpipes[i] = -1;
+				m_free(preauth_addrs[i]);
+			}
+		}
+
+		/* handle each socket which has something to say */
+		for (i = 0; i < listensockcount; i++) {
+
+			struct sockaddr_storage remoteaddr;
+			socklen_t remoteaddrlen = 0;
+			size_t num_unauthed_for_addr = 0;
+			size_t num_unauthed_total = 0;
+			char * remote_addr_str = NULL;
+			pid_t fork_ret = 0;
+			size_t conn_idx = 0;
+
+			if (!FD_ISSET(listensocks[i], &fds)) 
+				continue;
+
+			remoteaddrlen = sizeof(remoteaddr);
+			childsock = accept(listensocks[i], 
+					(struct sockaddr*)&remoteaddr, &remoteaddrlen);
+
+			if (childsock < 0) {
+				/* accept failed */
+				continue;
+			}
+
+			/* Limit the number of unauthenticated connections per IP */
+			remote_addr_str = getaddrstring(&remoteaddr, 0);
+
+			num_unauthed_for_addr = 0;
+			num_unauthed_total = 0;
+			for (j = 0; j < MAX_UNAUTH_CLIENTS; j++) {
+				if (childpipes[j] >= 0) {
+					num_unauthed_total++;
+					if (strcmp(remote_addr_str, preauth_addrs[j]) == 0) {
+						num_unauthed_for_addr++;
+					}
+				} else {
+					/* a free slot */
+					conn_idx = j;
+				}
+			}
+
+			if (num_unauthed_total >= MAX_UNAUTH_CLIENTS
+					|| num_unauthed_for_addr >= MAX_UNAUTH_PER_IP) {
+				goto out;
+			}
+
+			if (pipe(childpipe) < 0) {
+				TRACE(("error creating child pipe"))
+				goto out;
+			}
+
+			fork_ret = fork();
+			if (fork_ret < 0) {
+				dropbear_log(LOG_WARNING, "error forking: %s", strerror(errno));
+				goto out;
+
+			} else if (fork_ret > 0) {
+
+				/* parent */
+				childpipes[conn_idx] = childpipe[0];
+				m_close(childpipe[1]);
+				preauth_addrs[conn_idx] = remote_addr_str;
+				remote_addr_str = NULL;
+
+			} else {
+
+				/* child */
+				char * addrstring = NULL;
+#ifdef DEBUG_FORKGPROF
+				extern void _start(void), etext(void);
+				monstartup((u_long)&_start, (u_long)&etext);
+#endif /* DEBUG_FORKGPROF */
+
+				m_free(remote_addr_str);
+				addrstring = getaddrstring(&remoteaddr, 1);
+				dropbear_log(LOG_INFO, "Child connection from %s", addrstring);
+
+				if (setsid() < 0) {
+					dropbear_exit("setsid: %s", strerror(errno));
+				}
+
+				/* make sure we close sockets */
+				for (i = 0; i < listensockcount; i++) {
+					m_close(listensocks[i]);
+				}
+
+				m_close(childpipe[0]);
+
+				/* start the session */
+				svr_session(childsock, childpipe[1], 
+								getaddrhostname(&remoteaddr),
+								addrstring);
+				/* don't return */
+				dropbear_assert(0);
+			}
+
+out:
+			/* This section is important for the parent too */
+			m_close(childsock);
+			if (remote_addr_str) {
+				m_free(remote_addr_str);
+			}
+		}
+	} /* for(;;) loop */
+
+	/* don't reach here */
+}
+#endif /* NON_INETD_MODE */
+
+
+/* catch + reap zombie children */
+static void sigchld_handler(int UNUSED(unused)) {
+	struct sigaction sa_chld;
+
+	while(waitpid(-1, NULL, WNOHANG) > 0); 
+
+	sa_chld.sa_handler = sigchld_handler;
+	sa_chld.sa_flags = SA_NOCLDSTOP;
+	if (sigaction(SIGCHLD, &sa_chld, NULL) < 0) {
+		dropbear_exit("signal() error");
+	}
+}
+
+/* catch any segvs */
+static void sigsegv_handler(int UNUSED(unused)) {
+	fprintf(stderr, "Aiee, segfault! You should probably report "
+			"this as a bug to the developer\n");
+	exit(EXIT_FAILURE);
+}
+
+/* catch ctrl-c or sigterm */
+static void sigintterm_handler(int UNUSED(unused)) {
+
+	exitflag = 1;
+}
+
+/* Things used by inetd and non-inetd modes */
+static void commonsetup() {
+
+	struct sigaction sa_chld;
+#ifndef DISABLE_SYSLOG
+	if (svr_opts.usingsyslog) {
+		startsyslog();
+	}
+#endif
+
+	/* set up cleanup handler */
+	if (signal(SIGINT, sigintterm_handler) == SIG_ERR || 
+#ifndef DEBUG_VALGRIND
+		signal(SIGTERM, sigintterm_handler) == SIG_ERR ||
+#endif
+		signal(SIGPIPE, SIG_IGN) == SIG_ERR) {
+		dropbear_exit("signal() error");
+	}
+
+	/* catch and reap zombie children */
+	sa_chld.sa_handler = sigchld_handler;
+	sa_chld.sa_flags = SA_NOCLDSTOP;
+	if (sigaction(SIGCHLD, &sa_chld, NULL) < 0) {
+		dropbear_exit("signal() error");
+	}
+	if (signal(SIGSEGV, sigsegv_handler) == SIG_ERR) {
+		dropbear_exit("signal() error");
+	}
+
+	/* Now we can setup the hostkeys - needs to be after logging is on,
+	 * otherwise we might end up blatting error messages to the socket */
+	loadhostkeys();
+
+    seedrandom();
+}
+
+/* Set up listening sockets for all the requested ports */
+static size_t listensockets(int *sock, size_t sockcount, int *maxfd) {
+	
+	unsigned int i;
+	char* errstring = NULL;
+	size_t sockpos = 0;
+	int nsock;
+
+	TRACE(("listensockets: %d to try\n", svr_opts.portcount))
+
+	for (i = 0; i < svr_opts.portcount; i++) {
+
+		TRACE(("listening on '%s'", svr_opts.ports[i]))
+
+		nsock = dropbear_listen("", svr_opts.ports[i], &sock[sockpos], 
+				sockcount - sockpos,
+				&errstring, maxfd);
+
+		if (nsock < 0) {
+			dropbear_log(LOG_WARNING, "Failed listening on '%s': %s", 
+							svr_opts.ports[i], errstring);
+			m_free(errstring);
+			continue;
+		}
+
+		sockpos += nsock;
+
+	}
+	return sockpos;
+}
diff --git a/svr-runopts.c b/svr-runopts.c
new file mode 100644
index 0000000..8d8b8df
--- /dev/null
+++ b/svr-runopts.c
@@ -0,0 +1,315 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "runopts.h"
+#include "signkey.h"
+#include "buffer.h"
+#include "dbutil.h"
+#include "algo.h"
+
+svr_runopts svr_opts; /* GLOBAL */
+
+static void printhelp(const char * progname);
+
+static void printhelp(const char * progname) {
+
+	fprintf(stderr, "Dropbear sshd v%s\n"
+					"Usage: %s [options]\n"
+					"Options are:\n"
+					"-b bannerfile	Display the contents of bannerfile"
+					" before user login\n"
+					"		(default: none)\n"
+#ifdef DROPBEAR_DSS
+					"-d dsskeyfile	Use dsskeyfile for the dss host key\n"
+					"		(default: %s)\n"
+#endif
+#ifdef DROPBEAR_RSA
+					"-r rsakeyfile	Use rsakeyfile for the rsa host key\n"
+					"		(default: %s)\n"
+#endif
+					"-F		Don't fork into background\n"
+#ifdef DISABLE_SYSLOG
+					"(Syslog support not compiled in, using stderr)\n"
+#else
+					"-E		Log to stderr rather than syslog\n"
+#endif
+#ifdef DO_MOTD
+					"-m		Don't display the motd on login\n"
+#endif
+					"-w		Disallow root logins\n"
+#if defined(ENABLE_SVR_PASSWORD_AUTH) || defined(ENABLE_SVR_PAM_AUTH)
+					"-s		Disable password logins\n"
+					"-g		Disable password logins for root\n"
+#endif
+#ifdef ENABLE_SVR_LOCALTCPFWD
+					"-j		Disable local port forwarding\n"
+#endif
+#ifdef ENABLE_SVR_REMOTETCPFWD
+					"-k		Disable remote port forwarding\n"
+					"-a		Allow connections to forwarded ports from any host\n"
+#endif
+					"-p port		Listen on specified tcp port, up to %d can be specified\n"
+					"		(default %s if none specified)\n"
+#ifdef INETD_MODE
+					"-i		Start for inetd\n"
+#endif
+#ifdef DEBUG_TRACE
+					"-v		verbose\n"
+#endif
+					,DROPBEAR_VERSION, progname,
+#ifdef DROPBEAR_DSS
+					DSS_PRIV_FILENAME,
+#endif
+#ifdef DROPBEAR_RSA
+					RSA_PRIV_FILENAME,
+#endif
+					DROPBEAR_MAX_PORTS, DROPBEAR_DEFPORT);
+}
+
+void svr_getopts(int argc, char ** argv) {
+
+	unsigned int i;
+	char ** next = 0;
+
+	/* see printhelp() for options */
+	svr_opts.rsakeyfile = NULL;
+	svr_opts.dsskeyfile = NULL;
+	svr_opts.bannerfile = NULL;
+	svr_opts.banner = NULL;
+	svr_opts.forkbg = 1;
+	svr_opts.norootlogin = 0;
+	svr_opts.noauthpass = 0;
+	svr_opts.norootpass = 0;
+	svr_opts.inetdmode = 0;
+	svr_opts.portcount = 0;
+	svr_opts.hostkey = NULL;
+#ifdef ENABLE_SVR_LOCALTCPFWD
+	svr_opts.nolocaltcp = 0;
+#endif
+#ifdef ENABLE_SVR_REMOTETCPFWD
+	svr_opts.noremotetcp = 0;
+#endif
+	/* not yet
+	opts.ipv4 = 1;
+	opts.ipv6 = 1;
+	*/
+#ifdef DO_MOTD
+	svr_opts.domotd = 1;
+#endif
+#ifndef DISABLE_SYSLOG
+	svr_opts.usingsyslog = 1;
+#endif
+#ifdef ENABLE_SVR_REMOTETCPFWD
+	opts.listen_fwd_all = 0;
+#endif
+
+	for (i = 1; i < (unsigned int)argc; i++) {
+		if (next) {
+			*next = argv[i];
+			if (*next == NULL) {
+				dropbear_exit("Invalid null argument");
+			}
+			next = 0x00;
+			continue;
+		}
+
+		if (argv[i][0] == '-') {
+			switch (argv[i][1]) {
+				case 'b':
+					next = &svr_opts.bannerfile;
+					break;
+#ifdef DROPBEAR_DSS
+				case 'd':
+					next = &svr_opts.dsskeyfile;
+					break;
+#endif
+#ifdef DROPBEAR_RSA
+				case 'r':
+					next = &svr_opts.rsakeyfile;
+					break;
+#endif
+				case 'F':
+					svr_opts.forkbg = 0;
+					break;
+#ifndef DISABLE_SYSLOG
+				case 'E':
+					svr_opts.usingsyslog = 0;
+					break;
+#endif
+#ifdef ENABLE_SVR_LOCALTCPFWD
+				case 'j':
+					svr_opts.nolocaltcp = 1;
+					break;
+#endif
+#ifdef ENABLE_SVR_REMOTETCPFWD
+				case 'k':
+					svr_opts.noremotetcp = 1;
+					break;
+				case 'a':
+					opts.listen_fwd_all = 1;
+					break;
+#endif
+#ifdef INETD_MODE
+				case 'i':
+					svr_opts.inetdmode = 1;
+					break;
+#endif
+				case 'p':
+					if (svr_opts.portcount < DROPBEAR_MAX_PORTS) {
+						svr_opts.ports[svr_opts.portcount] = NULL;
+						next = &svr_opts.ports[svr_opts.portcount];
+						/* Note: if it doesn't actually get set, we'll
+						 * decrement it after the loop */
+						svr_opts.portcount++;
+					}
+					break;
+#ifdef DO_MOTD
+				/* motd is displayed by default, -m turns it off */
+				case 'm':
+					svr_opts.domotd = 0;
+					break;
+#endif
+				case 'w':
+					svr_opts.norootlogin = 1;
+					break;
+#if defined(ENABLE_SVR_PASSWORD_AUTH) || defined(ENABLE_SVR_PAM_AUTH)
+				case 's':
+					svr_opts.noauthpass = 1;
+					break;
+				case 'g':
+					svr_opts.norootpass = 1;
+					break;
+#endif
+				case 'h':
+					printhelp(argv[0]);
+					exit(EXIT_FAILURE);
+					break;
+#ifdef DEBUG_TRACE
+				case 'v':
+					debug_trace = 1;
+					break;
+#endif
+				default:
+					fprintf(stderr, "Unknown argument %s\n", argv[i]);
+					printhelp(argv[0]);
+					exit(EXIT_FAILURE);
+					break;
+			}
+		}
+	}
+
+	/* Set up listening ports */
+	if (svr_opts.portcount == 0) {
+		svr_opts.ports[0] = m_strdup(DROPBEAR_DEFPORT);
+		svr_opts.portcount = 1;
+	} else {
+		/* we may have been given a -p option but no argument to go with
+		 * it */
+		if (svr_opts.ports[svr_opts.portcount-1] == NULL) {
+			svr_opts.portcount--;
+		}
+	}
+
+	if (svr_opts.dsskeyfile == NULL) {
+		svr_opts.dsskeyfile = DSS_PRIV_FILENAME;
+	}
+	if (svr_opts.rsakeyfile == NULL) {
+		svr_opts.rsakeyfile = RSA_PRIV_FILENAME;
+	}
+
+	if (svr_opts.bannerfile) {
+		struct stat buf;
+		if (stat(svr_opts.bannerfile, &buf) != 0) {
+			dropbear_exit("Error opening banner file '%s'",
+					svr_opts.bannerfile);
+		}
+		
+		if (buf.st_size > MAX_BANNER_SIZE) {
+			dropbear_exit("Banner file too large, max is %d bytes",
+					MAX_BANNER_SIZE);
+		}
+
+		svr_opts.banner = buf_new(buf.st_size);
+		if (buf_readfile(svr_opts.banner, svr_opts.bannerfile)!=DROPBEAR_SUCCESS) {
+			dropbear_exit("Error reading banner file '%s'",
+					svr_opts.bannerfile);
+		}
+		buf_setpos(svr_opts.banner, 0);
+	}
+
+}
+
+static void disablekey(int type, const char* filename) {
+
+	int i;
+
+	for (i = 0; sshhostkey[i].name != NULL; i++) {
+		if (sshhostkey[i].val == type) {
+			sshhostkey[i].usable = 0;
+			break;
+		}
+	}
+	dropbear_log(LOG_WARNING, "Failed reading '%s', disabling %s", filename,
+			type == DROPBEAR_SIGNKEY_DSS ? "DSS" : "RSA");
+}
+
+/* Must be called after syslog/etc is working */
+void loadhostkeys() {
+
+	int ret;
+	int type;
+
+	TRACE(("enter loadhostkeys"))
+
+	svr_opts.hostkey = new_sign_key();
+
+#ifdef DROPBEAR_RSA
+	type = DROPBEAR_SIGNKEY_RSA;
+	ret = readhostkey(svr_opts.rsakeyfile, svr_opts.hostkey, &type);
+	if (ret == DROPBEAR_FAILURE) {
+		disablekey(DROPBEAR_SIGNKEY_RSA, svr_opts.rsakeyfile);
+	}
+#endif
+#ifdef DROPBEAR_DSS
+	type = DROPBEAR_SIGNKEY_DSS;
+	ret = readhostkey(svr_opts.dsskeyfile, svr_opts.hostkey, &type);
+	if (ret == DROPBEAR_FAILURE) {
+		disablekey(DROPBEAR_SIGNKEY_DSS, svr_opts.dsskeyfile);
+	}
+#endif
+
+	if ( 1
+#ifdef DROPBEAR_DSS
+		&& svr_opts.hostkey->dsskey == NULL
+#endif
+#ifdef DROPBEAR_RSA
+		&& svr_opts.hostkey->rsakey == NULL
+#endif
+		) {
+		dropbear_exit("No hostkeys available");
+	}
+
+	TRACE(("leave loadhostkeys"))
+}
diff --git a/svr-service.c b/svr-service.c
new file mode 100644
index 0000000..2c78e7d
--- /dev/null
+++ b/svr-service.c
@@ -0,0 +1,87 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "dbutil.h"
+#include "service.h"
+#include "session.h"
+#include "packet.h"
+#include "ssh.h"
+#include "auth.h"
+
+static void send_msg_service_accept(unsigned char *name, int len);
+
+/* processes a SSH_MSG_SERVICE_REQUEST, returning 0 if finished,
+ * 1 if not */
+void recv_msg_service_request() {
+
+	unsigned char * name;
+	unsigned int len;
+
+	TRACE(("enter recv_msg_service_request"))
+
+	name = buf_getstring(ses.payload, &len);
+
+	/* ssh-userauth */
+	if (len == SSH_SERVICE_USERAUTH_LEN && 
+			strncmp(SSH_SERVICE_USERAUTH, name, len) == 0) {
+
+		send_msg_service_accept(name, len);
+		m_free(name);
+		TRACE(("leave recv_msg_service_request: done ssh-userauth"))
+		return;
+	}
+
+	/* ssh-connection */
+	if (len == SSH_SERVICE_CONNECTION_LEN &&
+			(strncmp(SSH_SERVICE_CONNECTION, name, len) == 0)) {
+		if (ses.authstate.authdone != 1) {
+			dropbear_exit("request for connection before auth");
+		}
+
+		send_msg_service_accept(name, len);
+		m_free(name);
+		TRACE(("leave recv_msg_service_request: done ssh-connection"))
+		return;
+	}
+
+	m_free(name);
+	/* TODO this should be a MSG_DISCONNECT */
+	dropbear_exit("unrecognised SSH_MSG_SERVICE_REQUEST");
+
+
+}
+
+static void send_msg_service_accept(unsigned char *name, int len) {
+
+	TRACE(("accepting service %s", name))
+
+	CHECKCLEARTOWRITE();
+
+	buf_putbyte(ses.writepayload, SSH_MSG_SERVICE_ACCEPT);
+	buf_putstring(ses.writepayload, name, len);
+
+	encrypt_packet();
+
+}
diff --git a/svr-session.c b/svr-session.c
new file mode 100644
index 0000000..70029f8
--- /dev/null
+++ b/svr-session.c
@@ -0,0 +1,201 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "session.h"
+#include "dbutil.h"
+#include "packet.h"
+#include "algo.h"
+#include "buffer.h"
+#include "dss.h"
+#include "ssh.h"
+#include "random.h"
+#include "kex.h"
+#include "channel.h"
+#include "chansession.h"
+#include "atomicio.h"
+#include "tcpfwd.h"
+#include "service.h"
+#include "auth.h"
+#include "runopts.h"
+
+static void svr_remoteclosed();
+
+struct serversession svr_ses; /* GLOBAL */
+
+static const packettype svr_packettypes[] = {
+	{SSH_MSG_CHANNEL_DATA, recv_msg_channel_data},
+	{SSH_MSG_CHANNEL_WINDOW_ADJUST, recv_msg_channel_window_adjust},
+	{SSH_MSG_USERAUTH_REQUEST, recv_msg_userauth_request}, /* server */
+	{SSH_MSG_SERVICE_REQUEST, recv_msg_service_request}, /* server */
+	{SSH_MSG_KEXINIT, recv_msg_kexinit},
+	{SSH_MSG_KEXDH_INIT, recv_msg_kexdh_init}, /* server */
+	{SSH_MSG_NEWKEYS, recv_msg_newkeys},
+#ifdef ENABLE_SVR_REMOTETCPFWD
+	{SSH_MSG_GLOBAL_REQUEST, recv_msg_global_request_remotetcp},
+#endif
+	{SSH_MSG_CHANNEL_REQUEST, recv_msg_channel_request},
+	{SSH_MSG_CHANNEL_OPEN, recv_msg_channel_open},
+	{SSH_MSG_CHANNEL_EOF, recv_msg_channel_eof},
+	{SSH_MSG_CHANNEL_CLOSE, recv_msg_channel_close},
+#ifdef USING_LISTENERS
+	{SSH_MSG_CHANNEL_OPEN_CONFIRMATION, recv_msg_channel_open_confirmation},
+	{SSH_MSG_CHANNEL_OPEN_FAILURE, recv_msg_channel_open_failure},
+#endif
+	{0, 0} /* End */
+};
+
+static const struct ChanType *svr_chantypes[] = {
+	&svrchansess,
+#ifdef ENABLE_SVR_LOCALTCPFWD
+	&svr_chan_tcpdirect,
+#endif
+	NULL /* Null termination is mandatory. */
+};
+
+void svr_session(int sock, int childpipe, 
+		char* remotehost, char *addrstring) {
+
+	struct timeval timeout;
+
+    reseedrandom();
+
+	crypto_init();
+	common_session_init(sock, remotehost);
+
+	/* Initialise server specific parts of the session */
+	svr_ses.childpipe = childpipe;
+	svr_ses.addrstring = addrstring;
+	svr_authinitialise();
+	chaninitialise(svr_chantypes);
+	svr_chansessinitialise();
+
+	if (gettimeofday(&timeout, 0) < 0) {
+		dropbear_exit("Error getting time");
+	}
+
+	ses.connecttimeout = timeout.tv_sec + AUTH_TIMEOUT;
+
+	/* set up messages etc */
+	ses.remoteclosed = svr_remoteclosed;
+
+	/* packet handlers */
+	ses.packettypes = svr_packettypes;
+	ses.buf_match_algo = svr_buf_match_algo;
+
+	ses.isserver = 1;
+
+	/* We're ready to go now */
+	sessinitdone = 1;
+
+	/* exchange identification, version etc */
+	session_identification();
+
+	/* start off with key exchange */
+	send_msg_kexinit();
+
+	/* Run the main for loop. NULL is for the dispatcher - only the client
+	 * code makes use of it */
+	session_loop(NULL);
+
+	/* Not reached */
+
+}
+
+/* failure exit - format must be <= 100 chars */
+void svr_dropbear_exit(int exitcode, const char* format, va_list param) {
+
+	char fmtbuf[300];
+
+	if (!sessinitdone) {
+		/* before session init */
+		snprintf(fmtbuf, sizeof(fmtbuf), 
+				"premature exit: %s", format);
+	} else if (ses.authstate.authdone) {
+		/* user has authenticated */
+		snprintf(fmtbuf, sizeof(fmtbuf),
+				"exit after auth (%s): %s", 
+				ses.authstate.printableuser, format);
+	} else if (ses.authstate.printableuser) {
+		/* we have a potential user */
+		snprintf(fmtbuf, sizeof(fmtbuf), 
+				"exit before auth (user '%s', %d fails): %s",
+				ses.authstate.printableuser, ses.authstate.failcount, format);
+	} else {
+		/* before userauth */
+		snprintf(fmtbuf, sizeof(fmtbuf), 
+				"exit before auth: %s", format);
+	}
+
+	_dropbear_log(LOG_INFO, fmtbuf, param);
+
+	/* must be after we've done with username etc */
+	common_session_cleanup();
+
+	exit(exitcode);
+
+}
+
+/* priority is priority as with syslog() */
+void svr_dropbear_log(int priority, const char* format, va_list param) {
+
+	char printbuf[1024];
+	char datestr[20];
+	time_t timesec;
+	int havetrace = 0;
+
+	vsnprintf(printbuf, sizeof(printbuf), format, param);
+
+#ifndef DISABLE_SYSLOG
+	if (svr_opts.usingsyslog) {
+		syslog(priority, "%s", printbuf);
+	}
+#endif
+
+	/* if we are using DEBUG_TRACE, we want to print to stderr even if
+	 * syslog is used, so it is included in error reports */
+#ifdef DEBUG_TRACE
+	havetrace = debug_trace;
+#endif
+
+	if (!svr_opts.usingsyslog || havetrace)
+	{
+		timesec = time(NULL);
+		if (strftime(datestr, sizeof(datestr), "%b %d %H:%M:%S", 
+					localtime(&timesec)) == 0) {
+			datestr[0] = '?'; datestr[1] = '\0';
+		}
+		fprintf(stderr, "[%d] %s %s\n", getpid(), datestr, printbuf);
+	}
+}
+
+/* called when the remote side closes the connection */
+static void svr_remoteclosed() {
+
+	close(ses.sock);
+	ses.sock = -1;
+	dropbear_close("Exited normally");
+
+}
+
diff --git a/svr-tcpfwd.c b/svr-tcpfwd.c
new file mode 100644
index 0000000..6391c4c
--- /dev/null
+++ b/svr-tcpfwd.c
@@ -0,0 +1,290 @@
+/*
+ * Dropbear SSH
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * Copyright (c) 2004 by Mihnea Stoenescu
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "ssh.h"
+#include "tcpfwd.h"
+#include "dbutil.h"
+#include "session.h"
+#include "buffer.h"
+#include "packet.h"
+#include "listener.h"
+#include "runopts.h"
+
+#ifdef ENABLE_SVR_REMOTETCPFWD
+
+static void send_msg_request_success();
+static void send_msg_request_failure();
+static int svr_cancelremotetcp();
+static int svr_remotetcpreq();
+static int newtcpdirect(struct Channel * channel);
+
+
+const struct ChanType svr_chan_tcpdirect = {
+	1, /* sepfds */
+	"direct-tcpip",
+	newtcpdirect, /* init */
+	NULL, /* checkclose */
+	NULL, /* reqhandler */
+	NULL /* closehandler */
+};
+
+static const struct ChanType svr_chan_tcpremote = {
+	1, /* sepfds */
+	"forwarded-tcpip",
+	NULL,
+	NULL,
+	NULL,
+	NULL
+};
+
+/* At the moment this is completely used for tcp code (with the name reflecting
+ * that). If new request types are added, this should be replaced with code
+ * similar to the request-switching in chansession.c */
+void recv_msg_global_request_remotetcp() {
+
+	unsigned char* reqname = NULL;
+	unsigned int namelen;
+	unsigned int wantreply = 0;
+	int ret = DROPBEAR_FAILURE;
+
+	TRACE(("enter recv_msg_global_request_remotetcp"))
+
+	if (svr_opts.noremotetcp) {
+		TRACE(("leave recv_msg_global_request_remotetcp: remote tcp forwarding disabled"))
+		goto out;
+	}
+
+	reqname = buf_getstring(ses.payload, &namelen);
+	wantreply = buf_getbool(ses.payload);
+
+	if (namelen > MAX_NAME_LEN) {
+		TRACE(("name len is wrong: %d", namelen))
+		goto out;
+	}
+
+	if (strcmp("tcpip-forward", reqname) == 0) {
+		ret = svr_remotetcpreq();
+	} else if (strcmp("cancel-tcpip-forward", reqname) == 0) {
+		ret = svr_cancelremotetcp();
+	} else {
+		TRACE(("reqname isn't tcpip-forward: '%s'", reqname))
+	}
+
+out:
+	if (wantreply) {
+		if (ret == DROPBEAR_SUCCESS) {
+			send_msg_request_success();
+		} else {
+			send_msg_request_failure();
+		}
+	}
+
+	m_free(reqname);
+
+	TRACE(("leave recv_msg_global_request"))
+}
+
+
+static void send_msg_request_success() {
+
+	CHECKCLEARTOWRITE();
+	buf_putbyte(ses.writepayload, SSH_MSG_REQUEST_SUCCESS);
+	encrypt_packet();
+
+}
+
+static void send_msg_request_failure() {
+
+	CHECKCLEARTOWRITE();
+	buf_putbyte(ses.writepayload, SSH_MSG_REQUEST_FAILURE);
+	encrypt_packet();
+
+}
+
+static int matchtcp(void* typedata1, void* typedata2) {
+
+	const struct TCPListener *info1 = (struct TCPListener*)typedata1;
+	const struct TCPListener *info2 = (struct TCPListener*)typedata2;
+
+	return (info1->listenport == info2->listenport)
+			&& (info1->chantype == info2->chantype)
+			&& (strcmp(info1->listenaddr, info2->listenaddr) == 0);
+}
+
+static int svr_cancelremotetcp() {
+
+	int ret = DROPBEAR_FAILURE;
+	unsigned char * bindaddr = NULL;
+	unsigned int addrlen;
+	unsigned int port;
+	struct Listener * listener = NULL;
+	struct TCPListener tcpinfo;
+
+	TRACE(("enter cancelremotetcp"))
+
+	bindaddr = buf_getstring(ses.payload, &addrlen);
+	if (addrlen > MAX_IP_LEN) {
+		TRACE(("addr len too long: %d", addrlen))
+		goto out;
+	}
+
+	port = buf_getint(ses.payload);
+
+	tcpinfo.sendaddr = NULL;
+	tcpinfo.sendport = 0;
+	tcpinfo.listenaddr = bindaddr;
+	tcpinfo.listenport = port;
+	listener = get_listener(CHANNEL_ID_TCPFORWARDED, &tcpinfo, matchtcp);
+	if (listener) {
+		remove_listener( listener );
+		ret = DROPBEAR_SUCCESS;
+	}
+
+out:
+	m_free(bindaddr);
+	TRACE(("leave cancelremotetcp"))
+	return ret;
+}
+
+static int svr_remotetcpreq() {
+
+	int ret = DROPBEAR_FAILURE;
+	unsigned char * bindaddr = NULL;
+	unsigned int addrlen;
+	struct TCPListener *tcpinfo = NULL;
+	unsigned int port;
+
+	TRACE(("enter remotetcpreq"))
+
+	bindaddr = buf_getstring(ses.payload, &addrlen);
+	if (addrlen > MAX_IP_LEN) {
+		TRACE(("addr len too long: %d", addrlen))
+		goto out;
+	}
+
+	port = buf_getint(ses.payload);
+
+	if (port == 0) {
+		dropbear_log(LOG_INFO, "Server chosen tcpfwd ports are unsupported");
+		goto out;
+	}
+
+	if (port < 1 || port > 65535) {
+		TRACE(("invalid port: %d", port))
+		goto out;
+	}
+
+	if (!ses.allowprivport && port < IPPORT_RESERVED) {
+		TRACE(("can't assign port < 1024 for non-root"))
+		goto out;
+	}
+
+	tcpinfo = (struct TCPListener*)m_malloc(sizeof(struct TCPListener));
+	tcpinfo->sendaddr = NULL;
+	tcpinfo->sendport = 0;
+	tcpinfo->listenaddr = bindaddr;
+	tcpinfo->listenport = port;
+	tcpinfo->chantype = &svr_chan_tcpremote;
+	tcpinfo->tcp_type = forwarded;
+
+	ret = listen_tcpfwd(tcpinfo);
+
+out:
+	if (ret == DROPBEAR_FAILURE) {
+		/* we only free it if a listener wasn't created, since the listener
+		 * has to remember it if it's to be cancelled */
+		m_free(tcpinfo->listenaddr);
+		m_free(tcpinfo);
+	}
+	TRACE(("leave remotetcpreq"))
+	return ret;
+}
+
+/* Called upon creating a new direct tcp channel (ie we connect out to an
+ * address */
+static int newtcpdirect(struct Channel * channel) {
+
+	unsigned char* desthost = NULL;
+	unsigned int destport;
+	unsigned char* orighost = NULL;
+	unsigned int origport;
+	char portstring[NI_MAXSERV];
+	int sock;
+	int len;
+	int err = SSH_OPEN_ADMINISTRATIVELY_PROHIBITED;
+
+	if (svr_opts.nolocaltcp) {
+		TRACE(("leave newtcpdirect: local tcp forwarding disabled"))
+		goto out;
+	}
+
+	desthost = buf_getstring(ses.payload, &len);
+	if (len > MAX_HOST_LEN) {
+		TRACE(("leave newtcpdirect: desthost too long"))
+		goto out;
+	}
+
+	destport = buf_getint(ses.payload);
+	
+	orighost = buf_getstring(ses.payload, &len);
+	if (len > MAX_HOST_LEN) {
+		TRACE(("leave newtcpdirect: orighost too long"))
+		goto out;
+	}
+
+	origport = buf_getint(ses.payload);
+
+	/* best be sure */
+	if (origport > 65535 || destport > 65535) {
+		TRACE(("leave newtcpdirect: port > 65535"))
+		goto out;
+	}
+
+	snprintf(portstring, sizeof(portstring), "%d", destport);
+	sock = connect_remote(desthost, portstring, 1, NULL);
+	if (sock < 0) {
+		err = SSH_OPEN_CONNECT_FAILED;
+		TRACE(("leave newtcpdirect: sock failed"))
+		goto out;
+	}
+
+	ses.maxfd = MAX(ses.maxfd, sock);
+
+	 /* We don't set readfd, that will get set after the connection's
+	 * progress succeeds */
+	channel->writefd = sock;
+	channel->initconn = 1;
+	
+	err = SSH_OPEN_IN_PROGRESS;
+
+out:
+	m_free(desthost);
+	m_free(orighost);
+	TRACE(("leave newtcpdirect: err %d", err))
+	return err;
+}
+
+#endif
diff --git a/svr-x11fwd.c b/svr-x11fwd.c
new file mode 100644
index 0000000..cbc8a79
--- /dev/null
+++ b/svr-x11fwd.c
@@ -0,0 +1,236 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+
+#ifndef DISABLE_X11FWD
+#include "x11fwd.h"
+#include "session.h"
+#include "ssh.h"
+#include "dbutil.h"
+#include "chansession.h"
+#include "channel.h"
+#include "packet.h"
+#include "buffer.h"
+
+#define X11BASEPORT 6000
+#define X11BINDBASE 6010
+
+static void x11accept(struct Listener* listener, int sock);
+static int bindport(int fd);
+static int send_msg_channel_open_x11(int fd, struct sockaddr_in* addr);
+
+/* called as a request for a session channel, sets up listening X11 */
+/* returns DROPBEAR_SUCCESS or DROPBEAR_FAILURE */
+int x11req(struct ChanSess * chansess) {
+
+	int fd;
+
+	/* we already have an x11 connection */
+	if (chansess->x11listener != NULL) {
+		return DROPBEAR_FAILURE;
+	}
+
+	chansess->x11singleconn = buf_getbool(ses.payload);
+	chansess->x11authprot = buf_getstring(ses.payload, NULL);
+	chansess->x11authcookie = buf_getstring(ses.payload, NULL);
+	chansess->x11screennum = buf_getint(ses.payload);
+
+	/* create listening socket */
+	fd = socket(PF_INET, SOCK_STREAM, 0);
+	if (fd < 0) {
+		goto fail;
+	}
+
+	/* allocate port and bind */
+	chansess->x11port = bindport(fd);
+	if (chansess->x11port < 0) {
+		goto fail;
+	}
+
+	/* listen */
+	if (listen(fd, 20) < 0) {
+		goto fail;
+	}
+
+	/* set non-blocking */
+	setnonblocking(fd);
+
+	/* listener code will handle the socket now.
+	 * No cleanup handler needed, since listener_remove only happens
+	 * from our cleanup anyway */
+	chansess->x11listener = new_listener( &fd, 1, 0, chansess, x11accept, NULL);
+	if (chansess->x11listener == NULL) {
+		goto fail;
+	}
+
+	return DROPBEAR_SUCCESS;
+
+fail:
+	/* cleanup */
+	m_free(chansess->x11authprot);
+	m_free(chansess->x11authcookie);
+	close(fd);
+
+	return DROPBEAR_FAILURE;
+}
+
+/* accepts a new X11 socket */
+/* returns DROPBEAR_FAILURE or DROPBEAR_SUCCESS */
+static void x11accept(struct Listener* listener, int sock) {
+
+	int fd;
+	struct sockaddr_in addr;
+	int len;
+	int ret;
+	struct ChanSess * chansess = (struct ChanSess *)(listener->typedata);
+
+	len = sizeof(addr);
+
+	fd = accept(sock, (struct sockaddr*)&addr, &len);
+	if (fd < 0) {
+		return;
+	}
+
+	/* if single-connection we close it up */
+	if (chansess->x11singleconn) {
+		x11cleanup(chansess);
+	}
+
+	ret = send_msg_channel_open_x11(fd, &addr);
+	if (ret == DROPBEAR_FAILURE) {
+		close(fd);
+	}
+}
+
+/* This is called after switching to the user, and sets up the xauth
+ * and environment variables.  */
+void x11setauth(struct ChanSess *chansess) {
+
+	char display[20]; /* space for "localhost:12345.123" */
+	FILE * authprog = NULL;
+	int val;
+
+	if (chansess->x11listener == NULL) {
+		return;
+	}
+
+	/* create the DISPLAY string */
+	val = snprintf(display, sizeof(display), "localhost:%d.%d",
+			chansess->x11port - X11BASEPORT, chansess->x11screennum);
+	if (val < 0 || val >= (int)sizeof(display)) {
+		/* string was truncated */
+		return;
+	}
+
+	addnewvar("DISPLAY", display);
+
+	/* create the xauth string */
+	val = snprintf(display, sizeof(display), "unix:%d.%d",
+			chansess->x11port - X11BASEPORT, chansess->x11screennum);
+	if (val < 0 || val >= (int)sizeof(display)) {
+		/* string was truncated */
+		return;
+	}
+
+	/* popen is a nice function - code is strongly based on OpenSSH's */
+	authprog = popen(XAUTH_COMMAND, "w");
+	if (authprog) {
+		fprintf(authprog, "add %s %s %s\n",
+				display, chansess->x11authprot, chansess->x11authcookie);
+		pclose(authprog);
+	} else {
+		fprintf(stderr, "Failed to run %s\n", XAUTH_COMMAND);
+	}
+}
+
+void x11cleanup(struct ChanSess *chansess) {
+
+	m_free(chansess->x11authprot);
+	m_free(chansess->x11authcookie);
+
+	TRACE(("chansess %s", chansess))
+	if (chansess->x11listener != NULL) {
+		remove_listener(chansess->x11listener);
+		chansess->x11listener = NULL;
+	}
+}
+
+static const struct ChanType chan_x11 = {
+	0, /* sepfds */
+	"x11",
+	NULL, /* inithandler */
+	NULL, /* checkclose */
+	NULL, /* reqhandler */
+	NULL /* closehandler */
+};
+
+
+static int send_msg_channel_open_x11(int fd, struct sockaddr_in* addr) {
+
+	char* ipstring = NULL;
+
+	if (send_msg_channel_open_init(fd, &chan_x11) == DROPBEAR_SUCCESS) {
+		ipstring = inet_ntoa(addr->sin_addr);
+		buf_putstring(ses.writepayload, ipstring, strlen(ipstring));
+		buf_putint(ses.writepayload, addr->sin_port);
+
+		encrypt_packet();
+		return DROPBEAR_SUCCESS;
+	} else {
+		return DROPBEAR_FAILURE;
+	}
+
+}
+
+/* returns the port bound to, or -1 on failure.
+ * Will attempt to bind to a port X11BINDBASE (6010 usually) or upwards */
+static int bindport(int fd) {
+
+	struct sockaddr_in addr;
+	uint16_t port;
+
+	memset((void*)&addr, 0x0, sizeof(addr));
+	addr.sin_family = AF_INET;
+	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+
+	/* if we can't find one in 2000 ports free, something's wrong */
+	for (port = X11BINDBASE; port < X11BINDBASE + 2000; port++) {
+		addr.sin_port = htons(port);
+		if (bind(fd, (struct sockaddr*)&addr, 
+					sizeof(struct sockaddr_in)) == 0) {
+			/* success */
+			return port;
+		}
+		if (errno == EADDRINUSE) {
+			/* try the next port */
+			continue;
+		}
+		/* otherwise it was an error we don't know about */
+		dropbear_log(LOG_DEBUG, "failed to bind x11 socket");
+		break;
+	}
+	return -1;
+}
+#endif /* DROPBEAR_X11FWD */
diff --git a/tcp-accept.c b/tcp-accept.c
new file mode 100644
index 0000000..ffb175e
--- /dev/null
+++ b/tcp-accept.c
@@ -0,0 +1,143 @@
+/*
+ * Dropbear SSH
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "ssh.h"
+#include "tcpfwd.h"
+#include "dbutil.h"
+#include "session.h"
+#include "buffer.h"
+#include "packet.h"
+#include "listener.h"
+#include "runopts.h"
+
+#ifdef DROPBEAR_TCP_ACCEPT
+
+static void cleanup_tcp(struct Listener *listener) {
+
+	struct TCPListener *tcpinfo = (struct TCPListener*)(listener->typedata);
+
+	m_free(tcpinfo->sendaddr);
+	m_free(tcpinfo->listenaddr);
+	m_free(tcpinfo);
+}
+
+static void tcp_acceptor(struct Listener *listener, int sock) {
+
+	int fd;
+	struct sockaddr_storage addr;
+	socklen_t len;
+	char ipstring[NI_MAXHOST], portstring[NI_MAXSERV];
+	struct TCPListener *tcpinfo = (struct TCPListener*)(listener->typedata);
+
+	len = sizeof(addr);
+
+	fd = accept(sock, (struct sockaddr*)&addr, &len);
+	if (fd < 0) {
+		return;
+	}
+
+	if (getnameinfo((struct sockaddr*)&addr, len, ipstring, sizeof(ipstring),
+				portstring, sizeof(portstring), 
+				NI_NUMERICHOST | NI_NUMERICSERV) != 0) {
+		return;
+	}
+
+	if (send_msg_channel_open_init(fd, tcpinfo->chantype) == DROPBEAR_SUCCESS) {
+		unsigned char* addr = NULL;
+		unsigned int port = 0;
+
+		if (tcpinfo->tcp_type == direct) {
+			/* "direct-tcpip" */
+			/* host to connect, port to connect */
+			addr = tcpinfo->sendaddr;
+			port = tcpinfo->sendport;
+		} else {
+			dropbear_assert(tcpinfo->tcp_type == forwarded);
+			/* "forwarded-tcpip" */
+			/* address that was connected, port that was connected */
+			addr = tcpinfo->listenaddr;
+			port = tcpinfo->listenport;
+		}
+
+		buf_putstring(ses.writepayload, addr, strlen(addr));
+		buf_putint(ses.writepayload, port);
+
+		/* originator ip */
+		buf_putstring(ses.writepayload, ipstring, strlen(ipstring));
+		/* originator port */
+		buf_putint(ses.writepayload, atol(portstring));
+
+		encrypt_packet();
+
+	} else {
+		/* XXX debug? */
+		close(fd);
+	}
+}
+
+int listen_tcpfwd(struct TCPListener* tcpinfo) {
+
+	char portstring[NI_MAXSERV];
+	int socks[DROPBEAR_MAX_SOCKS];
+	struct Listener *listener = NULL;
+	int nsocks;
+	char* errstring = NULL;
+	// listen_spec = NULL indicates localhost
+	const char* listen_spec = NULL;
+
+	TRACE(("enter listen_tcpfwd"))
+
+	/* first we try to bind, so don't need to do so much cleanup on failure */
+	snprintf(portstring, sizeof(portstring), "%d", tcpinfo->listenport);
+
+	/* a listenaddr of "" will indicate all interfaces */
+	if (opts.listen_fwd_all 
+			&& (strcmp(tcpinfo->listenaddr, "localhost") != 0) ) {
+		listen_spec = tcpinfo->listenaddr;
+	}
+
+	nsocks = dropbear_listen(listen_spec, portstring, socks, 
+			DROPBEAR_MAX_SOCKS, &errstring, &ses.maxfd);
+	if (nsocks < 0) {
+		dropbear_log(LOG_INFO, "TCP forward failed: %s", errstring);
+		m_free(errstring);
+		TRACE(("leave listen_tcpfwd: dropbear_listen failed"))
+		return DROPBEAR_FAILURE;
+	}
+
+	listener = new_listener(socks, nsocks, CHANNEL_ID_TCPFORWARDED, tcpinfo, 
+			tcp_acceptor, cleanup_tcp);
+
+	if (listener == NULL) {
+		m_free(tcpinfo);
+		TRACE(("leave listen_tcpfwd: listener failed"))
+		return DROPBEAR_FAILURE;
+	}
+
+	TRACE(("leave listen_tcpfwd: success"))
+	return DROPBEAR_SUCCESS;
+}
+
+#endif /* DROPBEAR_TCP_ACCEPT */
diff --git a/tcpfwd.h b/tcpfwd.h
new file mode 100644
index 0000000..28af029
--- /dev/null
+++ b/tcpfwd.h
@@ -0,0 +1,69 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+#ifndef _TCPFWD_H
+#define _TCPFWD_H
+
+#include "channel.h"
+
+struct TCPListener {
+
+	/* For a direct-tcpip request, it's the addr/port we want the other
+	 * end to connect to */
+	unsigned char *sendaddr;
+	unsigned int sendport;
+
+	/* This is the address/port that we listen on. The address has special
+	 * meanings as per the rfc, "" for all interfaces, "localhost" for 
+	 * localhost, or a normal interface name. */
+	unsigned char *listenaddr;
+	unsigned int listenport;
+
+	const struct ChanType *chantype;
+	enum {direct, forwarded} tcp_type;
+};
+
+/* A link in a list of forwards */
+struct TCPFwdList {
+
+	const unsigned char* connectaddr;
+	unsigned int connectport;
+	unsigned int listenport;
+	struct TCPFwdList * next;
+
+};
+
+/* Server */
+void recv_msg_global_request_remotetcp();
+extern const struct ChanType svr_chan_tcpdirect;
+
+/* Client */
+void setup_localtcp();
+void setup_remotetcp();
+extern const struct ChanType cli_chan_tcpremote;
+
+/* Common */
+int listen_tcpfwd(struct TCPListener* tcpinfo);
+
+
+#endif
diff --git a/termcodes.c b/termcodes.c
new file mode 100644
index 0000000..d59505c
--- /dev/null
+++ b/termcodes.c
@@ -0,0 +1,187 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#include "includes.h"
+#include "termcodes.h"
+
+const struct TermCode termcodes[MAX_TERMCODE+1] = {
+
+		{0, 0}, /* TTY_OP_END */
+		{VINTR, TERMCODE_CONTROLCHAR}, /* control character codes */
+		{VQUIT, TERMCODE_CONTROLCHAR},
+		{VERASE, TERMCODE_CONTROLCHAR},
+		{VKILL, TERMCODE_CONTROLCHAR},
+		{VEOF, TERMCODE_CONTROLCHAR},
+		{VEOL, TERMCODE_CONTROLCHAR},
+		{VEOL2, TERMCODE_CONTROLCHAR},
+		{VSTART, TERMCODE_CONTROLCHAR},
+		{VSTOP, TERMCODE_CONTROLCHAR},
+		{VSUSP, TERMCODE_CONTROLCHAR},
+#ifdef VDSUSP
+		{VDSUSP, TERMCODE_CONTROLCHAR},
+#else
+		{0, 0},
+#endif
+#ifdef VREPRINT
+		{VREPRINT, TERMCODE_CONTROLCHAR},
+#else
+		{0, 0},
+#endif
+#ifdef AIX
+		{CERASE, TERMCODE_CONTROLCHAR},
+#else
+		{VWERASE, TERMCODE_CONTROLCHAR},
+#endif
+		{VLNEXT, TERMCODE_CONTROLCHAR},
+#ifdef VFLUSH
+		{VFLUSH, TERMCODE_CONTROLCHAR},
+#else	
+		{0, 0},
+#endif
+#ifdef VSWTCH
+		{VSWTCH, TERMCODE_CONTROLCHAR},
+#else	
+		{0, 0},
+#endif
+#ifdef VSTATUS
+		{VSTATUS, TERMCODE_CONTROLCHAR},
+#else
+		{0, 0},
+#endif
+#ifdef AIX
+		{CKILL, TERMCODE_CONTROLCHAR},
+#elif defined(VDISCARD)
+		{VDISCARD, TERMCODE_CONTROLCHAR},
+#else
+		{0, 0},
+#endif
+		{0, 0}, /* 19 */
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0}, /* 29 */
+		{IGNPAR, TERMCODE_INPUT}, /* input flags */
+		{PARMRK, TERMCODE_INPUT},
+		{INPCK, TERMCODE_INPUT},
+		{ISTRIP, TERMCODE_INPUT},
+		{INLCR, TERMCODE_INPUT},
+		{IGNCR, TERMCODE_INPUT},
+		{ICRNL, TERMCODE_INPUT},
+#ifdef IUCLC
+		{IUCLC, TERMCODE_INPUT},
+#else
+		{0, 0},
+#endif
+		{IXON, TERMCODE_INPUT},
+		{IXANY, TERMCODE_INPUT},
+		{IXOFF, TERMCODE_INPUT},
+#ifdef IMAXBEL
+		{IMAXBEL, TERMCODE_INPUT},
+#else
+		{0, 0},
+#endif
+		{0, 0}, /* 42 */
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0}, /* 49 */
+		{ISIG, TERMCODE_LOCAL}, /* local flags */
+		{ICANON, TERMCODE_LOCAL},
+#ifdef XCASE
+		{XCASE, TERMCODE_LOCAL},
+#else
+		{0, 0},
+#endif
+		{ECHO, TERMCODE_LOCAL},
+		{ECHOE, TERMCODE_LOCAL},
+		{ECHOK, TERMCODE_LOCAL},
+		{ECHONL, TERMCODE_LOCAL},
+		{NOFLSH, TERMCODE_LOCAL},
+		{TOSTOP, TERMCODE_LOCAL},
+		{IEXTEN, TERMCODE_LOCAL},
+		{ECHOCTL, TERMCODE_LOCAL},
+		{ECHOKE, TERMCODE_LOCAL},
+#ifdef PENDIN
+		{PENDIN, TERMCODE_LOCAL},
+#else
+		{0, 0},
+#endif
+		{0, 0}, /* 63 */
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0}, /* 69 */
+		{OPOST, TERMCODE_OUTPUT}, /* output flags */
+#ifdef OLCUC
+		{OLCUC, TERMCODE_OUTPUT},
+#else
+		{0, 0},
+#endif
+		{ONLCR, TERMCODE_OUTPUT},
+#ifdef OCRNL
+		{OCRNL, TERMCODE_OUTPUT},
+#else
+		{0, 0},
+#endif
+#ifdef ONOCR
+		{ONOCR, TERMCODE_OUTPUT},
+#else
+		{0, 0},
+#endif
+#ifdef ONLRET
+		{ONLRET, TERMCODE_OUTPUT},
+#else
+		{0, 0},
+#endif
+		{0, 0}, /* 76 */
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0},
+		{0, 0}, /* 89 */
+		{CS7, TERMCODE_CONTROL},
+		{CS8, TERMCODE_CONTROL},
+		{PARENB, TERMCODE_CONTROL},
+		{PARODD, TERMCODE_CONTROL}
+		/* 94 */
+};
diff --git a/termcodes.h b/termcodes.h
new file mode 100644
index 0000000..00792ea
--- /dev/null
+++ b/termcodes.h
@@ -0,0 +1,46 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+
+#ifndef _TERMCODES_H_
+#define _TERMCODES_H_
+
+#define TERMCODE_NONE 0
+#define TERMCODE_CONTROL 1
+#define TERMCODE_INPUT 2
+#define TERMCODE_OUTPUT 3
+#define TERMCODE_LOCAL 4
+#define TERMCODE_CONTROLCHAR 5
+
+#define MAX_TERMCODE 93
+
+struct TermCode {
+
+	unsigned int mapcode;
+	unsigned char type;
+
+};
+
+extern const struct TermCode termcodes[];
+
+#endif /* _TERMCODES_H_ */
diff --git a/x11fwd.h b/x11fwd.h
new file mode 100644
index 0000000..5855a68
--- /dev/null
+++ b/x11fwd.h
@@ -0,0 +1,37 @@
+/*
+ * Dropbear - a SSH2 server
+ * 
+ * Copyright (c) 2002,2003 Matt Johnston
+ * All rights reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE. */
+#ifndef _X11FWD_H_
+#define _X11FWD_H_
+#ifndef DISABLE_X11FWD
+
+#include "includes.h"
+#include "chansession.h"
+#include "channel.h"
+
+int x11req(struct ChanSess * chansess);
+void x11setauth(struct ChanSess *chansess);
+void x11cleanup(struct ChanSess *chansess);
+
+#endif /* DROPBEAR_X11FWD */
+#endif /* _X11FWD_H_ */